i2s_jun2024 / trainer_state.json
{
"best_metric": 1.3218775987625122,
"best_model_checkpoint": "saved_model/i2s_jun2024/checkpoint-4644",
"epoch": 8.0,
"eval_steps": 500,
"global_step": 4644,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 71.5168,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 18.802278518676758,
"learning_rate": 2.5e-06,
"loss": 74.047,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 21.658679962158203,
"learning_rate": 6e-06,
"loss": 73.7187,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 14.339091300964355,
"learning_rate": 1.05e-05,
"loss": 73.4207,
"step": 30
},
{
"epoch": 0.07,
"grad_norm": 15.059852600097656,
"learning_rate": 1.55e-05,
"loss": 71.3845,
"step": 40
},
{
"epoch": 0.09,
"grad_norm": 14.70816707611084,
"learning_rate": 2.05e-05,
"loss": 69.0148,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 16.725433349609375,
"learning_rate": 2.5500000000000003e-05,
"loss": 64.3212,
"step": 60
},
{
"epoch": 0.12,
"grad_norm": 25.34491539001465,
"learning_rate": 3.05e-05,
"loss": 55.119,
"step": 70
},
{
"epoch": 0.14,
"grad_norm": 30.524017333984375,
"learning_rate": 3.55e-05,
"loss": 35.3988,
"step": 80
},
{
"epoch": 0.16,
"grad_norm": 14.260645866394043,
"learning_rate": 4.05e-05,
"loss": 14.7682,
"step": 90
},
{
"epoch": 0.17,
"grad_norm": 7.6033244132995605,
"learning_rate": 4.55e-05,
"loss": 7.8366,
"step": 100
},
{
"epoch": 0.19,
"grad_norm": 6.007198333740234,
"learning_rate": 5.05e-05,
"loss": 5.3438,
"step": 110
},
{
"epoch": 0.21,
"grad_norm": 6.973146438598633,
"learning_rate": 5.550000000000001e-05,
"loss": 4.38,
"step": 120
},
{
"epoch": 0.22,
"grad_norm": 7.27716588973999,
"learning_rate": 6.05e-05,
"loss": 3.9717,
"step": 130
},
{
"epoch": 0.24,
"grad_norm": 6.905131816864014,
"learning_rate": 6.55e-05,
"loss": 3.6261,
"step": 140
},
{
"epoch": 0.26,
"grad_norm": 7.467597007751465,
"learning_rate": 7.05e-05,
"loss": 3.3891,
"step": 150
},
{
"epoch": 0.28,
"grad_norm": 6.636981010437012,
"learning_rate": 7.55e-05,
"loss": 3.3898,
"step": 160
},
{
"epoch": 0.29,
"grad_norm": 8.696107864379883,
"learning_rate": 8.05e-05,
"loss": 3.1741,
"step": 170
},
{
"epoch": 0.31,
"grad_norm": 6.677704811096191,
"learning_rate": 8.55e-05,
"loss": 3.0704,
"step": 180
},
{
"epoch": 0.33,
"grad_norm": 6.542475700378418,
"learning_rate": 9.05e-05,
"loss": 3.1193,
"step": 190
},
{
"epoch": 0.34,
"grad_norm": 6.453402042388916,
"learning_rate": 9.55e-05,
"loss": 3.0298,
"step": 200
},
{
"epoch": 0.36,
"grad_norm": 5.4417500495910645,
"learning_rate": 9.998214285714286e-05,
"loss": 2.9418,
"step": 210
},
{
"epoch": 0.38,
"grad_norm": 4.76828670501709,
"learning_rate": 9.980357142857143e-05,
"loss": 2.8558,
"step": 220
},
{
"epoch": 0.4,
"grad_norm": 5.404451370239258,
"learning_rate": 9.9625e-05,
"loss": 2.7935,
"step": 230
},
{
"epoch": 0.41,
"grad_norm": 4.168063163757324,
"learning_rate": 9.944642857142857e-05,
"loss": 2.7137,
"step": 240
},
{
"epoch": 0.43,
"grad_norm": 4.073235034942627,
"learning_rate": 9.926785714285715e-05,
"loss": 2.5657,
"step": 250
},
{
"epoch": 0.45,
"grad_norm": 3.113050699234009,
"learning_rate": 9.908928571428571e-05,
"loss": 2.4517,
"step": 260
},
{
"epoch": 0.47,
"grad_norm": 2.8151700496673584,
"learning_rate": 9.891071428571429e-05,
"loss": 2.4306,
"step": 270
},
{
"epoch": 0.48,
"grad_norm": 4.368308067321777,
"learning_rate": 9.873214285714287e-05,
"loss": 2.3841,
"step": 280
},
{
"epoch": 0.5,
"grad_norm": 3.0557169914245605,
"learning_rate": 9.855357142857143e-05,
"loss": 2.3246,
"step": 290
},
{
"epoch": 0.52,
"grad_norm": 2.5763227939605713,
"learning_rate": 9.8375e-05,
"loss": 2.2142,
"step": 300
},
{
"epoch": 0.53,
"grad_norm": 3.2247154712677,
"learning_rate": 9.819642857142858e-05,
"loss": 2.2083,
"step": 310
},
{
"epoch": 0.55,
"grad_norm": 2.6648075580596924,
"learning_rate": 9.801785714285715e-05,
"loss": 2.1679,
"step": 320
},
{
"epoch": 0.57,
"grad_norm": 3.017408609390259,
"learning_rate": 9.783928571428572e-05,
"loss": 2.148,
"step": 330
},
{
"epoch": 0.59,
"grad_norm": 2.380321741104126,
"learning_rate": 9.76607142857143e-05,
"loss": 2.1343,
"step": 340
},
{
"epoch": 0.6,
"grad_norm": 2.6373605728149414,
"learning_rate": 9.748214285714286e-05,
"loss": 2.1052,
"step": 350
},
{
"epoch": 0.62,
"grad_norm": 2.7756423950195312,
"learning_rate": 9.730357142857144e-05,
"loss": 2.1182,
"step": 360
},
{
"epoch": 0.64,
"grad_norm": 3.484689950942993,
"learning_rate": 9.7125e-05,
"loss": 2.0797,
"step": 370
},
{
"epoch": 0.65,
"grad_norm": 2.5313141345977783,
"learning_rate": 9.694642857142857e-05,
"loss": 2.0304,
"step": 380
},
{
"epoch": 0.67,
"grad_norm": 2.3977627754211426,
"learning_rate": 9.676785714285714e-05,
"loss": 2.0538,
"step": 390
},
{
"epoch": 0.69,
"grad_norm": 3.0346198081970215,
"learning_rate": 9.658928571428572e-05,
"loss": 2.0083,
"step": 400
},
{
"epoch": 0.71,
"grad_norm": 2.3251254558563232,
"learning_rate": 9.641071428571428e-05,
"loss": 1.9846,
"step": 410
},
{
"epoch": 0.72,
"grad_norm": 2.010366439819336,
"learning_rate": 9.623214285714286e-05,
"loss": 1.9756,
"step": 420
},
{
"epoch": 0.74,
"grad_norm": 2.3924219608306885,
"learning_rate": 9.605357142857144e-05,
"loss": 2.0645,
"step": 430
},
{
"epoch": 0.76,
"grad_norm": 2.37774395942688,
"learning_rate": 9.5875e-05,
"loss": 2.0311,
"step": 440
},
{
"epoch": 0.78,
"grad_norm": 2.794346809387207,
"learning_rate": 9.569642857142858e-05,
"loss": 2.0278,
"step": 450
},
{
"epoch": 0.79,
"grad_norm": 2.4213197231292725,
"learning_rate": 9.551785714285715e-05,
"loss": 1.9756,
"step": 460
},
{
"epoch": 0.81,
"grad_norm": 2.1344525814056396,
"learning_rate": 9.533928571428572e-05,
"loss": 1.9617,
"step": 470
},
{
"epoch": 0.83,
"grad_norm": 2.6219308376312256,
"learning_rate": 9.516071428571429e-05,
"loss": 1.908,
"step": 480
},
{
"epoch": 0.84,
"grad_norm": 1.903001308441162,
"learning_rate": 9.498214285714287e-05,
"loss": 1.9123,
"step": 490
},
{
"epoch": 0.86,
"grad_norm": 2.3412868976593018,
"learning_rate": 9.480357142857143e-05,
"loss": 1.9922,
"step": 500
},
{
"epoch": 0.88,
"grad_norm": 2.0525269508361816,
"learning_rate": 9.462500000000001e-05,
"loss": 1.9335,
"step": 510
},
{
"epoch": 0.9,
"grad_norm": 2.0901846885681152,
"learning_rate": 9.444642857142859e-05,
"loss": 1.9466,
"step": 520
},
{
"epoch": 0.91,
"grad_norm": 1.9739652872085571,
"learning_rate": 9.426785714285715e-05,
"loss": 1.9212,
"step": 530
},
{
"epoch": 0.93,
"grad_norm": 2.675729751586914,
"learning_rate": 9.408928571428573e-05,
"loss": 1.9025,
"step": 540
},
{
"epoch": 0.95,
"grad_norm": 1.9041796922683716,
"learning_rate": 9.391071428571429e-05,
"loss": 1.8316,
"step": 550
},
{
"epoch": 0.96,
"grad_norm": 2.364295244216919,
"learning_rate": 9.373214285714285e-05,
"loss": 1.9436,
"step": 560
},
{
"epoch": 0.98,
"grad_norm": 1.7344969511032104,
"learning_rate": 9.355357142857143e-05,
"loss": 1.8546,
"step": 570
},
{
"epoch": 1.0,
"grad_norm": 1.711107611656189,
"learning_rate": 9.3375e-05,
"loss": 1.8762,
"step": 580
},
{
"epoch": 1.0,
"eval_loss": 1.6403552293777466,
"eval_runtime": 188.058,
"eval_samples_per_second": 264.498,
"eval_steps_per_second": 4.137,
"step": 580
},
{
"epoch": 1.02,
"grad_norm": 2.3606512546539307,
"learning_rate": 9.319642857142857e-05,
"loss": 1.8004,
"step": 590
},
{
"epoch": 1.03,
"grad_norm": 2.242635726928711,
"learning_rate": 9.301785714285715e-05,
"loss": 1.8469,
"step": 600
},
{
"epoch": 1.05,
"grad_norm": 2.228389024734497,
"learning_rate": 9.283928571428572e-05,
"loss": 1.7991,
"step": 610
},
{
"epoch": 1.07,
"grad_norm": 1.9997934103012085,
"learning_rate": 9.266071428571429e-05,
"loss": 1.8775,
"step": 620
},
{
"epoch": 1.09,
"grad_norm": 2.089266538619995,
"learning_rate": 9.248214285714286e-05,
"loss": 1.8204,
"step": 630
},
{
"epoch": 1.1,
"grad_norm": 2.12202525138855,
"learning_rate": 9.230357142857144e-05,
"loss": 1.7901,
"step": 640
},
{
"epoch": 1.12,
"grad_norm": 2.0948925018310547,
"learning_rate": 9.2125e-05,
"loss": 1.8531,
"step": 650
},
{
"epoch": 1.14,
"grad_norm": 1.9177907705307007,
"learning_rate": 9.194642857142858e-05,
"loss": 1.7605,
"step": 660
},
{
"epoch": 1.15,
"grad_norm": 1.9887701272964478,
"learning_rate": 9.176785714285714e-05,
"loss": 1.7871,
"step": 670
},
{
"epoch": 1.17,
"grad_norm": 1.8522242307662964,
"learning_rate": 9.158928571428572e-05,
"loss": 1.7708,
"step": 680
},
{
"epoch": 1.19,
"grad_norm": 2.551814556121826,
"learning_rate": 9.14107142857143e-05,
"loss": 1.8094,
"step": 690
},
{
"epoch": 1.21,
"grad_norm": 1.845168948173523,
"learning_rate": 9.123214285714286e-05,
"loss": 1.8073,
"step": 700
},
{
"epoch": 1.22,
"grad_norm": 1.822838544845581,
"learning_rate": 9.105357142857144e-05,
"loss": 1.6686,
"step": 710
},
{
"epoch": 1.24,
"grad_norm": 1.8166040182113647,
"learning_rate": 9.0875e-05,
"loss": 1.8582,
"step": 720
},
{
"epoch": 1.26,
"grad_norm": 1.5486347675323486,
"learning_rate": 9.069642857142858e-05,
"loss": 1.816,
"step": 730
},
{
"epoch": 1.27,
"grad_norm": 1.9094125032424927,
"learning_rate": 9.051785714285714e-05,
"loss": 1.7152,
"step": 740
},
{
"epoch": 1.29,
"grad_norm": 1.9882993698120117,
"learning_rate": 9.033928571428572e-05,
"loss": 1.794,
"step": 750
},
{
"epoch": 1.31,
"grad_norm": 1.8697319030761719,
"learning_rate": 9.01607142857143e-05,
"loss": 1.7417,
"step": 760
},
{
"epoch": 1.33,
"grad_norm": 1.819305419921875,
"learning_rate": 8.998214285714286e-05,
"loss": 1.8305,
"step": 770
},
{
"epoch": 1.34,
"grad_norm": 1.715167760848999,
"learning_rate": 8.980357142857143e-05,
"loss": 1.7865,
"step": 780
},
{
"epoch": 1.36,
"grad_norm": 2.109316110610962,
"learning_rate": 8.962500000000001e-05,
"loss": 1.7901,
"step": 790
},
{
"epoch": 1.38,
"grad_norm": 1.5844603776931763,
"learning_rate": 8.944642857142857e-05,
"loss": 1.7622,
"step": 800
},
{
"epoch": 1.4,
"grad_norm": 1.6469178199768066,
"learning_rate": 8.926785714285715e-05,
"loss": 1.7613,
"step": 810
},
{
"epoch": 1.41,
"grad_norm": 1.6666046380996704,
"learning_rate": 8.908928571428571e-05,
"loss": 1.6603,
"step": 820
},
{
"epoch": 1.43,
"grad_norm": 1.6979730129241943,
"learning_rate": 8.891071428571429e-05,
"loss": 1.7146,
"step": 830
},
{
"epoch": 1.45,
"grad_norm": 1.8366163969039917,
"learning_rate": 8.873214285714287e-05,
"loss": 1.7286,
"step": 840
},
{
"epoch": 1.46,
"grad_norm": 1.638338327407837,
"learning_rate": 8.855357142857143e-05,
"loss": 1.7321,
"step": 850
},
{
"epoch": 1.48,
"grad_norm": 1.7783935070037842,
"learning_rate": 8.837500000000001e-05,
"loss": 1.8262,
"step": 860
},
{
"epoch": 1.5,
"grad_norm": 1.8703325986862183,
"learning_rate": 8.819642857142858e-05,
"loss": 1.7427,
"step": 870
},
{
"epoch": 1.52,
"grad_norm": 1.6613126993179321,
"learning_rate": 8.801785714285715e-05,
"loss": 1.7327,
"step": 880
},
{
"epoch": 1.53,
"grad_norm": 2.376027822494507,
"learning_rate": 8.783928571428572e-05,
"loss": 1.8232,
"step": 890
},
{
"epoch": 1.55,
"grad_norm": 1.796240210533142,
"learning_rate": 8.766071428571429e-05,
"loss": 1.7516,
"step": 900
},
{
"epoch": 1.57,
"grad_norm": 1.9593281745910645,
"learning_rate": 8.748214285714286e-05,
"loss": 1.7093,
"step": 910
},
{
"epoch": 1.58,
"grad_norm": 1.889053225517273,
"learning_rate": 8.730357142857143e-05,
"loss": 1.7592,
"step": 920
},
{
"epoch": 1.6,
"grad_norm": 1.6226258277893066,
"learning_rate": 8.7125e-05,
"loss": 1.7322,
"step": 930
},
{
"epoch": 1.62,
"grad_norm": 1.7704793214797974,
"learning_rate": 8.694642857142857e-05,
"loss": 1.7636,
"step": 940
},
{
"epoch": 1.64,
"grad_norm": 2.601127862930298,
"learning_rate": 8.676785714285714e-05,
"loss": 1.7245,
"step": 950
},
{
"epoch": 1.65,
"grad_norm": 1.6599385738372803,
"learning_rate": 8.658928571428572e-05,
"loss": 1.71,
"step": 960
},
{
"epoch": 1.67,
"grad_norm": 1.7746946811676025,
"learning_rate": 8.641071428571428e-05,
"loss": 1.7225,
"step": 970
},
{
"epoch": 1.69,
"grad_norm": 1.729849934577942,
"learning_rate": 8.623214285714286e-05,
"loss": 1.6965,
"step": 980
},
{
"epoch": 1.71,
"grad_norm": 1.785882592201233,
"learning_rate": 8.605357142857144e-05,
"loss": 1.7038,
"step": 990
},
{
"epoch": 1.72,
"grad_norm": 1.473311185836792,
"learning_rate": 8.5875e-05,
"loss": 1.6833,
"step": 1000
},
{
"epoch": 1.74,
"grad_norm": 1.6640801429748535,
"learning_rate": 8.569642857142858e-05,
"loss": 1.7092,
"step": 1010
},
{
"epoch": 1.76,
"grad_norm": 1.5122531652450562,
"learning_rate": 8.551785714285715e-05,
"loss": 1.6075,
"step": 1020
},
{
"epoch": 1.77,
"grad_norm": 1.6112641096115112,
"learning_rate": 8.533928571428572e-05,
"loss": 1.6411,
"step": 1030
},
{
"epoch": 1.79,
"grad_norm": 1.7034529447555542,
"learning_rate": 8.51607142857143e-05,
"loss": 1.6864,
"step": 1040
},
{
"epoch": 1.81,
"grad_norm": 1.4635982513427734,
"learning_rate": 8.498214285714287e-05,
"loss": 1.7144,
"step": 1050
},
{
"epoch": 1.83,
"grad_norm": 1.6717389822006226,
"learning_rate": 8.480357142857143e-05,
"loss": 1.6714,
"step": 1060
},
{
"epoch": 1.84,
"grad_norm": 1.7127422094345093,
"learning_rate": 8.4625e-05,
"loss": 1.6383,
"step": 1070
},
{
"epoch": 1.86,
"grad_norm": 1.6816381216049194,
"learning_rate": 8.444642857142857e-05,
"loss": 1.5957,
"step": 1080
},
{
"epoch": 1.88,
"grad_norm": 1.6419484615325928,
"learning_rate": 8.426785714285714e-05,
"loss": 1.7279,
"step": 1090
},
{
"epoch": 1.89,
"grad_norm": 1.5984368324279785,
"learning_rate": 8.408928571428571e-05,
"loss": 1.6922,
"step": 1100
},
{
"epoch": 1.91,
"grad_norm": 1.7579293251037598,
"learning_rate": 8.391071428571429e-05,
"loss": 1.6335,
"step": 1110
},
{
"epoch": 1.93,
"grad_norm": 2.1119790077209473,
"learning_rate": 8.373214285714285e-05,
"loss": 1.6613,
"step": 1120
},
{
"epoch": 1.95,
"grad_norm": 1.8950587511062622,
"learning_rate": 8.355357142857143e-05,
"loss": 1.6867,
"step": 1130
},
{
"epoch": 1.96,
"grad_norm": 1.5633883476257324,
"learning_rate": 8.337500000000001e-05,
"loss": 1.7352,
"step": 1140
},
{
"epoch": 1.98,
"grad_norm": 1.5965347290039062,
"learning_rate": 8.319642857142857e-05,
"loss": 1.7453,
"step": 1150
},
{
"epoch": 2.0,
"grad_norm": 1.7829467058181763,
"learning_rate": 8.301785714285715e-05,
"loss": 1.6783,
"step": 1160
},
{
"epoch": 2.0,
"eval_loss": 1.4701509475708008,
"eval_runtime": 190.5514,
"eval_samples_per_second": 261.037,
"eval_steps_per_second": 4.083,
"step": 1161
},
{
"epoch": 2.02,
"grad_norm": 1.8088414669036865,
"learning_rate": 8.283928571428572e-05,
"loss": 1.6,
"step": 1170
},
{
"epoch": 2.03,
"grad_norm": 1.609508752822876,
"learning_rate": 8.266071428571429e-05,
"loss": 1.6683,
"step": 1180
},
{
"epoch": 2.05,
"grad_norm": 1.5094209909439087,
"learning_rate": 8.248214285714286e-05,
"loss": 1.66,
"step": 1190
},
{
"epoch": 2.07,
"grad_norm": 1.420783519744873,
"learning_rate": 8.230357142857144e-05,
"loss": 1.7407,
"step": 1200
},
{
"epoch": 2.08,
"grad_norm": 1.5500482320785522,
"learning_rate": 8.2125e-05,
"loss": 1.6455,
"step": 1210
},
{
"epoch": 2.1,
"grad_norm": 1.9022321701049805,
"learning_rate": 8.194642857142858e-05,
"loss": 1.6988,
"step": 1220
},
{
"epoch": 2.12,
"grad_norm": 1.4118092060089111,
"learning_rate": 8.176785714285716e-05,
"loss": 1.6608,
"step": 1230
},
{
"epoch": 2.14,
"grad_norm": 1.583893060684204,
"learning_rate": 8.158928571428572e-05,
"loss": 1.5432,
"step": 1240
},
{
"epoch": 2.15,
"grad_norm": 1.6913859844207764,
"learning_rate": 8.141071428571428e-05,
"loss": 1.5941,
"step": 1250
},
{
"epoch": 2.17,
"grad_norm": 1.5015137195587158,
"learning_rate": 8.123214285714286e-05,
"loss": 1.6033,
"step": 1260
},
{
"epoch": 2.19,
"grad_norm": 1.7772740125656128,
"learning_rate": 8.105357142857142e-05,
"loss": 1.6563,
"step": 1270
},
{
"epoch": 2.2,
"grad_norm": 2.0110361576080322,
"learning_rate": 8.0875e-05,
"loss": 1.6264,
"step": 1280
},
{
"epoch": 2.22,
"grad_norm": 2.3244762420654297,
"learning_rate": 8.069642857142858e-05,
"loss": 1.6507,
"step": 1290
},
{
"epoch": 2.24,
"grad_norm": 1.4885693788528442,
"learning_rate": 8.051785714285714e-05,
"loss": 1.6528,
"step": 1300
},
{
"epoch": 2.26,
"grad_norm": 1.5776846408843994,
"learning_rate": 8.033928571428572e-05,
"loss": 1.6196,
"step": 1310
},
{
"epoch": 2.27,
"grad_norm": 1.8749977350234985,
"learning_rate": 8.01607142857143e-05,
"loss": 1.5766,
"step": 1320
},
{
"epoch": 2.29,
"grad_norm": 1.726862907409668,
"learning_rate": 7.998214285714286e-05,
"loss": 1.5926,
"step": 1330
},
{
"epoch": 2.31,
"grad_norm": 1.5350719690322876,
"learning_rate": 7.980357142857143e-05,
"loss": 1.6054,
"step": 1340
},
{
"epoch": 2.33,
"grad_norm": 1.5497677326202393,
"learning_rate": 7.962500000000001e-05,
"loss": 1.5752,
"step": 1350
},
{
"epoch": 2.34,
"grad_norm": 1.8540089130401611,
"learning_rate": 7.944642857142857e-05,
"loss": 1.5771,
"step": 1360
},
{
"epoch": 2.36,
"grad_norm": 1.4316684007644653,
"learning_rate": 7.926785714285715e-05,
"loss": 1.6257,
"step": 1370
},
{
"epoch": 2.38,
"grad_norm": 1.671826958656311,
"learning_rate": 7.908928571428573e-05,
"loss": 1.622,
"step": 1380
},
{
"epoch": 2.39,
"grad_norm": 1.5158424377441406,
"learning_rate": 7.891071428571429e-05,
"loss": 1.6551,
"step": 1390
},
{
"epoch": 2.41,
"grad_norm": 1.5484036207199097,
"learning_rate": 7.873214285714287e-05,
"loss": 1.5857,
"step": 1400
},
{
"epoch": 2.43,
"grad_norm": 1.7882516384124756,
"learning_rate": 7.855357142857143e-05,
"loss": 1.5469,
"step": 1410
},
{
"epoch": 2.45,
"grad_norm": 1.7476617097854614,
"learning_rate": 7.8375e-05,
"loss": 1.5668,
"step": 1420
},
{
"epoch": 2.46,
"grad_norm": 1.7887372970581055,
"learning_rate": 7.819642857142857e-05,
"loss": 1.5988,
"step": 1430
},
{
"epoch": 2.48,
"grad_norm": 1.6280275583267212,
"learning_rate": 7.801785714285715e-05,
"loss": 1.6253,
"step": 1440
},
{
"epoch": 2.5,
"grad_norm": 1.6212425231933594,
"learning_rate": 7.783928571428571e-05,
"loss": 1.664,
"step": 1450
},
{
"epoch": 2.52,
"grad_norm": 2.2675745487213135,
"learning_rate": 7.766071428571429e-05,
"loss": 1.6108,
"step": 1460
},
{
"epoch": 2.53,
"grad_norm": 1.5932976007461548,
"learning_rate": 7.748214285714286e-05,
"loss": 1.6551,
"step": 1470
},
{
"epoch": 2.55,
"grad_norm": 1.3544069528579712,
"learning_rate": 7.730357142857143e-05,
"loss": 1.5929,
"step": 1480
},
{
"epoch": 2.57,
"grad_norm": 1.9113808870315552,
"learning_rate": 7.7125e-05,
"loss": 1.6015,
"step": 1490
},
{
"epoch": 2.58,
"grad_norm": 1.5239685773849487,
"learning_rate": 7.694642857142858e-05,
"loss": 1.6222,
"step": 1500
},
{
"epoch": 2.6,
"grad_norm": 1.3187998533248901,
"learning_rate": 7.676785714285714e-05,
"loss": 1.5639,
"step": 1510
},
{
"epoch": 2.62,
"grad_norm": 1.6183714866638184,
"learning_rate": 7.658928571428572e-05,
"loss": 1.6302,
"step": 1520
},
{
"epoch": 2.64,
"grad_norm": 1.6767323017120361,
"learning_rate": 7.641071428571428e-05,
"loss": 1.6897,
"step": 1530
},
{
"epoch": 2.65,
"grad_norm": 1.5148464441299438,
"learning_rate": 7.623214285714286e-05,
"loss": 1.5924,
"step": 1540
},
{
"epoch": 2.67,
"grad_norm": 1.5788357257843018,
"learning_rate": 7.605357142857144e-05,
"loss": 1.6388,
"step": 1550
},
{
"epoch": 2.69,
"grad_norm": 1.5306918621063232,
"learning_rate": 7.5875e-05,
"loss": 1.5771,
"step": 1560
},
{
"epoch": 2.7,
"grad_norm": 1.8422303199768066,
"learning_rate": 7.569642857142858e-05,
"loss": 1.5735,
"step": 1570
},
{
"epoch": 2.72,
"grad_norm": 1.676352858543396,
"learning_rate": 7.551785714285715e-05,
"loss": 1.5903,
"step": 1580
},
{
"epoch": 2.74,
"grad_norm": 1.6177210807800293,
"learning_rate": 7.533928571428572e-05,
"loss": 1.6255,
"step": 1590
},
{
"epoch": 2.76,
"grad_norm": 1.643314003944397,
"learning_rate": 7.516071428571428e-05,
"loss": 1.6214,
"step": 1600
},
{
"epoch": 2.77,
"grad_norm": 1.7559303045272827,
"learning_rate": 7.498214285714286e-05,
"loss": 1.5746,
"step": 1610
},
{
"epoch": 2.79,
"grad_norm": 1.644892692565918,
"learning_rate": 7.480357142857143e-05,
"loss": 1.6139,
"step": 1620
},
{
"epoch": 2.81,
"grad_norm": 1.8428288698196411,
"learning_rate": 7.4625e-05,
"loss": 1.5731,
"step": 1630
},
{
"epoch": 2.83,
"grad_norm": 1.6845279932022095,
"learning_rate": 7.444642857142857e-05,
"loss": 1.5103,
"step": 1640
},
{
"epoch": 2.84,
"grad_norm": 1.987294316291809,
"learning_rate": 7.426785714285714e-05,
"loss": 1.5953,
"step": 1650
},
{
"epoch": 2.86,
"grad_norm": 1.5020558834075928,
"learning_rate": 7.408928571428571e-05,
"loss": 1.5949,
"step": 1660
},
{
"epoch": 2.88,
"grad_norm": 1.648449182510376,
"learning_rate": 7.391071428571429e-05,
"loss": 1.6046,
"step": 1670
},
{
"epoch": 2.89,
"grad_norm": 1.5231221914291382,
"learning_rate": 7.373214285714285e-05,
"loss": 1.5809,
"step": 1680
},
{
"epoch": 2.91,
"grad_norm": 1.5675526857376099,
"learning_rate": 7.355357142857143e-05,
"loss": 1.5945,
"step": 1690
},
{
"epoch": 2.93,
"grad_norm": 1.8311553001403809,
"learning_rate": 7.337500000000001e-05,
"loss": 1.5831,
"step": 1700
},
{
"epoch": 2.95,
"grad_norm": 1.494588851928711,
"learning_rate": 7.319642857142857e-05,
"loss": 1.6073,
"step": 1710
},
{
"epoch": 2.96,
"grad_norm": 2.0507028102874756,
"learning_rate": 7.301785714285715e-05,
"loss": 1.5649,
"step": 1720
},
{
"epoch": 2.98,
"grad_norm": 1.7288378477096558,
"learning_rate": 7.283928571428572e-05,
"loss": 1.5977,
"step": 1730
},
{
"epoch": 3.0,
"grad_norm": 1.6537200212478638,
"learning_rate": 7.266071428571429e-05,
"loss": 1.5865,
"step": 1740
},
{
"epoch": 3.0,
"eval_loss": 1.4159988164901733,
"eval_runtime": 189.5375,
"eval_samples_per_second": 262.434,
"eval_steps_per_second": 4.105,
"step": 1741
},
{
"epoch": 3.01,
"grad_norm": 1.583219289779663,
"learning_rate": 7.248214285714286e-05,
"loss": 1.5617,
"step": 1750
},
{
"epoch": 3.03,
"grad_norm": 1.5875566005706787,
"learning_rate": 7.230357142857144e-05,
"loss": 1.5863,
"step": 1760
},
{
"epoch": 3.05,
"grad_norm": 1.9724568128585815,
"learning_rate": 7.2125e-05,
"loss": 1.5523,
"step": 1770
},
{
"epoch": 3.07,
"grad_norm": 1.301005482673645,
"learning_rate": 7.194642857142857e-05,
"loss": 1.4872,
"step": 1780
},
{
"epoch": 3.08,
"grad_norm": 1.6533854007720947,
"learning_rate": 7.176785714285714e-05,
"loss": 1.6099,
"step": 1790
},
{
"epoch": 3.1,
"grad_norm": 1.5016844272613525,
"learning_rate": 7.158928571428571e-05,
"loss": 1.634,
"step": 1800
},
{
"epoch": 3.12,
"grad_norm": 1.6958973407745361,
"learning_rate": 7.141071428571428e-05,
"loss": 1.5694,
"step": 1810
},
{
"epoch": 3.14,
"grad_norm": 1.6577497720718384,
"learning_rate": 7.123214285714286e-05,
"loss": 1.6145,
"step": 1820
},
{
"epoch": 3.15,
"grad_norm": 1.8398799896240234,
"learning_rate": 7.105357142857142e-05,
"loss": 1.5634,
"step": 1830
},
{
"epoch": 3.17,
"grad_norm": 1.901779055595398,
"learning_rate": 7.0875e-05,
"loss": 1.552,
"step": 1840
},
{
"epoch": 3.19,
"grad_norm": 1.464153528213501,
"learning_rate": 7.069642857142858e-05,
"loss": 1.5429,
"step": 1850
},
{
"epoch": 3.2,
"grad_norm": 1.6301567554473877,
"learning_rate": 7.051785714285714e-05,
"loss": 1.5626,
"step": 1860
},
{
"epoch": 3.22,
"grad_norm": 2.9975762367248535,
"learning_rate": 7.033928571428572e-05,
"loss": 1.5484,
"step": 1870
},
{
"epoch": 3.24,
"grad_norm": 1.6475188732147217,
"learning_rate": 7.01607142857143e-05,
"loss": 1.5073,
"step": 1880
},
{
"epoch": 3.26,
"grad_norm": 1.5309525728225708,
"learning_rate": 6.998214285714286e-05,
"loss": 1.5285,
"step": 1890
},
{
"epoch": 3.27,
"grad_norm": 1.5738170146942139,
"learning_rate": 6.980357142857144e-05,
"loss": 1.5845,
"step": 1900
},
{
"epoch": 3.29,
"grad_norm": 1.40895414352417,
"learning_rate": 6.962500000000001e-05,
"loss": 1.5577,
"step": 1910
},
{
"epoch": 3.31,
"grad_norm": 1.4286799430847168,
"learning_rate": 6.944642857142858e-05,
"loss": 1.576,
"step": 1920
},
{
"epoch": 3.32,
"grad_norm": 1.398916482925415,
"learning_rate": 6.926785714285715e-05,
"loss": 1.5434,
"step": 1930
},
{
"epoch": 3.34,
"grad_norm": 1.5930207967758179,
"learning_rate": 6.908928571428573e-05,
"loss": 1.5388,
"step": 1940
},
{
"epoch": 3.36,
"grad_norm": 1.4278233051300049,
"learning_rate": 6.891071428571428e-05,
"loss": 1.4956,
"step": 1950
},
{
"epoch": 3.38,
"grad_norm": 1.7049891948699951,
"learning_rate": 6.873214285714286e-05,
"loss": 1.5466,
"step": 1960
},
{
"epoch": 3.39,
"grad_norm": 1.6808096170425415,
"learning_rate": 6.855357142857143e-05,
"loss": 1.548,
"step": 1970
},
{
"epoch": 3.41,
"grad_norm": 1.5364686250686646,
"learning_rate": 6.8375e-05,
"loss": 1.5668,
"step": 1980
},
{
"epoch": 3.43,
"grad_norm": 1.7469326257705688,
"learning_rate": 6.819642857142857e-05,
"loss": 1.5352,
"step": 1990
},
{
"epoch": 3.45,
"grad_norm": 1.5400209426879883,
"learning_rate": 6.801785714285715e-05,
"loss": 1.5633,
"step": 2000
},
{
"epoch": 3.46,
"grad_norm": 1.4586174488067627,
"learning_rate": 6.783928571428571e-05,
"loss": 1.542,
"step": 2010
},
{
"epoch": 3.48,
"grad_norm": 1.5306270122528076,
"learning_rate": 6.766071428571429e-05,
"loss": 1.5397,
"step": 2020
},
{
"epoch": 3.5,
"grad_norm": 1.5310778617858887,
"learning_rate": 6.748214285714287e-05,
"loss": 1.5251,
"step": 2030
},
{
"epoch": 3.51,
"grad_norm": 1.666965126991272,
"learning_rate": 6.730357142857143e-05,
"loss": 1.5171,
"step": 2040
},
{
"epoch": 3.53,
"grad_norm": 1.6258907318115234,
"learning_rate": 6.7125e-05,
"loss": 1.5201,
"step": 2050
},
{
"epoch": 3.55,
"grad_norm": 2.181856393814087,
"learning_rate": 6.694642857142858e-05,
"loss": 1.5404,
"step": 2060
},
{
"epoch": 3.57,
"grad_norm": 1.4031646251678467,
"learning_rate": 6.676785714285715e-05,
"loss": 1.5207,
"step": 2070
},
{
"epoch": 3.58,
"grad_norm": 1.7784829139709473,
"learning_rate": 6.658928571428572e-05,
"loss": 1.5642,
"step": 2080
},
{
"epoch": 3.6,
"grad_norm": 1.5500433444976807,
"learning_rate": 6.64107142857143e-05,
"loss": 1.5823,
"step": 2090
},
{
"epoch": 3.62,
"grad_norm": 1.6340274810791016,
"learning_rate": 6.623214285714286e-05,
"loss": 1.5667,
"step": 2100
},
{
"epoch": 3.63,
"grad_norm": 1.6798614263534546,
"learning_rate": 6.605357142857144e-05,
"loss": 1.5767,
"step": 2110
},
{
"epoch": 3.65,
"grad_norm": 1.383573293685913,
"learning_rate": 6.5875e-05,
"loss": 1.5524,
"step": 2120
},
{
"epoch": 3.67,
"grad_norm": 1.5610862970352173,
"learning_rate": 6.569642857142857e-05,
"loss": 1.5256,
"step": 2130
},
{
"epoch": 3.69,
"grad_norm": 1.445110559463501,
"learning_rate": 6.551785714285714e-05,
"loss": 1.5982,
"step": 2140
},
{
"epoch": 3.7,
"grad_norm": 1.5573856830596924,
"learning_rate": 6.533928571428572e-05,
"loss": 1.5598,
"step": 2150
},
{
"epoch": 3.72,
"grad_norm": 1.5723471641540527,
"learning_rate": 6.516071428571428e-05,
"loss": 1.5462,
"step": 2160
},
{
"epoch": 3.74,
"grad_norm": 1.5293879508972168,
"learning_rate": 6.498214285714286e-05,
"loss": 1.5993,
"step": 2170
},
{
"epoch": 3.76,
"grad_norm": 1.5668715238571167,
"learning_rate": 6.480357142857144e-05,
"loss": 1.5375,
"step": 2180
},
{
"epoch": 3.77,
"grad_norm": 1.46014404296875,
"learning_rate": 6.4625e-05,
"loss": 1.5524,
"step": 2190
},
{
"epoch": 3.79,
"grad_norm": 1.6666390895843506,
"learning_rate": 6.444642857142858e-05,
"loss": 1.5131,
"step": 2200
},
{
"epoch": 3.81,
"grad_norm": 1.685606598854065,
"learning_rate": 6.426785714285715e-05,
"loss": 1.5067,
"step": 2210
},
{
"epoch": 3.82,
"grad_norm": 1.7276148796081543,
"learning_rate": 6.408928571428572e-05,
"loss": 1.5759,
"step": 2220
},
{
"epoch": 3.84,
"grad_norm": 1.4950209856033325,
"learning_rate": 6.391071428571429e-05,
"loss": 1.5587,
"step": 2230
},
{
"epoch": 3.86,
"grad_norm": 1.4240978956222534,
"learning_rate": 6.373214285714287e-05,
"loss": 1.5853,
"step": 2240
},
{
"epoch": 3.88,
"grad_norm": 1.4469460248947144,
"learning_rate": 6.355357142857143e-05,
"loss": 1.5442,
"step": 2250
},
{
"epoch": 3.89,
"grad_norm": 1.7306181192398071,
"learning_rate": 6.337500000000001e-05,
"loss": 1.5651,
"step": 2260
},
{
"epoch": 3.91,
"grad_norm": 1.3304357528686523,
"learning_rate": 6.319642857142857e-05,
"loss": 1.4652,
"step": 2270
},
{
"epoch": 3.93,
"grad_norm": 1.4111214876174927,
"learning_rate": 6.301785714285715e-05,
"loss": 1.495,
"step": 2280
},
{
"epoch": 3.94,
"grad_norm": 1.3847770690917969,
"learning_rate": 6.283928571428573e-05,
"loss": 1.4988,
"step": 2290
},
{
"epoch": 3.96,
"grad_norm": 1.7292548418045044,
"learning_rate": 6.266071428571429e-05,
"loss": 1.5777,
"step": 2300
},
{
"epoch": 3.98,
"grad_norm": 1.5064436197280884,
"learning_rate": 6.248214285714285e-05,
"loss": 1.527,
"step": 2310
},
{
"epoch": 4.0,
"grad_norm": 1.5890984535217285,
"learning_rate": 6.230357142857143e-05,
"loss": 1.484,
"step": 2320
},
{
"epoch": 4.0,
"eval_loss": 1.3862602710723877,
"eval_runtime": 189.6547,
"eval_samples_per_second": 262.271,
"eval_steps_per_second": 4.102,
"step": 2322
},
{
"epoch": 4.01,
"grad_norm": 1.3771491050720215,
"learning_rate": 6.2125e-05,
"loss": 1.4681,
"step": 2330
},
{
"epoch": 4.03,
"grad_norm": 1.4516774415969849,
"learning_rate": 6.194642857142857e-05,
"loss": 1.5396,
"step": 2340
},
{
"epoch": 4.05,
"grad_norm": 1.5169875621795654,
"learning_rate": 6.176785714285715e-05,
"loss": 1.4995,
"step": 2350
},
{
"epoch": 4.07,
"grad_norm": 1.67533540725708,
"learning_rate": 6.158928571428572e-05,
"loss": 1.5102,
"step": 2360
},
{
"epoch": 4.08,
"grad_norm": 1.8251153230667114,
"learning_rate": 6.141071428571429e-05,
"loss": 1.5588,
"step": 2370
},
{
"epoch": 4.1,
"grad_norm": 1.6662224531173706,
"learning_rate": 6.123214285714286e-05,
"loss": 1.5322,
"step": 2380
},
{
"epoch": 4.12,
"grad_norm": 1.9041415452957153,
"learning_rate": 6.105357142857143e-05,
"loss": 1.4883,
"step": 2390
},
{
"epoch": 4.13,
"grad_norm": 1.690784215927124,
"learning_rate": 6.0875e-05,
"loss": 1.4626,
"step": 2400
},
{
"epoch": 4.15,
"grad_norm": 1.4311233758926392,
"learning_rate": 6.069642857142858e-05,
"loss": 1.5406,
"step": 2410
},
{
"epoch": 4.17,
"grad_norm": 1.4492675065994263,
"learning_rate": 6.051785714285715e-05,
"loss": 1.551,
"step": 2420
},
{
"epoch": 4.19,
"grad_norm": 1.75238037109375,
"learning_rate": 6.033928571428572e-05,
"loss": 1.4758,
"step": 2430
},
{
"epoch": 4.2,
"grad_norm": 1.5559812784194946,
"learning_rate": 6.016071428571429e-05,
"loss": 1.5527,
"step": 2440
},
{
"epoch": 4.22,
"grad_norm": 1.6166861057281494,
"learning_rate": 5.9982142857142866e-05,
"loss": 1.5045,
"step": 2450
},
{
"epoch": 4.24,
"grad_norm": 1.7287530899047852,
"learning_rate": 5.9803571428571436e-05,
"loss": 1.4543,
"step": 2460
},
{
"epoch": 4.25,
"grad_norm": 1.8894718885421753,
"learning_rate": 5.9625e-05,
"loss": 1.5252,
"step": 2470
},
{
"epoch": 4.27,
"grad_norm": 1.577805757522583,
"learning_rate": 5.944642857142857e-05,
"loss": 1.4759,
"step": 2480
},
{
"epoch": 4.29,
"grad_norm": 1.4729737043380737,
"learning_rate": 5.926785714285714e-05,
"loss": 1.5391,
"step": 2490
},
{
"epoch": 4.31,
"grad_norm": 1.320513129234314,
"learning_rate": 5.9089285714285716e-05,
"loss": 1.4658,
"step": 2500
},
{
"epoch": 4.32,
"grad_norm": 1.5098971128463745,
"learning_rate": 5.8910714285714286e-05,
"loss": 1.4565,
"step": 2510
},
{
"epoch": 4.34,
"grad_norm": 1.41256844997406,
"learning_rate": 5.8732142857142856e-05,
"loss": 1.5108,
"step": 2520
},
{
"epoch": 4.36,
"grad_norm": 1.56814444065094,
"learning_rate": 5.855357142857143e-05,
"loss": 1.5464,
"step": 2530
},
{
"epoch": 4.38,
"grad_norm": 1.4572267532348633,
"learning_rate": 5.8375e-05,
"loss": 1.5219,
"step": 2540
},
{
"epoch": 4.39,
"grad_norm": 1.4478589296340942,
"learning_rate": 5.819642857142857e-05,
"loss": 1.5104,
"step": 2550
},
{
"epoch": 4.41,
"grad_norm": 1.3739742040634155,
"learning_rate": 5.801785714285714e-05,
"loss": 1.4983,
"step": 2560
},
{
"epoch": 4.43,
"grad_norm": 1.5668545961380005,
"learning_rate": 5.783928571428572e-05,
"loss": 1.4455,
"step": 2570
},
{
"epoch": 4.44,
"grad_norm": 1.5617165565490723,
"learning_rate": 5.766071428571429e-05,
"loss": 1.5091,
"step": 2580
},
{
"epoch": 4.46,
"grad_norm": 1.710605502128601,
"learning_rate": 5.748214285714286e-05,
"loss": 1.4836,
"step": 2590
},
{
"epoch": 4.48,
"grad_norm": 1.5531827211380005,
"learning_rate": 5.7303571428571436e-05,
"loss": 1.4712,
"step": 2600
},
{
"epoch": 4.5,
"grad_norm": 1.9922209978103638,
"learning_rate": 5.7125000000000006e-05,
"loss": 1.4905,
"step": 2610
},
{
"epoch": 4.51,
"grad_norm": 1.573633074760437,
"learning_rate": 5.6946428571428576e-05,
"loss": 1.4924,
"step": 2620
},
{
"epoch": 4.53,
"grad_norm": 1.5242887735366821,
"learning_rate": 5.676785714285715e-05,
"loss": 1.4651,
"step": 2630
},
{
"epoch": 4.55,
"grad_norm": 1.4906481504440308,
"learning_rate": 5.658928571428572e-05,
"loss": 1.4601,
"step": 2640
},
{
"epoch": 4.57,
"grad_norm": 1.7079437971115112,
"learning_rate": 5.6410714285714286e-05,
"loss": 1.5236,
"step": 2650
},
{
"epoch": 4.58,
"grad_norm": 1.482407569885254,
"learning_rate": 5.6232142857142856e-05,
"loss": 1.4854,
"step": 2660
},
{
"epoch": 4.6,
"grad_norm": 1.4641469717025757,
"learning_rate": 5.6053571428571426e-05,
"loss": 1.5994,
"step": 2670
},
{
"epoch": 4.62,
"grad_norm": 1.6065988540649414,
"learning_rate": 5.5875e-05,
"loss": 1.5009,
"step": 2680
},
{
"epoch": 4.63,
"grad_norm": 1.6542418003082275,
"learning_rate": 5.569642857142857e-05,
"loss": 1.4303,
"step": 2690
},
{
"epoch": 4.65,
"grad_norm": 2.0338478088378906,
"learning_rate": 5.551785714285714e-05,
"loss": 1.5283,
"step": 2700
},
{
"epoch": 4.67,
"grad_norm": 1.4684573411941528,
"learning_rate": 5.533928571428571e-05,
"loss": 1.4597,
"step": 2710
},
{
"epoch": 4.69,
"grad_norm": 1.562809944152832,
"learning_rate": 5.516071428571429e-05,
"loss": 1.4543,
"step": 2720
},
{
"epoch": 4.7,
"grad_norm": 1.5070769786834717,
"learning_rate": 5.498214285714286e-05,
"loss": 1.4618,
"step": 2730
},
{
"epoch": 4.72,
"grad_norm": 1.9504728317260742,
"learning_rate": 5.480357142857143e-05,
"loss": 1.525,
"step": 2740
},
{
"epoch": 4.74,
"grad_norm": 1.7092479467391968,
"learning_rate": 5.4625000000000006e-05,
"loss": 1.4626,
"step": 2750
},
{
"epoch": 4.75,
"grad_norm": 1.3619625568389893,
"learning_rate": 5.4446428571428576e-05,
"loss": 1.4583,
"step": 2760
},
{
"epoch": 4.77,
"grad_norm": 1.6583805084228516,
"learning_rate": 5.4267857142857146e-05,
"loss": 1.4683,
"step": 2770
},
{
"epoch": 4.79,
"grad_norm": 1.3457785844802856,
"learning_rate": 5.408928571428572e-05,
"loss": 1.442,
"step": 2780
},
{
"epoch": 4.81,
"grad_norm": 1.5891212224960327,
"learning_rate": 5.391071428571429e-05,
"loss": 1.4551,
"step": 2790
},
{
"epoch": 4.82,
"grad_norm": 1.5150017738342285,
"learning_rate": 5.373214285714286e-05,
"loss": 1.4816,
"step": 2800
},
{
"epoch": 4.84,
"grad_norm": 1.5407203435897827,
"learning_rate": 5.355357142857143e-05,
"loss": 1.4252,
"step": 2810
},
{
"epoch": 4.86,
"grad_norm": 1.6189481019973755,
"learning_rate": 5.3374999999999996e-05,
"loss": 1.4591,
"step": 2820
},
{
"epoch": 4.88,
"grad_norm": 1.7718898057937622,
"learning_rate": 5.3196428571428566e-05,
"loss": 1.5234,
"step": 2830
},
{
"epoch": 4.89,
"grad_norm": 1.4359188079833984,
"learning_rate": 5.301785714285714e-05,
"loss": 1.5157,
"step": 2840
},
{
"epoch": 4.91,
"grad_norm": 1.6485284566879272,
"learning_rate": 5.283928571428571e-05,
"loss": 1.4695,
"step": 2850
},
{
"epoch": 4.93,
"grad_norm": 1.391932487487793,
"learning_rate": 5.266071428571428e-05,
"loss": 1.4815,
"step": 2860
},
{
"epoch": 4.94,
"grad_norm": 1.486312747001648,
"learning_rate": 5.248214285714286e-05,
"loss": 1.4879,
"step": 2870
},
{
"epoch": 4.96,
"grad_norm": 1.693805456161499,
"learning_rate": 5.230357142857143e-05,
"loss": 1.5132,
"step": 2880
},
{
"epoch": 4.98,
"grad_norm": 1.6018401384353638,
"learning_rate": 5.2125e-05,
"loss": 1.4547,
"step": 2890
},
{
"epoch": 5.0,
"grad_norm": 1.6688446998596191,
"learning_rate": 5.1946428571428577e-05,
"loss": 1.5073,
"step": 2900
},
{
"epoch": 5.0,
"eval_loss": 1.3449677228927612,
"eval_runtime": 189.8982,
"eval_samples_per_second": 261.935,
"eval_steps_per_second": 4.097,
"step": 2902
},
{
"epoch": 5.01,
"grad_norm": 1.5247100591659546,
"learning_rate": 5.1767857142857147e-05,
"loss": 1.4935,
"step": 2910
},
{
"epoch": 5.03,
"grad_norm": 2.0261752605438232,
"learning_rate": 5.1589285714285717e-05,
"loss": 1.4976,
"step": 2920
},
{
"epoch": 5.05,
"grad_norm": 1.5454976558685303,
"learning_rate": 5.1410714285714287e-05,
"loss": 1.3959,
"step": 2930
},
{
"epoch": 5.06,
"grad_norm": 1.5475680828094482,
"learning_rate": 5.123214285714286e-05,
"loss": 1.4942,
"step": 2940
},
{
"epoch": 5.08,
"grad_norm": 1.5716584920883179,
"learning_rate": 5.105357142857143e-05,
"loss": 1.4271,
"step": 2950
},
{
"epoch": 5.1,
"grad_norm": 1.6714199781417847,
"learning_rate": 5.0875e-05,
"loss": 1.4526,
"step": 2960
},
{
"epoch": 5.12,
"grad_norm": 1.7494091987609863,
"learning_rate": 5.069642857142858e-05,
"loss": 1.4616,
"step": 2970
},
{
"epoch": 5.13,
"grad_norm": 1.6469745635986328,
"learning_rate": 5.051785714285715e-05,
"loss": 1.4462,
"step": 2980
},
{
"epoch": 5.15,
"grad_norm": 1.491341233253479,
"learning_rate": 5.033928571428572e-05,
"loss": 1.4449,
"step": 2990
},
{
"epoch": 5.17,
"grad_norm": 1.5558992624282837,
"learning_rate": 5.016071428571428e-05,
"loss": 1.4606,
"step": 3000
},
{
"epoch": 5.19,
"grad_norm": 1.6068426370620728,
"learning_rate": 4.998214285714286e-05,
"loss": 1.4342,
"step": 3010
},
{
"epoch": 5.2,
"grad_norm": 1.4941500425338745,
"learning_rate": 4.980357142857143e-05,
"loss": 1.4106,
"step": 3020
},
{
"epoch": 5.22,
"grad_norm": 1.8874638080596924,
"learning_rate": 4.962500000000001e-05,
"loss": 1.4428,
"step": 3030
},
{
"epoch": 5.24,
"grad_norm": 1.3385238647460938,
"learning_rate": 4.944642857142857e-05,
"loss": 1.4638,
"step": 3040
},
{
"epoch": 5.25,
"grad_norm": 2.0951528549194336,
"learning_rate": 4.926785714285714e-05,
"loss": 1.4332,
"step": 3050
},
{
"epoch": 5.27,
"grad_norm": 1.492944359779358,
"learning_rate": 4.908928571428572e-05,
"loss": 1.4347,
"step": 3060
},
{
"epoch": 5.29,
"grad_norm": 1.5473136901855469,
"learning_rate": 4.891071428571429e-05,
"loss": 1.4416,
"step": 3070
},
{
"epoch": 5.31,
"grad_norm": 1.602656602859497,
"learning_rate": 4.873214285714286e-05,
"loss": 1.4227,
"step": 3080
},
{
"epoch": 5.32,
"grad_norm": 1.6464648246765137,
"learning_rate": 4.8553571428571434e-05,
"loss": 1.4306,
"step": 3090
},
{
"epoch": 5.34,
"grad_norm": 1.6388453245162964,
"learning_rate": 4.8375000000000004e-05,
"loss": 1.4303,
"step": 3100
},
{
"epoch": 5.36,
"grad_norm": 1.6323235034942627,
"learning_rate": 4.8196428571428574e-05,
"loss": 1.4782,
"step": 3110
},
{
"epoch": 5.37,
"grad_norm": 1.5395444631576538,
"learning_rate": 4.801785714285715e-05,
"loss": 1.4795,
"step": 3120
},
{
"epoch": 5.39,
"grad_norm": 1.4868383407592773,
"learning_rate": 4.7839285714285714e-05,
"loss": 1.471,
"step": 3130
},
{
"epoch": 5.41,
"grad_norm": 1.5986500978469849,
"learning_rate": 4.7660714285714284e-05,
"loss": 1.4955,
"step": 3140
},
{
"epoch": 5.43,
"grad_norm": 1.6404626369476318,
"learning_rate": 4.748214285714286e-05,
"loss": 1.499,
"step": 3150
},
{
"epoch": 5.44,
"grad_norm": 1.483614444732666,
"learning_rate": 4.730357142857143e-05,
"loss": 1.4403,
"step": 3160
},
{
"epoch": 5.46,
"grad_norm": 1.6691210269927979,
"learning_rate": 4.7125e-05,
"loss": 1.4795,
"step": 3170
},
{
"epoch": 5.48,
"grad_norm": 1.4705153703689575,
"learning_rate": 4.694642857142858e-05,
"loss": 1.4726,
"step": 3180
},
{
"epoch": 5.5,
"grad_norm": 1.6572233438491821,
"learning_rate": 4.676785714285715e-05,
"loss": 1.4638,
"step": 3190
},
{
"epoch": 5.51,
"grad_norm": 1.472894549369812,
"learning_rate": 4.658928571428572e-05,
"loss": 1.4727,
"step": 3200
},
{
"epoch": 5.53,
"grad_norm": 1.4686111211776733,
"learning_rate": 4.6410714285714294e-05,
"loss": 1.47,
"step": 3210
},
{
"epoch": 5.55,
"grad_norm": 1.5975010395050049,
"learning_rate": 4.623214285714286e-05,
"loss": 1.4248,
"step": 3220
},
{
"epoch": 5.56,
"grad_norm": 1.6790850162506104,
"learning_rate": 4.605357142857143e-05,
"loss": 1.5178,
"step": 3230
},
{
"epoch": 5.58,
"grad_norm": 1.5130410194396973,
"learning_rate": 4.5875000000000004e-05,
"loss": 1.4056,
"step": 3240
},
{
"epoch": 5.6,
"grad_norm": 1.5177439451217651,
"learning_rate": 4.5696428571428574e-05,
"loss": 1.5222,
"step": 3250
},
{
"epoch": 5.62,
"grad_norm": 1.5346488952636719,
"learning_rate": 4.5517857142857144e-05,
"loss": 1.4468,
"step": 3260
},
{
"epoch": 5.63,
"grad_norm": 1.5213022232055664,
"learning_rate": 4.533928571428572e-05,
"loss": 1.4405,
"step": 3270
},
{
"epoch": 5.65,
"grad_norm": 1.5120964050292969,
"learning_rate": 4.516071428571429e-05,
"loss": 1.4897,
"step": 3280
},
{
"epoch": 5.67,
"grad_norm": 1.7035776376724243,
"learning_rate": 4.498214285714286e-05,
"loss": 1.4612,
"step": 3290
},
{
"epoch": 5.68,
"grad_norm": 1.5996119976043701,
"learning_rate": 4.480357142857143e-05,
"loss": 1.457,
"step": 3300
},
{
"epoch": 5.7,
"grad_norm": 1.5173403024673462,
"learning_rate": 4.4625e-05,
"loss": 1.4775,
"step": 3310
},
{
"epoch": 5.72,
"grad_norm": 1.7526136636734009,
"learning_rate": 4.444642857142857e-05,
"loss": 1.4666,
"step": 3320
},
{
"epoch": 5.74,
"grad_norm": 1.353821873664856,
"learning_rate": 4.426785714285715e-05,
"loss": 1.4534,
"step": 3330
},
{
"epoch": 5.75,
"grad_norm": 1.5952380895614624,
"learning_rate": 4.408928571428572e-05,
"loss": 1.4622,
"step": 3340
},
{
"epoch": 5.77,
"grad_norm": 1.5540388822555542,
"learning_rate": 4.391071428571429e-05,
"loss": 1.4302,
"step": 3350
},
{
"epoch": 5.79,
"grad_norm": 1.5415778160095215,
"learning_rate": 4.373214285714286e-05,
"loss": 1.4521,
"step": 3360
},
{
"epoch": 5.81,
"grad_norm": 1.7444086074829102,
"learning_rate": 4.3553571428571434e-05,
"loss": 1.4728,
"step": 3370
},
{
"epoch": 5.82,
"grad_norm": 1.6915000677108765,
"learning_rate": 4.3375000000000004e-05,
"loss": 1.4637,
"step": 3380
},
{
"epoch": 5.84,
"grad_norm": 1.4718912839889526,
"learning_rate": 4.3196428571428574e-05,
"loss": 1.4875,
"step": 3390
},
{
"epoch": 5.86,
"grad_norm": 1.4398149251937866,
"learning_rate": 4.3017857142857144e-05,
"loss": 1.4283,
"step": 3400
},
{
"epoch": 5.87,
"grad_norm": 1.5392788648605347,
"learning_rate": 4.2839285714285714e-05,
"loss": 1.4407,
"step": 3410
},
{
"epoch": 5.89,
"grad_norm": 1.7398900985717773,
"learning_rate": 4.2660714285714284e-05,
"loss": 1.435,
"step": 3420
},
{
"epoch": 5.91,
"grad_norm": 1.6999260187149048,
"learning_rate": 4.248214285714286e-05,
"loss": 1.4778,
"step": 3430
},
{
"epoch": 5.93,
"grad_norm": 1.5439287424087524,
"learning_rate": 4.230357142857143e-05,
"loss": 1.4389,
"step": 3440
},
{
"epoch": 5.94,
"grad_norm": 1.4627364873886108,
"learning_rate": 4.2125e-05,
"loss": 1.4618,
"step": 3450
},
{
"epoch": 5.96,
"grad_norm": 2.0613064765930176,
"learning_rate": 4.194642857142858e-05,
"loss": 1.4854,
"step": 3460
},
{
"epoch": 5.98,
"grad_norm": 1.6311384439468384,
"learning_rate": 4.176785714285715e-05,
"loss": 1.4269,
"step": 3470
},
{
"epoch": 5.99,
"grad_norm": 1.7284331321716309,
"learning_rate": 4.158928571428571e-05,
"loss": 1.429,
"step": 3480
},
{
"epoch": 6.0,
"eval_loss": 1.3424557447433472,
"eval_runtime": 189.4806,
"eval_samples_per_second": 262.512,
"eval_steps_per_second": 4.106,
"step": 3483
},
{
"epoch": 6.01,
"grad_norm": 1.6682264804840088,
"learning_rate": 4.141071428571429e-05,
"loss": 1.474,
"step": 3490
},
{
"epoch": 6.03,
"grad_norm": 1.5376884937286377,
"learning_rate": 4.123214285714286e-05,
"loss": 1.3874,
"step": 3500
},
{
"epoch": 6.05,
"grad_norm": 1.576881766319275,
"learning_rate": 4.105357142857143e-05,
"loss": 1.417,
"step": 3510
},
{
"epoch": 6.06,
"grad_norm": 1.5996309518814087,
"learning_rate": 4.0875000000000004e-05,
"loss": 1.4542,
"step": 3520
},
{
"epoch": 6.08,
"grad_norm": 1.3949395418167114,
"learning_rate": 4.0696428571428574e-05,
"loss": 1.4481,
"step": 3530
},
{
"epoch": 6.1,
"grad_norm": 1.6811378002166748,
"learning_rate": 4.0517857142857144e-05,
"loss": 1.4005,
"step": 3540
},
{
"epoch": 6.12,
"grad_norm": 1.4827687740325928,
"learning_rate": 4.033928571428572e-05,
"loss": 1.4587,
"step": 3550
},
{
"epoch": 6.13,
"grad_norm": 1.536258339881897,
"learning_rate": 4.016071428571429e-05,
"loss": 1.422,
"step": 3560
},
{
"epoch": 6.15,
"grad_norm": 1.7262201309204102,
"learning_rate": 3.9982142857142854e-05,
"loss": 1.4467,
"step": 3570
},
{
"epoch": 6.17,
"grad_norm": 1.6241017580032349,
"learning_rate": 3.980357142857143e-05,
"loss": 1.3993,
"step": 3580
},
{
"epoch": 6.18,
"grad_norm": 1.7238517999649048,
"learning_rate": 3.9625e-05,
"loss": 1.4649,
"step": 3590
},
{
"epoch": 6.2,
"grad_norm": 1.6746817827224731,
"learning_rate": 3.944642857142857e-05,
"loss": 1.4625,
"step": 3600
},
{
"epoch": 6.22,
"grad_norm": 1.5277612209320068,
"learning_rate": 3.926785714285715e-05,
"loss": 1.4718,
"step": 3610
},
{
"epoch": 6.24,
"grad_norm": 1.6776617765426636,
"learning_rate": 3.908928571428572e-05,
"loss": 1.453,
"step": 3620
},
{
"epoch": 6.25,
"grad_norm": 1.5777333974838257,
"learning_rate": 3.891071428571429e-05,
"loss": 1.4104,
"step": 3630
},
{
"epoch": 6.27,
"grad_norm": 1.463641881942749,
"learning_rate": 3.8732142857142864e-05,
"loss": 1.398,
"step": 3640
},
{
"epoch": 6.29,
"grad_norm": 1.3779195547103882,
"learning_rate": 3.855357142857143e-05,
"loss": 1.4463,
"step": 3650
},
{
"epoch": 6.3,
"grad_norm": 1.40390145778656,
"learning_rate": 3.8375e-05,
"loss": 1.4361,
"step": 3660
},
{
"epoch": 6.32,
"grad_norm": 1.5744775533676147,
"learning_rate": 3.8196428571428574e-05,
"loss": 1.535,
"step": 3670
},
{
"epoch": 6.34,
"grad_norm": 1.6865484714508057,
"learning_rate": 3.8017857142857144e-05,
"loss": 1.409,
"step": 3680
},
{
"epoch": 6.36,
"grad_norm": 1.6662002801895142,
"learning_rate": 3.7839285714285714e-05,
"loss": 1.4088,
"step": 3690
},
{
"epoch": 6.37,
"grad_norm": 1.3653278350830078,
"learning_rate": 3.766071428571429e-05,
"loss": 1.4066,
"step": 3700
},
{
"epoch": 6.39,
"grad_norm": 1.48214590549469,
"learning_rate": 3.748214285714286e-05,
"loss": 1.4269,
"step": 3710
},
{
"epoch": 6.41,
"grad_norm": 1.4544599056243896,
"learning_rate": 3.730357142857143e-05,
"loss": 1.443,
"step": 3720
},
{
"epoch": 6.43,
"grad_norm": 1.586845874786377,
"learning_rate": 3.7125e-05,
"loss": 1.3706,
"step": 3730
},
{
"epoch": 6.44,
"grad_norm": 1.6445093154907227,
"learning_rate": 3.694642857142857e-05,
"loss": 1.4412,
"step": 3740
},
{
"epoch": 6.46,
"grad_norm": 1.6151032447814941,
"learning_rate": 3.676785714285714e-05,
"loss": 1.4699,
"step": 3750
},
{
"epoch": 6.48,
"grad_norm": 1.366557240486145,
"learning_rate": 3.658928571428572e-05,
"loss": 1.4603,
"step": 3760
},
{
"epoch": 6.49,
"grad_norm": 1.6722807884216309,
"learning_rate": 3.641071428571429e-05,
"loss": 1.4367,
"step": 3770
},
{
"epoch": 6.51,
"grad_norm": 1.5589567422866821,
"learning_rate": 3.623214285714286e-05,
"loss": 1.4129,
"step": 3780
},
{
"epoch": 6.53,
"grad_norm": 2.4718008041381836,
"learning_rate": 3.605357142857143e-05,
"loss": 1.4271,
"step": 3790
},
{
"epoch": 6.55,
"grad_norm": 1.5975865125656128,
"learning_rate": 3.5875000000000005e-05,
"loss": 1.4603,
"step": 3800
},
{
"epoch": 6.56,
"grad_norm": 1.9084751605987549,
"learning_rate": 3.5696428571428575e-05,
"loss": 1.4539,
"step": 3810
},
{
"epoch": 6.58,
"grad_norm": 1.7772917747497559,
"learning_rate": 3.5517857142857145e-05,
"loss": 1.4513,
"step": 3820
},
{
"epoch": 6.6,
"grad_norm": 1.4100738763809204,
"learning_rate": 3.5339285714285715e-05,
"loss": 1.4466,
"step": 3830
},
{
"epoch": 6.61,
"grad_norm": 1.5141396522521973,
"learning_rate": 3.5160714285714285e-05,
"loss": 1.4322,
"step": 3840
},
{
"epoch": 6.63,
"grad_norm": 1.5596503019332886,
"learning_rate": 3.4982142857142855e-05,
"loss": 1.4606,
"step": 3850
},
{
"epoch": 6.65,
"grad_norm": 1.6146564483642578,
"learning_rate": 3.480357142857143e-05,
"loss": 1.4204,
"step": 3860
},
{
"epoch": 6.67,
"grad_norm": 1.6012805700302124,
"learning_rate": 3.4625e-05,
"loss": 1.4149,
"step": 3870
},
{
"epoch": 6.68,
"grad_norm": 1.5628446340560913,
"learning_rate": 3.444642857142857e-05,
"loss": 1.4213,
"step": 3880
},
{
"epoch": 6.7,
"grad_norm": 1.6580945253372192,
"learning_rate": 3.426785714285715e-05,
"loss": 1.4544,
"step": 3890
},
{
"epoch": 6.72,
"grad_norm": 1.4234137535095215,
"learning_rate": 3.408928571428572e-05,
"loss": 1.4465,
"step": 3900
},
{
"epoch": 6.74,
"grad_norm": 1.4981999397277832,
"learning_rate": 3.391071428571429e-05,
"loss": 1.4373,
"step": 3910
},
{
"epoch": 6.75,
"grad_norm": 1.748923659324646,
"learning_rate": 3.373214285714286e-05,
"loss": 1.4339,
"step": 3920
},
{
"epoch": 6.77,
"grad_norm": 1.7327990531921387,
"learning_rate": 3.355357142857143e-05,
"loss": 1.4495,
"step": 3930
},
{
"epoch": 6.79,
"grad_norm": 1.5327931642532349,
"learning_rate": 3.3375e-05,
"loss": 1.3951,
"step": 3940
},
{
"epoch": 6.8,
"grad_norm": 1.5292168855667114,
"learning_rate": 3.3196428571428575e-05,
"loss": 1.4341,
"step": 3950
},
{
"epoch": 6.82,
"grad_norm": 1.740376353263855,
"learning_rate": 3.3017857142857145e-05,
"loss": 1.4452,
"step": 3960
},
{
"epoch": 6.84,
"grad_norm": 1.5825227499008179,
"learning_rate": 3.2839285714285715e-05,
"loss": 1.4078,
"step": 3970
},
{
"epoch": 6.86,
"grad_norm": 1.5701614618301392,
"learning_rate": 3.266071428571429e-05,
"loss": 1.4145,
"step": 3980
},
{
"epoch": 6.87,
"grad_norm": 1.3951566219329834,
"learning_rate": 3.248214285714286e-05,
"loss": 1.4225,
"step": 3990
},
{
"epoch": 6.89,
"grad_norm": 1.8979390859603882,
"learning_rate": 3.2303571428571425e-05,
"loss": 1.4466,
"step": 4000
},
{
"epoch": 6.91,
"grad_norm": 1.4181392192840576,
"learning_rate": 3.2125e-05,
"loss": 1.4511,
"step": 4010
},
{
"epoch": 6.93,
"grad_norm": 1.6311503648757935,
"learning_rate": 3.194642857142857e-05,
"loss": 1.4165,
"step": 4020
},
{
"epoch": 6.94,
"grad_norm": 1.5814690589904785,
"learning_rate": 3.176785714285714e-05,
"loss": 1.4785,
"step": 4030
},
{
"epoch": 6.96,
"grad_norm": 1.6128079891204834,
"learning_rate": 3.158928571428572e-05,
"loss": 1.3793,
"step": 4040
},
{
"epoch": 6.98,
"grad_norm": 1.596709132194519,
"learning_rate": 3.141071428571429e-05,
"loss": 1.4796,
"step": 4050
},
{
"epoch": 6.99,
"grad_norm": 1.621424913406372,
"learning_rate": 3.123214285714286e-05,
"loss": 1.4363,
"step": 4060
},
{
"epoch": 7.0,
"eval_loss": 1.3302702903747559,
"eval_runtime": 188.6986,
"eval_samples_per_second": 263.6,
"eval_steps_per_second": 4.123,
"step": 4063
},
{
"epoch": 7.01,
"grad_norm": 1.708495855331421,
"learning_rate": 3.1053571428571435e-05,
"loss": 1.4038,
"step": 4070
},
{
"epoch": 7.03,
"grad_norm": 1.4852769374847412,
"learning_rate": 3.0875000000000005e-05,
"loss": 1.3993,
"step": 4080
},
{
"epoch": 7.05,
"grad_norm": 1.498517394065857,
"learning_rate": 3.069642857142857e-05,
"loss": 1.4277,
"step": 4090
},
{
"epoch": 7.06,
"grad_norm": 1.4383912086486816,
"learning_rate": 3.0517857142857145e-05,
"loss": 1.4055,
"step": 4100
},
{
"epoch": 7.08,
"grad_norm": 1.4877851009368896,
"learning_rate": 3.0339285714285715e-05,
"loss": 1.418,
"step": 4110
},
{
"epoch": 7.1,
"grad_norm": 1.5312427282333374,
"learning_rate": 3.0160714285714285e-05,
"loss": 1.4098,
"step": 4120
},
{
"epoch": 7.11,
"grad_norm": 1.8560508489608765,
"learning_rate": 2.998214285714286e-05,
"loss": 1.4316,
"step": 4130
},
{
"epoch": 7.13,
"grad_norm": 1.5068204402923584,
"learning_rate": 2.9803571428571432e-05,
"loss": 1.4095,
"step": 4140
},
{
"epoch": 7.15,
"grad_norm": 1.7784984111785889,
"learning_rate": 2.9625000000000002e-05,
"loss": 1.4067,
"step": 4150
},
{
"epoch": 7.17,
"grad_norm": 1.5669249296188354,
"learning_rate": 2.9446428571428575e-05,
"loss": 1.4246,
"step": 4160
},
{
"epoch": 7.18,
"grad_norm": 1.4671744108200073,
"learning_rate": 2.9267857142857145e-05,
"loss": 1.4362,
"step": 4170
},
{
"epoch": 7.2,
"grad_norm": 1.578516960144043,
"learning_rate": 2.9089285714285712e-05,
"loss": 1.4581,
"step": 4180
},
{
"epoch": 7.22,
"grad_norm": 1.4771876335144043,
"learning_rate": 2.8910714285714285e-05,
"loss": 1.4719,
"step": 4190
},
{
"epoch": 7.24,
"grad_norm": 1.8852756023406982,
"learning_rate": 2.873214285714286e-05,
"loss": 1.4644,
"step": 4200
},
{
"epoch": 7.25,
"grad_norm": 1.6869423389434814,
"learning_rate": 2.855357142857143e-05,
"loss": 1.4133,
"step": 4210
},
{
"epoch": 7.27,
"grad_norm": 1.7301143407821655,
"learning_rate": 2.8375000000000002e-05,
"loss": 1.3952,
"step": 4220
},
{
"epoch": 7.29,
"grad_norm": 1.7250717878341675,
"learning_rate": 2.8196428571428575e-05,
"loss": 1.3988,
"step": 4230
},
{
"epoch": 7.3,
"grad_norm": 1.6003575325012207,
"learning_rate": 2.8017857142857145e-05,
"loss": 1.4233,
"step": 4240
},
{
"epoch": 7.32,
"grad_norm": 1.4987831115722656,
"learning_rate": 2.783928571428572e-05,
"loss": 1.4516,
"step": 4250
},
{
"epoch": 7.34,
"grad_norm": 1.6205782890319824,
"learning_rate": 2.766071428571429e-05,
"loss": 1.3808,
"step": 4260
},
{
"epoch": 7.36,
"grad_norm": 1.5655540227890015,
"learning_rate": 2.7482142857142855e-05,
"loss": 1.4326,
"step": 4270
},
{
"epoch": 7.37,
"grad_norm": 1.7098954916000366,
"learning_rate": 2.730357142857143e-05,
"loss": 1.4111,
"step": 4280
},
{
"epoch": 7.39,
"grad_norm": 1.6183218955993652,
"learning_rate": 2.7125000000000002e-05,
"loss": 1.4252,
"step": 4290
},
{
"epoch": 7.41,
"grad_norm": 1.493147850036621,
"learning_rate": 2.6946428571428572e-05,
"loss": 1.3851,
"step": 4300
},
{
"epoch": 7.42,
"grad_norm": 1.4206022024154663,
"learning_rate": 2.6767857142857145e-05,
"loss": 1.4265,
"step": 4310
},
{
"epoch": 7.44,
"grad_norm": 1.4952884912490845,
"learning_rate": 2.6589285714285715e-05,
"loss": 1.403,
"step": 4320
},
{
"epoch": 7.46,
"grad_norm": 1.3531843423843384,
"learning_rate": 2.641071428571429e-05,
"loss": 1.4089,
"step": 4330
},
{
"epoch": 7.48,
"grad_norm": 1.4456886053085327,
"learning_rate": 2.6232142857142862e-05,
"loss": 1.4411,
"step": 4340
},
{
"epoch": 7.49,
"grad_norm": 1.5120099782943726,
"learning_rate": 2.605357142857143e-05,
"loss": 1.394,
"step": 4350
},
{
"epoch": 7.51,
"grad_norm": 1.7234888076782227,
"learning_rate": 2.5875e-05,
"loss": 1.4327,
"step": 4360
},
{
"epoch": 7.53,
"grad_norm": 1.649519443511963,
"learning_rate": 2.5696428571428572e-05,
"loss": 1.4017,
"step": 4370
},
{
"epoch": 7.55,
"grad_norm": 1.6993837356567383,
"learning_rate": 2.5517857142857142e-05,
"loss": 1.4133,
"step": 4380
},
{
"epoch": 7.56,
"grad_norm": 1.6382921934127808,
"learning_rate": 2.5339285714285716e-05,
"loss": 1.3937,
"step": 4390
},
{
"epoch": 7.58,
"grad_norm": 1.4987329244613647,
"learning_rate": 2.516071428571429e-05,
"loss": 1.433,
"step": 4400
},
{
"epoch": 7.6,
"grad_norm": 2.0574324131011963,
"learning_rate": 2.498214285714286e-05,
"loss": 1.4461,
"step": 4410
},
{
"epoch": 7.61,
"grad_norm": 1.5586438179016113,
"learning_rate": 2.480357142857143e-05,
"loss": 1.4074,
"step": 4420
},
{
"epoch": 7.63,
"grad_norm": 1.554789662361145,
"learning_rate": 2.4625000000000002e-05,
"loss": 1.424,
"step": 4430
},
{
"epoch": 7.65,
"grad_norm": 1.6773581504821777,
"learning_rate": 2.4446428571428572e-05,
"loss": 1.422,
"step": 4440
},
{
"epoch": 7.67,
"grad_norm": 1.6803804636001587,
"learning_rate": 2.4267857142857146e-05,
"loss": 1.3963,
"step": 4450
},
{
"epoch": 7.68,
"grad_norm": 2.0189342498779297,
"learning_rate": 2.4089285714285716e-05,
"loss": 1.4,
"step": 4460
},
{
"epoch": 7.7,
"grad_norm": 1.6067453622817993,
"learning_rate": 2.3910714285714286e-05,
"loss": 1.4282,
"step": 4470
},
{
"epoch": 7.72,
"grad_norm": 1.5282777547836304,
"learning_rate": 2.373214285714286e-05,
"loss": 1.4281,
"step": 4480
},
{
"epoch": 7.73,
"grad_norm": 1.419765591621399,
"learning_rate": 2.355357142857143e-05,
"loss": 1.469,
"step": 4490
},
{
"epoch": 7.75,
"grad_norm": 1.5633577108383179,
"learning_rate": 2.3375000000000002e-05,
"loss": 1.4114,
"step": 4500
},
{
"epoch": 7.77,
"grad_norm": 1.6632983684539795,
"learning_rate": 2.3196428571428572e-05,
"loss": 1.402,
"step": 4510
},
{
"epoch": 7.79,
"grad_norm": 1.5646424293518066,
"learning_rate": 2.3017857142857142e-05,
"loss": 1.3988,
"step": 4520
},
{
"epoch": 7.8,
"grad_norm": 1.6965550184249878,
"learning_rate": 2.2839285714285716e-05,
"loss": 1.4058,
"step": 4530
},
{
"epoch": 7.82,
"grad_norm": 1.6160495281219482,
"learning_rate": 2.266071428571429e-05,
"loss": 1.3833,
"step": 4540
},
{
"epoch": 7.84,
"grad_norm": 1.6150667667388916,
"learning_rate": 2.2482142857142856e-05,
"loss": 1.3824,
"step": 4550
},
{
"epoch": 7.86,
"grad_norm": 1.4956583976745605,
"learning_rate": 2.230357142857143e-05,
"loss": 1.4208,
"step": 4560
},
{
"epoch": 7.87,
"grad_norm": 1.770931363105774,
"learning_rate": 2.2125000000000002e-05,
"loss": 1.4181,
"step": 4570
},
{
"epoch": 7.89,
"grad_norm": 1.6238857507705688,
"learning_rate": 2.1946428571428572e-05,
"loss": 1.4232,
"step": 4580
},
{
"epoch": 7.91,
"grad_norm": 1.5213741064071655,
"learning_rate": 2.1767857142857142e-05,
"loss": 1.4032,
"step": 4590
},
{
"epoch": 7.92,
"grad_norm": 1.633959412574768,
"learning_rate": 2.1589285714285716e-05,
"loss": 1.449,
"step": 4600
},
{
"epoch": 7.94,
"grad_norm": 1.4910664558410645,
"learning_rate": 2.1410714285714286e-05,
"loss": 1.409,
"step": 4610
},
{
"epoch": 7.96,
"grad_norm": 1.5691108703613281,
"learning_rate": 2.123214285714286e-05,
"loss": 1.4584,
"step": 4620
},
{
"epoch": 7.98,
"grad_norm": 1.8607861995697021,
"learning_rate": 2.105357142857143e-05,
"loss": 1.4205,
"step": 4630
},
{
"epoch": 7.99,
"grad_norm": 1.4865373373031616,
"learning_rate": 2.0875e-05,
"loss": 1.4176,
"step": 4640
},
{
"epoch": 8.0,
"eval_loss": 1.3218775987625122,
"eval_runtime": 188.3771,
"eval_samples_per_second": 264.05,
"eval_steps_per_second": 4.13,
"step": 4644
}
],
"logging_steps": 10,
"max_steps": 5800,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 1.9806249716925071e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}
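
A minimal sketch of how this file could be inspected, assuming it has been downloaded locally as trainer_state.json; the keys used (log_history, loss, eval_loss, step, best_metric, best_model_checkpoint) are exactly those appearing in the JSON above, whose layout matches the trainer_state.json written by the Hugging Face Trainer.

import json

# Sketch only: path is an assumption (file saved in the current directory).
with open("trainer_state.json") as f:
    state = json.load(f)  # Python's json accepts the bare NaN at step 1

# Per-step training loss and per-epoch eval loss, as logged in log_history.
train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best eval loss:", state["best_metric"])            # 1.3218... (epoch 8)
print("best checkpoint:", state["best_model_checkpoint"])  # checkpoint-4644
print("last logged train loss:", train_log[-1])            # (4640, 1.4176)
print("eval loss by step:", eval_log)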