lact / trainer_state.json
nferruz's picture
Upload 13 files
4a15b34
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.998499249624812,
"global_step": 4990,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 7.991983967935872e-05,
"loss": 6.0598,
"step": 5
},
{
"epoch": 0.02,
"learning_rate": 7.983967935871744e-05,
"loss": 3.2882,
"step": 10
},
{
"epoch": 0.02,
"eval_loss": 2.9581081867218018,
"eval_runtime": 30.7575,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 1.821,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 7.975951903807616e-05,
"loss": 2.8014,
"step": 15
},
{
"epoch": 0.04,
"learning_rate": 7.967935871743487e-05,
"loss": 2.5059,
"step": 20
},
{
"epoch": 0.04,
"eval_loss": 2.384403944015503,
"eval_runtime": 30.8572,
"eval_samples_per_second": 7.194,
"eval_steps_per_second": 1.815,
"step": 20
},
{
"epoch": 0.05,
"learning_rate": 7.95991983967936e-05,
"loss": 2.404,
"step": 25
},
{
"epoch": 0.06,
"learning_rate": 7.951903807615231e-05,
"loss": 2.3368,
"step": 30
},
{
"epoch": 0.06,
"eval_loss": 2.3643715381622314,
"eval_runtime": 30.9262,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 1.811,
"step": 30
},
{
"epoch": 0.07,
"learning_rate": 7.943887775551104e-05,
"loss": 2.2994,
"step": 35
},
{
"epoch": 0.08,
"learning_rate": 7.935871743486974e-05,
"loss": 2.3476,
"step": 40
},
{
"epoch": 0.08,
"eval_loss": 2.349405288696289,
"eval_runtime": 30.9436,
"eval_samples_per_second": 7.174,
"eval_steps_per_second": 1.81,
"step": 40
},
{
"epoch": 0.09,
"learning_rate": 7.927855711422847e-05,
"loss": 2.3623,
"step": 45
},
{
"epoch": 0.1,
"learning_rate": 7.919839679358717e-05,
"loss": 2.3185,
"step": 50
},
{
"epoch": 0.1,
"eval_loss": 2.369673252105713,
"eval_runtime": 30.9718,
"eval_samples_per_second": 7.168,
"eval_steps_per_second": 1.808,
"step": 50
},
{
"epoch": 0.11,
"learning_rate": 7.91182364729459e-05,
"loss": 2.433,
"step": 55
},
{
"epoch": 0.12,
"learning_rate": 7.903807615230462e-05,
"loss": 2.3468,
"step": 60
},
{
"epoch": 0.12,
"eval_loss": 2.32550048828125,
"eval_runtime": 31.009,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 1.806,
"step": 60
},
{
"epoch": 0.13,
"learning_rate": 7.895791583166334e-05,
"loss": 2.2554,
"step": 65
},
{
"epoch": 0.14,
"learning_rate": 7.887775551102205e-05,
"loss": 2.262,
"step": 70
},
{
"epoch": 0.14,
"eval_loss": 2.2511556148529053,
"eval_runtime": 31.035,
"eval_samples_per_second": 7.153,
"eval_steps_per_second": 1.804,
"step": 70
},
{
"epoch": 0.15,
"learning_rate": 7.879759519038077e-05,
"loss": 2.2116,
"step": 75
},
{
"epoch": 0.16,
"learning_rate": 7.871743486973949e-05,
"loss": 2.1646,
"step": 80
},
{
"epoch": 0.16,
"eval_loss": 2.194511651992798,
"eval_runtime": 31.0667,
"eval_samples_per_second": 7.146,
"eval_steps_per_second": 1.803,
"step": 80
},
{
"epoch": 0.17,
"learning_rate": 7.86372745490982e-05,
"loss": 2.1732,
"step": 85
},
{
"epoch": 0.18,
"learning_rate": 7.855711422845692e-05,
"loss": 2.1558,
"step": 90
},
{
"epoch": 0.18,
"eval_loss": 2.188542366027832,
"eval_runtime": 31.077,
"eval_samples_per_second": 7.144,
"eval_steps_per_second": 1.802,
"step": 90
},
{
"epoch": 0.19,
"learning_rate": 7.847695390781564e-05,
"loss": 2.2353,
"step": 95
},
{
"epoch": 0.2,
"learning_rate": 7.839679358717435e-05,
"loss": 2.1934,
"step": 100
},
{
"epoch": 0.2,
"eval_loss": 2.148287057876587,
"eval_runtime": 31.1071,
"eval_samples_per_second": 7.137,
"eval_steps_per_second": 1.8,
"step": 100
},
{
"epoch": 0.21,
"learning_rate": 7.831663326653307e-05,
"loss": 2.0947,
"step": 105
},
{
"epoch": 0.22,
"learning_rate": 7.823647294589179e-05,
"loss": 2.0855,
"step": 110
},
{
"epoch": 0.22,
"eval_loss": 2.115215301513672,
"eval_runtime": 31.1349,
"eval_samples_per_second": 7.13,
"eval_steps_per_second": 1.799,
"step": 110
},
{
"epoch": 0.23,
"learning_rate": 7.81563126252505e-05,
"loss": 2.1728,
"step": 115
},
{
"epoch": 0.24,
"learning_rate": 7.807615230460922e-05,
"loss": 2.0844,
"step": 120
},
{
"epoch": 0.24,
"eval_loss": 2.0839340686798096,
"eval_runtime": 31.1371,
"eval_samples_per_second": 7.13,
"eval_steps_per_second": 1.798,
"step": 120
},
{
"epoch": 0.25,
"learning_rate": 7.799599198396795e-05,
"loss": 2.0497,
"step": 125
},
{
"epoch": 0.26,
"learning_rate": 7.791583166332665e-05,
"loss": 2.0647,
"step": 130
},
{
"epoch": 0.26,
"eval_loss": 2.0615038871765137,
"eval_runtime": 31.1492,
"eval_samples_per_second": 7.127,
"eval_steps_per_second": 1.798,
"step": 130
},
{
"epoch": 0.27,
"learning_rate": 7.783567134268538e-05,
"loss": 2.0643,
"step": 135
},
{
"epoch": 0.28,
"learning_rate": 7.775551102204409e-05,
"loss": 1.9665,
"step": 140
},
{
"epoch": 0.28,
"eval_loss": 2.0329749584198,
"eval_runtime": 31.1654,
"eval_samples_per_second": 7.123,
"eval_steps_per_second": 1.797,
"step": 140
},
{
"epoch": 0.29,
"learning_rate": 7.767535070140282e-05,
"loss": 1.9736,
"step": 145
},
{
"epoch": 0.3,
"learning_rate": 7.759519038076152e-05,
"loss": 1.9761,
"step": 150
},
{
"epoch": 0.3,
"eval_loss": 2.0067899227142334,
"eval_runtime": 31.1584,
"eval_samples_per_second": 7.125,
"eval_steps_per_second": 1.797,
"step": 150
},
{
"epoch": 0.31,
"learning_rate": 7.751503006012025e-05,
"loss": 1.9611,
"step": 155
},
{
"epoch": 0.32,
"learning_rate": 7.743486973947897e-05,
"loss": 1.9428,
"step": 160
},
{
"epoch": 0.32,
"eval_loss": 1.9913756847381592,
"eval_runtime": 31.1769,
"eval_samples_per_second": 7.121,
"eval_steps_per_second": 1.796,
"step": 160
},
{
"epoch": 0.33,
"learning_rate": 7.735470941883769e-05,
"loss": 1.9547,
"step": 165
},
{
"epoch": 0.34,
"learning_rate": 7.72745490981964e-05,
"loss": 1.9351,
"step": 170
},
{
"epoch": 0.34,
"eval_loss": 1.9368627071380615,
"eval_runtime": 31.1785,
"eval_samples_per_second": 7.12,
"eval_steps_per_second": 1.796,
"step": 170
},
{
"epoch": 0.35,
"learning_rate": 7.719438877755512e-05,
"loss": 1.9509,
"step": 175
},
{
"epoch": 0.36,
"learning_rate": 7.711422845691384e-05,
"loss": 1.9366,
"step": 180
},
{
"epoch": 0.36,
"eval_loss": 1.9138563871383667,
"eval_runtime": 31.1927,
"eval_samples_per_second": 7.117,
"eval_steps_per_second": 1.795,
"step": 180
},
{
"epoch": 0.37,
"learning_rate": 7.703406813627255e-05,
"loss": 1.9439,
"step": 185
},
{
"epoch": 0.38,
"learning_rate": 7.695390781563127e-05,
"loss": 1.9548,
"step": 190
},
{
"epoch": 0.38,
"eval_loss": 1.8788814544677734,
"eval_runtime": 31.199,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 1.795,
"step": 190
},
{
"epoch": 0.39,
"learning_rate": 7.687374749498999e-05,
"loss": 1.9061,
"step": 195
},
{
"epoch": 0.4,
"learning_rate": 7.67935871743487e-05,
"loss": 1.9625,
"step": 200
},
{
"epoch": 0.4,
"eval_loss": 1.848646640777588,
"eval_runtime": 31.2024,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 200
},
{
"epoch": 0.41,
"learning_rate": 7.671342685370742e-05,
"loss": 1.9024,
"step": 205
},
{
"epoch": 0.42,
"learning_rate": 7.663326653306614e-05,
"loss": 1.8584,
"step": 210
},
{
"epoch": 0.42,
"eval_loss": 1.8197656869888306,
"eval_runtime": 31.2023,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 210
},
{
"epoch": 0.43,
"learning_rate": 7.655310621242485e-05,
"loss": 1.7843,
"step": 215
},
{
"epoch": 0.44,
"learning_rate": 7.647294589178357e-05,
"loss": 1.8857,
"step": 220
},
{
"epoch": 0.44,
"eval_loss": 1.8118294477462769,
"eval_runtime": 31.1985,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 1.795,
"step": 220
},
{
"epoch": 0.45,
"learning_rate": 7.639278557114229e-05,
"loss": 1.7888,
"step": 225
},
{
"epoch": 0.46,
"learning_rate": 7.6312625250501e-05,
"loss": 1.7574,
"step": 230
},
{
"epoch": 0.46,
"eval_loss": 1.760263442993164,
"eval_runtime": 31.1988,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 1.795,
"step": 230
},
{
"epoch": 0.47,
"learning_rate": 7.623246492985973e-05,
"loss": 1.7695,
"step": 235
},
{
"epoch": 0.48,
"learning_rate": 7.615230460921844e-05,
"loss": 1.8114,
"step": 240
},
{
"epoch": 0.48,
"eval_loss": 1.7369911670684814,
"eval_runtime": 31.2207,
"eval_samples_per_second": 7.111,
"eval_steps_per_second": 1.794,
"step": 240
},
{
"epoch": 0.49,
"learning_rate": 7.607214428857717e-05,
"loss": 1.7307,
"step": 245
},
{
"epoch": 0.5,
"learning_rate": 7.599198396793587e-05,
"loss": 1.7303,
"step": 250
},
{
"epoch": 0.5,
"eval_loss": 1.720489263534546,
"eval_runtime": 31.2198,
"eval_samples_per_second": 7.111,
"eval_steps_per_second": 1.794,
"step": 250
},
{
"epoch": 0.51,
"learning_rate": 7.59118236472946e-05,
"loss": 1.6382,
"step": 255
},
{
"epoch": 0.52,
"learning_rate": 7.58316633266533e-05,
"loss": 1.7535,
"step": 260
},
{
"epoch": 0.52,
"eval_loss": 1.7123703956604004,
"eval_runtime": 31.1991,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 1.795,
"step": 260
},
{
"epoch": 0.53,
"learning_rate": 7.575150300601203e-05,
"loss": 1.6792,
"step": 265
},
{
"epoch": 0.54,
"learning_rate": 7.567134268537075e-05,
"loss": 1.7775,
"step": 270
},
{
"epoch": 0.54,
"eval_loss": 1.7013072967529297,
"eval_runtime": 31.2171,
"eval_samples_per_second": 7.111,
"eval_steps_per_second": 1.794,
"step": 270
},
{
"epoch": 0.55,
"learning_rate": 7.559118236472947e-05,
"loss": 1.7402,
"step": 275
},
{
"epoch": 0.56,
"learning_rate": 7.551102204408818e-05,
"loss": 1.685,
"step": 280
},
{
"epoch": 0.56,
"eval_loss": 1.6611982583999634,
"eval_runtime": 31.2154,
"eval_samples_per_second": 7.112,
"eval_steps_per_second": 1.794,
"step": 280
},
{
"epoch": 0.57,
"learning_rate": 7.54308617234469e-05,
"loss": 1.7125,
"step": 285
},
{
"epoch": 0.58,
"learning_rate": 7.535070140280562e-05,
"loss": 1.5898,
"step": 290
},
{
"epoch": 0.58,
"eval_loss": 1.6577653884887695,
"eval_runtime": 31.217,
"eval_samples_per_second": 7.112,
"eval_steps_per_second": 1.794,
"step": 290
},
{
"epoch": 0.59,
"learning_rate": 7.527054108216433e-05,
"loss": 1.6172,
"step": 295
},
{
"epoch": 0.6,
"learning_rate": 7.519038076152305e-05,
"loss": 1.7875,
"step": 300
},
{
"epoch": 0.6,
"eval_loss": 1.645772933959961,
"eval_runtime": 31.2182,
"eval_samples_per_second": 7.111,
"eval_steps_per_second": 1.794,
"step": 300
},
{
"epoch": 0.61,
"learning_rate": 7.511022044088177e-05,
"loss": 1.5817,
"step": 305
},
{
"epoch": 0.62,
"learning_rate": 7.50300601202405e-05,
"loss": 1.628,
"step": 310
},
{
"epoch": 0.62,
"eval_loss": 1.6252926588058472,
"eval_runtime": 31.2223,
"eval_samples_per_second": 7.11,
"eval_steps_per_second": 1.794,
"step": 310
},
{
"epoch": 0.63,
"learning_rate": 7.49498997995992e-05,
"loss": 1.6397,
"step": 315
},
{
"epoch": 0.64,
"learning_rate": 7.486973947895793e-05,
"loss": 1.6186,
"step": 320
},
{
"epoch": 0.64,
"eval_loss": 1.6194863319396973,
"eval_runtime": 31.2314,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 1.793,
"step": 320
},
{
"epoch": 0.65,
"learning_rate": 7.478957915831663e-05,
"loss": 1.6546,
"step": 325
},
{
"epoch": 0.66,
"learning_rate": 7.470941883767536e-05,
"loss": 1.6899,
"step": 330
},
{
"epoch": 0.66,
"eval_loss": 1.6101500988006592,
"eval_runtime": 31.2273,
"eval_samples_per_second": 7.109,
"eval_steps_per_second": 1.793,
"step": 330
},
{
"epoch": 0.67,
"learning_rate": 7.462925851703407e-05,
"loss": 1.6294,
"step": 335
},
{
"epoch": 0.68,
"learning_rate": 7.45490981963928e-05,
"loss": 1.5908,
"step": 340
},
{
"epoch": 0.68,
"eval_loss": 1.5906888246536255,
"eval_runtime": 31.2321,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 1.793,
"step": 340
},
{
"epoch": 0.69,
"learning_rate": 7.446893787575151e-05,
"loss": 1.6827,
"step": 345
},
{
"epoch": 0.7,
"learning_rate": 7.438877755511023e-05,
"loss": 1.6514,
"step": 350
},
{
"epoch": 0.7,
"eval_loss": 1.6103746891021729,
"eval_runtime": 31.239,
"eval_samples_per_second": 7.107,
"eval_steps_per_second": 1.793,
"step": 350
},
{
"epoch": 0.71,
"learning_rate": 7.430861723446895e-05,
"loss": 1.73,
"step": 355
},
{
"epoch": 0.72,
"learning_rate": 7.422845691382766e-05,
"loss": 1.6027,
"step": 360
},
{
"epoch": 0.72,
"eval_loss": 1.576550841331482,
"eval_runtime": 31.2402,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 1.793,
"step": 360
},
{
"epoch": 0.73,
"learning_rate": 7.414829659318638e-05,
"loss": 1.5685,
"step": 365
},
{
"epoch": 0.74,
"learning_rate": 7.40681362725451e-05,
"loss": 1.6319,
"step": 370
},
{
"epoch": 0.74,
"eval_loss": 1.562269926071167,
"eval_runtime": 31.2342,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 1.793,
"step": 370
},
{
"epoch": 0.75,
"learning_rate": 7.398797595190381e-05,
"loss": 1.5724,
"step": 375
},
{
"epoch": 0.76,
"learning_rate": 7.390781563126253e-05,
"loss": 1.6103,
"step": 380
},
{
"epoch": 0.76,
"eval_loss": 1.5764440298080444,
"eval_runtime": 31.2479,
"eval_samples_per_second": 7.104,
"eval_steps_per_second": 1.792,
"step": 380
},
{
"epoch": 0.77,
"learning_rate": 7.382765531062125e-05,
"loss": 1.6548,
"step": 385
},
{
"epoch": 0.78,
"learning_rate": 7.374749498997996e-05,
"loss": 1.4518,
"step": 390
},
{
"epoch": 0.78,
"eval_loss": 1.544908881187439,
"eval_runtime": 31.2325,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 1.793,
"step": 390
},
{
"epoch": 0.79,
"learning_rate": 7.366733466933868e-05,
"loss": 1.5493,
"step": 395
},
{
"epoch": 0.8,
"learning_rate": 7.35871743486974e-05,
"loss": 1.498,
"step": 400
},
{
"epoch": 0.8,
"eval_loss": 1.5345020294189453,
"eval_runtime": 31.2401,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 1.793,
"step": 400
},
{
"epoch": 0.81,
"learning_rate": 7.350701402805611e-05,
"loss": 1.5031,
"step": 405
},
{
"epoch": 0.82,
"learning_rate": 7.342685370741484e-05,
"loss": 1.5266,
"step": 410
},
{
"epoch": 0.82,
"eval_loss": 1.541326642036438,
"eval_runtime": 31.2476,
"eval_samples_per_second": 7.105,
"eval_steps_per_second": 1.792,
"step": 410
},
{
"epoch": 0.83,
"learning_rate": 7.334669338677355e-05,
"loss": 1.4406,
"step": 415
},
{
"epoch": 0.84,
"learning_rate": 7.326653306613228e-05,
"loss": 1.5622,
"step": 420
},
{
"epoch": 0.84,
"eval_loss": 1.5228804349899292,
"eval_runtime": 31.2404,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 1.793,
"step": 420
},
{
"epoch": 0.85,
"learning_rate": 7.318637274549098e-05,
"loss": 1.5308,
"step": 425
},
{
"epoch": 0.86,
"learning_rate": 7.310621242484971e-05,
"loss": 1.4863,
"step": 430
},
{
"epoch": 0.86,
"eval_loss": 1.5208450555801392,
"eval_runtime": 31.2453,
"eval_samples_per_second": 7.105,
"eval_steps_per_second": 1.792,
"step": 430
},
{
"epoch": 0.87,
"learning_rate": 7.302605210420841e-05,
"loss": 1.5183,
"step": 435
},
{
"epoch": 0.88,
"learning_rate": 7.294589178356714e-05,
"loss": 1.5492,
"step": 440
},
{
"epoch": 0.88,
"eval_loss": 1.4996235370635986,
"eval_runtime": 31.241,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 1.793,
"step": 440
},
{
"epoch": 0.89,
"learning_rate": 7.286573146292586e-05,
"loss": 1.4304,
"step": 445
},
{
"epoch": 0.9,
"learning_rate": 7.278557114228458e-05,
"loss": 1.5515,
"step": 450
},
{
"epoch": 0.9,
"eval_loss": 1.4857271909713745,
"eval_runtime": 31.2521,
"eval_samples_per_second": 7.104,
"eval_steps_per_second": 1.792,
"step": 450
},
{
"epoch": 0.91,
"learning_rate": 7.27054108216433e-05,
"loss": 1.506,
"step": 455
},
{
"epoch": 0.92,
"learning_rate": 7.262525050100201e-05,
"loss": 1.4799,
"step": 460
},
{
"epoch": 0.92,
"eval_loss": 1.493463158607483,
"eval_runtime": 31.2544,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 1.792,
"step": 460
},
{
"epoch": 0.93,
"learning_rate": 7.254509018036073e-05,
"loss": 1.5673,
"step": 465
},
{
"epoch": 0.94,
"learning_rate": 7.246492985971944e-05,
"loss": 1.4514,
"step": 470
},
{
"epoch": 0.94,
"eval_loss": 1.4745410680770874,
"eval_runtime": 31.2458,
"eval_samples_per_second": 7.105,
"eval_steps_per_second": 1.792,
"step": 470
},
{
"epoch": 0.95,
"learning_rate": 7.238476953907816e-05,
"loss": 1.6031,
"step": 475
},
{
"epoch": 0.96,
"learning_rate": 7.230460921843688e-05,
"loss": 1.5462,
"step": 480
},
{
"epoch": 0.96,
"eval_loss": 1.4784045219421387,
"eval_runtime": 31.2514,
"eval_samples_per_second": 7.104,
"eval_steps_per_second": 1.792,
"step": 480
},
{
"epoch": 0.97,
"learning_rate": 7.22244488977956e-05,
"loss": 1.4849,
"step": 485
},
{
"epoch": 0.98,
"learning_rate": 7.214428857715431e-05,
"loss": 1.6032,
"step": 490
},
{
"epoch": 0.98,
"eval_loss": 1.4910966157913208,
"eval_runtime": 31.2331,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 1.793,
"step": 490
},
{
"epoch": 0.99,
"learning_rate": 7.206412825651303e-05,
"loss": 1.5241,
"step": 495
},
{
"epoch": 1.0,
"learning_rate": 7.198396793587175e-05,
"loss": 1.7418,
"step": 500
},
{
"epoch": 1.0,
"eval_loss": 1.473315954208374,
"eval_runtime": 31.2557,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 1.792,
"step": 500
},
{
"epoch": 1.01,
"learning_rate": 7.190380761523046e-05,
"loss": 1.4456,
"step": 505
},
{
"epoch": 1.02,
"learning_rate": 7.182364729458918e-05,
"loss": 1.4983,
"step": 510
},
{
"epoch": 1.02,
"eval_loss": 1.4645960330963135,
"eval_runtime": 31.1687,
"eval_samples_per_second": 7.123,
"eval_steps_per_second": 1.797,
"step": 510
},
{
"epoch": 1.03,
"learning_rate": 7.17434869739479e-05,
"loss": 1.4675,
"step": 515
},
{
"epoch": 1.04,
"learning_rate": 7.166332665330663e-05,
"loss": 1.5383,
"step": 520
},
{
"epoch": 1.04,
"eval_loss": 1.4441555738449097,
"eval_runtime": 31.242,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 1.792,
"step": 520
},
{
"epoch": 1.05,
"learning_rate": 7.158316633266533e-05,
"loss": 1.4809,
"step": 525
},
{
"epoch": 1.06,
"learning_rate": 7.150300601202406e-05,
"loss": 1.3454,
"step": 530
},
{
"epoch": 1.06,
"eval_loss": 1.4332164525985718,
"eval_runtime": 31.2551,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 1.792,
"step": 530
},
{
"epoch": 1.07,
"learning_rate": 7.142284569138276e-05,
"loss": 1.2579,
"step": 535
},
{
"epoch": 1.08,
"learning_rate": 7.134268537074149e-05,
"loss": 1.3128,
"step": 540
},
{
"epoch": 1.08,
"eval_loss": 1.4260525703430176,
"eval_runtime": 31.259,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 1.791,
"step": 540
},
{
"epoch": 1.09,
"learning_rate": 7.12625250501002e-05,
"loss": 1.4444,
"step": 545
},
{
"epoch": 1.1,
"learning_rate": 7.118236472945893e-05,
"loss": 1.5472,
"step": 550
},
{
"epoch": 1.1,
"eval_loss": 1.4231517314910889,
"eval_runtime": 31.2585,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 1.792,
"step": 550
},
{
"epoch": 1.11,
"learning_rate": 7.110220440881764e-05,
"loss": 1.5043,
"step": 555
},
{
"epoch": 1.12,
"learning_rate": 7.102204408817636e-05,
"loss": 1.252,
"step": 560
},
{
"epoch": 1.12,
"eval_loss": 1.3924123048782349,
"eval_runtime": 31.2631,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 560
},
{
"epoch": 1.13,
"learning_rate": 7.094188376753508e-05,
"loss": 1.4416,
"step": 565
},
{
"epoch": 1.14,
"learning_rate": 7.086172344689379e-05,
"loss": 1.3538,
"step": 570
},
{
"epoch": 1.14,
"eval_loss": 1.3975321054458618,
"eval_runtime": 31.264,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 570
},
{
"epoch": 1.15,
"learning_rate": 7.078156312625251e-05,
"loss": 1.4595,
"step": 575
},
{
"epoch": 1.16,
"learning_rate": 7.070140280561123e-05,
"loss": 1.5448,
"step": 580
},
{
"epoch": 1.16,
"eval_loss": 1.391546368598938,
"eval_runtime": 31.2681,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 1.791,
"step": 580
},
{
"epoch": 1.17,
"learning_rate": 7.062124248496996e-05,
"loss": 1.5233,
"step": 585
},
{
"epoch": 1.18,
"learning_rate": 7.054108216432866e-05,
"loss": 1.4016,
"step": 590
},
{
"epoch": 1.18,
"eval_loss": 1.4025028944015503,
"eval_runtime": 31.2695,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 1.791,
"step": 590
},
{
"epoch": 1.19,
"learning_rate": 7.046092184368739e-05,
"loss": 1.3933,
"step": 595
},
{
"epoch": 1.2,
"learning_rate": 7.038076152304609e-05,
"loss": 1.3041,
"step": 600
},
{
"epoch": 1.2,
"eval_loss": 1.383679747581482,
"eval_runtime": 31.2652,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 600
},
{
"epoch": 1.21,
"learning_rate": 7.030060120240482e-05,
"loss": 1.1945,
"step": 605
},
{
"epoch": 1.22,
"learning_rate": 7.022044088176353e-05,
"loss": 1.3857,
"step": 610
},
{
"epoch": 1.22,
"eval_loss": 1.3890188932418823,
"eval_runtime": 31.2731,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 610
},
{
"epoch": 1.23,
"learning_rate": 7.014028056112226e-05,
"loss": 1.3095,
"step": 615
},
{
"epoch": 1.24,
"learning_rate": 7.006012024048097e-05,
"loss": 1.2923,
"step": 620
},
{
"epoch": 1.24,
"eval_loss": 1.3452343940734863,
"eval_runtime": 31.2652,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 620
},
{
"epoch": 1.25,
"learning_rate": 6.997995991983969e-05,
"loss": 1.3602,
"step": 625
},
{
"epoch": 1.26,
"learning_rate": 6.98997995991984e-05,
"loss": 1.28,
"step": 630
},
{
"epoch": 1.26,
"eval_loss": 1.349170446395874,
"eval_runtime": 31.2663,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 1.791,
"step": 630
},
{
"epoch": 1.27,
"learning_rate": 6.981963927855712e-05,
"loss": 1.3712,
"step": 635
},
{
"epoch": 1.28,
"learning_rate": 6.973947895791584e-05,
"loss": 1.4052,
"step": 640
},
{
"epoch": 1.28,
"eval_loss": 1.3253566026687622,
"eval_runtime": 31.2668,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 1.791,
"step": 640
},
{
"epoch": 1.29,
"learning_rate": 6.965931863727456e-05,
"loss": 1.2965,
"step": 645
},
{
"epoch": 1.3,
"learning_rate": 6.957915831663327e-05,
"loss": 1.3992,
"step": 650
},
{
"epoch": 1.3,
"eval_loss": 1.367018461227417,
"eval_runtime": 31.2605,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 1.791,
"step": 650
},
{
"epoch": 1.31,
"learning_rate": 6.949899799599199e-05,
"loss": 1.3044,
"step": 655
},
{
"epoch": 1.32,
"learning_rate": 6.941883767535071e-05,
"loss": 1.5044,
"step": 660
},
{
"epoch": 1.32,
"eval_loss": 1.3153263330459595,
"eval_runtime": 31.2645,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 660
},
{
"epoch": 1.33,
"learning_rate": 6.933867735470942e-05,
"loss": 1.3481,
"step": 665
},
{
"epoch": 1.34,
"learning_rate": 6.925851703406814e-05,
"loss": 1.2274,
"step": 670
},
{
"epoch": 1.34,
"eval_loss": 1.314244270324707,
"eval_runtime": 31.2738,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 670
},
{
"epoch": 1.35,
"learning_rate": 6.917835671342686e-05,
"loss": 1.3536,
"step": 675
},
{
"epoch": 1.36,
"learning_rate": 6.909819639278557e-05,
"loss": 1.2392,
"step": 680
},
{
"epoch": 1.36,
"eval_loss": 1.3149820566177368,
"eval_runtime": 31.2585,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 1.792,
"step": 680
},
{
"epoch": 1.37,
"learning_rate": 6.901803607214429e-05,
"loss": 1.2368,
"step": 685
},
{
"epoch": 1.38,
"learning_rate": 6.893787575150301e-05,
"loss": 1.365,
"step": 690
},
{
"epoch": 1.38,
"eval_loss": 1.296552300453186,
"eval_runtime": 31.2732,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 690
},
{
"epoch": 1.39,
"learning_rate": 6.885771543086174e-05,
"loss": 1.1902,
"step": 695
},
{
"epoch": 1.4,
"learning_rate": 6.877755511022044e-05,
"loss": 1.3024,
"step": 700
},
{
"epoch": 1.4,
"eval_loss": 1.2687900066375732,
"eval_runtime": 31.2617,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 700
},
{
"epoch": 1.41,
"learning_rate": 6.869739478957917e-05,
"loss": 1.2785,
"step": 705
},
{
"epoch": 1.42,
"learning_rate": 6.861723446893787e-05,
"loss": 1.347,
"step": 710
},
{
"epoch": 1.42,
"eval_loss": 1.2873570919036865,
"eval_runtime": 31.2708,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 710
},
{
"epoch": 1.43,
"learning_rate": 6.85370741482966e-05,
"loss": 1.4252,
"step": 715
},
{
"epoch": 1.44,
"learning_rate": 6.845691382765531e-05,
"loss": 1.3898,
"step": 720
},
{
"epoch": 1.44,
"eval_loss": 1.2543420791625977,
"eval_runtime": 31.2712,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 720
},
{
"epoch": 1.45,
"learning_rate": 6.837675350701404e-05,
"loss": 1.1915,
"step": 725
},
{
"epoch": 1.46,
"learning_rate": 6.829659318637275e-05,
"loss": 1.4256,
"step": 730
},
{
"epoch": 1.46,
"eval_loss": 1.2396987676620483,
"eval_runtime": 31.2721,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 730
},
{
"epoch": 1.47,
"learning_rate": 6.821643286573147e-05,
"loss": 1.1646,
"step": 735
},
{
"epoch": 1.48,
"learning_rate": 6.813627254509019e-05,
"loss": 1.2566,
"step": 740
},
{
"epoch": 1.48,
"eval_loss": 1.2429862022399902,
"eval_runtime": 31.2712,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 740
},
{
"epoch": 1.49,
"learning_rate": 6.80561122244489e-05,
"loss": 1.209,
"step": 745
},
{
"epoch": 1.5,
"learning_rate": 6.797595190380762e-05,
"loss": 1.2473,
"step": 750
},
{
"epoch": 1.5,
"eval_loss": 1.2135179042816162,
"eval_runtime": 31.2749,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 750
},
{
"epoch": 1.51,
"learning_rate": 6.789579158316634e-05,
"loss": 1.1848,
"step": 755
},
{
"epoch": 1.52,
"learning_rate": 6.781563126252505e-05,
"loss": 1.1466,
"step": 760
},
{
"epoch": 1.52,
"eval_loss": 1.2170690298080444,
"eval_runtime": 31.2836,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 760
},
{
"epoch": 1.53,
"learning_rate": 6.773547094188377e-05,
"loss": 1.2543,
"step": 765
},
{
"epoch": 1.54,
"learning_rate": 6.765531062124249e-05,
"loss": 1.3065,
"step": 770
},
{
"epoch": 1.54,
"eval_loss": 1.1896520853042603,
"eval_runtime": 31.2763,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 770
},
{
"epoch": 1.55,
"learning_rate": 6.75751503006012e-05,
"loss": 1.1649,
"step": 775
},
{
"epoch": 1.56,
"learning_rate": 6.749498997995992e-05,
"loss": 1.3033,
"step": 780
},
{
"epoch": 1.56,
"eval_loss": 1.1646301746368408,
"eval_runtime": 31.2767,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 780
},
{
"epoch": 1.57,
"learning_rate": 6.741482965931864e-05,
"loss": 1.0613,
"step": 785
},
{
"epoch": 1.58,
"learning_rate": 6.733466933867735e-05,
"loss": 1.1166,
"step": 790
},
{
"epoch": 1.58,
"eval_loss": 1.1722773313522339,
"eval_runtime": 31.2753,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 790
},
{
"epoch": 1.59,
"learning_rate": 6.725450901803607e-05,
"loss": 1.1091,
"step": 795
},
{
"epoch": 1.6,
"learning_rate": 6.717434869739479e-05,
"loss": 1.0874,
"step": 800
},
{
"epoch": 1.6,
"eval_loss": 1.1511393785476685,
"eval_runtime": 31.2755,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 800
},
{
"epoch": 1.61,
"learning_rate": 6.709418837675352e-05,
"loss": 1.2602,
"step": 805
},
{
"epoch": 1.62,
"learning_rate": 6.701402805611222e-05,
"loss": 1.017,
"step": 810
},
{
"epoch": 1.62,
"eval_loss": 1.1395540237426758,
"eval_runtime": 31.2767,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 810
},
{
"epoch": 1.63,
"learning_rate": 6.693386773547095e-05,
"loss": 1.153,
"step": 815
},
{
"epoch": 1.64,
"learning_rate": 6.685370741482966e-05,
"loss": 1.0437,
"step": 820
},
{
"epoch": 1.64,
"eval_loss": 1.1016473770141602,
"eval_runtime": 31.2746,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 820
},
{
"epoch": 1.65,
"learning_rate": 6.677354709418839e-05,
"loss": 1.0316,
"step": 825
},
{
"epoch": 1.66,
"learning_rate": 6.669338677354709e-05,
"loss": 1.2206,
"step": 830
},
{
"epoch": 1.66,
"eval_loss": 1.0841138362884521,
"eval_runtime": 31.2759,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 830
},
{
"epoch": 1.67,
"learning_rate": 6.661322645290582e-05,
"loss": 0.9704,
"step": 835
},
{
"epoch": 1.68,
"learning_rate": 6.653306613226454e-05,
"loss": 0.9738,
"step": 840
},
{
"epoch": 1.68,
"eval_loss": 1.0759927034378052,
"eval_runtime": 31.2775,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 840
},
{
"epoch": 1.69,
"learning_rate": 6.645290581162325e-05,
"loss": 1.1332,
"step": 845
},
{
"epoch": 1.7,
"learning_rate": 6.637274549098197e-05,
"loss": 1.1351,
"step": 850
},
{
"epoch": 1.7,
"eval_loss": 1.0561842918395996,
"eval_runtime": 31.2727,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 850
},
{
"epoch": 1.71,
"learning_rate": 6.629258517034069e-05,
"loss": 1.0462,
"step": 855
},
{
"epoch": 1.72,
"learning_rate": 6.62124248496994e-05,
"loss": 1.0697,
"step": 860
},
{
"epoch": 1.72,
"eval_loss": 1.05562424659729,
"eval_runtime": 31.2794,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 860
},
{
"epoch": 1.73,
"learning_rate": 6.613226452905812e-05,
"loss": 1.1096,
"step": 865
},
{
"epoch": 1.74,
"learning_rate": 6.605210420841685e-05,
"loss": 1.0296,
"step": 870
},
{
"epoch": 1.74,
"eval_loss": 1.034234881401062,
"eval_runtime": 31.2794,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 870
},
{
"epoch": 1.75,
"learning_rate": 6.597194388777555e-05,
"loss": 1.0413,
"step": 875
},
{
"epoch": 1.76,
"learning_rate": 6.589178356713428e-05,
"loss": 1.0904,
"step": 880
},
{
"epoch": 1.76,
"eval_loss": 1.0046826601028442,
"eval_runtime": 31.283,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 880
},
{
"epoch": 1.77,
"learning_rate": 6.581162324649299e-05,
"loss": 1.0728,
"step": 885
},
{
"epoch": 1.78,
"learning_rate": 6.573146292585172e-05,
"loss": 1.01,
"step": 890
},
{
"epoch": 1.78,
"eval_loss": 1.018371820449829,
"eval_runtime": 31.2848,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 890
},
{
"epoch": 1.79,
"learning_rate": 6.565130260521042e-05,
"loss": 1.047,
"step": 895
},
{
"epoch": 1.8,
"learning_rate": 6.557114228456915e-05,
"loss": 0.951,
"step": 900
},
{
"epoch": 1.8,
"eval_loss": 0.9845412969589233,
"eval_runtime": 31.2866,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 900
},
{
"epoch": 1.81,
"learning_rate": 6.549098196392787e-05,
"loss": 0.9055,
"step": 905
},
{
"epoch": 1.82,
"learning_rate": 6.541082164328658e-05,
"loss": 1.0111,
"step": 910
},
{
"epoch": 1.82,
"eval_loss": 0.9674527049064636,
"eval_runtime": 31.2872,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 910
},
{
"epoch": 1.83,
"learning_rate": 6.53306613226453e-05,
"loss": 0.9433,
"step": 915
},
{
"epoch": 1.84,
"learning_rate": 6.525050100200402e-05,
"loss": 1.0824,
"step": 920
},
{
"epoch": 1.84,
"eval_loss": 0.9758660793304443,
"eval_runtime": 31.2867,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 920
},
{
"epoch": 1.85,
"learning_rate": 6.517034068136273e-05,
"loss": 0.9381,
"step": 925
},
{
"epoch": 1.86,
"learning_rate": 6.509018036072145e-05,
"loss": 0.9745,
"step": 930
},
{
"epoch": 1.86,
"eval_loss": 0.9335694909095764,
"eval_runtime": 31.2906,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 930
},
{
"epoch": 1.87,
"learning_rate": 6.501002004008017e-05,
"loss": 0.8404,
"step": 935
},
{
"epoch": 1.88,
"learning_rate": 6.492985971943888e-05,
"loss": 0.8632,
"step": 940
},
{
"epoch": 1.88,
"eval_loss": 0.934661865234375,
"eval_runtime": 31.2917,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 940
},
{
"epoch": 1.89,
"learning_rate": 6.48496993987976e-05,
"loss": 0.9011,
"step": 945
},
{
"epoch": 1.9,
"learning_rate": 6.476953907815632e-05,
"loss": 0.9959,
"step": 950
},
{
"epoch": 1.9,
"eval_loss": 0.9394508600234985,
"eval_runtime": 31.2847,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 950
},
{
"epoch": 1.91,
"learning_rate": 6.468937875751503e-05,
"loss": 0.967,
"step": 955
},
{
"epoch": 1.92,
"learning_rate": 6.460921843687375e-05,
"loss": 0.8906,
"step": 960
},
{
"epoch": 1.92,
"eval_loss": 0.8965132236480713,
"eval_runtime": 31.292,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 960
},
{
"epoch": 1.93,
"learning_rate": 6.452905811623247e-05,
"loss": 0.9108,
"step": 965
},
{
"epoch": 1.94,
"learning_rate": 6.444889779559118e-05,
"loss": 1.0552,
"step": 970
},
{
"epoch": 1.94,
"eval_loss": 0.8891679048538208,
"eval_runtime": 31.2905,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 970
},
{
"epoch": 1.95,
"learning_rate": 6.43687374749499e-05,
"loss": 0.8783,
"step": 975
},
{
"epoch": 1.96,
"learning_rate": 6.428857715430863e-05,
"loss": 0.8387,
"step": 980
},
{
"epoch": 1.96,
"eval_loss": 0.8821650147438049,
"eval_runtime": 31.2928,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 980
},
{
"epoch": 1.97,
"learning_rate": 6.420841683366733e-05,
"loss": 0.9528,
"step": 985
},
{
"epoch": 1.98,
"learning_rate": 6.412825651302606e-05,
"loss": 1.0068,
"step": 990
},
{
"epoch": 1.98,
"eval_loss": 0.8804778456687927,
"eval_runtime": 31.2971,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 990
},
{
"epoch": 1.99,
"learning_rate": 6.404809619238477e-05,
"loss": 0.9084,
"step": 995
},
{
"epoch": 2.0,
"learning_rate": 6.39679358717435e-05,
"loss": 1.083,
"step": 1000
},
{
"epoch": 2.0,
"eval_loss": 0.8489722013473511,
"eval_runtime": 31.2897,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1000
},
{
"epoch": 2.01,
"learning_rate": 6.38877755511022e-05,
"loss": 0.9829,
"step": 1005
},
{
"epoch": 2.02,
"learning_rate": 6.380761523046093e-05,
"loss": 0.8407,
"step": 1010
},
{
"epoch": 2.02,
"eval_loss": 0.8456799387931824,
"eval_runtime": 31.202,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 1010
},
{
"epoch": 2.03,
"learning_rate": 6.372745490981965e-05,
"loss": 0.7473,
"step": 1015
},
{
"epoch": 2.04,
"learning_rate": 6.364729458917836e-05,
"loss": 0.7468,
"step": 1020
},
{
"epoch": 2.04,
"eval_loss": 0.8285406827926636,
"eval_runtime": 31.2693,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 1.791,
"step": 1020
},
{
"epoch": 2.05,
"learning_rate": 6.356713426853708e-05,
"loss": 0.892,
"step": 1025
},
{
"epoch": 2.06,
"learning_rate": 6.34869739478958e-05,
"loss": 0.8421,
"step": 1030
},
{
"epoch": 2.06,
"eval_loss": 0.8055410385131836,
"eval_runtime": 31.2828,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 1030
},
{
"epoch": 2.07,
"learning_rate": 6.340681362725451e-05,
"loss": 0.7795,
"step": 1035
},
{
"epoch": 2.08,
"learning_rate": 6.332665330661323e-05,
"loss": 0.8407,
"step": 1040
},
{
"epoch": 2.08,
"eval_loss": 0.8160460591316223,
"eval_runtime": 31.2892,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1040
},
{
"epoch": 2.09,
"learning_rate": 6.324649298597195e-05,
"loss": 0.7307,
"step": 1045
},
{
"epoch": 2.1,
"learning_rate": 6.316633266533066e-05,
"loss": 0.8126,
"step": 1050
},
{
"epoch": 2.1,
"eval_loss": 0.8266436457633972,
"eval_runtime": 31.2933,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1050
},
{
"epoch": 2.11,
"learning_rate": 6.308617234468938e-05,
"loss": 0.7997,
"step": 1055
},
{
"epoch": 2.12,
"learning_rate": 6.30060120240481e-05,
"loss": 0.7318,
"step": 1060
},
{
"epoch": 2.12,
"eval_loss": 0.815096914768219,
"eval_runtime": 31.3047,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1060
},
{
"epoch": 2.13,
"learning_rate": 6.292585170340681e-05,
"loss": 0.7908,
"step": 1065
},
{
"epoch": 2.14,
"learning_rate": 6.284569138276553e-05,
"loss": 0.9142,
"step": 1070
},
{
"epoch": 2.14,
"eval_loss": 0.7875866889953613,
"eval_runtime": 31.2805,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 1070
},
{
"epoch": 2.15,
"learning_rate": 6.276553106212425e-05,
"loss": 0.7489,
"step": 1075
},
{
"epoch": 2.16,
"learning_rate": 6.268537074148298e-05,
"loss": 0.6483,
"step": 1080
},
{
"epoch": 2.16,
"eval_loss": 0.7866150736808777,
"eval_runtime": 31.2895,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1080
},
{
"epoch": 2.17,
"learning_rate": 6.260521042084168e-05,
"loss": 0.7886,
"step": 1085
},
{
"epoch": 2.18,
"learning_rate": 6.252505010020041e-05,
"loss": 0.8092,
"step": 1090
},
{
"epoch": 2.18,
"eval_loss": 0.7817696332931519,
"eval_runtime": 31.2844,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 1090
},
{
"epoch": 2.19,
"learning_rate": 6.244488977955911e-05,
"loss": 0.6213,
"step": 1095
},
{
"epoch": 2.2,
"learning_rate": 6.236472945891785e-05,
"loss": 0.8235,
"step": 1100
},
{
"epoch": 2.2,
"eval_loss": 0.7707763910293579,
"eval_runtime": 31.2903,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1100
},
{
"epoch": 2.21,
"learning_rate": 6.228456913827655e-05,
"loss": 0.8963,
"step": 1105
},
{
"epoch": 2.22,
"learning_rate": 6.220440881763528e-05,
"loss": 0.7062,
"step": 1110
},
{
"epoch": 2.22,
"eval_loss": 0.7692943811416626,
"eval_runtime": 31.2891,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1110
},
{
"epoch": 2.23,
"learning_rate": 6.2124248496994e-05,
"loss": 0.7988,
"step": 1115
},
{
"epoch": 2.24,
"learning_rate": 6.204408817635271e-05,
"loss": 0.7348,
"step": 1120
},
{
"epoch": 2.24,
"eval_loss": 0.7874757647514343,
"eval_runtime": 31.2928,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1120
},
{
"epoch": 2.25,
"learning_rate": 6.196392785571143e-05,
"loss": 0.6604,
"step": 1125
},
{
"epoch": 2.26,
"learning_rate": 6.188376753507015e-05,
"loss": 0.7507,
"step": 1130
},
{
"epoch": 2.26,
"eval_loss": 0.7566913962364197,
"eval_runtime": 31.2871,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 1130
},
{
"epoch": 2.27,
"learning_rate": 6.180360721442886e-05,
"loss": 0.7219,
"step": 1135
},
{
"epoch": 2.28,
"learning_rate": 6.172344689378758e-05,
"loss": 0.7588,
"step": 1140
},
{
"epoch": 2.28,
"eval_loss": 0.7564798593521118,
"eval_runtime": 31.2861,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 1140
},
{
"epoch": 2.29,
"learning_rate": 6.16432865731463e-05,
"loss": 0.7573,
"step": 1145
},
{
"epoch": 2.3,
"learning_rate": 6.156312625250501e-05,
"loss": 0.605,
"step": 1150
},
{
"epoch": 2.3,
"eval_loss": 0.7298113703727722,
"eval_runtime": 31.2916,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1150
},
{
"epoch": 2.31,
"learning_rate": 6.148296593186374e-05,
"loss": 0.7134,
"step": 1155
},
{
"epoch": 2.32,
"learning_rate": 6.140280561122245e-05,
"loss": 0.8721,
"step": 1160
},
{
"epoch": 2.32,
"eval_loss": 0.7254282236099243,
"eval_runtime": 31.2918,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1160
},
{
"epoch": 2.33,
"learning_rate": 6.132264529058118e-05,
"loss": 0.8905,
"step": 1165
},
{
"epoch": 2.34,
"learning_rate": 6.124248496993988e-05,
"loss": 0.6988,
"step": 1170
},
{
"epoch": 2.34,
"eval_loss": 0.7072407007217407,
"eval_runtime": 31.2946,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1170
},
{
"epoch": 2.35,
"learning_rate": 6.116232464929861e-05,
"loss": 0.6772,
"step": 1175
},
{
"epoch": 2.36,
"learning_rate": 6.108216432865731e-05,
"loss": 0.6294,
"step": 1180
},
{
"epoch": 2.36,
"eval_loss": 0.7082269787788391,
"eval_runtime": 31.303,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1180
},
{
"epoch": 2.37,
"learning_rate": 6.100200400801604e-05,
"loss": 0.5605,
"step": 1185
},
{
"epoch": 2.38,
"learning_rate": 6.092184368737475e-05,
"loss": 0.7117,
"step": 1190
},
{
"epoch": 2.38,
"eval_loss": 0.7112658619880676,
"eval_runtime": 31.295,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1190
},
{
"epoch": 2.39,
"learning_rate": 6.0841683366733476e-05,
"loss": 0.7873,
"step": 1195
},
{
"epoch": 2.4,
"learning_rate": 6.0761523046092186e-05,
"loss": 0.8558,
"step": 1200
},
{
"epoch": 2.4,
"eval_loss": 0.6991309523582458,
"eval_runtime": 31.3002,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1200
},
{
"epoch": 2.41,
"learning_rate": 6.068136272545091e-05,
"loss": 0.8755,
"step": 1205
},
{
"epoch": 2.42,
"learning_rate": 6.060120240480962e-05,
"loss": 0.6187,
"step": 1210
},
{
"epoch": 2.42,
"eval_loss": 0.690467119216919,
"eval_runtime": 31.2955,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1210
},
{
"epoch": 2.43,
"learning_rate": 6.052104208416834e-05,
"loss": 0.7238,
"step": 1215
},
{
"epoch": 2.44,
"learning_rate": 6.044088176352706e-05,
"loss": 0.6791,
"step": 1220
},
{
"epoch": 2.44,
"eval_loss": 0.687512993812561,
"eval_runtime": 31.3006,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1220
},
{
"epoch": 2.45,
"learning_rate": 6.0360721442885776e-05,
"loss": 0.7325,
"step": 1225
},
{
"epoch": 2.46,
"learning_rate": 6.028056112224449e-05,
"loss": 0.5447,
"step": 1230
},
{
"epoch": 2.46,
"eval_loss": 0.6869089007377625,
"eval_runtime": 31.296,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1230
},
{
"epoch": 2.47,
"learning_rate": 6.0200400801603217e-05,
"loss": 0.7392,
"step": 1235
},
{
"epoch": 2.48,
"learning_rate": 6.0120240480961926e-05,
"loss": 0.7299,
"step": 1240
},
{
"epoch": 2.48,
"eval_loss": 0.6777493357658386,
"eval_runtime": 31.2976,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1240
},
{
"epoch": 2.49,
"learning_rate": 6.004008016032065e-05,
"loss": 0.7736,
"step": 1245
},
{
"epoch": 2.5,
"learning_rate": 5.995991983967936e-05,
"loss": 0.5829,
"step": 1250
},
{
"epoch": 2.5,
"eval_loss": 0.6657550930976868,
"eval_runtime": 31.2961,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1250
},
{
"epoch": 2.51,
"learning_rate": 5.987975951903808e-05,
"loss": 0.7086,
"step": 1255
},
{
"epoch": 2.52,
"learning_rate": 5.979959919839679e-05,
"loss": 0.6435,
"step": 1260
},
{
"epoch": 2.52,
"eval_loss": 0.6603330969810486,
"eval_runtime": 31.2999,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1260
},
{
"epoch": 2.53,
"learning_rate": 5.971943887775552e-05,
"loss": 0.6293,
"step": 1265
},
{
"epoch": 2.54,
"learning_rate": 5.9639278557114233e-05,
"loss": 0.7303,
"step": 1270
},
{
"epoch": 2.54,
"eval_loss": 0.6578312516212463,
"eval_runtime": 31.305,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1270
},
{
"epoch": 2.55,
"learning_rate": 5.955911823647295e-05,
"loss": 0.6796,
"step": 1275
},
{
"epoch": 2.56,
"learning_rate": 5.947895791583167e-05,
"loss": 0.7244,
"step": 1280
},
{
"epoch": 2.56,
"eval_loss": 0.6594119668006897,
"eval_runtime": 31.3007,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1280
},
{
"epoch": 2.57,
"learning_rate": 5.939879759519039e-05,
"loss": 0.6601,
"step": 1285
},
{
"epoch": 2.58,
"learning_rate": 5.93186372745491e-05,
"loss": 0.6463,
"step": 1290
},
{
"epoch": 2.58,
"eval_loss": 0.640873372554779,
"eval_runtime": 31.295,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1290
},
{
"epoch": 2.59,
"learning_rate": 5.9238476953907824e-05,
"loss": 0.6722,
"step": 1295
},
{
"epoch": 2.6,
"learning_rate": 5.9158316633266534e-05,
"loss": 0.7766,
"step": 1300
},
{
"epoch": 2.6,
"eval_loss": 0.6417058706283569,
"eval_runtime": 31.3016,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1300
},
{
"epoch": 2.61,
"learning_rate": 5.907815631262526e-05,
"loss": 0.5843,
"step": 1305
},
{
"epoch": 2.62,
"learning_rate": 5.899799599198397e-05,
"loss": 0.6012,
"step": 1310
},
{
"epoch": 2.62,
"eval_loss": 0.646079957485199,
"eval_runtime": 31.2989,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1310
},
{
"epoch": 2.63,
"learning_rate": 5.891783567134269e-05,
"loss": 0.7435,
"step": 1315
},
{
"epoch": 2.64,
"learning_rate": 5.883767535070141e-05,
"loss": 0.5974,
"step": 1320
},
{
"epoch": 2.64,
"eval_loss": 0.6364943981170654,
"eval_runtime": 31.305,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1320
},
{
"epoch": 2.65,
"learning_rate": 5.8757515030060124e-05,
"loss": 0.6149,
"step": 1325
},
{
"epoch": 2.66,
"learning_rate": 5.867735470941884e-05,
"loss": 0.556,
"step": 1330
},
{
"epoch": 2.66,
"eval_loss": 0.6301265358924866,
"eval_runtime": 31.3004,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1330
},
{
"epoch": 2.67,
"learning_rate": 5.859719438877756e-05,
"loss": 0.5322,
"step": 1335
},
{
"epoch": 2.68,
"learning_rate": 5.8517034068136274e-05,
"loss": 0.6369,
"step": 1340
},
{
"epoch": 2.68,
"eval_loss": 0.6247262358665466,
"eval_runtime": 31.298,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1340
},
{
"epoch": 2.69,
"learning_rate": 5.8436873747495e-05,
"loss": 0.6517,
"step": 1345
},
{
"epoch": 2.7,
"learning_rate": 5.835671342685371e-05,
"loss": 0.5699,
"step": 1350
},
{
"epoch": 2.7,
"eval_loss": 0.6162915229797363,
"eval_runtime": 31.2973,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1350
},
{
"epoch": 2.71,
"learning_rate": 5.827655310621243e-05,
"loss": 0.6318,
"step": 1355
},
{
"epoch": 2.72,
"learning_rate": 5.819639278557114e-05,
"loss": 0.624,
"step": 1360
},
{
"epoch": 2.72,
"eval_loss": 0.6137639880180359,
"eval_runtime": 31.3062,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1360
},
{
"epoch": 2.73,
"learning_rate": 5.8116232464929865e-05,
"loss": 0.695,
"step": 1365
},
{
"epoch": 2.74,
"learning_rate": 5.8036072144288574e-05,
"loss": 0.6774,
"step": 1370
},
{
"epoch": 2.74,
"eval_loss": 0.6134688854217529,
"eval_runtime": 31.2931,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1370
},
{
"epoch": 2.75,
"learning_rate": 5.79559118236473e-05,
"loss": 0.5395,
"step": 1375
},
{
"epoch": 2.76,
"learning_rate": 5.7875751503006015e-05,
"loss": 0.5553,
"step": 1380
},
{
"epoch": 2.76,
"eval_loss": 0.6075760126113892,
"eval_runtime": 31.3038,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1380
},
{
"epoch": 2.77,
"learning_rate": 5.779559118236473e-05,
"loss": 0.8059,
"step": 1385
},
{
"epoch": 2.78,
"learning_rate": 5.7715430861723455e-05,
"loss": 0.604,
"step": 1390
},
{
"epoch": 2.78,
"eval_loss": 0.5937612652778625,
"eval_runtime": 31.3124,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 1390
},
{
"epoch": 2.79,
"learning_rate": 5.763527054108217e-05,
"loss": 0.6524,
"step": 1395
},
{
"epoch": 2.8,
"learning_rate": 5.755511022044089e-05,
"loss": 0.6087,
"step": 1400
},
{
"epoch": 2.8,
"eval_loss": 0.5955749750137329,
"eval_runtime": 31.3138,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 1400
},
{
"epoch": 2.81,
"learning_rate": 5.7474949899799605e-05,
"loss": 0.6067,
"step": 1405
},
{
"epoch": 2.82,
"learning_rate": 5.739478957915833e-05,
"loss": 0.5935,
"step": 1410
},
{
"epoch": 2.82,
"eval_loss": 0.5933490991592407,
"eval_runtime": 31.3036,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1410
},
{
"epoch": 2.83,
"learning_rate": 5.731462925851704e-05,
"loss": 0.6652,
"step": 1415
},
{
"epoch": 2.84,
"learning_rate": 5.723446893787576e-05,
"loss": 0.6042,
"step": 1420
},
{
"epoch": 2.84,
"eval_loss": 0.5911222100257874,
"eval_runtime": 31.2993,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1420
},
{
"epoch": 2.85,
"learning_rate": 5.715430861723447e-05,
"loss": 0.6957,
"step": 1425
},
{
"epoch": 2.86,
"learning_rate": 5.7074148296593195e-05,
"loss": 0.6425,
"step": 1430
},
{
"epoch": 2.86,
"eval_loss": 0.5844302773475647,
"eval_runtime": 31.2967,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1430
},
{
"epoch": 2.87,
"learning_rate": 5.6993987975951905e-05,
"loss": 0.4406,
"step": 1435
},
{
"epoch": 2.88,
"learning_rate": 5.691382765531063e-05,
"loss": 0.6316,
"step": 1440
},
{
"epoch": 2.88,
"eval_loss": 0.5744926929473877,
"eval_runtime": 31.3029,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1440
},
{
"epoch": 2.89,
"learning_rate": 5.6833667334669345e-05,
"loss": 0.5697,
"step": 1445
},
{
"epoch": 2.9,
"learning_rate": 5.675350701402806e-05,
"loss": 0.597,
"step": 1450
},
{
"epoch": 2.9,
"eval_loss": 0.5694547891616821,
"eval_runtime": 31.311,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 1450
},
{
"epoch": 2.91,
"learning_rate": 5.667334669338678e-05,
"loss": 0.5544,
"step": 1455
},
{
"epoch": 2.92,
"learning_rate": 5.6593186372745496e-05,
"loss": 0.5754,
"step": 1460
},
{
"epoch": 2.92,
"eval_loss": 0.5703684091567993,
"eval_runtime": 31.3157,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 1460
},
{
"epoch": 2.93,
"learning_rate": 5.651302605210421e-05,
"loss": 0.465,
"step": 1465
},
{
"epoch": 2.94,
"learning_rate": 5.6432865731462936e-05,
"loss": 0.5197,
"step": 1470
},
{
"epoch": 2.94,
"eval_loss": 0.5696949362754822,
"eval_runtime": 31.3085,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1470
},
{
"epoch": 2.95,
"learning_rate": 5.6352705410821646e-05,
"loss": 0.6936,
"step": 1475
},
{
"epoch": 2.96,
"learning_rate": 5.627254509018037e-05,
"loss": 0.6256,
"step": 1480
},
{
"epoch": 2.96,
"eval_loss": 0.5596420764923096,
"eval_runtime": 31.3048,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1480
},
{
"epoch": 2.97,
"learning_rate": 5.619238476953908e-05,
"loss": 0.6175,
"step": 1485
},
{
"epoch": 2.98,
"learning_rate": 5.61122244488978e-05,
"loss": 0.5818,
"step": 1490
},
{
"epoch": 2.98,
"eval_loss": 0.5599228739738464,
"eval_runtime": 31.3107,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 1490
},
{
"epoch": 2.99,
"learning_rate": 5.603206412825652e-05,
"loss": 0.5264,
"step": 1495
},
{
"epoch": 3.01,
"learning_rate": 5.5951903807615236e-05,
"loss": 0.5464,
"step": 1500
},
{
"epoch": 3.01,
"eval_loss": 0.5564998388290405,
"eval_runtime": 31.303,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1500
},
{
"epoch": 3.02,
"learning_rate": 5.587174348697395e-05,
"loss": 0.3897,
"step": 1505
},
{
"epoch": 3.03,
"learning_rate": 5.579158316633267e-05,
"loss": 0.4616,
"step": 1510
},
{
"epoch": 3.03,
"eval_loss": 0.5629072189331055,
"eval_runtime": 31.0222,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 1.805,
"step": 1510
},
{
"epoch": 3.04,
"learning_rate": 5.5711422845691386e-05,
"loss": 0.6419,
"step": 1515
},
{
"epoch": 3.05,
"learning_rate": 5.563126252505011e-05,
"loss": 0.6482,
"step": 1520
},
{
"epoch": 3.05,
"eval_loss": 0.5529131889343262,
"eval_runtime": 31.2128,
"eval_samples_per_second": 7.112,
"eval_steps_per_second": 1.794,
"step": 1520
},
{
"epoch": 3.06,
"learning_rate": 5.555110220440882e-05,
"loss": 0.5377,
"step": 1525
},
{
"epoch": 3.07,
"learning_rate": 5.547094188376754e-05,
"loss": 0.5356,
"step": 1530
},
{
"epoch": 3.07,
"eval_loss": 0.5526372790336609,
"eval_runtime": 31.2759,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 1530
},
{
"epoch": 3.08,
"learning_rate": 5.539078156312625e-05,
"loss": 0.589,
"step": 1535
},
{
"epoch": 3.09,
"learning_rate": 5.5310621242484976e-05,
"loss": 0.5688,
"step": 1540
},
{
"epoch": 3.09,
"eval_loss": 0.5528168678283691,
"eval_runtime": 31.2986,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1540
},
{
"epoch": 3.1,
"learning_rate": 5.5230460921843686e-05,
"loss": 0.4732,
"step": 1545
},
{
"epoch": 3.11,
"learning_rate": 5.515030060120241e-05,
"loss": 0.6018,
"step": 1550
},
{
"epoch": 3.11,
"eval_loss": 0.5408484935760498,
"eval_runtime": 31.3037,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1550
},
{
"epoch": 3.12,
"learning_rate": 5.5070140280561127e-05,
"loss": 0.505,
"step": 1555
},
{
"epoch": 3.13,
"learning_rate": 5.498997995991984e-05,
"loss": 0.5794,
"step": 1560
},
{
"epoch": 3.13,
"eval_loss": 0.5370539426803589,
"eval_runtime": 31.3017,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1560
},
{
"epoch": 3.14,
"learning_rate": 5.490981963927856e-05,
"loss": 0.5621,
"step": 1565
},
{
"epoch": 3.15,
"learning_rate": 5.4829659318637283e-05,
"loss": 0.5443,
"step": 1570
},
{
"epoch": 3.15,
"eval_loss": 0.537534236907959,
"eval_runtime": 31.2972,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1570
},
{
"epoch": 3.16,
"learning_rate": 5.474949899799599e-05,
"loss": 0.449,
"step": 1575
},
{
"epoch": 3.17,
"learning_rate": 5.466933867735472e-05,
"loss": 0.4435,
"step": 1580
},
{
"epoch": 3.17,
"eval_loss": 0.5345003604888916,
"eval_runtime": 31.2985,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1580
},
{
"epoch": 3.18,
"learning_rate": 5.458917835671343e-05,
"loss": 0.4819,
"step": 1585
},
{
"epoch": 3.19,
"learning_rate": 5.450901803607215e-05,
"loss": 0.5087,
"step": 1590
},
{
"epoch": 3.19,
"eval_loss": 0.5292515754699707,
"eval_runtime": 31.3021,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1590
},
{
"epoch": 3.2,
"learning_rate": 5.442885771543086e-05,
"loss": 0.453,
"step": 1595
},
{
"epoch": 3.21,
"learning_rate": 5.4348697394789584e-05,
"loss": 0.518,
"step": 1600
},
{
"epoch": 3.21,
"eval_loss": 0.5336319804191589,
"eval_runtime": 31.2945,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1600
},
{
"epoch": 3.22,
"learning_rate": 5.42685370741483e-05,
"loss": 0.5508,
"step": 1605
},
{
"epoch": 3.23,
"learning_rate": 5.418837675350702e-05,
"loss": 0.5914,
"step": 1610
},
{
"epoch": 3.23,
"eval_loss": 0.5315628051757812,
"eval_runtime": 31.3062,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1610
},
{
"epoch": 3.24,
"learning_rate": 5.4108216432865734e-05,
"loss": 0.5609,
"step": 1615
},
{
"epoch": 3.25,
"learning_rate": 5.402805611222446e-05,
"loss": 0.5667,
"step": 1620
},
{
"epoch": 3.25,
"eval_loss": 0.5254489183425903,
"eval_runtime": 31.2915,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1620
},
{
"epoch": 3.26,
"learning_rate": 5.394789579158317e-05,
"loss": 0.5349,
"step": 1625
},
{
"epoch": 3.27,
"learning_rate": 5.386773547094189e-05,
"loss": 0.5218,
"step": 1630
},
{
"epoch": 3.27,
"eval_loss": 0.5206549167633057,
"eval_runtime": 31.2983,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1630
},
{
"epoch": 3.28,
"learning_rate": 5.37875751503006e-05,
"loss": 0.5036,
"step": 1635
},
{
"epoch": 3.29,
"learning_rate": 5.3707414829659324e-05,
"loss": 0.4267,
"step": 1640
},
{
"epoch": 3.29,
"eval_loss": 0.5270143151283264,
"eval_runtime": 31.2985,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1640
},
{
"epoch": 3.3,
"learning_rate": 5.3627254509018034e-05,
"loss": 0.5262,
"step": 1645
},
{
"epoch": 3.31,
"learning_rate": 5.354709418837676e-05,
"loss": 0.5839,
"step": 1650
},
{
"epoch": 3.31,
"eval_loss": 0.5198652148246765,
"eval_runtime": 31.2991,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1650
},
{
"epoch": 3.32,
"learning_rate": 5.3466933867735474e-05,
"loss": 0.4521,
"step": 1655
},
{
"epoch": 3.33,
"learning_rate": 5.338677354709419e-05,
"loss": 0.5095,
"step": 1660
},
{
"epoch": 3.33,
"eval_loss": 0.5267544984817505,
"eval_runtime": 31.299,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1660
},
{
"epoch": 3.34,
"learning_rate": 5.3306613226452914e-05,
"loss": 0.5022,
"step": 1665
},
{
"epoch": 3.35,
"learning_rate": 5.3226452905811624e-05,
"loss": 0.4616,
"step": 1670
},
{
"epoch": 3.35,
"eval_loss": 0.5191987752914429,
"eval_runtime": 31.2917,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1670
},
{
"epoch": 3.36,
"learning_rate": 5.314629258517035e-05,
"loss": 0.5251,
"step": 1675
},
{
"epoch": 3.37,
"learning_rate": 5.3066132264529065e-05,
"loss": 0.5027,
"step": 1680
},
{
"epoch": 3.37,
"eval_loss": 0.5106366872787476,
"eval_runtime": 31.2948,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1680
},
{
"epoch": 3.38,
"learning_rate": 5.298597194388778e-05,
"loss": 0.5192,
"step": 1685
},
{
"epoch": 3.39,
"learning_rate": 5.29058116232465e-05,
"loss": 0.441,
"step": 1690
},
{
"epoch": 3.39,
"eval_loss": 0.5149854421615601,
"eval_runtime": 31.2974,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1690
},
{
"epoch": 3.4,
"learning_rate": 5.282565130260522e-05,
"loss": 0.5155,
"step": 1695
},
{
"epoch": 3.41,
"learning_rate": 5.274549098196393e-05,
"loss": 0.4416,
"step": 1700
},
{
"epoch": 3.41,
"eval_loss": 0.5155748724937439,
"eval_runtime": 31.2962,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1700
},
{
"epoch": 3.42,
"learning_rate": 5.2665330661322655e-05,
"loss": 0.5154,
"step": 1705
},
{
"epoch": 3.43,
"learning_rate": 5.2585170340681365e-05,
"loss": 0.4411,
"step": 1710
},
{
"epoch": 3.43,
"eval_loss": 0.5102916955947876,
"eval_runtime": 31.2896,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 1710
},
{
"epoch": 3.44,
"learning_rate": 5.250501002004009e-05,
"loss": 0.5015,
"step": 1715
},
{
"epoch": 3.45,
"learning_rate": 5.24248496993988e-05,
"loss": 0.47,
"step": 1720
},
{
"epoch": 3.45,
"eval_loss": 0.5037886500358582,
"eval_runtime": 31.2923,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1720
},
{
"epoch": 3.46,
"learning_rate": 5.234468937875752e-05,
"loss": 0.403,
"step": 1725
},
{
"epoch": 3.47,
"learning_rate": 5.226452905811624e-05,
"loss": 0.5079,
"step": 1730
},
{
"epoch": 3.47,
"eval_loss": 0.5047650337219238,
"eval_runtime": 31.2924,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1730
},
{
"epoch": 3.48,
"learning_rate": 5.2184368737474955e-05,
"loss": 0.4939,
"step": 1735
},
{
"epoch": 3.49,
"learning_rate": 5.210420841683367e-05,
"loss": 0.3913,
"step": 1740
},
{
"epoch": 3.49,
"eval_loss": 0.508187472820282,
"eval_runtime": 31.2958,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1740
},
{
"epoch": 3.5,
"learning_rate": 5.2024048096192395e-05,
"loss": 0.3983,
"step": 1745
},
{
"epoch": 3.51,
"learning_rate": 5.1943887775551105e-05,
"loss": 0.4977,
"step": 1750
},
{
"epoch": 3.51,
"eval_loss": 0.49760758876800537,
"eval_runtime": 31.3045,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1750
},
{
"epoch": 3.52,
"learning_rate": 5.186372745490983e-05,
"loss": 0.5879,
"step": 1755
},
{
"epoch": 3.53,
"learning_rate": 5.178356713426854e-05,
"loss": 0.5905,
"step": 1760
},
{
"epoch": 3.53,
"eval_loss": 0.4974704384803772,
"eval_runtime": 31.2985,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1760
},
{
"epoch": 3.54,
"learning_rate": 5.170340681362726e-05,
"loss": 0.5392,
"step": 1765
},
{
"epoch": 3.55,
"learning_rate": 5.162324649298597e-05,
"loss": 0.4362,
"step": 1770
},
{
"epoch": 3.55,
"eval_loss": 0.4961581230163574,
"eval_runtime": 31.306,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1770
},
{
"epoch": 3.56,
"learning_rate": 5.1543086172344696e-05,
"loss": 0.522,
"step": 1775
},
{
"epoch": 3.57,
"learning_rate": 5.146292585170341e-05,
"loss": 0.4309,
"step": 1780
},
{
"epoch": 3.57,
"eval_loss": 0.500778317451477,
"eval_runtime": 31.3034,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1780
},
{
"epoch": 3.58,
"learning_rate": 5.138276553106213e-05,
"loss": 0.5687,
"step": 1785
},
{
"epoch": 3.59,
"learning_rate": 5.1302605210420846e-05,
"loss": 0.4477,
"step": 1790
},
{
"epoch": 3.59,
"eval_loss": 0.49876537919044495,
"eval_runtime": 31.2939,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1790
},
{
"epoch": 3.6,
"learning_rate": 5.122244488977956e-05,
"loss": 0.4987,
"step": 1795
},
{
"epoch": 3.61,
"learning_rate": 5.114228456913828e-05,
"loss": 0.4826,
"step": 1800
},
{
"epoch": 3.61,
"eval_loss": 0.488558828830719,
"eval_runtime": 31.2964,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1800
},
{
"epoch": 3.62,
"learning_rate": 5.1062124248497e-05,
"loss": 0.597,
"step": 1805
},
{
"epoch": 3.63,
"learning_rate": 5.098196392785571e-05,
"loss": 0.6181,
"step": 1810
},
{
"epoch": 3.63,
"eval_loss": 0.48853781819343567,
"eval_runtime": 31.2965,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1810
},
{
"epoch": 3.64,
"learning_rate": 5.0901803607214436e-05,
"loss": 0.5415,
"step": 1815
},
{
"epoch": 3.65,
"learning_rate": 5.0821643286573146e-05,
"loss": 0.4738,
"step": 1820
},
{
"epoch": 3.65,
"eval_loss": 0.48789137601852417,
"eval_runtime": 31.2967,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1820
},
{
"epoch": 3.66,
"learning_rate": 5.074148296593187e-05,
"loss": 0.5222,
"step": 1825
},
{
"epoch": 3.67,
"learning_rate": 5.066132264529058e-05,
"loss": 0.4932,
"step": 1830
},
{
"epoch": 3.67,
"eval_loss": 0.4817972481250763,
"eval_runtime": 31.2976,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1830
},
{
"epoch": 3.68,
"learning_rate": 5.05811623246493e-05,
"loss": 0.4425,
"step": 1835
},
{
"epoch": 3.69,
"learning_rate": 5.050100200400802e-05,
"loss": 0.4684,
"step": 1840
},
{
"epoch": 3.69,
"eval_loss": 0.4812251031398773,
"eval_runtime": 31.2943,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1840
},
{
"epoch": 3.7,
"learning_rate": 5.0420841683366736e-05,
"loss": 0.5561,
"step": 1845
},
{
"epoch": 3.71,
"learning_rate": 5.034068136272545e-05,
"loss": 0.5484,
"step": 1850
},
{
"epoch": 3.71,
"eval_loss": 0.47668221592903137,
"eval_runtime": 31.2926,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 1850
},
{
"epoch": 3.72,
"learning_rate": 5.0260521042084176e-05,
"loss": 0.5458,
"step": 1855
},
{
"epoch": 3.73,
"learning_rate": 5.0180360721442886e-05,
"loss": 0.5086,
"step": 1860
},
{
"epoch": 3.73,
"eval_loss": 0.4790602922439575,
"eval_runtime": 31.298,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1860
},
{
"epoch": 3.74,
"learning_rate": 5.010020040080161e-05,
"loss": 0.4593,
"step": 1865
},
{
"epoch": 3.75,
"learning_rate": 5.002004008016032e-05,
"loss": 0.3548,
"step": 1870
},
{
"epoch": 3.75,
"eval_loss": 0.4792560935020447,
"eval_runtime": 31.2988,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1870
},
{
"epoch": 3.76,
"learning_rate": 4.993987975951904e-05,
"loss": 0.4227,
"step": 1875
},
{
"epoch": 3.77,
"learning_rate": 4.985971943887775e-05,
"loss": 0.5229,
"step": 1880
},
{
"epoch": 3.77,
"eval_loss": 0.47653260827064514,
"eval_runtime": 31.3054,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1880
},
{
"epoch": 3.78,
"learning_rate": 4.977955911823648e-05,
"loss": 0.3925,
"step": 1885
},
{
"epoch": 3.79,
"learning_rate": 4.9699398797595193e-05,
"loss": 0.4578,
"step": 1890
},
{
"epoch": 3.79,
"eval_loss": 0.4703618884086609,
"eval_runtime": 31.3015,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1890
},
{
"epoch": 3.8,
"learning_rate": 4.961923847695391e-05,
"loss": 0.4669,
"step": 1895
},
{
"epoch": 3.81,
"learning_rate": 4.953907815631263e-05,
"loss": 0.5277,
"step": 1900
},
{
"epoch": 3.81,
"eval_loss": 0.4690556526184082,
"eval_runtime": 31.3076,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 1900
},
{
"epoch": 3.82,
"learning_rate": 4.945891783567135e-05,
"loss": 0.3436,
"step": 1905
},
{
"epoch": 3.83,
"learning_rate": 4.937875751503006e-05,
"loss": 0.4683,
"step": 1910
},
{
"epoch": 3.83,
"eval_loss": 0.4648754894733429,
"eval_runtime": 31.2941,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 1910
},
{
"epoch": 3.84,
"learning_rate": 4.9298597194388784e-05,
"loss": 0.4551,
"step": 1915
},
{
"epoch": 3.85,
"learning_rate": 4.9218436873747494e-05,
"loss": 0.448,
"step": 1920
},
{
"epoch": 3.85,
"eval_loss": 0.46837684512138367,
"eval_runtime": 31.2979,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1920
},
{
"epoch": 3.86,
"learning_rate": 4.913827655310622e-05,
"loss": 0.5228,
"step": 1925
},
{
"epoch": 3.87,
"learning_rate": 4.905811623246493e-05,
"loss": 0.3752,
"step": 1930
},
{
"epoch": 3.87,
"eval_loss": 0.4696580469608307,
"eval_runtime": 31.3122,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 1930
},
{
"epoch": 3.88,
"learning_rate": 4.897795591182365e-05,
"loss": 0.5608,
"step": 1935
},
{
"epoch": 3.89,
"learning_rate": 4.889779559118237e-05,
"loss": 0.4631,
"step": 1940
},
{
"epoch": 3.89,
"eval_loss": 0.467781126499176,
"eval_runtime": 31.3015,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1940
},
{
"epoch": 3.9,
"learning_rate": 4.8817635270541084e-05,
"loss": 0.4402,
"step": 1945
},
{
"epoch": 3.91,
"learning_rate": 4.873747494989981e-05,
"loss": 0.4277,
"step": 1950
},
{
"epoch": 3.91,
"eval_loss": 0.4608190953731537,
"eval_runtime": 31.3027,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1950
},
{
"epoch": 3.92,
"learning_rate": 4.865731462925852e-05,
"loss": 0.3894,
"step": 1955
},
{
"epoch": 3.93,
"learning_rate": 4.857715430861724e-05,
"loss": 0.3646,
"step": 1960
},
{
"epoch": 3.93,
"eval_loss": 0.460921972990036,
"eval_runtime": 31.2992,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1960
},
{
"epoch": 3.94,
"learning_rate": 4.849699398797596e-05,
"loss": 0.3929,
"step": 1965
},
{
"epoch": 3.95,
"learning_rate": 4.8416833667334674e-05,
"loss": 0.5276,
"step": 1970
},
{
"epoch": 3.95,
"eval_loss": 0.45434585213661194,
"eval_runtime": 31.2975,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1970
},
{
"epoch": 3.96,
"learning_rate": 4.833667334669339e-05,
"loss": 0.4141,
"step": 1975
},
{
"epoch": 3.97,
"learning_rate": 4.8256513026052115e-05,
"loss": 0.431,
"step": 1980
},
{
"epoch": 3.97,
"eval_loss": 0.4538600444793701,
"eval_runtime": 31.2976,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 1980
},
{
"epoch": 3.98,
"learning_rate": 4.8176352705410824e-05,
"loss": 0.6326,
"step": 1985
},
{
"epoch": 3.99,
"learning_rate": 4.809619238476955e-05,
"loss": 0.5465,
"step": 1990
},
{
"epoch": 3.99,
"eval_loss": 0.4550160765647888,
"eval_runtime": 31.3034,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 1990
},
{
"epoch": 4.0,
"learning_rate": 4.801603206412826e-05,
"loss": 0.5185,
"step": 1995
},
{
"epoch": 4.01,
"learning_rate": 4.793587174348698e-05,
"loss": 0.4954,
"step": 2000
},
{
"epoch": 4.01,
"eval_loss": 0.4522875249385834,
"eval_runtime": 31.2925,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2000
},
{
"epoch": 4.02,
"learning_rate": 4.785571142284569e-05,
"loss": 0.4591,
"step": 2005
},
{
"epoch": 4.03,
"learning_rate": 4.7775551102204415e-05,
"loss": 0.4886,
"step": 2010
},
{
"epoch": 4.03,
"eval_loss": 0.4499128758907318,
"eval_runtime": 31.1861,
"eval_samples_per_second": 7.119,
"eval_steps_per_second": 1.796,
"step": 2010
},
{
"epoch": 4.04,
"learning_rate": 4.769539078156313e-05,
"loss": 0.3942,
"step": 2015
},
{
"epoch": 4.05,
"learning_rate": 4.761523046092185e-05,
"loss": 0.4898,
"step": 2020
},
{
"epoch": 4.05,
"eval_loss": 0.4461597204208374,
"eval_runtime": 31.2639,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 1.791,
"step": 2020
},
{
"epoch": 4.06,
"learning_rate": 4.7535070140280565e-05,
"loss": 0.3444,
"step": 2025
},
{
"epoch": 4.07,
"learning_rate": 4.745490981963929e-05,
"loss": 0.4072,
"step": 2030
},
{
"epoch": 4.07,
"eval_loss": 0.4478509724140167,
"eval_runtime": 31.292,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2030
},
{
"epoch": 4.08,
"learning_rate": 4.7374749498998e-05,
"loss": 0.4066,
"step": 2035
},
{
"epoch": 4.09,
"learning_rate": 4.729458917835672e-05,
"loss": 0.4565,
"step": 2040
},
{
"epoch": 4.09,
"eval_loss": 0.4457860291004181,
"eval_runtime": 31.3,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 2040
},
{
"epoch": 4.1,
"learning_rate": 4.721442885771543e-05,
"loss": 0.419,
"step": 2045
},
{
"epoch": 4.11,
"learning_rate": 4.7134268537074155e-05,
"loss": 0.3739,
"step": 2050
},
{
"epoch": 4.11,
"eval_loss": 0.4474635720252991,
"eval_runtime": 31.3042,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 2050
},
{
"epoch": 4.12,
"learning_rate": 4.7054108216432865e-05,
"loss": 0.5401,
"step": 2055
},
{
"epoch": 4.13,
"learning_rate": 4.697394789579159e-05,
"loss": 0.4211,
"step": 2060
},
{
"epoch": 4.13,
"eval_loss": 0.44858118891716003,
"eval_runtime": 31.3113,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 2060
},
{
"epoch": 4.14,
"learning_rate": 4.6893787575150305e-05,
"loss": 0.4913,
"step": 2065
},
{
"epoch": 4.15,
"learning_rate": 4.681362725450902e-05,
"loss": 0.4048,
"step": 2070
},
{
"epoch": 4.15,
"eval_loss": 0.4393081068992615,
"eval_runtime": 31.3163,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2070
},
{
"epoch": 4.16,
"learning_rate": 4.673346693386774e-05,
"loss": 0.3935,
"step": 2075
},
{
"epoch": 4.17,
"learning_rate": 4.665330661322646e-05,
"loss": 0.5064,
"step": 2080
},
{
"epoch": 4.17,
"eval_loss": 0.4351194500923157,
"eval_runtime": 31.3041,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 2080
},
{
"epoch": 4.18,
"learning_rate": 4.657314629258517e-05,
"loss": 0.3017,
"step": 2085
},
{
"epoch": 4.19,
"learning_rate": 4.6492985971943896e-05,
"loss": 0.4652,
"step": 2090
},
{
"epoch": 4.19,
"eval_loss": 0.43793508410453796,
"eval_runtime": 31.3085,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 2090
},
{
"epoch": 4.2,
"learning_rate": 4.6412825651302606e-05,
"loss": 0.4423,
"step": 2095
},
{
"epoch": 4.21,
"learning_rate": 4.633266533066133e-05,
"loss": 0.4061,
"step": 2100
},
{
"epoch": 4.21,
"eval_loss": 0.43406903743743896,
"eval_runtime": 31.3118,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 2100
},
{
"epoch": 4.22,
"learning_rate": 4.625250501002004e-05,
"loss": 0.3877,
"step": 2105
},
{
"epoch": 4.23,
"learning_rate": 4.617234468937876e-05,
"loss": 0.3784,
"step": 2110
},
{
"epoch": 4.23,
"eval_loss": 0.4390106499195099,
"eval_runtime": 31.3101,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 2110
},
{
"epoch": 4.24,
"learning_rate": 4.609218436873748e-05,
"loss": 0.415,
"step": 2115
},
{
"epoch": 4.25,
"learning_rate": 4.6012024048096196e-05,
"loss": 0.4142,
"step": 2120
},
{
"epoch": 4.25,
"eval_loss": 0.43537265062332153,
"eval_runtime": 31.3175,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2120
},
{
"epoch": 4.26,
"learning_rate": 4.593186372745491e-05,
"loss": 0.4459,
"step": 2125
},
{
"epoch": 4.27,
"learning_rate": 4.585170340681363e-05,
"loss": 0.3625,
"step": 2130
},
{
"epoch": 4.27,
"eval_loss": 0.4415459930896759,
"eval_runtime": 31.299,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 2130
},
{
"epoch": 4.28,
"learning_rate": 4.5771543086172346e-05,
"loss": 0.4102,
"step": 2135
},
{
"epoch": 4.29,
"learning_rate": 4.569138276553107e-05,
"loss": 0.3807,
"step": 2140
},
{
"epoch": 4.29,
"eval_loss": 0.4403214752674103,
"eval_runtime": 31.3171,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2140
},
{
"epoch": 4.3,
"learning_rate": 4.561122244488978e-05,
"loss": 0.544,
"step": 2145
},
{
"epoch": 4.31,
"learning_rate": 4.55310621242485e-05,
"loss": 0.4154,
"step": 2150
},
{
"epoch": 4.31,
"eval_loss": 0.4307992458343506,
"eval_runtime": 31.3169,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2150
},
{
"epoch": 4.32,
"learning_rate": 4.545090180360721e-05,
"loss": 0.405,
"step": 2155
},
{
"epoch": 4.33,
"learning_rate": 4.5370741482965936e-05,
"loss": 0.4509,
"step": 2160
},
{
"epoch": 4.33,
"eval_loss": 0.429840087890625,
"eval_runtime": 31.316,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2160
},
{
"epoch": 4.34,
"learning_rate": 4.5290581162324646e-05,
"loss": 0.3593,
"step": 2165
},
{
"epoch": 4.35,
"learning_rate": 4.521042084168337e-05,
"loss": 0.4254,
"step": 2170
},
{
"epoch": 4.35,
"eval_loss": 0.42388150095939636,
"eval_runtime": 31.3164,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2170
},
{
"epoch": 4.36,
"learning_rate": 4.5130260521042086e-05,
"loss": 0.4071,
"step": 2175
},
{
"epoch": 4.37,
"learning_rate": 4.50501002004008e-05,
"loss": 0.4323,
"step": 2180
},
{
"epoch": 4.37,
"eval_loss": 0.42136842012405396,
"eval_runtime": 31.3202,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2180
},
{
"epoch": 4.38,
"learning_rate": 4.496993987975952e-05,
"loss": 0.3243,
"step": 2185
},
{
"epoch": 4.39,
"learning_rate": 4.4889779559118243e-05,
"loss": 0.4359,
"step": 2190
},
{
"epoch": 4.39,
"eval_loss": 0.4290623068809509,
"eval_runtime": 31.3259,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2190
},
{
"epoch": 4.4,
"learning_rate": 4.480961923847695e-05,
"loss": 0.4209,
"step": 2195
},
{
"epoch": 4.41,
"learning_rate": 4.472945891783568e-05,
"loss": 0.3759,
"step": 2200
},
{
"epoch": 4.41,
"eval_loss": 0.4223538935184479,
"eval_runtime": 31.3287,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 1.788,
"step": 2200
},
{
"epoch": 4.42,
"learning_rate": 4.464929859719439e-05,
"loss": 0.4106,
"step": 2205
},
{
"epoch": 4.43,
"learning_rate": 4.456913827655311e-05,
"loss": 0.4534,
"step": 2210
},
{
"epoch": 4.43,
"eval_loss": 0.42248478531837463,
"eval_runtime": 31.3216,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2210
},
{
"epoch": 4.44,
"learning_rate": 4.448897795591182e-05,
"loss": 0.3862,
"step": 2215
},
{
"epoch": 4.45,
"learning_rate": 4.4408817635270544e-05,
"loss": 0.4013,
"step": 2220
},
{
"epoch": 4.45,
"eval_loss": 0.42617106437683105,
"eval_runtime": 31.3246,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2220
},
{
"epoch": 4.46,
"learning_rate": 4.432865731462927e-05,
"loss": 0.3103,
"step": 2225
},
{
"epoch": 4.47,
"learning_rate": 4.424849699398798e-05,
"loss": 0.4331,
"step": 2230
},
{
"epoch": 4.47,
"eval_loss": 0.4213978052139282,
"eval_runtime": 31.3244,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2230
},
{
"epoch": 4.48,
"learning_rate": 4.41683366733467e-05,
"loss": 0.381,
"step": 2235
},
{
"epoch": 4.49,
"learning_rate": 4.408817635270542e-05,
"loss": 0.4373,
"step": 2240
},
{
"epoch": 4.49,
"eval_loss": 0.4198138117790222,
"eval_runtime": 31.3271,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2240
},
{
"epoch": 4.5,
"learning_rate": 4.4008016032064134e-05,
"loss": 0.3603,
"step": 2245
},
{
"epoch": 4.51,
"learning_rate": 4.392785571142285e-05,
"loss": 0.4975,
"step": 2250
},
{
"epoch": 4.51,
"eval_loss": 0.42358022928237915,
"eval_runtime": 31.3249,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2250
},
{
"epoch": 4.52,
"learning_rate": 4.384769539078157e-05,
"loss": 0.342,
"step": 2255
},
{
"epoch": 4.53,
"learning_rate": 4.3767535070140284e-05,
"loss": 0.423,
"step": 2260
},
{
"epoch": 4.53,
"eval_loss": 0.4189080595970154,
"eval_runtime": 31.3275,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 1.788,
"step": 2260
},
{
"epoch": 4.54,
"learning_rate": 4.368737474949901e-05,
"loss": 0.4149,
"step": 2265
},
{
"epoch": 4.55,
"learning_rate": 4.360721442885772e-05,
"loss": 0.4503,
"step": 2270
},
{
"epoch": 4.55,
"eval_loss": 0.4171365201473236,
"eval_runtime": 31.3168,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2270
},
{
"epoch": 4.56,
"learning_rate": 4.352705410821644e-05,
"loss": 0.415,
"step": 2275
},
{
"epoch": 4.57,
"learning_rate": 4.344689378757515e-05,
"loss": 0.3796,
"step": 2280
},
{
"epoch": 4.57,
"eval_loss": 0.41718369722366333,
"eval_runtime": 31.3195,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2280
},
{
"epoch": 4.58,
"learning_rate": 4.3366733466933874e-05,
"loss": 0.3788,
"step": 2285
},
{
"epoch": 4.59,
"learning_rate": 4.3286573146292584e-05,
"loss": 0.4063,
"step": 2290
},
{
"epoch": 4.59,
"eval_loss": 0.41249218583106995,
"eval_runtime": 31.3221,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2290
},
{
"epoch": 4.6,
"learning_rate": 4.320641282565131e-05,
"loss": 0.4379,
"step": 2295
},
{
"epoch": 4.61,
"learning_rate": 4.3126252505010025e-05,
"loss": 0.3841,
"step": 2300
},
{
"epoch": 4.61,
"eval_loss": 0.41186362504959106,
"eval_runtime": 31.3313,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 1.787,
"step": 2300
},
{
"epoch": 4.62,
"learning_rate": 4.304609218436874e-05,
"loss": 0.3669,
"step": 2305
},
{
"epoch": 4.63,
"learning_rate": 4.296593186372746e-05,
"loss": 0.2956,
"step": 2310
},
{
"epoch": 4.63,
"eval_loss": 0.4147048890590668,
"eval_runtime": 31.3349,
"eval_samples_per_second": 7.085,
"eval_steps_per_second": 1.787,
"step": 2310
},
{
"epoch": 4.64,
"learning_rate": 4.288577154308618e-05,
"loss": 0.5446,
"step": 2315
},
{
"epoch": 4.65,
"learning_rate": 4.280561122244489e-05,
"loss": 0.3486,
"step": 2320
},
{
"epoch": 4.65,
"eval_loss": 0.42460867762565613,
"eval_runtime": 31.3167,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2320
},
{
"epoch": 4.66,
"learning_rate": 4.2725450901803615e-05,
"loss": 0.3405,
"step": 2325
},
{
"epoch": 4.67,
"learning_rate": 4.2645290581162325e-05,
"loss": 0.3585,
"step": 2330
},
{
"epoch": 4.67,
"eval_loss": 0.4116860330104828,
"eval_runtime": 31.3378,
"eval_samples_per_second": 7.084,
"eval_steps_per_second": 1.787,
"step": 2330
},
{
"epoch": 4.68,
"learning_rate": 4.256513026052105e-05,
"loss": 0.3642,
"step": 2335
},
{
"epoch": 4.69,
"learning_rate": 4.248496993987976e-05,
"loss": 0.4496,
"step": 2340
},
{
"epoch": 4.69,
"eval_loss": 0.40906357765197754,
"eval_runtime": 31.3315,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 1.787,
"step": 2340
},
{
"epoch": 4.7,
"learning_rate": 4.240480961923848e-05,
"loss": 0.4845,
"step": 2345
},
{
"epoch": 4.71,
"learning_rate": 4.23246492985972e-05,
"loss": 0.399,
"step": 2350
},
{
"epoch": 4.71,
"eval_loss": 0.40488967299461365,
"eval_runtime": 31.3211,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2350
},
{
"epoch": 4.72,
"learning_rate": 4.2244488977955915e-05,
"loss": 0.4057,
"step": 2355
},
{
"epoch": 4.73,
"learning_rate": 4.216432865731463e-05,
"loss": 0.3885,
"step": 2360
},
{
"epoch": 4.73,
"eval_loss": 0.4003817141056061,
"eval_runtime": 31.3245,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2360
},
{
"epoch": 4.74,
"learning_rate": 4.2084168336673355e-05,
"loss": 0.3819,
"step": 2365
},
{
"epoch": 4.75,
"learning_rate": 4.2004008016032065e-05,
"loss": 0.3728,
"step": 2370
},
{
"epoch": 4.75,
"eval_loss": 0.4003088176250458,
"eval_runtime": 31.3239,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2370
},
{
"epoch": 4.76,
"learning_rate": 4.192384769539079e-05,
"loss": 0.3902,
"step": 2375
},
{
"epoch": 4.77,
"learning_rate": 4.18436873747495e-05,
"loss": 0.2698,
"step": 2380
},
{
"epoch": 4.77,
"eval_loss": 0.40085363388061523,
"eval_runtime": 31.3226,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2380
},
{
"epoch": 4.78,
"learning_rate": 4.176352705410822e-05,
"loss": 0.3917,
"step": 2385
},
{
"epoch": 4.79,
"learning_rate": 4.168336673346693e-05,
"loss": 0.3799,
"step": 2390
},
{
"epoch": 4.79,
"eval_loss": 0.40029290318489075,
"eval_runtime": 31.329,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 1.787,
"step": 2390
},
{
"epoch": 4.8,
"learning_rate": 4.1603206412825656e-05,
"loss": 0.3973,
"step": 2395
},
{
"epoch": 4.81,
"learning_rate": 4.152304609218437e-05,
"loss": 0.4888,
"step": 2400
},
{
"epoch": 4.81,
"eval_loss": 0.3974343240261078,
"eval_runtime": 31.3235,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2400
},
{
"epoch": 4.82,
"learning_rate": 4.144288577154309e-05,
"loss": 0.2457,
"step": 2405
},
{
"epoch": 4.83,
"learning_rate": 4.1362725450901806e-05,
"loss": 0.3795,
"step": 2410
},
{
"epoch": 4.83,
"eval_loss": 0.3994871973991394,
"eval_runtime": 31.3159,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2410
},
{
"epoch": 4.84,
"learning_rate": 4.128256513026052e-05,
"loss": 0.3535,
"step": 2415
},
{
"epoch": 4.85,
"learning_rate": 4.120240480961924e-05,
"loss": 0.4249,
"step": 2420
},
{
"epoch": 4.85,
"eval_loss": 0.3967938721179962,
"eval_runtime": 31.3239,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 2420
},
{
"epoch": 4.86,
"learning_rate": 4.112224448897796e-05,
"loss": 0.4219,
"step": 2425
},
{
"epoch": 4.87,
"learning_rate": 4.104208416833667e-05,
"loss": 0.4635,
"step": 2430
},
{
"epoch": 4.87,
"eval_loss": 0.4001442790031433,
"eval_runtime": 31.3171,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2430
},
{
"epoch": 4.88,
"learning_rate": 4.0961923847695396e-05,
"loss": 0.3439,
"step": 2435
},
{
"epoch": 4.89,
"learning_rate": 4.0881763527054106e-05,
"loss": 0.4965,
"step": 2440
},
{
"epoch": 4.89,
"eval_loss": 0.39344674348831177,
"eval_runtime": 31.3149,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2440
},
{
"epoch": 4.9,
"learning_rate": 4.080160320641283e-05,
"loss": 0.3569,
"step": 2445
},
{
"epoch": 4.91,
"learning_rate": 4.072144288577154e-05,
"loss": 0.3745,
"step": 2450
},
{
"epoch": 4.91,
"eval_loss": 0.3987390697002411,
"eval_runtime": 31.318,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 2450
},
{
"epoch": 4.92,
"learning_rate": 4.064128256513026e-05,
"loss": 0.4011,
"step": 2455
},
{
"epoch": 4.93,
"learning_rate": 4.056112224448898e-05,
"loss": 0.3601,
"step": 2460
},
{
"epoch": 4.93,
"eval_loss": 0.3985511064529419,
"eval_runtime": 31.3089,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 2460
},
{
"epoch": 4.94,
"learning_rate": 4.0480961923847696e-05,
"loss": 0.4154,
"step": 2465
},
{
"epoch": 4.95,
"learning_rate": 4.040080160320641e-05,
"loss": 0.2878,
"step": 2470
},
{
"epoch": 4.95,
"eval_loss": 0.39409133791923523,
"eval_runtime": 31.3205,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2470
},
{
"epoch": 4.96,
"learning_rate": 4.0320641282565136e-05,
"loss": 0.3433,
"step": 2475
},
{
"epoch": 4.97,
"learning_rate": 4.0240480961923846e-05,
"loss": 0.4297,
"step": 2480
},
{
"epoch": 4.97,
"eval_loss": 0.3889669179916382,
"eval_runtime": 31.3109,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 2480
},
{
"epoch": 4.98,
"learning_rate": 4.016032064128257e-05,
"loss": 0.3173,
"step": 2485
},
{
"epoch": 4.99,
"learning_rate": 4.008016032064128e-05,
"loss": 0.278,
"step": 2490
},
{
"epoch": 4.99,
"eval_loss": 0.3974519968032837,
"eval_runtime": 31.3206,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2490
},
{
"epoch": 5.0,
"learning_rate": 4e-05,
"loss": 0.3572,
"step": 2495
},
{
"epoch": 5.01,
"learning_rate": 3.991983967935872e-05,
"loss": 0.4509,
"step": 2500
},
{
"epoch": 5.01,
"eval_loss": 0.39074984192848206,
"eval_runtime": 31.3198,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 1.788,
"step": 2500
},
{
"epoch": 5.02,
"learning_rate": 3.983967935871744e-05,
"loss": 0.2818,
"step": 2505
},
{
"epoch": 5.03,
"learning_rate": 3.9759519038076153e-05,
"loss": 0.3202,
"step": 2510
},
{
"epoch": 5.03,
"eval_loss": 0.3872080147266388,
"eval_runtime": 31.1989,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 1.795,
"step": 2510
},
{
"epoch": 5.04,
"learning_rate": 3.967935871743487e-05,
"loss": 0.3529,
"step": 2515
},
{
"epoch": 5.05,
"learning_rate": 3.959919839679359e-05,
"loss": 0.3047,
"step": 2520
},
{
"epoch": 5.05,
"eval_loss": 0.3956039547920227,
"eval_runtime": 31.2729,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 1.791,
"step": 2520
},
{
"epoch": 5.06,
"learning_rate": 3.951903807615231e-05,
"loss": 0.3407,
"step": 2525
},
{
"epoch": 5.07,
"learning_rate": 3.943887775551103e-05,
"loss": 0.2931,
"step": 2530
},
{
"epoch": 5.07,
"eval_loss": 0.39254140853881836,
"eval_runtime": 31.2872,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2530
},
{
"epoch": 5.08,
"learning_rate": 3.9358717434869744e-05,
"loss": 0.3376,
"step": 2535
},
{
"epoch": 5.09,
"learning_rate": 3.927855711422846e-05,
"loss": 0.3487,
"step": 2540
},
{
"epoch": 5.09,
"eval_loss": 0.3909657597541809,
"eval_runtime": 31.2934,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2540
},
{
"epoch": 5.1,
"learning_rate": 3.919839679358718e-05,
"loss": 0.4409,
"step": 2545
},
{
"epoch": 5.11,
"learning_rate": 3.9118236472945894e-05,
"loss": 0.2792,
"step": 2550
},
{
"epoch": 5.11,
"eval_loss": 0.3901335895061493,
"eval_runtime": 31.3009,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 2550
},
{
"epoch": 5.12,
"learning_rate": 3.903807615230461e-05,
"loss": 0.2794,
"step": 2555
},
{
"epoch": 5.13,
"learning_rate": 3.895791583166333e-05,
"loss": 0.3446,
"step": 2560
},
{
"epoch": 5.13,
"eval_loss": 0.38729164004325867,
"eval_runtime": 31.2913,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2560
},
{
"epoch": 5.14,
"learning_rate": 3.8877755511022044e-05,
"loss": 0.3807,
"step": 2565
},
{
"epoch": 5.15,
"learning_rate": 3.879759519038076e-05,
"loss": 0.3482,
"step": 2570
},
{
"epoch": 5.15,
"eval_loss": 0.3839856684207916,
"eval_runtime": 31.2918,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2570
},
{
"epoch": 5.16,
"learning_rate": 3.8717434869739484e-05,
"loss": 0.333,
"step": 2575
},
{
"epoch": 5.17,
"learning_rate": 3.86372745490982e-05,
"loss": 0.3464,
"step": 2580
},
{
"epoch": 5.17,
"eval_loss": 0.38349151611328125,
"eval_runtime": 31.2954,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 2580
},
{
"epoch": 5.18,
"learning_rate": 3.855711422845692e-05,
"loss": 0.446,
"step": 2585
},
{
"epoch": 5.19,
"learning_rate": 3.8476953907815634e-05,
"loss": 0.3212,
"step": 2590
},
{
"epoch": 5.19,
"eval_loss": 0.3845639228820801,
"eval_runtime": 31.3009,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 2590
},
{
"epoch": 5.2,
"learning_rate": 3.839679358717435e-05,
"loss": 0.2797,
"step": 2595
},
{
"epoch": 5.21,
"learning_rate": 3.831663326653307e-05,
"loss": 0.3847,
"step": 2600
},
{
"epoch": 5.21,
"eval_loss": 0.3818623721599579,
"eval_runtime": 31.298,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 2600
},
{
"epoch": 5.22,
"learning_rate": 3.8236472945891784e-05,
"loss": 0.4431,
"step": 2605
},
{
"epoch": 5.23,
"learning_rate": 3.81563126252505e-05,
"loss": 0.3212,
"step": 2610
},
{
"epoch": 5.23,
"eval_loss": 0.38965049386024475,
"eval_runtime": 31.2995,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 2610
},
{
"epoch": 5.24,
"learning_rate": 3.807615230460922e-05,
"loss": 0.3441,
"step": 2615
},
{
"epoch": 5.25,
"learning_rate": 3.7995991983967935e-05,
"loss": 0.358,
"step": 2620
},
{
"epoch": 5.25,
"eval_loss": 0.381111204624176,
"eval_runtime": 31.2872,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2620
},
{
"epoch": 5.26,
"learning_rate": 3.791583166332665e-05,
"loss": 0.287,
"step": 2625
},
{
"epoch": 5.27,
"learning_rate": 3.7835671342685375e-05,
"loss": 0.3471,
"step": 2630
},
{
"epoch": 5.27,
"eval_loss": 0.38052383065223694,
"eval_runtime": 31.2873,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2630
},
{
"epoch": 5.28,
"learning_rate": 3.775551102204409e-05,
"loss": 0.2928,
"step": 2635
},
{
"epoch": 5.29,
"learning_rate": 3.767535070140281e-05,
"loss": 0.3348,
"step": 2640
},
{
"epoch": 5.29,
"eval_loss": 0.38677719235420227,
"eval_runtime": 31.2908,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2640
},
{
"epoch": 5.3,
"learning_rate": 3.7595190380761525e-05,
"loss": 0.3454,
"step": 2645
},
{
"epoch": 5.31,
"learning_rate": 3.751503006012025e-05,
"loss": 0.342,
"step": 2650
},
{
"epoch": 5.31,
"eval_loss": 0.3768896162509918,
"eval_runtime": 31.29,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2650
},
{
"epoch": 5.32,
"learning_rate": 3.7434869739478965e-05,
"loss": 0.2887,
"step": 2655
},
{
"epoch": 5.33,
"learning_rate": 3.735470941883768e-05,
"loss": 0.4504,
"step": 2660
},
{
"epoch": 5.33,
"eval_loss": 0.3774382770061493,
"eval_runtime": 31.2944,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 2660
},
{
"epoch": 5.34,
"learning_rate": 3.72745490981964e-05,
"loss": 0.3159,
"step": 2665
},
{
"epoch": 5.35,
"learning_rate": 3.7194388777555115e-05,
"loss": 0.2713,
"step": 2670
},
{
"epoch": 5.35,
"eval_loss": 0.3802602291107178,
"eval_runtime": 31.2859,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2670
},
{
"epoch": 5.36,
"learning_rate": 3.711422845691383e-05,
"loss": 0.3543,
"step": 2675
},
{
"epoch": 5.37,
"learning_rate": 3.703406813627255e-05,
"loss": 0.3848,
"step": 2680
},
{
"epoch": 5.37,
"eval_loss": 0.3776351511478424,
"eval_runtime": 31.2876,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2680
},
{
"epoch": 5.38,
"learning_rate": 3.6953907815631265e-05,
"loss": 0.3891,
"step": 2685
},
{
"epoch": 5.39,
"learning_rate": 3.687374749498998e-05,
"loss": 0.354,
"step": 2690
},
{
"epoch": 5.39,
"eval_loss": 0.3758071959018707,
"eval_runtime": 31.2908,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2690
},
{
"epoch": 5.4,
"learning_rate": 3.67935871743487e-05,
"loss": 0.3596,
"step": 2695
},
{
"epoch": 5.41,
"learning_rate": 3.671342685370742e-05,
"loss": 0.3796,
"step": 2700
},
{
"epoch": 5.41,
"eval_loss": 0.37596631050109863,
"eval_runtime": 31.2928,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2700
},
{
"epoch": 5.42,
"learning_rate": 3.663326653306614e-05,
"loss": 0.2658,
"step": 2705
},
{
"epoch": 5.43,
"learning_rate": 3.6553106212424856e-05,
"loss": 0.3654,
"step": 2710
},
{
"epoch": 5.43,
"eval_loss": 0.37372830510139465,
"eval_runtime": 31.2931,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2710
},
{
"epoch": 5.44,
"learning_rate": 3.647294589178357e-05,
"loss": 0.3026,
"step": 2715
},
{
"epoch": 5.45,
"learning_rate": 3.639278557114229e-05,
"loss": 0.3448,
"step": 2720
},
{
"epoch": 5.45,
"eval_loss": 0.38118383288383484,
"eval_runtime": 31.2914,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2720
},
{
"epoch": 5.46,
"learning_rate": 3.6312625250501006e-05,
"loss": 0.2915,
"step": 2725
},
{
"epoch": 5.47,
"learning_rate": 3.623246492985972e-05,
"loss": 0.355,
"step": 2730
},
{
"epoch": 5.47,
"eval_loss": 0.3758777379989624,
"eval_runtime": 31.293,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2730
},
{
"epoch": 5.48,
"learning_rate": 3.615230460921844e-05,
"loss": 0.3554,
"step": 2735
},
{
"epoch": 5.49,
"learning_rate": 3.6072144288577156e-05,
"loss": 0.288,
"step": 2740
},
{
"epoch": 5.49,
"eval_loss": 0.3711189031600952,
"eval_runtime": 31.2905,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2740
},
{
"epoch": 5.5,
"learning_rate": 3.599198396793587e-05,
"loss": 0.3532,
"step": 2745
},
{
"epoch": 5.51,
"learning_rate": 3.591182364729459e-05,
"loss": 0.2991,
"step": 2750
},
{
"epoch": 5.51,
"eval_loss": 0.3690561354160309,
"eval_runtime": 31.2895,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2750
},
{
"epoch": 5.52,
"learning_rate": 3.583166332665331e-05,
"loss": 0.2659,
"step": 2755
},
{
"epoch": 5.53,
"learning_rate": 3.575150300601203e-05,
"loss": 0.3443,
"step": 2760
},
{
"epoch": 5.53,
"eval_loss": 0.3708031177520752,
"eval_runtime": 31.292,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2760
},
{
"epoch": 5.54,
"learning_rate": 3.5671342685370746e-05,
"loss": 0.2478,
"step": 2765
},
{
"epoch": 5.55,
"learning_rate": 3.559118236472946e-05,
"loss": 0.3374,
"step": 2770
},
{
"epoch": 5.55,
"eval_loss": 0.36587995290756226,
"eval_runtime": 31.291,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2770
},
{
"epoch": 5.56,
"learning_rate": 3.551102204408818e-05,
"loss": 0.2512,
"step": 2775
},
{
"epoch": 5.57,
"learning_rate": 3.5430861723446896e-05,
"loss": 0.4078,
"step": 2780
},
{
"epoch": 5.57,
"eval_loss": 0.37093624472618103,
"eval_runtime": 31.2862,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2780
},
{
"epoch": 5.58,
"learning_rate": 3.535070140280561e-05,
"loss": 0.3944,
"step": 2785
},
{
"epoch": 5.59,
"learning_rate": 3.527054108216433e-05,
"loss": 0.2967,
"step": 2790
},
{
"epoch": 5.59,
"eval_loss": 0.3683302104473114,
"eval_runtime": 31.2839,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2790
},
{
"epoch": 5.6,
"learning_rate": 3.5190380761523046e-05,
"loss": 0.2733,
"step": 2795
},
{
"epoch": 5.61,
"learning_rate": 3.511022044088176e-05,
"loss": 0.3532,
"step": 2800
},
{
"epoch": 5.61,
"eval_loss": 0.3638099730014801,
"eval_runtime": 31.2946,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 2800
},
{
"epoch": 5.62,
"learning_rate": 3.503006012024049e-05,
"loss": 0.4112,
"step": 2805
},
{
"epoch": 5.63,
"learning_rate": 3.49498997995992e-05,
"loss": 0.4123,
"step": 2810
},
{
"epoch": 5.63,
"eval_loss": 0.36417555809020996,
"eval_runtime": 31.2926,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2810
},
{
"epoch": 5.64,
"learning_rate": 3.486973947895792e-05,
"loss": 0.4427,
"step": 2815
},
{
"epoch": 5.65,
"learning_rate": 3.478957915831664e-05,
"loss": 0.3195,
"step": 2820
},
{
"epoch": 5.65,
"eval_loss": 0.36552584171295166,
"eval_runtime": 31.2936,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2820
},
{
"epoch": 5.66,
"learning_rate": 3.4709418837675353e-05,
"loss": 0.2884,
"step": 2825
},
{
"epoch": 5.67,
"learning_rate": 3.462925851703407e-05,
"loss": 0.3161,
"step": 2830
},
{
"epoch": 5.67,
"eval_loss": 0.3598646819591522,
"eval_runtime": 31.292,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2830
},
{
"epoch": 5.68,
"learning_rate": 3.454909819639279e-05,
"loss": 0.323,
"step": 2835
},
{
"epoch": 5.69,
"learning_rate": 3.4468937875751504e-05,
"loss": 0.4152,
"step": 2840
},
{
"epoch": 5.69,
"eval_loss": 0.3621058464050293,
"eval_runtime": 31.2915,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2840
},
{
"epoch": 5.7,
"learning_rate": 3.438877755511022e-05,
"loss": 0.3332,
"step": 2845
},
{
"epoch": 5.71,
"learning_rate": 3.430861723446894e-05,
"loss": 0.2802,
"step": 2850
},
{
"epoch": 5.71,
"eval_loss": 0.36484986543655396,
"eval_runtime": 31.2933,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2850
},
{
"epoch": 5.72,
"learning_rate": 3.4228456913827654e-05,
"loss": 0.37,
"step": 2855
},
{
"epoch": 5.73,
"learning_rate": 3.414829659318638e-05,
"loss": 0.2909,
"step": 2860
},
{
"epoch": 5.73,
"eval_loss": 0.36040765047073364,
"eval_runtime": 31.2907,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2860
},
{
"epoch": 5.74,
"learning_rate": 3.4068136272545094e-05,
"loss": 0.3928,
"step": 2865
},
{
"epoch": 5.75,
"learning_rate": 3.398797595190381e-05,
"loss": 0.3105,
"step": 2870
},
{
"epoch": 5.75,
"eval_loss": 0.3604431748390198,
"eval_runtime": 31.2918,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2870
},
{
"epoch": 5.76,
"learning_rate": 3.390781563126253e-05,
"loss": 0.3766,
"step": 2875
},
{
"epoch": 5.77,
"learning_rate": 3.3827655310621244e-05,
"loss": 0.3291,
"step": 2880
},
{
"epoch": 5.77,
"eval_loss": 0.355290025472641,
"eval_runtime": 31.2907,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2880
},
{
"epoch": 5.78,
"learning_rate": 3.374749498997996e-05,
"loss": 0.3502,
"step": 2885
},
{
"epoch": 5.79,
"learning_rate": 3.366733466933868e-05,
"loss": 0.3916,
"step": 2890
},
{
"epoch": 5.79,
"eval_loss": 0.36026230454444885,
"eval_runtime": 31.2896,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2890
},
{
"epoch": 5.8,
"learning_rate": 3.3587174348697394e-05,
"loss": 0.31,
"step": 2895
},
{
"epoch": 5.81,
"learning_rate": 3.350701402805611e-05,
"loss": 0.3657,
"step": 2900
},
{
"epoch": 5.81,
"eval_loss": 0.3543720543384552,
"eval_runtime": 31.2877,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2900
},
{
"epoch": 5.82,
"learning_rate": 3.342685370741483e-05,
"loss": 0.3137,
"step": 2905
},
{
"epoch": 5.83,
"learning_rate": 3.3346693386773544e-05,
"loss": 0.3745,
"step": 2910
},
{
"epoch": 5.83,
"eval_loss": 0.35586270689964294,
"eval_runtime": 31.2748,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 2910
},
{
"epoch": 5.84,
"learning_rate": 3.326653306613227e-05,
"loss": 0.3075,
"step": 2915
},
{
"epoch": 5.85,
"learning_rate": 3.3186372745490984e-05,
"loss": 0.3281,
"step": 2920
},
{
"epoch": 5.85,
"eval_loss": 0.35174089670181274,
"eval_runtime": 31.2876,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2920
},
{
"epoch": 5.86,
"learning_rate": 3.31062124248497e-05,
"loss": 0.2976,
"step": 2925
},
{
"epoch": 5.87,
"learning_rate": 3.3026052104208425e-05,
"loss": 0.2892,
"step": 2930
},
{
"epoch": 5.87,
"eval_loss": 0.3550553023815155,
"eval_runtime": 31.2925,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2930
},
{
"epoch": 5.88,
"learning_rate": 3.294589178356714e-05,
"loss": 0.2826,
"step": 2935
},
{
"epoch": 5.89,
"learning_rate": 3.286573146292586e-05,
"loss": 0.4121,
"step": 2940
},
{
"epoch": 5.89,
"eval_loss": 0.34888410568237305,
"eval_runtime": 31.2869,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2940
},
{
"epoch": 5.9,
"learning_rate": 3.2785571142284575e-05,
"loss": 0.3921,
"step": 2945
},
{
"epoch": 5.91,
"learning_rate": 3.270541082164329e-05,
"loss": 0.2908,
"step": 2950
},
{
"epoch": 5.91,
"eval_loss": 0.35319554805755615,
"eval_runtime": 31.2849,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 2950
},
{
"epoch": 5.92,
"learning_rate": 3.262525050100201e-05,
"loss": 0.2526,
"step": 2955
},
{
"epoch": 5.93,
"learning_rate": 3.2545090180360725e-05,
"loss": 0.3677,
"step": 2960
},
{
"epoch": 5.93,
"eval_loss": 0.3468542993068695,
"eval_runtime": 31.2952,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 2960
},
{
"epoch": 5.94,
"learning_rate": 3.246492985971944e-05,
"loss": 0.303,
"step": 2965
},
{
"epoch": 5.95,
"learning_rate": 3.238476953907816e-05,
"loss": 0.341,
"step": 2970
},
{
"epoch": 5.95,
"eval_loss": 0.3503284752368927,
"eval_runtime": 31.2896,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 2970
},
{
"epoch": 5.96,
"learning_rate": 3.2304609218436875e-05,
"loss": 0.3035,
"step": 2975
},
{
"epoch": 5.97,
"learning_rate": 3.222444889779559e-05,
"loss": 0.2319,
"step": 2980
},
{
"epoch": 5.97,
"eval_loss": 0.3496892750263214,
"eval_runtime": 31.3066,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 2980
},
{
"epoch": 5.98,
"learning_rate": 3.2144288577154315e-05,
"loss": 0.2791,
"step": 2985
},
{
"epoch": 5.99,
"learning_rate": 3.206412825651303e-05,
"loss": 0.2624,
"step": 2990
},
{
"epoch": 5.99,
"eval_loss": 0.34680891036987305,
"eval_runtime": 31.2925,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 2990
},
{
"epoch": 6.0,
"learning_rate": 3.198396793587175e-05,
"loss": 0.2614,
"step": 2995
},
{
"epoch": 6.01,
"learning_rate": 3.1903807615230465e-05,
"loss": 0.3324,
"step": 3000
},
{
"epoch": 6.01,
"eval_loss": 0.3479662537574768,
"eval_runtime": 31.2947,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3000
},
{
"epoch": 6.02,
"learning_rate": 3.182364729458918e-05,
"loss": 0.3096,
"step": 3005
},
{
"epoch": 6.03,
"learning_rate": 3.17434869739479e-05,
"loss": 0.2114,
"step": 3010
},
{
"epoch": 6.03,
"eval_loss": 0.35304367542266846,
"eval_runtime": 31.2007,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 3010
},
{
"epoch": 6.04,
"learning_rate": 3.1663326653306616e-05,
"loss": 0.2836,
"step": 3015
},
{
"epoch": 6.05,
"learning_rate": 3.158316633266533e-05,
"loss": 0.256,
"step": 3020
},
{
"epoch": 6.05,
"eval_loss": 0.3500817120075226,
"eval_runtime": 31.2754,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.791,
"step": 3020
},
{
"epoch": 6.06,
"learning_rate": 3.150300601202405e-05,
"loss": 0.2611,
"step": 3025
},
{
"epoch": 6.07,
"learning_rate": 3.1422845691382766e-05,
"loss": 0.2716,
"step": 3030
},
{
"epoch": 6.07,
"eval_loss": 0.3489656150341034,
"eval_runtime": 31.2904,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3030
},
{
"epoch": 6.08,
"learning_rate": 3.134268537074149e-05,
"loss": 0.2764,
"step": 3035
},
{
"epoch": 6.09,
"learning_rate": 3.1262525050100206e-05,
"loss": 0.2921,
"step": 3040
},
{
"epoch": 6.09,
"eval_loss": 0.34664395451545715,
"eval_runtime": 31.2969,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3040
},
{
"epoch": 6.1,
"learning_rate": 3.118236472945892e-05,
"loss": 0.256,
"step": 3045
},
{
"epoch": 6.11,
"learning_rate": 3.110220440881764e-05,
"loss": 0.2924,
"step": 3050
},
{
"epoch": 6.11,
"eval_loss": 0.35313141345977783,
"eval_runtime": 31.2961,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3050
},
{
"epoch": 6.12,
"learning_rate": 3.1022044088176356e-05,
"loss": 0.3146,
"step": 3055
},
{
"epoch": 6.13,
"learning_rate": 3.094188376753507e-05,
"loss": 0.3267,
"step": 3060
},
{
"epoch": 6.13,
"eval_loss": 0.3454523980617523,
"eval_runtime": 31.2999,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3060
},
{
"epoch": 6.14,
"learning_rate": 3.086172344689379e-05,
"loss": 0.3476,
"step": 3065
},
{
"epoch": 6.15,
"learning_rate": 3.0781563126252506e-05,
"loss": 0.3488,
"step": 3070
},
{
"epoch": 6.15,
"eval_loss": 0.3427518606185913,
"eval_runtime": 31.2904,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3070
},
{
"epoch": 6.16,
"learning_rate": 3.070140280561122e-05,
"loss": 0.3351,
"step": 3075
},
{
"epoch": 6.17,
"learning_rate": 3.062124248496994e-05,
"loss": 0.301,
"step": 3080
},
{
"epoch": 6.17,
"eval_loss": 0.3455488681793213,
"eval_runtime": 31.288,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3080
},
{
"epoch": 6.18,
"learning_rate": 3.0541082164328656e-05,
"loss": 0.287,
"step": 3085
},
{
"epoch": 6.19,
"learning_rate": 3.0460921843687376e-05,
"loss": 0.2656,
"step": 3090
},
{
"epoch": 6.19,
"eval_loss": 0.34496939182281494,
"eval_runtime": 31.2947,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3090
},
{
"epoch": 6.2,
"learning_rate": 3.0380761523046093e-05,
"loss": 0.2814,
"step": 3095
},
{
"epoch": 6.21,
"learning_rate": 3.030060120240481e-05,
"loss": 0.2377,
"step": 3100
},
{
"epoch": 6.21,
"eval_loss": 0.3474458158016205,
"eval_runtime": 31.2977,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3100
},
{
"epoch": 6.22,
"learning_rate": 3.022044088176353e-05,
"loss": 0.2179,
"step": 3105
},
{
"epoch": 6.23,
"learning_rate": 3.0140280561122247e-05,
"loss": 0.2344,
"step": 3110
},
{
"epoch": 6.23,
"eval_loss": 0.34611740708351135,
"eval_runtime": 31.2932,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3110
},
{
"epoch": 6.24,
"learning_rate": 3.0060120240480963e-05,
"loss": 0.3124,
"step": 3115
},
{
"epoch": 6.25,
"learning_rate": 2.997995991983968e-05,
"loss": 0.2816,
"step": 3120
},
{
"epoch": 6.25,
"eval_loss": 0.3488862216472626,
"eval_runtime": 31.3005,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3120
},
{
"epoch": 6.26,
"learning_rate": 2.9899799599198397e-05,
"loss": 0.3506,
"step": 3125
},
{
"epoch": 6.27,
"learning_rate": 2.9819639278557117e-05,
"loss": 0.2675,
"step": 3130
},
{
"epoch": 6.27,
"eval_loss": 0.34268468618392944,
"eval_runtime": 31.3,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3130
},
{
"epoch": 6.28,
"learning_rate": 2.9739478957915833e-05,
"loss": 0.375,
"step": 3135
},
{
"epoch": 6.29,
"learning_rate": 2.965931863727455e-05,
"loss": 0.3315,
"step": 3140
},
{
"epoch": 6.29,
"eval_loss": 0.3393230736255646,
"eval_runtime": 31.3008,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3140
},
{
"epoch": 6.3,
"learning_rate": 2.9579158316633267e-05,
"loss": 0.2561,
"step": 3145
},
{
"epoch": 6.31,
"learning_rate": 2.9498997995991984e-05,
"loss": 0.335,
"step": 3150
},
{
"epoch": 6.31,
"eval_loss": 0.3406154215335846,
"eval_runtime": 31.3005,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3150
},
{
"epoch": 6.32,
"learning_rate": 2.9418837675350704e-05,
"loss": 0.3748,
"step": 3155
},
{
"epoch": 6.33,
"learning_rate": 2.933867735470942e-05,
"loss": 0.2418,
"step": 3160
},
{
"epoch": 6.33,
"eval_loss": 0.3384529948234558,
"eval_runtime": 31.2977,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3160
},
{
"epoch": 6.34,
"learning_rate": 2.9258517034068137e-05,
"loss": 0.3214,
"step": 3165
},
{
"epoch": 6.35,
"learning_rate": 2.9178356713426854e-05,
"loss": 0.215,
"step": 3170
},
{
"epoch": 6.35,
"eval_loss": 0.33930283784866333,
"eval_runtime": 31.3001,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3170
},
{
"epoch": 6.36,
"learning_rate": 2.909819639278557e-05,
"loss": 0.2009,
"step": 3175
},
{
"epoch": 6.37,
"learning_rate": 2.9018036072144287e-05,
"loss": 0.2279,
"step": 3180
},
{
"epoch": 6.37,
"eval_loss": 0.342680424451828,
"eval_runtime": 31.3037,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3180
},
{
"epoch": 6.38,
"learning_rate": 2.8937875751503007e-05,
"loss": 0.2684,
"step": 3185
},
{
"epoch": 6.39,
"learning_rate": 2.8857715430861727e-05,
"loss": 0.2907,
"step": 3190
},
{
"epoch": 6.39,
"eval_loss": 0.33792585134506226,
"eval_runtime": 31.3001,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3190
},
{
"epoch": 6.4,
"learning_rate": 2.8777555110220444e-05,
"loss": 0.284,
"step": 3195
},
{
"epoch": 6.41,
"learning_rate": 2.8697394789579164e-05,
"loss": 0.2184,
"step": 3200
},
{
"epoch": 6.41,
"eval_loss": 0.34375742077827454,
"eval_runtime": 31.3016,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3200
},
{
"epoch": 6.42,
"learning_rate": 2.861723446893788e-05,
"loss": 0.3549,
"step": 3205
},
{
"epoch": 6.43,
"learning_rate": 2.8537074148296598e-05,
"loss": 0.3484,
"step": 3210
},
{
"epoch": 6.43,
"eval_loss": 0.33638885617256165,
"eval_runtime": 31.3083,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3210
},
{
"epoch": 6.44,
"learning_rate": 2.8456913827655314e-05,
"loss": 0.2836,
"step": 3215
},
{
"epoch": 6.45,
"learning_rate": 2.837675350701403e-05,
"loss": 0.2327,
"step": 3220
},
{
"epoch": 6.45,
"eval_loss": 0.3405826687812805,
"eval_runtime": 31.3055,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3220
},
{
"epoch": 6.46,
"learning_rate": 2.8296593186372748e-05,
"loss": 0.2421,
"step": 3225
},
{
"epoch": 6.47,
"learning_rate": 2.8216432865731468e-05,
"loss": 0.2571,
"step": 3230
},
{
"epoch": 6.47,
"eval_loss": 0.3399759531021118,
"eval_runtime": 31.3102,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 3230
},
{
"epoch": 6.48,
"learning_rate": 2.8136272545090185e-05,
"loss": 0.2913,
"step": 3235
},
{
"epoch": 6.49,
"learning_rate": 2.80561122244489e-05,
"loss": 0.2864,
"step": 3240
},
{
"epoch": 6.49,
"eval_loss": 0.3367003798484802,
"eval_runtime": 31.3053,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3240
},
{
"epoch": 6.5,
"learning_rate": 2.7975951903807618e-05,
"loss": 0.2917,
"step": 3245
},
{
"epoch": 6.51,
"learning_rate": 2.7895791583166335e-05,
"loss": 0.2383,
"step": 3250
},
{
"epoch": 6.51,
"eval_loss": 0.3376709818840027,
"eval_runtime": 31.3097,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 3250
},
{
"epoch": 6.52,
"learning_rate": 2.7815631262525055e-05,
"loss": 0.2364,
"step": 3255
},
{
"epoch": 6.53,
"learning_rate": 2.773547094188377e-05,
"loss": 0.187,
"step": 3260
},
{
"epoch": 6.53,
"eval_loss": 0.334637314081192,
"eval_runtime": 31.3083,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3260
},
{
"epoch": 6.54,
"learning_rate": 2.7655310621242488e-05,
"loss": 0.2886,
"step": 3265
},
{
"epoch": 6.55,
"learning_rate": 2.7575150300601205e-05,
"loss": 0.2453,
"step": 3270
},
{
"epoch": 6.55,
"eval_loss": 0.33486467599868774,
"eval_runtime": 31.3013,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3270
},
{
"epoch": 6.56,
"learning_rate": 2.749498997995992e-05,
"loss": 0.3396,
"step": 3275
},
{
"epoch": 6.57,
"learning_rate": 2.7414829659318642e-05,
"loss": 0.296,
"step": 3280
},
{
"epoch": 6.57,
"eval_loss": 0.33391273021698,
"eval_runtime": 31.3077,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3280
},
{
"epoch": 6.58,
"learning_rate": 2.733466933867736e-05,
"loss": 0.2475,
"step": 3285
},
{
"epoch": 6.59,
"learning_rate": 2.7254509018036075e-05,
"loss": 0.2601,
"step": 3290
},
{
"epoch": 6.59,
"eval_loss": 0.3335227370262146,
"eval_runtime": 31.2966,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3290
},
{
"epoch": 6.6,
"learning_rate": 2.7174348697394792e-05,
"loss": 0.2877,
"step": 3295
},
{
"epoch": 6.61,
"learning_rate": 2.709418837675351e-05,
"loss": 0.2927,
"step": 3300
},
{
"epoch": 6.61,
"eval_loss": 0.33400464057922363,
"eval_runtime": 31.2901,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3300
},
{
"epoch": 6.62,
"learning_rate": 2.701402805611223e-05,
"loss": 0.246,
"step": 3305
},
{
"epoch": 6.63,
"learning_rate": 2.6933867735470945e-05,
"loss": 0.2796,
"step": 3310
},
{
"epoch": 6.63,
"eval_loss": 0.33034074306488037,
"eval_runtime": 31.2965,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3310
},
{
"epoch": 6.64,
"learning_rate": 2.6853707414829662e-05,
"loss": 0.307,
"step": 3315
},
{
"epoch": 6.65,
"learning_rate": 2.677354709418838e-05,
"loss": 0.2393,
"step": 3320
},
{
"epoch": 6.65,
"eval_loss": 0.3351325988769531,
"eval_runtime": 31.3032,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3320
},
{
"epoch": 6.66,
"learning_rate": 2.6693386773547095e-05,
"loss": 0.3415,
"step": 3325
},
{
"epoch": 6.67,
"learning_rate": 2.6613226452905812e-05,
"loss": 0.2764,
"step": 3330
},
{
"epoch": 6.67,
"eval_loss": 0.3287622332572937,
"eval_runtime": 31.3032,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3330
},
{
"epoch": 6.68,
"learning_rate": 2.6533066132264532e-05,
"loss": 0.3419,
"step": 3335
},
{
"epoch": 6.69,
"learning_rate": 2.645290581162325e-05,
"loss": 0.2547,
"step": 3340
},
{
"epoch": 6.69,
"eval_loss": 0.3326658010482788,
"eval_runtime": 31.3149,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 3340
},
{
"epoch": 6.7,
"learning_rate": 2.6372745490981966e-05,
"loss": 0.2263,
"step": 3345
},
{
"epoch": 6.71,
"learning_rate": 2.6292585170340682e-05,
"loss": 0.3247,
"step": 3350
},
{
"epoch": 6.71,
"eval_loss": 0.3279091715812683,
"eval_runtime": 31.3129,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 3350
},
{
"epoch": 6.72,
"learning_rate": 2.62124248496994e-05,
"loss": 0.2162,
"step": 3355
},
{
"epoch": 6.73,
"learning_rate": 2.613226452905812e-05,
"loss": 0.3217,
"step": 3360
},
{
"epoch": 6.73,
"eval_loss": 0.32833293080329895,
"eval_runtime": 31.3104,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 3360
},
{
"epoch": 6.74,
"learning_rate": 2.6052104208416836e-05,
"loss": 0.3017,
"step": 3365
},
{
"epoch": 6.75,
"learning_rate": 2.5971943887775553e-05,
"loss": 0.2881,
"step": 3370
},
{
"epoch": 6.75,
"eval_loss": 0.33072584867477417,
"eval_runtime": 31.3091,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3370
},
{
"epoch": 6.76,
"learning_rate": 2.589178356713427e-05,
"loss": 0.2871,
"step": 3375
},
{
"epoch": 6.77,
"learning_rate": 2.5811623246492986e-05,
"loss": 0.2897,
"step": 3380
},
{
"epoch": 6.77,
"eval_loss": 0.3281080722808838,
"eval_runtime": 31.3013,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3380
},
{
"epoch": 6.78,
"learning_rate": 2.5731462925851706e-05,
"loss": 0.2671,
"step": 3385
},
{
"epoch": 6.79,
"learning_rate": 2.5651302605210423e-05,
"loss": 0.3096,
"step": 3390
},
{
"epoch": 6.79,
"eval_loss": 0.3256888687610626,
"eval_runtime": 31.3048,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3390
},
{
"epoch": 6.8,
"learning_rate": 2.557114228456914e-05,
"loss": 0.3105,
"step": 3395
},
{
"epoch": 6.81,
"learning_rate": 2.5490981963927856e-05,
"loss": 0.2463,
"step": 3400
},
{
"epoch": 6.81,
"eval_loss": 0.3243662714958191,
"eval_runtime": 31.3033,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3400
},
{
"epoch": 6.82,
"learning_rate": 2.5410821643286573e-05,
"loss": 0.3127,
"step": 3405
},
{
"epoch": 6.83,
"learning_rate": 2.533066132264529e-05,
"loss": 0.2404,
"step": 3410
},
{
"epoch": 6.83,
"eval_loss": 0.32544904947280884,
"eval_runtime": 31.3022,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3410
},
{
"epoch": 6.84,
"learning_rate": 2.525050100200401e-05,
"loss": 0.2506,
"step": 3415
},
{
"epoch": 6.85,
"learning_rate": 2.5170340681362726e-05,
"loss": 0.2907,
"step": 3420
},
{
"epoch": 6.85,
"eval_loss": 0.32269883155822754,
"eval_runtime": 31.2987,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3420
},
{
"epoch": 6.86,
"learning_rate": 2.5090180360721443e-05,
"loss": 0.2479,
"step": 3425
},
{
"epoch": 6.87,
"learning_rate": 2.501002004008016e-05,
"loss": 0.2749,
"step": 3430
},
{
"epoch": 6.87,
"eval_loss": 0.3225778043270111,
"eval_runtime": 31.3034,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3430
},
{
"epoch": 6.88,
"learning_rate": 2.4929859719438877e-05,
"loss": 0.2815,
"step": 3435
},
{
"epoch": 6.89,
"learning_rate": 2.4849699398797597e-05,
"loss": 0.2262,
"step": 3440
},
{
"epoch": 6.89,
"eval_loss": 0.32259687781333923,
"eval_runtime": 31.3032,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3440
},
{
"epoch": 6.9,
"learning_rate": 2.4769539078156313e-05,
"loss": 0.3338,
"step": 3445
},
{
"epoch": 6.91,
"learning_rate": 2.468937875751503e-05,
"loss": 0.2799,
"step": 3450
},
{
"epoch": 6.91,
"eval_loss": 0.3232540488243103,
"eval_runtime": 31.2972,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3450
},
{
"epoch": 6.92,
"learning_rate": 2.4609218436873747e-05,
"loss": 0.2203,
"step": 3455
},
{
"epoch": 6.93,
"learning_rate": 2.4529058116232464e-05,
"loss": 0.2764,
"step": 3460
},
{
"epoch": 6.93,
"eval_loss": 0.31978297233581543,
"eval_runtime": 31.3045,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3460
},
{
"epoch": 6.94,
"learning_rate": 2.4448897795591184e-05,
"loss": 0.3449,
"step": 3465
},
{
"epoch": 6.95,
"learning_rate": 2.4368737474949904e-05,
"loss": 0.2644,
"step": 3470
},
{
"epoch": 6.95,
"eval_loss": 0.3230922222137451,
"eval_runtime": 31.3046,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3470
},
{
"epoch": 6.96,
"learning_rate": 2.428857715430862e-05,
"loss": 0.2169,
"step": 3475
},
{
"epoch": 6.97,
"learning_rate": 2.4208416833667337e-05,
"loss": 0.2733,
"step": 3480
},
{
"epoch": 6.97,
"eval_loss": 0.3187558650970459,
"eval_runtime": 31.3014,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3480
},
{
"epoch": 6.98,
"learning_rate": 2.4128256513026057e-05,
"loss": 0.2509,
"step": 3485
},
{
"epoch": 6.99,
"learning_rate": 2.4048096192384774e-05,
"loss": 0.2861,
"step": 3490
},
{
"epoch": 6.99,
"eval_loss": 0.3191593587398529,
"eval_runtime": 31.3055,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3490
},
{
"epoch": 7.0,
"learning_rate": 2.396793587174349e-05,
"loss": 0.3167,
"step": 3495
},
{
"epoch": 7.01,
"learning_rate": 2.3887775551102207e-05,
"loss": 0.1757,
"step": 3500
},
{
"epoch": 7.01,
"eval_loss": 0.3242829144001007,
"eval_runtime": 31.3027,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3500
},
{
"epoch": 7.02,
"learning_rate": 2.3807615230460924e-05,
"loss": 0.2188,
"step": 3505
},
{
"epoch": 7.03,
"learning_rate": 2.3727454909819644e-05,
"loss": 0.2588,
"step": 3510
},
{
"epoch": 7.03,
"eval_loss": 0.3237724304199219,
"eval_runtime": 31.201,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 3510
},
{
"epoch": 7.04,
"learning_rate": 2.364729458917836e-05,
"loss": 0.2842,
"step": 3515
},
{
"epoch": 7.05,
"learning_rate": 2.3567134268537078e-05,
"loss": 0.2132,
"step": 3520
},
{
"epoch": 7.05,
"eval_loss": 0.3207135498523712,
"eval_runtime": 31.2833,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 3520
},
{
"epoch": 7.06,
"learning_rate": 2.3486973947895794e-05,
"loss": 0.2001,
"step": 3525
},
{
"epoch": 7.07,
"learning_rate": 2.340681362725451e-05,
"loss": 0.2787,
"step": 3530
},
{
"epoch": 7.07,
"eval_loss": 0.32723355293273926,
"eval_runtime": 31.2924,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3530
},
{
"epoch": 7.08,
"learning_rate": 2.332665330661323e-05,
"loss": 0.1942,
"step": 3535
},
{
"epoch": 7.09,
"learning_rate": 2.3246492985971948e-05,
"loss": 0.2786,
"step": 3540
},
{
"epoch": 7.09,
"eval_loss": 0.32288235425949097,
"eval_runtime": 31.2971,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3540
},
{
"epoch": 7.1,
"learning_rate": 2.3166332665330665e-05,
"loss": 0.1944,
"step": 3545
},
{
"epoch": 7.11,
"learning_rate": 2.308617234468938e-05,
"loss": 0.2854,
"step": 3550
},
{
"epoch": 7.11,
"eval_loss": 0.32317566871643066,
"eval_runtime": 31.2981,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3550
},
{
"epoch": 7.12,
"learning_rate": 2.3006012024048098e-05,
"loss": 0.224,
"step": 3555
},
{
"epoch": 7.13,
"learning_rate": 2.2925851703406815e-05,
"loss": 0.1982,
"step": 3560
},
{
"epoch": 7.13,
"eval_loss": 0.3237103521823883,
"eval_runtime": 31.3037,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3560
},
{
"epoch": 7.14,
"learning_rate": 2.2845691382765535e-05,
"loss": 0.2057,
"step": 3565
},
{
"epoch": 7.15,
"learning_rate": 2.276553106212425e-05,
"loss": 0.2022,
"step": 3570
},
{
"epoch": 7.15,
"eval_loss": 0.3253968060016632,
"eval_runtime": 31.3005,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3570
},
{
"epoch": 7.16,
"learning_rate": 2.2685370741482968e-05,
"loss": 0.1863,
"step": 3575
},
{
"epoch": 7.17,
"learning_rate": 2.2605210420841685e-05,
"loss": 0.2592,
"step": 3580
},
{
"epoch": 7.17,
"eval_loss": 0.32580825686454773,
"eval_runtime": 31.2975,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3580
},
{
"epoch": 7.18,
"learning_rate": 2.25250501002004e-05,
"loss": 0.2837,
"step": 3585
},
{
"epoch": 7.19,
"learning_rate": 2.2444889779559122e-05,
"loss": 0.2299,
"step": 3590
},
{
"epoch": 7.19,
"eval_loss": 0.32067885994911194,
"eval_runtime": 31.2956,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3590
},
{
"epoch": 7.2,
"learning_rate": 2.236472945891784e-05,
"loss": 0.1787,
"step": 3595
},
{
"epoch": 7.21,
"learning_rate": 2.2284569138276555e-05,
"loss": 0.2054,
"step": 3600
},
{
"epoch": 7.21,
"eval_loss": 0.31973427534103394,
"eval_runtime": 31.2985,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3600
},
{
"epoch": 7.22,
"learning_rate": 2.2204408817635272e-05,
"loss": 0.2326,
"step": 3605
},
{
"epoch": 7.23,
"learning_rate": 2.212424849699399e-05,
"loss": 0.208,
"step": 3610
},
{
"epoch": 7.23,
"eval_loss": 0.32161736488342285,
"eval_runtime": 31.2934,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3610
},
{
"epoch": 7.24,
"learning_rate": 2.204408817635271e-05,
"loss": 0.283,
"step": 3615
},
{
"epoch": 7.25,
"learning_rate": 2.1963927855711425e-05,
"loss": 0.2432,
"step": 3620
},
{
"epoch": 7.25,
"eval_loss": 0.3228018283843994,
"eval_runtime": 31.2953,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3620
},
{
"epoch": 7.26,
"learning_rate": 2.1883767535070142e-05,
"loss": 0.2692,
"step": 3625
},
{
"epoch": 7.27,
"learning_rate": 2.180360721442886e-05,
"loss": 0.2452,
"step": 3630
},
{
"epoch": 7.27,
"eval_loss": 0.3180503249168396,
"eval_runtime": 31.2954,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3630
},
{
"epoch": 7.28,
"learning_rate": 2.1723446893787575e-05,
"loss": 0.284,
"step": 3635
},
{
"epoch": 7.29,
"learning_rate": 2.1643286573146292e-05,
"loss": 0.264,
"step": 3640
},
{
"epoch": 7.29,
"eval_loss": 0.3237510919570923,
"eval_runtime": 31.3062,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3640
},
{
"epoch": 7.3,
"learning_rate": 2.1563126252505012e-05,
"loss": 0.1818,
"step": 3645
},
{
"epoch": 7.31,
"learning_rate": 2.148296593186373e-05,
"loss": 0.2019,
"step": 3650
},
{
"epoch": 7.31,
"eval_loss": 0.3177684545516968,
"eval_runtime": 31.3017,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3650
},
{
"epoch": 7.32,
"learning_rate": 2.1402805611222446e-05,
"loss": 0.2007,
"step": 3655
},
{
"epoch": 7.33,
"learning_rate": 2.1322645290581162e-05,
"loss": 0.2299,
"step": 3660
},
{
"epoch": 7.33,
"eval_loss": 0.32181212306022644,
"eval_runtime": 31.3118,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 3660
},
{
"epoch": 7.34,
"learning_rate": 2.124248496993988e-05,
"loss": 0.242,
"step": 3665
},
{
"epoch": 7.35,
"learning_rate": 2.11623246492986e-05,
"loss": 0.2465,
"step": 3670
},
{
"epoch": 7.35,
"eval_loss": 0.3172205090522766,
"eval_runtime": 31.3041,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3670
},
{
"epoch": 7.36,
"learning_rate": 2.1082164328657316e-05,
"loss": 0.2447,
"step": 3675
},
{
"epoch": 7.37,
"learning_rate": 2.1002004008016033e-05,
"loss": 0.2466,
"step": 3680
},
{
"epoch": 7.37,
"eval_loss": 0.3167315125465393,
"eval_runtime": 31.3046,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3680
},
{
"epoch": 7.38,
"learning_rate": 2.092184368737475e-05,
"loss": 0.3394,
"step": 3685
},
{
"epoch": 7.39,
"learning_rate": 2.0841683366733466e-05,
"loss": 0.2824,
"step": 3690
},
{
"epoch": 7.39,
"eval_loss": 0.3142654299736023,
"eval_runtime": 31.3029,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3690
},
{
"epoch": 7.4,
"learning_rate": 2.0761523046092186e-05,
"loss": 0.2809,
"step": 3695
},
{
"epoch": 7.41,
"learning_rate": 2.0681362725450903e-05,
"loss": 0.2314,
"step": 3700
},
{
"epoch": 7.41,
"eval_loss": 0.3143279254436493,
"eval_runtime": 31.3083,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3700
},
{
"epoch": 7.42,
"learning_rate": 2.060120240480962e-05,
"loss": 0.2344,
"step": 3705
},
{
"epoch": 7.43,
"learning_rate": 2.0521042084168336e-05,
"loss": 0.2822,
"step": 3710
},
{
"epoch": 7.43,
"eval_loss": 0.31426194310188293,
"eval_runtime": 31.3051,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3710
},
{
"epoch": 7.44,
"learning_rate": 2.0440881763527053e-05,
"loss": 0.2441,
"step": 3715
},
{
"epoch": 7.45,
"learning_rate": 2.036072144288577e-05,
"loss": 0.2254,
"step": 3720
},
{
"epoch": 7.45,
"eval_loss": 0.3139478862285614,
"eval_runtime": 31.3076,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3720
},
{
"epoch": 7.46,
"learning_rate": 2.028056112224449e-05,
"loss": 0.3121,
"step": 3725
},
{
"epoch": 7.47,
"learning_rate": 2.0200400801603206e-05,
"loss": 0.2454,
"step": 3730
},
{
"epoch": 7.47,
"eval_loss": 0.32176563143730164,
"eval_runtime": 31.3037,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3730
},
{
"epoch": 7.48,
"learning_rate": 2.0120240480961923e-05,
"loss": 0.178,
"step": 3735
},
{
"epoch": 7.49,
"learning_rate": 2.004008016032064e-05,
"loss": 0.2656,
"step": 3740
},
{
"epoch": 7.49,
"eval_loss": 0.3115682899951935,
"eval_runtime": 31.3043,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3740
},
{
"epoch": 7.5,
"learning_rate": 1.995991983967936e-05,
"loss": 0.2158,
"step": 3745
},
{
"epoch": 7.51,
"learning_rate": 1.9879759519038077e-05,
"loss": 0.2172,
"step": 3750
},
{
"epoch": 7.51,
"eval_loss": 0.3154009282588959,
"eval_runtime": 31.3017,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3750
},
{
"epoch": 7.52,
"learning_rate": 1.9799599198396793e-05,
"loss": 0.1828,
"step": 3755
},
{
"epoch": 7.53,
"learning_rate": 1.9719438877755514e-05,
"loss": 0.2408,
"step": 3760
},
{
"epoch": 7.53,
"eval_loss": 0.3127301037311554,
"eval_runtime": 31.2971,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3760
},
{
"epoch": 7.54,
"learning_rate": 1.963927855711423e-05,
"loss": 0.262,
"step": 3765
},
{
"epoch": 7.55,
"learning_rate": 1.9559118236472947e-05,
"loss": 0.1761,
"step": 3770
},
{
"epoch": 7.55,
"eval_loss": 0.3148895502090454,
"eval_runtime": 31.2992,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3770
},
{
"epoch": 7.56,
"learning_rate": 1.9478957915831664e-05,
"loss": 0.2206,
"step": 3775
},
{
"epoch": 7.57,
"learning_rate": 1.939879759519038e-05,
"loss": 0.2232,
"step": 3780
},
{
"epoch": 7.57,
"eval_loss": 0.3114352226257324,
"eval_runtime": 31.2962,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3780
},
{
"epoch": 7.58,
"learning_rate": 1.93186372745491e-05,
"loss": 0.2738,
"step": 3785
},
{
"epoch": 7.59,
"learning_rate": 1.9238476953907817e-05,
"loss": 0.2902,
"step": 3790
},
{
"epoch": 7.59,
"eval_loss": 0.3135593831539154,
"eval_runtime": 31.304,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3790
},
{
"epoch": 7.6,
"learning_rate": 1.9158316633266534e-05,
"loss": 0.1814,
"step": 3795
},
{
"epoch": 7.61,
"learning_rate": 1.907815631262525e-05,
"loss": 0.2485,
"step": 3800
},
{
"epoch": 7.61,
"eval_loss": 0.31458309292793274,
"eval_runtime": 31.3021,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3800
},
{
"epoch": 7.62,
"learning_rate": 1.8997995991983967e-05,
"loss": 0.2788,
"step": 3805
},
{
"epoch": 7.63,
"learning_rate": 1.8917835671342687e-05,
"loss": 0.1901,
"step": 3810
},
{
"epoch": 7.63,
"eval_loss": 0.30936095118522644,
"eval_runtime": 31.3065,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3810
},
{
"epoch": 7.64,
"learning_rate": 1.8837675350701404e-05,
"loss": 0.2153,
"step": 3815
},
{
"epoch": 7.65,
"learning_rate": 1.8757515030060124e-05,
"loss": 0.2962,
"step": 3820
},
{
"epoch": 7.65,
"eval_loss": 0.3120403587818146,
"eval_runtime": 31.3087,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 3820
},
{
"epoch": 7.66,
"learning_rate": 1.867735470941884e-05,
"loss": 0.253,
"step": 3825
},
{
"epoch": 7.67,
"learning_rate": 1.8597194388777558e-05,
"loss": 0.2093,
"step": 3830
},
{
"epoch": 7.67,
"eval_loss": 0.31331053376197815,
"eval_runtime": 31.2958,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3830
},
{
"epoch": 7.68,
"learning_rate": 1.8517034068136274e-05,
"loss": 0.2398,
"step": 3835
},
{
"epoch": 7.69,
"learning_rate": 1.843687374749499e-05,
"loss": 0.368,
"step": 3840
},
{
"epoch": 7.69,
"eval_loss": 0.3063763976097107,
"eval_runtime": 31.3038,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 3840
},
{
"epoch": 7.7,
"learning_rate": 1.835671342685371e-05,
"loss": 0.2372,
"step": 3845
},
{
"epoch": 7.71,
"learning_rate": 1.8276553106212428e-05,
"loss": 0.2849,
"step": 3850
},
{
"epoch": 7.71,
"eval_loss": 0.3091437518596649,
"eval_runtime": 31.2956,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3850
},
{
"epoch": 7.72,
"learning_rate": 1.8196392785571145e-05,
"loss": 0.2551,
"step": 3855
},
{
"epoch": 7.73,
"learning_rate": 1.811623246492986e-05,
"loss": 0.1948,
"step": 3860
},
{
"epoch": 7.73,
"eval_loss": 0.3075093924999237,
"eval_runtime": 31.2948,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3860
},
{
"epoch": 7.74,
"learning_rate": 1.8036072144288578e-05,
"loss": 0.2395,
"step": 3865
},
{
"epoch": 7.75,
"learning_rate": 1.7955911823647295e-05,
"loss": 0.2241,
"step": 3870
},
{
"epoch": 7.75,
"eval_loss": 0.30777955055236816,
"eval_runtime": 31.2961,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3870
},
{
"epoch": 7.76,
"learning_rate": 1.7875751503006015e-05,
"loss": 0.2796,
"step": 3875
},
{
"epoch": 7.77,
"learning_rate": 1.779559118236473e-05,
"loss": 0.1935,
"step": 3880
},
{
"epoch": 7.77,
"eval_loss": 0.30446866154670715,
"eval_runtime": 31.2946,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3880
},
{
"epoch": 7.78,
"learning_rate": 1.7715430861723448e-05,
"loss": 0.2199,
"step": 3885
},
{
"epoch": 7.79,
"learning_rate": 1.7635270541082165e-05,
"loss": 0.2045,
"step": 3890
},
{
"epoch": 7.79,
"eval_loss": 0.3065406084060669,
"eval_runtime": 31.2936,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3890
},
{
"epoch": 7.8,
"learning_rate": 1.755511022044088e-05,
"loss": 0.1856,
"step": 3895
},
{
"epoch": 7.81,
"learning_rate": 1.74749498997996e-05,
"loss": 0.159,
"step": 3900
},
{
"epoch": 7.81,
"eval_loss": 0.30820584297180176,
"eval_runtime": 31.2972,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3900
},
{
"epoch": 7.82,
"learning_rate": 1.739478957915832e-05,
"loss": 0.3111,
"step": 3905
},
{
"epoch": 7.83,
"learning_rate": 1.7314629258517035e-05,
"loss": 0.1714,
"step": 3910
},
{
"epoch": 7.83,
"eval_loss": 0.3057255148887634,
"eval_runtime": 31.2961,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 3910
},
{
"epoch": 7.84,
"learning_rate": 1.7234468937875752e-05,
"loss": 0.1983,
"step": 3915
},
{
"epoch": 7.85,
"learning_rate": 1.715430861723447e-05,
"loss": 0.1984,
"step": 3920
},
{
"epoch": 7.85,
"eval_loss": 0.3059474527835846,
"eval_runtime": 31.3004,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3920
},
{
"epoch": 7.86,
"learning_rate": 1.707414829659319e-05,
"loss": 0.2186,
"step": 3925
},
{
"epoch": 7.87,
"learning_rate": 1.6993987975951905e-05,
"loss": 0.2397,
"step": 3930
},
{
"epoch": 7.87,
"eval_loss": 0.30371129512786865,
"eval_runtime": 31.2982,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3930
},
{
"epoch": 7.88,
"learning_rate": 1.6913827655310622e-05,
"loss": 0.2559,
"step": 3935
},
{
"epoch": 7.89,
"learning_rate": 1.683366733466934e-05,
"loss": 0.1884,
"step": 3940
},
{
"epoch": 7.89,
"eval_loss": 0.3053794205188751,
"eval_runtime": 31.2925,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3940
},
{
"epoch": 7.9,
"learning_rate": 1.6753507014028055e-05,
"loss": 0.2414,
"step": 3945
},
{
"epoch": 7.91,
"learning_rate": 1.6673346693386772e-05,
"loss": 0.2585,
"step": 3950
},
{
"epoch": 7.91,
"eval_loss": 0.30300775170326233,
"eval_runtime": 31.3,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 3950
},
{
"epoch": 7.92,
"learning_rate": 1.6593186372745492e-05,
"loss": 0.2049,
"step": 3955
},
{
"epoch": 7.93,
"learning_rate": 1.6513026052104212e-05,
"loss": 0.2476,
"step": 3960
},
{
"epoch": 7.93,
"eval_loss": 0.30583810806274414,
"eval_runtime": 31.2909,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3960
},
{
"epoch": 7.94,
"learning_rate": 1.643286573146293e-05,
"loss": 0.2572,
"step": 3965
},
{
"epoch": 7.95,
"learning_rate": 1.6352705410821646e-05,
"loss": 0.2525,
"step": 3970
},
{
"epoch": 7.95,
"eval_loss": 0.3032612204551697,
"eval_runtime": 31.2935,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3970
},
{
"epoch": 7.96,
"learning_rate": 1.6272545090180362e-05,
"loss": 0.2901,
"step": 3975
},
{
"epoch": 7.97,
"learning_rate": 1.619238476953908e-05,
"loss": 0.2001,
"step": 3980
},
{
"epoch": 7.97,
"eval_loss": 0.30624955892562866,
"eval_runtime": 31.2914,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 3980
},
{
"epoch": 7.98,
"learning_rate": 1.6112224448897796e-05,
"loss": 0.1784,
"step": 3985
},
{
"epoch": 7.99,
"learning_rate": 1.6032064128256516e-05,
"loss": 0.1985,
"step": 3990
},
{
"epoch": 7.99,
"eval_loss": 0.3039126992225647,
"eval_runtime": 31.2929,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 3990
},
{
"epoch": 8.01,
"learning_rate": 1.5951903807615233e-05,
"loss": 0.247,
"step": 3995
},
{
"epoch": 8.02,
"learning_rate": 1.587174348697395e-05,
"loss": 0.1984,
"step": 4000
},
{
"epoch": 8.02,
"eval_loss": 0.3139249384403229,
"eval_runtime": 31.2927,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4000
},
{
"epoch": 8.03,
"learning_rate": 1.5791583166332666e-05,
"loss": 0.1848,
"step": 4005
},
{
"epoch": 8.04,
"learning_rate": 1.5711422845691383e-05,
"loss": 0.2008,
"step": 4010
},
{
"epoch": 8.04,
"eval_loss": 0.3099471926689148,
"eval_runtime": 31.2217,
"eval_samples_per_second": 7.11,
"eval_steps_per_second": 1.794,
"step": 4010
},
{
"epoch": 8.05,
"learning_rate": 1.5631262525050103e-05,
"loss": 0.1658,
"step": 4015
},
{
"epoch": 8.06,
"learning_rate": 1.555110220440882e-05,
"loss": 0.2159,
"step": 4020
},
{
"epoch": 8.06,
"eval_loss": 0.3085058629512787,
"eval_runtime": 31.2776,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 4020
},
{
"epoch": 8.07,
"learning_rate": 1.5470941883767536e-05,
"loss": 0.2254,
"step": 4025
},
{
"epoch": 8.08,
"learning_rate": 1.5390781563126253e-05,
"loss": 0.2305,
"step": 4030
},
{
"epoch": 8.08,
"eval_loss": 0.3107781410217285,
"eval_runtime": 31.2779,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 1.79,
"step": 4030
},
{
"epoch": 8.09,
"learning_rate": 1.531062124248497e-05,
"loss": 0.1916,
"step": 4035
},
{
"epoch": 8.1,
"learning_rate": 1.5230460921843688e-05,
"loss": 0.2007,
"step": 4040
},
{
"epoch": 8.1,
"eval_loss": 0.30502209067344666,
"eval_runtime": 31.29,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4040
},
{
"epoch": 8.11,
"learning_rate": 1.5150300601202405e-05,
"loss": 0.2161,
"step": 4045
},
{
"epoch": 8.12,
"learning_rate": 1.5070140280561123e-05,
"loss": 0.2124,
"step": 4050
},
{
"epoch": 8.12,
"eval_loss": 0.31148508191108704,
"eval_runtime": 31.2984,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4050
},
{
"epoch": 8.13,
"learning_rate": 1.498997995991984e-05,
"loss": 0.2602,
"step": 4055
},
{
"epoch": 8.14,
"learning_rate": 1.4909819639278558e-05,
"loss": 0.1435,
"step": 4060
},
{
"epoch": 8.14,
"eval_loss": 0.3084125816822052,
"eval_runtime": 31.2938,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4060
},
{
"epoch": 8.15,
"learning_rate": 1.4829659318637275e-05,
"loss": 0.192,
"step": 4065
},
{
"epoch": 8.16,
"learning_rate": 1.4749498997995992e-05,
"loss": 0.1968,
"step": 4070
},
{
"epoch": 8.16,
"eval_loss": 0.3087104856967926,
"eval_runtime": 31.2947,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4070
},
{
"epoch": 8.17,
"learning_rate": 1.466933867735471e-05,
"loss": 0.167,
"step": 4075
},
{
"epoch": 8.18,
"learning_rate": 1.4589178356713427e-05,
"loss": 0.2507,
"step": 4080
},
{
"epoch": 8.18,
"eval_loss": 0.3084275722503662,
"eval_runtime": 31.2993,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4080
},
{
"epoch": 8.19,
"learning_rate": 1.4509018036072144e-05,
"loss": 0.164,
"step": 4085
},
{
"epoch": 8.2,
"learning_rate": 1.4428857715430864e-05,
"loss": 0.1703,
"step": 4090
},
{
"epoch": 8.2,
"eval_loss": 0.3060537874698639,
"eval_runtime": 31.2962,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4090
},
{
"epoch": 8.21,
"learning_rate": 1.4348697394789582e-05,
"loss": 0.2239,
"step": 4095
},
{
"epoch": 8.22,
"learning_rate": 1.4268537074148299e-05,
"loss": 0.2511,
"step": 4100
},
{
"epoch": 8.22,
"eval_loss": 0.310585081577301,
"eval_runtime": 31.2934,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4100
},
{
"epoch": 8.23,
"learning_rate": 1.4188376753507016e-05,
"loss": 0.1675,
"step": 4105
},
{
"epoch": 8.24,
"learning_rate": 1.4108216432865734e-05,
"loss": 0.1698,
"step": 4110
},
{
"epoch": 8.24,
"eval_loss": 0.3134320378303528,
"eval_runtime": 31.2953,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4110
},
{
"epoch": 8.25,
"learning_rate": 1.402805611222445e-05,
"loss": 0.1757,
"step": 4115
},
{
"epoch": 8.26,
"learning_rate": 1.3947895791583167e-05,
"loss": 0.2518,
"step": 4120
},
{
"epoch": 8.26,
"eval_loss": 0.3101155757904053,
"eval_runtime": 31.2883,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4120
},
{
"epoch": 8.27,
"learning_rate": 1.3867735470941886e-05,
"loss": 0.1248,
"step": 4125
},
{
"epoch": 8.28,
"learning_rate": 1.3787575150300602e-05,
"loss": 0.1489,
"step": 4130
},
{
"epoch": 8.28,
"eval_loss": 0.3090272843837738,
"eval_runtime": 31.2934,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4130
},
{
"epoch": 8.29,
"learning_rate": 1.3707414829659321e-05,
"loss": 0.1893,
"step": 4135
},
{
"epoch": 8.3,
"learning_rate": 1.3627254509018038e-05,
"loss": 0.1759,
"step": 4140
},
{
"epoch": 8.3,
"eval_loss": 0.3098497688770294,
"eval_runtime": 31.2929,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4140
},
{
"epoch": 8.31,
"learning_rate": 1.3547094188376754e-05,
"loss": 0.2406,
"step": 4145
},
{
"epoch": 8.32,
"learning_rate": 1.3466933867735473e-05,
"loss": 0.1939,
"step": 4150
},
{
"epoch": 8.32,
"eval_loss": 0.305558979511261,
"eval_runtime": 31.2912,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4150
},
{
"epoch": 8.33,
"learning_rate": 1.338677354709419e-05,
"loss": 0.211,
"step": 4155
},
{
"epoch": 8.34,
"learning_rate": 1.3306613226452906e-05,
"loss": 0.2168,
"step": 4160
},
{
"epoch": 8.34,
"eval_loss": 0.31060683727264404,
"eval_runtime": 31.2852,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 4160
},
{
"epoch": 8.35,
"learning_rate": 1.3226452905811624e-05,
"loss": 0.2253,
"step": 4165
},
{
"epoch": 8.36,
"learning_rate": 1.3146292585170341e-05,
"loss": 0.2119,
"step": 4170
},
{
"epoch": 8.36,
"eval_loss": 0.3051324188709259,
"eval_runtime": 31.2925,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4170
},
{
"epoch": 8.37,
"learning_rate": 1.306613226452906e-05,
"loss": 0.1942,
"step": 4175
},
{
"epoch": 8.38,
"learning_rate": 1.2985971943887776e-05,
"loss": 0.1793,
"step": 4180
},
{
"epoch": 8.38,
"eval_loss": 0.30557873845100403,
"eval_runtime": 31.3033,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4180
},
{
"epoch": 8.39,
"learning_rate": 1.2905811623246493e-05,
"loss": 0.1906,
"step": 4185
},
{
"epoch": 8.4,
"learning_rate": 1.2825651302605211e-05,
"loss": 0.2434,
"step": 4190
},
{
"epoch": 8.4,
"eval_loss": 0.30504488945007324,
"eval_runtime": 31.2924,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4190
},
{
"epoch": 8.41,
"learning_rate": 1.2745490981963928e-05,
"loss": 0.1886,
"step": 4195
},
{
"epoch": 8.42,
"learning_rate": 1.2665330661322645e-05,
"loss": 0.2601,
"step": 4200
},
{
"epoch": 8.42,
"eval_loss": 0.30650991201400757,
"eval_runtime": 31.2933,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4200
},
{
"epoch": 8.43,
"learning_rate": 1.2585170340681363e-05,
"loss": 0.1338,
"step": 4205
},
{
"epoch": 8.44,
"learning_rate": 1.250501002004008e-05,
"loss": 0.1791,
"step": 4210
},
{
"epoch": 8.44,
"eval_loss": 0.3051263391971588,
"eval_runtime": 31.3026,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4210
},
{
"epoch": 8.45,
"learning_rate": 1.2424849699398798e-05,
"loss": 0.2534,
"step": 4215
},
{
"epoch": 8.46,
"learning_rate": 1.2344689378757515e-05,
"loss": 0.1404,
"step": 4220
},
{
"epoch": 8.46,
"eval_loss": 0.3057839870452881,
"eval_runtime": 31.8248,
"eval_samples_per_second": 6.976,
"eval_steps_per_second": 1.76,
"step": 4220
},
{
"epoch": 8.47,
"learning_rate": 1.2264529058116232e-05,
"loss": 0.2097,
"step": 4225
},
{
"epoch": 8.48,
"learning_rate": 1.2184368737474952e-05,
"loss": 0.222,
"step": 4230
},
{
"epoch": 8.48,
"eval_loss": 0.3059149384498596,
"eval_runtime": 31.2917,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4230
},
{
"epoch": 8.49,
"learning_rate": 1.2104208416833669e-05,
"loss": 0.1822,
"step": 4235
},
{
"epoch": 8.5,
"learning_rate": 1.2024048096192387e-05,
"loss": 0.1809,
"step": 4240
},
{
"epoch": 8.5,
"eval_loss": 0.30699098110198975,
"eval_runtime": 31.297,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4240
},
{
"epoch": 8.51,
"learning_rate": 1.1943887775551104e-05,
"loss": 0.1847,
"step": 4245
},
{
"epoch": 8.52,
"learning_rate": 1.1863727454909822e-05,
"loss": 0.1745,
"step": 4250
},
{
"epoch": 8.52,
"eval_loss": 0.30655932426452637,
"eval_runtime": 31.8903,
"eval_samples_per_second": 6.961,
"eval_steps_per_second": 1.756,
"step": 4250
},
{
"epoch": 8.53,
"learning_rate": 1.1783567134268539e-05,
"loss": 0.2631,
"step": 4255
},
{
"epoch": 8.54,
"learning_rate": 1.1703406813627256e-05,
"loss": 0.2236,
"step": 4260
},
{
"epoch": 8.54,
"eval_loss": 0.30122604966163635,
"eval_runtime": 31.2919,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4260
},
{
"epoch": 8.55,
"learning_rate": 1.1623246492985974e-05,
"loss": 0.1922,
"step": 4265
},
{
"epoch": 8.56,
"learning_rate": 1.154308617234469e-05,
"loss": 0.1965,
"step": 4270
},
{
"epoch": 8.56,
"eval_loss": 0.30368027091026306,
"eval_runtime": 31.3086,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4270
},
{
"epoch": 8.57,
"learning_rate": 1.1462925851703407e-05,
"loss": 0.1554,
"step": 4275
},
{
"epoch": 8.58,
"learning_rate": 1.1382765531062126e-05,
"loss": 0.1836,
"step": 4280
},
{
"epoch": 8.58,
"eval_loss": 0.3051268458366394,
"eval_runtime": 31.6028,
"eval_samples_per_second": 7.025,
"eval_steps_per_second": 1.772,
"step": 4280
},
{
"epoch": 8.59,
"learning_rate": 1.1302605210420842e-05,
"loss": 0.1924,
"step": 4285
},
{
"epoch": 8.6,
"learning_rate": 1.1222444889779561e-05,
"loss": 0.1912,
"step": 4290
},
{
"epoch": 8.6,
"eval_loss": 0.30165210366249084,
"eval_runtime": 31.2902,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4290
},
{
"epoch": 8.61,
"learning_rate": 1.1142284569138278e-05,
"loss": 0.1751,
"step": 4295
},
{
"epoch": 8.62,
"learning_rate": 1.1062124248496994e-05,
"loss": 0.2207,
"step": 4300
},
{
"epoch": 8.62,
"eval_loss": 0.30252379179000854,
"eval_runtime": 31.3027,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4300
},
{
"epoch": 8.63,
"learning_rate": 1.0981963927855713e-05,
"loss": 0.1475,
"step": 4305
},
{
"epoch": 8.64,
"learning_rate": 1.090180360721443e-05,
"loss": 0.2481,
"step": 4310
},
{
"epoch": 8.64,
"eval_loss": 0.29974907636642456,
"eval_runtime": 31.3079,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4310
},
{
"epoch": 8.65,
"learning_rate": 1.0821643286573146e-05,
"loss": 0.124,
"step": 4315
},
{
"epoch": 8.66,
"learning_rate": 1.0741482965931864e-05,
"loss": 0.1506,
"step": 4320
},
{
"epoch": 8.66,
"eval_loss": 0.30026939511299133,
"eval_runtime": 31.4042,
"eval_samples_per_second": 7.069,
"eval_steps_per_second": 1.783,
"step": 4320
},
{
"epoch": 8.67,
"learning_rate": 1.0661322645290581e-05,
"loss": 0.1864,
"step": 4325
},
{
"epoch": 8.68,
"learning_rate": 1.05811623246493e-05,
"loss": 0.2216,
"step": 4330
},
{
"epoch": 8.68,
"eval_loss": 0.30346396565437317,
"eval_runtime": 31.3544,
"eval_samples_per_second": 7.08,
"eval_steps_per_second": 1.786,
"step": 4330
},
{
"epoch": 8.69,
"learning_rate": 1.0501002004008016e-05,
"loss": 0.1759,
"step": 4335
},
{
"epoch": 8.7,
"learning_rate": 1.0420841683366733e-05,
"loss": 0.1866,
"step": 4340
},
{
"epoch": 8.7,
"eval_loss": 0.301408588886261,
"eval_runtime": 31.3231,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 4340
},
{
"epoch": 8.71,
"learning_rate": 1.0340681362725451e-05,
"loss": 0.1588,
"step": 4345
},
{
"epoch": 8.72,
"learning_rate": 1.0260521042084168e-05,
"loss": 0.2025,
"step": 4350
},
{
"epoch": 8.72,
"eval_loss": 0.30352672934532166,
"eval_runtime": 31.2959,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4350
},
{
"epoch": 8.73,
"learning_rate": 1.0180360721442885e-05,
"loss": 0.1465,
"step": 4355
},
{
"epoch": 8.74,
"learning_rate": 1.0100200400801603e-05,
"loss": 0.1521,
"step": 4360
},
{
"epoch": 8.74,
"eval_loss": 0.2992115318775177,
"eval_runtime": 31.3056,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4360
},
{
"epoch": 8.75,
"learning_rate": 1.002004008016032e-05,
"loss": 0.2296,
"step": 4365
},
{
"epoch": 8.76,
"learning_rate": 9.939879759519038e-06,
"loss": 0.1598,
"step": 4370
},
{
"epoch": 8.76,
"eval_loss": 0.30343157052993774,
"eval_runtime": 31.304,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4370
},
{
"epoch": 8.77,
"learning_rate": 9.859719438877757e-06,
"loss": 0.2342,
"step": 4375
},
{
"epoch": 8.78,
"learning_rate": 9.779559118236473e-06,
"loss": 0.185,
"step": 4380
},
{
"epoch": 8.78,
"eval_loss": 0.3016977906227112,
"eval_runtime": 31.3084,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4380
},
{
"epoch": 8.79,
"learning_rate": 9.69939879759519e-06,
"loss": 0.1527,
"step": 4385
},
{
"epoch": 8.8,
"learning_rate": 9.619238476953909e-06,
"loss": 0.2427,
"step": 4390
},
{
"epoch": 8.8,
"eval_loss": 0.2971956133842468,
"eval_runtime": 31.2993,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4390
},
{
"epoch": 8.81,
"learning_rate": 9.539078156312625e-06,
"loss": 0.1416,
"step": 4395
},
{
"epoch": 8.82,
"learning_rate": 9.458917835671344e-06,
"loss": 0.2343,
"step": 4400
},
{
"epoch": 8.82,
"eval_loss": 0.2979062795639038,
"eval_runtime": 31.2918,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 1.79,
"step": 4400
},
{
"epoch": 8.83,
"learning_rate": 9.378757515030062e-06,
"loss": 0.2002,
"step": 4405
},
{
"epoch": 8.84,
"learning_rate": 9.298597194388779e-06,
"loss": 0.1994,
"step": 4410
},
{
"epoch": 8.84,
"eval_loss": 0.2994498908519745,
"eval_runtime": 31.2999,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4410
},
{
"epoch": 8.85,
"learning_rate": 9.218436873747496e-06,
"loss": 0.2031,
"step": 4415
},
{
"epoch": 8.86,
"learning_rate": 9.138276553106214e-06,
"loss": 0.2671,
"step": 4420
},
{
"epoch": 8.86,
"eval_loss": 0.29864001274108887,
"eval_runtime": 31.9081,
"eval_samples_per_second": 6.957,
"eval_steps_per_second": 1.755,
"step": 4420
},
{
"epoch": 8.87,
"learning_rate": 9.05811623246493e-06,
"loss": 0.202,
"step": 4425
},
{
"epoch": 8.88,
"learning_rate": 8.977955911823647e-06,
"loss": 0.1158,
"step": 4430
},
{
"epoch": 8.88,
"eval_loss": 0.2991441786289215,
"eval_runtime": 31.5574,
"eval_samples_per_second": 7.035,
"eval_steps_per_second": 1.775,
"step": 4430
},
{
"epoch": 8.89,
"learning_rate": 8.897795591182366e-06,
"loss": 0.2213,
"step": 4435
},
{
"epoch": 8.9,
"learning_rate": 8.817635270541082e-06,
"loss": 0.2127,
"step": 4440
},
{
"epoch": 8.9,
"eval_loss": 0.3000405430793762,
"eval_runtime": 31.2838,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 1.79,
"step": 4440
},
{
"epoch": 8.91,
"learning_rate": 8.7374749498998e-06,
"loss": 0.1603,
"step": 4445
},
{
"epoch": 8.92,
"learning_rate": 8.657314629258518e-06,
"loss": 0.1691,
"step": 4450
},
{
"epoch": 8.92,
"eval_loss": 0.2980547845363617,
"eval_runtime": 31.2967,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4450
},
{
"epoch": 8.93,
"learning_rate": 8.577154308617234e-06,
"loss": 0.2523,
"step": 4455
},
{
"epoch": 8.94,
"learning_rate": 8.496993987975953e-06,
"loss": 0.2103,
"step": 4460
},
{
"epoch": 8.94,
"eval_loss": 0.2978658080101013,
"eval_runtime": 31.3156,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4460
},
{
"epoch": 8.95,
"learning_rate": 8.41683366733467e-06,
"loss": 0.1574,
"step": 4465
},
{
"epoch": 8.96,
"learning_rate": 8.336673346693386e-06,
"loss": 0.1392,
"step": 4470
},
{
"epoch": 8.96,
"eval_loss": 0.29815390706062317,
"eval_runtime": 31.3016,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4470
},
{
"epoch": 8.97,
"learning_rate": 8.256513026052106e-06,
"loss": 0.2192,
"step": 4475
},
{
"epoch": 8.98,
"learning_rate": 8.176352705410823e-06,
"loss": 0.1712,
"step": 4480
},
{
"epoch": 8.98,
"eval_loss": 0.2943491041660309,
"eval_runtime": 31.3127,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 4480
},
{
"epoch": 8.99,
"learning_rate": 8.09619238476954e-06,
"loss": 0.1652,
"step": 4485
},
{
"epoch": 9.0,
"learning_rate": 8.016032064128258e-06,
"loss": 0.2435,
"step": 4490
},
{
"epoch": 9.0,
"eval_loss": 0.2958294153213501,
"eval_runtime": 31.2931,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.79,
"step": 4490
},
{
"epoch": 9.01,
"learning_rate": 7.935871743486975e-06,
"loss": 0.1859,
"step": 4495
},
{
"epoch": 9.02,
"learning_rate": 7.855711422845691e-06,
"loss": 0.1715,
"step": 4500
},
{
"epoch": 9.02,
"eval_loss": 0.30551087856292725,
"eval_runtime": 31.309,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4500
},
{
"epoch": 9.03,
"learning_rate": 7.77555110220441e-06,
"loss": 0.1663,
"step": 4505
},
{
"epoch": 9.04,
"learning_rate": 7.695390781563127e-06,
"loss": 0.1641,
"step": 4510
},
{
"epoch": 9.04,
"eval_loss": 0.3048071563243866,
"eval_runtime": 31.2037,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 1.795,
"step": 4510
},
{
"epoch": 9.05,
"learning_rate": 7.615230460921844e-06,
"loss": 0.17,
"step": 4515
},
{
"epoch": 9.06,
"learning_rate": 7.535070140280562e-06,
"loss": 0.1529,
"step": 4520
},
{
"epoch": 9.06,
"eval_loss": 0.30287060141563416,
"eval_runtime": 31.2814,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 1.79,
"step": 4520
},
{
"epoch": 9.07,
"learning_rate": 7.454909819639279e-06,
"loss": 0.175,
"step": 4525
},
{
"epoch": 9.08,
"learning_rate": 7.374749498997996e-06,
"loss": 0.1566,
"step": 4530
},
{
"epoch": 9.08,
"eval_loss": 0.3046806752681732,
"eval_runtime": 31.2973,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4530
},
{
"epoch": 9.09,
"learning_rate": 7.2945891783567134e-06,
"loss": 0.1749,
"step": 4535
},
{
"epoch": 9.1,
"learning_rate": 7.214428857715432e-06,
"loss": 0.1382,
"step": 4540
},
{
"epoch": 9.1,
"eval_loss": 0.30272936820983887,
"eval_runtime": 31.302,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4540
},
{
"epoch": 9.11,
"learning_rate": 7.134268537074149e-06,
"loss": 0.1956,
"step": 4545
},
{
"epoch": 9.12,
"learning_rate": 7.054108216432867e-06,
"loss": 0.1605,
"step": 4550
},
{
"epoch": 9.12,
"eval_loss": 0.30227676033973694,
"eval_runtime": 31.3042,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4550
},
{
"epoch": 9.13,
"learning_rate": 6.973947895791584e-06,
"loss": 0.1675,
"step": 4555
},
{
"epoch": 9.14,
"learning_rate": 6.893787575150301e-06,
"loss": 0.2167,
"step": 4560
},
{
"epoch": 9.14,
"eval_loss": 0.3055172264575958,
"eval_runtime": 31.3016,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4560
},
{
"epoch": 9.15,
"learning_rate": 6.813627254509019e-06,
"loss": 0.1603,
"step": 4565
},
{
"epoch": 9.16,
"learning_rate": 6.733466933867736e-06,
"loss": 0.1506,
"step": 4570
},
{
"epoch": 9.16,
"eval_loss": 0.303718626499176,
"eval_runtime": 31.323,
"eval_samples_per_second": 7.087,
"eval_steps_per_second": 1.788,
"step": 4570
},
{
"epoch": 9.17,
"learning_rate": 6.653306613226453e-06,
"loss": 0.2345,
"step": 4575
},
{
"epoch": 9.18,
"learning_rate": 6.573146292585171e-06,
"loss": 0.192,
"step": 4580
},
{
"epoch": 9.18,
"eval_loss": 0.30387794971466064,
"eval_runtime": 31.3084,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4580
},
{
"epoch": 9.19,
"learning_rate": 6.492985971943888e-06,
"loss": 0.1714,
"step": 4585
},
{
"epoch": 9.2,
"learning_rate": 6.412825651302606e-06,
"loss": 0.139,
"step": 4590
},
{
"epoch": 9.2,
"eval_loss": 0.3030008375644684,
"eval_runtime": 31.311,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 4590
},
{
"epoch": 9.21,
"learning_rate": 6.332665330661322e-06,
"loss": 0.1492,
"step": 4595
},
{
"epoch": 9.22,
"learning_rate": 6.25250501002004e-06,
"loss": 0.1974,
"step": 4600
},
{
"epoch": 9.22,
"eval_loss": 0.30382072925567627,
"eval_runtime": 31.314,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4600
},
{
"epoch": 9.23,
"learning_rate": 6.1723446893787575e-06,
"loss": 0.1619,
"step": 4605
},
{
"epoch": 9.24,
"learning_rate": 6.092184368737476e-06,
"loss": 0.167,
"step": 4610
},
{
"epoch": 9.24,
"eval_loss": 0.303739070892334,
"eval_runtime": 31.3071,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4610
},
{
"epoch": 9.25,
"learning_rate": 6.0120240480961935e-06,
"loss": 0.1597,
"step": 4615
},
{
"epoch": 9.26,
"learning_rate": 5.931863727454911e-06,
"loss": 0.2409,
"step": 4620
},
{
"epoch": 9.26,
"eval_loss": 0.3033643066883087,
"eval_runtime": 31.3054,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4620
},
{
"epoch": 9.27,
"learning_rate": 5.851703406813628e-06,
"loss": 0.1673,
"step": 4625
},
{
"epoch": 9.28,
"learning_rate": 5.771543086172345e-06,
"loss": 0.1494,
"step": 4630
},
{
"epoch": 9.28,
"eval_loss": 0.30480068922042847,
"eval_runtime": 31.3033,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4630
},
{
"epoch": 9.29,
"learning_rate": 5.691382765531063e-06,
"loss": 0.1907,
"step": 4635
},
{
"epoch": 9.3,
"learning_rate": 5.6112224448897804e-06,
"loss": 0.1762,
"step": 4640
},
{
"epoch": 9.3,
"eval_loss": 0.30366069078445435,
"eval_runtime": 31.2972,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4640
},
{
"epoch": 9.31,
"learning_rate": 5.531062124248497e-06,
"loss": 0.1336,
"step": 4645
},
{
"epoch": 9.32,
"learning_rate": 5.450901803607215e-06,
"loss": 0.183,
"step": 4650
},
{
"epoch": 9.32,
"eval_loss": 0.3042277991771698,
"eval_runtime": 31.3048,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4650
},
{
"epoch": 9.33,
"learning_rate": 5.370741482965932e-06,
"loss": 0.2423,
"step": 4655
},
{
"epoch": 9.34,
"learning_rate": 5.29058116232465e-06,
"loss": 0.1773,
"step": 4660
},
{
"epoch": 9.34,
"eval_loss": 0.3042871356010437,
"eval_runtime": 31.3002,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 1.789,
"step": 4660
},
{
"epoch": 9.35,
"learning_rate": 5.2104208416833665e-06,
"loss": 0.1586,
"step": 4665
},
{
"epoch": 9.36,
"learning_rate": 5.130260521042084e-06,
"loss": 0.1509,
"step": 4670
},
{
"epoch": 9.36,
"eval_loss": 0.3053213059902191,
"eval_runtime": 31.3174,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4670
},
{
"epoch": 9.37,
"learning_rate": 5.050100200400802e-06,
"loss": 0.136,
"step": 4675
},
{
"epoch": 9.38,
"learning_rate": 4.969939879759519e-06,
"loss": 0.1994,
"step": 4680
},
{
"epoch": 9.38,
"eval_loss": 0.30446866154670715,
"eval_runtime": 31.3129,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 4680
},
{
"epoch": 9.39,
"learning_rate": 4.889779559118237e-06,
"loss": 0.2068,
"step": 4685
},
{
"epoch": 9.4,
"learning_rate": 4.809619238476954e-06,
"loss": 0.1928,
"step": 4690
},
{
"epoch": 9.4,
"eval_loss": 0.30361202359199524,
"eval_runtime": 31.3048,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4690
},
{
"epoch": 9.41,
"learning_rate": 4.729458917835672e-06,
"loss": 0.212,
"step": 4695
},
{
"epoch": 9.42,
"learning_rate": 4.649298597194389e-06,
"loss": 0.1158,
"step": 4700
},
{
"epoch": 9.42,
"eval_loss": 0.30384451150894165,
"eval_runtime": 31.3031,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4700
},
{
"epoch": 9.43,
"learning_rate": 4.569138276553107e-06,
"loss": 0.2038,
"step": 4705
},
{
"epoch": 9.44,
"learning_rate": 4.488977955911824e-06,
"loss": 0.1503,
"step": 4710
},
{
"epoch": 9.44,
"eval_loss": 0.3019496500492096,
"eval_runtime": 31.3092,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4710
},
{
"epoch": 9.45,
"learning_rate": 4.408817635270541e-06,
"loss": 0.1511,
"step": 4715
},
{
"epoch": 9.46,
"learning_rate": 4.328657314629259e-06,
"loss": 0.1556,
"step": 4720
},
{
"epoch": 9.46,
"eval_loss": 0.3029373288154602,
"eval_runtime": 31.305,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4720
},
{
"epoch": 9.47,
"learning_rate": 4.248496993987976e-06,
"loss": 0.182,
"step": 4725
},
{
"epoch": 9.48,
"learning_rate": 4.168336673346693e-06,
"loss": 0.1327,
"step": 4730
},
{
"epoch": 9.48,
"eval_loss": 0.30504941940307617,
"eval_runtime": 31.3165,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4730
},
{
"epoch": 9.49,
"learning_rate": 4.0881763527054114e-06,
"loss": 0.1574,
"step": 4735
},
{
"epoch": 9.5,
"learning_rate": 4.008016032064129e-06,
"loss": 0.1772,
"step": 4740
},
{
"epoch": 9.5,
"eval_loss": 0.3057289719581604,
"eval_runtime": 31.3047,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4740
},
{
"epoch": 9.51,
"learning_rate": 3.927855711422846e-06,
"loss": 0.1497,
"step": 4745
},
{
"epoch": 9.52,
"learning_rate": 3.847695390781563e-06,
"loss": 0.1555,
"step": 4750
},
{
"epoch": 9.52,
"eval_loss": 0.30276989936828613,
"eval_runtime": 31.3062,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4750
},
{
"epoch": 9.53,
"learning_rate": 3.767535070140281e-06,
"loss": 0.1281,
"step": 4755
},
{
"epoch": 9.54,
"learning_rate": 3.687374749498998e-06,
"loss": 0.1363,
"step": 4760
},
{
"epoch": 9.54,
"eval_loss": 0.3013622760772705,
"eval_runtime": 31.3134,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 4760
},
{
"epoch": 9.55,
"learning_rate": 3.607214428857716e-06,
"loss": 0.1461,
"step": 4765
},
{
"epoch": 9.56,
"learning_rate": 3.5270541082164335e-06,
"loss": 0.139,
"step": 4770
},
{
"epoch": 9.56,
"eval_loss": 0.3009650707244873,
"eval_runtime": 31.3047,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4770
},
{
"epoch": 9.57,
"learning_rate": 3.4468937875751506e-06,
"loss": 0.1519,
"step": 4775
},
{
"epoch": 9.58,
"learning_rate": 3.366733466933868e-06,
"loss": 0.1639,
"step": 4780
},
{
"epoch": 9.58,
"eval_loss": 0.30125322937965393,
"eval_runtime": 31.3146,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4780
},
{
"epoch": 9.59,
"learning_rate": 3.2865731462925853e-06,
"loss": 0.1735,
"step": 4785
},
{
"epoch": 9.6,
"learning_rate": 3.206412825651303e-06,
"loss": 0.1669,
"step": 4790
},
{
"epoch": 9.6,
"eval_loss": 0.30152443051338196,
"eval_runtime": 31.3073,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4790
},
{
"epoch": 9.61,
"learning_rate": 3.12625250501002e-06,
"loss": 0.1085,
"step": 4795
},
{
"epoch": 9.62,
"learning_rate": 3.046092184368738e-06,
"loss": 0.144,
"step": 4800
},
{
"epoch": 9.62,
"eval_loss": 0.30229106545448303,
"eval_runtime": 31.3106,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 4800
},
{
"epoch": 9.63,
"learning_rate": 2.9659318637274555e-06,
"loss": 0.1607,
"step": 4805
},
{
"epoch": 9.64,
"learning_rate": 2.8857715430861727e-06,
"loss": 0.1925,
"step": 4810
},
{
"epoch": 9.64,
"eval_loss": 0.3034406900405884,
"eval_runtime": 31.3159,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 1.788,
"step": 4810
},
{
"epoch": 9.65,
"learning_rate": 2.8056112224448902e-06,
"loss": 0.1486,
"step": 4815
},
{
"epoch": 9.66,
"learning_rate": 2.7254509018036073e-06,
"loss": 0.1615,
"step": 4820
},
{
"epoch": 9.66,
"eval_loss": 0.3025255501270294,
"eval_runtime": 31.295,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4820
},
{
"epoch": 9.67,
"learning_rate": 2.645290581162325e-06,
"loss": 0.1677,
"step": 4825
},
{
"epoch": 9.68,
"learning_rate": 2.565130260521042e-06,
"loss": 0.1625,
"step": 4830
},
{
"epoch": 9.68,
"eval_loss": 0.3018721044063568,
"eval_runtime": 31.3106,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 4830
},
{
"epoch": 9.69,
"learning_rate": 2.4849699398797596e-06,
"loss": 0.1508,
"step": 4835
},
{
"epoch": 9.7,
"learning_rate": 2.404809619238477e-06,
"loss": 0.1355,
"step": 4840
},
{
"epoch": 9.7,
"eval_loss": 0.3022632300853729,
"eval_runtime": 31.3118,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 4840
},
{
"epoch": 9.71,
"learning_rate": 2.3246492985971947e-06,
"loss": 0.142,
"step": 4845
},
{
"epoch": 9.72,
"learning_rate": 2.244488977955912e-06,
"loss": 0.1671,
"step": 4850
},
{
"epoch": 9.72,
"eval_loss": 0.30193448066711426,
"eval_runtime": 31.3088,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4850
},
{
"epoch": 9.73,
"learning_rate": 2.1643286573146294e-06,
"loss": 0.1557,
"step": 4855
},
{
"epoch": 9.74,
"learning_rate": 2.0841683366733465e-06,
"loss": 0.1447,
"step": 4860
},
{
"epoch": 9.74,
"eval_loss": 0.30213478207588196,
"eval_runtime": 31.3072,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4860
},
{
"epoch": 9.75,
"learning_rate": 2.0040080160320645e-06,
"loss": 0.1617,
"step": 4865
},
{
"epoch": 9.76,
"learning_rate": 1.9238476953907816e-06,
"loss": 0.1465,
"step": 4870
},
{
"epoch": 9.76,
"eval_loss": 0.3024033308029175,
"eval_runtime": 31.3039,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4870
},
{
"epoch": 9.77,
"learning_rate": 1.843687374749499e-06,
"loss": 0.144,
"step": 4875
},
{
"epoch": 9.78,
"learning_rate": 1.7635270541082167e-06,
"loss": 0.1794,
"step": 4880
},
{
"epoch": 9.78,
"eval_loss": 0.3020671606063843,
"eval_runtime": 31.3114,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.788,
"step": 4880
},
{
"epoch": 9.79,
"learning_rate": 1.683366733466934e-06,
"loss": 0.1762,
"step": 4885
},
{
"epoch": 9.8,
"learning_rate": 1.6032064128256514e-06,
"loss": 0.156,
"step": 4890
},
{
"epoch": 9.8,
"eval_loss": 0.30112236738204956,
"eval_runtime": 31.3062,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4890
},
{
"epoch": 9.81,
"learning_rate": 1.523046092184369e-06,
"loss": 0.139,
"step": 4895
},
{
"epoch": 9.82,
"learning_rate": 1.4428857715430863e-06,
"loss": 0.1018,
"step": 4900
},
{
"epoch": 9.82,
"eval_loss": 0.30046290159225464,
"eval_runtime": 31.3036,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4900
},
{
"epoch": 9.83,
"learning_rate": 1.3627254509018037e-06,
"loss": 0.1524,
"step": 4905
},
{
"epoch": 9.84,
"learning_rate": 1.282565130260521e-06,
"loss": 0.1403,
"step": 4910
},
{
"epoch": 9.84,
"eval_loss": 0.3010559380054474,
"eval_runtime": 31.303,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4910
},
{
"epoch": 9.85,
"learning_rate": 1.2024048096192386e-06,
"loss": 0.1495,
"step": 4915
},
{
"epoch": 9.86,
"learning_rate": 1.122244488977956e-06,
"loss": 0.1126,
"step": 4920
},
{
"epoch": 9.86,
"eval_loss": 0.30058878660202026,
"eval_runtime": 31.3099,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 1.789,
"step": 4920
},
{
"epoch": 9.87,
"learning_rate": 1.0420841683366733e-06,
"loss": 0.1365,
"step": 4925
},
{
"epoch": 9.88,
"learning_rate": 9.619238476953908e-07,
"loss": 0.1595,
"step": 4930
},
{
"epoch": 9.88,
"eval_loss": 0.3006744682788849,
"eval_runtime": 31.302,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4930
},
{
"epoch": 9.89,
"learning_rate": 8.817635270541084e-07,
"loss": 0.1921,
"step": 4935
},
{
"epoch": 9.9,
"learning_rate": 8.016032064128257e-07,
"loss": 0.1415,
"step": 4940
},
{
"epoch": 9.9,
"eval_loss": 0.3012050986289978,
"eval_runtime": 31.3083,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4940
},
{
"epoch": 9.91,
"learning_rate": 7.214428857715432e-07,
"loss": 0.1173,
"step": 4945
},
{
"epoch": 9.92,
"learning_rate": 6.412825651302605e-07,
"loss": 0.1651,
"step": 4950
},
{
"epoch": 9.92,
"eval_loss": 0.30145999789237976,
"eval_runtime": 31.3009,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 1.789,
"step": 4950
},
{
"epoch": 9.93,
"learning_rate": 5.61122244488978e-07,
"loss": 0.148,
"step": 4955
},
{
"epoch": 9.94,
"learning_rate": 4.809619238476954e-07,
"loss": 0.1558,
"step": 4960
},
{
"epoch": 9.94,
"eval_loss": 0.30154505372047424,
"eval_runtime": 31.3072,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4960
},
{
"epoch": 9.95,
"learning_rate": 4.0080160320641286e-07,
"loss": 0.191,
"step": 4965
},
{
"epoch": 9.96,
"learning_rate": 3.2064128256513025e-07,
"loss": 0.1734,
"step": 4970
},
{
"epoch": 9.96,
"eval_loss": 0.301419734954834,
"eval_runtime": 31.296,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4970
},
{
"epoch": 9.97,
"learning_rate": 2.404809619238477e-07,
"loss": 0.1638,
"step": 4975
},
{
"epoch": 9.98,
"learning_rate": 1.6032064128256513e-07,
"loss": 0.1909,
"step": 4980
},
{
"epoch": 9.98,
"eval_loss": 0.30140334367752075,
"eval_runtime": 31.2942,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 1.789,
"step": 4980
},
{
"epoch": 9.99,
"learning_rate": 8.016032064128256e-08,
"loss": 0.1475,
"step": 4985
},
{
"epoch": 10.0,
"learning_rate": 0.0,
"loss": 0.1246,
"step": 4990
},
{
"epoch": 10.0,
"eval_loss": 0.30139291286468506,
"eval_runtime": 31.307,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 1.789,
"step": 4990
},
{
"epoch": 10.0,
"step": 4990,
"total_flos": 8.69904317939712e+16,
"train_loss": 0.5887919666891347,
"train_runtime": 25757.7329,
"train_samples_per_second": 0.776,
"train_steps_per_second": 0.194
}
],
"max_steps": 4990,
"num_train_epochs": 10,
"total_flos": 8.69904317939712e+16,
"trial_name": null,
"trial_params": null
}