adapters-opt-gptq-QLORA-super_glue-rte / trainer_state-opt-gptq-QLORA-super_glue-rte-sequence_classification.json
RMHalak's picture
Task: SequenceClassification
90c8104 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 1,
"global_step": 160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0625,
"grad_norm": 17.75647735595703,
"learning_rate": 2.5e-05,
"loss": 0.8699,
"step": 1
},
{
"epoch": 0.0625,
"eval_accuracy": 0.496,
"eval_loss": 0.837658703327179,
"eval_runtime": 4.5469,
"eval_samples_per_second": 54.983,
"eval_steps_per_second": 1.759,
"step": 1
},
{
"epoch": 0.125,
"grad_norm": 10.630256652832031,
"learning_rate": 5e-05,
"loss": 0.8372,
"step": 2
},
{
"epoch": 0.125,
"eval_accuracy": 0.496,
"eval_loss": 0.8297348618507385,
"eval_runtime": 4.4856,
"eval_samples_per_second": 55.733,
"eval_steps_per_second": 1.783,
"step": 2
},
{
"epoch": 0.1875,
"grad_norm": 18.88325309753418,
"learning_rate": 4.968354430379747e-05,
"loss": 0.8813,
"step": 3
},
{
"epoch": 0.1875,
"eval_accuracy": 0.496,
"eval_loss": 0.8005664348602295,
"eval_runtime": 4.5427,
"eval_samples_per_second": 55.034,
"eval_steps_per_second": 1.761,
"step": 3
},
{
"epoch": 0.25,
"grad_norm": 14.382352828979492,
"learning_rate": 4.936708860759494e-05,
"loss": 0.8725,
"step": 4
},
{
"epoch": 0.25,
"eval_accuracy": 0.5,
"eval_loss": 0.7717475295066833,
"eval_runtime": 4.495,
"eval_samples_per_second": 55.618,
"eval_steps_per_second": 1.78,
"step": 4
},
{
"epoch": 0.3125,
"grad_norm": 18.139081954956055,
"learning_rate": 4.905063291139241e-05,
"loss": 0.8504,
"step": 5
},
{
"epoch": 0.3125,
"eval_accuracy": 0.496,
"eval_loss": 0.7444770336151123,
"eval_runtime": 4.5407,
"eval_samples_per_second": 55.057,
"eval_steps_per_second": 1.762,
"step": 5
},
{
"epoch": 0.375,
"grad_norm": 7.32247257232666,
"learning_rate": 4.8734177215189874e-05,
"loss": 0.8015,
"step": 6
},
{
"epoch": 0.375,
"eval_accuracy": 0.5,
"eval_loss": 0.7214609384536743,
"eval_runtime": 4.5353,
"eval_samples_per_second": 55.123,
"eval_steps_per_second": 1.764,
"step": 6
},
{
"epoch": 0.4375,
"grad_norm": 4.8535966873168945,
"learning_rate": 4.8417721518987346e-05,
"loss": 0.7165,
"step": 7
},
{
"epoch": 0.4375,
"eval_accuracy": 0.512,
"eval_loss": 0.7035966515541077,
"eval_runtime": 4.5437,
"eval_samples_per_second": 55.021,
"eval_steps_per_second": 1.761,
"step": 7
},
{
"epoch": 0.5,
"grad_norm": 7.805424690246582,
"learning_rate": 4.810126582278481e-05,
"loss": 0.766,
"step": 8
},
{
"epoch": 0.5,
"eval_accuracy": 0.52,
"eval_loss": 0.6936992406845093,
"eval_runtime": 4.5422,
"eval_samples_per_second": 55.04,
"eval_steps_per_second": 1.761,
"step": 8
},
{
"epoch": 0.5625,
"grad_norm": 2.8888654708862305,
"learning_rate": 4.778481012658228e-05,
"loss": 0.6915,
"step": 9
},
{
"epoch": 0.5625,
"eval_accuracy": 0.524,
"eval_loss": 0.6935371160507202,
"eval_runtime": 4.4937,
"eval_samples_per_second": 55.633,
"eval_steps_per_second": 1.78,
"step": 9
},
{
"epoch": 0.625,
"grad_norm": 7.086565971374512,
"learning_rate": 4.7468354430379746e-05,
"loss": 0.7093,
"step": 10
},
{
"epoch": 0.625,
"eval_accuracy": 0.532,
"eval_loss": 0.6983515620231628,
"eval_runtime": 4.4948,
"eval_samples_per_second": 55.619,
"eval_steps_per_second": 1.78,
"step": 10
},
{
"epoch": 0.6875,
"grad_norm": 2.0717809200286865,
"learning_rate": 4.715189873417722e-05,
"loss": 0.7107,
"step": 11
},
{
"epoch": 0.6875,
"eval_accuracy": 0.524,
"eval_loss": 0.7018242478370667,
"eval_runtime": 4.5362,
"eval_samples_per_second": 55.112,
"eval_steps_per_second": 1.764,
"step": 11
},
{
"epoch": 0.75,
"grad_norm": 4.748137474060059,
"learning_rate": 4.683544303797468e-05,
"loss": 0.729,
"step": 12
},
{
"epoch": 0.75,
"eval_accuracy": 0.524,
"eval_loss": 0.7031621336936951,
"eval_runtime": 4.5441,
"eval_samples_per_second": 55.017,
"eval_steps_per_second": 1.761,
"step": 12
},
{
"epoch": 0.8125,
"grad_norm": 5.206336975097656,
"learning_rate": 4.6518987341772154e-05,
"loss": 0.7819,
"step": 13
},
{
"epoch": 0.8125,
"eval_accuracy": 0.524,
"eval_loss": 0.7015683650970459,
"eval_runtime": 4.5432,
"eval_samples_per_second": 55.028,
"eval_steps_per_second": 1.761,
"step": 13
},
{
"epoch": 0.875,
"grad_norm": 5.746586322784424,
"learning_rate": 4.6202531645569625e-05,
"loss": 0.7343,
"step": 14
},
{
"epoch": 0.875,
"eval_accuracy": 0.524,
"eval_loss": 0.6985683441162109,
"eval_runtime": 4.5466,
"eval_samples_per_second": 54.986,
"eval_steps_per_second": 1.76,
"step": 14
},
{
"epoch": 0.9375,
"grad_norm": 2.441669464111328,
"learning_rate": 4.588607594936709e-05,
"loss": 0.7052,
"step": 15
},
{
"epoch": 0.9375,
"eval_accuracy": 0.536,
"eval_loss": 0.6962031126022339,
"eval_runtime": 4.5479,
"eval_samples_per_second": 54.97,
"eval_steps_per_second": 1.759,
"step": 15
},
{
"epoch": 1.0,
"grad_norm": 8.590319633483887,
"learning_rate": 4.556962025316456e-05,
"loss": 0.7406,
"step": 16
},
{
"epoch": 1.0,
"eval_accuracy": 0.54,
"eval_loss": 0.6932148337364197,
"eval_runtime": 4.5474,
"eval_samples_per_second": 54.977,
"eval_steps_per_second": 1.759,
"step": 16
},
{
"epoch": 1.0625,
"grad_norm": 3.343947172164917,
"learning_rate": 4.525316455696203e-05,
"loss": 0.6939,
"step": 17
},
{
"epoch": 1.0625,
"eval_accuracy": 0.524,
"eval_loss": 0.6910995841026306,
"eval_runtime": 4.5426,
"eval_samples_per_second": 55.035,
"eval_steps_per_second": 1.761,
"step": 17
},
{
"epoch": 1.125,
"grad_norm": 5.235752582550049,
"learning_rate": 4.49367088607595e-05,
"loss": 0.6857,
"step": 18
},
{
"epoch": 1.125,
"eval_accuracy": 0.528,
"eval_loss": 0.690261721611023,
"eval_runtime": 4.5449,
"eval_samples_per_second": 55.006,
"eval_steps_per_second": 1.76,
"step": 18
},
{
"epoch": 1.1875,
"grad_norm": 8.583759307861328,
"learning_rate": 4.462025316455696e-05,
"loss": 0.6932,
"step": 19
},
{
"epoch": 1.1875,
"eval_accuracy": 0.52,
"eval_loss": 0.6894707083702087,
"eval_runtime": 4.5371,
"eval_samples_per_second": 55.101,
"eval_steps_per_second": 1.763,
"step": 19
},
{
"epoch": 1.25,
"grad_norm": 9.518383979797363,
"learning_rate": 4.430379746835443e-05,
"loss": 0.6993,
"step": 20
},
{
"epoch": 1.25,
"eval_accuracy": 0.516,
"eval_loss": 0.6895566582679749,
"eval_runtime": 4.5489,
"eval_samples_per_second": 54.958,
"eval_steps_per_second": 1.759,
"step": 20
},
{
"epoch": 1.3125,
"grad_norm": 7.239161014556885,
"learning_rate": 4.3987341772151904e-05,
"loss": 0.763,
"step": 21
},
{
"epoch": 1.3125,
"eval_accuracy": 0.52,
"eval_loss": 0.690017580986023,
"eval_runtime": 4.4884,
"eval_samples_per_second": 55.699,
"eval_steps_per_second": 1.782,
"step": 21
},
{
"epoch": 1.375,
"grad_norm": 7.390464782714844,
"learning_rate": 4.367088607594937e-05,
"loss": 0.6954,
"step": 22
},
{
"epoch": 1.375,
"eval_accuracy": 0.512,
"eval_loss": 0.6905702948570251,
"eval_runtime": 4.5375,
"eval_samples_per_second": 55.097,
"eval_steps_per_second": 1.763,
"step": 22
},
{
"epoch": 1.4375,
"grad_norm": 3.728330135345459,
"learning_rate": 4.3354430379746834e-05,
"loss": 0.7282,
"step": 23
},
{
"epoch": 1.4375,
"eval_accuracy": 0.524,
"eval_loss": 0.690304696559906,
"eval_runtime": 4.5427,
"eval_samples_per_second": 55.033,
"eval_steps_per_second": 1.761,
"step": 23
},
{
"epoch": 1.5,
"grad_norm": 5.795580863952637,
"learning_rate": 4.3037974683544305e-05,
"loss": 0.7534,
"step": 24
},
{
"epoch": 1.5,
"eval_accuracy": 0.52,
"eval_loss": 0.6902949213981628,
"eval_runtime": 4.5361,
"eval_samples_per_second": 55.113,
"eval_steps_per_second": 1.764,
"step": 24
},
{
"epoch": 1.5625,
"grad_norm": 2.294114589691162,
"learning_rate": 4.2721518987341776e-05,
"loss": 0.7245,
"step": 25
},
{
"epoch": 1.5625,
"eval_accuracy": 0.52,
"eval_loss": 0.6901406049728394,
"eval_runtime": 4.4936,
"eval_samples_per_second": 55.634,
"eval_steps_per_second": 1.78,
"step": 25
},
{
"epoch": 1.625,
"grad_norm": 7.018092155456543,
"learning_rate": 4.240506329113924e-05,
"loss": 0.692,
"step": 26
},
{
"epoch": 1.625,
"eval_accuracy": 0.508,
"eval_loss": 0.6903828382492065,
"eval_runtime": 4.5453,
"eval_samples_per_second": 55.002,
"eval_steps_per_second": 1.76,
"step": 26
},
{
"epoch": 1.6875,
"grad_norm": 7.2787909507751465,
"learning_rate": 4.208860759493671e-05,
"loss": 0.754,
"step": 27
},
{
"epoch": 1.6875,
"eval_accuracy": 0.512,
"eval_loss": 0.6910136938095093,
"eval_runtime": 4.5501,
"eval_samples_per_second": 54.944,
"eval_steps_per_second": 1.758,
"step": 27
},
{
"epoch": 1.75,
"grad_norm": 6.70567512512207,
"learning_rate": 4.177215189873418e-05,
"loss": 0.7132,
"step": 28
},
{
"epoch": 1.75,
"eval_accuracy": 0.508,
"eval_loss": 0.691275417804718,
"eval_runtime": 4.4942,
"eval_samples_per_second": 55.627,
"eval_steps_per_second": 1.78,
"step": 28
},
{
"epoch": 1.8125,
"grad_norm": 7.861635208129883,
"learning_rate": 4.145569620253165e-05,
"loss": 0.7075,
"step": 29
},
{
"epoch": 1.8125,
"eval_accuracy": 0.52,
"eval_loss": 0.6910507678985596,
"eval_runtime": 4.5518,
"eval_samples_per_second": 54.924,
"eval_steps_per_second": 1.758,
"step": 29
},
{
"epoch": 1.875,
"grad_norm": 6.021241188049316,
"learning_rate": 4.113924050632912e-05,
"loss": 0.7013,
"step": 30
},
{
"epoch": 1.875,
"eval_accuracy": 0.528,
"eval_loss": 0.6917343735694885,
"eval_runtime": 4.5466,
"eval_samples_per_second": 54.986,
"eval_steps_per_second": 1.76,
"step": 30
},
{
"epoch": 1.9375,
"grad_norm": 4.954082012176514,
"learning_rate": 4.0822784810126584e-05,
"loss": 0.7059,
"step": 31
},
{
"epoch": 1.9375,
"eval_accuracy": 0.508,
"eval_loss": 0.692550778388977,
"eval_runtime": 4.5373,
"eval_samples_per_second": 55.099,
"eval_steps_per_second": 1.763,
"step": 31
},
{
"epoch": 2.0,
"grad_norm": 4.5776824951171875,
"learning_rate": 4.050632911392405e-05,
"loss": 0.6932,
"step": 32
},
{
"epoch": 2.0,
"eval_accuracy": 0.52,
"eval_loss": 0.692632794380188,
"eval_runtime": 4.5448,
"eval_samples_per_second": 55.008,
"eval_steps_per_second": 1.76,
"step": 32
},
{
"epoch": 2.0625,
"grad_norm": 8.209676742553711,
"learning_rate": 4.018987341772152e-05,
"loss": 0.7083,
"step": 33
},
{
"epoch": 2.0625,
"eval_accuracy": 0.52,
"eval_loss": 0.6923945546150208,
"eval_runtime": 4.5461,
"eval_samples_per_second": 54.993,
"eval_steps_per_second": 1.76,
"step": 33
},
{
"epoch": 2.125,
"grad_norm": 2.001976490020752,
"learning_rate": 3.987341772151899e-05,
"loss": 0.7423,
"step": 34
},
{
"epoch": 2.125,
"eval_accuracy": 0.528,
"eval_loss": 0.692144513130188,
"eval_runtime": 4.5419,
"eval_samples_per_second": 55.043,
"eval_steps_per_second": 1.761,
"step": 34
},
{
"epoch": 2.1875,
"grad_norm": 7.856252670288086,
"learning_rate": 3.9556962025316456e-05,
"loss": 0.6794,
"step": 35
},
{
"epoch": 2.1875,
"eval_accuracy": 0.532,
"eval_loss": 0.6922343969345093,
"eval_runtime": 4.5401,
"eval_samples_per_second": 55.065,
"eval_steps_per_second": 1.762,
"step": 35
},
{
"epoch": 2.25,
"grad_norm": 10.469124794006348,
"learning_rate": 3.924050632911392e-05,
"loss": 0.7089,
"step": 36
},
{
"epoch": 2.25,
"eval_accuracy": 0.52,
"eval_loss": 0.6925742030143738,
"eval_runtime": 4.4947,
"eval_samples_per_second": 55.621,
"eval_steps_per_second": 1.78,
"step": 36
},
{
"epoch": 2.3125,
"grad_norm": 12.528965950012207,
"learning_rate": 3.89240506329114e-05,
"loss": 0.738,
"step": 37
},
{
"epoch": 2.3125,
"eval_accuracy": 0.512,
"eval_loss": 0.6928867101669312,
"eval_runtime": 4.4904,
"eval_samples_per_second": 55.674,
"eval_steps_per_second": 1.782,
"step": 37
},
{
"epoch": 2.375,
"grad_norm": 10.900518417358398,
"learning_rate": 3.8607594936708864e-05,
"loss": 0.6796,
"step": 38
},
{
"epoch": 2.375,
"eval_accuracy": 0.512,
"eval_loss": 0.6924609541893005,
"eval_runtime": 4.5411,
"eval_samples_per_second": 55.052,
"eval_steps_per_second": 1.762,
"step": 38
},
{
"epoch": 2.4375,
"grad_norm": 1.5410585403442383,
"learning_rate": 3.829113924050633e-05,
"loss": 0.6729,
"step": 39
},
{
"epoch": 2.4375,
"eval_accuracy": 0.512,
"eval_loss": 0.6923437714576721,
"eval_runtime": 4.5412,
"eval_samples_per_second": 55.052,
"eval_steps_per_second": 1.762,
"step": 39
},
{
"epoch": 2.5,
"grad_norm": 5.861754894256592,
"learning_rate": 3.79746835443038e-05,
"loss": 0.6589,
"step": 40
},
{
"epoch": 2.5,
"eval_accuracy": 0.512,
"eval_loss": 0.6922851800918579,
"eval_runtime": 4.5478,
"eval_samples_per_second": 54.971,
"eval_steps_per_second": 1.759,
"step": 40
},
{
"epoch": 2.5625,
"grad_norm": 2.633316993713379,
"learning_rate": 3.765822784810127e-05,
"loss": 0.7336,
"step": 41
},
{
"epoch": 2.5625,
"eval_accuracy": 0.512,
"eval_loss": 0.6914882659912109,
"eval_runtime": 4.4965,
"eval_samples_per_second": 55.598,
"eval_steps_per_second": 1.779,
"step": 41
},
{
"epoch": 2.625,
"grad_norm": 4.3643693923950195,
"learning_rate": 3.7341772151898736e-05,
"loss": 0.7018,
"step": 42
},
{
"epoch": 2.625,
"eval_accuracy": 0.524,
"eval_loss": 0.690136730670929,
"eval_runtime": 4.5418,
"eval_samples_per_second": 55.045,
"eval_steps_per_second": 1.761,
"step": 42
},
{
"epoch": 2.6875,
"grad_norm": 4.561107158660889,
"learning_rate": 3.70253164556962e-05,
"loss": 0.7331,
"step": 43
},
{
"epoch": 2.6875,
"eval_accuracy": 0.536,
"eval_loss": 0.6878847479820251,
"eval_runtime": 4.5431,
"eval_samples_per_second": 55.028,
"eval_steps_per_second": 1.761,
"step": 43
},
{
"epoch": 2.75,
"grad_norm": 2.425762891769409,
"learning_rate": 3.670886075949367e-05,
"loss": 0.6961,
"step": 44
},
{
"epoch": 2.75,
"eval_accuracy": 0.544,
"eval_loss": 0.6869159936904907,
"eval_runtime": 4.5385,
"eval_samples_per_second": 55.084,
"eval_steps_per_second": 1.763,
"step": 44
},
{
"epoch": 2.8125,
"grad_norm": 7.950039863586426,
"learning_rate": 3.639240506329114e-05,
"loss": 0.7228,
"step": 45
},
{
"epoch": 2.8125,
"eval_accuracy": 0.544,
"eval_loss": 0.6861679553985596,
"eval_runtime": 4.5403,
"eval_samples_per_second": 55.063,
"eval_steps_per_second": 1.762,
"step": 45
},
{
"epoch": 2.875,
"grad_norm": 12.410717964172363,
"learning_rate": 3.607594936708861e-05,
"loss": 0.7031,
"step": 46
},
{
"epoch": 2.875,
"eval_accuracy": 0.532,
"eval_loss": 0.685476541519165,
"eval_runtime": 4.5437,
"eval_samples_per_second": 55.022,
"eval_steps_per_second": 1.761,
"step": 46
},
{
"epoch": 2.9375,
"grad_norm": 3.116471767425537,
"learning_rate": 3.575949367088608e-05,
"loss": 0.6885,
"step": 47
},
{
"epoch": 2.9375,
"eval_accuracy": 0.544,
"eval_loss": 0.6849316358566284,
"eval_runtime": 4.5418,
"eval_samples_per_second": 55.045,
"eval_steps_per_second": 1.761,
"step": 47
},
{
"epoch": 3.0,
"grad_norm": 6.724969387054443,
"learning_rate": 3.5443037974683544e-05,
"loss": 0.7062,
"step": 48
},
{
"epoch": 3.0,
"eval_accuracy": 0.532,
"eval_loss": 0.6846718788146973,
"eval_runtime": 4.4458,
"eval_samples_per_second": 56.233,
"eval_steps_per_second": 1.799,
"step": 48
},
{
"epoch": 3.0625,
"grad_norm": 2.1322343349456787,
"learning_rate": 3.5126582278481015e-05,
"loss": 0.6679,
"step": 49
},
{
"epoch": 3.0625,
"eval_accuracy": 0.532,
"eval_loss": 0.6838710904121399,
"eval_runtime": 4.4926,
"eval_samples_per_second": 55.648,
"eval_steps_per_second": 1.781,
"step": 49
},
{
"epoch": 3.125,
"grad_norm": 6.895395278930664,
"learning_rate": 3.4810126582278487e-05,
"loss": 0.6956,
"step": 50
},
{
"epoch": 3.125,
"eval_accuracy": 0.532,
"eval_loss": 0.6838496327400208,
"eval_runtime": 4.5422,
"eval_samples_per_second": 55.04,
"eval_steps_per_second": 1.761,
"step": 50
},
{
"epoch": 3.1875,
"grad_norm": 10.101134300231934,
"learning_rate": 3.449367088607595e-05,
"loss": 0.7449,
"step": 51
},
{
"epoch": 3.1875,
"eval_accuracy": 0.528,
"eval_loss": 0.6836503744125366,
"eval_runtime": 4.5,
"eval_samples_per_second": 55.556,
"eval_steps_per_second": 1.778,
"step": 51
},
{
"epoch": 3.25,
"grad_norm": 5.3039422035217285,
"learning_rate": 3.4177215189873416e-05,
"loss": 0.6853,
"step": 52
},
{
"epoch": 3.25,
"eval_accuracy": 0.536,
"eval_loss": 0.6831699013710022,
"eval_runtime": 4.4953,
"eval_samples_per_second": 55.613,
"eval_steps_per_second": 1.78,
"step": 52
},
{
"epoch": 3.3125,
"grad_norm": 2.962162733078003,
"learning_rate": 3.386075949367089e-05,
"loss": 0.7127,
"step": 53
},
{
"epoch": 3.3125,
"eval_accuracy": 0.536,
"eval_loss": 0.6828047037124634,
"eval_runtime": 4.5467,
"eval_samples_per_second": 54.985,
"eval_steps_per_second": 1.76,
"step": 53
},
{
"epoch": 3.375,
"grad_norm": 4.858814239501953,
"learning_rate": 3.354430379746836e-05,
"loss": 0.6544,
"step": 54
},
{
"epoch": 3.375,
"eval_accuracy": 0.556,
"eval_loss": 0.6824140548706055,
"eval_runtime": 4.5474,
"eval_samples_per_second": 54.976,
"eval_steps_per_second": 1.759,
"step": 54
},
{
"epoch": 3.4375,
"grad_norm": 5.237043380737305,
"learning_rate": 3.322784810126582e-05,
"loss": 0.6638,
"step": 55
},
{
"epoch": 3.4375,
"eval_accuracy": 0.532,
"eval_loss": 0.6816914081573486,
"eval_runtime": 4.5453,
"eval_samples_per_second": 55.002,
"eval_steps_per_second": 1.76,
"step": 55
},
{
"epoch": 3.5,
"grad_norm": 3.878478527069092,
"learning_rate": 3.291139240506329e-05,
"loss": 0.7148,
"step": 56
},
{
"epoch": 3.5,
"eval_accuracy": 0.536,
"eval_loss": 0.6814433336257935,
"eval_runtime": 4.5412,
"eval_samples_per_second": 55.052,
"eval_steps_per_second": 1.762,
"step": 56
},
{
"epoch": 3.5625,
"grad_norm": 4.188953399658203,
"learning_rate": 3.2594936708860766e-05,
"loss": 0.7003,
"step": 57
},
{
"epoch": 3.5625,
"eval_accuracy": 0.528,
"eval_loss": 0.6815546751022339,
"eval_runtime": 4.5437,
"eval_samples_per_second": 55.021,
"eval_steps_per_second": 1.761,
"step": 57
},
{
"epoch": 3.625,
"grad_norm": 12.408546447753906,
"learning_rate": 3.227848101265823e-05,
"loss": 0.771,
"step": 58
},
{
"epoch": 3.625,
"eval_accuracy": 0.528,
"eval_loss": 0.681021511554718,
"eval_runtime": 4.5384,
"eval_samples_per_second": 55.086,
"eval_steps_per_second": 1.763,
"step": 58
},
{
"epoch": 3.6875,
"grad_norm": 3.4157402515411377,
"learning_rate": 3.1962025316455695e-05,
"loss": 0.6973,
"step": 59
},
{
"epoch": 3.6875,
"eval_accuracy": 0.512,
"eval_loss": 0.6810234189033508,
"eval_runtime": 4.5487,
"eval_samples_per_second": 54.96,
"eval_steps_per_second": 1.759,
"step": 59
},
{
"epoch": 3.75,
"grad_norm": 7.873476028442383,
"learning_rate": 3.1645569620253167e-05,
"loss": 0.7426,
"step": 60
},
{
"epoch": 3.75,
"eval_accuracy": 0.512,
"eval_loss": 0.6811171770095825,
"eval_runtime": 4.5472,
"eval_samples_per_second": 54.978,
"eval_steps_per_second": 1.759,
"step": 60
},
{
"epoch": 3.8125,
"grad_norm": 5.3661322593688965,
"learning_rate": 3.132911392405064e-05,
"loss": 0.6969,
"step": 61
},
{
"epoch": 3.8125,
"eval_accuracy": 0.528,
"eval_loss": 0.6811054944992065,
"eval_runtime": 4.5489,
"eval_samples_per_second": 54.959,
"eval_steps_per_second": 1.759,
"step": 61
},
{
"epoch": 3.875,
"grad_norm": 2.467409372329712,
"learning_rate": 3.10126582278481e-05,
"loss": 0.7369,
"step": 62
},
{
"epoch": 3.875,
"eval_accuracy": 0.54,
"eval_loss": 0.6803652048110962,
"eval_runtime": 4.5424,
"eval_samples_per_second": 55.037,
"eval_steps_per_second": 1.761,
"step": 62
},
{
"epoch": 3.9375,
"grad_norm": 2.4884164333343506,
"learning_rate": 3.0696202531645574e-05,
"loss": 0.6572,
"step": 63
},
{
"epoch": 3.9375,
"eval_accuracy": 0.56,
"eval_loss": 0.6803945302963257,
"eval_runtime": 4.5015,
"eval_samples_per_second": 55.537,
"eval_steps_per_second": 1.777,
"step": 63
},
{
"epoch": 4.0,
"grad_norm": 1.9957572221755981,
"learning_rate": 3.0379746835443042e-05,
"loss": 0.758,
"step": 64
},
{
"epoch": 4.0,
"eval_accuracy": 0.544,
"eval_loss": 0.6798281073570251,
"eval_runtime": 4.5427,
"eval_samples_per_second": 55.033,
"eval_steps_per_second": 1.761,
"step": 64
},
{
"epoch": 4.0625,
"grad_norm": 11.552275657653809,
"learning_rate": 3.0063291139240506e-05,
"loss": 0.7428,
"step": 65
},
{
"epoch": 4.0625,
"eval_accuracy": 0.524,
"eval_loss": 0.679925799369812,
"eval_runtime": 4.539,
"eval_samples_per_second": 55.078,
"eval_steps_per_second": 1.763,
"step": 65
},
{
"epoch": 4.125,
"grad_norm": 2.6973438262939453,
"learning_rate": 2.9746835443037974e-05,
"loss": 0.6784,
"step": 66
},
{
"epoch": 4.125,
"eval_accuracy": 0.536,
"eval_loss": 0.6790605187416077,
"eval_runtime": 4.5462,
"eval_samples_per_second": 54.991,
"eval_steps_per_second": 1.76,
"step": 66
},
{
"epoch": 4.1875,
"grad_norm": 3.727440595626831,
"learning_rate": 2.9430379746835446e-05,
"loss": 0.7045,
"step": 67
},
{
"epoch": 4.1875,
"eval_accuracy": 0.528,
"eval_loss": 0.6793281435966492,
"eval_runtime": 4.5469,
"eval_samples_per_second": 54.983,
"eval_steps_per_second": 1.759,
"step": 67
},
{
"epoch": 4.25,
"grad_norm": 1.7801040410995483,
"learning_rate": 2.9113924050632914e-05,
"loss": 0.643,
"step": 68
},
{
"epoch": 4.25,
"eval_accuracy": 0.512,
"eval_loss": 0.6788183450698853,
"eval_runtime": 4.5354,
"eval_samples_per_second": 55.122,
"eval_steps_per_second": 1.764,
"step": 68
},
{
"epoch": 4.3125,
"grad_norm": 3.4789085388183594,
"learning_rate": 2.879746835443038e-05,
"loss": 0.675,
"step": 69
},
{
"epoch": 4.3125,
"eval_accuracy": 0.52,
"eval_loss": 0.6782128810882568,
"eval_runtime": 4.5402,
"eval_samples_per_second": 55.063,
"eval_steps_per_second": 1.762,
"step": 69
},
{
"epoch": 4.375,
"grad_norm": 4.243752956390381,
"learning_rate": 2.848101265822785e-05,
"loss": 0.6469,
"step": 70
},
{
"epoch": 4.375,
"eval_accuracy": 0.508,
"eval_loss": 0.6780292987823486,
"eval_runtime": 4.543,
"eval_samples_per_second": 55.029,
"eval_steps_per_second": 1.761,
"step": 70
},
{
"epoch": 4.4375,
"grad_norm": 6.593841552734375,
"learning_rate": 2.8164556962025318e-05,
"loss": 0.7455,
"step": 71
},
{
"epoch": 4.4375,
"eval_accuracy": 0.516,
"eval_loss": 0.6775800585746765,
"eval_runtime": 4.545,
"eval_samples_per_second": 55.005,
"eval_steps_per_second": 1.76,
"step": 71
},
{
"epoch": 4.5,
"grad_norm": 12.047831535339355,
"learning_rate": 2.7848101265822786e-05,
"loss": 0.6985,
"step": 72
},
{
"epoch": 4.5,
"eval_accuracy": 0.516,
"eval_loss": 0.6778261661529541,
"eval_runtime": 4.5527,
"eval_samples_per_second": 54.912,
"eval_steps_per_second": 1.757,
"step": 72
},
{
"epoch": 4.5625,
"grad_norm": 3.4566452503204346,
"learning_rate": 2.7531645569620257e-05,
"loss": 0.7616,
"step": 73
},
{
"epoch": 4.5625,
"eval_accuracy": 0.52,
"eval_loss": 0.6769921779632568,
"eval_runtime": 4.545,
"eval_samples_per_second": 55.005,
"eval_steps_per_second": 1.76,
"step": 73
},
{
"epoch": 4.625,
"grad_norm": 2.8978374004364014,
"learning_rate": 2.7215189873417722e-05,
"loss": 0.7135,
"step": 74
},
{
"epoch": 4.625,
"eval_accuracy": 0.516,
"eval_loss": 0.6770429611206055,
"eval_runtime": 4.541,
"eval_samples_per_second": 55.054,
"eval_steps_per_second": 1.762,
"step": 74
},
{
"epoch": 4.6875,
"grad_norm": 3.3244338035583496,
"learning_rate": 2.689873417721519e-05,
"loss": 0.7157,
"step": 75
},
{
"epoch": 4.6875,
"eval_accuracy": 0.528,
"eval_loss": 0.6766347885131836,
"eval_runtime": 4.5403,
"eval_samples_per_second": 55.062,
"eval_steps_per_second": 1.762,
"step": 75
},
{
"epoch": 4.75,
"grad_norm": 5.23004150390625,
"learning_rate": 2.6582278481012658e-05,
"loss": 0.7058,
"step": 76
},
{
"epoch": 4.75,
"eval_accuracy": 0.528,
"eval_loss": 0.6764668226242065,
"eval_runtime": 4.54,
"eval_samples_per_second": 55.066,
"eval_steps_per_second": 1.762,
"step": 76
},
{
"epoch": 4.8125,
"grad_norm": 8.803872108459473,
"learning_rate": 2.626582278481013e-05,
"loss": 0.7127,
"step": 77
},
{
"epoch": 4.8125,
"eval_accuracy": 0.524,
"eval_loss": 0.6759433746337891,
"eval_runtime": 4.5454,
"eval_samples_per_second": 55.0,
"eval_steps_per_second": 1.76,
"step": 77
},
{
"epoch": 4.875,
"grad_norm": 3.5992655754089355,
"learning_rate": 2.5949367088607597e-05,
"loss": 0.7004,
"step": 78
},
{
"epoch": 4.875,
"eval_accuracy": 0.536,
"eval_loss": 0.6762988567352295,
"eval_runtime": 4.4993,
"eval_samples_per_second": 55.565,
"eval_steps_per_second": 1.778,
"step": 78
},
{
"epoch": 4.9375,
"grad_norm": 3.1371684074401855,
"learning_rate": 2.5632911392405062e-05,
"loss": 0.6827,
"step": 79
},
{
"epoch": 4.9375,
"eval_accuracy": 0.552,
"eval_loss": 0.6757890582084656,
"eval_runtime": 4.5476,
"eval_samples_per_second": 54.974,
"eval_steps_per_second": 1.759,
"step": 79
},
{
"epoch": 5.0,
"grad_norm": 3.854306697845459,
"learning_rate": 2.5316455696202533e-05,
"loss": 0.7649,
"step": 80
},
{
"epoch": 5.0,
"eval_accuracy": 0.54,
"eval_loss": 0.6760488152503967,
"eval_runtime": 4.539,
"eval_samples_per_second": 55.079,
"eval_steps_per_second": 1.763,
"step": 80
},
{
"epoch": 5.0625,
"grad_norm": 4.356711387634277,
"learning_rate": 2.5e-05,
"loss": 0.7461,
"step": 81
},
{
"epoch": 5.0625,
"eval_accuracy": 0.54,
"eval_loss": 0.6765702962875366,
"eval_runtime": 4.4883,
"eval_samples_per_second": 55.701,
"eval_steps_per_second": 1.782,
"step": 81
},
{
"epoch": 5.125,
"grad_norm": 4.030115127563477,
"learning_rate": 2.468354430379747e-05,
"loss": 0.6346,
"step": 82
},
{
"epoch": 5.125,
"eval_accuracy": 0.536,
"eval_loss": 0.6769394278526306,
"eval_runtime": 4.5387,
"eval_samples_per_second": 55.082,
"eval_steps_per_second": 1.763,
"step": 82
},
{
"epoch": 5.1875,
"grad_norm": 3.892704486846924,
"learning_rate": 2.4367088607594937e-05,
"loss": 0.6245,
"step": 83
},
{
"epoch": 5.1875,
"eval_accuracy": 0.548,
"eval_loss": 0.6764355301856995,
"eval_runtime": 4.548,
"eval_samples_per_second": 54.969,
"eval_steps_per_second": 1.759,
"step": 83
},
{
"epoch": 5.25,
"grad_norm": 2.755213975906372,
"learning_rate": 2.4050632911392405e-05,
"loss": 0.6595,
"step": 84
},
{
"epoch": 5.25,
"eval_accuracy": 0.548,
"eval_loss": 0.6767304539680481,
"eval_runtime": 4.5445,
"eval_samples_per_second": 55.012,
"eval_steps_per_second": 1.76,
"step": 84
},
{
"epoch": 5.3125,
"grad_norm": 9.109251976013184,
"learning_rate": 2.3734177215189873e-05,
"loss": 0.6507,
"step": 85
},
{
"epoch": 5.3125,
"eval_accuracy": 0.552,
"eval_loss": 0.6769980192184448,
"eval_runtime": 4.4987,
"eval_samples_per_second": 55.572,
"eval_steps_per_second": 1.778,
"step": 85
},
{
"epoch": 5.375,
"grad_norm": 4.487890720367432,
"learning_rate": 2.341772151898734e-05,
"loss": 0.6528,
"step": 86
},
{
"epoch": 5.375,
"eval_accuracy": 0.552,
"eval_loss": 0.6765019297599792,
"eval_runtime": 4.544,
"eval_samples_per_second": 55.017,
"eval_steps_per_second": 1.761,
"step": 86
},
{
"epoch": 5.4375,
"grad_norm": 2.2593257427215576,
"learning_rate": 2.3101265822784813e-05,
"loss": 0.687,
"step": 87
},
{
"epoch": 5.4375,
"eval_accuracy": 0.564,
"eval_loss": 0.6773359179496765,
"eval_runtime": 4.5397,
"eval_samples_per_second": 55.07,
"eval_steps_per_second": 1.762,
"step": 87
},
{
"epoch": 5.5,
"grad_norm": 9.76685905456543,
"learning_rate": 2.278481012658228e-05,
"loss": 0.6913,
"step": 88
},
{
"epoch": 5.5,
"eval_accuracy": 0.56,
"eval_loss": 0.6779413819313049,
"eval_runtime": 4.5446,
"eval_samples_per_second": 55.01,
"eval_steps_per_second": 1.76,
"step": 88
},
{
"epoch": 5.5625,
"grad_norm": 1.9855612516403198,
"learning_rate": 2.246835443037975e-05,
"loss": 0.6799,
"step": 89
},
{
"epoch": 5.5625,
"eval_accuracy": 0.56,
"eval_loss": 0.6777753829956055,
"eval_runtime": 4.5453,
"eval_samples_per_second": 55.002,
"eval_steps_per_second": 1.76,
"step": 89
},
{
"epoch": 5.625,
"grad_norm": 6.978314399719238,
"learning_rate": 2.2151898734177217e-05,
"loss": 0.6616,
"step": 90
},
{
"epoch": 5.625,
"eval_accuracy": 0.568,
"eval_loss": 0.6782050728797913,
"eval_runtime": 4.5005,
"eval_samples_per_second": 55.549,
"eval_steps_per_second": 1.778,
"step": 90
},
{
"epoch": 5.6875,
"grad_norm": 2.3891565799713135,
"learning_rate": 2.1835443037974685e-05,
"loss": 0.6577,
"step": 91
},
{
"epoch": 5.6875,
"eval_accuracy": 0.552,
"eval_loss": 0.6784765720367432,
"eval_runtime": 4.5406,
"eval_samples_per_second": 55.059,
"eval_steps_per_second": 1.762,
"step": 91
},
{
"epoch": 5.75,
"grad_norm": 4.9778313636779785,
"learning_rate": 2.1518987341772153e-05,
"loss": 0.6248,
"step": 92
},
{
"epoch": 5.75,
"eval_accuracy": 0.556,
"eval_loss": 0.678955078125,
"eval_runtime": 4.5404,
"eval_samples_per_second": 55.062,
"eval_steps_per_second": 1.762,
"step": 92
},
{
"epoch": 5.8125,
"grad_norm": 1.9475889205932617,
"learning_rate": 2.120253164556962e-05,
"loss": 0.7026,
"step": 93
},
{
"epoch": 5.8125,
"eval_accuracy": 0.552,
"eval_loss": 0.6784570217132568,
"eval_runtime": 4.5434,
"eval_samples_per_second": 55.025,
"eval_steps_per_second": 1.761,
"step": 93
},
{
"epoch": 5.875,
"grad_norm": 6.539444923400879,
"learning_rate": 2.088607594936709e-05,
"loss": 0.6816,
"step": 94
},
{
"epoch": 5.875,
"eval_accuracy": 0.536,
"eval_loss": 0.6789179444313049,
"eval_runtime": 4.5418,
"eval_samples_per_second": 55.044,
"eval_steps_per_second": 1.761,
"step": 94
},
{
"epoch": 5.9375,
"grad_norm": 1.8745115995407104,
"learning_rate": 2.056962025316456e-05,
"loss": 0.6476,
"step": 95
},
{
"epoch": 5.9375,
"eval_accuracy": 0.532,
"eval_loss": 0.6787148714065552,
"eval_runtime": 4.5397,
"eval_samples_per_second": 55.069,
"eval_steps_per_second": 1.762,
"step": 95
},
{
"epoch": 6.0,
"grad_norm": 7.960897922515869,
"learning_rate": 2.0253164556962025e-05,
"loss": 0.6797,
"step": 96
},
{
"epoch": 6.0,
"eval_accuracy": 0.54,
"eval_loss": 0.6785527467727661,
"eval_runtime": 4.5395,
"eval_samples_per_second": 55.072,
"eval_steps_per_second": 1.762,
"step": 96
},
{
"epoch": 6.0625,
"grad_norm": 6.119703769683838,
"learning_rate": 1.9936708860759496e-05,
"loss": 0.6603,
"step": 97
},
{
"epoch": 6.0625,
"eval_accuracy": 0.532,
"eval_loss": 0.6781836152076721,
"eval_runtime": 4.5394,
"eval_samples_per_second": 55.073,
"eval_steps_per_second": 1.762,
"step": 97
},
{
"epoch": 6.125,
"grad_norm": 2.6292548179626465,
"learning_rate": 1.962025316455696e-05,
"loss": 0.6892,
"step": 98
},
{
"epoch": 6.125,
"eval_accuracy": 0.54,
"eval_loss": 0.6773242354393005,
"eval_runtime": 4.5436,
"eval_samples_per_second": 55.022,
"eval_steps_per_second": 1.761,
"step": 98
},
{
"epoch": 6.1875,
"grad_norm": 5.301840305328369,
"learning_rate": 1.9303797468354432e-05,
"loss": 0.677,
"step": 99
},
{
"epoch": 6.1875,
"eval_accuracy": 0.548,
"eval_loss": 0.6762461066246033,
"eval_runtime": 4.5432,
"eval_samples_per_second": 55.027,
"eval_steps_per_second": 1.761,
"step": 99
},
{
"epoch": 6.25,
"grad_norm": 3.4270968437194824,
"learning_rate": 1.89873417721519e-05,
"loss": 0.6696,
"step": 100
},
{
"epoch": 6.25,
"eval_accuracy": 0.544,
"eval_loss": 0.6752324104309082,
"eval_runtime": 4.5411,
"eval_samples_per_second": 55.052,
"eval_steps_per_second": 1.762,
"step": 100
},
{
"epoch": 6.3125,
"grad_norm": 2.9809482097625732,
"learning_rate": 1.8670886075949368e-05,
"loss": 0.666,
"step": 101
},
{
"epoch": 6.3125,
"eval_accuracy": 0.56,
"eval_loss": 0.6741093993186951,
"eval_runtime": 4.5435,
"eval_samples_per_second": 55.024,
"eval_steps_per_second": 1.761,
"step": 101
},
{
"epoch": 6.375,
"grad_norm": 3.612354278564453,
"learning_rate": 1.8354430379746836e-05,
"loss": 0.6552,
"step": 102
},
{
"epoch": 6.375,
"eval_accuracy": 0.564,
"eval_loss": 0.6736387014389038,
"eval_runtime": 4.5443,
"eval_samples_per_second": 55.014,
"eval_steps_per_second": 1.76,
"step": 102
},
{
"epoch": 6.4375,
"grad_norm": 13.848094940185547,
"learning_rate": 1.8037974683544304e-05,
"loss": 0.6958,
"step": 103
},
{
"epoch": 6.4375,
"eval_accuracy": 0.564,
"eval_loss": 0.6730585694313049,
"eval_runtime": 4.537,
"eval_samples_per_second": 55.102,
"eval_steps_per_second": 1.763,
"step": 103
},
{
"epoch": 6.5,
"grad_norm": 2.657895565032959,
"learning_rate": 1.7721518987341772e-05,
"loss": 0.6779,
"step": 104
},
{
"epoch": 6.5,
"eval_accuracy": 0.576,
"eval_loss": 0.6721835732460022,
"eval_runtime": 4.5416,
"eval_samples_per_second": 55.047,
"eval_steps_per_second": 1.762,
"step": 104
},
{
"epoch": 6.5625,
"grad_norm": 3.6230475902557373,
"learning_rate": 1.7405063291139243e-05,
"loss": 0.662,
"step": 105
},
{
"epoch": 6.5625,
"eval_accuracy": 0.576,
"eval_loss": 0.6725234389305115,
"eval_runtime": 4.4966,
"eval_samples_per_second": 55.598,
"eval_steps_per_second": 1.779,
"step": 105
},
{
"epoch": 6.625,
"grad_norm": 2.817807674407959,
"learning_rate": 1.7088607594936708e-05,
"loss": 0.639,
"step": 106
},
{
"epoch": 6.625,
"eval_accuracy": 0.58,
"eval_loss": 0.6714980602264404,
"eval_runtime": 4.4976,
"eval_samples_per_second": 55.585,
"eval_steps_per_second": 1.779,
"step": 106
},
{
"epoch": 6.6875,
"grad_norm": 2.2491910457611084,
"learning_rate": 1.677215189873418e-05,
"loss": 0.6469,
"step": 107
},
{
"epoch": 6.6875,
"eval_accuracy": 0.564,
"eval_loss": 0.6703847646713257,
"eval_runtime": 4.5015,
"eval_samples_per_second": 55.537,
"eval_steps_per_second": 1.777,
"step": 107
},
{
"epoch": 6.75,
"grad_norm": 6.607123851776123,
"learning_rate": 1.6455696202531644e-05,
"loss": 0.6494,
"step": 108
},
{
"epoch": 6.75,
"eval_accuracy": 0.544,
"eval_loss": 0.6705585718154907,
"eval_runtime": 4.5512,
"eval_samples_per_second": 54.931,
"eval_steps_per_second": 1.758,
"step": 108
},
{
"epoch": 6.8125,
"grad_norm": 3.7436728477478027,
"learning_rate": 1.6139240506329115e-05,
"loss": 0.6428,
"step": 109
},
{
"epoch": 6.8125,
"eval_accuracy": 0.556,
"eval_loss": 0.6696659922599792,
"eval_runtime": 4.5466,
"eval_samples_per_second": 54.986,
"eval_steps_per_second": 1.76,
"step": 109
},
{
"epoch": 6.875,
"grad_norm": 10.663908004760742,
"learning_rate": 1.5822784810126583e-05,
"loss": 0.6949,
"step": 110
},
{
"epoch": 6.875,
"eval_accuracy": 0.552,
"eval_loss": 0.6699844002723694,
"eval_runtime": 4.5417,
"eval_samples_per_second": 55.046,
"eval_steps_per_second": 1.761,
"step": 110
},
{
"epoch": 6.9375,
"grad_norm": 2.8781378269195557,
"learning_rate": 1.550632911392405e-05,
"loss": 0.6557,
"step": 111
},
{
"epoch": 6.9375,
"eval_accuracy": 0.556,
"eval_loss": 0.6697744131088257,
"eval_runtime": 4.5421,
"eval_samples_per_second": 55.041,
"eval_steps_per_second": 1.761,
"step": 111
},
{
"epoch": 7.0,
"grad_norm": 9.62548828125,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.625,
"step": 112
},
{
"epoch": 7.0,
"eval_accuracy": 0.56,
"eval_loss": 0.6696327924728394,
"eval_runtime": 4.5434,
"eval_samples_per_second": 55.025,
"eval_steps_per_second": 1.761,
"step": 112
},
{
"epoch": 7.0625,
"grad_norm": 3.4376492500305176,
"learning_rate": 1.4873417721518987e-05,
"loss": 0.6648,
"step": 113
},
{
"epoch": 7.0625,
"eval_accuracy": 0.556,
"eval_loss": 0.6688730716705322,
"eval_runtime": 4.5489,
"eval_samples_per_second": 54.958,
"eval_steps_per_second": 1.759,
"step": 113
},
{
"epoch": 7.125,
"grad_norm": 11.591545104980469,
"learning_rate": 1.4556962025316457e-05,
"loss": 0.6909,
"step": 114
},
{
"epoch": 7.125,
"eval_accuracy": 0.54,
"eval_loss": 0.6680371165275574,
"eval_runtime": 4.5477,
"eval_samples_per_second": 54.973,
"eval_steps_per_second": 1.759,
"step": 114
},
{
"epoch": 7.1875,
"grad_norm": 3.0911552906036377,
"learning_rate": 1.4240506329113925e-05,
"loss": 0.6548,
"step": 115
},
{
"epoch": 7.1875,
"eval_accuracy": 0.552,
"eval_loss": 0.667611300945282,
"eval_runtime": 4.5404,
"eval_samples_per_second": 55.062,
"eval_steps_per_second": 1.762,
"step": 115
},
{
"epoch": 7.25,
"grad_norm": 5.890276908874512,
"learning_rate": 1.3924050632911393e-05,
"loss": 0.6278,
"step": 116
},
{
"epoch": 7.25,
"eval_accuracy": 0.58,
"eval_loss": 0.6670957207679749,
"eval_runtime": 4.5443,
"eval_samples_per_second": 55.014,
"eval_steps_per_second": 1.76,
"step": 116
},
{
"epoch": 7.3125,
"grad_norm": 2.038860321044922,
"learning_rate": 1.3607594936708861e-05,
"loss": 0.6899,
"step": 117
},
{
"epoch": 7.3125,
"eval_accuracy": 0.596,
"eval_loss": 0.6669042706489563,
"eval_runtime": 4.5442,
"eval_samples_per_second": 55.015,
"eval_steps_per_second": 1.76,
"step": 117
},
{
"epoch": 7.375,
"grad_norm": 7.413594722747803,
"learning_rate": 1.3291139240506329e-05,
"loss": 0.6197,
"step": 118
},
{
"epoch": 7.375,
"eval_accuracy": 0.588,
"eval_loss": 0.6667382717132568,
"eval_runtime": 4.5391,
"eval_samples_per_second": 55.077,
"eval_steps_per_second": 1.762,
"step": 118
},
{
"epoch": 7.4375,
"grad_norm": 3.1535215377807617,
"learning_rate": 1.2974683544303799e-05,
"loss": 0.653,
"step": 119
},
{
"epoch": 7.4375,
"eval_accuracy": 0.588,
"eval_loss": 0.666509747505188,
"eval_runtime": 4.5431,
"eval_samples_per_second": 55.028,
"eval_steps_per_second": 1.761,
"step": 119
},
{
"epoch": 7.5,
"grad_norm": 5.736833095550537,
"learning_rate": 1.2658227848101267e-05,
"loss": 0.6531,
"step": 120
},
{
"epoch": 7.5,
"eval_accuracy": 0.592,
"eval_loss": 0.6669736504554749,
"eval_runtime": 4.5365,
"eval_samples_per_second": 55.108,
"eval_steps_per_second": 1.763,
"step": 120
},
{
"epoch": 7.5625,
"grad_norm": 3.403089761734009,
"learning_rate": 1.2341772151898735e-05,
"loss": 0.6494,
"step": 121
},
{
"epoch": 7.5625,
"eval_accuracy": 0.584,
"eval_loss": 0.6666631102561951,
"eval_runtime": 4.5001,
"eval_samples_per_second": 55.555,
"eval_steps_per_second": 1.778,
"step": 121
},
{
"epoch": 7.625,
"grad_norm": 2.2943952083587646,
"learning_rate": 1.2025316455696203e-05,
"loss": 0.6914,
"step": 122
},
{
"epoch": 7.625,
"eval_accuracy": 0.588,
"eval_loss": 0.6671044826507568,
"eval_runtime": 4.5495,
"eval_samples_per_second": 54.952,
"eval_steps_per_second": 1.758,
"step": 122
},
{
"epoch": 7.6875,
"grad_norm": 1.8052605390548706,
"learning_rate": 1.170886075949367e-05,
"loss": 0.6506,
"step": 123
},
{
"epoch": 7.6875,
"eval_accuracy": 0.592,
"eval_loss": 0.6672109365463257,
"eval_runtime": 4.5468,
"eval_samples_per_second": 54.984,
"eval_steps_per_second": 1.759,
"step": 123
},
{
"epoch": 7.75,
"grad_norm": 2.0512139797210693,
"learning_rate": 1.139240506329114e-05,
"loss": 0.6647,
"step": 124
},
{
"epoch": 7.75,
"eval_accuracy": 0.592,
"eval_loss": 0.6669785380363464,
"eval_runtime": 4.5423,
"eval_samples_per_second": 55.039,
"eval_steps_per_second": 1.761,
"step": 124
},
{
"epoch": 7.8125,
"grad_norm": 9.648463249206543,
"learning_rate": 1.1075949367088608e-05,
"loss": 0.6476,
"step": 125
},
{
"epoch": 7.8125,
"eval_accuracy": 0.592,
"eval_loss": 0.6669345498085022,
"eval_runtime": 4.5447,
"eval_samples_per_second": 55.009,
"eval_steps_per_second": 1.76,
"step": 125
},
{
"epoch": 7.875,
"grad_norm": 3.750437021255493,
"learning_rate": 1.0759493670886076e-05,
"loss": 0.6609,
"step": 126
},
{
"epoch": 7.875,
"eval_accuracy": 0.592,
"eval_loss": 0.6669287085533142,
"eval_runtime": 4.5377,
"eval_samples_per_second": 55.094,
"eval_steps_per_second": 1.763,
"step": 126
},
{
"epoch": 7.9375,
"grad_norm": 2.9882094860076904,
"learning_rate": 1.0443037974683544e-05,
"loss": 0.6497,
"step": 127
},
{
"epoch": 7.9375,
"eval_accuracy": 0.596,
"eval_loss": 0.6663134694099426,
"eval_runtime": 4.5412,
"eval_samples_per_second": 55.052,
"eval_steps_per_second": 1.762,
"step": 127
},
{
"epoch": 8.0,
"grad_norm": 5.13292932510376,
"learning_rate": 1.0126582278481012e-05,
"loss": 0.6773,
"step": 128
},
{
"epoch": 8.0,
"eval_accuracy": 0.588,
"eval_loss": 0.6660781502723694,
"eval_runtime": 4.4907,
"eval_samples_per_second": 55.671,
"eval_steps_per_second": 1.781,
"step": 128
},
{
"epoch": 8.0625,
"grad_norm": 4.037117958068848,
"learning_rate": 9.81012658227848e-06,
"loss": 0.6841,
"step": 129
},
{
"epoch": 8.0625,
"eval_accuracy": 0.596,
"eval_loss": 0.6660195589065552,
"eval_runtime": 4.4945,
"eval_samples_per_second": 55.623,
"eval_steps_per_second": 1.78,
"step": 129
},
{
"epoch": 8.125,
"grad_norm": 9.9661865234375,
"learning_rate": 9.49367088607595e-06,
"loss": 0.657,
"step": 130
},
{
"epoch": 8.125,
"eval_accuracy": 0.592,
"eval_loss": 0.6656200885772705,
"eval_runtime": 4.5383,
"eval_samples_per_second": 55.086,
"eval_steps_per_second": 1.763,
"step": 130
},
{
"epoch": 8.1875,
"grad_norm": 8.460039138793945,
"learning_rate": 9.177215189873418e-06,
"loss": 0.6622,
"step": 131
},
{
"epoch": 8.1875,
"eval_accuracy": 0.6,
"eval_loss": 0.6657363176345825,
"eval_runtime": 4.549,
"eval_samples_per_second": 54.957,
"eval_steps_per_second": 1.759,
"step": 131
},
{
"epoch": 8.25,
"grad_norm": 2.761270046234131,
"learning_rate": 8.860759493670886e-06,
"loss": 0.667,
"step": 132
},
{
"epoch": 8.25,
"eval_accuracy": 0.592,
"eval_loss": 0.665112316608429,
"eval_runtime": 4.5498,
"eval_samples_per_second": 54.947,
"eval_steps_per_second": 1.758,
"step": 132
},
{
"epoch": 8.3125,
"grad_norm": 4.367539405822754,
"learning_rate": 8.544303797468354e-06,
"loss": 0.6662,
"step": 133
},
{
"epoch": 8.3125,
"eval_accuracy": 0.596,
"eval_loss": 0.6654492020606995,
"eval_runtime": 4.5418,
"eval_samples_per_second": 55.044,
"eval_steps_per_second": 1.761,
"step": 133
},
{
"epoch": 8.375,
"grad_norm": 3.8258039951324463,
"learning_rate": 8.227848101265822e-06,
"loss": 0.615,
"step": 134
},
{
"epoch": 8.375,
"eval_accuracy": 0.588,
"eval_loss": 0.6657968759536743,
"eval_runtime": 4.5412,
"eval_samples_per_second": 55.051,
"eval_steps_per_second": 1.762,
"step": 134
},
{
"epoch": 8.4375,
"grad_norm": 2.691741466522217,
"learning_rate": 7.911392405063292e-06,
"loss": 0.6961,
"step": 135
},
{
"epoch": 8.4375,
"eval_accuracy": 0.596,
"eval_loss": 0.6653828024864197,
"eval_runtime": 4.5434,
"eval_samples_per_second": 55.025,
"eval_steps_per_second": 1.761,
"step": 135
},
{
"epoch": 8.5,
"grad_norm": 5.671183109283447,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.6134,
"step": 136
},
{
"epoch": 8.5,
"eval_accuracy": 0.6,
"eval_loss": 0.6660419702529907,
"eval_runtime": 4.5413,
"eval_samples_per_second": 55.05,
"eval_steps_per_second": 1.762,
"step": 136
},
{
"epoch": 8.5625,
"grad_norm": 7.398742198944092,
"learning_rate": 7.2784810126582285e-06,
"loss": 0.6839,
"step": 137
},
{
"epoch": 8.5625,
"eval_accuracy": 0.592,
"eval_loss": 0.6657724380493164,
"eval_runtime": 4.5404,
"eval_samples_per_second": 55.061,
"eval_steps_per_second": 1.762,
"step": 137
},
{
"epoch": 8.625,
"grad_norm": 4.798144340515137,
"learning_rate": 6.9620253164556965e-06,
"loss": 0.6482,
"step": 138
},
{
"epoch": 8.625,
"eval_accuracy": 0.596,
"eval_loss": 0.666140615940094,
"eval_runtime": 4.5422,
"eval_samples_per_second": 55.039,
"eval_steps_per_second": 1.761,
"step": 138
},
{
"epoch": 8.6875,
"grad_norm": 8.25437068939209,
"learning_rate": 6.6455696202531645e-06,
"loss": 0.6635,
"step": 139
},
{
"epoch": 8.6875,
"eval_accuracy": 0.604,
"eval_loss": 0.6665273308753967,
"eval_runtime": 4.5468,
"eval_samples_per_second": 54.984,
"eval_steps_per_second": 1.759,
"step": 139
},
{
"epoch": 8.75,
"grad_norm": 2.5796449184417725,
"learning_rate": 6.329113924050633e-06,
"loss": 0.6229,
"step": 140
},
{
"epoch": 8.75,
"eval_accuracy": 0.608,
"eval_loss": 0.6665956974029541,
"eval_runtime": 4.5394,
"eval_samples_per_second": 55.073,
"eval_steps_per_second": 1.762,
"step": 140
},
{
"epoch": 8.8125,
"grad_norm": 2.3988282680511475,
"learning_rate": 6.012658227848101e-06,
"loss": 0.6205,
"step": 141
},
{
"epoch": 8.8125,
"eval_accuracy": 0.604,
"eval_loss": 0.6665576100349426,
"eval_runtime": 4.5397,
"eval_samples_per_second": 55.07,
"eval_steps_per_second": 1.762,
"step": 141
},
{
"epoch": 8.875,
"grad_norm": 3.2234578132629395,
"learning_rate": 5.69620253164557e-06,
"loss": 0.6347,
"step": 142
},
{
"epoch": 8.875,
"eval_accuracy": 0.596,
"eval_loss": 0.6664531230926514,
"eval_runtime": 4.4932,
"eval_samples_per_second": 55.64,
"eval_steps_per_second": 1.78,
"step": 142
},
{
"epoch": 8.9375,
"grad_norm": 3.1038153171539307,
"learning_rate": 5.379746835443038e-06,
"loss": 0.6868,
"step": 143
},
{
"epoch": 8.9375,
"eval_accuracy": 0.608,
"eval_loss": 0.6668280959129333,
"eval_runtime": 4.5461,
"eval_samples_per_second": 54.993,
"eval_steps_per_second": 1.76,
"step": 143
},
{
"epoch": 9.0,
"grad_norm": 5.682613849639893,
"learning_rate": 5.063291139240506e-06,
"loss": 0.6447,
"step": 144
},
{
"epoch": 9.0,
"eval_accuracy": 0.604,
"eval_loss": 0.6665273308753967,
"eval_runtime": 4.5397,
"eval_samples_per_second": 55.069,
"eval_steps_per_second": 1.762,
"step": 144
},
{
"epoch": 9.0625,
"grad_norm": 8.149535179138184,
"learning_rate": 4.746835443037975e-06,
"loss": 0.6755,
"step": 145
},
{
"epoch": 9.0625,
"eval_accuracy": 0.596,
"eval_loss": 0.6669501662254333,
"eval_runtime": 4.497,
"eval_samples_per_second": 55.592,
"eval_steps_per_second": 1.779,
"step": 145
},
{
"epoch": 9.125,
"grad_norm": 3.2166755199432373,
"learning_rate": 4.430379746835443e-06,
"loss": 0.6749,
"step": 146
},
{
"epoch": 9.125,
"eval_accuracy": 0.604,
"eval_loss": 0.667477548122406,
"eval_runtime": 4.549,
"eval_samples_per_second": 54.957,
"eval_steps_per_second": 1.759,
"step": 146
},
{
"epoch": 9.1875,
"grad_norm": 2.9138267040252686,
"learning_rate": 4.113924050632911e-06,
"loss": 0.6681,
"step": 147
},
{
"epoch": 9.1875,
"eval_accuracy": 0.596,
"eval_loss": 0.6679531335830688,
"eval_runtime": 4.5432,
"eval_samples_per_second": 55.027,
"eval_steps_per_second": 1.761,
"step": 147
},
{
"epoch": 9.25,
"grad_norm": 8.955977439880371,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.6768,
"step": 148
},
{
"epoch": 9.25,
"eval_accuracy": 0.6,
"eval_loss": 0.667892575263977,
"eval_runtime": 4.5414,
"eval_samples_per_second": 55.049,
"eval_steps_per_second": 1.762,
"step": 148
},
{
"epoch": 9.3125,
"grad_norm": 4.039650917053223,
"learning_rate": 3.4810126582278482e-06,
"loss": 0.6291,
"step": 149
},
{
"epoch": 9.3125,
"eval_accuracy": 0.596,
"eval_loss": 0.6680244207382202,
"eval_runtime": 4.5436,
"eval_samples_per_second": 55.022,
"eval_steps_per_second": 1.761,
"step": 149
},
{
"epoch": 9.375,
"grad_norm": 3.648364543914795,
"learning_rate": 3.1645569620253167e-06,
"loss": 0.6857,
"step": 150
},
{
"epoch": 9.375,
"eval_accuracy": 0.596,
"eval_loss": 0.6680644750595093,
"eval_runtime": 4.5475,
"eval_samples_per_second": 54.975,
"eval_steps_per_second": 1.759,
"step": 150
},
{
"epoch": 9.4375,
"grad_norm": 2.3928475379943848,
"learning_rate": 2.848101265822785e-06,
"loss": 0.6454,
"step": 151
},
{
"epoch": 9.4375,
"eval_accuracy": 0.604,
"eval_loss": 0.6678046584129333,
"eval_runtime": 4.5472,
"eval_samples_per_second": 54.979,
"eval_steps_per_second": 1.759,
"step": 151
},
{
"epoch": 9.5,
"grad_norm": 1.8685684204101562,
"learning_rate": 2.531645569620253e-06,
"loss": 0.648,
"step": 152
},
{
"epoch": 9.5,
"eval_accuracy": 0.592,
"eval_loss": 0.6684179902076721,
"eval_runtime": 4.5422,
"eval_samples_per_second": 55.04,
"eval_steps_per_second": 1.761,
"step": 152
},
{
"epoch": 9.5625,
"grad_norm": 6.94075345993042,
"learning_rate": 2.2151898734177215e-06,
"loss": 0.5989,
"step": 153
},
{
"epoch": 9.5625,
"eval_accuracy": 0.604,
"eval_loss": 0.6686621308326721,
"eval_runtime": 4.549,
"eval_samples_per_second": 54.957,
"eval_steps_per_second": 1.759,
"step": 153
},
{
"epoch": 9.625,
"grad_norm": 2.2033395767211914,
"learning_rate": 1.8987341772151901e-06,
"loss": 0.6334,
"step": 154
},
{
"epoch": 9.625,
"eval_accuracy": 0.588,
"eval_loss": 0.6692119240760803,
"eval_runtime": 4.5415,
"eval_samples_per_second": 55.048,
"eval_steps_per_second": 1.762,
"step": 154
},
{
"epoch": 9.6875,
"grad_norm": 4.624488353729248,
"learning_rate": 1.5822784810126583e-06,
"loss": 0.6086,
"step": 155
},
{
"epoch": 9.6875,
"eval_accuracy": 0.592,
"eval_loss": 0.6682060360908508,
"eval_runtime": 4.5376,
"eval_samples_per_second": 55.095,
"eval_steps_per_second": 1.763,
"step": 155
},
{
"epoch": 9.75,
"grad_norm": 8.24832820892334,
"learning_rate": 1.2658227848101265e-06,
"loss": 0.6355,
"step": 156
},
{
"epoch": 9.75,
"eval_accuracy": 0.6,
"eval_loss": 0.6689130663871765,
"eval_runtime": 4.5365,
"eval_samples_per_second": 55.109,
"eval_steps_per_second": 1.763,
"step": 156
},
{
"epoch": 9.8125,
"grad_norm": 1.9968777894973755,
"learning_rate": 9.493670886075951e-07,
"loss": 0.618,
"step": 157
},
{
"epoch": 9.8125,
"eval_accuracy": 0.596,
"eval_loss": 0.6689111590385437,
"eval_runtime": 4.547,
"eval_samples_per_second": 54.981,
"eval_steps_per_second": 1.759,
"step": 157
},
{
"epoch": 9.875,
"grad_norm": 2.4490880966186523,
"learning_rate": 6.329113924050633e-07,
"loss": 0.6603,
"step": 158
},
{
"epoch": 9.875,
"eval_accuracy": 0.596,
"eval_loss": 0.6684619188308716,
"eval_runtime": 4.5445,
"eval_samples_per_second": 55.012,
"eval_steps_per_second": 1.76,
"step": 158
},
{
"epoch": 9.9375,
"grad_norm": 5.009583950042725,
"learning_rate": 3.1645569620253163e-07,
"loss": 0.6585,
"step": 159
},
{
"epoch": 9.9375,
"eval_accuracy": 0.6,
"eval_loss": 0.6681816577911377,
"eval_runtime": 4.5393,
"eval_samples_per_second": 55.075,
"eval_steps_per_second": 1.762,
"step": 159
},
{
"epoch": 10.0,
"grad_norm": 3.7535176277160645,
"learning_rate": 0.0,
"loss": 0.6705,
"step": 160
},
{
"epoch": 10.0,
"eval_accuracy": 0.6,
"eval_loss": 0.6681679487228394,
"eval_runtime": 4.4878,
"eval_samples_per_second": 55.706,
"eval_steps_per_second": 1.783,
"step": 160
},
{
"epoch": 10.0,
"step": 160,
"total_flos": 174253428178944.0,
"train_loss": 0.6897118806838989,
"train_runtime": 1349.6743,
"train_samples_per_second": 7.409,
"train_steps_per_second": 0.119
}
],
"logging_steps": 1,
"max_steps": 160,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 174253428178944.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}