adapters-opt-gptq-QLORA-super_glue-rte
/
trainer_state-opt-gptq-QLORA-super_glue-rte-sequence_classification.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 10.0, | |
"eval_steps": 1, | |
"global_step": 160, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.0625, | |
"grad_norm": 17.75647735595703, | |
"learning_rate": 2.5e-05, | |
"loss": 0.8699, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.0625, | |
"eval_accuracy": 0.496, | |
"eval_loss": 0.837658703327179, | |
"eval_runtime": 4.5469, | |
"eval_samples_per_second": 54.983, | |
"eval_steps_per_second": 1.759, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.125, | |
"grad_norm": 10.630256652832031, | |
"learning_rate": 5e-05, | |
"loss": 0.8372, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.125, | |
"eval_accuracy": 0.496, | |
"eval_loss": 0.8297348618507385, | |
"eval_runtime": 4.4856, | |
"eval_samples_per_second": 55.733, | |
"eval_steps_per_second": 1.783, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.1875, | |
"grad_norm": 18.88325309753418, | |
"learning_rate": 4.968354430379747e-05, | |
"loss": 0.8813, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.1875, | |
"eval_accuracy": 0.496, | |
"eval_loss": 0.8005664348602295, | |
"eval_runtime": 4.5427, | |
"eval_samples_per_second": 55.034, | |
"eval_steps_per_second": 1.761, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 14.382352828979492, | |
"learning_rate": 4.936708860759494e-05, | |
"loss": 0.8725, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.25, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7717475295066833, | |
"eval_runtime": 4.495, | |
"eval_samples_per_second": 55.618, | |
"eval_steps_per_second": 1.78, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.3125, | |
"grad_norm": 18.139081954956055, | |
"learning_rate": 4.905063291139241e-05, | |
"loss": 0.8504, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.3125, | |
"eval_accuracy": 0.496, | |
"eval_loss": 0.7444770336151123, | |
"eval_runtime": 4.5407, | |
"eval_samples_per_second": 55.057, | |
"eval_steps_per_second": 1.762, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.375, | |
"grad_norm": 7.32247257232666, | |
"learning_rate": 4.8734177215189874e-05, | |
"loss": 0.8015, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.375, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7214609384536743, | |
"eval_runtime": 4.5353, | |
"eval_samples_per_second": 55.123, | |
"eval_steps_per_second": 1.764, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.4375, | |
"grad_norm": 4.8535966873168945, | |
"learning_rate": 4.8417721518987346e-05, | |
"loss": 0.7165, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.4375, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.7035966515541077, | |
"eval_runtime": 4.5437, | |
"eval_samples_per_second": 55.021, | |
"eval_steps_per_second": 1.761, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 7.805424690246582, | |
"learning_rate": 4.810126582278481e-05, | |
"loss": 0.766, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6936992406845093, | |
"eval_runtime": 4.5422, | |
"eval_samples_per_second": 55.04, | |
"eval_steps_per_second": 1.761, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5625, | |
"grad_norm": 2.8888654708862305, | |
"learning_rate": 4.778481012658228e-05, | |
"loss": 0.6915, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.5625, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6935371160507202, | |
"eval_runtime": 4.4937, | |
"eval_samples_per_second": 55.633, | |
"eval_steps_per_second": 1.78, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.625, | |
"grad_norm": 7.086565971374512, | |
"learning_rate": 4.7468354430379746e-05, | |
"loss": 0.7093, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.625, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6983515620231628, | |
"eval_runtime": 4.4948, | |
"eval_samples_per_second": 55.619, | |
"eval_steps_per_second": 1.78, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.6875, | |
"grad_norm": 2.0717809200286865, | |
"learning_rate": 4.715189873417722e-05, | |
"loss": 0.7107, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.6875, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.7018242478370667, | |
"eval_runtime": 4.5362, | |
"eval_samples_per_second": 55.112, | |
"eval_steps_per_second": 1.764, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 4.748137474060059, | |
"learning_rate": 4.683544303797468e-05, | |
"loss": 0.729, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.75, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.7031621336936951, | |
"eval_runtime": 4.5441, | |
"eval_samples_per_second": 55.017, | |
"eval_steps_per_second": 1.761, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.8125, | |
"grad_norm": 5.206336975097656, | |
"learning_rate": 4.6518987341772154e-05, | |
"loss": 0.7819, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.8125, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.7015683650970459, | |
"eval_runtime": 4.5432, | |
"eval_samples_per_second": 55.028, | |
"eval_steps_per_second": 1.761, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.875, | |
"grad_norm": 5.746586322784424, | |
"learning_rate": 4.6202531645569625e-05, | |
"loss": 0.7343, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.875, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6985683441162109, | |
"eval_runtime": 4.5466, | |
"eval_samples_per_second": 54.986, | |
"eval_steps_per_second": 1.76, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.9375, | |
"grad_norm": 2.441669464111328, | |
"learning_rate": 4.588607594936709e-05, | |
"loss": 0.7052, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.9375, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6962031126022339, | |
"eval_runtime": 4.5479, | |
"eval_samples_per_second": 54.97, | |
"eval_steps_per_second": 1.759, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 8.590319633483887, | |
"learning_rate": 4.556962025316456e-05, | |
"loss": 0.7406, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6932148337364197, | |
"eval_runtime": 4.5474, | |
"eval_samples_per_second": 54.977, | |
"eval_steps_per_second": 1.759, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0625, | |
"grad_norm": 3.343947172164917, | |
"learning_rate": 4.525316455696203e-05, | |
"loss": 0.6939, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.0625, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6910995841026306, | |
"eval_runtime": 4.5426, | |
"eval_samples_per_second": 55.035, | |
"eval_steps_per_second": 1.761, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.125, | |
"grad_norm": 5.235752582550049, | |
"learning_rate": 4.49367088607595e-05, | |
"loss": 0.6857, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.125, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.690261721611023, | |
"eval_runtime": 4.5449, | |
"eval_samples_per_second": 55.006, | |
"eval_steps_per_second": 1.76, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.1875, | |
"grad_norm": 8.583759307861328, | |
"learning_rate": 4.462025316455696e-05, | |
"loss": 0.6932, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.1875, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6894707083702087, | |
"eval_runtime": 4.5371, | |
"eval_samples_per_second": 55.101, | |
"eval_steps_per_second": 1.763, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 9.518383979797363, | |
"learning_rate": 4.430379746835443e-05, | |
"loss": 0.6993, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.25, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6895566582679749, | |
"eval_runtime": 4.5489, | |
"eval_samples_per_second": 54.958, | |
"eval_steps_per_second": 1.759, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.3125, | |
"grad_norm": 7.239161014556885, | |
"learning_rate": 4.3987341772151904e-05, | |
"loss": 0.763, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.3125, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.690017580986023, | |
"eval_runtime": 4.4884, | |
"eval_samples_per_second": 55.699, | |
"eval_steps_per_second": 1.782, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.375, | |
"grad_norm": 7.390464782714844, | |
"learning_rate": 4.367088607594937e-05, | |
"loss": 0.6954, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.375, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6905702948570251, | |
"eval_runtime": 4.5375, | |
"eval_samples_per_second": 55.097, | |
"eval_steps_per_second": 1.763, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.4375, | |
"grad_norm": 3.728330135345459, | |
"learning_rate": 4.3354430379746834e-05, | |
"loss": 0.7282, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.4375, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.690304696559906, | |
"eval_runtime": 4.5427, | |
"eval_samples_per_second": 55.033, | |
"eval_steps_per_second": 1.761, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 5.795580863952637, | |
"learning_rate": 4.3037974683544305e-05, | |
"loss": 0.7534, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6902949213981628, | |
"eval_runtime": 4.5361, | |
"eval_samples_per_second": 55.113, | |
"eval_steps_per_second": 1.764, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5625, | |
"grad_norm": 2.294114589691162, | |
"learning_rate": 4.2721518987341776e-05, | |
"loss": 0.7245, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.5625, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6901406049728394, | |
"eval_runtime": 4.4936, | |
"eval_samples_per_second": 55.634, | |
"eval_steps_per_second": 1.78, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.625, | |
"grad_norm": 7.018092155456543, | |
"learning_rate": 4.240506329113924e-05, | |
"loss": 0.692, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.625, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6903828382492065, | |
"eval_runtime": 4.5453, | |
"eval_samples_per_second": 55.002, | |
"eval_steps_per_second": 1.76, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.6875, | |
"grad_norm": 7.2787909507751465, | |
"learning_rate": 4.208860759493671e-05, | |
"loss": 0.754, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.6875, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6910136938095093, | |
"eval_runtime": 4.5501, | |
"eval_samples_per_second": 54.944, | |
"eval_steps_per_second": 1.758, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 6.70567512512207, | |
"learning_rate": 4.177215189873418e-05, | |
"loss": 0.7132, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.75, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.691275417804718, | |
"eval_runtime": 4.4942, | |
"eval_samples_per_second": 55.627, | |
"eval_steps_per_second": 1.78, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.8125, | |
"grad_norm": 7.861635208129883, | |
"learning_rate": 4.145569620253165e-05, | |
"loss": 0.7075, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.8125, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6910507678985596, | |
"eval_runtime": 4.5518, | |
"eval_samples_per_second": 54.924, | |
"eval_steps_per_second": 1.758, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.875, | |
"grad_norm": 6.021241188049316, | |
"learning_rate": 4.113924050632912e-05, | |
"loss": 0.7013, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.875, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6917343735694885, | |
"eval_runtime": 4.5466, | |
"eval_samples_per_second": 54.986, | |
"eval_steps_per_second": 1.76, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.9375, | |
"grad_norm": 4.954082012176514, | |
"learning_rate": 4.0822784810126584e-05, | |
"loss": 0.7059, | |
"step": 31 | |
}, | |
{ | |
"epoch": 1.9375, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.692550778388977, | |
"eval_runtime": 4.5373, | |
"eval_samples_per_second": 55.099, | |
"eval_steps_per_second": 1.763, | |
"step": 31 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 4.5776824951171875, | |
"learning_rate": 4.050632911392405e-05, | |
"loss": 0.6932, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.692632794380188, | |
"eval_runtime": 4.5448, | |
"eval_samples_per_second": 55.008, | |
"eval_steps_per_second": 1.76, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0625, | |
"grad_norm": 8.209676742553711, | |
"learning_rate": 4.018987341772152e-05, | |
"loss": 0.7083, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.0625, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6923945546150208, | |
"eval_runtime": 4.5461, | |
"eval_samples_per_second": 54.993, | |
"eval_steps_per_second": 1.76, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.125, | |
"grad_norm": 2.001976490020752, | |
"learning_rate": 3.987341772151899e-05, | |
"loss": 0.7423, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.125, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.692144513130188, | |
"eval_runtime": 4.5419, | |
"eval_samples_per_second": 55.043, | |
"eval_steps_per_second": 1.761, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.1875, | |
"grad_norm": 7.856252670288086, | |
"learning_rate": 3.9556962025316456e-05, | |
"loss": 0.6794, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.1875, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6922343969345093, | |
"eval_runtime": 4.5401, | |
"eval_samples_per_second": 55.065, | |
"eval_steps_per_second": 1.762, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 10.469124794006348, | |
"learning_rate": 3.924050632911392e-05, | |
"loss": 0.7089, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.25, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6925742030143738, | |
"eval_runtime": 4.4947, | |
"eval_samples_per_second": 55.621, | |
"eval_steps_per_second": 1.78, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.3125, | |
"grad_norm": 12.528965950012207, | |
"learning_rate": 3.89240506329114e-05, | |
"loss": 0.738, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.3125, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6928867101669312, | |
"eval_runtime": 4.4904, | |
"eval_samples_per_second": 55.674, | |
"eval_steps_per_second": 1.782, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.375, | |
"grad_norm": 10.900518417358398, | |
"learning_rate": 3.8607594936708864e-05, | |
"loss": 0.6796, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.375, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6924609541893005, | |
"eval_runtime": 4.5411, | |
"eval_samples_per_second": 55.052, | |
"eval_steps_per_second": 1.762, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.4375, | |
"grad_norm": 1.5410585403442383, | |
"learning_rate": 3.829113924050633e-05, | |
"loss": 0.6729, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.4375, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6923437714576721, | |
"eval_runtime": 4.5412, | |
"eval_samples_per_second": 55.052, | |
"eval_steps_per_second": 1.762, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 5.861754894256592, | |
"learning_rate": 3.79746835443038e-05, | |
"loss": 0.6589, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6922851800918579, | |
"eval_runtime": 4.5478, | |
"eval_samples_per_second": 54.971, | |
"eval_steps_per_second": 1.759, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5625, | |
"grad_norm": 2.633316993713379, | |
"learning_rate": 3.765822784810127e-05, | |
"loss": 0.7336, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.5625, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6914882659912109, | |
"eval_runtime": 4.4965, | |
"eval_samples_per_second": 55.598, | |
"eval_steps_per_second": 1.779, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.625, | |
"grad_norm": 4.3643693923950195, | |
"learning_rate": 3.7341772151898736e-05, | |
"loss": 0.7018, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.625, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.690136730670929, | |
"eval_runtime": 4.5418, | |
"eval_samples_per_second": 55.045, | |
"eval_steps_per_second": 1.761, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.6875, | |
"grad_norm": 4.561107158660889, | |
"learning_rate": 3.70253164556962e-05, | |
"loss": 0.7331, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.6875, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6878847479820251, | |
"eval_runtime": 4.5431, | |
"eval_samples_per_second": 55.028, | |
"eval_steps_per_second": 1.761, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 2.425762891769409, | |
"learning_rate": 3.670886075949367e-05, | |
"loss": 0.6961, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.75, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6869159936904907, | |
"eval_runtime": 4.5385, | |
"eval_samples_per_second": 55.084, | |
"eval_steps_per_second": 1.763, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.8125, | |
"grad_norm": 7.950039863586426, | |
"learning_rate": 3.639240506329114e-05, | |
"loss": 0.7228, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.8125, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6861679553985596, | |
"eval_runtime": 4.5403, | |
"eval_samples_per_second": 55.063, | |
"eval_steps_per_second": 1.762, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.875, | |
"grad_norm": 12.410717964172363, | |
"learning_rate": 3.607594936708861e-05, | |
"loss": 0.7031, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.875, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.685476541519165, | |
"eval_runtime": 4.5437, | |
"eval_samples_per_second": 55.022, | |
"eval_steps_per_second": 1.761, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.9375, | |
"grad_norm": 3.116471767425537, | |
"learning_rate": 3.575949367088608e-05, | |
"loss": 0.6885, | |
"step": 47 | |
}, | |
{ | |
"epoch": 2.9375, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6849316358566284, | |
"eval_runtime": 4.5418, | |
"eval_samples_per_second": 55.045, | |
"eval_steps_per_second": 1.761, | |
"step": 47 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 6.724969387054443, | |
"learning_rate": 3.5443037974683544e-05, | |
"loss": 0.7062, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6846718788146973, | |
"eval_runtime": 4.4458, | |
"eval_samples_per_second": 56.233, | |
"eval_steps_per_second": 1.799, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0625, | |
"grad_norm": 2.1322343349456787, | |
"learning_rate": 3.5126582278481015e-05, | |
"loss": 0.6679, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.0625, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6838710904121399, | |
"eval_runtime": 4.4926, | |
"eval_samples_per_second": 55.648, | |
"eval_steps_per_second": 1.781, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.125, | |
"grad_norm": 6.895395278930664, | |
"learning_rate": 3.4810126582278487e-05, | |
"loss": 0.6956, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.125, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6838496327400208, | |
"eval_runtime": 4.5422, | |
"eval_samples_per_second": 55.04, | |
"eval_steps_per_second": 1.761, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.1875, | |
"grad_norm": 10.101134300231934, | |
"learning_rate": 3.449367088607595e-05, | |
"loss": 0.7449, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.1875, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6836503744125366, | |
"eval_runtime": 4.5, | |
"eval_samples_per_second": 55.556, | |
"eval_steps_per_second": 1.778, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.25, | |
"grad_norm": 5.3039422035217285, | |
"learning_rate": 3.4177215189873416e-05, | |
"loss": 0.6853, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.25, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6831699013710022, | |
"eval_runtime": 4.4953, | |
"eval_samples_per_second": 55.613, | |
"eval_steps_per_second": 1.78, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.3125, | |
"grad_norm": 2.962162733078003, | |
"learning_rate": 3.386075949367089e-05, | |
"loss": 0.7127, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.3125, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6828047037124634, | |
"eval_runtime": 4.5467, | |
"eval_samples_per_second": 54.985, | |
"eval_steps_per_second": 1.76, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.375, | |
"grad_norm": 4.858814239501953, | |
"learning_rate": 3.354430379746836e-05, | |
"loss": 0.6544, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.375, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6824140548706055, | |
"eval_runtime": 4.5474, | |
"eval_samples_per_second": 54.976, | |
"eval_steps_per_second": 1.759, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.4375, | |
"grad_norm": 5.237043380737305, | |
"learning_rate": 3.322784810126582e-05, | |
"loss": 0.6638, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.4375, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6816914081573486, | |
"eval_runtime": 4.5453, | |
"eval_samples_per_second": 55.002, | |
"eval_steps_per_second": 1.76, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 3.878478527069092, | |
"learning_rate": 3.291139240506329e-05, | |
"loss": 0.7148, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6814433336257935, | |
"eval_runtime": 4.5412, | |
"eval_samples_per_second": 55.052, | |
"eval_steps_per_second": 1.762, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5625, | |
"grad_norm": 4.188953399658203, | |
"learning_rate": 3.2594936708860766e-05, | |
"loss": 0.7003, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.5625, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6815546751022339, | |
"eval_runtime": 4.5437, | |
"eval_samples_per_second": 55.021, | |
"eval_steps_per_second": 1.761, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.625, | |
"grad_norm": 12.408546447753906, | |
"learning_rate": 3.227848101265823e-05, | |
"loss": 0.771, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.625, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.681021511554718, | |
"eval_runtime": 4.5384, | |
"eval_samples_per_second": 55.086, | |
"eval_steps_per_second": 1.763, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.6875, | |
"grad_norm": 3.4157402515411377, | |
"learning_rate": 3.1962025316455695e-05, | |
"loss": 0.6973, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.6875, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6810234189033508, | |
"eval_runtime": 4.5487, | |
"eval_samples_per_second": 54.96, | |
"eval_steps_per_second": 1.759, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.75, | |
"grad_norm": 7.873476028442383, | |
"learning_rate": 3.1645569620253167e-05, | |
"loss": 0.7426, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.75, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6811171770095825, | |
"eval_runtime": 4.5472, | |
"eval_samples_per_second": 54.978, | |
"eval_steps_per_second": 1.759, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.8125, | |
"grad_norm": 5.3661322593688965, | |
"learning_rate": 3.132911392405064e-05, | |
"loss": 0.6969, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.8125, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6811054944992065, | |
"eval_runtime": 4.5489, | |
"eval_samples_per_second": 54.959, | |
"eval_steps_per_second": 1.759, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.875, | |
"grad_norm": 2.467409372329712, | |
"learning_rate": 3.10126582278481e-05, | |
"loss": 0.7369, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.875, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6803652048110962, | |
"eval_runtime": 4.5424, | |
"eval_samples_per_second": 55.037, | |
"eval_steps_per_second": 1.761, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.9375, | |
"grad_norm": 2.4884164333343506, | |
"learning_rate": 3.0696202531645574e-05, | |
"loss": 0.6572, | |
"step": 63 | |
}, | |
{ | |
"epoch": 3.9375, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6803945302963257, | |
"eval_runtime": 4.5015, | |
"eval_samples_per_second": 55.537, | |
"eval_steps_per_second": 1.777, | |
"step": 63 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 1.9957572221755981, | |
"learning_rate": 3.0379746835443042e-05, | |
"loss": 0.758, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6798281073570251, | |
"eval_runtime": 4.5427, | |
"eval_samples_per_second": 55.033, | |
"eval_steps_per_second": 1.761, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0625, | |
"grad_norm": 11.552275657653809, | |
"learning_rate": 3.0063291139240506e-05, | |
"loss": 0.7428, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.0625, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.679925799369812, | |
"eval_runtime": 4.539, | |
"eval_samples_per_second": 55.078, | |
"eval_steps_per_second": 1.763, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.125, | |
"grad_norm": 2.6973438262939453, | |
"learning_rate": 2.9746835443037974e-05, | |
"loss": 0.6784, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.125, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6790605187416077, | |
"eval_runtime": 4.5462, | |
"eval_samples_per_second": 54.991, | |
"eval_steps_per_second": 1.76, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.1875, | |
"grad_norm": 3.727440595626831, | |
"learning_rate": 2.9430379746835446e-05, | |
"loss": 0.7045, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.1875, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6793281435966492, | |
"eval_runtime": 4.5469, | |
"eval_samples_per_second": 54.983, | |
"eval_steps_per_second": 1.759, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.25, | |
"grad_norm": 1.7801040410995483, | |
"learning_rate": 2.9113924050632914e-05, | |
"loss": 0.643, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.25, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6788183450698853, | |
"eval_runtime": 4.5354, | |
"eval_samples_per_second": 55.122, | |
"eval_steps_per_second": 1.764, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.3125, | |
"grad_norm": 3.4789085388183594, | |
"learning_rate": 2.879746835443038e-05, | |
"loss": 0.675, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.3125, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6782128810882568, | |
"eval_runtime": 4.5402, | |
"eval_samples_per_second": 55.063, | |
"eval_steps_per_second": 1.762, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.375, | |
"grad_norm": 4.243752956390381, | |
"learning_rate": 2.848101265822785e-05, | |
"loss": 0.6469, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.375, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6780292987823486, | |
"eval_runtime": 4.543, | |
"eval_samples_per_second": 55.029, | |
"eval_steps_per_second": 1.761, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.4375, | |
"grad_norm": 6.593841552734375, | |
"learning_rate": 2.8164556962025318e-05, | |
"loss": 0.7455, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.4375, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6775800585746765, | |
"eval_runtime": 4.545, | |
"eval_samples_per_second": 55.005, | |
"eval_steps_per_second": 1.76, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.5, | |
"grad_norm": 12.047831535339355, | |
"learning_rate": 2.7848101265822786e-05, | |
"loss": 0.6985, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6778261661529541, | |
"eval_runtime": 4.5527, | |
"eval_samples_per_second": 54.912, | |
"eval_steps_per_second": 1.757, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5625, | |
"grad_norm": 3.4566452503204346, | |
"learning_rate": 2.7531645569620257e-05, | |
"loss": 0.7616, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.5625, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6769921779632568, | |
"eval_runtime": 4.545, | |
"eval_samples_per_second": 55.005, | |
"eval_steps_per_second": 1.76, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.625, | |
"grad_norm": 2.8978374004364014, | |
"learning_rate": 2.7215189873417722e-05, | |
"loss": 0.7135, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.625, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6770429611206055, | |
"eval_runtime": 4.541, | |
"eval_samples_per_second": 55.054, | |
"eval_steps_per_second": 1.762, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.6875, | |
"grad_norm": 3.3244338035583496, | |
"learning_rate": 2.689873417721519e-05, | |
"loss": 0.7157, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.6875, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6766347885131836, | |
"eval_runtime": 4.5403, | |
"eval_samples_per_second": 55.062, | |
"eval_steps_per_second": 1.762, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.75, | |
"grad_norm": 5.23004150390625, | |
"learning_rate": 2.6582278481012658e-05, | |
"loss": 0.7058, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.75, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6764668226242065, | |
"eval_runtime": 4.54, | |
"eval_samples_per_second": 55.066, | |
"eval_steps_per_second": 1.762, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.8125, | |
"grad_norm": 8.803872108459473, | |
"learning_rate": 2.626582278481013e-05, | |
"loss": 0.7127, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.8125, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6759433746337891, | |
"eval_runtime": 4.5454, | |
"eval_samples_per_second": 55.0, | |
"eval_steps_per_second": 1.76, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.875, | |
"grad_norm": 3.5992655754089355, | |
"learning_rate": 2.5949367088607597e-05, | |
"loss": 0.7004, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.875, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6762988567352295, | |
"eval_runtime": 4.4993, | |
"eval_samples_per_second": 55.565, | |
"eval_steps_per_second": 1.778, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.9375, | |
"grad_norm": 3.1371684074401855, | |
"learning_rate": 2.5632911392405062e-05, | |
"loss": 0.6827, | |
"step": 79 | |
}, | |
{ | |
"epoch": 4.9375, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6757890582084656, | |
"eval_runtime": 4.5476, | |
"eval_samples_per_second": 54.974, | |
"eval_steps_per_second": 1.759, | |
"step": 79 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 3.854306697845459, | |
"learning_rate": 2.5316455696202533e-05, | |
"loss": 0.7649, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6760488152503967, | |
"eval_runtime": 4.539, | |
"eval_samples_per_second": 55.079, | |
"eval_steps_per_second": 1.763, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0625, | |
"grad_norm": 4.356711387634277, | |
"learning_rate": 2.5e-05, | |
"loss": 0.7461, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.0625, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6765702962875366, | |
"eval_runtime": 4.4883, | |
"eval_samples_per_second": 55.701, | |
"eval_steps_per_second": 1.782, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.125, | |
"grad_norm": 4.030115127563477, | |
"learning_rate": 2.468354430379747e-05, | |
"loss": 0.6346, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.125, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6769394278526306, | |
"eval_runtime": 4.5387, | |
"eval_samples_per_second": 55.082, | |
"eval_steps_per_second": 1.763, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.1875, | |
"grad_norm": 3.892704486846924, | |
"learning_rate": 2.4367088607594937e-05, | |
"loss": 0.6245, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.1875, | |
"eval_accuracy": 0.548, | |
"eval_loss": 0.6764355301856995, | |
"eval_runtime": 4.548, | |
"eval_samples_per_second": 54.969, | |
"eval_steps_per_second": 1.759, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.25, | |
"grad_norm": 2.755213975906372, | |
"learning_rate": 2.4050632911392405e-05, | |
"loss": 0.6595, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.25, | |
"eval_accuracy": 0.548, | |
"eval_loss": 0.6767304539680481, | |
"eval_runtime": 4.5445, | |
"eval_samples_per_second": 55.012, | |
"eval_steps_per_second": 1.76, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.3125, | |
"grad_norm": 9.109251976013184, | |
"learning_rate": 2.3734177215189873e-05, | |
"loss": 0.6507, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.3125, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6769980192184448, | |
"eval_runtime": 4.4987, | |
"eval_samples_per_second": 55.572, | |
"eval_steps_per_second": 1.778, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.375, | |
"grad_norm": 4.487890720367432, | |
"learning_rate": 2.341772151898734e-05, | |
"loss": 0.6528, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.375, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6765019297599792, | |
"eval_runtime": 4.544, | |
"eval_samples_per_second": 55.017, | |
"eval_steps_per_second": 1.761, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.4375, | |
"grad_norm": 2.2593257427215576, | |
"learning_rate": 2.3101265822784813e-05, | |
"loss": 0.687, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.4375, | |
"eval_accuracy": 0.564, | |
"eval_loss": 0.6773359179496765, | |
"eval_runtime": 4.5397, | |
"eval_samples_per_second": 55.07, | |
"eval_steps_per_second": 1.762, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.5, | |
"grad_norm": 9.76685905456543, | |
"learning_rate": 2.278481012658228e-05, | |
"loss": 0.6913, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6779413819313049, | |
"eval_runtime": 4.5446, | |
"eval_samples_per_second": 55.01, | |
"eval_steps_per_second": 1.76, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5625, | |
"grad_norm": 1.9855612516403198, | |
"learning_rate": 2.246835443037975e-05, | |
"loss": 0.6799, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.5625, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6777753829956055, | |
"eval_runtime": 4.5453, | |
"eval_samples_per_second": 55.002, | |
"eval_steps_per_second": 1.76, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.625, | |
"grad_norm": 6.978314399719238, | |
"learning_rate": 2.2151898734177217e-05, | |
"loss": 0.6616, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.625, | |
"eval_accuracy": 0.568, | |
"eval_loss": 0.6782050728797913, | |
"eval_runtime": 4.5005, | |
"eval_samples_per_second": 55.549, | |
"eval_steps_per_second": 1.778, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.6875, | |
"grad_norm": 2.3891565799713135, | |
"learning_rate": 2.1835443037974685e-05, | |
"loss": 0.6577, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.6875, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6784765720367432, | |
"eval_runtime": 4.5406, | |
"eval_samples_per_second": 55.059, | |
"eval_steps_per_second": 1.762, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.75, | |
"grad_norm": 4.9778313636779785, | |
"learning_rate": 2.1518987341772153e-05, | |
"loss": 0.6248, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.75, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.678955078125, | |
"eval_runtime": 4.5404, | |
"eval_samples_per_second": 55.062, | |
"eval_steps_per_second": 1.762, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.8125, | |
"grad_norm": 1.9475889205932617, | |
"learning_rate": 2.120253164556962e-05, | |
"loss": 0.7026, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.8125, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6784570217132568, | |
"eval_runtime": 4.5434, | |
"eval_samples_per_second": 55.025, | |
"eval_steps_per_second": 1.761, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.875, | |
"grad_norm": 6.539444923400879, | |
"learning_rate": 2.088607594936709e-05, | |
"loss": 0.6816, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.875, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6789179444313049, | |
"eval_runtime": 4.5418, | |
"eval_samples_per_second": 55.044, | |
"eval_steps_per_second": 1.761, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.9375, | |
"grad_norm": 1.8745115995407104, | |
"learning_rate": 2.056962025316456e-05, | |
"loss": 0.6476, | |
"step": 95 | |
}, | |
{ | |
"epoch": 5.9375, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6787148714065552, | |
"eval_runtime": 4.5397, | |
"eval_samples_per_second": 55.069, | |
"eval_steps_per_second": 1.762, | |
"step": 95 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 7.960897922515869, | |
"learning_rate": 2.0253164556962025e-05, | |
"loss": 0.6797, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6785527467727661, | |
"eval_runtime": 4.5395, | |
"eval_samples_per_second": 55.072, | |
"eval_steps_per_second": 1.762, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0625, | |
"grad_norm": 6.119703769683838, | |
"learning_rate": 1.9936708860759496e-05, | |
"loss": 0.6603, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.0625, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6781836152076721, | |
"eval_runtime": 4.5394, | |
"eval_samples_per_second": 55.073, | |
"eval_steps_per_second": 1.762, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.125, | |
"grad_norm": 2.6292548179626465, | |
"learning_rate": 1.962025316455696e-05, | |
"loss": 0.6892, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.125, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6773242354393005, | |
"eval_runtime": 4.5436, | |
"eval_samples_per_second": 55.022, | |
"eval_steps_per_second": 1.761, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.1875, | |
"grad_norm": 5.301840305328369, | |
"learning_rate": 1.9303797468354432e-05, | |
"loss": 0.677, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.1875, | |
"eval_accuracy": 0.548, | |
"eval_loss": 0.6762461066246033, | |
"eval_runtime": 4.5432, | |
"eval_samples_per_second": 55.027, | |
"eval_steps_per_second": 1.761, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.25, | |
"grad_norm": 3.4270968437194824, | |
"learning_rate": 1.89873417721519e-05, | |
"loss": 0.6696, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.25, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6752324104309082, | |
"eval_runtime": 4.5411, | |
"eval_samples_per_second": 55.052, | |
"eval_steps_per_second": 1.762, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.3125, | |
"grad_norm": 2.9809482097625732, | |
"learning_rate": 1.8670886075949368e-05, | |
"loss": 0.666, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.3125, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6741093993186951, | |
"eval_runtime": 4.5435, | |
"eval_samples_per_second": 55.024, | |
"eval_steps_per_second": 1.761, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.375, | |
"grad_norm": 3.612354278564453, | |
"learning_rate": 1.8354430379746836e-05, | |
"loss": 0.6552, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.375, | |
"eval_accuracy": 0.564, | |
"eval_loss": 0.6736387014389038, | |
"eval_runtime": 4.5443, | |
"eval_samples_per_second": 55.014, | |
"eval_steps_per_second": 1.76, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.4375, | |
"grad_norm": 13.848094940185547, | |
"learning_rate": 1.8037974683544304e-05, | |
"loss": 0.6958, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.4375, | |
"eval_accuracy": 0.564, | |
"eval_loss": 0.6730585694313049, | |
"eval_runtime": 4.537, | |
"eval_samples_per_second": 55.102, | |
"eval_steps_per_second": 1.763, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.5, | |
"grad_norm": 2.657895565032959, | |
"learning_rate": 1.7721518987341772e-05, | |
"loss": 0.6779, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5, | |
"eval_accuracy": 0.576, | |
"eval_loss": 0.6721835732460022, | |
"eval_runtime": 4.5416, | |
"eval_samples_per_second": 55.047, | |
"eval_steps_per_second": 1.762, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5625, | |
"grad_norm": 3.6230475902557373, | |
"learning_rate": 1.7405063291139243e-05, | |
"loss": 0.662, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.5625, | |
"eval_accuracy": 0.576, | |
"eval_loss": 0.6725234389305115, | |
"eval_runtime": 4.4966, | |
"eval_samples_per_second": 55.598, | |
"eval_steps_per_second": 1.779, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.625, | |
"grad_norm": 2.817807674407959, | |
"learning_rate": 1.7088607594936708e-05, | |
"loss": 0.639, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.625, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6714980602264404, | |
"eval_runtime": 4.4976, | |
"eval_samples_per_second": 55.585, | |
"eval_steps_per_second": 1.779, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.6875, | |
"grad_norm": 2.2491910457611084, | |
"learning_rate": 1.677215189873418e-05, | |
"loss": 0.6469, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.6875, | |
"eval_accuracy": 0.564, | |
"eval_loss": 0.6703847646713257, | |
"eval_runtime": 4.5015, | |
"eval_samples_per_second": 55.537, | |
"eval_steps_per_second": 1.777, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.75, | |
"grad_norm": 6.607123851776123, | |
"learning_rate": 1.6455696202531644e-05, | |
"loss": 0.6494, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.75, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6705585718154907, | |
"eval_runtime": 4.5512, | |
"eval_samples_per_second": 54.931, | |
"eval_steps_per_second": 1.758, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.8125, | |
"grad_norm": 3.7436728477478027, | |
"learning_rate": 1.6139240506329115e-05, | |
"loss": 0.6428, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.8125, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6696659922599792, | |
"eval_runtime": 4.5466, | |
"eval_samples_per_second": 54.986, | |
"eval_steps_per_second": 1.76, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.875, | |
"grad_norm": 10.663908004760742, | |
"learning_rate": 1.5822784810126583e-05, | |
"loss": 0.6949, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.875, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6699844002723694, | |
"eval_runtime": 4.5417, | |
"eval_samples_per_second": 55.046, | |
"eval_steps_per_second": 1.761, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.9375, | |
"grad_norm": 2.8781378269195557, | |
"learning_rate": 1.550632911392405e-05, | |
"loss": 0.6557, | |
"step": 111 | |
}, | |
{ | |
"epoch": 6.9375, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6697744131088257, | |
"eval_runtime": 4.5421, | |
"eval_samples_per_second": 55.041, | |
"eval_steps_per_second": 1.761, | |
"step": 111 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 9.62548828125, | |
"learning_rate": 1.5189873417721521e-05, | |
"loss": 0.625, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6696327924728394, | |
"eval_runtime": 4.5434, | |
"eval_samples_per_second": 55.025, | |
"eval_steps_per_second": 1.761, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0625, | |
"grad_norm": 3.4376492500305176, | |
"learning_rate": 1.4873417721518987e-05, | |
"loss": 0.6648, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.0625, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6688730716705322, | |
"eval_runtime": 4.5489, | |
"eval_samples_per_second": 54.958, | |
"eval_steps_per_second": 1.759, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.125, | |
"grad_norm": 11.591545104980469, | |
"learning_rate": 1.4556962025316457e-05, | |
"loss": 0.6909, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.125, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6680371165275574, | |
"eval_runtime": 4.5477, | |
"eval_samples_per_second": 54.973, | |
"eval_steps_per_second": 1.759, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.1875, | |
"grad_norm": 3.0911552906036377, | |
"learning_rate": 1.4240506329113925e-05, | |
"loss": 0.6548, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.1875, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.667611300945282, | |
"eval_runtime": 4.5404, | |
"eval_samples_per_second": 55.062, | |
"eval_steps_per_second": 1.762, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.25, | |
"grad_norm": 5.890276908874512, | |
"learning_rate": 1.3924050632911393e-05, | |
"loss": 0.6278, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.25, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6670957207679749, | |
"eval_runtime": 4.5443, | |
"eval_samples_per_second": 55.014, | |
"eval_steps_per_second": 1.76, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.3125, | |
"grad_norm": 2.038860321044922, | |
"learning_rate": 1.3607594936708861e-05, | |
"loss": 0.6899, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.3125, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6669042706489563, | |
"eval_runtime": 4.5442, | |
"eval_samples_per_second": 55.015, | |
"eval_steps_per_second": 1.76, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.375, | |
"grad_norm": 7.413594722747803, | |
"learning_rate": 1.3291139240506329e-05, | |
"loss": 0.6197, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.375, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6667382717132568, | |
"eval_runtime": 4.5391, | |
"eval_samples_per_second": 55.077, | |
"eval_steps_per_second": 1.762, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.4375, | |
"grad_norm": 3.1535215377807617, | |
"learning_rate": 1.2974683544303799e-05, | |
"loss": 0.653, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.4375, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.666509747505188, | |
"eval_runtime": 4.5431, | |
"eval_samples_per_second": 55.028, | |
"eval_steps_per_second": 1.761, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.5, | |
"grad_norm": 5.736833095550537, | |
"learning_rate": 1.2658227848101267e-05, | |
"loss": 0.6531, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6669736504554749, | |
"eval_runtime": 4.5365, | |
"eval_samples_per_second": 55.108, | |
"eval_steps_per_second": 1.763, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5625, | |
"grad_norm": 3.403089761734009, | |
"learning_rate": 1.2341772151898735e-05, | |
"loss": 0.6494, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.5625, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6666631102561951, | |
"eval_runtime": 4.5001, | |
"eval_samples_per_second": 55.555, | |
"eval_steps_per_second": 1.778, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.625, | |
"grad_norm": 2.2943952083587646, | |
"learning_rate": 1.2025316455696203e-05, | |
"loss": 0.6914, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.625, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6671044826507568, | |
"eval_runtime": 4.5495, | |
"eval_samples_per_second": 54.952, | |
"eval_steps_per_second": 1.758, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.6875, | |
"grad_norm": 1.8052605390548706, | |
"learning_rate": 1.170886075949367e-05, | |
"loss": 0.6506, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.6875, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6672109365463257, | |
"eval_runtime": 4.5468, | |
"eval_samples_per_second": 54.984, | |
"eval_steps_per_second": 1.759, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.75, | |
"grad_norm": 2.0512139797210693, | |
"learning_rate": 1.139240506329114e-05, | |
"loss": 0.6647, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.75, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6669785380363464, | |
"eval_runtime": 4.5423, | |
"eval_samples_per_second": 55.039, | |
"eval_steps_per_second": 1.761, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.8125, | |
"grad_norm": 9.648463249206543, | |
"learning_rate": 1.1075949367088608e-05, | |
"loss": 0.6476, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.8125, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6669345498085022, | |
"eval_runtime": 4.5447, | |
"eval_samples_per_second": 55.009, | |
"eval_steps_per_second": 1.76, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.875, | |
"grad_norm": 3.750437021255493, | |
"learning_rate": 1.0759493670886076e-05, | |
"loss": 0.6609, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.875, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6669287085533142, | |
"eval_runtime": 4.5377, | |
"eval_samples_per_second": 55.094, | |
"eval_steps_per_second": 1.763, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.9375, | |
"grad_norm": 2.9882094860076904, | |
"learning_rate": 1.0443037974683544e-05, | |
"loss": 0.6497, | |
"step": 127 | |
}, | |
{ | |
"epoch": 7.9375, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6663134694099426, | |
"eval_runtime": 4.5412, | |
"eval_samples_per_second": 55.052, | |
"eval_steps_per_second": 1.762, | |
"step": 127 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 5.13292932510376, | |
"learning_rate": 1.0126582278481012e-05, | |
"loss": 0.6773, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6660781502723694, | |
"eval_runtime": 4.4907, | |
"eval_samples_per_second": 55.671, | |
"eval_steps_per_second": 1.781, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0625, | |
"grad_norm": 4.037117958068848, | |
"learning_rate": 9.81012658227848e-06, | |
"loss": 0.6841, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.0625, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6660195589065552, | |
"eval_runtime": 4.4945, | |
"eval_samples_per_second": 55.623, | |
"eval_steps_per_second": 1.78, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.125, | |
"grad_norm": 9.9661865234375, | |
"learning_rate": 9.49367088607595e-06, | |
"loss": 0.657, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.125, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6656200885772705, | |
"eval_runtime": 4.5383, | |
"eval_samples_per_second": 55.086, | |
"eval_steps_per_second": 1.763, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.1875, | |
"grad_norm": 8.460039138793945, | |
"learning_rate": 9.177215189873418e-06, | |
"loss": 0.6622, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.1875, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6657363176345825, | |
"eval_runtime": 4.549, | |
"eval_samples_per_second": 54.957, | |
"eval_steps_per_second": 1.759, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.25, | |
"grad_norm": 2.761270046234131, | |
"learning_rate": 8.860759493670886e-06, | |
"loss": 0.667, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.25, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.665112316608429, | |
"eval_runtime": 4.5498, | |
"eval_samples_per_second": 54.947, | |
"eval_steps_per_second": 1.758, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.3125, | |
"grad_norm": 4.367539405822754, | |
"learning_rate": 8.544303797468354e-06, | |
"loss": 0.6662, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.3125, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6654492020606995, | |
"eval_runtime": 4.5418, | |
"eval_samples_per_second": 55.044, | |
"eval_steps_per_second": 1.761, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.375, | |
"grad_norm": 3.8258039951324463, | |
"learning_rate": 8.227848101265822e-06, | |
"loss": 0.615, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.375, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6657968759536743, | |
"eval_runtime": 4.5412, | |
"eval_samples_per_second": 55.051, | |
"eval_steps_per_second": 1.762, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.4375, | |
"grad_norm": 2.691741466522217, | |
"learning_rate": 7.911392405063292e-06, | |
"loss": 0.6961, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.4375, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6653828024864197, | |
"eval_runtime": 4.5434, | |
"eval_samples_per_second": 55.025, | |
"eval_steps_per_second": 1.761, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.5, | |
"grad_norm": 5.671183109283447, | |
"learning_rate": 7.5949367088607605e-06, | |
"loss": 0.6134, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6660419702529907, | |
"eval_runtime": 4.5413, | |
"eval_samples_per_second": 55.05, | |
"eval_steps_per_second": 1.762, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5625, | |
"grad_norm": 7.398742198944092, | |
"learning_rate": 7.2784810126582285e-06, | |
"loss": 0.6839, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.5625, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6657724380493164, | |
"eval_runtime": 4.5404, | |
"eval_samples_per_second": 55.061, | |
"eval_steps_per_second": 1.762, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.625, | |
"grad_norm": 4.798144340515137, | |
"learning_rate": 6.9620253164556965e-06, | |
"loss": 0.6482, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.625, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.666140615940094, | |
"eval_runtime": 4.5422, | |
"eval_samples_per_second": 55.039, | |
"eval_steps_per_second": 1.761, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.6875, | |
"grad_norm": 8.25437068939209, | |
"learning_rate": 6.6455696202531645e-06, | |
"loss": 0.6635, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.6875, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.6665273308753967, | |
"eval_runtime": 4.5468, | |
"eval_samples_per_second": 54.984, | |
"eval_steps_per_second": 1.759, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.75, | |
"grad_norm": 2.5796449184417725, | |
"learning_rate": 6.329113924050633e-06, | |
"loss": 0.6229, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.75, | |
"eval_accuracy": 0.608, | |
"eval_loss": 0.6665956974029541, | |
"eval_runtime": 4.5394, | |
"eval_samples_per_second": 55.073, | |
"eval_steps_per_second": 1.762, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.8125, | |
"grad_norm": 2.3988282680511475, | |
"learning_rate": 6.012658227848101e-06, | |
"loss": 0.6205, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.8125, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.6665576100349426, | |
"eval_runtime": 4.5397, | |
"eval_samples_per_second": 55.07, | |
"eval_steps_per_second": 1.762, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.875, | |
"grad_norm": 3.2234578132629395, | |
"learning_rate": 5.69620253164557e-06, | |
"loss": 0.6347, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.875, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6664531230926514, | |
"eval_runtime": 4.4932, | |
"eval_samples_per_second": 55.64, | |
"eval_steps_per_second": 1.78, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.9375, | |
"grad_norm": 3.1038153171539307, | |
"learning_rate": 5.379746835443038e-06, | |
"loss": 0.6868, | |
"step": 143 | |
}, | |
{ | |
"epoch": 8.9375, | |
"eval_accuracy": 0.608, | |
"eval_loss": 0.6668280959129333, | |
"eval_runtime": 4.5461, | |
"eval_samples_per_second": 54.993, | |
"eval_steps_per_second": 1.76, | |
"step": 143 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 5.682613849639893, | |
"learning_rate": 5.063291139240506e-06, | |
"loss": 0.6447, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.6665273308753967, | |
"eval_runtime": 4.5397, | |
"eval_samples_per_second": 55.069, | |
"eval_steps_per_second": 1.762, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0625, | |
"grad_norm": 8.149535179138184, | |
"learning_rate": 4.746835443037975e-06, | |
"loss": 0.6755, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.0625, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6669501662254333, | |
"eval_runtime": 4.497, | |
"eval_samples_per_second": 55.592, | |
"eval_steps_per_second": 1.779, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.125, | |
"grad_norm": 3.2166755199432373, | |
"learning_rate": 4.430379746835443e-06, | |
"loss": 0.6749, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.125, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.667477548122406, | |
"eval_runtime": 4.549, | |
"eval_samples_per_second": 54.957, | |
"eval_steps_per_second": 1.759, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.1875, | |
"grad_norm": 2.9138267040252686, | |
"learning_rate": 4.113924050632911e-06, | |
"loss": 0.6681, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.1875, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6679531335830688, | |
"eval_runtime": 4.5432, | |
"eval_samples_per_second": 55.027, | |
"eval_steps_per_second": 1.761, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.25, | |
"grad_norm": 8.955977439880371, | |
"learning_rate": 3.7974683544303802e-06, | |
"loss": 0.6768, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.25, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.667892575263977, | |
"eval_runtime": 4.5414, | |
"eval_samples_per_second": 55.049, | |
"eval_steps_per_second": 1.762, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.3125, | |
"grad_norm": 4.039650917053223, | |
"learning_rate": 3.4810126582278482e-06, | |
"loss": 0.6291, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.3125, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6680244207382202, | |
"eval_runtime": 4.5436, | |
"eval_samples_per_second": 55.022, | |
"eval_steps_per_second": 1.761, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.375, | |
"grad_norm": 3.648364543914795, | |
"learning_rate": 3.1645569620253167e-06, | |
"loss": 0.6857, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.375, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6680644750595093, | |
"eval_runtime": 4.5475, | |
"eval_samples_per_second": 54.975, | |
"eval_steps_per_second": 1.759, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.4375, | |
"grad_norm": 2.3928475379943848, | |
"learning_rate": 2.848101265822785e-06, | |
"loss": 0.6454, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.4375, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.6678046584129333, | |
"eval_runtime": 4.5472, | |
"eval_samples_per_second": 54.979, | |
"eval_steps_per_second": 1.759, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.5, | |
"grad_norm": 1.8685684204101562, | |
"learning_rate": 2.531645569620253e-06, | |
"loss": 0.648, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6684179902076721, | |
"eval_runtime": 4.5422, | |
"eval_samples_per_second": 55.04, | |
"eval_steps_per_second": 1.761, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5625, | |
"grad_norm": 6.94075345993042, | |
"learning_rate": 2.2151898734177215e-06, | |
"loss": 0.5989, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.5625, | |
"eval_accuracy": 0.604, | |
"eval_loss": 0.6686621308326721, | |
"eval_runtime": 4.549, | |
"eval_samples_per_second": 54.957, | |
"eval_steps_per_second": 1.759, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.625, | |
"grad_norm": 2.2033395767211914, | |
"learning_rate": 1.8987341772151901e-06, | |
"loss": 0.6334, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.625, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6692119240760803, | |
"eval_runtime": 4.5415, | |
"eval_samples_per_second": 55.048, | |
"eval_steps_per_second": 1.762, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.6875, | |
"grad_norm": 4.624488353729248, | |
"learning_rate": 1.5822784810126583e-06, | |
"loss": 0.6086, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.6875, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6682060360908508, | |
"eval_runtime": 4.5376, | |
"eval_samples_per_second": 55.095, | |
"eval_steps_per_second": 1.763, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.75, | |
"grad_norm": 8.24832820892334, | |
"learning_rate": 1.2658227848101265e-06, | |
"loss": 0.6355, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.75, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6689130663871765, | |
"eval_runtime": 4.5365, | |
"eval_samples_per_second": 55.109, | |
"eval_steps_per_second": 1.763, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.8125, | |
"grad_norm": 1.9968777894973755, | |
"learning_rate": 9.493670886075951e-07, | |
"loss": 0.618, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.8125, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6689111590385437, | |
"eval_runtime": 4.547, | |
"eval_samples_per_second": 54.981, | |
"eval_steps_per_second": 1.759, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.875, | |
"grad_norm": 2.4490880966186523, | |
"learning_rate": 6.329113924050633e-07, | |
"loss": 0.6603, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.875, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.6684619188308716, | |
"eval_runtime": 4.5445, | |
"eval_samples_per_second": 55.012, | |
"eval_steps_per_second": 1.76, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.9375, | |
"grad_norm": 5.009583950042725, | |
"learning_rate": 3.1645569620253163e-07, | |
"loss": 0.6585, | |
"step": 159 | |
}, | |
{ | |
"epoch": 9.9375, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6681816577911377, | |
"eval_runtime": 4.5393, | |
"eval_samples_per_second": 55.075, | |
"eval_steps_per_second": 1.762, | |
"step": 159 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 3.7535176277160645, | |
"learning_rate": 0.0, | |
"loss": 0.6705, | |
"step": 160 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6681679487228394, | |
"eval_runtime": 4.4878, | |
"eval_samples_per_second": 55.706, | |
"eval_steps_per_second": 1.783, | |
"step": 160 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 160, | |
"total_flos": 174253428178944.0, | |
"train_loss": 0.6897118806838989, | |
"train_runtime": 1349.6743, | |
"train_samples_per_second": 7.409, | |
"train_steps_per_second": 0.119 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 160, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 174253428178944.0, | |
"train_batch_size": 8, | |
"trial_name": null, | |
"trial_params": null | |
} | |