adapters-gemma-bf16-QLORA-super_glue-axg/trainer_state-gemma-bf16-QLORA-super_glue-axg-sequence_classification.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 1,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 330.0,
      "learning_rate": 2.5e-05,
      "loss": 2.5909,
      "step": 1
    },
    {
      "epoch": 0.1111111111111111,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 3.1164491176605225,
      "eval_runtime": 0.5321,
      "eval_samples_per_second": 135.313,
      "eval_steps_per_second": 9.397,
      "step": 1
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 336.0,
      "learning_rate": 5e-05,
      "loss": 3.2519,
      "step": 2
    },
    {
      "epoch": 0.2222222222222222,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 2.3881568908691406,
      "eval_runtime": 0.6264,
      "eval_samples_per_second": 114.934,
      "eval_steps_per_second": 7.982,
      "step": 2
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 316.0,
      "learning_rate": 4.943181818181818e-05,
      "loss": 2.2592,
      "step": 3
    },
    {
      "epoch": 0.3333333333333333,
      "eval_accuracy": 0.4583333333333333,
      "eval_loss": 1.807464838027954,
      "eval_runtime": 0.6245,
      "eval_samples_per_second": 115.289,
      "eval_steps_per_second": 8.006,
      "step": 3
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 201.0,
      "learning_rate": 4.886363636363637e-05,
      "loss": 1.745,
      "step": 4
    },
    {
      "epoch": 0.4444444444444444,
      "eval_accuracy": 0.4444444444444444,
      "eval_loss": 2.4170877933502197,
      "eval_runtime": 0.6253,
      "eval_samples_per_second": 115.149,
      "eval_steps_per_second": 7.996,
      "step": 4
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 237.0,
      "learning_rate": 4.829545454545455e-05,
      "loss": 2.4192,
      "step": 5
    },
    {
      "epoch": 0.5555555555555556,
      "eval_accuracy": 0.4861111111111111,
      "eval_loss": 1.8398630619049072,
      "eval_runtime": 0.6243,
      "eval_samples_per_second": 115.327,
      "eval_steps_per_second": 8.009,
      "step": 5
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 290.0,
      "learning_rate": 4.772727272727273e-05,
      "loss": 2.2182,
      "step": 6
    },
    {
      "epoch": 0.6666666666666666,
      "eval_accuracy": 0.5694444444444444,
      "eval_loss": 1.299572467803955,
      "eval_runtime": 0.5773,
      "eval_samples_per_second": 124.719,
      "eval_steps_per_second": 8.661,
      "step": 6
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 247.0,
      "learning_rate": 4.715909090909091e-05,
      "loss": 1.5298,
      "step": 7
    },
    {
      "epoch": 0.7777777777777778,
      "eval_accuracy": 0.5138888888888888,
      "eval_loss": 1.4382593631744385,
      "eval_runtime": 0.6238,
      "eval_samples_per_second": 115.42,
      "eval_steps_per_second": 8.015,
      "step": 7
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 61.75,
      "learning_rate": 4.659090909090909e-05,
      "loss": 0.8786,
      "step": 8
    },
    {
      "epoch": 0.8888888888888888,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 1.6289072036743164,
      "eval_runtime": 0.6268,
      "eval_samples_per_second": 114.863,
      "eval_steps_per_second": 7.977,
      "step": 8
    },
    {
      "epoch": 1.0,
      "grad_norm": 156.0,
      "learning_rate": 4.602272727272727e-05,
      "loss": 0.8227,
      "step": 9
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.5416666666666666,
      "eval_loss": 1.5709539651870728,
      "eval_runtime": 0.6278,
      "eval_samples_per_second": 114.692,
      "eval_steps_per_second": 7.965,
      "step": 9
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 34.0,
      "learning_rate": 4.545454545454546e-05,
      "loss": 0.4666,
      "step": 10
    },
    {
      "epoch": 1.1111111111111112,
      "eval_accuracy": 0.5277777777777778,
      "eval_loss": 1.4421333074569702,
      "eval_runtime": 0.5742,
      "eval_samples_per_second": 125.392,
      "eval_steps_per_second": 8.708,
      "step": 10
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 44.5,
      "learning_rate": 4.488636363636364e-05,
      "loss": 0.5033,
      "step": 11
    },
    {
      "epoch": 1.2222222222222223,
      "eval_accuracy": 0.4861111111111111,
      "eval_loss": 1.321220874786377,
      "eval_runtime": 0.6247,
      "eval_samples_per_second": 115.26,
      "eval_steps_per_second": 8.004,
      "step": 11
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 17.75,
      "learning_rate": 4.431818181818182e-05,
      "loss": 0.3354,
      "step": 12
    },
    {
      "epoch": 1.3333333333333333,
      "eval_accuracy": 0.4305555555555556,
      "eval_loss": 1.329500675201416,
      "eval_runtime": 0.6271,
      "eval_samples_per_second": 114.814,
      "eval_steps_per_second": 7.973,
      "step": 12
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 17.5,
      "learning_rate": 4.375e-05,
      "loss": 0.2448,
      "step": 13
    },
    {
      "epoch": 1.4444444444444444,
      "eval_accuracy": 0.3888888888888889,
      "eval_loss": 1.3822489976882935,
      "eval_runtime": 0.6265,
      "eval_samples_per_second": 114.93,
      "eval_steps_per_second": 7.981,
      "step": 13
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 29.0,
      "learning_rate": 4.318181818181819e-05,
      "loss": 0.7798,
      "step": 14
    },
    {
      "epoch": 1.5555555555555556,
      "eval_accuracy": 0.4305555555555556,
      "eval_loss": 1.4287919998168945,
      "eval_runtime": 0.6272,
      "eval_samples_per_second": 114.8,
      "eval_steps_per_second": 7.972,
      "step": 14
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 66.0,
      "learning_rate": 4.261363636363637e-05,
      "loss": 0.4917,
      "step": 15
    },
    {
      "epoch": 1.6666666666666665,
      "eval_accuracy": 0.4722222222222222,
      "eval_loss": 1.5045852661132812,
      "eval_runtime": 0.6244,
      "eval_samples_per_second": 115.305,
      "eval_steps_per_second": 8.007,
      "step": 15
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 36.5,
      "learning_rate": 4.204545454545455e-05,
      "loss": 0.2832,
      "step": 16
    },
    {
      "epoch": 1.7777777777777777,
      "eval_accuracy": 0.5416666666666666,
      "eval_loss": 1.6484360694885254,
      "eval_runtime": 0.6251,
      "eval_samples_per_second": 115.18,
      "eval_steps_per_second": 7.999,
      "step": 16
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 18.5,
      "learning_rate": 4.1477272727272734e-05,
      "loss": 0.1876,
      "step": 17
    },
    {
      "epoch": 1.8888888888888888,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 1.70353102684021,
      "eval_runtime": 0.6234,
      "eval_samples_per_second": 115.489,
      "eval_steps_per_second": 8.02,
      "step": 17
    },
    {
      "epoch": 2.0,
      "grad_norm": 70.0,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 0.3145,
      "step": 18
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 1.67561936378479,
      "eval_runtime": 0.6238,
      "eval_samples_per_second": 115.418,
      "eval_steps_per_second": 8.015,
      "step": 18
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 26.625,
      "learning_rate": 4.034090909090909e-05,
      "loss": 0.1101,
      "step": 19
    },
    {
      "epoch": 2.111111111111111,
      "eval_accuracy": 0.4722222222222222,
      "eval_loss": 1.6205506324768066,
      "eval_runtime": 0.6263,
      "eval_samples_per_second": 114.968,
      "eval_steps_per_second": 7.984,
      "step": 19
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 5.625,
      "learning_rate": 3.9772727272727275e-05,
      "loss": 0.06,
      "step": 20
    },
    {
      "epoch": 2.2222222222222223,
      "eval_accuracy": 0.4583333333333333,
      "eval_loss": 1.606693148612976,
      "eval_runtime": 0.6242,
      "eval_samples_per_second": 115.355,
      "eval_steps_per_second": 8.011,
      "step": 20
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 13.375,
      "learning_rate": 3.9204545454545456e-05,
      "loss": 0.1102,
      "step": 21
    },
    {
      "epoch": 2.3333333333333335,
      "eval_accuracy": 0.4861111111111111,
      "eval_loss": 1.5763201713562012,
      "eval_runtime": 0.6243,
      "eval_samples_per_second": 115.323,
      "eval_steps_per_second": 8.009,
      "step": 21
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 11.875,
      "learning_rate": 3.8636363636363636e-05,
      "loss": 0.0756,
      "step": 22
    },
    {
      "epoch": 2.4444444444444446,
      "eval_accuracy": 0.5277777777777778,
      "eval_loss": 1.517237901687622,
      "eval_runtime": 0.6287,
      "eval_samples_per_second": 114.513,
      "eval_steps_per_second": 7.952,
      "step": 22
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 16.875,
      "learning_rate": 3.8068181818181816e-05,
      "loss": 0.0421,
      "step": 23
    },
    {
      "epoch": 2.5555555555555554,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 1.479959487915039,
      "eval_runtime": 0.6268,
      "eval_samples_per_second": 114.877,
      "eval_steps_per_second": 7.978,
      "step": 23
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 23.125,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.0681,
      "step": 24
    },
    {
      "epoch": 2.6666666666666665,
      "eval_accuracy": 0.5555555555555556,
      "eval_loss": 1.4677960872650146,
      "eval_runtime": 0.6261,
      "eval_samples_per_second": 114.994,
      "eval_steps_per_second": 7.986,
      "step": 24
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 6.21875,
      "learning_rate": 3.6931818181818184e-05,
      "loss": 0.0477,
      "step": 25
    },
    {
      "epoch": 2.7777777777777777,
      "eval_accuracy": 0.5694444444444444,
      "eval_loss": 1.4750635623931885,
      "eval_runtime": 0.6231,
      "eval_samples_per_second": 115.55,
      "eval_steps_per_second": 8.024,
      "step": 25
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 6.0,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.0144,
      "step": 26
    },
    {
      "epoch": 2.888888888888889,
      "eval_accuracy": 0.5972222222222222,
      "eval_loss": 1.4336278438568115,
      "eval_runtime": 0.6225,
      "eval_samples_per_second": 115.661,
      "eval_steps_per_second": 8.032,
      "step": 26
    },
    {
      "epoch": 3.0,
      "grad_norm": 23.375,
      "learning_rate": 3.579545454545455e-05,
      "loss": 0.0773,
      "step": 27
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5972222222222222,
      "eval_loss": 1.3547403812408447,
      "eval_runtime": 0.6235,
      "eval_samples_per_second": 115.474,
      "eval_steps_per_second": 8.019,
      "step": 27
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 1.46875,
      "learning_rate": 3.522727272727273e-05,
      "loss": 0.004,
      "step": 28
    },
    {
      "epoch": 3.111111111111111,
      "eval_accuracy": 0.5972222222222222,
      "eval_loss": 1.2619075775146484,
      "eval_runtime": 0.6216,
      "eval_samples_per_second": 115.826,
      "eval_steps_per_second": 8.043,
      "step": 28
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 1.8671875,
      "learning_rate": 3.465909090909091e-05,
      "loss": 0.0073,
      "step": 29
    },
    {
      "epoch": 3.2222222222222223,
      "eval_accuracy": 0.5972222222222222,
      "eval_loss": 1.2116299867630005,
      "eval_runtime": 0.6311,
      "eval_samples_per_second": 114.086,
      "eval_steps_per_second": 7.923,
      "step": 29
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.40234375,
      "learning_rate": 3.409090909090909e-05,
      "loss": 0.0034,
      "step": 30
    },
    {
      "epoch": 3.3333333333333335,
      "eval_accuracy": 0.7222222222222222,
      "eval_loss": 1.2432918548583984,
      "eval_runtime": 0.6245,
      "eval_samples_per_second": 115.286,
      "eval_steps_per_second": 8.006,
      "step": 30
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 3.3125,
      "learning_rate": 3.352272727272727e-05,
      "loss": 0.0059,
      "step": 31
    },
    {
      "epoch": 3.4444444444444446,
      "eval_accuracy": 0.6944444444444444,
      "eval_loss": 1.2839678525924683,
      "eval_runtime": 0.6231,
      "eval_samples_per_second": 115.559,
      "eval_steps_per_second": 8.025,
      "step": 31
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 2.890625,
      "learning_rate": 3.295454545454545e-05,
      "loss": 0.0083,
      "step": 32
    },
    {
      "epoch": 3.5555555555555554,
      "eval_accuracy": 0.7222222222222222,
      "eval_loss": 1.2911839485168457,
      "eval_runtime": 0.6242,
      "eval_samples_per_second": 115.344,
      "eval_steps_per_second": 8.01,
      "step": 32
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.6796875,
      "learning_rate": 3.238636363636364e-05,
      "loss": 0.0012,
      "step": 33
    },
    {
      "epoch": 3.6666666666666665,
      "eval_accuracy": 0.7222222222222222,
      "eval_loss": 1.2517170906066895,
      "eval_runtime": 0.6243,
      "eval_samples_per_second": 115.335,
      "eval_steps_per_second": 8.009,
      "step": 33
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.86328125,
      "learning_rate": 3.181818181818182e-05,
      "loss": 0.0014,
      "step": 34
    },
    {
      "epoch": 3.7777777777777777,
      "eval_accuracy": 0.7222222222222222,
      "eval_loss": 1.1986007690429688,
      "eval_runtime": 0.5252,
      "eval_samples_per_second": 137.095,
      "eval_steps_per_second": 9.52,
      "step": 34
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.70703125,
      "learning_rate": 3.125e-05,
      "loss": 0.0013,
      "step": 35
    },
    {
      "epoch": 3.888888888888889,
      "eval_accuracy": 0.7361111111111112,
      "eval_loss": 1.1582382917404175,
      "eval_runtime": 0.6232,
      "eval_samples_per_second": 115.53,
      "eval_steps_per_second": 8.023,
      "step": 35
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.07568359375,
      "learning_rate": 3.068181818181818e-05,
      "loss": 0.0004,
      "step": 36
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7361111111111112,
      "eval_loss": 1.1583126783370972,
      "eval_runtime": 0.6233,
      "eval_samples_per_second": 115.513,
      "eval_steps_per_second": 8.022,
      "step": 36
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 0.06591796875,
      "learning_rate": 3.0113636363636365e-05,
      "loss": 0.0002,
      "step": 37
    },
    {
      "epoch": 4.111111111111111,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.1871634721755981,
      "eval_runtime": 0.6234,
      "eval_samples_per_second": 115.494,
      "eval_steps_per_second": 8.02,
      "step": 37
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.06201171875,
      "learning_rate": 2.954545454545455e-05,
      "loss": 0.0001,
      "step": 38
    },
    {
      "epoch": 4.222222222222222,
      "eval_accuracy": 0.7361111111111112,
      "eval_loss": 1.2237037420272827,
      "eval_runtime": 0.6228,
      "eval_samples_per_second": 115.606,
      "eval_steps_per_second": 8.028,
      "step": 38
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.0615234375,
      "learning_rate": 2.8977272727272732e-05,
      "loss": 0.0001,
      "step": 39
    },
    {
      "epoch": 4.333333333333333,
      "eval_accuracy": 0.75,
      "eval_loss": 1.2508012056350708,
      "eval_runtime": 0.623,
      "eval_samples_per_second": 115.574,
      "eval_steps_per_second": 8.026,
      "step": 39
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.734375,
      "learning_rate": 2.8409090909090912e-05,
      "loss": 0.0011,
      "step": 40
    },
    {
      "epoch": 4.444444444444445,
      "eval_accuracy": 0.7361111111111112,
      "eval_loss": 1.2626367807388306,
      "eval_runtime": 0.625,
      "eval_samples_per_second": 115.193,
      "eval_steps_per_second": 8.0,
      "step": 40
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.07177734375,
      "learning_rate": 2.784090909090909e-05,
      "loss": 0.0001,
      "step": 41
    },
    {
      "epoch": 4.555555555555555,
      "eval_accuracy": 0.7361111111111112,
      "eval_loss": 1.2784370183944702,
      "eval_runtime": 0.623,
      "eval_samples_per_second": 115.577,
      "eval_steps_per_second": 8.026,
      "step": 41
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.84765625,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.0006,
      "step": 42
    },
    {
      "epoch": 4.666666666666667,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.2676023244857788,
      "eval_runtime": 0.6247,
      "eval_samples_per_second": 115.264,
      "eval_steps_per_second": 8.004,
      "step": 42
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.00823974609375,
      "learning_rate": 2.6704545454545453e-05,
      "loss": 0.0,
      "step": 43
    },
    {
      "epoch": 4.777777777777778,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.3084845542907715,
      "eval_runtime": 0.6226,
      "eval_samples_per_second": 115.635,
      "eval_steps_per_second": 8.03,
      "step": 43
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.1455078125,
      "learning_rate": 2.6136363636363637e-05,
      "loss": 0.0002,
      "step": 44
    },
    {
      "epoch": 4.888888888888889,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.3400903940200806,
      "eval_runtime": 0.6218,
      "eval_samples_per_second": 115.794,
      "eval_steps_per_second": 8.041,
      "step": 44
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.006134033203125,
      "learning_rate": 2.5568181818181817e-05,
      "loss": 0.0,
      "step": 45
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.3613409996032715,
      "eval_runtime": 0.6243,
      "eval_samples_per_second": 115.324,
      "eval_steps_per_second": 8.009,
      "step": 45
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.0120849609375,
      "learning_rate": 2.5e-05,
      "loss": 0.0,
      "step": 46
    },
    {
      "epoch": 5.111111111111111,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.3976116180419922,
      "eval_runtime": 0.6228,
      "eval_samples_per_second": 115.607,
      "eval_steps_per_second": 8.028,
      "step": 46
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.0164794921875,
      "learning_rate": 2.4431818181818185e-05,
      "loss": 0.0,
      "step": 47
    },
    {
      "epoch": 5.222222222222222,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.418382167816162,
      "eval_runtime": 0.6217,
      "eval_samples_per_second": 115.803,
      "eval_steps_per_second": 8.042,
      "step": 47
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.0301513671875,
      "learning_rate": 2.3863636363636365e-05,
      "loss": 0.0001,
      "step": 48
    },
    {
      "epoch": 5.333333333333333,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.4322800636291504,
      "eval_runtime": 0.6241,
      "eval_samples_per_second": 115.372,
      "eval_steps_per_second": 8.012,
      "step": 48
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 0.00092315673828125,
      "learning_rate": 2.3295454545454546e-05,
      "loss": 0.0,
      "step": 49
    },
    {
      "epoch": 5.444444444444445,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.4491381645202637,
      "eval_runtime": 0.6242,
      "eval_samples_per_second": 115.342,
      "eval_steps_per_second": 8.01,
      "step": 49
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.000576019287109375,
      "learning_rate": 2.272727272727273e-05,
      "loss": 0.0,
      "step": 50
    },
    {
      "epoch": 5.555555555555555,
      "eval_accuracy": 0.7638888888888888,
      "eval_loss": 1.4777884483337402,
      "eval_runtime": 0.6242,
      "eval_samples_per_second": 115.346,
      "eval_steps_per_second": 8.01,
      "step": 50
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.0260009765625,
      "learning_rate": 2.215909090909091e-05,
      "loss": 0.0001,
      "step": 51
    },
    {
      "epoch": 5.666666666666667,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.4906997680664062,
      "eval_runtime": 0.6249,
      "eval_samples_per_second": 115.212,
      "eval_steps_per_second": 8.001,
      "step": 51
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.00079345703125,
      "learning_rate": 2.1590909090909093e-05,
      "loss": 0.0,
      "step": 52
    },
    {
      "epoch": 5.777777777777778,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.4938912391662598,
      "eval_runtime": 0.6254,
      "eval_samples_per_second": 115.131,
      "eval_steps_per_second": 7.995,
      "step": 52
    },
    {
      "epoch": 5.888888888888889,
      "grad_norm": 0.0172119140625,
      "learning_rate": 2.1022727272727274e-05,
      "loss": 0.0,
      "step": 53
    },
    {
      "epoch": 5.888888888888889,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.5104340314865112,
      "eval_runtime": 0.625,
      "eval_samples_per_second": 115.196,
      "eval_steps_per_second": 8.0,
      "step": 53
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.057861328125,
      "learning_rate": 2.0454545454545457e-05,
      "loss": 0.0001,
      "step": 54
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.5254335403442383,
      "eval_runtime": 0.625,
      "eval_samples_per_second": 115.196,
      "eval_steps_per_second": 8.0,
      "step": 54
    },
    {
      "epoch": 6.111111111111111,
      "grad_norm": 0.0032806396484375,
      "learning_rate": 1.9886363636363638e-05,
      "loss": 0.0,
      "step": 55
    },
    {
      "epoch": 6.111111111111111,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.542032241821289,
      "eval_runtime": 0.6326,
      "eval_samples_per_second": 113.813,
      "eval_steps_per_second": 7.904,
      "step": 55
    },
    {
      "epoch": 6.222222222222222,
      "grad_norm": 0.021728515625,
      "learning_rate": 1.9318181818181818e-05,
      "loss": 0.0,
      "step": 56
    },
    {
      "epoch": 6.222222222222222,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.5513436794281006,
      "eval_runtime": 0.6244,
      "eval_samples_per_second": 115.315,
      "eval_steps_per_second": 8.008,
      "step": 56
    },
    {
      "epoch": 6.333333333333333,
      "grad_norm": 0.00115203857421875,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.0,
      "step": 57
    },
    {
      "epoch": 6.333333333333333,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.5597673654556274,
      "eval_runtime": 0.6252,
      "eval_samples_per_second": 115.161,
      "eval_steps_per_second": 7.997,
      "step": 57
    },
    {
      "epoch": 6.444444444444445,
      "grad_norm": 0.029052734375,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.0,
      "step": 58
    },
    {
      "epoch": 6.444444444444445,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.575224757194519,
      "eval_runtime": 0.6248,
      "eval_samples_per_second": 115.228,
      "eval_steps_per_second": 8.002,
      "step": 58
    },
    {
      "epoch": 6.555555555555555,
      "grad_norm": 0.004058837890625,
      "learning_rate": 1.7613636363636366e-05,
      "loss": 0.0,
      "step": 59
    },
    {
      "epoch": 6.555555555555555,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.5835950374603271,
      "eval_runtime": 0.6244,
      "eval_samples_per_second": 115.306,
      "eval_steps_per_second": 8.007,
      "step": 59
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.0010528564453125,
      "learning_rate": 1.7045454545454546e-05,
      "loss": 0.0,
      "step": 60
    },
    {
      "epoch": 6.666666666666667,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.587098479270935,
      "eval_runtime": 0.645,
      "eval_samples_per_second": 111.621,
      "eval_steps_per_second": 7.751,
      "step": 60
    },
    {
      "epoch": 6.777777777777778,
      "grad_norm": 0.004058837890625,
      "learning_rate": 1.6477272727272726e-05,
      "loss": 0.0,
      "step": 61
    },
    {
      "epoch": 6.777777777777778,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.5953326225280762,
      "eval_runtime": 0.6226,
      "eval_samples_per_second": 115.648,
      "eval_steps_per_second": 8.031,
      "step": 61
    },
    {
      "epoch": 6.888888888888889,
      "grad_norm": 0.0030670166015625,
      "learning_rate": 1.590909090909091e-05,
      "loss": 0.0,
      "step": 62
    },
    {
      "epoch": 6.888888888888889,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.5967087745666504,
      "eval_runtime": 0.6253,
      "eval_samples_per_second": 115.146,
      "eval_steps_per_second": 7.996,
      "step": 62
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.002593994140625,
      "learning_rate": 1.534090909090909e-05,
      "loss": 0.0,
      "step": 63
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6085822582244873,
      "eval_runtime": 0.6253,
      "eval_samples_per_second": 115.153,
      "eval_steps_per_second": 7.997,
      "step": 63
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.00738525390625,
      "learning_rate": 1.4772727272727274e-05,
      "loss": 0.0,
      "step": 64
    },
    {
      "epoch": 7.111111111111111,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.597582459449768,
      "eval_runtime": 0.5737,
      "eval_samples_per_second": 125.507,
      "eval_steps_per_second": 8.716,
      "step": 64
    },
    {
      "epoch": 7.222222222222222,
      "grad_norm": 0.0018463134765625,
      "learning_rate": 1.4204545454545456e-05,
      "loss": 0.0,
      "step": 65
    },
    {
      "epoch": 7.222222222222222,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.600056529045105,
      "eval_runtime": 0.6217,
      "eval_samples_per_second": 115.809,
      "eval_steps_per_second": 8.042,
      "step": 65
    },
    {
      "epoch": 7.333333333333333,
      "grad_norm": 0.000949859619140625,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 0.0,
      "step": 66
    },
    {
      "epoch": 7.333333333333333,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6038472652435303,
      "eval_runtime": 0.6271,
      "eval_samples_per_second": 114.813,
      "eval_steps_per_second": 7.973,
      "step": 66
    },
    {
      "epoch": 7.444444444444445,
      "grad_norm": 0.0047607421875,
      "learning_rate": 1.3068181818181819e-05,
      "loss": 0.0,
      "step": 67
    },
    {
      "epoch": 7.444444444444445,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6238569021224976,
      "eval_runtime": 0.6258,
      "eval_samples_per_second": 115.045,
      "eval_steps_per_second": 7.989,
      "step": 67
    },
    {
      "epoch": 7.555555555555555,
      "grad_norm": 0.0016021728515625,
      "learning_rate": 1.25e-05,
      "loss": 0.0,
      "step": 68
    },
    {
      "epoch": 7.555555555555555,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.6014671325683594,
      "eval_runtime": 0.6227,
      "eval_samples_per_second": 115.634,
      "eval_steps_per_second": 8.03,
      "step": 68
    },
    {
      "epoch": 7.666666666666667,
      "grad_norm": 0.0003643035888671875,
      "learning_rate": 1.1931818181818183e-05,
      "loss": 0.0,
      "step": 69
    },
    {
      "epoch": 7.666666666666667,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6172735691070557,
      "eval_runtime": 0.6399,
      "eval_samples_per_second": 112.522,
      "eval_steps_per_second": 7.814,
      "step": 69
    },
    {
      "epoch": 7.777777777777778,
      "grad_norm": 0.0137939453125,
      "learning_rate": 1.1363636363636365e-05,
      "loss": 0.0,
      "step": 70
    },
    {
      "epoch": 7.777777777777778,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6042041778564453,
      "eval_runtime": 0.6241,
      "eval_samples_per_second": 115.365,
      "eval_steps_per_second": 8.011,
      "step": 70
    },
    {
      "epoch": 7.888888888888889,
      "grad_norm": 0.01336669921875,
      "learning_rate": 1.0795454545454547e-05,
      "loss": 0.0,
      "step": 71
    },
    {
      "epoch": 7.888888888888889,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6198363304138184,
      "eval_runtime": 0.5514,
      "eval_samples_per_second": 130.587,
      "eval_steps_per_second": 9.069,
      "step": 71
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.00157928466796875,
      "learning_rate": 1.0227272727272729e-05,
      "loss": 0.0,
      "step": 72
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6046463251113892,
      "eval_runtime": 0.6274,
      "eval_samples_per_second": 114.752,
      "eval_steps_per_second": 7.969,
      "step": 72
    },
    {
      "epoch": 8.11111111111111,
      "grad_norm": 0.006805419921875,
      "learning_rate": 9.659090909090909e-06,
      "loss": 0.0,
      "step": 73
    },
    {
      "epoch": 8.11111111111111,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6086045503616333,
      "eval_runtime": 0.5492,
      "eval_samples_per_second": 131.099,
      "eval_steps_per_second": 9.104,
      "step": 73
    },
    {
      "epoch": 8.222222222222221,
      "grad_norm": 0.00159454345703125,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.0,
      "step": 74
    },
    {
      "epoch": 8.222222222222221,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6164934635162354,
      "eval_runtime": 0.6241,
      "eval_samples_per_second": 115.374,
      "eval_steps_per_second": 8.012,
      "step": 74
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.007659912109375,
      "learning_rate": 8.522727272727273e-06,
      "loss": 0.0,
      "step": 75
    },
    {
      "epoch": 8.333333333333334,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6239551305770874,
      "eval_runtime": 0.6237,
      "eval_samples_per_second": 115.434,
      "eval_steps_per_second": 8.016,
      "step": 75
    },
    {
      "epoch": 8.444444444444445,
      "grad_norm": 0.01190185546875,
      "learning_rate": 7.954545454545455e-06,
      "loss": 0.0,
      "step": 76
    },
    {
      "epoch": 8.444444444444445,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6252025365829468,
      "eval_runtime": 0.6251,
      "eval_samples_per_second": 115.189,
      "eval_steps_per_second": 7.999,
      "step": 76
    },
    {
      "epoch": 8.555555555555555,
      "grad_norm": 0.005950927734375,
      "learning_rate": 7.386363636363637e-06,
      "loss": 0.0,
      "step": 77
    },
    {
      "epoch": 8.555555555555555,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6214745044708252,
      "eval_runtime": 0.6243,
      "eval_samples_per_second": 115.333,
      "eval_steps_per_second": 8.009,
      "step": 77
    },
    {
      "epoch": 8.666666666666666,
      "grad_norm": 0.0004711151123046875,
      "learning_rate": 6.818181818181818e-06,
      "loss": 0.0,
      "step": 78
    },
    {
      "epoch": 8.666666666666666,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.621131420135498,
      "eval_runtime": 0.6242,
      "eval_samples_per_second": 115.354,
      "eval_steps_per_second": 8.011,
      "step": 78
    },
    {
      "epoch": 8.777777777777779,
      "grad_norm": 0.006195068359375,
      "learning_rate": 6.25e-06,
      "loss": 0.0,
      "step": 79
    },
    {
      "epoch": 8.777777777777779,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6142359972000122,
      "eval_runtime": 0.6237,
      "eval_samples_per_second": 115.441,
      "eval_steps_per_second": 8.017,
      "step": 79
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.00060272216796875,
      "learning_rate": 5.681818181818182e-06,
      "loss": 0.0,
      "step": 80
    },
    {
      "epoch": 8.88888888888889,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6200090646743774,
      "eval_runtime": 0.624,
      "eval_samples_per_second": 115.375,
      "eval_steps_per_second": 8.012,
      "step": 80
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.0025634765625,
      "learning_rate": 5.113636363636364e-06,
      "loss": 0.0,
      "step": 81
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6171174049377441,
      "eval_runtime": 0.6229,
      "eval_samples_per_second": 115.582,
      "eval_steps_per_second": 8.027,
      "step": 81
    },
    {
      "epoch": 9.11111111111111,
      "grad_norm": 0.01220703125,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.0,
      "step": 82
    },
    {
      "epoch": 9.11111111111111,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6105068922042847,
      "eval_runtime": 0.6248,
      "eval_samples_per_second": 115.24,
      "eval_steps_per_second": 8.003,
      "step": 82
    },
    {
      "epoch": 9.222222222222221,
      "grad_norm": 0.00167083740234375,
      "learning_rate": 3.9772727272727275e-06,
      "loss": 0.0,
      "step": 83
    },
    {
      "epoch": 9.222222222222221,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6220049858093262,
      "eval_runtime": 0.6253,
      "eval_samples_per_second": 115.141,
      "eval_steps_per_second": 7.996,
      "step": 83
    },
    {
      "epoch": 9.333333333333334,
      "grad_norm": 0.0031280517578125,
      "learning_rate": 3.409090909090909e-06,
      "loss": 0.0,
      "step": 84
    },
    {
      "epoch": 9.333333333333334,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.621678113937378,
      "eval_runtime": 0.6233,
      "eval_samples_per_second": 115.516,
      "eval_steps_per_second": 8.022,
      "step": 84
    },
    {
      "epoch": 9.444444444444445,
      "grad_norm": 0.01220703125,
      "learning_rate": 2.840909090909091e-06,
      "loss": 0.0,
      "step": 85
    },
    {
      "epoch": 9.444444444444445,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6148762702941895,
      "eval_runtime": 0.6224,
      "eval_samples_per_second": 115.681,
      "eval_steps_per_second": 8.033,
      "step": 85
    },
    {
      "epoch": 9.555555555555555,
      "grad_norm": 0.00014972686767578125,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.0,
      "step": 86
    },
    {
      "epoch": 9.555555555555555,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6093779802322388,
      "eval_runtime": 0.6232,
      "eval_samples_per_second": 115.534,
      "eval_steps_per_second": 8.023,
      "step": 86
    },
    {
      "epoch": 9.666666666666666,
      "grad_norm": 0.0012054443359375,
      "learning_rate": 1.7045454545454546e-06,
      "loss": 0.0,
      "step": 87
    },
    {
      "epoch": 9.666666666666666,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.612194538116455,
      "eval_runtime": 0.6254,
      "eval_samples_per_second": 115.134,
      "eval_steps_per_second": 7.995,
      "step": 87
    },
    {
      "epoch": 9.777777777777779,
      "grad_norm": 0.0096435546875,
      "learning_rate": 1.1363636363636364e-06,
      "loss": 0.0,
      "step": 88
    },
    {
      "epoch": 9.777777777777779,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.6344364881515503,
      "eval_runtime": 0.6244,
      "eval_samples_per_second": 115.306,
      "eval_steps_per_second": 8.007,
      "step": 88
    },
    {
      "epoch": 9.88888888888889,
      "grad_norm": 0.00136566162109375,
      "learning_rate": 5.681818181818182e-07,
      "loss": 0.0,
      "step": 89
    },
    {
      "epoch": 9.88888888888889,
      "eval_accuracy": 0.7916666666666666,
      "eval_loss": 1.6183820962905884,
      "eval_runtime": 0.6241,
      "eval_samples_per_second": 115.365,
      "eval_steps_per_second": 8.011,
      "step": 89
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.000919342041015625,
      "learning_rate": 0.0,
      "loss": 0.0,
      "step": 90
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.7777777777777778,
      "eval_loss": 1.621146321296692,
      "eval_runtime": 0.6236,
      "eval_samples_per_second": 115.455,
      "eval_steps_per_second": 8.018,
      "step": 90
    },
    {
      "epoch": 10.0,
      "step": 90,
      "total_flos": 4834098484871168.0,
      "train_loss": 0.24404720813035966,
      "train_runtime": 123.0578,
      "train_samples_per_second": 23.079,
      "train_steps_per_second": 0.731
    }
  ],
  "logging_steps": 1,
  "max_steps": 90,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4834098484871168.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
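The log_history above alternates one training record and one evaluation record per optimizer step; eval_accuracy climbs from about 0.56 at step 1 to a peak of roughly 0.79, while the best_metric/best_model_checkpoint fields stay null because no metric-based checkpointing was configured. A minimal sketch for summarizing this file is shown below; it assumes the JSON has been saved locally under the filename from the header and uses only the standard-library json module. It is illustrative and not part of the training run itself.

# Sketch: load the trainer state and report the best eval_accuracy.
# Assumes the file path below; adjust it to wherever the JSON is stored.
import json

path = "trainer_state-gemma-bf16-QLORA-super_glue-axg-sequence_classification.json"
with open(path) as f:
    state = json.load(f)

# Evaluation entries are the ones that carry an "eval_accuracy" key.
evals = [e for e in state["log_history"] if "eval_accuracy" in e]
best = max(evals, key=lambda e: e["eval_accuracy"])
print(f"best eval_accuracy {best['eval_accuracy']:.4f} at step {best['step']}")
print(f"final train_loss {state['log_history'][-1]['train_loss']:.4f}")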