|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 133.33333333333334, |
|
"eval_steps": 500, |
|
"global_step": 900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"eval_accuracy": 0.09375, |
|
"eval_loss": 2.078925609588623, |
|
"eval_runtime": 2.1652, |
|
"eval_samples_per_second": 44.337, |
|
"eval_steps_per_second": 1.386, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 4.666333198547363, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 2.1005, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"eval_accuracy": 0.125, |
|
"eval_loss": 2.027583122253418, |
|
"eval_runtime": 1.9426, |
|
"eval_samples_per_second": 49.419, |
|
"eval_steps_per_second": 1.544, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 6.9924726486206055, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 2.0321, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"eval_accuracy": 0.28125, |
|
"eval_loss": 1.945550560951233, |
|
"eval_runtime": 1.8821, |
|
"eval_samples_per_second": 51.006, |
|
"eval_steps_per_second": 1.594, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.4583333333333333, |
|
"eval_loss": 1.8392573595046997, |
|
"eval_runtime": 1.8452, |
|
"eval_samples_per_second": 52.028, |
|
"eval_steps_per_second": 1.626, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 6.883745193481445, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.9151, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"eval_accuracy": 0.5833333333333334, |
|
"eval_loss": 1.7343072891235352, |
|
"eval_runtime": 1.8707, |
|
"eval_samples_per_second": 51.318, |
|
"eval_steps_per_second": 1.604, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 5.925925925925926, |
|
"grad_norm": 12.400025367736816, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 1.7396, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.925925925925926, |
|
"eval_accuracy": 0.6041666666666666, |
|
"eval_loss": 1.5971508026123047, |
|
"eval_runtime": 1.891, |
|
"eval_samples_per_second": 50.767, |
|
"eval_steps_per_second": 1.586, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 6.962962962962963, |
|
"eval_accuracy": 0.6770833333333334, |
|
"eval_loss": 1.4546260833740234, |
|
"eval_runtime": 1.8412, |
|
"eval_samples_per_second": 52.14, |
|
"eval_steps_per_second": 1.629, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 7.407407407407407, |
|
"grad_norm": 11.476837158203125, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.5392, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7395833333333334, |
|
"eval_loss": 1.2942534685134888, |
|
"eval_runtime": 1.9365, |
|
"eval_samples_per_second": 49.574, |
|
"eval_steps_per_second": 1.549, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 14.580023765563965, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.3096, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"eval_accuracy": 0.7395833333333334, |
|
"eval_loss": 1.1409353017807007, |
|
"eval_runtime": 1.8719, |
|
"eval_samples_per_second": 51.286, |
|
"eval_steps_per_second": 1.603, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 9.925925925925926, |
|
"eval_accuracy": 0.8229166666666666, |
|
"eval_loss": 0.9840934872627258, |
|
"eval_runtime": 1.8663, |
|
"eval_samples_per_second": 51.44, |
|
"eval_steps_per_second": 1.607, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 10.37037037037037, |
|
"grad_norm": 15.968297958374023, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 1.1062, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 10.962962962962964, |
|
"eval_accuracy": 0.8229166666666666, |
|
"eval_loss": 0.851224958896637, |
|
"eval_runtime": 1.9128, |
|
"eval_samples_per_second": 50.189, |
|
"eval_steps_per_second": 1.568, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 11.851851851851851, |
|
"grad_norm": 23.297271728515625, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.896, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.7128415703773499, |
|
"eval_runtime": 1.8414, |
|
"eval_samples_per_second": 52.136, |
|
"eval_steps_per_second": 1.629, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 12.88888888888889, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6365618109703064, |
|
"eval_runtime": 1.8634, |
|
"eval_samples_per_second": 51.518, |
|
"eval_steps_per_second": 1.61, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 24.776830673217773, |
|
"learning_rate": 5e-05, |
|
"loss": 0.712, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 13.925925925925926, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.5419111847877502, |
|
"eval_runtime": 1.874, |
|
"eval_samples_per_second": 51.228, |
|
"eval_steps_per_second": 1.601, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 14.814814814814815, |
|
"grad_norm": 26.37338638305664, |
|
"learning_rate": 4.938271604938271e-05, |
|
"loss": 0.6231, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 14.962962962962964, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.5082051753997803, |
|
"eval_runtime": 1.8338, |
|
"eval_samples_per_second": 52.35, |
|
"eval_steps_per_second": 1.636, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.4674176871776581, |
|
"eval_runtime": 1.878, |
|
"eval_samples_per_second": 51.118, |
|
"eval_steps_per_second": 1.597, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 16.296296296296298, |
|
"grad_norm": 26.630212783813477, |
|
"learning_rate": 4.876543209876544e-05, |
|
"loss": 0.4962, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 16.88888888888889, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.4479581415653229, |
|
"eval_runtime": 1.8306, |
|
"eval_samples_per_second": 52.441, |
|
"eval_steps_per_second": 1.639, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 23.847532272338867, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.4322, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 17.925925925925927, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.41380929946899414, |
|
"eval_runtime": 1.8812, |
|
"eval_samples_per_second": 51.03, |
|
"eval_steps_per_second": 1.595, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 18.962962962962962, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.39465710520744324, |
|
"eval_runtime": 1.8978, |
|
"eval_samples_per_second": 50.584, |
|
"eval_steps_per_second": 1.581, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 19.25925925925926, |
|
"grad_norm": 23.30731201171875, |
|
"learning_rate": 4.7530864197530866e-05, |
|
"loss": 0.3937, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3826509416103363, |
|
"eval_runtime": 1.8444, |
|
"eval_samples_per_second": 52.05, |
|
"eval_steps_per_second": 1.627, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 20.74074074074074, |
|
"grad_norm": 22.551395416259766, |
|
"learning_rate": 4.691358024691358e-05, |
|
"loss": 0.3377, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 20.88888888888889, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.36256420612335205, |
|
"eval_runtime": 1.8807, |
|
"eval_samples_per_second": 51.044, |
|
"eval_steps_per_second": 1.595, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 21.925925925925927, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3579378128051758, |
|
"eval_runtime": 1.8493, |
|
"eval_samples_per_second": 51.911, |
|
"eval_steps_per_second": 1.622, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 22.22222222222222, |
|
"grad_norm": 19.129470825195312, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.3099, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 22.962962962962962, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.35575783252716064, |
|
"eval_runtime": 1.8947, |
|
"eval_samples_per_second": 50.668, |
|
"eval_steps_per_second": 1.583, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 23.703703703703702, |
|
"grad_norm": 34.541709899902344, |
|
"learning_rate": 4.567901234567901e-05, |
|
"loss": 0.2895, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3242819011211395, |
|
"eval_runtime": 1.9128, |
|
"eval_samples_per_second": 50.188, |
|
"eval_steps_per_second": 1.568, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 24.88888888888889, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.34730610251426697, |
|
"eval_runtime": 1.857, |
|
"eval_samples_per_second": 51.697, |
|
"eval_steps_per_second": 1.616, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 25.185185185185187, |
|
"grad_norm": 27.23337745666504, |
|
"learning_rate": 4.506172839506173e-05, |
|
"loss": 0.2732, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 25.925925925925927, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.34611785411834717, |
|
"eval_runtime": 1.9339, |
|
"eval_samples_per_second": 49.641, |
|
"eval_steps_per_second": 1.551, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 26.6937255859375, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.2447, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 26.962962962962962, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.34497448801994324, |
|
"eval_runtime": 1.9227, |
|
"eval_samples_per_second": 49.929, |
|
"eval_steps_per_second": 1.56, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.36034080386161804, |
|
"eval_runtime": 1.8685, |
|
"eval_samples_per_second": 51.378, |
|
"eval_steps_per_second": 1.606, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 28.14814814814815, |
|
"grad_norm": 22.52947235107422, |
|
"learning_rate": 4.3827160493827164e-05, |
|
"loss": 0.2009, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 28.88888888888889, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3214483857154846, |
|
"eval_runtime": 1.8832, |
|
"eval_samples_per_second": 50.977, |
|
"eval_steps_per_second": 1.593, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 29.62962962962963, |
|
"grad_norm": 23.971797943115234, |
|
"learning_rate": 4.3209876543209875e-05, |
|
"loss": 0.2064, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 29.925925925925927, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.30430108308792114, |
|
"eval_runtime": 1.8494, |
|
"eval_samples_per_second": 51.909, |
|
"eval_steps_per_second": 1.622, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 30.962962962962962, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2916516661643982, |
|
"eval_runtime": 1.8828, |
|
"eval_samples_per_second": 50.987, |
|
"eval_steps_per_second": 1.593, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 31.11111111111111, |
|
"grad_norm": 20.200942993164062, |
|
"learning_rate": 4.259259259259259e-05, |
|
"loss": 0.2139, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.28600063920021057, |
|
"eval_runtime": 1.903, |
|
"eval_samples_per_second": 50.448, |
|
"eval_steps_per_second": 1.576, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 32.592592592592595, |
|
"grad_norm": 49.91157531738281, |
|
"learning_rate": 4.197530864197531e-05, |
|
"loss": 0.1732, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 32.888888888888886, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.3314121663570404, |
|
"eval_runtime": 1.8588, |
|
"eval_samples_per_second": 51.646, |
|
"eval_steps_per_second": 1.614, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 33.925925925925924, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.339076429605484, |
|
"eval_runtime": 1.8736, |
|
"eval_samples_per_second": 51.238, |
|
"eval_steps_per_second": 1.601, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 34.074074074074076, |
|
"grad_norm": 15.412144660949707, |
|
"learning_rate": 4.135802469135803e-05, |
|
"loss": 0.2009, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 34.96296296296296, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.3118279278278351, |
|
"eval_runtime": 1.8956, |
|
"eval_samples_per_second": 50.645, |
|
"eval_steps_per_second": 1.583, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 35.55555555555556, |
|
"grad_norm": 18.91287612915039, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.1683, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3162083327770233, |
|
"eval_runtime": 1.8823, |
|
"eval_samples_per_second": 51.002, |
|
"eval_steps_per_second": 1.594, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 36.888888888888886, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3011355996131897, |
|
"eval_runtime": 1.8611, |
|
"eval_samples_per_second": 51.582, |
|
"eval_steps_per_second": 1.612, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 37.03703703703704, |
|
"grad_norm": 35.87955856323242, |
|
"learning_rate": 4.012345679012346e-05, |
|
"loss": 0.16, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 37.925925925925924, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.2981296479701996, |
|
"eval_runtime": 1.9139, |
|
"eval_samples_per_second": 50.159, |
|
"eval_steps_per_second": 1.567, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 38.51851851851852, |
|
"grad_norm": 12.093141555786133, |
|
"learning_rate": 3.950617283950617e-05, |
|
"loss": 0.1448, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 38.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.34168219566345215, |
|
"eval_runtime": 1.8741, |
|
"eval_samples_per_second": 51.226, |
|
"eval_steps_per_second": 1.601, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 36.96841049194336, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.1272, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.35579848289489746, |
|
"eval_runtime": 1.8827, |
|
"eval_samples_per_second": 50.992, |
|
"eval_steps_per_second": 1.593, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 40.888888888888886, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.3948463499546051, |
|
"eval_runtime": 1.8989, |
|
"eval_samples_per_second": 50.557, |
|
"eval_steps_per_second": 1.58, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 41.48148148148148, |
|
"grad_norm": 17.500049591064453, |
|
"learning_rate": 3.82716049382716e-05, |
|
"loss": 0.1578, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 41.925925925925924, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.36678051948547363, |
|
"eval_runtime": 1.9696, |
|
"eval_samples_per_second": 48.741, |
|
"eval_steps_per_second": 1.523, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 42.96296296296296, |
|
"grad_norm": 19.923572540283203, |
|
"learning_rate": 3.7654320987654326e-05, |
|
"loss": 0.1604, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 42.96296296296296, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.33422672748565674, |
|
"eval_runtime": 1.9103, |
|
"eval_samples_per_second": 50.253, |
|
"eval_steps_per_second": 1.57, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.9166666666666666, |
|
"eval_loss": 0.314091295003891, |
|
"eval_runtime": 1.8512, |
|
"eval_samples_per_second": 51.858, |
|
"eval_steps_per_second": 1.621, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 44.44444444444444, |
|
"grad_norm": 21.997079849243164, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.1251, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 44.888888888888886, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3266027271747589, |
|
"eval_runtime": 1.8596, |
|
"eval_samples_per_second": 51.625, |
|
"eval_steps_per_second": 1.613, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 45.925925925925924, |
|
"grad_norm": 27.58420753479004, |
|
"learning_rate": 3.6419753086419754e-05, |
|
"loss": 0.1449, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 45.925925925925924, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.34379109740257263, |
|
"eval_runtime": 1.9283, |
|
"eval_samples_per_second": 49.786, |
|
"eval_steps_per_second": 1.556, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 46.96296296296296, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3382701873779297, |
|
"eval_runtime": 1.8689, |
|
"eval_samples_per_second": 51.368, |
|
"eval_steps_per_second": 1.605, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 47.407407407407405, |
|
"grad_norm": 14.891225814819336, |
|
"learning_rate": 3.580246913580247e-05, |
|
"loss": 0.1134, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.33413389325141907, |
|
"eval_runtime": 1.887, |
|
"eval_samples_per_second": 50.875, |
|
"eval_steps_per_second": 1.59, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"grad_norm": 22.530115127563477, |
|
"learning_rate": 3.518518518518519e-05, |
|
"loss": 0.1558, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2854965031147003, |
|
"eval_runtime": 1.9011, |
|
"eval_samples_per_second": 50.497, |
|
"eval_steps_per_second": 1.578, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 49.925925925925924, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2842860221862793, |
|
"eval_runtime": 1.9386, |
|
"eval_samples_per_second": 49.52, |
|
"eval_steps_per_second": 1.547, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 50.370370370370374, |
|
"grad_norm": 45.90536880493164, |
|
"learning_rate": 3.45679012345679e-05, |
|
"loss": 0.1433, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 50.96296296296296, |
|
"eval_accuracy": 0.84375, |
|
"eval_loss": 0.2878771126270294, |
|
"eval_runtime": 1.8358, |
|
"eval_samples_per_second": 52.292, |
|
"eval_steps_per_second": 1.634, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 51.851851851851855, |
|
"grad_norm": 30.33245849609375, |
|
"learning_rate": 3.395061728395062e-05, |
|
"loss": 0.1207, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.28866705298423767, |
|
"eval_runtime": 1.8609, |
|
"eval_samples_per_second": 51.587, |
|
"eval_steps_per_second": 1.612, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 52.888888888888886, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.3173443377017975, |
|
"eval_runtime": 1.858, |
|
"eval_samples_per_second": 51.668, |
|
"eval_steps_per_second": 1.615, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 14.928536415100098, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.1006, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 53.925925925925924, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.2926301658153534, |
|
"eval_runtime": 2.1238, |
|
"eval_samples_per_second": 45.203, |
|
"eval_steps_per_second": 1.413, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 54.81481481481482, |
|
"grad_norm": 16.707883834838867, |
|
"learning_rate": 3.271604938271605e-05, |
|
"loss": 0.1053, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 54.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.2791050672531128, |
|
"eval_runtime": 1.8468, |
|
"eval_samples_per_second": 51.982, |
|
"eval_steps_per_second": 1.624, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3276265859603882, |
|
"eval_runtime": 1.8761, |
|
"eval_samples_per_second": 51.17, |
|
"eval_steps_per_second": 1.599, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 56.2962962962963, |
|
"grad_norm": 19.896167755126953, |
|
"learning_rate": 3.209876543209876e-05, |
|
"loss": 0.106, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 56.888888888888886, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.32238277792930603, |
|
"eval_runtime": 1.9352, |
|
"eval_samples_per_second": 49.608, |
|
"eval_steps_per_second": 1.55, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 57.77777777777778, |
|
"grad_norm": 23.663541793823242, |
|
"learning_rate": 3.148148148148148e-05, |
|
"loss": 0.1058, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 57.925925925925924, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.33849918842315674, |
|
"eval_runtime": 1.9952, |
|
"eval_samples_per_second": 48.116, |
|
"eval_steps_per_second": 1.504, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 58.96296296296296, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.34936726093292236, |
|
"eval_runtime": 1.8826, |
|
"eval_samples_per_second": 50.992, |
|
"eval_steps_per_second": 1.594, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 59.25925925925926, |
|
"grad_norm": 6.200643539428711, |
|
"learning_rate": 3.08641975308642e-05, |
|
"loss": 0.0962, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.2797829508781433, |
|
"eval_runtime": 1.8593, |
|
"eval_samples_per_second": 51.633, |
|
"eval_steps_per_second": 1.614, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 60.74074074074074, |
|
"grad_norm": 16.507617950439453, |
|
"learning_rate": 3.0246913580246916e-05, |
|
"loss": 0.0883, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 60.888888888888886, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.29343104362487793, |
|
"eval_runtime": 1.8755, |
|
"eval_samples_per_second": 51.187, |
|
"eval_steps_per_second": 1.6, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 61.925925925925924, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.2956160008907318, |
|
"eval_runtime": 2.1241, |
|
"eval_samples_per_second": 45.195, |
|
"eval_steps_per_second": 1.412, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 62.22222222222222, |
|
"grad_norm": 19.266647338867188, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.084, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 62.96296296296296, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.291843980550766, |
|
"eval_runtime": 1.864, |
|
"eval_samples_per_second": 51.503, |
|
"eval_steps_per_second": 1.609, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 63.7037037037037, |
|
"grad_norm": 27.770219802856445, |
|
"learning_rate": 2.9012345679012347e-05, |
|
"loss": 0.0808, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3416362702846527, |
|
"eval_runtime": 1.8651, |
|
"eval_samples_per_second": 51.471, |
|
"eval_steps_per_second": 1.608, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 64.88888888888889, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.35024556517601013, |
|
"eval_runtime": 1.8463, |
|
"eval_samples_per_second": 51.995, |
|
"eval_steps_per_second": 1.625, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 65.18518518518519, |
|
"grad_norm": 14.995842933654785, |
|
"learning_rate": 2.839506172839506e-05, |
|
"loss": 0.0804, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 65.92592592592592, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2984811067581177, |
|
"eval_runtime": 2.0874, |
|
"eval_samples_per_second": 45.989, |
|
"eval_steps_per_second": 1.437, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 34.63008117675781, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.0854, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 66.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.2791596055030823, |
|
"eval_runtime": 1.88, |
|
"eval_samples_per_second": 51.064, |
|
"eval_steps_per_second": 1.596, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.3643587827682495, |
|
"eval_runtime": 1.8436, |
|
"eval_samples_per_second": 52.072, |
|
"eval_steps_per_second": 1.627, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 68.14814814814815, |
|
"grad_norm": 32.28718566894531, |
|
"learning_rate": 2.7160493827160493e-05, |
|
"loss": 0.0887, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 68.88888888888889, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.2684231400489807, |
|
"eval_runtime": 1.8572, |
|
"eval_samples_per_second": 51.691, |
|
"eval_steps_per_second": 1.615, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 69.62962962962963, |
|
"grad_norm": 22.44804573059082, |
|
"learning_rate": 2.654320987654321e-05, |
|
"loss": 0.0671, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 69.92592592592592, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.28022265434265137, |
|
"eval_runtime": 2.1506, |
|
"eval_samples_per_second": 44.638, |
|
"eval_steps_per_second": 1.395, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 70.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.2900524437427521, |
|
"eval_runtime": 1.859, |
|
"eval_samples_per_second": 51.641, |
|
"eval_steps_per_second": 1.614, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 71.11111111111111, |
|
"grad_norm": 4.379269123077393, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.0704, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3098466694355011, |
|
"eval_runtime": 1.838, |
|
"eval_samples_per_second": 52.23, |
|
"eval_steps_per_second": 1.632, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 72.5925925925926, |
|
"grad_norm": 17.398576736450195, |
|
"learning_rate": 2.5308641975308646e-05, |
|
"loss": 0.0802, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 72.88888888888889, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.2960352897644043, |
|
"eval_runtime": 1.867, |
|
"eval_samples_per_second": 51.421, |
|
"eval_steps_per_second": 1.607, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 73.92592592592592, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.2757139503955841, |
|
"eval_runtime": 2.0356, |
|
"eval_samples_per_second": 47.161, |
|
"eval_steps_per_second": 1.474, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 74.07407407407408, |
|
"grad_norm": 18.339569091796875, |
|
"learning_rate": 2.4691358024691357e-05, |
|
"loss": 0.09, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 74.96296296296296, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.31044331192970276, |
|
"eval_runtime": 1.8732, |
|
"eval_samples_per_second": 51.249, |
|
"eval_steps_per_second": 1.602, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 75.55555555555556, |
|
"grad_norm": 8.075996398925781, |
|
"learning_rate": 2.4074074074074074e-05, |
|
"loss": 0.0772, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.3120499849319458, |
|
"eval_runtime": 1.9108, |
|
"eval_samples_per_second": 50.239, |
|
"eval_steps_per_second": 1.57, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 76.88888888888889, |
|
"eval_accuracy": 0.9166666666666666, |
|
"eval_loss": 0.2803463637828827, |
|
"eval_runtime": 1.8689, |
|
"eval_samples_per_second": 51.368, |
|
"eval_steps_per_second": 1.605, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 77.03703703703704, |
|
"grad_norm": 3.0919692516326904, |
|
"learning_rate": 2.345679012345679e-05, |
|
"loss": 0.0725, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 77.92592592592592, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.28252843022346497, |
|
"eval_runtime": 1.9144, |
|
"eval_samples_per_second": 50.146, |
|
"eval_steps_per_second": 1.567, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 78.51851851851852, |
|
"grad_norm": 14.103160858154297, |
|
"learning_rate": 2.2839506172839506e-05, |
|
"loss": 0.0684, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 78.96296296296296, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.325451523065567, |
|
"eval_runtime": 1.8932, |
|
"eval_samples_per_second": 50.708, |
|
"eval_steps_per_second": 1.585, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 25.228979110717773, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.0732, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.3091324269771576, |
|
"eval_runtime": 1.8455, |
|
"eval_samples_per_second": 52.019, |
|
"eval_steps_per_second": 1.626, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 80.88888888888889, |
|
"eval_accuracy": 0.9166666666666666, |
|
"eval_loss": 0.2875919044017792, |
|
"eval_runtime": 1.8878, |
|
"eval_samples_per_second": 50.852, |
|
"eval_steps_per_second": 1.589, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 81.48148148148148, |
|
"grad_norm": 9.110090255737305, |
|
"learning_rate": 2.1604938271604937e-05, |
|
"loss": 0.0743, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 81.92592592592592, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.30346551537513733, |
|
"eval_runtime": 2.0498, |
|
"eval_samples_per_second": 46.834, |
|
"eval_steps_per_second": 1.464, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 82.96296296296296, |
|
"grad_norm": 23.57141876220703, |
|
"learning_rate": 2.0987654320987655e-05, |
|
"loss": 0.0807, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 82.96296296296296, |
|
"eval_accuracy": 0.9270833333333334, |
|
"eval_loss": 0.2750629186630249, |
|
"eval_runtime": 1.8436, |
|
"eval_samples_per_second": 52.073, |
|
"eval_steps_per_second": 1.627, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.9166666666666666, |
|
"eval_loss": 0.2656916677951813, |
|
"eval_runtime": 1.884, |
|
"eval_samples_per_second": 50.955, |
|
"eval_steps_per_second": 1.592, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 84.44444444444444, |
|
"grad_norm": 7.4042277336120605, |
|
"learning_rate": 2.037037037037037e-05, |
|
"loss": 0.0799, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 84.88888888888889, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.2810324728488922, |
|
"eval_runtime": 1.8651, |
|
"eval_samples_per_second": 51.472, |
|
"eval_steps_per_second": 1.608, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 85.92592592592592, |
|
"grad_norm": 9.805841445922852, |
|
"learning_rate": 1.9753086419753087e-05, |
|
"loss": 0.0632, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 85.92592592592592, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.303717702627182, |
|
"eval_runtime": 1.9401, |
|
"eval_samples_per_second": 49.481, |
|
"eval_steps_per_second": 1.546, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 86.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.33565008640289307, |
|
"eval_runtime": 1.8817, |
|
"eval_samples_per_second": 51.018, |
|
"eval_steps_per_second": 1.594, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 87.4074074074074, |
|
"grad_norm": 27.298439025878906, |
|
"learning_rate": 1.91358024691358e-05, |
|
"loss": 0.0579, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3170994818210602, |
|
"eval_runtime": 1.8439, |
|
"eval_samples_per_second": 52.063, |
|
"eval_steps_per_second": 1.627, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 88.88888888888889, |
|
"grad_norm": 12.730273246765137, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.0593, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 88.88888888888889, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.32229921221733093, |
|
"eval_runtime": 1.8672, |
|
"eval_samples_per_second": 51.414, |
|
"eval_steps_per_second": 1.607, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 89.92592592592592, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.29770705103874207, |
|
"eval_runtime": 1.9546, |
|
"eval_samples_per_second": 49.115, |
|
"eval_steps_per_second": 1.535, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 90.37037037037037, |
|
"grad_norm": 7.479738712310791, |
|
"learning_rate": 1.7901234567901236e-05, |
|
"loss": 0.0418, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 90.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.3380300998687744, |
|
"eval_runtime": 1.8381, |
|
"eval_samples_per_second": 52.228, |
|
"eval_steps_per_second": 1.632, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 91.85185185185185, |
|
"grad_norm": 19.313344955444336, |
|
"learning_rate": 1.728395061728395e-05, |
|
"loss": 0.0647, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.28629523515701294, |
|
"eval_runtime": 1.9227, |
|
"eval_samples_per_second": 49.928, |
|
"eval_steps_per_second": 1.56, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 92.88888888888889, |
|
"eval_accuracy": 0.9166666666666666, |
|
"eval_loss": 0.2898975610733032, |
|
"eval_runtime": 1.8782, |
|
"eval_samples_per_second": 51.114, |
|
"eval_steps_per_second": 1.597, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 93.33333333333333, |
|
"grad_norm": 11.266192436218262, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0649, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 93.92592592592592, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2853105962276459, |
|
"eval_runtime": 1.9709, |
|
"eval_samples_per_second": 48.708, |
|
"eval_steps_per_second": 1.522, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 94.81481481481481, |
|
"grad_norm": 9.165306091308594, |
|
"learning_rate": 1.604938271604938e-05, |
|
"loss": 0.0538, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 94.96296296296296, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.24523399770259857, |
|
"eval_runtime": 1.9081, |
|
"eval_samples_per_second": 50.312, |
|
"eval_steps_per_second": 1.572, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2568932771682739, |
|
"eval_runtime": 1.8246, |
|
"eval_samples_per_second": 52.614, |
|
"eval_steps_per_second": 1.644, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 96.29629629629629, |
|
"grad_norm": 6.884490489959717, |
|
"learning_rate": 1.54320987654321e-05, |
|
"loss": 0.0483, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 96.88888888888889, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.26870599389076233, |
|
"eval_runtime": 1.8508, |
|
"eval_samples_per_second": 51.87, |
|
"eval_steps_per_second": 1.621, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 97.77777777777777, |
|
"grad_norm": 10.973348617553711, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.0597, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 97.92592592592592, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3083449900150299, |
|
"eval_runtime": 1.991, |
|
"eval_samples_per_second": 48.217, |
|
"eval_steps_per_second": 1.507, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 98.96296296296296, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.2929399311542511, |
|
"eval_runtime": 1.8639, |
|
"eval_samples_per_second": 51.505, |
|
"eval_steps_per_second": 1.61, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 99.25925925925925, |
|
"grad_norm": 7.356732368469238, |
|
"learning_rate": 1.419753086419753e-05, |
|
"loss": 0.0544, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.32528772950172424, |
|
"eval_runtime": 1.9688, |
|
"eval_samples_per_second": 48.76, |
|
"eval_steps_per_second": 1.524, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 100.74074074074075, |
|
"grad_norm": 5.596048831939697, |
|
"learning_rate": 1.3580246913580247e-05, |
|
"loss": 0.0585, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 100.88888888888889, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.33938369154930115, |
|
"eval_runtime": 1.8565, |
|
"eval_samples_per_second": 51.711, |
|
"eval_steps_per_second": 1.616, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 101.92592592592592, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.37479040026664734, |
|
"eval_runtime": 2.0134, |
|
"eval_samples_per_second": 47.68, |
|
"eval_steps_per_second": 1.49, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 102.22222222222223, |
|
"grad_norm": 13.269804000854492, |
|
"learning_rate": 1.2962962962962962e-05, |
|
"loss": 0.0563, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 102.96296296296296, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.38897252082824707, |
|
"eval_runtime": 1.8519, |
|
"eval_samples_per_second": 51.838, |
|
"eval_steps_per_second": 1.62, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 103.70370370370371, |
|
"grad_norm": 16.924348831176758, |
|
"learning_rate": 1.2345679012345678e-05, |
|
"loss": 0.059, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3460318148136139, |
|
"eval_runtime": 1.8788, |
|
"eval_samples_per_second": 51.096, |
|
"eval_steps_per_second": 1.597, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 104.88888888888889, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.33083054423332214, |
|
"eval_runtime": 1.9045, |
|
"eval_samples_per_second": 50.406, |
|
"eval_steps_per_second": 1.575, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 105.18518518518519, |
|
"grad_norm": 34.068790435791016, |
|
"learning_rate": 1.1728395061728396e-05, |
|
"loss": 0.0601, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 105.92592592592592, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3228204846382141, |
|
"eval_runtime": 1.8931, |
|
"eval_samples_per_second": 50.711, |
|
"eval_steps_per_second": 1.585, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 106.66666666666667, |
|
"grad_norm": 28.148273468017578, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0512, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 106.96296296296296, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3189867436885834, |
|
"eval_runtime": 1.9115, |
|
"eval_samples_per_second": 50.224, |
|
"eval_steps_per_second": 1.569, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.30278849601745605, |
|
"eval_runtime": 1.8687, |
|
"eval_samples_per_second": 51.373, |
|
"eval_steps_per_second": 1.605, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 108.14814814814815, |
|
"grad_norm": 8.840377807617188, |
|
"learning_rate": 1.0493827160493827e-05, |
|
"loss": 0.0346, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 108.88888888888889, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.3065723478794098, |
|
"eval_runtime": 1.849, |
|
"eval_samples_per_second": 51.919, |
|
"eval_steps_per_second": 1.622, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 109.62962962962963, |
|
"grad_norm": 5.847916126251221, |
|
"learning_rate": 9.876543209876543e-06, |
|
"loss": 0.0434, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 109.92592592592592, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.29524290561676025, |
|
"eval_runtime": 1.8637, |
|
"eval_samples_per_second": 51.511, |
|
"eval_steps_per_second": 1.61, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 110.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.3053508698940277, |
|
"eval_runtime": 1.875, |
|
"eval_samples_per_second": 51.201, |
|
"eval_steps_per_second": 1.6, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 111.11111111111111, |
|
"grad_norm": 13.934989929199219, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.0466, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.30871471762657166, |
|
"eval_runtime": 1.8837, |
|
"eval_samples_per_second": 50.965, |
|
"eval_steps_per_second": 1.593, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 112.5925925925926, |
|
"grad_norm": 4.37272834777832, |
|
"learning_rate": 8.641975308641975e-06, |
|
"loss": 0.0402, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 112.88888888888889, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3211623728275299, |
|
"eval_runtime": 1.8685, |
|
"eval_samples_per_second": 51.377, |
|
"eval_steps_per_second": 1.606, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 113.92592592592592, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3235282897949219, |
|
"eval_runtime": 1.8591, |
|
"eval_samples_per_second": 51.637, |
|
"eval_steps_per_second": 1.614, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 114.07407407407408, |
|
"grad_norm": 17.743671417236328, |
|
"learning_rate": 8.02469135802469e-06, |
|
"loss": 0.0491, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 114.96296296296296, |
|
"eval_accuracy": 0.90625, |
|
"eval_loss": 0.3134639263153076, |
|
"eval_runtime": 1.8752, |
|
"eval_samples_per_second": 51.195, |
|
"eval_steps_per_second": 1.6, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 115.55555555555556, |
|
"grad_norm": 14.649134635925293, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.0495, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"eval_accuracy": 0.8958333333333334, |
|
"eval_loss": 0.2991209924221039, |
|
"eval_runtime": 1.9038, |
|
"eval_samples_per_second": 50.426, |
|
"eval_steps_per_second": 1.576, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 116.88888888888889, |
|
"eval_accuracy": 0.8854166666666666, |
|
"eval_loss": 0.3050777018070221, |
|
"eval_runtime": 1.8271, |
|
"eval_samples_per_second": 52.541, |
|
"eval_steps_per_second": 1.642, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 117.03703703703704, |
|
"grad_norm": 14.181517601013184, |
|
"learning_rate": 6.790123456790123e-06, |
|
"loss": 0.0536, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 117.92592592592592, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.33390840888023376, |
|
"eval_runtime": 1.8859, |
|
"eval_samples_per_second": 50.904, |
|
"eval_steps_per_second": 1.591, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 118.51851851851852, |
|
"grad_norm": 13.237271308898926, |
|
"learning_rate": 6.172839506172839e-06, |
|
"loss": 0.0419, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 118.96296296296296, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3370848596096039, |
|
"eval_runtime": 1.8605, |
|
"eval_samples_per_second": 51.6, |
|
"eval_steps_per_second": 1.612, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"grad_norm": 5.592768669128418, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.0333, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3375813066959381, |
|
"eval_runtime": 1.8801, |
|
"eval_samples_per_second": 51.06, |
|
"eval_steps_per_second": 1.596, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 120.88888888888889, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.33790695667266846, |
|
"eval_runtime": 1.8455, |
|
"eval_samples_per_second": 52.018, |
|
"eval_steps_per_second": 1.626, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 121.48148148148148, |
|
"grad_norm": 4.372032165527344, |
|
"learning_rate": 4.938271604938272e-06, |
|
"loss": 0.0376, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 121.92592592592592, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.337320476770401, |
|
"eval_runtime": 1.8367, |
|
"eval_samples_per_second": 52.267, |
|
"eval_steps_per_second": 1.633, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 122.96296296296296, |
|
"grad_norm": 10.392346382141113, |
|
"learning_rate": 4.3209876543209875e-06, |
|
"loss": 0.0397, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 122.96296296296296, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.34366607666015625, |
|
"eval_runtime": 1.8522, |
|
"eval_samples_per_second": 51.831, |
|
"eval_steps_per_second": 1.62, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 124.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.35845816135406494, |
|
"eval_runtime": 1.8547, |
|
"eval_samples_per_second": 51.76, |
|
"eval_steps_per_second": 1.618, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 124.44444444444444, |
|
"grad_norm": 5.668208122253418, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.0299, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 124.88888888888889, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3514343202114105, |
|
"eval_runtime": 1.8661, |
|
"eval_samples_per_second": 51.445, |
|
"eval_steps_per_second": 1.608, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 125.92592592592592, |
|
"grad_norm": 21.28321647644043, |
|
"learning_rate": 3.0864197530864196e-06, |
|
"loss": 0.0468, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 125.92592592592592, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3397478759288788, |
|
"eval_runtime": 1.8626, |
|
"eval_samples_per_second": 51.54, |
|
"eval_steps_per_second": 1.611, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 126.96296296296296, |
|
"eval_accuracy": 0.8541666666666666, |
|
"eval_loss": 0.33162921667099, |
|
"eval_runtime": 1.9031, |
|
"eval_samples_per_second": 50.444, |
|
"eval_steps_per_second": 1.576, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 127.4074074074074, |
|
"grad_norm": 2.5001776218414307, |
|
"learning_rate": 2.469135802469136e-06, |
|
"loss": 0.0351, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 128.0, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.33344319462776184, |
|
"eval_runtime": 1.8441, |
|
"eval_samples_per_second": 52.058, |
|
"eval_steps_per_second": 1.627, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 128.88888888888889, |
|
"grad_norm": 9.004130363464355, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.0439, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 128.88888888888889, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3323952555656433, |
|
"eval_runtime": 1.8475, |
|
"eval_samples_per_second": 51.963, |
|
"eval_steps_per_second": 1.624, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 129.92592592592592, |
|
"eval_accuracy": 0.8645833333333334, |
|
"eval_loss": 0.3289691209793091, |
|
"eval_runtime": 1.8689, |
|
"eval_samples_per_second": 51.368, |
|
"eval_steps_per_second": 1.605, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 130.37037037037038, |
|
"grad_norm": 12.271383285522461, |
|
"learning_rate": 1.234567901234568e-06, |
|
"loss": 0.0478, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 130.96296296296296, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.325620174407959, |
|
"eval_runtime": 1.8869, |
|
"eval_samples_per_second": 50.876, |
|
"eval_steps_per_second": 1.59, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 131.85185185185185, |
|
"grad_norm": 8.459063529968262, |
|
"learning_rate": 6.17283950617284e-07, |
|
"loss": 0.0434, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 132.0, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.32529863715171814, |
|
"eval_runtime": 1.8653, |
|
"eval_samples_per_second": 51.466, |
|
"eval_steps_per_second": 1.608, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 132.88888888888889, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.325122594833374, |
|
"eval_runtime": 1.835, |
|
"eval_samples_per_second": 52.315, |
|
"eval_steps_per_second": 1.635, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 133.33333333333334, |
|
"grad_norm": 9.571310997009277, |
|
"learning_rate": 0.0, |
|
"loss": 0.0374, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 133.33333333333334, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.3251601457595825, |
|
"eval_runtime": 1.8591, |
|
"eval_samples_per_second": 51.637, |
|
"eval_steps_per_second": 1.614, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 133.33333333333334, |
|
"step": 900, |
|
"total_flos": 2.900008367869133e+18, |
|
"train_loss": 0.254776107304626, |
|
"train_runtime": 3168.008, |
|
"train_samples_per_second": 40.909, |
|
"train_steps_per_second": 0.284 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.900008367869133e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|