{ "best_metric": null, "best_model_checkpoint": null, "epoch": 133.33333333333334, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.8888888888888888, "eval_accuracy": 0.09375, "eval_loss": 2.078925609588623, "eval_runtime": 2.1652, "eval_samples_per_second": 44.337, "eval_steps_per_second": 1.386, "step": 6 }, { "epoch": 1.4814814814814814, "grad_norm": 4.666333198547363, "learning_rate": 5.555555555555556e-06, "loss": 2.1005, "step": 10 }, { "epoch": 1.925925925925926, "eval_accuracy": 0.125, "eval_loss": 2.027583122253418, "eval_runtime": 1.9426, "eval_samples_per_second": 49.419, "eval_steps_per_second": 1.544, "step": 13 }, { "epoch": 2.962962962962963, "grad_norm": 6.9924726486206055, "learning_rate": 1.1111111111111112e-05, "loss": 2.0321, "step": 20 }, { "epoch": 2.962962962962963, "eval_accuracy": 0.28125, "eval_loss": 1.945550560951233, "eval_runtime": 1.8821, "eval_samples_per_second": 51.006, "eval_steps_per_second": 1.594, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.4583333333333333, "eval_loss": 1.8392573595046997, "eval_runtime": 1.8452, "eval_samples_per_second": 52.028, "eval_steps_per_second": 1.626, "step": 27 }, { "epoch": 4.444444444444445, "grad_norm": 6.883745193481445, "learning_rate": 1.6666666666666667e-05, "loss": 1.9151, "step": 30 }, { "epoch": 4.888888888888889, "eval_accuracy": 0.5833333333333334, "eval_loss": 1.7343072891235352, "eval_runtime": 1.8707, "eval_samples_per_second": 51.318, "eval_steps_per_second": 1.604, "step": 33 }, { "epoch": 5.925925925925926, "grad_norm": 12.400025367736816, "learning_rate": 2.2222222222222223e-05, "loss": 1.7396, "step": 40 }, { "epoch": 5.925925925925926, "eval_accuracy": 0.6041666666666666, "eval_loss": 1.5971508026123047, "eval_runtime": 1.891, "eval_samples_per_second": 50.767, "eval_steps_per_second": 1.586, "step": 40 }, { "epoch": 6.962962962962963, "eval_accuracy": 0.6770833333333334, "eval_loss": 1.4546260833740234, "eval_runtime": 1.8412, "eval_samples_per_second": 52.14, "eval_steps_per_second": 1.629, "step": 47 }, { "epoch": 7.407407407407407, "grad_norm": 11.476837158203125, "learning_rate": 2.777777777777778e-05, "loss": 1.5392, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.7395833333333334, "eval_loss": 1.2942534685134888, "eval_runtime": 1.9365, "eval_samples_per_second": 49.574, "eval_steps_per_second": 1.549, "step": 54 }, { "epoch": 8.88888888888889, "grad_norm": 14.580023765563965, "learning_rate": 3.3333333333333335e-05, "loss": 1.3096, "step": 60 }, { "epoch": 8.88888888888889, "eval_accuracy": 0.7395833333333334, "eval_loss": 1.1409353017807007, "eval_runtime": 1.8719, "eval_samples_per_second": 51.286, "eval_steps_per_second": 1.603, "step": 60 }, { "epoch": 9.925925925925926, "eval_accuracy": 0.8229166666666666, "eval_loss": 0.9840934872627258, "eval_runtime": 1.8663, "eval_samples_per_second": 51.44, "eval_steps_per_second": 1.607, "step": 67 }, { "epoch": 10.37037037037037, "grad_norm": 15.968297958374023, "learning_rate": 3.888888888888889e-05, "loss": 1.1062, "step": 70 }, { "epoch": 10.962962962962964, "eval_accuracy": 0.8229166666666666, "eval_loss": 0.851224958896637, "eval_runtime": 1.9128, "eval_samples_per_second": 50.189, "eval_steps_per_second": 1.568, "step": 74 }, { "epoch": 11.851851851851851, "grad_norm": 23.297271728515625, "learning_rate": 4.4444444444444447e-05, "loss": 0.896, "step": 80 }, { "epoch": 12.0, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.7128415703773499, "eval_runtime": 1.8414, "eval_samples_per_second": 52.136, "eval_steps_per_second": 1.629, "step": 81 }, { "epoch": 12.88888888888889, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.6365618109703064, "eval_runtime": 1.8634, "eval_samples_per_second": 51.518, "eval_steps_per_second": 1.61, "step": 87 }, { "epoch": 13.333333333333334, "grad_norm": 24.776830673217773, "learning_rate": 5e-05, "loss": 0.712, "step": 90 }, { "epoch": 13.925925925925926, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.5419111847877502, "eval_runtime": 1.874, "eval_samples_per_second": 51.228, "eval_steps_per_second": 1.601, "step": 94 }, { "epoch": 14.814814814814815, "grad_norm": 26.37338638305664, "learning_rate": 4.938271604938271e-05, "loss": 0.6231, "step": 100 }, { "epoch": 14.962962962962964, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.5082051753997803, "eval_runtime": 1.8338, "eval_samples_per_second": 52.35, "eval_steps_per_second": 1.636, "step": 101 }, { "epoch": 16.0, "eval_accuracy": 0.875, "eval_loss": 0.4674176871776581, "eval_runtime": 1.878, "eval_samples_per_second": 51.118, "eval_steps_per_second": 1.597, "step": 108 }, { "epoch": 16.296296296296298, "grad_norm": 26.630212783813477, "learning_rate": 4.876543209876544e-05, "loss": 0.4962, "step": 110 }, { "epoch": 16.88888888888889, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.4479581415653229, "eval_runtime": 1.8306, "eval_samples_per_second": 52.441, "eval_steps_per_second": 1.639, "step": 114 }, { "epoch": 17.77777777777778, "grad_norm": 23.847532272338867, "learning_rate": 4.814814814814815e-05, "loss": 0.4322, "step": 120 }, { "epoch": 17.925925925925927, "eval_accuracy": 0.875, "eval_loss": 0.41380929946899414, "eval_runtime": 1.8812, "eval_samples_per_second": 51.03, "eval_steps_per_second": 1.595, "step": 121 }, { "epoch": 18.962962962962962, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.39465710520744324, "eval_runtime": 1.8978, "eval_samples_per_second": 50.584, "eval_steps_per_second": 1.581, "step": 128 }, { "epoch": 19.25925925925926, "grad_norm": 23.30731201171875, "learning_rate": 4.7530864197530866e-05, "loss": 0.3937, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3826509416103363, "eval_runtime": 1.8444, "eval_samples_per_second": 52.05, "eval_steps_per_second": 1.627, "step": 135 }, { "epoch": 20.74074074074074, "grad_norm": 22.551395416259766, "learning_rate": 4.691358024691358e-05, "loss": 0.3377, "step": 140 }, { "epoch": 20.88888888888889, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.36256420612335205, "eval_runtime": 1.8807, "eval_samples_per_second": 51.044, "eval_steps_per_second": 1.595, "step": 141 }, { "epoch": 21.925925925925927, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3579378128051758, "eval_runtime": 1.8493, "eval_samples_per_second": 51.911, "eval_steps_per_second": 1.622, "step": 148 }, { "epoch": 22.22222222222222, "grad_norm": 19.129470825195312, "learning_rate": 4.62962962962963e-05, "loss": 0.3099, "step": 150 }, { "epoch": 22.962962962962962, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.35575783252716064, "eval_runtime": 1.8947, "eval_samples_per_second": 50.668, "eval_steps_per_second": 1.583, "step": 155 }, { "epoch": 23.703703703703702, "grad_norm": 34.541709899902344, "learning_rate": 4.567901234567901e-05, "loss": 0.2895, "step": 160 }, { "epoch": 24.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3242819011211395, "eval_runtime": 1.9128, "eval_samples_per_second": 50.188, "eval_steps_per_second": 1.568, "step": 162 }, { "epoch": 24.88888888888889, "eval_accuracy": 0.875, "eval_loss": 0.34730610251426697, "eval_runtime": 1.857, "eval_samples_per_second": 51.697, "eval_steps_per_second": 1.616, "step": 168 }, { "epoch": 25.185185185185187, "grad_norm": 27.23337745666504, "learning_rate": 4.506172839506173e-05, "loss": 0.2732, "step": 170 }, { "epoch": 25.925925925925927, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.34611785411834717, "eval_runtime": 1.9339, "eval_samples_per_second": 49.641, "eval_steps_per_second": 1.551, "step": 175 }, { "epoch": 26.666666666666668, "grad_norm": 26.6937255859375, "learning_rate": 4.4444444444444447e-05, "loss": 0.2447, "step": 180 }, { "epoch": 26.962962962962962, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.34497448801994324, "eval_runtime": 1.9227, "eval_samples_per_second": 49.929, "eval_steps_per_second": 1.56, "step": 182 }, { "epoch": 28.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.36034080386161804, "eval_runtime": 1.8685, "eval_samples_per_second": 51.378, "eval_steps_per_second": 1.606, "step": 189 }, { "epoch": 28.14814814814815, "grad_norm": 22.52947235107422, "learning_rate": 4.3827160493827164e-05, "loss": 0.2009, "step": 190 }, { "epoch": 28.88888888888889, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3214483857154846, "eval_runtime": 1.8832, "eval_samples_per_second": 50.977, "eval_steps_per_second": 1.593, "step": 195 }, { "epoch": 29.62962962962963, "grad_norm": 23.971797943115234, "learning_rate": 4.3209876543209875e-05, "loss": 0.2064, "step": 200 }, { "epoch": 29.925925925925927, "eval_accuracy": 0.875, "eval_loss": 0.30430108308792114, "eval_runtime": 1.8494, "eval_samples_per_second": 51.909, "eval_steps_per_second": 1.622, "step": 202 }, { "epoch": 30.962962962962962, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2916516661643982, "eval_runtime": 1.8828, "eval_samples_per_second": 50.987, "eval_steps_per_second": 1.593, "step": 209 }, { "epoch": 31.11111111111111, "grad_norm": 20.200942993164062, "learning_rate": 4.259259259259259e-05, "loss": 0.2139, "step": 210 }, { "epoch": 32.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.28600063920021057, "eval_runtime": 1.903, "eval_samples_per_second": 50.448, "eval_steps_per_second": 1.576, "step": 216 }, { "epoch": 32.592592592592595, "grad_norm": 49.91157531738281, "learning_rate": 4.197530864197531e-05, "loss": 0.1732, "step": 220 }, { "epoch": 32.888888888888886, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.3314121663570404, "eval_runtime": 1.8588, "eval_samples_per_second": 51.646, "eval_steps_per_second": 1.614, "step": 222 }, { "epoch": 33.925925925925924, "eval_accuracy": 0.875, "eval_loss": 0.339076429605484, "eval_runtime": 1.8736, "eval_samples_per_second": 51.238, "eval_steps_per_second": 1.601, "step": 229 }, { "epoch": 34.074074074074076, "grad_norm": 15.412144660949707, "learning_rate": 4.135802469135803e-05, "loss": 0.2009, "step": 230 }, { "epoch": 34.96296296296296, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.3118279278278351, "eval_runtime": 1.8956, "eval_samples_per_second": 50.645, "eval_steps_per_second": 1.583, "step": 236 }, { "epoch": 35.55555555555556, "grad_norm": 18.91287612915039, "learning_rate": 4.074074074074074e-05, "loss": 0.1683, "step": 240 }, { "epoch": 36.0, "eval_accuracy": 0.875, "eval_loss": 0.3162083327770233, "eval_runtime": 1.8823, "eval_samples_per_second": 51.002, "eval_steps_per_second": 1.594, "step": 243 }, { "epoch": 36.888888888888886, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3011355996131897, "eval_runtime": 1.8611, "eval_samples_per_second": 51.582, "eval_steps_per_second": 1.612, "step": 249 }, { "epoch": 37.03703703703704, "grad_norm": 35.87955856323242, "learning_rate": 4.012345679012346e-05, "loss": 0.16, "step": 250 }, { "epoch": 37.925925925925924, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.2981296479701996, "eval_runtime": 1.9139, "eval_samples_per_second": 50.159, "eval_steps_per_second": 1.567, "step": 256 }, { "epoch": 38.51851851851852, "grad_norm": 12.093141555786133, "learning_rate": 3.950617283950617e-05, "loss": 0.1448, "step": 260 }, { "epoch": 38.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.34168219566345215, "eval_runtime": 1.8741, "eval_samples_per_second": 51.226, "eval_steps_per_second": 1.601, "step": 263 }, { "epoch": 40.0, "grad_norm": 36.96841049194336, "learning_rate": 3.888888888888889e-05, "loss": 0.1272, "step": 270 }, { "epoch": 40.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.35579848289489746, "eval_runtime": 1.8827, "eval_samples_per_second": 50.992, "eval_steps_per_second": 1.593, "step": 270 }, { "epoch": 40.888888888888886, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.3948463499546051, "eval_runtime": 1.8989, "eval_samples_per_second": 50.557, "eval_steps_per_second": 1.58, "step": 276 }, { "epoch": 41.48148148148148, "grad_norm": 17.500049591064453, "learning_rate": 3.82716049382716e-05, "loss": 0.1578, "step": 280 }, { "epoch": 41.925925925925924, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.36678051948547363, "eval_runtime": 1.9696, "eval_samples_per_second": 48.741, "eval_steps_per_second": 1.523, "step": 283 }, { "epoch": 42.96296296296296, "grad_norm": 19.923572540283203, "learning_rate": 3.7654320987654326e-05, "loss": 0.1604, "step": 290 }, { "epoch": 42.96296296296296, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.33422672748565674, "eval_runtime": 1.9103, "eval_samples_per_second": 50.253, "eval_steps_per_second": 1.57, "step": 290 }, { "epoch": 44.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.314091295003891, "eval_runtime": 1.8512, "eval_samples_per_second": 51.858, "eval_steps_per_second": 1.621, "step": 297 }, { "epoch": 44.44444444444444, "grad_norm": 21.997079849243164, "learning_rate": 3.7037037037037037e-05, "loss": 0.1251, "step": 300 }, { "epoch": 44.888888888888886, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3266027271747589, "eval_runtime": 1.8596, "eval_samples_per_second": 51.625, "eval_steps_per_second": 1.613, "step": 303 }, { "epoch": 45.925925925925924, "grad_norm": 27.58420753479004, "learning_rate": 3.6419753086419754e-05, "loss": 0.1449, "step": 310 }, { "epoch": 45.925925925925924, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.34379109740257263, "eval_runtime": 1.9283, "eval_samples_per_second": 49.786, "eval_steps_per_second": 1.556, "step": 310 }, { "epoch": 46.96296296296296, "eval_accuracy": 0.875, "eval_loss": 0.3382701873779297, "eval_runtime": 1.8689, "eval_samples_per_second": 51.368, "eval_steps_per_second": 1.605, "step": 317 }, { "epoch": 47.407407407407405, "grad_norm": 14.891225814819336, "learning_rate": 3.580246913580247e-05, "loss": 0.1134, "step": 320 }, { "epoch": 48.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.33413389325141907, "eval_runtime": 1.887, "eval_samples_per_second": 50.875, "eval_steps_per_second": 1.59, "step": 324 }, { "epoch": 48.888888888888886, "grad_norm": 22.530115127563477, "learning_rate": 3.518518518518519e-05, "loss": 0.1558, "step": 330 }, { "epoch": 48.888888888888886, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2854965031147003, "eval_runtime": 1.9011, "eval_samples_per_second": 50.497, "eval_steps_per_second": 1.578, "step": 330 }, { "epoch": 49.925925925925924, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2842860221862793, "eval_runtime": 1.9386, "eval_samples_per_second": 49.52, "eval_steps_per_second": 1.547, "step": 337 }, { "epoch": 50.370370370370374, "grad_norm": 45.90536880493164, "learning_rate": 3.45679012345679e-05, "loss": 0.1433, "step": 340 }, { "epoch": 50.96296296296296, "eval_accuracy": 0.84375, "eval_loss": 0.2878771126270294, "eval_runtime": 1.8358, "eval_samples_per_second": 52.292, "eval_steps_per_second": 1.634, "step": 344 }, { "epoch": 51.851851851851855, "grad_norm": 30.33245849609375, "learning_rate": 3.395061728395062e-05, "loss": 0.1207, "step": 350 }, { "epoch": 52.0, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.28866705298423767, "eval_runtime": 1.8609, "eval_samples_per_second": 51.587, "eval_steps_per_second": 1.612, "step": 351 }, { "epoch": 52.888888888888886, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.3173443377017975, "eval_runtime": 1.858, "eval_samples_per_second": 51.668, "eval_steps_per_second": 1.615, "step": 357 }, { "epoch": 53.333333333333336, "grad_norm": 14.928536415100098, "learning_rate": 3.3333333333333335e-05, "loss": 0.1006, "step": 360 }, { "epoch": 53.925925925925924, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.2926301658153534, "eval_runtime": 2.1238, "eval_samples_per_second": 45.203, "eval_steps_per_second": 1.413, "step": 364 }, { "epoch": 54.81481481481482, "grad_norm": 16.707883834838867, "learning_rate": 3.271604938271605e-05, "loss": 0.1053, "step": 370 }, { "epoch": 54.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.2791050672531128, "eval_runtime": 1.8468, "eval_samples_per_second": 51.982, "eval_steps_per_second": 1.624, "step": 371 }, { "epoch": 56.0, "eval_accuracy": 0.875, "eval_loss": 0.3276265859603882, "eval_runtime": 1.8761, "eval_samples_per_second": 51.17, "eval_steps_per_second": 1.599, "step": 378 }, { "epoch": 56.2962962962963, "grad_norm": 19.896167755126953, "learning_rate": 3.209876543209876e-05, "loss": 0.106, "step": 380 }, { "epoch": 56.888888888888886, "eval_accuracy": 0.875, "eval_loss": 0.32238277792930603, "eval_runtime": 1.9352, "eval_samples_per_second": 49.608, "eval_steps_per_second": 1.55, "step": 384 }, { "epoch": 57.77777777777778, "grad_norm": 23.663541793823242, "learning_rate": 3.148148148148148e-05, "loss": 0.1058, "step": 390 }, { "epoch": 57.925925925925924, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.33849918842315674, "eval_runtime": 1.9952, "eval_samples_per_second": 48.116, "eval_steps_per_second": 1.504, "step": 391 }, { "epoch": 58.96296296296296, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.34936726093292236, "eval_runtime": 1.8826, "eval_samples_per_second": 50.992, "eval_steps_per_second": 1.594, "step": 398 }, { "epoch": 59.25925925925926, "grad_norm": 6.200643539428711, "learning_rate": 3.08641975308642e-05, "loss": 0.0962, "step": 400 }, { "epoch": 60.0, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.2797829508781433, "eval_runtime": 1.8593, "eval_samples_per_second": 51.633, "eval_steps_per_second": 1.614, "step": 405 }, { "epoch": 60.74074074074074, "grad_norm": 16.507617950439453, "learning_rate": 3.0246913580246916e-05, "loss": 0.0883, "step": 410 }, { "epoch": 60.888888888888886, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.29343104362487793, "eval_runtime": 1.8755, "eval_samples_per_second": 51.187, "eval_steps_per_second": 1.6, "step": 411 }, { "epoch": 61.925925925925924, "eval_accuracy": 0.875, "eval_loss": 0.2956160008907318, "eval_runtime": 2.1241, "eval_samples_per_second": 45.195, "eval_steps_per_second": 1.412, "step": 418 }, { "epoch": 62.22222222222222, "grad_norm": 19.266647338867188, "learning_rate": 2.962962962962963e-05, "loss": 0.084, "step": 420 }, { "epoch": 62.96296296296296, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.291843980550766, "eval_runtime": 1.864, "eval_samples_per_second": 51.503, "eval_steps_per_second": 1.609, "step": 425 }, { "epoch": 63.7037037037037, "grad_norm": 27.770219802856445, "learning_rate": 2.9012345679012347e-05, "loss": 0.0808, "step": 430 }, { "epoch": 64.0, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3416362702846527, "eval_runtime": 1.8651, "eval_samples_per_second": 51.471, "eval_steps_per_second": 1.608, "step": 432 }, { "epoch": 64.88888888888889, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.35024556517601013, "eval_runtime": 1.8463, "eval_samples_per_second": 51.995, "eval_steps_per_second": 1.625, "step": 438 }, { "epoch": 65.18518518518519, "grad_norm": 14.995842933654785, "learning_rate": 2.839506172839506e-05, "loss": 0.0804, "step": 440 }, { "epoch": 65.92592592592592, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2984811067581177, "eval_runtime": 2.0874, "eval_samples_per_second": 45.989, "eval_steps_per_second": 1.437, "step": 445 }, { "epoch": 66.66666666666667, "grad_norm": 34.63008117675781, "learning_rate": 2.777777777777778e-05, "loss": 0.0854, "step": 450 }, { "epoch": 66.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.2791596055030823, "eval_runtime": 1.88, "eval_samples_per_second": 51.064, "eval_steps_per_second": 1.596, "step": 452 }, { "epoch": 68.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.3643587827682495, "eval_runtime": 1.8436, "eval_samples_per_second": 52.072, "eval_steps_per_second": 1.627, "step": 459 }, { "epoch": 68.14814814814815, "grad_norm": 32.28718566894531, "learning_rate": 2.7160493827160493e-05, "loss": 0.0887, "step": 460 }, { "epoch": 68.88888888888889, "eval_accuracy": 0.90625, "eval_loss": 0.2684231400489807, "eval_runtime": 1.8572, "eval_samples_per_second": 51.691, "eval_steps_per_second": 1.615, "step": 465 }, { "epoch": 69.62962962962963, "grad_norm": 22.44804573059082, "learning_rate": 2.654320987654321e-05, "loss": 0.0671, "step": 470 }, { "epoch": 69.92592592592592, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.28022265434265137, "eval_runtime": 2.1506, "eval_samples_per_second": 44.638, "eval_steps_per_second": 1.395, "step": 472 }, { "epoch": 70.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.2900524437427521, "eval_runtime": 1.859, "eval_samples_per_second": 51.641, "eval_steps_per_second": 1.614, "step": 479 }, { "epoch": 71.11111111111111, "grad_norm": 4.379269123077393, "learning_rate": 2.5925925925925925e-05, "loss": 0.0704, "step": 480 }, { "epoch": 72.0, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3098466694355011, "eval_runtime": 1.838, "eval_samples_per_second": 52.23, "eval_steps_per_second": 1.632, "step": 486 }, { "epoch": 72.5925925925926, "grad_norm": 17.398576736450195, "learning_rate": 2.5308641975308646e-05, "loss": 0.0802, "step": 490 }, { "epoch": 72.88888888888889, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.2960352897644043, "eval_runtime": 1.867, "eval_samples_per_second": 51.421, "eval_steps_per_second": 1.607, "step": 492 }, { "epoch": 73.92592592592592, "eval_accuracy": 0.875, "eval_loss": 0.2757139503955841, "eval_runtime": 2.0356, "eval_samples_per_second": 47.161, "eval_steps_per_second": 1.474, "step": 499 }, { "epoch": 74.07407407407408, "grad_norm": 18.339569091796875, "learning_rate": 2.4691358024691357e-05, "loss": 0.09, "step": 500 }, { "epoch": 74.96296296296296, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.31044331192970276, "eval_runtime": 1.8732, "eval_samples_per_second": 51.249, "eval_steps_per_second": 1.602, "step": 506 }, { "epoch": 75.55555555555556, "grad_norm": 8.075996398925781, "learning_rate": 2.4074074074074074e-05, "loss": 0.0772, "step": 510 }, { "epoch": 76.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.3120499849319458, "eval_runtime": 1.9108, "eval_samples_per_second": 50.239, "eval_steps_per_second": 1.57, "step": 513 }, { "epoch": 76.88888888888889, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.2803463637828827, "eval_runtime": 1.8689, "eval_samples_per_second": 51.368, "eval_steps_per_second": 1.605, "step": 519 }, { "epoch": 77.03703703703704, "grad_norm": 3.0919692516326904, "learning_rate": 2.345679012345679e-05, "loss": 0.0725, "step": 520 }, { "epoch": 77.92592592592592, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.28252843022346497, "eval_runtime": 1.9144, "eval_samples_per_second": 50.146, "eval_steps_per_second": 1.567, "step": 526 }, { "epoch": 78.51851851851852, "grad_norm": 14.103160858154297, "learning_rate": 2.2839506172839506e-05, "loss": 0.0684, "step": 530 }, { "epoch": 78.96296296296296, "eval_accuracy": 0.875, "eval_loss": 0.325451523065567, "eval_runtime": 1.8932, "eval_samples_per_second": 50.708, "eval_steps_per_second": 1.585, "step": 533 }, { "epoch": 80.0, "grad_norm": 25.228979110717773, "learning_rate": 2.2222222222222223e-05, "loss": 0.0732, "step": 540 }, { "epoch": 80.0, "eval_accuracy": 0.90625, "eval_loss": 0.3091324269771576, "eval_runtime": 1.8455, "eval_samples_per_second": 52.019, "eval_steps_per_second": 1.626, "step": 540 }, { "epoch": 80.88888888888889, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.2875919044017792, "eval_runtime": 1.8878, "eval_samples_per_second": 50.852, "eval_steps_per_second": 1.589, "step": 546 }, { "epoch": 81.48148148148148, "grad_norm": 9.110090255737305, "learning_rate": 2.1604938271604937e-05, "loss": 0.0743, "step": 550 }, { "epoch": 81.92592592592592, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.30346551537513733, "eval_runtime": 2.0498, "eval_samples_per_second": 46.834, "eval_steps_per_second": 1.464, "step": 553 }, { "epoch": 82.96296296296296, "grad_norm": 23.57141876220703, "learning_rate": 2.0987654320987655e-05, "loss": 0.0807, "step": 560 }, { "epoch": 82.96296296296296, "eval_accuracy": 0.9270833333333334, "eval_loss": 0.2750629186630249, "eval_runtime": 1.8436, "eval_samples_per_second": 52.073, "eval_steps_per_second": 1.627, "step": 560 }, { "epoch": 84.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.2656916677951813, "eval_runtime": 1.884, "eval_samples_per_second": 50.955, "eval_steps_per_second": 1.592, "step": 567 }, { "epoch": 84.44444444444444, "grad_norm": 7.4042277336120605, "learning_rate": 2.037037037037037e-05, "loss": 0.0799, "step": 570 }, { "epoch": 84.88888888888889, "eval_accuracy": 0.90625, "eval_loss": 0.2810324728488922, "eval_runtime": 1.8651, "eval_samples_per_second": 51.472, "eval_steps_per_second": 1.608, "step": 573 }, { "epoch": 85.92592592592592, "grad_norm": 9.805841445922852, "learning_rate": 1.9753086419753087e-05, "loss": 0.0632, "step": 580 }, { "epoch": 85.92592592592592, "eval_accuracy": 0.90625, "eval_loss": 0.303717702627182, "eval_runtime": 1.9401, "eval_samples_per_second": 49.481, "eval_steps_per_second": 1.546, "step": 580 }, { "epoch": 86.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.33565008640289307, "eval_runtime": 1.8817, "eval_samples_per_second": 51.018, "eval_steps_per_second": 1.594, "step": 587 }, { "epoch": 87.4074074074074, "grad_norm": 27.298439025878906, "learning_rate": 1.91358024691358e-05, "loss": 0.0579, "step": 590 }, { "epoch": 88.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3170994818210602, "eval_runtime": 1.8439, "eval_samples_per_second": 52.063, "eval_steps_per_second": 1.627, "step": 594 }, { "epoch": 88.88888888888889, "grad_norm": 12.730273246765137, "learning_rate": 1.8518518518518518e-05, "loss": 0.0593, "step": 600 }, { "epoch": 88.88888888888889, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.32229921221733093, "eval_runtime": 1.8672, "eval_samples_per_second": 51.414, "eval_steps_per_second": 1.607, "step": 600 }, { "epoch": 89.92592592592592, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.29770705103874207, "eval_runtime": 1.9546, "eval_samples_per_second": 49.115, "eval_steps_per_second": 1.535, "step": 607 }, { "epoch": 90.37037037037037, "grad_norm": 7.479738712310791, "learning_rate": 1.7901234567901236e-05, "loss": 0.0418, "step": 610 }, { "epoch": 90.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.3380300998687744, "eval_runtime": 1.8381, "eval_samples_per_second": 52.228, "eval_steps_per_second": 1.632, "step": 614 }, { "epoch": 91.85185185185185, "grad_norm": 19.313344955444336, "learning_rate": 1.728395061728395e-05, "loss": 0.0647, "step": 620 }, { "epoch": 92.0, "eval_accuracy": 0.875, "eval_loss": 0.28629523515701294, "eval_runtime": 1.9227, "eval_samples_per_second": 49.928, "eval_steps_per_second": 1.56, "step": 621 }, { "epoch": 92.88888888888889, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.2898975610733032, "eval_runtime": 1.8782, "eval_samples_per_second": 51.114, "eval_steps_per_second": 1.597, "step": 627 }, { "epoch": 93.33333333333333, "grad_norm": 11.266192436218262, "learning_rate": 1.6666666666666667e-05, "loss": 0.0649, "step": 630 }, { "epoch": 93.92592592592592, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2853105962276459, "eval_runtime": 1.9709, "eval_samples_per_second": 48.708, "eval_steps_per_second": 1.522, "step": 634 }, { "epoch": 94.81481481481481, "grad_norm": 9.165306091308594, "learning_rate": 1.604938271604938e-05, "loss": 0.0538, "step": 640 }, { "epoch": 94.96296296296296, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.24523399770259857, "eval_runtime": 1.9081, "eval_samples_per_second": 50.312, "eval_steps_per_second": 1.572, "step": 641 }, { "epoch": 96.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2568932771682739, "eval_runtime": 1.8246, "eval_samples_per_second": 52.614, "eval_steps_per_second": 1.644, "step": 648 }, { "epoch": 96.29629629629629, "grad_norm": 6.884490489959717, "learning_rate": 1.54320987654321e-05, "loss": 0.0483, "step": 650 }, { "epoch": 96.88888888888889, "eval_accuracy": 0.90625, "eval_loss": 0.26870599389076233, "eval_runtime": 1.8508, "eval_samples_per_second": 51.87, "eval_steps_per_second": 1.621, "step": 654 }, { "epoch": 97.77777777777777, "grad_norm": 10.973348617553711, "learning_rate": 1.4814814814814815e-05, "loss": 0.0597, "step": 660 }, { "epoch": 97.92592592592592, "eval_accuracy": 0.875, "eval_loss": 0.3083449900150299, "eval_runtime": 1.991, "eval_samples_per_second": 48.217, "eval_steps_per_second": 1.507, "step": 661 }, { "epoch": 98.96296296296296, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.2929399311542511, "eval_runtime": 1.8639, "eval_samples_per_second": 51.505, "eval_steps_per_second": 1.61, "step": 668 }, { "epoch": 99.25925925925925, "grad_norm": 7.356732368469238, "learning_rate": 1.419753086419753e-05, "loss": 0.0544, "step": 670 }, { "epoch": 100.0, "eval_accuracy": 0.875, "eval_loss": 0.32528772950172424, "eval_runtime": 1.9688, "eval_samples_per_second": 48.76, "eval_steps_per_second": 1.524, "step": 675 }, { "epoch": 100.74074074074075, "grad_norm": 5.596048831939697, "learning_rate": 1.3580246913580247e-05, "loss": 0.0585, "step": 680 }, { "epoch": 100.88888888888889, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.33938369154930115, "eval_runtime": 1.8565, "eval_samples_per_second": 51.711, "eval_steps_per_second": 1.616, "step": 681 }, { "epoch": 101.92592592592592, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.37479040026664734, "eval_runtime": 2.0134, "eval_samples_per_second": 47.68, "eval_steps_per_second": 1.49, "step": 688 }, { "epoch": 102.22222222222223, "grad_norm": 13.269804000854492, "learning_rate": 1.2962962962962962e-05, "loss": 0.0563, "step": 690 }, { "epoch": 102.96296296296296, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.38897252082824707, "eval_runtime": 1.8519, "eval_samples_per_second": 51.838, "eval_steps_per_second": 1.62, "step": 695 }, { "epoch": 103.70370370370371, "grad_norm": 16.924348831176758, "learning_rate": 1.2345679012345678e-05, "loss": 0.059, "step": 700 }, { "epoch": 104.0, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3460318148136139, "eval_runtime": 1.8788, "eval_samples_per_second": 51.096, "eval_steps_per_second": 1.597, "step": 702 }, { "epoch": 104.88888888888889, "eval_accuracy": 0.875, "eval_loss": 0.33083054423332214, "eval_runtime": 1.9045, "eval_samples_per_second": 50.406, "eval_steps_per_second": 1.575, "step": 708 }, { "epoch": 105.18518518518519, "grad_norm": 34.068790435791016, "learning_rate": 1.1728395061728396e-05, "loss": 0.0601, "step": 710 }, { "epoch": 105.92592592592592, "eval_accuracy": 0.875, "eval_loss": 0.3228204846382141, "eval_runtime": 1.8931, "eval_samples_per_second": 50.711, "eval_steps_per_second": 1.585, "step": 715 }, { "epoch": 106.66666666666667, "grad_norm": 28.148273468017578, "learning_rate": 1.1111111111111112e-05, "loss": 0.0512, "step": 720 }, { "epoch": 106.96296296296296, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3189867436885834, "eval_runtime": 1.9115, "eval_samples_per_second": 50.224, "eval_steps_per_second": 1.569, "step": 722 }, { "epoch": 108.0, "eval_accuracy": 0.875, "eval_loss": 0.30278849601745605, "eval_runtime": 1.8687, "eval_samples_per_second": 51.373, "eval_steps_per_second": 1.605, "step": 729 }, { "epoch": 108.14814814814815, "grad_norm": 8.840377807617188, "learning_rate": 1.0493827160493827e-05, "loss": 0.0346, "step": 730 }, { "epoch": 108.88888888888889, "eval_accuracy": 0.90625, "eval_loss": 0.3065723478794098, "eval_runtime": 1.849, "eval_samples_per_second": 51.919, "eval_steps_per_second": 1.622, "step": 735 }, { "epoch": 109.62962962962963, "grad_norm": 5.847916126251221, "learning_rate": 9.876543209876543e-06, "loss": 0.0434, "step": 740 }, { "epoch": 109.92592592592592, "eval_accuracy": 0.90625, "eval_loss": 0.29524290561676025, "eval_runtime": 1.8637, "eval_samples_per_second": 51.511, "eval_steps_per_second": 1.61, "step": 742 }, { "epoch": 110.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.3053508698940277, "eval_runtime": 1.875, "eval_samples_per_second": 51.201, "eval_steps_per_second": 1.6, "step": 749 }, { "epoch": 111.11111111111111, "grad_norm": 13.934989929199219, "learning_rate": 9.259259259259259e-06, "loss": 0.0466, "step": 750 }, { "epoch": 112.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.30871471762657166, "eval_runtime": 1.8837, "eval_samples_per_second": 50.965, "eval_steps_per_second": 1.593, "step": 756 }, { "epoch": 112.5925925925926, "grad_norm": 4.37272834777832, "learning_rate": 8.641975308641975e-06, "loss": 0.0402, "step": 760 }, { "epoch": 112.88888888888889, "eval_accuracy": 0.875, "eval_loss": 0.3211623728275299, "eval_runtime": 1.8685, "eval_samples_per_second": 51.377, "eval_steps_per_second": 1.606, "step": 762 }, { "epoch": 113.92592592592592, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3235282897949219, "eval_runtime": 1.8591, "eval_samples_per_second": 51.637, "eval_steps_per_second": 1.614, "step": 769 }, { "epoch": 114.07407407407408, "grad_norm": 17.743671417236328, "learning_rate": 8.02469135802469e-06, "loss": 0.0491, "step": 770 }, { "epoch": 114.96296296296296, "eval_accuracy": 0.90625, "eval_loss": 0.3134639263153076, "eval_runtime": 1.8752, "eval_samples_per_second": 51.195, "eval_steps_per_second": 1.6, "step": 776 }, { "epoch": 115.55555555555556, "grad_norm": 14.649134635925293, "learning_rate": 7.4074074074074075e-06, "loss": 0.0495, "step": 780 }, { "epoch": 116.0, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.2991209924221039, "eval_runtime": 1.9038, "eval_samples_per_second": 50.426, "eval_steps_per_second": 1.576, "step": 783 }, { "epoch": 116.88888888888889, "eval_accuracy": 0.8854166666666666, "eval_loss": 0.3050777018070221, "eval_runtime": 1.8271, "eval_samples_per_second": 52.541, "eval_steps_per_second": 1.642, "step": 789 }, { "epoch": 117.03703703703704, "grad_norm": 14.181517601013184, "learning_rate": 6.790123456790123e-06, "loss": 0.0536, "step": 790 }, { "epoch": 117.92592592592592, "eval_accuracy": 0.875, "eval_loss": 0.33390840888023376, "eval_runtime": 1.8859, "eval_samples_per_second": 50.904, "eval_steps_per_second": 1.591, "step": 796 }, { "epoch": 118.51851851851852, "grad_norm": 13.237271308898926, "learning_rate": 6.172839506172839e-06, "loss": 0.0419, "step": 800 }, { "epoch": 118.96296296296296, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3370848596096039, "eval_runtime": 1.8605, "eval_samples_per_second": 51.6, "eval_steps_per_second": 1.612, "step": 803 }, { "epoch": 120.0, "grad_norm": 5.592768669128418, "learning_rate": 5.555555555555556e-06, "loss": 0.0333, "step": 810 }, { "epoch": 120.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3375813066959381, "eval_runtime": 1.8801, "eval_samples_per_second": 51.06, "eval_steps_per_second": 1.596, "step": 810 }, { "epoch": 120.88888888888889, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.33790695667266846, "eval_runtime": 1.8455, "eval_samples_per_second": 52.018, "eval_steps_per_second": 1.626, "step": 816 }, { "epoch": 121.48148148148148, "grad_norm": 4.372032165527344, "learning_rate": 4.938271604938272e-06, "loss": 0.0376, "step": 820 }, { "epoch": 121.92592592592592, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.337320476770401, "eval_runtime": 1.8367, "eval_samples_per_second": 52.267, "eval_steps_per_second": 1.633, "step": 823 }, { "epoch": 122.96296296296296, "grad_norm": 10.392346382141113, "learning_rate": 4.3209876543209875e-06, "loss": 0.0397, "step": 830 }, { "epoch": 122.96296296296296, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.34366607666015625, "eval_runtime": 1.8522, "eval_samples_per_second": 51.831, "eval_steps_per_second": 1.62, "step": 830 }, { "epoch": 124.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.35845816135406494, "eval_runtime": 1.8547, "eval_samples_per_second": 51.76, "eval_steps_per_second": 1.618, "step": 837 }, { "epoch": 124.44444444444444, "grad_norm": 5.668208122253418, "learning_rate": 3.7037037037037037e-06, "loss": 0.0299, "step": 840 }, { "epoch": 124.88888888888889, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3514343202114105, "eval_runtime": 1.8661, "eval_samples_per_second": 51.445, "eval_steps_per_second": 1.608, "step": 843 }, { "epoch": 125.92592592592592, "grad_norm": 21.28321647644043, "learning_rate": 3.0864197530864196e-06, "loss": 0.0468, "step": 850 }, { "epoch": 125.92592592592592, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3397478759288788, "eval_runtime": 1.8626, "eval_samples_per_second": 51.54, "eval_steps_per_second": 1.611, "step": 850 }, { "epoch": 126.96296296296296, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.33162921667099, "eval_runtime": 1.9031, "eval_samples_per_second": 50.444, "eval_steps_per_second": 1.576, "step": 857 }, { "epoch": 127.4074074074074, "grad_norm": 2.5001776218414307, "learning_rate": 2.469135802469136e-06, "loss": 0.0351, "step": 860 }, { "epoch": 128.0, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.33344319462776184, "eval_runtime": 1.8441, "eval_samples_per_second": 52.058, "eval_steps_per_second": 1.627, "step": 864 }, { "epoch": 128.88888888888889, "grad_norm": 9.004130363464355, "learning_rate": 1.8518518518518519e-06, "loss": 0.0439, "step": 870 }, { "epoch": 128.88888888888889, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3323952555656433, "eval_runtime": 1.8475, "eval_samples_per_second": 51.963, "eval_steps_per_second": 1.624, "step": 870 }, { "epoch": 129.92592592592592, "eval_accuracy": 0.8645833333333334, "eval_loss": 0.3289691209793091, "eval_runtime": 1.8689, "eval_samples_per_second": 51.368, "eval_steps_per_second": 1.605, "step": 877 }, { "epoch": 130.37037037037038, "grad_norm": 12.271383285522461, "learning_rate": 1.234567901234568e-06, "loss": 0.0478, "step": 880 }, { "epoch": 130.96296296296296, "eval_accuracy": 0.875, "eval_loss": 0.325620174407959, "eval_runtime": 1.8869, "eval_samples_per_second": 50.876, "eval_steps_per_second": 1.59, "step": 884 }, { "epoch": 131.85185185185185, "grad_norm": 8.459063529968262, "learning_rate": 6.17283950617284e-07, "loss": 0.0434, "step": 890 }, { "epoch": 132.0, "eval_accuracy": 0.875, "eval_loss": 0.32529863715171814, "eval_runtime": 1.8653, "eval_samples_per_second": 51.466, "eval_steps_per_second": 1.608, "step": 891 }, { "epoch": 132.88888888888889, "eval_accuracy": 0.875, "eval_loss": 0.325122594833374, "eval_runtime": 1.835, "eval_samples_per_second": 52.315, "eval_steps_per_second": 1.635, "step": 897 }, { "epoch": 133.33333333333334, "grad_norm": 9.571310997009277, "learning_rate": 0.0, "loss": 0.0374, "step": 900 }, { "epoch": 133.33333333333334, "eval_accuracy": 0.875, "eval_loss": 0.3251601457595825, "eval_runtime": 1.8591, "eval_samples_per_second": 51.637, "eval_steps_per_second": 1.614, "step": 900 }, { "epoch": 133.33333333333334, "step": 900, "total_flos": 2.900008367869133e+18, "train_loss": 0.254776107304626, "train_runtime": 3168.008, "train_samples_per_second": 40.909, "train_steps_per_second": 0.284 } ], "logging_steps": 10, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.900008367869133e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }