{ "best_metric": 0.8260869565217391, "best_model_checkpoint": "swiftformer-xs-ve-U13-b-80\\checkpoint-136", "epoch": 73.84615384615384, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.92, "eval_accuracy": 0.2391304347826087, "eval_loss": 1.3858562707901, "eval_runtime": 0.6151, "eval_samples_per_second": 74.782, "eval_steps_per_second": 3.251, "step": 6 }, { "epoch": 1.54, "learning_rate": 4.166666666666667e-05, "loss": 1.3857, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.2826086956521739, "eval_loss": 1.383385419845581, "eval_runtime": 0.5892, "eval_samples_per_second": 78.066, "eval_steps_per_second": 3.394, "step": 13 }, { "epoch": 2.92, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.3788952827453613, "eval_runtime": 0.6135, "eval_samples_per_second": 74.979, "eval_steps_per_second": 3.26, "step": 19 }, { "epoch": 3.08, "learning_rate": 8.333333333333334e-05, "loss": 1.3767, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.15217391304347827, "eval_loss": 1.3666234016418457, "eval_runtime": 0.5852, "eval_samples_per_second": 78.61, "eval_steps_per_second": 3.418, "step": 26 }, { "epoch": 4.62, "learning_rate": 0.000125, "loss": 1.3226, "step": 30 }, { "epoch": 4.92, "eval_accuracy": 0.15217391304347827, "eval_loss": 1.356500506401062, "eval_runtime": 0.5925, "eval_samples_per_second": 77.636, "eval_steps_per_second": 3.375, "step": 32 }, { "epoch": 6.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.3901711702346802, "eval_runtime": 0.5997, "eval_samples_per_second": 76.711, "eval_steps_per_second": 3.335, "step": 39 }, { "epoch": 6.15, "learning_rate": 0.0001666666666666667, "loss": 1.1987, "step": 40 }, { "epoch": 6.92, "eval_accuracy": 0.21739130434782608, "eval_loss": 1.3712323904037476, "eval_runtime": 0.5757, "eval_samples_per_second": 79.903, "eval_steps_per_second": 3.474, "step": 45 }, { "epoch": 7.69, "learning_rate": 0.0001990740740740741, "loss": 1.1075, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.34782608695652173, "eval_loss": 1.3196704387664795, "eval_runtime": 0.5987, "eval_samples_per_second": 76.833, "eval_steps_per_second": 3.341, "step": 52 }, { "epoch": 8.92, "eval_accuracy": 0.3695652173913043, "eval_loss": 1.3648946285247803, "eval_runtime": 0.5855, "eval_samples_per_second": 78.566, "eval_steps_per_second": 3.416, "step": 58 }, { "epoch": 9.23, "learning_rate": 0.00019444444444444446, "loss": 0.9988, "step": 60 }, { "epoch": 10.0, "eval_accuracy": 0.3695652173913043, "eval_loss": 1.2583398818969727, "eval_runtime": 0.6118, "eval_samples_per_second": 75.188, "eval_steps_per_second": 3.269, "step": 65 }, { "epoch": 10.77, "learning_rate": 0.00018981481481481483, "loss": 0.8863, "step": 70 }, { "epoch": 10.92, "eval_accuracy": 0.3695652173913043, "eval_loss": 1.2484155893325806, "eval_runtime": 0.5738, "eval_samples_per_second": 80.169, "eval_steps_per_second": 3.486, "step": 71 }, { "epoch": 12.0, "eval_accuracy": 0.41304347826086957, "eval_loss": 1.286899447441101, "eval_runtime": 0.6289, "eval_samples_per_second": 73.144, "eval_steps_per_second": 3.18, "step": 78 }, { "epoch": 12.31, "learning_rate": 0.0001851851851851852, "loss": 0.8228, "step": 80 }, { "epoch": 12.92, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.1677778959274292, "eval_runtime": 0.5616, "eval_samples_per_second": 81.902, "eval_steps_per_second": 3.561, "step": 84 }, { "epoch": 13.85, "learning_rate": 0.00018055555555555557, "loss": 0.7456, "step": 90 }, { "epoch": 14.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.0274546146392822, "eval_runtime": 0.5647, "eval_samples_per_second": 81.465, "eval_steps_per_second": 3.542, "step": 91 }, { "epoch": 14.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.9701868891716003, "eval_runtime": 0.5987, "eval_samples_per_second": 76.834, "eval_steps_per_second": 3.341, "step": 97 }, { "epoch": 15.38, "learning_rate": 0.00017592592592592595, "loss": 0.6595, "step": 100 }, { "epoch": 16.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9103245139122009, "eval_runtime": 0.5651, "eval_samples_per_second": 81.4, "eval_steps_per_second": 3.539, "step": 104 }, { "epoch": 16.92, "learning_rate": 0.00017129629629629632, "loss": 0.5995, "step": 110 }, { "epoch": 16.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8506172895431519, "eval_runtime": 0.6113, "eval_samples_per_second": 75.248, "eval_steps_per_second": 3.272, "step": 110 }, { "epoch": 18.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8514139652252197, "eval_runtime": 0.5699, "eval_samples_per_second": 80.723, "eval_steps_per_second": 3.51, "step": 117 }, { "epoch": 18.46, "learning_rate": 0.0001666666666666667, "loss": 0.5826, "step": 120 }, { "epoch": 18.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8963512182235718, "eval_runtime": 0.5992, "eval_samples_per_second": 76.774, "eval_steps_per_second": 3.338, "step": 123 }, { "epoch": 20.0, "learning_rate": 0.00016203703703703706, "loss": 0.4818, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8549758195877075, "eval_runtime": 0.5779, "eval_samples_per_second": 79.599, "eval_steps_per_second": 3.461, "step": 130 }, { "epoch": 20.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.7131932973861694, "eval_runtime": 0.5892, "eval_samples_per_second": 78.073, "eval_steps_per_second": 3.394, "step": 136 }, { "epoch": 21.54, "learning_rate": 0.00015740740740740743, "loss": 0.4553, "step": 140 }, { "epoch": 22.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6972543597221375, "eval_runtime": 0.6311, "eval_samples_per_second": 72.888, "eval_steps_per_second": 3.169, "step": 143 }, { "epoch": 22.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.749606728553772, "eval_runtime": 0.6069, "eval_samples_per_second": 75.792, "eval_steps_per_second": 3.295, "step": 149 }, { "epoch": 23.08, "learning_rate": 0.00015277777777777777, "loss": 0.4276, "step": 150 }, { "epoch": 24.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9087320566177368, "eval_runtime": 0.6537, "eval_samples_per_second": 70.367, "eval_steps_per_second": 3.059, "step": 156 }, { "epoch": 24.62, "learning_rate": 0.00014814814814814815, "loss": 0.3375, "step": 160 }, { "epoch": 24.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.7786810398101807, "eval_runtime": 0.588, "eval_samples_per_second": 78.229, "eval_steps_per_second": 3.401, "step": 162 }, { "epoch": 26.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7131972312927246, "eval_runtime": 0.5899, "eval_samples_per_second": 77.976, "eval_steps_per_second": 3.39, "step": 169 }, { "epoch": 26.15, "learning_rate": 0.00014351851851851852, "loss": 0.3199, "step": 170 }, { "epoch": 26.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7569522261619568, "eval_runtime": 0.5879, "eval_samples_per_second": 78.242, "eval_steps_per_second": 3.402, "step": 175 }, { "epoch": 27.69, "learning_rate": 0.0001388888888888889, "loss": 0.2756, "step": 180 }, { "epoch": 28.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.787326455116272, "eval_runtime": 0.5846, "eval_samples_per_second": 78.69, "eval_steps_per_second": 3.421, "step": 182 }, { "epoch": 28.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7894624471664429, "eval_runtime": 0.6067, "eval_samples_per_second": 75.814, "eval_steps_per_second": 3.296, "step": 188 }, { "epoch": 29.23, "learning_rate": 0.00013425925925925926, "loss": 0.2254, "step": 190 }, { "epoch": 30.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7443256378173828, "eval_runtime": 0.6221, "eval_samples_per_second": 73.946, "eval_steps_per_second": 3.215, "step": 195 }, { "epoch": 30.77, "learning_rate": 0.00012962962962962963, "loss": 0.2576, "step": 200 }, { "epoch": 30.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9623061418533325, "eval_runtime": 0.6026, "eval_samples_per_second": 76.332, "eval_steps_per_second": 3.319, "step": 201 }, { "epoch": 32.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7348794341087341, "eval_runtime": 0.5766, "eval_samples_per_second": 79.779, "eval_steps_per_second": 3.469, "step": 208 }, { "epoch": 32.31, "learning_rate": 0.000125, "loss": 0.2113, "step": 210 }, { "epoch": 32.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7886755466461182, "eval_runtime": 0.6103, "eval_samples_per_second": 75.378, "eval_steps_per_second": 3.277, "step": 214 }, { "epoch": 33.85, "learning_rate": 0.00012037037037037037, "loss": 0.1978, "step": 220 }, { "epoch": 34.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8920990228652954, "eval_runtime": 0.573, "eval_samples_per_second": 80.278, "eval_steps_per_second": 3.49, "step": 221 }, { "epoch": 34.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8102229833602905, "eval_runtime": 0.6072, "eval_samples_per_second": 75.756, "eval_steps_per_second": 3.294, "step": 227 }, { "epoch": 35.38, "learning_rate": 0.00011574074074074075, "loss": 0.2455, "step": 230 }, { "epoch": 36.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.894730269908905, "eval_runtime": 0.6442, "eval_samples_per_second": 71.406, "eval_steps_per_second": 3.105, "step": 234 }, { "epoch": 36.92, "learning_rate": 0.00011111111111111112, "loss": 0.1809, "step": 240 }, { "epoch": 36.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.8143943548202515, "eval_runtime": 0.5823, "eval_samples_per_second": 78.995, "eval_steps_per_second": 3.435, "step": 240 }, { "epoch": 38.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.829043984413147, "eval_runtime": 0.573, "eval_samples_per_second": 80.275, "eval_steps_per_second": 3.49, "step": 247 }, { "epoch": 38.46, "learning_rate": 0.00010648148148148149, "loss": 0.1967, "step": 250 }, { "epoch": 38.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8135195374488831, "eval_runtime": 0.5896, "eval_samples_per_second": 78.014, "eval_steps_per_second": 3.392, "step": 253 }, { "epoch": 40.0, "learning_rate": 0.00010185185185185186, "loss": 0.1608, "step": 260 }, { "epoch": 40.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8064969182014465, "eval_runtime": 0.6144, "eval_samples_per_second": 74.87, "eval_steps_per_second": 3.255, "step": 260 }, { "epoch": 40.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.739860475063324, "eval_runtime": 0.5769, "eval_samples_per_second": 79.736, "eval_steps_per_second": 3.467, "step": 266 }, { "epoch": 41.54, "learning_rate": 9.722222222222223e-05, "loss": 0.1704, "step": 270 }, { "epoch": 42.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7099208235740662, "eval_runtime": 0.5722, "eval_samples_per_second": 80.394, "eval_steps_per_second": 3.495, "step": 273 }, { "epoch": 42.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7569423913955688, "eval_runtime": 0.5853, "eval_samples_per_second": 78.596, "eval_steps_per_second": 3.417, "step": 279 }, { "epoch": 43.08, "learning_rate": 9.25925925925926e-05, "loss": 0.1682, "step": 280 }, { "epoch": 44.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.8458698987960815, "eval_runtime": 0.6104, "eval_samples_per_second": 75.362, "eval_steps_per_second": 3.277, "step": 286 }, { "epoch": 44.62, "learning_rate": 8.796296296296297e-05, "loss": 0.1607, "step": 290 }, { "epoch": 44.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7311373353004456, "eval_runtime": 0.5835, "eval_samples_per_second": 78.829, "eval_steps_per_second": 3.427, "step": 292 }, { "epoch": 46.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.783256471157074, "eval_runtime": 0.619, "eval_samples_per_second": 74.311, "eval_steps_per_second": 3.231, "step": 299 }, { "epoch": 46.15, "learning_rate": 8.333333333333334e-05, "loss": 0.1589, "step": 300 }, { "epoch": 46.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8072806596755981, "eval_runtime": 0.6425, "eval_samples_per_second": 71.594, "eval_steps_per_second": 3.113, "step": 305 }, { "epoch": 47.69, "learning_rate": 7.870370370370372e-05, "loss": 0.1524, "step": 310 }, { "epoch": 48.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7472575902938843, "eval_runtime": 0.6, "eval_samples_per_second": 76.668, "eval_steps_per_second": 3.333, "step": 312 }, { "epoch": 48.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6779573559761047, "eval_runtime": 0.5961, "eval_samples_per_second": 77.169, "eval_steps_per_second": 3.355, "step": 318 }, { "epoch": 49.23, "learning_rate": 7.407407407407407e-05, "loss": 0.1586, "step": 320 }, { "epoch": 50.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7573379874229431, "eval_runtime": 0.6092, "eval_samples_per_second": 75.504, "eval_steps_per_second": 3.283, "step": 325 }, { "epoch": 50.77, "learning_rate": 6.944444444444444e-05, "loss": 0.128, "step": 330 }, { "epoch": 50.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7614392638206482, "eval_runtime": 0.5939, "eval_samples_per_second": 77.451, "eval_steps_per_second": 3.367, "step": 331 }, { "epoch": 52.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7338166236877441, "eval_runtime": 0.6299, "eval_samples_per_second": 73.033, "eval_steps_per_second": 3.175, "step": 338 }, { "epoch": 52.31, "learning_rate": 6.481481481481482e-05, "loss": 0.1254, "step": 340 }, { "epoch": 52.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7666023373603821, "eval_runtime": 0.6065, "eval_samples_per_second": 75.848, "eval_steps_per_second": 3.298, "step": 344 }, { "epoch": 53.85, "learning_rate": 6.018518518518519e-05, "loss": 0.1206, "step": 350 }, { "epoch": 54.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8433414101600647, "eval_runtime": 0.6112, "eval_samples_per_second": 75.257, "eval_steps_per_second": 3.272, "step": 351 }, { "epoch": 54.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8747164011001587, "eval_runtime": 0.6225, "eval_samples_per_second": 73.894, "eval_steps_per_second": 3.213, "step": 357 }, { "epoch": 55.38, "learning_rate": 5.555555555555556e-05, "loss": 0.1398, "step": 360 }, { "epoch": 56.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8940105438232422, "eval_runtime": 0.5972, "eval_samples_per_second": 77.026, "eval_steps_per_second": 3.349, "step": 364 }, { "epoch": 56.92, "learning_rate": 5.092592592592593e-05, "loss": 0.1536, "step": 370 }, { "epoch": 56.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7780929803848267, "eval_runtime": 0.5998, "eval_samples_per_second": 76.693, "eval_steps_per_second": 3.334, "step": 370 }, { "epoch": 58.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7350872159004211, "eval_runtime": 0.6081, "eval_samples_per_second": 75.64, "eval_steps_per_second": 3.289, "step": 377 }, { "epoch": 58.46, "learning_rate": 4.62962962962963e-05, "loss": 0.1281, "step": 380 }, { "epoch": 58.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7600933313369751, "eval_runtime": 0.5822, "eval_samples_per_second": 79.008, "eval_steps_per_second": 3.435, "step": 383 }, { "epoch": 60.0, "learning_rate": 4.166666666666667e-05, "loss": 0.1156, "step": 390 }, { "epoch": 60.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7990931868553162, "eval_runtime": 0.6251, "eval_samples_per_second": 73.589, "eval_steps_per_second": 3.2, "step": 390 }, { "epoch": 60.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7775823473930359, "eval_runtime": 0.5922, "eval_samples_per_second": 77.678, "eval_steps_per_second": 3.377, "step": 396 }, { "epoch": 61.54, "learning_rate": 3.7037037037037037e-05, "loss": 0.0852, "step": 400 }, { "epoch": 62.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.78378826379776, "eval_runtime": 0.6126, "eval_samples_per_second": 75.09, "eval_steps_per_second": 3.265, "step": 403 }, { "epoch": 62.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7752338647842407, "eval_runtime": 0.594, "eval_samples_per_second": 77.443, "eval_steps_per_second": 3.367, "step": 409 }, { "epoch": 63.08, "learning_rate": 3.240740740740741e-05, "loss": 0.1106, "step": 410 }, { "epoch": 64.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7541300058364868, "eval_runtime": 0.6316, "eval_samples_per_second": 72.826, "eval_steps_per_second": 3.166, "step": 416 }, { "epoch": 64.62, "learning_rate": 2.777777777777778e-05, "loss": 0.0817, "step": 420 }, { "epoch": 64.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7536001801490784, "eval_runtime": 0.6469, "eval_samples_per_second": 71.106, "eval_steps_per_second": 3.092, "step": 422 }, { "epoch": 66.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8129211068153381, "eval_runtime": 0.5986, "eval_samples_per_second": 76.844, "eval_steps_per_second": 3.341, "step": 429 }, { "epoch": 66.15, "learning_rate": 2.314814814814815e-05, "loss": 0.1211, "step": 430 }, { "epoch": 66.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7883771061897278, "eval_runtime": 0.6108, "eval_samples_per_second": 75.306, "eval_steps_per_second": 3.274, "step": 435 }, { "epoch": 67.69, "learning_rate": 1.8518518518518518e-05, "loss": 0.0944, "step": 440 }, { "epoch": 68.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8011212348937988, "eval_runtime": 0.5979, "eval_samples_per_second": 76.93, "eval_steps_per_second": 3.345, "step": 442 }, { "epoch": 68.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.806803822517395, "eval_runtime": 0.5931, "eval_samples_per_second": 77.56, "eval_steps_per_second": 3.372, "step": 448 }, { "epoch": 69.23, "learning_rate": 1.388888888888889e-05, "loss": 0.1187, "step": 450 }, { "epoch": 70.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7795934081077576, "eval_runtime": 0.6317, "eval_samples_per_second": 72.821, "eval_steps_per_second": 3.166, "step": 455 }, { "epoch": 70.77, "learning_rate": 9.259259259259259e-06, "loss": 0.0935, "step": 460 }, { "epoch": 70.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.793422281742096, "eval_runtime": 0.5934, "eval_samples_per_second": 77.523, "eval_steps_per_second": 3.371, "step": 461 }, { "epoch": 72.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.736743152141571, "eval_runtime": 0.6082, "eval_samples_per_second": 75.631, "eval_steps_per_second": 3.288, "step": 468 }, { "epoch": 72.31, "learning_rate": 4.6296296296296296e-06, "loss": 0.109, "step": 470 }, { "epoch": 72.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7514945268630981, "eval_runtime": 0.604, "eval_samples_per_second": 76.157, "eval_steps_per_second": 3.311, "step": 474 }, { "epoch": 73.85, "learning_rate": 0.0, "loss": 0.1006, "step": 480 }, { "epoch": 73.85, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7887920141220093, "eval_runtime": 0.613, "eval_samples_per_second": 75.037, "eval_steps_per_second": 3.262, "step": 480 }, { "epoch": 73.85, "step": 480, "total_flos": 1.6581977329862246e+17, "train_loss": 0.38448232350250083, "train_runtime": 275.3303, "train_samples_per_second": 237.969, "train_steps_per_second": 1.743 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 1.6581977329862246e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }