{ "best_metric": null, "best_model_checkpoint": null, "epoch": 34.628019323671495, "eval_steps": 500, "global_step": 2240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 0.0001, "loss": 2.2092, "step": 5 }, { "epoch": 0.15, "learning_rate": 0.0001, "loss": 1.4086, "step": 10 }, { "epoch": 0.23, "learning_rate": 0.0001, "loss": 1.1906, "step": 15 }, { "epoch": 0.31, "learning_rate": 0.0001, "loss": 1.1447, "step": 20 }, { "epoch": 0.39, "learning_rate": 0.0001, "loss": 1.1125, "step": 25 }, { "epoch": 0.46, "learning_rate": 0.0001, "loss": 1.1011, "step": 30 }, { "epoch": 0.54, "learning_rate": 0.0001, "loss": 1.0439, "step": 35 }, { "epoch": 0.62, "learning_rate": 0.0001, "loss": 1.0341, "step": 40 }, { "epoch": 0.7, "learning_rate": 0.0001, "loss": 1.0454, "step": 45 }, { "epoch": 0.77, "learning_rate": 0.0001, "loss": 1.0261, "step": 50 }, { "epoch": 0.85, "learning_rate": 0.0001, "loss": 1.0393, "step": 55 }, { "epoch": 0.93, "learning_rate": 0.0001, "loss": 1.0407, "step": 60 }, { "epoch": 1.0, "learning_rate": 0.0001, "loss": 0.9819, "step": 65 }, { "epoch": 1.08, "learning_rate": 0.0001, "loss": 0.7776, "step": 70 }, { "epoch": 1.16, "learning_rate": 0.0001, "loss": 0.742, "step": 75 }, { "epoch": 1.24, "learning_rate": 0.0001, "loss": 0.7434, "step": 80 }, { "epoch": 1.31, "learning_rate": 0.0001, "loss": 0.7963, "step": 85 }, { "epoch": 1.39, "learning_rate": 0.0001, "loss": 0.7459, "step": 90 }, { "epoch": 1.47, "learning_rate": 0.0001, "loss": 0.7714, "step": 95 }, { "epoch": 1.55, "learning_rate": 0.0001, "loss": 0.7663, "step": 100 }, { "epoch": 1.62, "learning_rate": 0.0001, "loss": 0.7458, "step": 105 }, { "epoch": 1.7, "learning_rate": 0.0001, "loss": 0.7591, "step": 110 }, { "epoch": 1.78, "learning_rate": 0.0001, "loss": 0.7567, "step": 115 }, { "epoch": 1.86, "learning_rate": 0.0001, "loss": 0.7394, "step": 120 }, { "epoch": 1.93, "learning_rate": 0.0001, "loss": 0.7631, "step": 125 }, { "epoch": 2.01, "learning_rate": 0.0001, "loss": 0.7383, "step": 130 }, { "epoch": 2.09, "learning_rate": 0.0001, "loss": 0.5741, "step": 135 }, { "epoch": 2.16, "learning_rate": 0.0001, "loss": 0.5493, "step": 140 }, { "epoch": 2.24, "learning_rate": 0.0001, "loss": 0.5863, "step": 145 }, { "epoch": 2.32, "learning_rate": 0.0001, "loss": 0.5505, "step": 150 }, { "epoch": 2.4, "learning_rate": 0.0001, "loss": 0.545, "step": 155 }, { "epoch": 2.47, "learning_rate": 0.0001, "loss": 0.5583, "step": 160 }, { "epoch": 2.55, "learning_rate": 0.0001, "loss": 0.5454, "step": 165 }, { "epoch": 2.63, "learning_rate": 0.0001, "loss": 0.551, "step": 170 }, { "epoch": 2.71, "learning_rate": 0.0001, "loss": 0.5609, "step": 175 }, { "epoch": 2.78, "learning_rate": 0.0001, "loss": 0.5414, "step": 180 }, { "epoch": 2.86, "learning_rate": 0.0001, "loss": 0.6485, "step": 185 }, { "epoch": 2.94, "learning_rate": 0.0001, "loss": 0.5638, "step": 190 }, { "epoch": 3.01, "learning_rate": 0.0001, "loss": 0.5406, "step": 195 }, { "epoch": 3.09, "learning_rate": 0.0001, "loss": 0.4256, "step": 200 }, { "epoch": 3.17, "learning_rate": 0.0001, "loss": 0.3931, "step": 205 }, { "epoch": 3.25, "learning_rate": 0.0001, "loss": 0.4069, "step": 210 }, { "epoch": 3.32, "learning_rate": 0.0001, "loss": 0.4141, "step": 215 }, { "epoch": 3.4, "learning_rate": 0.0001, "loss": 0.4716, "step": 220 }, { "epoch": 3.48, "learning_rate": 0.0001, "loss": 0.4219, "step": 225 }, { "epoch": 3.56, "learning_rate": 0.0001, "loss": 0.4143, "step": 230 }, { "epoch": 3.63, "learning_rate": 0.0001, "loss": 0.4139, "step": 235 }, { "epoch": 3.71, "learning_rate": 0.0001, "loss": 0.4412, "step": 240 }, { "epoch": 3.79, "learning_rate": 0.0001, "loss": 0.4283, "step": 245 }, { "epoch": 3.86, "learning_rate": 0.0001, "loss": 0.4261, "step": 250 }, { "epoch": 3.94, "learning_rate": 0.0001, "loss": 0.4304, "step": 255 }, { "epoch": 4.02, "learning_rate": 0.0001, "loss": 0.4093, "step": 260 }, { "epoch": 4.1, "learning_rate": 0.0001, "loss": 0.3088, "step": 265 }, { "epoch": 4.17, "learning_rate": 0.0001, "loss": 0.3017, "step": 270 }, { "epoch": 4.25, "learning_rate": 0.0001, "loss": 0.3169, "step": 275 }, { "epoch": 4.33, "learning_rate": 0.0001, "loss": 0.3077, "step": 280 }, { "epoch": 4.41, "learning_rate": 0.0001, "loss": 0.3131, "step": 285 }, { "epoch": 4.48, "learning_rate": 0.0001, "loss": 0.3239, "step": 290 }, { "epoch": 4.56, "learning_rate": 0.0001, "loss": 0.3428, "step": 295 }, { "epoch": 4.64, "learning_rate": 0.0001, "loss": 0.3267, "step": 300 }, { "epoch": 4.71, "learning_rate": 0.0001, "loss": 0.3194, "step": 305 }, { "epoch": 4.79, "learning_rate": 0.0001, "loss": 0.3276, "step": 310 }, { "epoch": 4.87, "learning_rate": 0.0001, "loss": 0.3314, "step": 315 }, { "epoch": 4.95, "learning_rate": 0.0001, "loss": 0.4401, "step": 320 }, { "epoch": 5.02, "learning_rate": 0.0001, "loss": 0.3017, "step": 325 }, { "epoch": 5.1, "learning_rate": 0.0001, "loss": 0.2653, "step": 330 }, { "epoch": 5.18, "learning_rate": 0.0001, "loss": 0.293, "step": 335 }, { "epoch": 5.26, "learning_rate": 0.0001, "loss": 0.2381, "step": 340 }, { "epoch": 5.33, "learning_rate": 0.0001, "loss": 0.245, "step": 345 }, { "epoch": 5.41, "learning_rate": 0.0001, "loss": 0.2307, "step": 350 }, { "epoch": 5.49, "learning_rate": 0.0001, "loss": 0.2469, "step": 355 }, { "epoch": 5.57, "learning_rate": 0.0001, "loss": 0.241, "step": 360 }, { "epoch": 5.64, "learning_rate": 0.0001, "loss": 0.2494, "step": 365 }, { "epoch": 5.72, "learning_rate": 0.0001, "loss": 0.2509, "step": 370 }, { "epoch": 5.8, "learning_rate": 0.0001, "loss": 0.2459, "step": 375 }, { "epoch": 5.87, "learning_rate": 0.0001, "loss": 0.2524, "step": 380 }, { "epoch": 5.95, "learning_rate": 0.0001, "loss": 0.2613, "step": 385 }, { "epoch": 6.03, "learning_rate": 0.0001, "loss": 0.2321, "step": 390 }, { "epoch": 6.11, "learning_rate": 0.0001, "loss": 0.1857, "step": 395 }, { "epoch": 6.18, "learning_rate": 0.0001, "loss": 0.1834, "step": 400 }, { "epoch": 6.26, "learning_rate": 0.0001, "loss": 0.1888, "step": 405 }, { "epoch": 6.34, "learning_rate": 0.0001, "loss": 0.1857, "step": 410 }, { "epoch": 6.42, "learning_rate": 0.0001, "loss": 0.1901, "step": 415 }, { "epoch": 6.49, "learning_rate": 0.0001, "loss": 0.2473, "step": 420 }, { "epoch": 6.57, "learning_rate": 0.0001, "loss": 0.1957, "step": 425 }, { "epoch": 6.65, "learning_rate": 0.0001, "loss": 0.1991, "step": 430 }, { "epoch": 6.72, "learning_rate": 0.0001, "loss": 0.2053, "step": 435 }, { "epoch": 6.8, "learning_rate": 0.0001, "loss": 0.196, "step": 440 }, { "epoch": 6.88, "learning_rate": 0.0001, "loss": 0.2074, "step": 445 }, { "epoch": 6.96, "learning_rate": 0.0001, "loss": 0.2103, "step": 450 }, { "epoch": 7.03, "learning_rate": 0.0001, "loss": 0.2051, "step": 455 }, { "epoch": 7.11, "learning_rate": 0.0001, "loss": 0.1453, "step": 460 }, { "epoch": 7.19, "learning_rate": 0.0001, "loss": 0.1502, "step": 465 }, { "epoch": 7.27, "learning_rate": 0.0001, "loss": 0.1432, "step": 470 }, { "epoch": 7.34, "learning_rate": 0.0001, "loss": 0.1494, "step": 475 }, { "epoch": 7.42, "learning_rate": 0.0001, "loss": 0.1475, "step": 480 }, { "epoch": 7.5, "learning_rate": 0.0001, "loss": 0.153, "step": 485 }, { "epoch": 7.57, "learning_rate": 0.0001, "loss": 0.1525, "step": 490 }, { "epoch": 7.65, "learning_rate": 0.0001, "loss": 0.1604, "step": 495 }, { "epoch": 7.73, "learning_rate": 0.0001, "loss": 0.206, "step": 500 }, { "epoch": 7.81, "learning_rate": 0.0001, "loss": 0.1656, "step": 505 }, { "epoch": 7.88, "learning_rate": 0.0001, "loss": 0.2155, "step": 510 }, { "epoch": 7.96, "learning_rate": 0.0001, "loss": 0.1727, "step": 515 }, { "epoch": 8.04, "learning_rate": 0.0001, "loss": 0.1458, "step": 520 }, { "epoch": 8.12, "learning_rate": 0.0001, "loss": 0.1598, "step": 525 }, { "epoch": 8.19, "learning_rate": 0.0001, "loss": 0.1173, "step": 530 }, { "epoch": 8.27, "learning_rate": 0.0001, "loss": 0.12, "step": 535 }, { "epoch": 8.35, "learning_rate": 0.0001, "loss": 0.1216, "step": 540 }, { "epoch": 8.43, "learning_rate": 0.0001, "loss": 0.1242, "step": 545 }, { "epoch": 8.5, "learning_rate": 0.0001, "loss": 0.126, "step": 550 }, { "epoch": 8.58, "learning_rate": 0.0001, "loss": 0.1706, "step": 555 }, { "epoch": 8.66, "learning_rate": 0.0001, "loss": 0.1386, "step": 560 }, { "epoch": 8.73, "learning_rate": 0.0001, "loss": 0.1341, "step": 565 }, { "epoch": 8.81, "learning_rate": 0.0001, "loss": 0.1466, "step": 570 }, { "epoch": 8.89, "learning_rate": 0.0001, "loss": 0.1395, "step": 575 }, { "epoch": 8.97, "learning_rate": 0.0001, "loss": 0.1403, "step": 580 }, { "epoch": 9.04, "learning_rate": 0.0001, "loss": 0.1172, "step": 585 }, { "epoch": 9.12, "learning_rate": 0.0001, "loss": 0.0994, "step": 590 }, { "epoch": 9.2, "learning_rate": 0.0001, "loss": 0.1263, "step": 595 }, { "epoch": 9.28, "learning_rate": 0.0001, "loss": 0.1073, "step": 600 }, { "epoch": 9.35, "learning_rate": 0.0001, "loss": 0.1062, "step": 605 }, { "epoch": 9.43, "learning_rate": 0.0001, "loss": 0.1072, "step": 610 }, { "epoch": 9.51, "learning_rate": 0.0001, "loss": 0.129, "step": 615 }, { "epoch": 9.58, "learning_rate": 0.0001, "loss": 0.1103, "step": 620 }, { "epoch": 9.66, "learning_rate": 0.0001, "loss": 0.14, "step": 625 }, { "epoch": 9.74, "learning_rate": 0.0001, "loss": 0.1138, "step": 630 }, { "epoch": 9.82, "learning_rate": 0.0001, "loss": 0.1136, "step": 635 }, { "epoch": 9.89, "learning_rate": 0.0001, "loss": 0.1161, "step": 640 }, { "epoch": 9.97, "learning_rate": 0.0001, "loss": 0.1126, "step": 645 }, { "epoch": 10.05, "learning_rate": 0.0001, "loss": 0.0942, "step": 650 }, { "epoch": 10.13, "learning_rate": 0.0001, "loss": 0.1245, "step": 655 }, { "epoch": 10.2, "learning_rate": 0.0001, "loss": 0.0892, "step": 660 }, { "epoch": 10.28, "learning_rate": 0.0001, "loss": 0.1198, "step": 665 }, { "epoch": 10.36, "learning_rate": 0.0001, "loss": 0.0929, "step": 670 }, { "epoch": 10.43, "learning_rate": 0.0001, "loss": 0.0923, "step": 675 }, { "epoch": 10.51, "learning_rate": 0.0001, "loss": 0.0962, "step": 680 }, { "epoch": 10.59, "learning_rate": 0.0001, "loss": 0.0945, "step": 685 }, { "epoch": 10.67, "learning_rate": 0.0001, "loss": 0.0986, "step": 690 }, { "epoch": 10.74, "learning_rate": 0.0001, "loss": 0.0982, "step": 695 }, { "epoch": 10.82, "learning_rate": 0.0001, "loss": 0.0982, "step": 700 }, { "epoch": 10.9, "learning_rate": 0.0001, "loss": 0.1011, "step": 705 }, { "epoch": 10.98, "learning_rate": 0.0001, "loss": 0.1037, "step": 710 }, { "epoch": 11.05, "learning_rate": 0.0001, "loss": 0.0817, "step": 715 }, { "epoch": 11.13, "learning_rate": 0.0001, "loss": 0.0979, "step": 720 }, { "epoch": 11.21, "learning_rate": 0.0001, "loss": 0.0738, "step": 725 }, { "epoch": 11.29, "learning_rate": 0.0001, "loss": 0.0757, "step": 730 }, { "epoch": 11.36, "learning_rate": 0.0001, "loss": 0.0795, "step": 735 }, { "epoch": 11.44, "learning_rate": 0.0001, "loss": 0.0762, "step": 740 }, { "epoch": 11.52, "learning_rate": 0.0001, "loss": 0.0863, "step": 745 }, { "epoch": 11.59, "learning_rate": 0.0001, "loss": 0.0784, "step": 750 }, { "epoch": 11.67, "learning_rate": 0.0001, "loss": 0.0828, "step": 755 }, { "epoch": 11.75, "learning_rate": 0.0001, "loss": 0.0812, "step": 760 }, { "epoch": 11.83, "learning_rate": 0.0001, "loss": 0.0838, "step": 765 }, { "epoch": 11.9, "learning_rate": 0.0001, "loss": 0.089, "step": 770 }, { "epoch": 11.98, "learning_rate": 0.0001, "loss": 0.1106, "step": 775 }, { "epoch": 12.06, "learning_rate": 0.0001, "loss": 0.0737, "step": 780 }, { "epoch": 12.14, "learning_rate": 0.0001, "loss": 0.0666, "step": 785 }, { "epoch": 12.21, "learning_rate": 0.0001, "loss": 0.0675, "step": 790 }, { "epoch": 12.29, "learning_rate": 0.0001, "loss": 0.0851, "step": 795 }, { "epoch": 12.37, "learning_rate": 0.0001, "loss": 0.0681, "step": 800 }, { "epoch": 12.44, "learning_rate": 0.0001, "loss": 0.0708, "step": 805 }, { "epoch": 12.52, "learning_rate": 0.0001, "loss": 0.0693, "step": 810 }, { "epoch": 12.6, "learning_rate": 0.0001, "loss": 0.0693, "step": 815 }, { "epoch": 12.68, "learning_rate": 0.0001, "loss": 0.1068, "step": 820 }, { "epoch": 12.75, "learning_rate": 0.0001, "loss": 0.07, "step": 825 }, { "epoch": 12.83, "learning_rate": 0.0001, "loss": 0.0871, "step": 830 }, { "epoch": 12.91, "learning_rate": 0.0001, "loss": 0.0743, "step": 835 }, { "epoch": 12.99, "learning_rate": 0.0001, "loss": 0.0722, "step": 840 }, { "epoch": 13.06, "learning_rate": 0.0001, "loss": 0.0593, "step": 845 }, { "epoch": 13.14, "learning_rate": 0.0001, "loss": 0.0549, "step": 850 }, { "epoch": 13.22, "learning_rate": 0.0001, "loss": 0.0548, "step": 855 }, { "epoch": 13.29, "learning_rate": 0.0001, "loss": 0.0582, "step": 860 }, { "epoch": 13.37, "learning_rate": 0.0001, "loss": 0.0579, "step": 865 }, { "epoch": 13.45, "learning_rate": 0.0001, "loss": 0.0601, "step": 870 }, { "epoch": 13.53, "learning_rate": 0.0001, "loss": 0.0576, "step": 875 }, { "epoch": 13.6, "learning_rate": 0.0001, "loss": 0.0594, "step": 880 }, { "epoch": 13.68, "learning_rate": 0.0001, "loss": 0.0605, "step": 885 }, { "epoch": 13.76, "learning_rate": 0.0001, "loss": 0.0732, "step": 890 }, { "epoch": 13.84, "learning_rate": 0.0001, "loss": 0.0652, "step": 895 }, { "epoch": 13.91, "learning_rate": 0.0001, "loss": 0.0628, "step": 900 }, { "epoch": 13.99, "learning_rate": 0.0001, "loss": 0.09, "step": 905 }, { "epoch": 14.07, "learning_rate": 0.0001, "loss": 0.0564, "step": 910 }, { "epoch": 14.14, "learning_rate": 0.0001, "loss": 0.0481, "step": 915 }, { "epoch": 14.22, "learning_rate": 0.0001, "loss": 0.048, "step": 920 }, { "epoch": 14.3, "learning_rate": 0.0001, "loss": 0.0468, "step": 925 }, { "epoch": 14.38, "learning_rate": 0.0001, "loss": 0.0989, "step": 930 }, { "epoch": 14.45, "learning_rate": 0.0001, "loss": 0.0497, "step": 935 }, { "epoch": 14.53, "learning_rate": 0.0001, "loss": 0.0495, "step": 940 }, { "epoch": 14.61, "learning_rate": 0.0001, "loss": 0.0499, "step": 945 }, { "epoch": 14.69, "learning_rate": 0.0001, "loss": 0.049, "step": 950 }, { "epoch": 14.76, "learning_rate": 0.0001, "loss": 0.0629, "step": 955 }, { "epoch": 14.84, "learning_rate": 0.0001, "loss": 0.0536, "step": 960 }, { "epoch": 14.92, "learning_rate": 0.0001, "loss": 0.0515, "step": 965 }, { "epoch": 15.0, "learning_rate": 0.0001, "loss": 0.0679, "step": 970 }, { "epoch": 15.07, "learning_rate": 0.0001, "loss": 0.04, "step": 975 }, { "epoch": 15.15, "learning_rate": 0.0001, "loss": 0.0596, "step": 980 }, { "epoch": 15.23, "learning_rate": 0.0001, "loss": 0.0742, "step": 985 }, { "epoch": 15.3, "learning_rate": 0.0001, "loss": 0.0693, "step": 990 }, { "epoch": 15.38, "learning_rate": 0.0001, "loss": 0.0414, "step": 995 }, { "epoch": 15.46, "learning_rate": 0.0001, "loss": 0.0442, "step": 1000 }, { "epoch": 15.54, "learning_rate": 0.0001, "loss": 0.0409, "step": 1005 }, { "epoch": 15.61, "learning_rate": 0.0001, "loss": 0.04, "step": 1010 }, { "epoch": 15.69, "learning_rate": 0.0001, "loss": 0.0414, "step": 1015 }, { "epoch": 15.77, "learning_rate": 0.0001, "loss": 0.0393, "step": 1020 }, { "epoch": 15.85, "learning_rate": 0.0001, "loss": 0.0398, "step": 1025 }, { "epoch": 15.92, "learning_rate": 0.0001, "loss": 0.0411, "step": 1030 }, { "epoch": 16.0, "learning_rate": 0.0001, "loss": 0.0407, "step": 1035 }, { "epoch": 16.08, "learning_rate": 0.0001, "loss": 0.0305, "step": 1040 }, { "epoch": 16.15, "learning_rate": 0.0001, "loss": 0.0338, "step": 1045 }, { "epoch": 16.23, "learning_rate": 0.0001, "loss": 0.0324, "step": 1050 }, { "epoch": 16.31, "learning_rate": 0.0001, "loss": 0.0326, "step": 1055 }, { "epoch": 16.39, "learning_rate": 0.0001, "loss": 0.0314, "step": 1060 }, { "epoch": 16.46, "learning_rate": 0.0001, "loss": 0.0332, "step": 1065 }, { "epoch": 16.54, "learning_rate": 0.0001, "loss": 0.0329, "step": 1070 }, { "epoch": 16.62, "learning_rate": 0.0001, "loss": 0.0406, "step": 1075 }, { "epoch": 16.7, "learning_rate": 0.0001, "loss": 0.0324, "step": 1080 }, { "epoch": 16.77, "learning_rate": 0.0001, "loss": 0.0324, "step": 1085 }, { "epoch": 16.85, "learning_rate": 0.0001, "loss": 0.0721, "step": 1090 }, { "epoch": 16.93, "learning_rate": 0.0001, "loss": 0.0344, "step": 1095 }, { "epoch": 17.0, "learning_rate": 0.0001, "loss": 0.0333, "step": 1100 }, { "epoch": 17.08, "learning_rate": 0.0001, "loss": 0.029, "step": 1105 }, { "epoch": 17.16, "learning_rate": 0.0001, "loss": 0.028, "step": 1110 }, { "epoch": 17.24, "learning_rate": 0.0001, "loss": 0.0288, "step": 1115 }, { "epoch": 17.31, "learning_rate": 0.0001, "loss": 0.0274, "step": 1120 }, { "epoch": 17.39, "learning_rate": 0.0001, "loss": 0.0278, "step": 1125 }, { "epoch": 17.47, "learning_rate": 0.0001, "loss": 0.0288, "step": 1130 }, { "epoch": 17.55, "learning_rate": 0.0001, "loss": 0.0414, "step": 1135 }, { "epoch": 17.62, "learning_rate": 0.0001, "loss": 0.0322, "step": 1140 }, { "epoch": 17.7, "learning_rate": 0.0001, "loss": 0.0303, "step": 1145 }, { "epoch": 17.78, "learning_rate": 0.0001, "loss": 0.0291, "step": 1150 }, { "epoch": 17.86, "learning_rate": 0.0001, "loss": 0.0411, "step": 1155 }, { "epoch": 17.93, "learning_rate": 0.0001, "loss": 0.0282, "step": 1160 }, { "epoch": 18.01, "learning_rate": 0.0001, "loss": 0.0285, "step": 1165 }, { "epoch": 18.09, "learning_rate": 0.0001, "loss": 0.0227, "step": 1170 }, { "epoch": 18.16, "learning_rate": 0.0001, "loss": 0.0232, "step": 1175 }, { "epoch": 18.24, "learning_rate": 0.0001, "loss": 0.0228, "step": 1180 }, { "epoch": 18.32, "learning_rate": 0.0001, "loss": 0.0332, "step": 1185 }, { "epoch": 18.4, "learning_rate": 0.0001, "loss": 0.0226, "step": 1190 }, { "epoch": 18.47, "learning_rate": 0.0001, "loss": 0.0367, "step": 1195 }, { "epoch": 18.55, "learning_rate": 0.0001, "loss": 0.0255, "step": 1200 }, { "epoch": 18.63, "learning_rate": 0.0001, "loss": 0.0354, "step": 1205 }, { "epoch": 18.71, "learning_rate": 0.0001, "loss": 0.0238, "step": 1210 }, { "epoch": 18.78, "learning_rate": 0.0001, "loss": 0.0262, "step": 1215 }, { "epoch": 18.86, "learning_rate": 0.0001, "loss": 0.0243, "step": 1220 }, { "epoch": 18.94, "learning_rate": 0.0001, "loss": 0.0241, "step": 1225 }, { "epoch": 19.01, "learning_rate": 0.0001, "loss": 0.0237, "step": 1230 }, { "epoch": 19.09, "learning_rate": 0.0001, "loss": 0.0207, "step": 1235 }, { "epoch": 19.17, "learning_rate": 0.0001, "loss": 0.0223, "step": 1240 }, { "epoch": 19.25, "learning_rate": 0.0001, "loss": 0.02, "step": 1245 }, { "epoch": 19.32, "learning_rate": 0.0001, "loss": 0.0205, "step": 1250 }, { "epoch": 19.4, "learning_rate": 0.0001, "loss": 0.0204, "step": 1255 }, { "epoch": 19.48, "learning_rate": 0.0001, "loss": 0.0195, "step": 1260 }, { "epoch": 19.56, "learning_rate": 0.0001, "loss": 0.0205, "step": 1265 }, { "epoch": 19.63, "learning_rate": 0.0001, "loss": 0.0222, "step": 1270 }, { "epoch": 19.71, "learning_rate": 0.0001, "loss": 0.0473, "step": 1275 }, { "epoch": 19.79, "learning_rate": 0.0001, "loss": 0.0216, "step": 1280 }, { "epoch": 19.86, "learning_rate": 0.0001, "loss": 0.0242, "step": 1285 }, { "epoch": 19.94, "learning_rate": 0.0001, "loss": 0.0209, "step": 1290 }, { "epoch": 20.02, "learning_rate": 0.0001, "loss": 0.0212, "step": 1295 }, { "epoch": 20.1, "learning_rate": 0.0001, "loss": 0.0243, "step": 1300 }, { "epoch": 20.17, "learning_rate": 0.0001, "loss": 0.0205, "step": 1305 }, { "epoch": 20.25, "learning_rate": 0.0001, "loss": 0.0197, "step": 1310 }, { "epoch": 20.33, "learning_rate": 0.0001, "loss": 0.0191, "step": 1315 }, { "epoch": 20.41, "learning_rate": 0.0001, "loss": 0.0186, "step": 1320 }, { "epoch": 20.48, "learning_rate": 0.0001, "loss": 0.0264, "step": 1325 }, { "epoch": 20.56, "learning_rate": 0.0001, "loss": 0.0194, "step": 1330 }, { "epoch": 20.64, "learning_rate": 0.0001, "loss": 0.0206, "step": 1335 }, { "epoch": 20.71, "learning_rate": 0.0001, "loss": 0.0203, "step": 1340 }, { "epoch": 20.79, "learning_rate": 0.0001, "loss": 0.019, "step": 1345 }, { "epoch": 20.87, "learning_rate": 0.0001, "loss": 0.0191, "step": 1350 }, { "epoch": 20.95, "learning_rate": 0.0001, "loss": 0.0262, "step": 1355 }, { "epoch": 21.02, "learning_rate": 0.0001, "loss": 0.0293, "step": 1360 }, { "epoch": 21.1, "learning_rate": 0.0001, "loss": 0.0169, "step": 1365 }, { "epoch": 21.18, "learning_rate": 0.0001, "loss": 0.0175, "step": 1370 }, { "epoch": 21.26, "learning_rate": 0.0001, "loss": 0.0175, "step": 1375 }, { "epoch": 21.33, "learning_rate": 0.0001, "loss": 0.0179, "step": 1380 }, { "epoch": 21.41, "learning_rate": 0.0001, "loss": 0.017, "step": 1385 }, { "epoch": 21.49, "learning_rate": 0.0001, "loss": 0.0211, "step": 1390 }, { "epoch": 21.57, "learning_rate": 0.0001, "loss": 0.0169, "step": 1395 }, { "epoch": 21.64, "learning_rate": 0.0001, "loss": 0.0168, "step": 1400 }, { "epoch": 21.72, "learning_rate": 0.0001, "loss": 0.0164, "step": 1405 }, { "epoch": 21.8, "learning_rate": 0.0001, "loss": 0.0298, "step": 1410 }, { "epoch": 21.87, "learning_rate": 0.0001, "loss": 0.02, "step": 1415 }, { "epoch": 21.95, "learning_rate": 0.0001, "loss": 0.0235, "step": 1420 }, { "epoch": 22.03, "learning_rate": 0.0001, "loss": 0.018, "step": 1425 }, { "epoch": 22.11, "learning_rate": 0.0001, "loss": 0.0164, "step": 1430 }, { "epoch": 22.18, "learning_rate": 0.0001, "loss": 0.0225, "step": 1435 }, { "epoch": 22.26, "learning_rate": 0.0001, "loss": 0.0167, "step": 1440 }, { "epoch": 22.34, "learning_rate": 0.0001, "loss": 0.024, "step": 1445 }, { "epoch": 22.42, "learning_rate": 0.0001, "loss": 0.0161, "step": 1450 }, { "epoch": 22.49, "learning_rate": 0.0001, "loss": 0.0224, "step": 1455 }, { "epoch": 22.57, "learning_rate": 0.0001, "loss": 0.0203, "step": 1460 }, { "epoch": 22.65, "learning_rate": 0.0001, "loss": 0.0169, "step": 1465 }, { "epoch": 22.72, "learning_rate": 0.0001, "loss": 0.0166, "step": 1470 }, { "epoch": 22.8, "learning_rate": 0.0001, "loss": 0.0163, "step": 1475 }, { "epoch": 22.88, "learning_rate": 0.0001, "loss": 0.0165, "step": 1480 }, { "epoch": 22.96, "learning_rate": 0.0001, "loss": 0.0155, "step": 1485 }, { "epoch": 23.03, "learning_rate": 0.0001, "loss": 0.0164, "step": 1490 }, { "epoch": 23.11, "learning_rate": 0.0001, "loss": 0.0148, "step": 1495 }, { "epoch": 23.19, "learning_rate": 0.0001, "loss": 0.0151, "step": 1500 }, { "epoch": 23.27, "learning_rate": 0.0001, "loss": 0.0176, "step": 1505 }, { "epoch": 23.34, "learning_rate": 0.0001, "loss": 0.0418, "step": 1510 }, { "epoch": 23.42, "learning_rate": 0.0001, "loss": 0.0155, "step": 1515 }, { "epoch": 23.5, "learning_rate": 0.0001, "loss": 0.0163, "step": 1520 }, { "epoch": 23.57, "learning_rate": 0.0001, "loss": 0.0161, "step": 1525 }, { "epoch": 23.65, "learning_rate": 0.0001, "loss": 0.0157, "step": 1530 }, { "epoch": 23.73, "learning_rate": 0.0001, "loss": 0.0219, "step": 1535 }, { "epoch": 23.81, "learning_rate": 0.0001, "loss": 0.0154, "step": 1540 }, { "epoch": 23.88, "learning_rate": 0.0001, "loss": 0.0149, "step": 1545 }, { "epoch": 23.96, "learning_rate": 0.0001, "loss": 0.0151, "step": 1550 }, { "epoch": 24.04, "learning_rate": 0.0001, "loss": 0.0138, "step": 1555 }, { "epoch": 24.12, "learning_rate": 0.0001, "loss": 0.0134, "step": 1560 }, { "epoch": 24.19, "learning_rate": 0.0001, "loss": 0.0349, "step": 1565 }, { "epoch": 24.27, "learning_rate": 0.0001, "loss": 0.0139, "step": 1570 }, { "epoch": 24.35, "learning_rate": 0.0001, "loss": 0.0138, "step": 1575 }, { "epoch": 24.43, "learning_rate": 0.0001, "loss": 0.014, "step": 1580 }, { "epoch": 24.5, "learning_rate": 0.0001, "loss": 0.0139, "step": 1585 }, { "epoch": 24.58, "learning_rate": 0.0001, "loss": 0.0137, "step": 1590 }, { "epoch": 24.66, "learning_rate": 0.0001, "loss": 0.0132, "step": 1595 }, { "epoch": 24.73, "learning_rate": 0.0001, "loss": 0.0133, "step": 1600 }, { "epoch": 24.81, "learning_rate": 0.0001, "loss": 0.027, "step": 1605 }, { "epoch": 24.89, "learning_rate": 0.0001, "loss": 0.0135, "step": 1610 }, { "epoch": 24.97, "learning_rate": 0.0001, "loss": 0.0141, "step": 1615 }, { "epoch": 25.04, "learning_rate": 0.0001, "loss": 0.0135, "step": 1620 }, { "epoch": 25.12, "learning_rate": 0.0001, "loss": 0.0119, "step": 1625 }, { "epoch": 25.2, "learning_rate": 0.0001, "loss": 0.0197, "step": 1630 }, { "epoch": 25.28, "learning_rate": 0.0001, "loss": 0.012, "step": 1635 }, { "epoch": 25.35, "learning_rate": 0.0001, "loss": 0.0119, "step": 1640 }, { "epoch": 25.43, "learning_rate": 0.0001, "loss": 0.0124, "step": 1645 }, { "epoch": 25.51, "learning_rate": 0.0001, "loss": 0.0121, "step": 1650 }, { "epoch": 25.58, "learning_rate": 0.0001, "loss": 0.0173, "step": 1655 }, { "epoch": 25.66, "learning_rate": 0.0001, "loss": 0.0153, "step": 1660 }, { "epoch": 25.74, "learning_rate": 0.0001, "loss": 0.0127, "step": 1665 }, { "epoch": 25.82, "learning_rate": 0.0001, "loss": 0.0125, "step": 1670 }, { "epoch": 25.89, "learning_rate": 0.0001, "loss": 0.0124, "step": 1675 }, { "epoch": 25.97, "learning_rate": 0.0001, "loss": 0.0146, "step": 1680 }, { "epoch": 26.05, "learning_rate": 0.0001, "loss": 0.0169, "step": 1685 }, { "epoch": 26.13, "learning_rate": 0.0001, "loss": 0.0137, "step": 1690 }, { "epoch": 26.2, "learning_rate": 0.0001, "loss": 0.0147, "step": 1695 }, { "epoch": 26.28, "learning_rate": 0.0001, "loss": 0.0116, "step": 1700 }, { "epoch": 26.36, "learning_rate": 0.0001, "loss": 0.0135, "step": 1705 }, { "epoch": 26.43, "learning_rate": 0.0001, "loss": 0.0129, "step": 1710 }, { "epoch": 26.51, "learning_rate": 0.0001, "loss": 0.0124, "step": 1715 }, { "epoch": 26.59, "learning_rate": 0.0001, "loss": 0.0115, "step": 1720 }, { "epoch": 26.67, "learning_rate": 0.0001, "loss": 0.0119, "step": 1725 }, { "epoch": 26.74, "learning_rate": 0.0001, "loss": 0.0118, "step": 1730 }, { "epoch": 26.82, "learning_rate": 0.0001, "loss": 0.0123, "step": 1735 }, { "epoch": 26.9, "learning_rate": 0.0001, "loss": 0.0114, "step": 1740 }, { "epoch": 26.98, "learning_rate": 0.0001, "loss": 0.0119, "step": 1745 }, { "epoch": 27.05, "learning_rate": 0.0001, "loss": 0.011, "step": 1750 }, { "epoch": 27.13, "learning_rate": 0.0001, "loss": 0.0122, "step": 1755 }, { "epoch": 27.21, "learning_rate": 0.0001, "loss": 0.0104, "step": 1760 }, { "epoch": 27.29, "learning_rate": 0.0001, "loss": 0.011, "step": 1765 }, { "epoch": 27.36, "learning_rate": 0.0001, "loss": 0.0104, "step": 1770 }, { "epoch": 27.44, "learning_rate": 0.0001, "loss": 0.0105, "step": 1775 }, { "epoch": 27.52, "learning_rate": 0.0001, "loss": 0.0106, "step": 1780 }, { "epoch": 27.59, "learning_rate": 0.0001, "loss": 0.0138, "step": 1785 }, { "epoch": 27.67, "learning_rate": 0.0001, "loss": 0.0129, "step": 1790 }, { "epoch": 27.75, "learning_rate": 0.0001, "loss": 0.0109, "step": 1795 }, { "epoch": 27.83, "learning_rate": 0.0001, "loss": 0.0108, "step": 1800 }, { "epoch": 27.9, "learning_rate": 0.0001, "loss": 0.0108, "step": 1805 }, { "epoch": 27.98, "learning_rate": 0.0001, "loss": 0.0109, "step": 1810 }, { "epoch": 28.06, "learning_rate": 0.0001, "loss": 0.0098, "step": 1815 }, { "epoch": 28.14, "learning_rate": 0.0001, "loss": 0.0127, "step": 1820 }, { "epoch": 28.21, "learning_rate": 0.0001, "loss": 0.0095, "step": 1825 }, { "epoch": 28.29, "learning_rate": 0.0001, "loss": 0.0095, "step": 1830 }, { "epoch": 28.37, "learning_rate": 0.0001, "loss": 0.0095, "step": 1835 }, { "epoch": 28.44, "learning_rate": 0.0001, "loss": 0.0101, "step": 1840 }, { "epoch": 28.52, "learning_rate": 0.0001, "loss": 0.0105, "step": 1845 }, { "epoch": 28.6, "learning_rate": 0.0001, "loss": 0.0096, "step": 1850 }, { "epoch": 28.68, "learning_rate": 0.0001, "loss": 0.0101, "step": 1855 }, { "epoch": 28.75, "learning_rate": 0.0001, "loss": 0.0103, "step": 1860 }, { "epoch": 28.83, "learning_rate": 0.0001, "loss": 0.0103, "step": 1865 }, { "epoch": 28.91, "learning_rate": 0.0001, "loss": 0.0139, "step": 1870 }, { "epoch": 28.99, "learning_rate": 0.0001, "loss": 0.0104, "step": 1875 }, { "epoch": 29.06, "learning_rate": 0.0001, "loss": 0.0096, "step": 1880 }, { "epoch": 29.14, "learning_rate": 0.0001, "loss": 0.01, "step": 1885 }, { "epoch": 29.22, "learning_rate": 0.0001, "loss": 0.009, "step": 1890 }, { "epoch": 29.29, "learning_rate": 0.0001, "loss": 0.0097, "step": 1895 }, { "epoch": 29.37, "learning_rate": 0.0001, "loss": 0.0094, "step": 1900 }, { "epoch": 29.45, "learning_rate": 0.0001, "loss": 0.0311, "step": 1905 }, { "epoch": 29.53, "learning_rate": 0.0001, "loss": 0.0101, "step": 1910 }, { "epoch": 29.6, "learning_rate": 0.0001, "loss": 0.0103, "step": 1915 }, { "epoch": 29.68, "learning_rate": 0.0001, "loss": 0.012, "step": 1920 }, { "epoch": 29.76, "learning_rate": 0.0001, "loss": 0.01, "step": 1925 }, { "epoch": 29.84, "learning_rate": 0.0001, "loss": 0.0112, "step": 1930 }, { "epoch": 29.91, "learning_rate": 0.0001, "loss": 0.0101, "step": 1935 }, { "epoch": 29.99, "learning_rate": 0.0001, "loss": 0.0105, "step": 1940 }, { "epoch": 30.07, "learning_rate": 0.0001, "loss": 0.0093, "step": 1945 }, { "epoch": 30.14, "learning_rate": 0.0001, "loss": 0.0095, "step": 1950 }, { "epoch": 30.22, "learning_rate": 0.0001, "loss": 0.0089, "step": 1955 }, { "epoch": 30.3, "learning_rate": 0.0001, "loss": 0.0101, "step": 1960 }, { "epoch": 30.38, "learning_rate": 0.0001, "loss": 0.0094, "step": 1965 }, { "epoch": 30.45, "learning_rate": 0.0001, "loss": 0.0093, "step": 1970 }, { "epoch": 30.53, "learning_rate": 0.0001, "loss": 0.0094, "step": 1975 }, { "epoch": 30.61, "learning_rate": 0.0001, "loss": 0.0197, "step": 1980 }, { "epoch": 30.69, "learning_rate": 0.0001, "loss": 0.0103, "step": 1985 }, { "epoch": 30.76, "learning_rate": 0.0001, "loss": 0.0107, "step": 1990 }, { "epoch": 30.84, "learning_rate": 0.0001, "loss": 0.01, "step": 1995 }, { "epoch": 30.92, "learning_rate": 0.0001, "loss": 0.0112, "step": 2000 }, { "epoch": 31.0, "learning_rate": 0.0001, "loss": 0.0104, "step": 2005 }, { "epoch": 31.07, "learning_rate": 0.0001, "loss": 0.0096, "step": 2010 }, { "epoch": 31.15, "learning_rate": 0.0001, "loss": 0.0089, "step": 2015 }, { "epoch": 31.23, "learning_rate": 0.0001, "loss": 0.0085, "step": 2020 }, { "epoch": 31.3, "learning_rate": 0.0001, "loss": 0.0092, "step": 2025 }, { "epoch": 31.38, "learning_rate": 0.0001, "loss": 0.0087, "step": 2030 }, { "epoch": 31.46, "learning_rate": 0.0001, "loss": 0.0093, "step": 2035 }, { "epoch": 31.54, "learning_rate": 0.0001, "loss": 0.01, "step": 2040 }, { "epoch": 31.61, "learning_rate": 0.0001, "loss": 0.0088, "step": 2045 }, { "epoch": 31.69, "learning_rate": 0.0001, "loss": 0.0099, "step": 2050 }, { "epoch": 31.77, "learning_rate": 0.0001, "loss": 0.0211, "step": 2055 }, { "epoch": 31.85, "learning_rate": 0.0001, "loss": 0.0096, "step": 2060 }, { "epoch": 31.92, "learning_rate": 0.0001, "loss": 0.0093, "step": 2065 }, { "epoch": 32.0, "learning_rate": 0.0001, "loss": 0.01, "step": 2070 }, { "epoch": 32.08, "learning_rate": 0.0001, "loss": 0.0089, "step": 2075 }, { "epoch": 32.15, "learning_rate": 0.0001, "loss": 0.0088, "step": 2080 }, { "epoch": 32.23, "learning_rate": 0.0001, "loss": 0.0082, "step": 2085 }, { "epoch": 32.31, "learning_rate": 0.0001, "loss": 0.0084, "step": 2090 }, { "epoch": 32.39, "learning_rate": 0.0001, "loss": 0.0084, "step": 2095 }, { "epoch": 32.46, "learning_rate": 0.0001, "loss": 0.0088, "step": 2100 }, { "epoch": 32.54, "learning_rate": 0.0001, "loss": 0.0103, "step": 2105 }, { "epoch": 32.62, "learning_rate": 0.0001, "loss": 0.0087, "step": 2110 }, { "epoch": 32.7, "learning_rate": 0.0001, "loss": 0.0154, "step": 2115 }, { "epoch": 32.77, "learning_rate": 0.0001, "loss": 0.0093, "step": 2120 }, { "epoch": 32.85, "learning_rate": 0.0001, "loss": 0.009, "step": 2125 }, { "epoch": 32.93, "learning_rate": 0.0001, "loss": 0.0095, "step": 2130 }, { "epoch": 33.0, "learning_rate": 0.0001, "loss": 0.0093, "step": 2135 }, { "epoch": 33.08, "learning_rate": 0.0001, "loss": 0.0261, "step": 2140 }, { "epoch": 33.16, "learning_rate": 0.0001, "loss": 0.0095, "step": 2145 }, { "epoch": 33.24, "learning_rate": 0.0001, "loss": 0.0088, "step": 2150 }, { "epoch": 33.31, "learning_rate": 0.0001, "loss": 0.0091, "step": 2155 }, { "epoch": 33.39, "learning_rate": 0.0001, "loss": 0.0091, "step": 2160 }, { "epoch": 33.47, "learning_rate": 0.0001, "loss": 0.0092, "step": 2165 }, { "epoch": 33.55, "learning_rate": 0.0001, "loss": 0.0093, "step": 2170 }, { "epoch": 33.62, "learning_rate": 0.0001, "loss": 0.009, "step": 2175 }, { "epoch": 33.7, "learning_rate": 0.0001, "loss": 0.0093, "step": 2180 }, { "epoch": 33.78, "learning_rate": 0.0001, "loss": 0.0096, "step": 2185 }, { "epoch": 33.86, "learning_rate": 0.0001, "loss": 0.0089, "step": 2190 }, { "epoch": 33.93, "learning_rate": 0.0001, "loss": 0.0091, "step": 2195 }, { "epoch": 34.01, "learning_rate": 0.0001, "loss": 0.0098, "step": 2200 }, { "epoch": 34.09, "learning_rate": 0.0001, "loss": 0.0084, "step": 2205 }, { "epoch": 34.16, "learning_rate": 0.0001, "loss": 0.008, "step": 2210 }, { "epoch": 34.24, "learning_rate": 0.0001, "loss": 0.0085, "step": 2215 }, { "epoch": 34.32, "learning_rate": 0.0001, "loss": 0.0082, "step": 2220 }, { "epoch": 34.4, "learning_rate": 0.0001, "loss": 0.0077, "step": 2225 }, { "epoch": 34.47, "learning_rate": 0.0001, "loss": 0.0099, "step": 2230 }, { "epoch": 34.55, "learning_rate": 0.0001, "loss": 0.0192, "step": 2235 }, { "epoch": 34.63, "learning_rate": 0.0001, "loss": 0.0087, "step": 2240 }, { "epoch": 34.63, "step": 2240, "total_flos": 5.322709161266381e+17, "train_loss": 0.14071606248617172, "train_runtime": 19826.8664, "train_samples_per_second": 3.654, "train_steps_per_second": 0.113 } ], "logging_steps": 5, "max_steps": 2240, "num_train_epochs": 35, "save_steps": -2240, "total_flos": 5.322709161266381e+17, "trial_name": null, "trial_params": null }