{ "best_metric": null, "best_model_checkpoint": null, "epoch": 74.17746759720838, "eval_steps": 500, "global_step": 2325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "learning_rate": 0.0001, "loss": 2.4292, "step": 5 }, { "epoch": 0.32, "learning_rate": 0.0001, "loss": 1.6311, "step": 10 }, { "epoch": 0.48, "learning_rate": 0.0001, "loss": 1.4364, "step": 15 }, { "epoch": 0.64, "learning_rate": 0.0001, "loss": 1.3208, "step": 20 }, { "epoch": 0.8, "learning_rate": 0.0001, "loss": 1.2781, "step": 25 }, { "epoch": 0.96, "learning_rate": 0.0001, "loss": 1.2784, "step": 30 }, { "epoch": 1.12, "learning_rate": 0.0001, "loss": 1.1591, "step": 35 }, { "epoch": 1.28, "learning_rate": 0.0001, "loss": 1.1193, "step": 40 }, { "epoch": 1.44, "learning_rate": 0.0001, "loss": 1.1173, "step": 45 }, { "epoch": 1.6, "learning_rate": 0.0001, "loss": 1.0948, "step": 50 }, { "epoch": 1.75, "learning_rate": 0.0001, "loss": 1.0937, "step": 55 }, { "epoch": 1.91, "learning_rate": 0.0001, "loss": 1.0643, "step": 60 }, { "epoch": 2.07, "learning_rate": 0.0001, "loss": 1.0252, "step": 65 }, { "epoch": 2.23, "learning_rate": 0.0001, "loss": 0.9701, "step": 70 }, { "epoch": 2.39, "learning_rate": 0.0001, "loss": 0.9773, "step": 75 }, { "epoch": 2.55, "learning_rate": 0.0001, "loss": 0.9364, "step": 80 }, { "epoch": 2.71, "learning_rate": 0.0001, "loss": 0.955, "step": 85 }, { "epoch": 2.87, "learning_rate": 0.0001, "loss": 0.9752, "step": 90 }, { "epoch": 3.03, "learning_rate": 0.0001, "loss": 0.9614, "step": 95 }, { "epoch": 3.19, "learning_rate": 0.0001, "loss": 0.8711, "step": 100 }, { "epoch": 3.35, "learning_rate": 0.0001, "loss": 0.8728, "step": 105 }, { "epoch": 3.51, "learning_rate": 0.0001, "loss": 0.8734, "step": 110 }, { "epoch": 3.67, "learning_rate": 0.0001, "loss": 0.8769, "step": 115 }, { "epoch": 3.83, "learning_rate": 0.0001, "loss": 0.8714, "step": 120 }, { "epoch": 3.99, "learning_rate": 
0.0001, "loss": 0.8694, "step": 125 }, { "epoch": 4.15, "learning_rate": 0.0001, "loss": 0.8008, "step": 130 }, { "epoch": 4.31, "learning_rate": 0.0001, "loss": 0.8044, "step": 135 }, { "epoch": 4.47, "learning_rate": 0.0001, "loss": 0.7945, "step": 140 }, { "epoch": 4.63, "learning_rate": 0.0001, "loss": 0.7927, "step": 145 }, { "epoch": 4.79, "learning_rate": 0.0001, "loss": 0.7982, "step": 150 }, { "epoch": 4.95, "learning_rate": 0.0001, "loss": 0.7921, "step": 155 }, { "epoch": 5.1, "learning_rate": 0.0001, "loss": 0.7751, "step": 160 }, { "epoch": 5.26, "learning_rate": 0.0001, "loss": 0.7197, "step": 165 }, { "epoch": 5.42, "learning_rate": 0.0001, "loss": 0.7381, "step": 170 }, { "epoch": 5.58, "learning_rate": 0.0001, "loss": 0.7328, "step": 175 }, { "epoch": 5.74, "learning_rate": 0.0001, "loss": 0.7325, "step": 180 }, { "epoch": 5.9, "learning_rate": 0.0001, "loss": 0.7492, "step": 185 }, { "epoch": 6.06, "learning_rate": 0.0001, "loss": 0.7042, "step": 190 }, { "epoch": 6.22, "learning_rate": 0.0001, "loss": 0.6687, "step": 195 }, { "epoch": 6.38, "learning_rate": 0.0001, "loss": 0.6617, "step": 200 }, { "epoch": 6.54, "learning_rate": 0.0001, "loss": 0.6895, "step": 205 }, { "epoch": 6.7, "learning_rate": 0.0001, "loss": 0.6763, "step": 210 }, { "epoch": 6.86, "learning_rate": 0.0001, "loss": 0.6825, "step": 215 }, { "epoch": 7.02, "learning_rate": 0.0001, "loss": 0.6862, "step": 220 }, { "epoch": 7.18, "learning_rate": 0.0001, "loss": 0.6176, "step": 225 }, { "epoch": 7.34, "learning_rate": 0.0001, "loss": 0.6072, "step": 230 }, { "epoch": 7.5, "learning_rate": 0.0001, "loss": 0.6289, "step": 235 }, { "epoch": 7.66, "learning_rate": 0.0001, "loss": 0.6223, "step": 240 }, { "epoch": 7.82, "learning_rate": 0.0001, "loss": 0.6358, "step": 245 }, { "epoch": 7.98, "learning_rate": 0.0001, "loss": 0.6365, "step": 250 }, { "epoch": 8.14, "learning_rate": 0.0001, "loss": 0.5835, "step": 255 }, { "epoch": 8.3, "learning_rate": 0.0001, "loss": 0.5829, "step": 
260 }, { "epoch": 8.45, "learning_rate": 0.0001, "loss": 0.5816, "step": 265 }, { "epoch": 8.61, "learning_rate": 0.0001, "loss": 0.5595, "step": 270 }, { "epoch": 8.77, "learning_rate": 0.0001, "loss": 0.5902, "step": 275 }, { "epoch": 8.93, "learning_rate": 0.0001, "loss": 0.5717, "step": 280 }, { "epoch": 9.09, "learning_rate": 0.0001, "loss": 0.5584, "step": 285 }, { "epoch": 9.25, "learning_rate": 0.0001, "loss": 0.538, "step": 290 }, { "epoch": 9.41, "learning_rate": 0.0001, "loss": 0.5172, "step": 295 }, { "epoch": 9.57, "learning_rate": 0.0001, "loss": 0.5378, "step": 300 }, { "epoch": 9.73, "learning_rate": 0.0001, "loss": 0.5319, "step": 305 }, { "epoch": 9.89, "learning_rate": 0.0001, "loss": 0.529, "step": 310 }, { "epoch": 10.05, "learning_rate": 0.0001, "loss": 0.5236, "step": 315 }, { "epoch": 10.21, "learning_rate": 0.0001, "loss": 0.4818, "step": 320 }, { "epoch": 10.37, "learning_rate": 0.0001, "loss": 0.4856, "step": 325 }, { "epoch": 10.53, "learning_rate": 0.0001, "loss": 0.502, "step": 330 }, { "epoch": 10.69, "learning_rate": 0.0001, "loss": 0.482, "step": 335 }, { "epoch": 10.85, "learning_rate": 0.0001, "loss": 0.4957, "step": 340 }, { "epoch": 11.01, "learning_rate": 0.0001, "loss": 0.503, "step": 345 }, { "epoch": 11.17, "learning_rate": 0.0001, "loss": 0.4476, "step": 350 }, { "epoch": 11.33, "learning_rate": 0.0001, "loss": 0.4525, "step": 355 }, { "epoch": 11.49, "learning_rate": 0.0001, "loss": 0.4533, "step": 360 }, { "epoch": 11.65, "learning_rate": 0.0001, "loss": 0.4623, "step": 365 }, { "epoch": 11.8, "learning_rate": 0.0001, "loss": 0.459, "step": 370 }, { "epoch": 11.96, "learning_rate": 0.0001, "loss": 0.4563, "step": 375 }, { "epoch": 12.12, "learning_rate": 0.0001, "loss": 0.4408, "step": 380 }, { "epoch": 12.28, "learning_rate": 0.0001, "loss": 0.4123, "step": 385 }, { "epoch": 12.44, "learning_rate": 0.0001, "loss": 0.4203, "step": 390 }, { "epoch": 12.6, "learning_rate": 0.0001, "loss": 0.4283, "step": 395 }, { "epoch": 
12.76, "learning_rate": 0.0001, "loss": 0.4144, "step": 400 }, { "epoch": 12.92, "learning_rate": 0.0001, "loss": 0.4282, "step": 405 }, { "epoch": 13.08, "learning_rate": 0.0001, "loss": 0.4062, "step": 410 }, { "epoch": 13.24, "learning_rate": 0.0001, "loss": 0.393, "step": 415 }, { "epoch": 13.4, "learning_rate": 0.0001, "loss": 0.3828, "step": 420 }, { "epoch": 13.56, "learning_rate": 0.0001, "loss": 0.3871, "step": 425 }, { "epoch": 13.72, "learning_rate": 0.0001, "loss": 0.3957, "step": 430 }, { "epoch": 13.88, "learning_rate": 0.0001, "loss": 0.3931, "step": 435 }, { "epoch": 14.04, "learning_rate": 0.0001, "loss": 0.389, "step": 440 }, { "epoch": 14.2, "learning_rate": 0.0001, "loss": 0.3529, "step": 445 }, { "epoch": 14.36, "learning_rate": 0.0001, "loss": 0.3639, "step": 450 }, { "epoch": 14.52, "learning_rate": 0.0001, "loss": 0.3665, "step": 455 }, { "epoch": 14.68, "learning_rate": 0.0001, "loss": 0.3609, "step": 460 }, { "epoch": 14.84, "learning_rate": 0.0001, "loss": 0.3642, "step": 465 }, { "epoch": 15.0, "learning_rate": 0.0001, "loss": 0.3693, "step": 470 }, { "epoch": 15.15, "learning_rate": 0.0001, "loss": 0.3382, "step": 475 }, { "epoch": 15.31, "learning_rate": 0.0001, "loss": 0.3344, "step": 480 }, { "epoch": 15.47, "learning_rate": 0.0001, "loss": 0.3379, "step": 485 }, { "epoch": 15.63, "learning_rate": 0.0001, "loss": 0.3307, "step": 490 }, { "epoch": 15.79, "learning_rate": 0.0001, "loss": 0.3474, "step": 495 }, { "epoch": 15.95, "learning_rate": 0.0001, "loss": 0.3348, "step": 500 }, { "epoch": 16.11, "learning_rate": 0.0001, "loss": 0.3228, "step": 505 }, { "epoch": 16.27, "learning_rate": 0.0001, "loss": 0.3164, "step": 510 }, { "epoch": 16.43, "learning_rate": 0.0001, "loss": 0.3078, "step": 515 }, { "epoch": 16.59, "learning_rate": 0.0001, "loss": 0.3135, "step": 520 }, { "epoch": 16.75, "learning_rate": 0.0001, "loss": 0.3091, "step": 525 }, { "epoch": 16.91, "learning_rate": 0.0001, "loss": 0.3229, "step": 530 }, { "epoch": 17.07, 
"learning_rate": 0.0001, "loss": 0.3003, "step": 535 }, { "epoch": 17.23, "learning_rate": 0.0001, "loss": 0.2832, "step": 540 }, { "epoch": 17.39, "learning_rate": 0.0001, "loss": 0.2936, "step": 545 }, { "epoch": 17.55, "learning_rate": 0.0001, "loss": 0.2881, "step": 550 }, { "epoch": 17.71, "learning_rate": 0.0001, "loss": 0.2858, "step": 555 }, { "epoch": 17.87, "learning_rate": 0.0001, "loss": 0.2987, "step": 560 }, { "epoch": 18.03, "learning_rate": 0.0001, "loss": 0.2986, "step": 565 }, { "epoch": 18.19, "learning_rate": 0.0001, "loss": 0.2624, "step": 570 }, { "epoch": 18.34, "learning_rate": 0.0001, "loss": 0.2668, "step": 575 }, { "epoch": 18.5, "learning_rate": 0.0001, "loss": 0.2701, "step": 580 }, { "epoch": 18.66, "learning_rate": 0.0001, "loss": 0.2759, "step": 585 }, { "epoch": 18.82, "learning_rate": 0.0001, "loss": 0.273, "step": 590 }, { "epoch": 18.98, "learning_rate": 0.0001, "loss": 0.2776, "step": 595 }, { "epoch": 19.14, "learning_rate": 0.0001, "loss": 0.256, "step": 600 }, { "epoch": 19.3, "learning_rate": 0.0001, "loss": 0.2494, "step": 605 }, { "epoch": 19.46, "learning_rate": 0.0001, "loss": 0.2507, "step": 610 }, { "epoch": 19.62, "learning_rate": 0.0001, "loss": 0.2541, "step": 615 }, { "epoch": 19.78, "learning_rate": 0.0001, "loss": 0.2513, "step": 620 }, { "epoch": 19.94, "learning_rate": 0.0001, "loss": 0.2579, "step": 625 }, { "epoch": 20.1, "learning_rate": 0.0001, "loss": 0.2421, "step": 630 }, { "epoch": 20.26, "learning_rate": 0.0001, "loss": 0.2348, "step": 635 }, { "epoch": 20.42, "learning_rate": 0.0001, "loss": 0.2366, "step": 640 }, { "epoch": 20.58, "learning_rate": 0.0001, "loss": 0.2306, "step": 645 }, { "epoch": 20.74, "learning_rate": 0.0001, "loss": 0.2356, "step": 650 }, { "epoch": 20.9, "learning_rate": 0.0001, "loss": 0.2407, "step": 655 }, { "epoch": 21.06, "learning_rate": 0.0001, "loss": 0.2328, "step": 660 }, { "epoch": 21.22, "learning_rate": 0.0001, "loss": 0.2183, "step": 665 }, { "epoch": 21.38, 
"learning_rate": 0.0001, "loss": 0.2121, "step": 670 }, { "epoch": 21.54, "learning_rate": 0.0001, "loss": 0.2186, "step": 675 }, { "epoch": 21.69, "learning_rate": 0.0001, "loss": 0.22, "step": 680 }, { "epoch": 21.85, "learning_rate": 0.0001, "loss": 0.2266, "step": 685 }, { "epoch": 22.01, "learning_rate": 0.0001, "loss": 0.2245, "step": 690 }, { "epoch": 22.17, "learning_rate": 0.0001, "loss": 0.202, "step": 695 }, { "epoch": 22.33, "learning_rate": 0.0001, "loss": 0.1993, "step": 700 }, { "epoch": 22.49, "learning_rate": 0.0001, "loss": 0.2056, "step": 705 }, { "epoch": 22.65, "learning_rate": 0.0001, "loss": 0.2076, "step": 710 }, { "epoch": 22.81, "learning_rate": 0.0001, "loss": 0.208, "step": 715 }, { "epoch": 22.97, "learning_rate": 0.0001, "loss": 0.2134, "step": 720 }, { "epoch": 23.13, "learning_rate": 0.0001, "loss": 0.1929, "step": 725 }, { "epoch": 23.29, "learning_rate": 0.0001, "loss": 0.19, "step": 730 }, { "epoch": 23.45, "learning_rate": 0.0001, "loss": 0.191, "step": 735 }, { "epoch": 23.61, "learning_rate": 0.0001, "loss": 0.1923, "step": 740 }, { "epoch": 23.77, "learning_rate": 0.0001, "loss": 0.193, "step": 745 }, { "epoch": 23.93, "learning_rate": 0.0001, "loss": 0.1978, "step": 750 }, { "epoch": 24.09, "learning_rate": 0.0001, "loss": 0.1898, "step": 755 }, { "epoch": 24.25, "learning_rate": 0.0001, "loss": 0.1785, "step": 760 }, { "epoch": 24.41, "learning_rate": 0.0001, "loss": 0.1815, "step": 765 }, { "epoch": 24.57, "learning_rate": 0.0001, "loss": 0.1854, "step": 770 }, { "epoch": 24.73, "learning_rate": 0.0001, "loss": 0.1796, "step": 775 }, { "epoch": 24.89, "learning_rate": 0.0001, "loss": 0.1797, "step": 780 }, { "epoch": 25.04, "learning_rate": 0.0001, "loss": 0.1779, "step": 785 }, { "epoch": 25.2, "learning_rate": 0.0001, "loss": 0.1689, "step": 790 }, { "epoch": 25.36, "learning_rate": 0.0001, "loss": 0.1642, "step": 795 }, { "epoch": 25.52, "learning_rate": 0.0001, "loss": 0.1697, "step": 800 }, { "epoch": 25.68, 
"learning_rate": 0.0001, "loss": 0.1713, "step": 805 }, { "epoch": 25.84, "learning_rate": 0.0001, "loss": 0.175, "step": 810 }, { "epoch": 26.0, "learning_rate": 0.0001, "loss": 0.1736, "step": 815 }, { "epoch": 26.16, "learning_rate": 0.0001, "loss": 0.1598, "step": 820 }, { "epoch": 26.32, "learning_rate": 0.0001, "loss": 0.156, "step": 825 }, { "epoch": 26.48, "learning_rate": 0.0001, "loss": 0.1605, "step": 830 }, { "epoch": 26.64, "learning_rate": 0.0001, "loss": 0.1595, "step": 835 }, { "epoch": 26.8, "learning_rate": 0.0001, "loss": 0.1613, "step": 840 }, { "epoch": 26.96, "learning_rate": 0.0001, "loss": 0.1636, "step": 845 }, { "epoch": 27.12, "learning_rate": 0.0001, "loss": 0.1526, "step": 850 }, { "epoch": 27.28, "learning_rate": 0.0001, "loss": 0.1506, "step": 855 }, { "epoch": 27.44, "learning_rate": 0.0001, "loss": 0.1492, "step": 860 }, { "epoch": 27.6, "learning_rate": 0.0001, "loss": 0.1486, "step": 865 }, { "epoch": 27.76, "learning_rate": 0.0001, "loss": 0.1524, "step": 870 }, { "epoch": 27.92, "learning_rate": 0.0001, "loss": 0.1538, "step": 875 }, { "epoch": 28.08, "learning_rate": 0.0001, "loss": 0.1495, "step": 880 }, { "epoch": 28.24, "learning_rate": 0.0001, "loss": 0.1436, "step": 885 }, { "epoch": 28.39, "learning_rate": 0.0001, "loss": 0.1387, "step": 890 }, { "epoch": 28.55, "learning_rate": 0.0001, "loss": 0.1419, "step": 895 }, { "epoch": 28.71, "learning_rate": 0.0001, "loss": 0.144, "step": 900 }, { "epoch": 28.87, "learning_rate": 0.0001, "loss": 0.1421, "step": 905 }, { "epoch": 29.03, "learning_rate": 0.0001, "loss": 0.144, "step": 910 }, { "epoch": 29.19, "learning_rate": 0.0001, "loss": 0.1336, "step": 915 }, { "epoch": 29.35, "learning_rate": 0.0001, "loss": 0.1342, "step": 920 }, { "epoch": 29.51, "learning_rate": 0.0001, "loss": 0.1315, "step": 925 }, { "epoch": 29.67, "learning_rate": 0.0001, "loss": 0.134, "step": 930 }, { "epoch": 29.83, "learning_rate": 0.0001, "loss": 0.1385, "step": 935 }, { "epoch": 29.99, 
"learning_rate": 0.0001, "loss": 0.1396, "step": 940 }, { "epoch": 30.15, "learning_rate": 0.0001, "loss": 0.1261, "step": 945 }, { "epoch": 30.31, "learning_rate": 0.0001, "loss": 0.1254, "step": 950 }, { "epoch": 30.47, "learning_rate": 0.0001, "loss": 0.1274, "step": 955 }, { "epoch": 30.63, "learning_rate": 0.0001, "loss": 0.1273, "step": 960 }, { "epoch": 30.79, "learning_rate": 0.0001, "loss": 0.1328, "step": 965 }, { "epoch": 30.95, "learning_rate": 0.0001, "loss": 0.1305, "step": 970 }, { "epoch": 31.11, "learning_rate": 0.0001, "loss": 0.1215, "step": 975 }, { "epoch": 31.27, "learning_rate": 0.0001, "loss": 0.1196, "step": 980 }, { "epoch": 31.43, "learning_rate": 0.0001, "loss": 0.1209, "step": 985 }, { "epoch": 31.59, "learning_rate": 0.0001, "loss": 0.1211, "step": 990 }, { "epoch": 31.74, "learning_rate": 0.0001, "loss": 0.125, "step": 995 }, { "epoch": 31.9, "learning_rate": 0.0001, "loss": 0.1262, "step": 1000 }, { "epoch": 32.06, "learning_rate": 0.0001, "loss": 0.1185, "step": 1005 }, { "epoch": 32.22, "learning_rate": 0.0001, "loss": 0.113, "step": 1010 }, { "epoch": 32.38, "learning_rate": 0.0001, "loss": 0.1136, "step": 1015 }, { "epoch": 32.54, "learning_rate": 0.0001, "loss": 0.1144, "step": 1020 }, { "epoch": 32.7, "learning_rate": 0.0001, "loss": 0.1147, "step": 1025 }, { "epoch": 32.86, "learning_rate": 0.0001, "loss": 0.1168, "step": 1030 }, { "epoch": 33.02, "learning_rate": 0.0001, "loss": 0.1179, "step": 1035 }, { "epoch": 33.18, "learning_rate": 0.0001, "loss": 0.1073, "step": 1040 }, { "epoch": 33.34, "learning_rate": 0.0001, "loss": 0.1091, "step": 1045 }, { "epoch": 33.5, "learning_rate": 0.0001, "loss": 0.1116, "step": 1050 }, { "epoch": 33.66, "learning_rate": 0.0001, "loss": 0.1092, "step": 1055 }, { "epoch": 33.82, "learning_rate": 0.0001, "loss": 0.1107, "step": 1060 }, { "epoch": 33.98, "learning_rate": 0.0001, "loss": 0.1094, "step": 1065 }, { "epoch": 34.14, "learning_rate": 0.0001, "loss": 0.1033, "step": 1070 }, { 
"epoch": 34.3, "learning_rate": 0.0001, "loss": 0.1027, "step": 1075 }, { "epoch": 34.46, "learning_rate": 0.0001, "loss": 0.1021, "step": 1080 }, { "epoch": 34.62, "learning_rate": 0.0001, "loss": 0.1045, "step": 1085 }, { "epoch": 34.78, "learning_rate": 0.0001, "loss": 0.1044, "step": 1090 }, { "epoch": 34.94, "learning_rate": 0.0001, "loss": 0.1081, "step": 1095 }, { "epoch": 35.09, "learning_rate": 0.0001, "loss": 0.1028, "step": 1100 }, { "epoch": 35.25, "learning_rate": 0.0001, "loss": 0.0967, "step": 1105 }, { "epoch": 35.41, "learning_rate": 0.0001, "loss": 0.0967, "step": 1110 }, { "epoch": 35.57, "learning_rate": 0.0001, "loss": 0.1019, "step": 1115 }, { "epoch": 35.73, "learning_rate": 0.0001, "loss": 0.1005, "step": 1120 }, { "epoch": 35.89, "learning_rate": 0.0001, "loss": 0.1031, "step": 1125 }, { "epoch": 36.05, "learning_rate": 0.0001, "loss": 0.0981, "step": 1130 }, { "epoch": 36.21, "learning_rate": 0.0001, "loss": 0.0945, "step": 1135 }, { "epoch": 36.37, "learning_rate": 0.0001, "loss": 0.0954, "step": 1140 }, { "epoch": 36.53, "learning_rate": 0.0001, "loss": 0.0936, "step": 1145 }, { "epoch": 36.69, "learning_rate": 0.0001, "loss": 0.0972, "step": 1150 }, { "epoch": 36.85, "learning_rate": 0.0001, "loss": 0.0951, "step": 1155 }, { "epoch": 37.01, "learning_rate": 0.0001, "loss": 0.0974, "step": 1160 }, { "epoch": 37.17, "learning_rate": 0.0001, "loss": 0.0902, "step": 1165 }, { "epoch": 37.33, "learning_rate": 0.0001, "loss": 0.0909, "step": 1170 }, { "epoch": 37.49, "learning_rate": 0.0001, "loss": 0.0918, "step": 1175 }, { "epoch": 37.65, "learning_rate": 0.0001, "loss": 0.0913, "step": 1180 }, { "epoch": 37.81, "learning_rate": 0.0001, "loss": 0.0926, "step": 1185 }, { "epoch": 37.97, "learning_rate": 0.0001, "loss": 0.0925, "step": 1190 }, { "epoch": 38.13, "learning_rate": 0.0001, "loss": 0.0882, "step": 1195 }, { "epoch": 38.29, "learning_rate": 0.0001, "loss": 0.0856, "step": 1200 }, { "epoch": 38.44, "learning_rate": 0.0001, "loss": 
0.0881, "step": 1205 }, { "epoch": 38.6, "learning_rate": 0.0001, "loss": 0.0889, "step": 1210 }, { "epoch": 38.76, "learning_rate": 0.0001, "loss": 0.0863, "step": 1215 }, { "epoch": 38.92, "learning_rate": 0.0001, "loss": 0.087, "step": 1220 }, { "epoch": 39.08, "learning_rate": 0.0001, "loss": 0.0851, "step": 1225 }, { "epoch": 39.24, "learning_rate": 0.0001, "loss": 0.0819, "step": 1230 }, { "epoch": 39.4, "learning_rate": 0.0001, "loss": 0.0844, "step": 1235 }, { "epoch": 39.56, "learning_rate": 0.0001, "loss": 0.0826, "step": 1240 }, { "epoch": 39.72, "learning_rate": 0.0001, "loss": 0.0832, "step": 1245 }, { "epoch": 39.88, "learning_rate": 0.0001, "loss": 0.0838, "step": 1250 }, { "epoch": 40.04, "learning_rate": 0.0001, "loss": 0.087, "step": 1255 }, { "epoch": 40.2, "learning_rate": 0.0001, "loss": 0.0792, "step": 1260 }, { "epoch": 40.36, "learning_rate": 0.0001, "loss": 0.0802, "step": 1265 }, { "epoch": 40.52, "learning_rate": 0.0001, "loss": 0.0806, "step": 1270 }, { "epoch": 40.68, "learning_rate": 0.0001, "loss": 0.0768, "step": 1275 }, { "epoch": 40.84, "learning_rate": 0.0001, "loss": 0.0812, "step": 1280 }, { "epoch": 41.0, "learning_rate": 0.0001, "loss": 0.0849, "step": 1285 }, { "epoch": 41.16, "learning_rate": 0.0001, "loss": 0.0767, "step": 1290 }, { "epoch": 41.32, "learning_rate": 0.0001, "loss": 0.0766, "step": 1295 }, { "epoch": 41.48, "learning_rate": 0.0001, "loss": 0.0774, "step": 1300 }, { "epoch": 41.64, "learning_rate": 0.0001, "loss": 0.0789, "step": 1305 }, { "epoch": 41.79, "learning_rate": 0.0001, "loss": 0.0774, "step": 1310 }, { "epoch": 41.95, "learning_rate": 0.0001, "loss": 0.0795, "step": 1315 }, { "epoch": 42.11, "learning_rate": 0.0001, "loss": 0.0748, "step": 1320 }, { "epoch": 42.27, "learning_rate": 0.0001, "loss": 0.0728, "step": 1325 }, { "epoch": 42.43, "learning_rate": 0.0001, "loss": 0.0751, "step": 1330 }, { "epoch": 42.59, "learning_rate": 0.0001, "loss": 0.0765, "step": 1335 }, { "epoch": 42.75, 
"learning_rate": 0.0001, "loss": 0.0741, "step": 1340 }, { "epoch": 42.91, "learning_rate": 0.0001, "loss": 0.0761, "step": 1345 }, { "epoch": 43.07, "learning_rate": 0.0001, "loss": 0.0752, "step": 1350 }, { "epoch": 43.23, "learning_rate": 0.0001, "loss": 0.0715, "step": 1355 }, { "epoch": 43.39, "learning_rate": 0.0001, "loss": 0.0734, "step": 1360 }, { "epoch": 43.55, "learning_rate": 0.0001, "loss": 0.0727, "step": 1365 }, { "epoch": 43.71, "learning_rate": 0.0001, "loss": 0.0726, "step": 1370 }, { "epoch": 43.87, "learning_rate": 0.0001, "loss": 0.0731, "step": 1375 }, { "epoch": 44.03, "learning_rate": 0.0001, "loss": 0.0706, "step": 1380 }, { "epoch": 44.19, "learning_rate": 0.0001, "loss": 0.0684, "step": 1385 }, { "epoch": 44.35, "learning_rate": 0.0001, "loss": 0.0695, "step": 1390 }, { "epoch": 44.51, "learning_rate": 0.0001, "loss": 0.0691, "step": 1395 }, { "epoch": 44.67, "learning_rate": 0.0001, "loss": 0.0711, "step": 1400 }, { "epoch": 44.83, "learning_rate": 0.0001, "loss": 0.07, "step": 1405 }, { "epoch": 44.99, "learning_rate": 0.0001, "loss": 0.071, "step": 1410 }, { "epoch": 45.14, "learning_rate": 0.0001, "loss": 0.0663, "step": 1415 }, { "epoch": 45.3, "learning_rate": 0.0001, "loss": 0.0663, "step": 1420 }, { "epoch": 45.46, "learning_rate": 0.0001, "loss": 0.0667, "step": 1425 }, { "epoch": 45.62, "learning_rate": 0.0001, "loss": 0.0694, "step": 1430 }, { "epoch": 45.78, "learning_rate": 0.0001, "loss": 0.0677, "step": 1435 }, { "epoch": 45.94, "learning_rate": 0.0001, "loss": 0.0683, "step": 1440 }, { "epoch": 46.1, "learning_rate": 0.0001, "loss": 0.065, "step": 1445 }, { "epoch": 46.26, "learning_rate": 0.0001, "loss": 0.0642, "step": 1450 }, { "epoch": 46.42, "learning_rate": 0.0001, "loss": 0.066, "step": 1455 }, { "epoch": 46.58, "learning_rate": 0.0001, "loss": 0.0644, "step": 1460 }, { "epoch": 46.74, "learning_rate": 0.0001, "loss": 0.0657, "step": 1465 }, { "epoch": 46.9, "learning_rate": 0.0001, "loss": 0.066, "step": 1470 }, { 
"epoch": 47.06, "learning_rate": 0.0001, "loss": 0.0648, "step": 1475 }, { "epoch": 47.22, "learning_rate": 0.0001, "loss": 0.0614, "step": 1480 }, { "epoch": 47.38, "learning_rate": 0.0001, "loss": 0.063, "step": 1485 }, { "epoch": 47.54, "learning_rate": 0.0001, "loss": 0.0651, "step": 1490 }, { "epoch": 47.7, "learning_rate": 0.0001, "loss": 0.0635, "step": 1495 }, { "epoch": 47.86, "learning_rate": 0.0001, "loss": 0.0637, "step": 1500 }, { "epoch": 48.02, "learning_rate": 0.0001, "loss": 0.0627, "step": 1505 }, { "epoch": 48.18, "learning_rate": 0.0001, "loss": 0.0602, "step": 1510 }, { "epoch": 48.33, "learning_rate": 0.0001, "loss": 0.0598, "step": 1515 }, { "epoch": 48.49, "learning_rate": 0.0001, "loss": 0.06, "step": 1520 }, { "epoch": 48.65, "learning_rate": 0.0001, "loss": 0.0607, "step": 1525 }, { "epoch": 48.81, "learning_rate": 0.0001, "loss": 0.063, "step": 1530 }, { "epoch": 48.97, "learning_rate": 0.0001, "loss": 0.0628, "step": 1535 }, { "epoch": 49.13, "learning_rate": 0.0001, "loss": 0.0584, "step": 1540 }, { "epoch": 49.29, "learning_rate": 0.0001, "loss": 0.0585, "step": 1545 }, { "epoch": 49.45, "learning_rate": 0.0001, "loss": 0.0599, "step": 1550 }, { "epoch": 49.61, "learning_rate": 0.0001, "loss": 0.0583, "step": 1555 }, { "epoch": 49.77, "learning_rate": 0.0001, "loss": 0.0601, "step": 1560 }, { "epoch": 49.93, "learning_rate": 0.0001, "loss": 0.0596, "step": 1565 }, { "epoch": 50.09, "learning_rate": 0.0001, "loss": 0.0588, "step": 1570 }, { "epoch": 50.25, "learning_rate": 0.0001, "loss": 0.0576, "step": 1575 }, { "epoch": 50.41, "learning_rate": 0.0001, "loss": 0.0564, "step": 1580 }, { "epoch": 50.57, "learning_rate": 0.0001, "loss": 0.0581, "step": 1585 }, { "epoch": 50.73, "learning_rate": 0.0001, "loss": 0.0583, "step": 1590 }, { "epoch": 50.89, "learning_rate": 0.0001, "loss": 0.0568, "step": 1595 }, { "epoch": 51.05, "learning_rate": 0.0001, "loss": 0.0586, "step": 1600 }, { "epoch": 51.21, "learning_rate": 0.0001, "loss": 
0.0544, "step": 1605 }, { "epoch": 51.37, "learning_rate": 0.0001, "loss": 0.0553, "step": 1610 }, { "epoch": 51.53, "learning_rate": 0.0001, "loss": 0.0559, "step": 1615 }, { "epoch": 51.68, "learning_rate": 0.0001, "loss": 0.0553, "step": 1620 }, { "epoch": 51.84, "learning_rate": 0.0001, "loss": 0.0566, "step": 1625 }, { "epoch": 52.0, "learning_rate": 0.0001, "loss": 0.0576, "step": 1630 }, { "epoch": 52.16, "learning_rate": 0.0001, "loss": 0.0545, "step": 1635 }, { "epoch": 52.32, "learning_rate": 0.0001, "loss": 0.0552, "step": 1640 }, { "epoch": 52.48, "learning_rate": 0.0001, "loss": 0.0534, "step": 1645 }, { "epoch": 52.64, "learning_rate": 0.0001, "loss": 0.0535, "step": 1650 }, { "epoch": 52.8, "learning_rate": 0.0001, "loss": 0.0545, "step": 1655 }, { "epoch": 52.96, "learning_rate": 0.0001, "loss": 0.055, "step": 1660 }, { "epoch": 53.12, "learning_rate": 0.0001, "loss": 0.0526, "step": 1665 }, { "epoch": 53.28, "learning_rate": 0.0001, "loss": 0.0527, "step": 1670 }, { "epoch": 53.44, "learning_rate": 0.0001, "loss": 0.0521, "step": 1675 }, { "epoch": 53.6, "learning_rate": 0.0001, "loss": 0.0529, "step": 1680 }, { "epoch": 53.76, "learning_rate": 0.0001, "loss": 0.0531, "step": 1685 }, { "epoch": 53.92, "learning_rate": 0.0001, "loss": 0.0532, "step": 1690 }, { "epoch": 54.08, "learning_rate": 0.0001, "loss": 0.0524, "step": 1695 }, { "epoch": 54.24, "learning_rate": 0.0001, "loss": 0.0516, "step": 1700 }, { "epoch": 54.4, "learning_rate": 0.0001, "loss": 0.0496, "step": 1705 }, { "epoch": 54.56, "learning_rate": 0.0001, "loss": 0.0517, "step": 1710 }, { "epoch": 54.72, "learning_rate": 0.0001, "loss": 0.0528, "step": 1715 }, { "epoch": 54.88, "learning_rate": 0.0001, "loss": 0.0509, "step": 1720 }, { "epoch": 55.03, "learning_rate": 0.0001, "loss": 0.0503, "step": 1725 }, { "epoch": 55.19, "learning_rate": 0.0001, "loss": 0.0493, "step": 1730 }, { "epoch": 55.35, "learning_rate": 0.0001, "loss": 0.0492, "step": 1735 }, { "epoch": 55.51, 
"learning_rate": 0.0001, "loss": 0.0495, "step": 1740 }, { "epoch": 55.67, "learning_rate": 0.0001, "loss": 0.0491, "step": 1745 }, { "epoch": 55.83, "learning_rate": 0.0001, "loss": 0.0503, "step": 1750 }, { "epoch": 55.99, "learning_rate": 0.0001, "loss": 0.0516, "step": 1755 }, { "epoch": 56.15, "learning_rate": 0.0001, "loss": 0.0475, "step": 1760 }, { "epoch": 56.31, "learning_rate": 0.0001, "loss": 0.0493, "step": 1765 }, { "epoch": 56.47, "learning_rate": 0.0001, "loss": 0.0493, "step": 1770 }, { "epoch": 56.63, "learning_rate": 0.0001, "loss": 0.0484, "step": 1775 }, { "epoch": 56.79, "learning_rate": 0.0001, "loss": 0.0479, "step": 1780 }, { "epoch": 56.95, "learning_rate": 0.0001, "loss": 0.0488, "step": 1785 }, { "epoch": 57.11, "learning_rate": 0.0001, "loss": 0.0468, "step": 1790 }, { "epoch": 57.27, "learning_rate": 0.0001, "loss": 0.0479, "step": 1795 }, { "epoch": 57.43, "learning_rate": 0.0001, "loss": 0.0462, "step": 1800 }, { "epoch": 57.59, "learning_rate": 0.0001, "loss": 0.0488, "step": 1805 }, { "epoch": 57.75, "learning_rate": 0.0001, "loss": 0.0488, "step": 1810 }, { "epoch": 57.91, "learning_rate": 0.0001, "loss": 0.048, "step": 1815 }, { "epoch": 58.07, "learning_rate": 0.0001, "loss": 0.0474, "step": 1820 }, { "epoch": 58.23, "learning_rate": 0.0001, "loss": 0.0469, "step": 1825 }, { "epoch": 58.38, "learning_rate": 0.0001, "loss": 0.0456, "step": 1830 }, { "epoch": 58.54, "learning_rate": 0.0001, "loss": 0.0465, "step": 1835 }, { "epoch": 58.7, "learning_rate": 0.0001, "loss": 0.0475, "step": 1840 }, { "epoch": 58.86, "learning_rate": 0.0001, "loss": 0.0469, "step": 1845 }, { "epoch": 59.02, "learning_rate": 0.0001, "loss": 0.0472, "step": 1850 }, { "epoch": 59.18, "learning_rate": 0.0001, "loss": 0.0453, "step": 1855 }, { "epoch": 59.34, "learning_rate": 0.0001, "loss": 0.0464, "step": 1860 }, { "epoch": 59.5, "learning_rate": 0.0001, "loss": 0.0456, "step": 1865 }, { "epoch": 59.66, "learning_rate": 0.0001, "loss": 0.0463, "step": 
1870 }, { "epoch": 59.82, "learning_rate": 0.0001, "loss": 0.0467, "step": 1875 }, { "epoch": 59.98, "learning_rate": 0.0001, "loss": 0.046, "step": 1880 }, { "epoch": 60.14, "learning_rate": 0.0001, "loss": 0.0444, "step": 1885 }, { "epoch": 60.3, "learning_rate": 0.0001, "loss": 0.0448, "step": 1890 }, { "epoch": 60.46, "learning_rate": 0.0001, "loss": 0.0441, "step": 1895 }, { "epoch": 60.62, "learning_rate": 0.0001, "loss": 0.0468, "step": 1900 }, { "epoch": 60.78, "learning_rate": 0.0001, "loss": 0.045, "step": 1905 }, { "epoch": 60.94, "learning_rate": 0.0001, "loss": 0.0454, "step": 1910 }, { "epoch": 61.1, "learning_rate": 0.0001, "loss": 0.0454, "step": 1915 }, { "epoch": 61.26, "learning_rate": 0.0001, "loss": 0.044, "step": 1920 }, { "epoch": 61.42, "learning_rate": 0.0001, "loss": 0.0451, "step": 1925 }, { "epoch": 61.58, "learning_rate": 0.0001, "loss": 0.0436, "step": 1930 }, { "epoch": 61.73, "learning_rate": 0.0001, "loss": 0.044, "step": 1935 }, { "epoch": 61.89, "learning_rate": 0.0001, "loss": 0.0444, "step": 1940 }, { "epoch": 62.05, "learning_rate": 0.0001, "loss": 0.043, "step": 1945 }, { "epoch": 62.21, "learning_rate": 0.0001, "loss": 0.0414, "step": 1950 }, { "epoch": 62.37, "learning_rate": 0.0001, "loss": 0.0417, "step": 1955 }, { "epoch": 62.53, "learning_rate": 0.0001, "loss": 0.0433, "step": 1960 }, { "epoch": 62.69, "learning_rate": 0.0001, "loss": 0.0437, "step": 1965 }, { "epoch": 62.85, "learning_rate": 0.0001, "loss": 0.0428, "step": 1970 }, { "epoch": 63.01, "learning_rate": 0.0001, "loss": 0.0437, "step": 1975 }, { "epoch": 63.17, "learning_rate": 0.0001, "loss": 0.042, "step": 1980 }, { "epoch": 63.33, "learning_rate": 0.0001, "loss": 0.0413, "step": 1985 }, { "epoch": 63.49, "learning_rate": 0.0001, "loss": 0.0426, "step": 1990 }, { "epoch": 63.65, "learning_rate": 0.0001, "loss": 0.042, "step": 1995 }, { "epoch": 63.81, "learning_rate": 0.0001, "loss": 0.0434, "step": 2000 }, { "epoch": 63.97, "learning_rate": 0.0001, "loss": 
0.0429, "step": 2005 }, { "epoch": 64.13, "learning_rate": 0.0001, "loss": 0.0421, "step": 2010 }, { "epoch": 64.29, "learning_rate": 0.0001, "loss": 0.0411, "step": 2015 }, { "epoch": 64.45, "learning_rate": 0.0001, "loss": 0.0415, "step": 2020 }, { "epoch": 64.61, "learning_rate": 0.0001, "loss": 0.0401, "step": 2025 }, { "epoch": 64.77, "learning_rate": 0.0001, "loss": 0.0417, "step": 2030 }, { "epoch": 64.93, "learning_rate": 0.0001, "loss": 0.0418, "step": 2035 }, { "epoch": 65.08, "learning_rate": 0.0001, "loss": 0.0409, "step": 2040 }, { "epoch": 65.24, "learning_rate": 0.0001, "loss": 0.0404, "step": 2045 }, { "epoch": 65.4, "learning_rate": 0.0001, "loss": 0.0404, "step": 2050 }, { "epoch": 65.56, "learning_rate": 0.0001, "loss": 0.0403, "step": 2055 }, { "epoch": 65.72, "learning_rate": 0.0001, "loss": 0.0399, "step": 2060 }, { "epoch": 65.88, "learning_rate": 0.0001, "loss": 0.042, "step": 2065 }, { "epoch": 66.04, "learning_rate": 0.0001, "loss": 0.0397, "step": 2070 }, { "epoch": 66.2, "learning_rate": 0.0001, "loss": 0.0396, "step": 2075 }, { "epoch": 66.36, "learning_rate": 0.0001, "loss": 0.0387, "step": 2080 }, { "epoch": 66.52, "learning_rate": 0.0001, "loss": 0.0392, "step": 2085 }, { "epoch": 66.68, "learning_rate": 0.0001, "loss": 0.0407, "step": 2090 }, { "epoch": 66.84, "learning_rate": 0.0001, "loss": 0.0414, "step": 2095 }, { "epoch": 67.0, "learning_rate": 0.0001, "loss": 0.0407, "step": 2100 }, { "epoch": 67.16, "learning_rate": 0.0001, "loss": 0.0404, "step": 2105 }, { "epoch": 67.32, "learning_rate": 0.0001, "loss": 0.0382, "step": 2110 }, { "epoch": 67.48, "learning_rate": 0.0001, "loss": 0.0397, "step": 2115 }, { "epoch": 67.64, "learning_rate": 0.0001, "loss": 0.0392, "step": 2120 }, { "epoch": 67.8, "learning_rate": 0.0001, "loss": 0.0383, "step": 2125 }, { "epoch": 67.96, "learning_rate": 0.0001, "loss": 0.0397, "step": 2130 }, { "epoch": 68.12, "learning_rate": 0.0001, "loss": 0.0384, "step": 2135 }, { "epoch": 68.28, 
"learning_rate": 0.0001, "loss": 0.0376, "step": 2140 }, { "epoch": 68.43, "learning_rate": 0.0001, "loss": 0.0383, "step": 2145 }, { "epoch": 68.59, "learning_rate": 0.0001, "loss": 0.0393, "step": 2150 }, { "epoch": 68.75, "learning_rate": 0.0001, "loss": 0.0387, "step": 2155 }, { "epoch": 68.91, "learning_rate": 0.0001, "loss": 0.0382, "step": 2160 }, { "epoch": 69.07, "learning_rate": 0.0001, "loss": 0.0382, "step": 2165 }, { "epoch": 69.23, "learning_rate": 0.0001, "loss": 0.0376, "step": 2170 }, { "epoch": 69.39, "learning_rate": 0.0001, "loss": 0.0375, "step": 2175 }, { "epoch": 69.55, "learning_rate": 0.0001, "loss": 0.0382, "step": 2180 }, { "epoch": 69.71, "learning_rate": 0.0001, "loss": 0.0386, "step": 2185 }, { "epoch": 69.87, "learning_rate": 0.0001, "loss": 0.0373, "step": 2190 }, { "epoch": 70.03, "learning_rate": 0.0001, "loss": 0.0376, "step": 2195 }, { "epoch": 70.19, "learning_rate": 0.0001, "loss": 0.037, "step": 2200 }, { "epoch": 70.35, "learning_rate": 0.0001, "loss": 0.0368, "step": 2205 }, { "epoch": 70.51, "learning_rate": 0.0001, "loss": 0.036, "step": 2210 }, { "epoch": 70.67, "learning_rate": 0.0001, "loss": 0.0375, "step": 2215 }, { "epoch": 70.83, "learning_rate": 0.0001, "loss": 0.0368, "step": 2220 }, { "epoch": 70.99, "learning_rate": 0.0001, "loss": 0.038, "step": 2225 }, { "epoch": 71.15, "learning_rate": 0.0001, "loss": 0.0368, "step": 2230 }, { "epoch": 71.31, "learning_rate": 0.0001, "loss": 0.036, "step": 2235 }, { "epoch": 71.47, "learning_rate": 0.0001, "loss": 0.0367, "step": 2240 }, { "epoch": 71.63, "learning_rate": 0.0001, "loss": 0.0358, "step": 2245 }, { "epoch": 71.78, "learning_rate": 0.0001, "loss": 0.0363, "step": 2250 }, { "epoch": 71.94, "learning_rate": 0.0001, "loss": 0.0366, "step": 2255 }, { "epoch": 72.1, "learning_rate": 0.0001, "loss": 0.0356, "step": 2260 }, { "epoch": 72.26, "learning_rate": 0.0001, "loss": 0.0344, "step": 2265 }, { "epoch": 72.42, "learning_rate": 0.0001, "loss": 0.0366, "step": 2270 
}, { "epoch": 72.58, "learning_rate": 0.0001, "loss": 0.0362, "step": 2275 }, { "epoch": 72.74, "learning_rate": 0.0001, "loss": 0.0354, "step": 2280 }, { "epoch": 72.9, "learning_rate": 0.0001, "loss": 0.0362, "step": 2285 }, { "epoch": 73.06, "learning_rate": 0.0001, "loss": 0.0358, "step": 2290 }, { "epoch": 73.22, "learning_rate": 0.0001, "loss": 0.035, "step": 2295 }, { "epoch": 73.38, "learning_rate": 0.0001, "loss": 0.0354, "step": 2300 }, { "epoch": 73.54, "learning_rate": 0.0001, "loss": 0.0357, "step": 2305 }, { "epoch": 73.7, "learning_rate": 0.0001, "loss": 0.0344, "step": 2310 }, { "epoch": 73.86, "learning_rate": 0.0001, "loss": 0.0348, "step": 2315 }, { "epoch": 74.02, "learning_rate": 0.0001, "loss": 0.0351, "step": 2320 }, { "epoch": 74.18, "learning_rate": 0.0001, "loss": 0.0339, "step": 2325 }, { "epoch": 74.18, "step": 2325, "total_flos": 2.773380910546944e+17, "train_loss": 0.22404980731266802, "train_runtime": 24311.8759, "train_samples_per_second": 3.094, "train_steps_per_second": 0.096 } ], "logging_steps": 5, "max_steps": 2325, "num_train_epochs": 75, "save_steps": 2325, "total_flos": 2.773380910546944e+17, "trial_name": null, "trial_params": null }