{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999842829076621, "eval_steps": 1590, "global_step": 15906, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009430255402750491, "grad_norm": 0.390625, "learning_rate": 0.001, "loss": 5.5551, "step": 15 }, { "epoch": 0.0018860510805500982, "grad_norm": 0.0791015625, "learning_rate": 0.001, "loss": 3.5038, "step": 30 }, { "epoch": 0.002829076620825147, "grad_norm": 0.1298828125, "learning_rate": 0.001, "loss": 3.5068, "step": 45 }, { "epoch": 0.0037721021611001964, "grad_norm": 0.0693359375, "learning_rate": 0.001, "loss": 3.4288, "step": 60 }, { "epoch": 0.004715127701375246, "grad_norm": 0.12255859375, "learning_rate": 0.001, "loss": 3.3071, "step": 75 }, { "epoch": 0.005658153241650294, "grad_norm": 0.091796875, "learning_rate": 0.001, "loss": 3.2653, "step": 90 }, { "epoch": 0.006601178781925344, "grad_norm": 0.1318359375, "learning_rate": 0.001, "loss": 3.1297, "step": 105 }, { "epoch": 0.007544204322200393, "grad_norm": 0.12451171875, "learning_rate": 0.001, "loss": 3.0482, "step": 120 }, { "epoch": 0.008487229862475442, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 2.9037, "step": 135 }, { "epoch": 0.009430255402750491, "grad_norm": 0.1650390625, "learning_rate": 0.001, "loss": 2.8178, "step": 150 }, { "epoch": 0.01037328094302554, "grad_norm": 0.111328125, "learning_rate": 0.001, "loss": 2.687, "step": 165 }, { "epoch": 0.011316306483300589, "grad_norm": 0.1640625, "learning_rate": 0.001, "loss": 2.6247, "step": 180 }, { "epoch": 0.01225933202357564, "grad_norm": 0.1298828125, "learning_rate": 0.001, "loss": 2.5556, "step": 195 }, { "epoch": 0.013202357563850688, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 2.4524, "step": 210 }, { "epoch": 0.014145383104125737, "grad_norm": 0.1083984375, "learning_rate": 0.001, "loss": 2.4904, "step": 225 }, { "epoch": 0.015088408644400786, "grad_norm": 0.1904296875, "learning_rate": 0.001, "loss": 2.4211, "step": 240 }, { "epoch": 0.016031434184675834, "grad_norm": 0.2412109375, "learning_rate": 0.001, "loss": 2.419, "step": 255 }, { "epoch": 0.016974459724950885, "grad_norm": 0.130859375, "learning_rate": 0.001, "loss": 2.3542, "step": 270 }, { "epoch": 0.017917485265225932, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 2.2893, "step": 285 }, { "epoch": 0.018860510805500982, "grad_norm": 0.1982421875, "learning_rate": 0.001, "loss": 2.2671, "step": 300 }, { "epoch": 0.019803536345776033, "grad_norm": 0.2109375, "learning_rate": 0.001, "loss": 2.2644, "step": 315 }, { "epoch": 0.02074656188605108, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 2.2669, "step": 330 }, { "epoch": 0.02168958742632613, "grad_norm": 0.1962890625, "learning_rate": 0.001, "loss": 2.2009, "step": 345 }, { "epoch": 0.022632612966601177, "grad_norm": 0.197265625, "learning_rate": 0.001, "loss": 2.1569, "step": 360 }, { "epoch": 0.023575638506876228, "grad_norm": 0.203125, "learning_rate": 0.001, "loss": 2.0607, "step": 375 }, { "epoch": 0.02451866404715128, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 2.1118, "step": 390 }, { "epoch": 0.025461689587426325, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 2.0465, "step": 405 }, { "epoch": 0.026404715127701376, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 2.0682, "step": 420 }, { "epoch": 0.027347740667976423, "grad_norm": 0.158203125, "learning_rate": 0.001, "loss": 2.014, "step": 435 }, { "epoch": 0.028290766208251474, "grad_norm": 0.2578125, "learning_rate": 0.001, "loss": 2.0251, "step": 450 }, { "epoch": 0.029233791748526524, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.991, "step": 465 }, { "epoch": 0.03017681728880157, "grad_norm": 0.1884765625, "learning_rate": 0.001, "loss": 1.9579, "step": 480 }, { "epoch": 0.03111984282907662, "grad_norm": 0.1630859375, "learning_rate": 0.001, "loss": 1.9253, "step": 495 }, { "epoch": 0.03206286836935167, "grad_norm": 0.408203125, "learning_rate": 0.001, "loss": 1.9019, "step": 510 }, { "epoch": 0.033005893909626716, "grad_norm": 0.15234375, "learning_rate": 0.001, "loss": 1.9208, "step": 525 }, { "epoch": 0.03394891944990177, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 1.9165, "step": 540 }, { "epoch": 0.03489194499017682, "grad_norm": 0.2734375, "learning_rate": 0.001, "loss": 1.8541, "step": 555 }, { "epoch": 0.035834970530451864, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 1.8854, "step": 570 }, { "epoch": 0.03677799607072692, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.8651, "step": 585 }, { "epoch": 0.037721021611001965, "grad_norm": 0.2734375, "learning_rate": 0.001, "loss": 1.8392, "step": 600 }, { "epoch": 0.03866404715127701, "grad_norm": 0.23828125, "learning_rate": 0.001, "loss": 1.843, "step": 615 }, { "epoch": 0.039607072691552066, "grad_norm": 0.2578125, "learning_rate": 0.001, "loss": 1.7958, "step": 630 }, { "epoch": 0.04055009823182711, "grad_norm": 0.197265625, "learning_rate": 0.001, "loss": 1.7849, "step": 645 }, { "epoch": 0.04149312377210216, "grad_norm": 0.1767578125, "learning_rate": 0.001, "loss": 1.7397, "step": 660 }, { "epoch": 0.04243614931237721, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.7396, "step": 675 }, { "epoch": 0.04337917485265226, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.7219, "step": 690 }, { "epoch": 0.04432220039292731, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.7536, "step": 705 }, { "epoch": 0.045265225933202355, "grad_norm": 0.2021484375, "learning_rate": 0.001, "loss": 1.697, "step": 720 }, { "epoch": 0.04620825147347741, "grad_norm": 0.421875, "learning_rate": 0.001, "loss": 1.6725, "step": 735 }, { "epoch": 0.047151277013752456, "grad_norm": 0.2080078125, "learning_rate": 0.001, "loss": 1.691, "step": 750 }, { "epoch": 0.0480943025540275, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 1.6721, "step": 765 }, { "epoch": 0.04903732809430256, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 1.7221, "step": 780 }, { "epoch": 0.049980353634577604, "grad_norm": 0.244140625, "learning_rate": 0.001, "loss": 1.6609, "step": 795 }, { "epoch": 0.05092337917485265, "grad_norm": 0.494140625, "learning_rate": 0.001, "loss": 1.6805, "step": 810 }, { "epoch": 0.0518664047151277, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.6157, "step": 825 }, { "epoch": 0.05280943025540275, "grad_norm": 0.19921875, "learning_rate": 0.001, "loss": 1.5996, "step": 840 }, { "epoch": 0.0537524557956778, "grad_norm": 0.419921875, "learning_rate": 0.001, "loss": 1.5686, "step": 855 }, { "epoch": 0.054695481335952846, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.6021, "step": 870 }, { "epoch": 0.0556385068762279, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 1.6159, "step": 885 }, { "epoch": 0.05658153241650295, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.5456, "step": 900 }, { "epoch": 0.057524557956777994, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.5764, "step": 915 }, { "epoch": 0.05846758349705305, "grad_norm": 0.369140625, "learning_rate": 0.001, "loss": 1.5426, "step": 930 }, { "epoch": 0.059410609037328095, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.5535, "step": 945 }, { "epoch": 0.06035363457760314, "grad_norm": 0.23828125, "learning_rate": 0.001, "loss": 1.505, "step": 960 }, { "epoch": 0.06129666011787819, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.5328, "step": 975 }, { "epoch": 0.06223968565815324, "grad_norm": 0.45703125, "learning_rate": 0.001, "loss": 1.5274, "step": 990 }, { "epoch": 0.06318271119842829, "grad_norm": 0.3515625, "learning_rate": 0.001, "loss": 1.5246, "step": 1005 }, { "epoch": 0.06412573673870334, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 1.4633, "step": 1020 }, { "epoch": 0.06506876227897838, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.487, "step": 1035 }, { "epoch": 0.06601178781925343, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.4582, "step": 1050 }, { "epoch": 0.06695481335952849, "grad_norm": 0.39453125, "learning_rate": 0.001, "loss": 1.4586, "step": 1065 }, { "epoch": 0.06789783889980354, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 1.4322, "step": 1080 }, { "epoch": 0.06884086444007859, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.47, "step": 1095 }, { "epoch": 0.06978388998035363, "grad_norm": 0.443359375, "learning_rate": 0.001, "loss": 1.4215, "step": 1110 }, { "epoch": 0.07072691552062868, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 1.4569, "step": 1125 }, { "epoch": 0.07166994106090373, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.4428, "step": 1140 }, { "epoch": 0.07261296660117879, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.3861, "step": 1155 }, { "epoch": 0.07355599214145384, "grad_norm": 0.427734375, "learning_rate": 0.001, "loss": 1.4478, "step": 1170 }, { "epoch": 0.07449901768172888, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.406, "step": 1185 }, { "epoch": 0.07544204322200393, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 1.3944, "step": 1200 }, { "epoch": 0.07638506876227898, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 1.3884, "step": 1215 }, { "epoch": 0.07732809430255402, "grad_norm": 0.361328125, "learning_rate": 0.001, "loss": 1.38, "step": 1230 }, { "epoch": 0.07827111984282907, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 1.3446, "step": 1245 }, { "epoch": 0.07921414538310413, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.351, "step": 1260 }, { "epoch": 0.08015717092337918, "grad_norm": 0.6953125, "learning_rate": 0.001, "loss": 1.352, "step": 1275 }, { "epoch": 0.08110019646365423, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.3378, "step": 1290 }, { "epoch": 0.08204322200392927, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3056, "step": 1305 }, { "epoch": 0.08298624754420432, "grad_norm": 0.439453125, "learning_rate": 0.001, "loss": 1.3099, "step": 1320 }, { "epoch": 0.08392927308447937, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 1.3364, "step": 1335 }, { "epoch": 0.08487229862475441, "grad_norm": 0.4296875, "learning_rate": 0.001, "loss": 1.2865, "step": 1350 }, { "epoch": 0.08581532416502947, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.3022, "step": 1365 }, { "epoch": 0.08675834970530452, "grad_norm": 0.37109375, "learning_rate": 0.001, "loss": 1.2641, "step": 1380 }, { "epoch": 0.08770137524557957, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 1.291, "step": 1395 }, { "epoch": 0.08864440078585462, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 1.2947, "step": 1410 }, { "epoch": 0.08958742632612966, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.2626, "step": 1425 }, { "epoch": 0.09053045186640471, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 1.2719, "step": 1440 }, { "epoch": 0.09147347740667977, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 1.2817, "step": 1455 }, { "epoch": 0.09241650294695482, "grad_norm": 0.361328125, "learning_rate": 0.001, "loss": 1.2678, "step": 1470 }, { "epoch": 0.09335952848722986, "grad_norm": 0.396484375, "learning_rate": 0.001, "loss": 1.2336, "step": 1485 }, { "epoch": 0.09430255402750491, "grad_norm": 0.384765625, "learning_rate": 0.001, "loss": 1.2415, "step": 1500 }, { "epoch": 0.09524557956777996, "grad_norm": 0.416015625, "learning_rate": 0.001, "loss": 1.2478, "step": 1515 }, { "epoch": 0.096188605108055, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 1.2475, "step": 1530 }, { "epoch": 0.09713163064833005, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 1.2128, "step": 1545 }, { "epoch": 0.09807465618860511, "grad_norm": 0.404296875, "learning_rate": 0.001, "loss": 1.2292, "step": 1560 }, { "epoch": 0.09901768172888016, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.2015, "step": 1575 }, { "epoch": 0.09996070726915521, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 1.2088, "step": 1590 }, { "epoch": 0.09996070726915521, "eval_loss": 1.5537890195846558, "eval_runtime": 9.6819, "eval_samples_per_second": 103.285, "eval_steps_per_second": 1.446, "step": 1590 }, { "epoch": 0.10090373280943025, "grad_norm": 0.3515625, "learning_rate": 0.001, "loss": 1.2156, "step": 1605 }, { "epoch": 0.1018467583497053, "grad_norm": 0.443359375, "learning_rate": 0.001, "loss": 1.2115, "step": 1620 }, { "epoch": 0.10278978388998035, "grad_norm": 0.8359375, "learning_rate": 0.001, "loss": 1.2202, "step": 1635 }, { "epoch": 0.1037328094302554, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.2208, "step": 1650 }, { "epoch": 0.10467583497053046, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 1.1911, "step": 1665 }, { "epoch": 0.1056188605108055, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 1.2102, "step": 1680 }, { "epoch": 0.10656188605108055, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.1984, "step": 1695 }, { "epoch": 0.1075049115913556, "grad_norm": 0.90625, "learning_rate": 0.001, "loss": 1.2012, "step": 1710 }, { "epoch": 0.10844793713163065, "grad_norm": 0.35546875, "learning_rate": 0.001, "loss": 1.1869, "step": 1725 }, { "epoch": 0.10939096267190569, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.1948, "step": 1740 }, { "epoch": 0.11033398821218075, "grad_norm": 0.404296875, "learning_rate": 0.001, "loss": 1.1783, "step": 1755 }, { "epoch": 0.1112770137524558, "grad_norm": 0.396484375, "learning_rate": 0.001, "loss": 1.1893, "step": 1770 }, { "epoch": 0.11222003929273085, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.1495, "step": 1785 }, { "epoch": 0.1131630648330059, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 1.175, "step": 1800 }, { "epoch": 0.11410609037328094, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 1.1588, "step": 1815 }, { "epoch": 0.11504911591355599, "grad_norm": 0.421875, "learning_rate": 0.001, "loss": 1.1376, "step": 1830 }, { "epoch": 0.11599214145383104, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 1.1511, "step": 1845 }, { "epoch": 0.1169351669941061, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 1.1645, "step": 1860 }, { "epoch": 0.11787819253438114, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 1.1619, "step": 1875 }, { "epoch": 0.11882121807465619, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 1.1304, "step": 1890 }, { "epoch": 0.11976424361493124, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 1.1361, "step": 1905 }, { "epoch": 0.12070726915520628, "grad_norm": 0.37109375, "learning_rate": 0.001, "loss": 1.1151, "step": 1920 }, { "epoch": 0.12165029469548133, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 1.1299, "step": 1935 }, { "epoch": 0.12259332023575638, "grad_norm": 0.45703125, "learning_rate": 0.001, "loss": 1.1334, "step": 1950 }, { "epoch": 0.12353634577603144, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 1.112, "step": 1965 }, { "epoch": 0.12447937131630649, "grad_norm": 0.7890625, "learning_rate": 0.001, "loss": 1.1034, "step": 1980 }, { "epoch": 0.12542239685658152, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 1.12, "step": 1995 }, { "epoch": 0.12636542239685658, "grad_norm": 0.373046875, "learning_rate": 0.001, "loss": 1.0996, "step": 2010 }, { "epoch": 0.12730844793713164, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.1141, "step": 2025 }, { "epoch": 0.12825147347740667, "grad_norm": 0.6640625, "learning_rate": 0.001, "loss": 1.1112, "step": 2040 }, { "epoch": 0.12919449901768174, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 1.1229, "step": 2055 }, { "epoch": 0.13013752455795677, "grad_norm": 0.97265625, "learning_rate": 0.001, "loss": 1.074, "step": 2070 }, { "epoch": 0.13108055009823183, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.1199, "step": 2085 }, { "epoch": 0.13202357563850686, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.097, "step": 2100 }, { "epoch": 0.13296660117878192, "grad_norm": 0.373046875, "learning_rate": 0.001, "loss": 1.0832, "step": 2115 }, { "epoch": 0.13390962671905698, "grad_norm": 0.40234375, "learning_rate": 0.001, "loss": 1.0887, "step": 2130 }, { "epoch": 0.13485265225933202, "grad_norm": 0.404296875, "learning_rate": 0.001, "loss": 1.066, "step": 2145 }, { "epoch": 0.13579567779960708, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 1.0979, "step": 2160 }, { "epoch": 0.1367387033398821, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 1.101, "step": 2175 }, { "epoch": 0.13768172888015717, "grad_norm": 0.396484375, "learning_rate": 0.001, "loss": 1.0761, "step": 2190 }, { "epoch": 0.13862475442043223, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 1.0845, "step": 2205 }, { "epoch": 0.13956777996070727, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 1.0938, "step": 2220 }, { "epoch": 0.14051080550098233, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 1.0659, "step": 2235 }, { "epoch": 0.14145383104125736, "grad_norm": 0.494140625, "learning_rate": 0.001, "loss": 1.0683, "step": 2250 }, { "epoch": 0.14239685658153242, "grad_norm": 0.66015625, "learning_rate": 0.001, "loss": 1.0777, "step": 2265 }, { "epoch": 0.14333988212180745, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 1.0741, "step": 2280 }, { "epoch": 0.14428290766208252, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 1.0533, "step": 2295 }, { "epoch": 0.14522593320235758, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 1.0655, "step": 2310 }, { "epoch": 0.1461689587426326, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.0541, "step": 2325 }, { "epoch": 0.14711198428290767, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 1.0506, "step": 2340 }, { "epoch": 0.1480550098231827, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 1.0596, "step": 2355 }, { "epoch": 0.14899803536345776, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 1.0586, "step": 2370 }, { "epoch": 0.1499410609037328, "grad_norm": 0.40234375, "learning_rate": 0.001, "loss": 1.0466, "step": 2385 }, { "epoch": 0.15088408644400786, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 1.0485, "step": 2400 }, { "epoch": 0.15182711198428292, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 1.011, "step": 2415 }, { "epoch": 0.15277013752455795, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 1.0434, "step": 2430 }, { "epoch": 0.153713163064833, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 1.0353, "step": 2445 }, { "epoch": 0.15465618860510805, "grad_norm": 0.45703125, "learning_rate": 0.001, "loss": 1.0222, "step": 2460 }, { "epoch": 0.1555992141453831, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 1.0403, "step": 2475 }, { "epoch": 0.15654223968565814, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 1.0397, "step": 2490 }, { "epoch": 0.1574852652259332, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 1.0382, "step": 2505 }, { "epoch": 0.15842829076620826, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 1.0336, "step": 2520 }, { "epoch": 0.1593713163064833, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 1.0083, "step": 2535 }, { "epoch": 0.16031434184675836, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 1.0236, "step": 2550 }, { "epoch": 0.1612573673870334, "grad_norm": 0.45703125, "learning_rate": 0.001, "loss": 1.0245, "step": 2565 }, { "epoch": 0.16220039292730845, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 1.026, "step": 2580 }, { "epoch": 0.16314341846758348, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 1.0276, "step": 2595 }, { "epoch": 0.16408644400785855, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.9937, "step": 2610 }, { "epoch": 0.1650294695481336, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 1.0249, "step": 2625 }, { "epoch": 0.16597249508840864, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 1.0096, "step": 2640 }, { "epoch": 0.1669155206286837, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 1.0195, "step": 2655 }, { "epoch": 0.16785854616895873, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 1.018, "step": 2670 }, { "epoch": 0.1688015717092338, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 1.0289, "step": 2685 }, { "epoch": 0.16974459724950883, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 0.9931, "step": 2700 }, { "epoch": 0.1706876227897839, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 1.0101, "step": 2715 }, { "epoch": 0.17163064833005895, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 1.0159, "step": 2730 }, { "epoch": 0.17257367387033398, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 1.0094, "step": 2745 }, { "epoch": 0.17351669941060904, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 1.0081, "step": 2760 }, { "epoch": 0.17445972495088408, "grad_norm": 0.6640625, "learning_rate": 0.001, "loss": 0.9958, "step": 2775 }, { "epoch": 0.17540275049115914, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.9909, "step": 2790 }, { "epoch": 0.1763457760314342, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.9854, "step": 2805 }, { "epoch": 0.17728880157170923, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.9858, "step": 2820 }, { "epoch": 0.1782318271119843, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.9825, "step": 2835 }, { "epoch": 0.17917485265225933, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 1.0153, "step": 2850 }, { "epoch": 0.1801178781925344, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.9984, "step": 2865 }, { "epoch": 0.18106090373280942, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.9832, "step": 2880 }, { "epoch": 0.18200392927308448, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.9843, "step": 2895 }, { "epoch": 0.18294695481335954, "grad_norm": 0.41015625, "learning_rate": 0.001, "loss": 0.9774, "step": 2910 }, { "epoch": 0.18388998035363457, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.9824, "step": 2925 }, { "epoch": 0.18483300589390964, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 0.9884, "step": 2940 }, { "epoch": 0.18577603143418467, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 0.9684, "step": 2955 }, { "epoch": 0.18671905697445973, "grad_norm": 0.6640625, "learning_rate": 0.001, "loss": 0.9746, "step": 2970 }, { "epoch": 0.18766208251473476, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.9831, "step": 2985 }, { "epoch": 0.18860510805500982, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.9868, "step": 3000 }, { "epoch": 0.18954813359528488, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.9687, "step": 3015 }, { "epoch": 0.19049115913555992, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.9759, "step": 3030 }, { "epoch": 0.19143418467583498, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.9755, "step": 3045 }, { "epoch": 0.19237721021611, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.9784, "step": 3060 }, { "epoch": 0.19332023575638507, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.9691, "step": 3075 }, { "epoch": 0.1942632612966601, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.9851, "step": 3090 }, { "epoch": 0.19520628683693517, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.9695, "step": 3105 }, { "epoch": 0.19614931237721023, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.993, "step": 3120 }, { "epoch": 0.19709233791748526, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.9625, "step": 3135 }, { "epoch": 0.19803536345776032, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 0.9655, "step": 3150 }, { "epoch": 0.19897838899803535, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.9606, "step": 3165 }, { "epoch": 0.19992141453831042, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.9608, "step": 3180 }, { "epoch": 0.19992141453831042, "eval_loss": 1.169226050376892, "eval_runtime": 9.7503, "eval_samples_per_second": 102.561, "eval_steps_per_second": 1.436, "step": 3180 }, { "epoch": 0.20086444007858545, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.9741, "step": 3195 }, { "epoch": 0.2018074656188605, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.9608, "step": 3210 }, { "epoch": 0.20275049115913557, "grad_norm": 0.44921875, "learning_rate": 0.001, "loss": 0.9464, "step": 3225 }, { "epoch": 0.2036935166994106, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.9683, "step": 3240 }, { "epoch": 0.20463654223968566, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.9308, "step": 3255 }, { "epoch": 0.2055795677799607, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 0.9541, "step": 3270 }, { "epoch": 0.20652259332023576, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.9452, "step": 3285 }, { "epoch": 0.2074656188605108, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.9673, "step": 3300 }, { "epoch": 0.20840864440078585, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.9508, "step": 3315 }, { "epoch": 0.2093516699410609, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.955, "step": 3330 }, { "epoch": 0.21029469548133595, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.9499, "step": 3345 }, { "epoch": 0.211237721021611, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.9441, "step": 3360 }, { "epoch": 0.21218074656188604, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.9476, "step": 3375 }, { "epoch": 0.2131237721021611, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 0.9506, "step": 3390 }, { "epoch": 0.21406679764243616, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.9546, "step": 3405 }, { "epoch": 0.2150098231827112, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.9488, "step": 3420 }, { "epoch": 0.21595284872298626, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.9473, "step": 3435 }, { "epoch": 0.2168958742632613, "grad_norm": 0.435546875, "learning_rate": 0.001, "loss": 0.9491, "step": 3450 }, { "epoch": 0.21783889980353635, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 0.9304, "step": 3465 }, { "epoch": 0.21878192534381138, "grad_norm": 0.80078125, "learning_rate": 0.001, "loss": 0.9482, "step": 3480 }, { "epoch": 0.21972495088408645, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.9418, "step": 3495 }, { "epoch": 0.2206679764243615, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.9226, "step": 3510 }, { "epoch": 0.22161100196463654, "grad_norm": 0.427734375, "learning_rate": 0.001, "loss": 0.9427, "step": 3525 }, { "epoch": 0.2225540275049116, "grad_norm": 0.76953125, "learning_rate": 0.001, "loss": 0.9261, "step": 3540 }, { "epoch": 0.22349705304518663, "grad_norm": 0.69140625, "learning_rate": 0.001, "loss": 0.9418, "step": 3555 }, { "epoch": 0.2244400785854617, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.9382, "step": 3570 }, { "epoch": 0.22538310412573673, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 0.9353, "step": 3585 }, { "epoch": 0.2263261296660118, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 0.9138, "step": 3600 }, { "epoch": 0.22726915520628685, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.9033, "step": 3615 }, { "epoch": 0.22821218074656188, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.9337, "step": 3630 }, { "epoch": 0.22915520628683694, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.9188, "step": 3645 }, { "epoch": 0.23009823182711198, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.9407, "step": 3660 }, { "epoch": 0.23104125736738704, "grad_norm": 0.40625, "learning_rate": 0.001, "loss": 0.9068, "step": 3675 }, { "epoch": 0.23198428290766207, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.9079, "step": 3690 }, { "epoch": 0.23292730844793713, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.9095, "step": 3705 }, { "epoch": 0.2338703339882122, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.9148, "step": 3720 }, { "epoch": 0.23481335952848723, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.9044, "step": 3735 }, { "epoch": 0.2357563850687623, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.9401, "step": 3750 }, { "epoch": 0.23669941060903732, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.9228, "step": 3765 }, { "epoch": 0.23764243614931238, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 0.9071, "step": 3780 }, { "epoch": 0.2385854616895874, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.92, "step": 3795 }, { "epoch": 0.23952848722986247, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.9323, "step": 3810 }, { "epoch": 0.24047151277013754, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 0.9013, "step": 3825 }, { "epoch": 0.24141453831041257, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.9045, "step": 3840 }, { "epoch": 0.24235756385068763, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.9049, "step": 3855 }, { "epoch": 0.24330058939096266, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.8902, "step": 3870 }, { "epoch": 0.24424361493123772, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.911, "step": 3885 }, { "epoch": 0.24518664047151276, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.9092, "step": 3900 }, { "epoch": 0.24612966601178782, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 0.894, "step": 3915 }, { "epoch": 0.24707269155206288, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.9096, "step": 3930 }, { "epoch": 0.2480157170923379, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.9147, "step": 3945 }, { "epoch": 0.24895874263261297, "grad_norm": 0.8359375, "learning_rate": 0.001, "loss": 0.9088, "step": 3960 }, { "epoch": 0.249901768172888, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.9116, "step": 3975 }, { "epoch": 0.25084479371316304, "grad_norm": 0.7421875, "learning_rate": 0.001, "loss": 0.901, "step": 3990 }, { "epoch": 0.2517878192534381, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.9013, "step": 4005 }, { "epoch": 0.25273084479371316, "grad_norm": 0.41015625, "learning_rate": 0.001, "loss": 0.903, "step": 4020 }, { "epoch": 0.2536738703339882, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.8916, "step": 4035 }, { "epoch": 0.2546168958742633, "grad_norm": 0.69140625, "learning_rate": 0.001, "loss": 0.897, "step": 4050 }, { "epoch": 0.2555599214145383, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 0.9015, "step": 4065 }, { "epoch": 0.25650294695481335, "grad_norm": 0.453125, "learning_rate": 0.001, "loss": 0.897, "step": 4080 }, { "epoch": 0.2574459724950884, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.8936, "step": 4095 }, { "epoch": 0.25838899803536347, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 0.9048, "step": 4110 }, { "epoch": 0.2593320235756385, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.8973, "step": 4125 }, { "epoch": 0.26027504911591354, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 0.9053, "step": 4140 }, { "epoch": 0.2612180746561886, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.9121, "step": 4155 }, { "epoch": 0.26216110019646366, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 0.89, "step": 4170 }, { "epoch": 0.2631041257367387, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.9025, "step": 4185 }, { "epoch": 0.2640471512770137, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 0.899, "step": 4200 }, { "epoch": 0.2649901768172888, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.8793, "step": 4215 }, { "epoch": 0.26593320235756385, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.8964, "step": 4230 }, { "epoch": 0.2668762278978389, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.896, "step": 4245 }, { "epoch": 0.26781925343811397, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 0.886, "step": 4260 }, { "epoch": 0.268762278978389, "grad_norm": 0.6796875, "learning_rate": 0.001, "loss": 0.8861, "step": 4275 }, { "epoch": 0.26970530451866404, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.8864, "step": 4290 }, { "epoch": 0.2706483300589391, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.8834, "step": 4305 }, { "epoch": 0.27159135559921416, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.8859, "step": 4320 }, { "epoch": 0.2725343811394892, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.8953, "step": 4335 }, { "epoch": 0.2734774066797642, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.8928, "step": 4350 }, { "epoch": 0.2744204322200393, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.8821, "step": 4365 }, { "epoch": 0.27536345776031435, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.8872, "step": 4380 }, { "epoch": 0.2763064833005894, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.8753, "step": 4395 }, { "epoch": 0.27724950884086447, "grad_norm": 0.80859375, "learning_rate": 0.001, "loss": 0.9047, "step": 4410 }, { "epoch": 0.2781925343811395, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.8876, "step": 4425 }, { "epoch": 0.27913555992141453, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.864, "step": 4440 }, { "epoch": 0.28007858546168957, "grad_norm": 0.443359375, "learning_rate": 0.001, "loss": 0.8863, "step": 4455 }, { "epoch": 0.28102161100196466, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.9028, "step": 4470 }, { "epoch": 0.2819646365422397, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8684, "step": 4485 }, { "epoch": 0.2829076620825147, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.8808, "step": 4500 }, { "epoch": 0.2838506876227898, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.8736, "step": 4515 }, { "epoch": 0.28479371316306484, "grad_norm": 0.439453125, "learning_rate": 0.001, "loss": 0.8729, "step": 4530 }, { "epoch": 0.2857367387033399, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.8807, "step": 4545 }, { "epoch": 0.2866797642436149, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.8716, "step": 4560 }, { "epoch": 0.28762278978389, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.8754, "step": 4575 }, { "epoch": 0.28856581532416503, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.866, "step": 4590 }, { "epoch": 0.28950884086444006, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.8661, "step": 4605 }, { "epoch": 0.29045186640471515, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.8797, "step": 4620 }, { "epoch": 0.2913948919449902, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.8523, "step": 4635 }, { "epoch": 0.2923379174852652, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.8774, "step": 4650 }, { "epoch": 0.29328094302554025, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.8785, "step": 4665 }, { "epoch": 0.29422396856581534, "grad_norm": 0.408203125, "learning_rate": 0.001, "loss": 0.8648, "step": 4680 }, { "epoch": 0.2951669941060904, "grad_norm": 0.71484375, "learning_rate": 0.001, "loss": 0.8676, "step": 4695 }, { "epoch": 0.2961100196463654, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8557, "step": 4710 }, { "epoch": 0.2970530451866405, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 0.8694, "step": 4725 }, { "epoch": 0.29799607072691553, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.8459, "step": 4740 }, { "epoch": 0.29893909626719056, "grad_norm": 0.44921875, "learning_rate": 0.001, "loss": 0.8551, "step": 4755 }, { "epoch": 0.2998821218074656, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.8717, "step": 4770 }, { "epoch": 0.2998821218074656, "eval_loss": 1.035895824432373, "eval_runtime": 9.7687, "eval_samples_per_second": 102.368, "eval_steps_per_second": 1.433, "step": 4770 }, { "epoch": 0.3008251473477407, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.8668, "step": 4785 }, { "epoch": 0.3017681728880157, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.8674, "step": 4800 }, { "epoch": 0.30271119842829075, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.8886, "step": 4815 }, { "epoch": 0.30365422396856584, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.854, "step": 4830 }, { "epoch": 0.3045972495088409, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8513, "step": 4845 }, { "epoch": 0.3055402750491159, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.8574, "step": 4860 }, { "epoch": 0.30648330058939094, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.8437, "step": 4875 }, { "epoch": 0.307426326129666, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.8604, "step": 4890 }, { "epoch": 0.30836935166994106, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.8544, "step": 4905 }, { "epoch": 0.3093123772102161, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.8607, "step": 4920 }, { "epoch": 0.3102554027504912, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.8454, "step": 4935 }, { "epoch": 0.3111984282907662, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 0.8575, "step": 4950 }, { "epoch": 0.31214145383104125, "grad_norm": 0.73828125, "learning_rate": 0.001, "loss": 0.8401, "step": 4965 }, { "epoch": 0.3130844793713163, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.8592, "step": 4980 }, { "epoch": 0.31402750491159137, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.8376, "step": 4995 }, { "epoch": 0.3149705304518664, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.853, "step": 5010 }, { "epoch": 0.31591355599214144, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.8659, "step": 5025 }, { "epoch": 0.3168565815324165, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.8733, "step": 5040 }, { "epoch": 0.31779960707269156, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 0.8541, "step": 5055 }, { "epoch": 0.3187426326129666, "grad_norm": 0.66015625, "learning_rate": 0.001, "loss": 0.8474, "step": 5070 }, { "epoch": 0.3196856581532416, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.8421, "step": 5085 }, { "epoch": 0.3206286836935167, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.8501, "step": 5100 }, { "epoch": 0.32157170923379175, "grad_norm": 0.44140625, "learning_rate": 0.001, "loss": 0.8596, "step": 5115 }, { "epoch": 0.3225147347740668, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.8421, "step": 5130 }, { "epoch": 0.32345776031434187, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.8732, "step": 5145 }, { "epoch": 0.3244007858546169, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.8549, "step": 5160 }, { "epoch": 0.32534381139489194, "grad_norm": 0.63671875, "learning_rate": 0.001, "loss": 0.8468, "step": 5175 }, { "epoch": 0.32628683693516697, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.8419, "step": 5190 }, { "epoch": 0.32722986247544206, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 0.8531, "step": 5205 }, { "epoch": 0.3281728880157171, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.848, "step": 5220 }, { "epoch": 0.3291159135559921, "grad_norm": 0.419921875, "learning_rate": 0.001, "loss": 0.8367, "step": 5235 }, { "epoch": 0.3300589390962672, "grad_norm": 0.435546875, "learning_rate": 0.001, "loss": 0.8405, "step": 5250 }, { "epoch": 0.33100196463654225, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.8567, "step": 5265 }, { "epoch": 0.3319449901768173, "grad_norm": 0.66796875, "learning_rate": 0.001, "loss": 0.8572, "step": 5280 }, { "epoch": 0.3328880157170923, "grad_norm": 0.78515625, "learning_rate": 0.001, "loss": 0.8505, "step": 5295 }, { "epoch": 0.3338310412573674, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.8398, "step": 5310 }, { "epoch": 0.33477406679764243, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.8475, "step": 5325 }, { "epoch": 0.33571709233791747, "grad_norm": 0.6796875, "learning_rate": 0.001, "loss": 0.8267, "step": 5340 }, { "epoch": 0.33666011787819256, "grad_norm": 0.6953125, "learning_rate": 0.001, "loss": 0.8442, "step": 5355 }, { "epoch": 0.3376031434184676, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.8605, "step": 5370 }, { "epoch": 0.3385461689587426, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.8458, "step": 5385 }, { "epoch": 0.33948919449901765, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.8474, "step": 5400 }, { "epoch": 0.34043222003929274, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.8507, "step": 5415 }, { "epoch": 0.3413752455795678, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.8449, "step": 5430 }, { "epoch": 0.3423182711198428, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.8456, "step": 5445 }, { "epoch": 0.3432612966601179, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.834, "step": 5460 }, { "epoch": 0.34420432220039293, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.8382, "step": 5475 }, { "epoch": 0.34514734774066796, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 0.8162, "step": 5490 }, { "epoch": 0.346090373280943, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.8331, "step": 5505 }, { "epoch": 0.3470333988212181, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.8461, "step": 5520 }, { "epoch": 0.3479764243614931, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.8277, "step": 5535 }, { "epoch": 0.34891944990176815, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.8261, "step": 5550 }, { "epoch": 0.34986247544204324, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8368, "step": 5565 }, { "epoch": 0.3508055009823183, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.829, "step": 5580 }, { "epoch": 0.3517485265225933, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.8356, "step": 5595 }, { "epoch": 0.3526915520628684, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.8404, "step": 5610 }, { "epoch": 0.35363457760314343, "grad_norm": 0.66796875, "learning_rate": 0.001, "loss": 0.8221, "step": 5625 }, { "epoch": 0.35457760314341846, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.8336, "step": 5640 }, { "epoch": 0.3555206286836935, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.8118, "step": 5655 }, { "epoch": 0.3564636542239686, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 0.8288, "step": 5670 }, { "epoch": 0.3574066797642436, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.8376, "step": 5685 }, { "epoch": 0.35834970530451865, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.8426, "step": 5700 }, { "epoch": 0.35929273084479374, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.8437, "step": 5715 }, { "epoch": 0.3602357563850688, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.8469, "step": 5730 }, { "epoch": 0.3611787819253438, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.8274, "step": 5745 }, { "epoch": 0.36212180746561884, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.8306, "step": 5760 }, { "epoch": 0.3630648330058939, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.8315, "step": 5775 }, { "epoch": 0.36400785854616896, "grad_norm": 0.6640625, "learning_rate": 0.001, "loss": 0.8379, "step": 5790 }, { "epoch": 0.364950884086444, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.8342, "step": 5805 }, { "epoch": 0.3658939096267191, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.8374, "step": 5820 }, { "epoch": 0.3668369351669941, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.8103, "step": 5835 }, { "epoch": 0.36777996070726915, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.8053, "step": 5850 }, { "epoch": 0.3687229862475442, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.8248, "step": 5865 }, { "epoch": 0.36966601178781927, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.8118, "step": 5880 }, { "epoch": 0.3706090373280943, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.8289, "step": 5895 }, { "epoch": 0.37155206286836934, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.8295, "step": 5910 }, { "epoch": 0.3724950884086444, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.8158, "step": 5925 }, { "epoch": 0.37343811394891946, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 0.8235, "step": 5940 }, { "epoch": 0.3743811394891945, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.8148, "step": 5955 }, { "epoch": 0.3753241650294695, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.8161, "step": 5970 }, { "epoch": 0.3762671905697446, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.812, "step": 5985 }, { "epoch": 0.37721021611001965, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.8154, "step": 6000 }, { "epoch": 0.3781532416502947, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.8248, "step": 6015 }, { "epoch": 0.37909626719056977, "grad_norm": 0.7265625, "learning_rate": 0.001, "loss": 0.8104, "step": 6030 }, { "epoch": 0.3800392927308448, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.8228, "step": 6045 }, { "epoch": 0.38098231827111984, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.8392, "step": 6060 }, { "epoch": 0.38192534381139487, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.8352, "step": 6075 }, { "epoch": 0.38286836935166996, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.8271, "step": 6090 }, { "epoch": 0.383811394891945, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.8122, "step": 6105 }, { "epoch": 0.38475442043222, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8221, "step": 6120 }, { "epoch": 0.3856974459724951, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.8354, "step": 6135 }, { "epoch": 0.38664047151277015, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.8277, "step": 6150 }, { "epoch": 0.3875834970530452, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.8263, "step": 6165 }, { "epoch": 0.3885265225933202, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.8122, "step": 6180 }, { "epoch": 0.3894695481335953, "grad_norm": 0.70703125, "learning_rate": 0.001, "loss": 0.8296, "step": 6195 }, { "epoch": 0.39041257367387033, "grad_norm": 0.65234375, "learning_rate": 0.001, "loss": 0.8171, "step": 6210 }, { "epoch": 0.39135559921414537, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.8127, "step": 6225 }, { "epoch": 0.39229862475442046, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.806, "step": 6240 }, { "epoch": 0.3932416502946955, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 0.8157, "step": 6255 }, { "epoch": 0.3941846758349705, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.826, "step": 6270 }, { "epoch": 0.39512770137524555, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 0.8208, "step": 6285 }, { "epoch": 0.39607072691552064, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.8041, "step": 6300 }, { "epoch": 0.3970137524557957, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.8254, "step": 6315 }, { "epoch": 0.3979567779960707, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.8332, "step": 6330 }, { "epoch": 0.3988998035363458, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.8143, "step": 6345 }, { "epoch": 0.39984282907662083, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.8087, "step": 6360 }, { "epoch": 0.39984282907662083, "eval_loss": 0.9629083871841431, "eval_runtime": 9.6716, "eval_samples_per_second": 103.395, "eval_steps_per_second": 1.448, "step": 6360 }, { "epoch": 0.40078585461689586, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.8169, "step": 6375 }, { "epoch": 0.4017288801571709, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.8229, "step": 6390 }, { "epoch": 0.402671905697446, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.8108, "step": 6405 }, { "epoch": 0.403614931237721, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.814, "step": 6420 }, { "epoch": 0.40455795677799605, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.8077, "step": 6435 }, { "epoch": 0.40550098231827114, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.8103, "step": 6450 }, { "epoch": 0.4064440078585462, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.7904, "step": 6465 }, { "epoch": 0.4073870333988212, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.8006, "step": 6480 }, { "epoch": 0.40833005893909624, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.8112, "step": 6495 }, { "epoch": 0.40927308447937133, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.7984, "step": 6510 }, { "epoch": 0.41021611001964636, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7883, "step": 6525 }, { "epoch": 0.4111591355599214, "grad_norm": 1.0625, "learning_rate": 0.001, "loss": 0.8196, "step": 6540 }, { "epoch": 0.4121021611001965, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.8274, "step": 6555 }, { "epoch": 0.4130451866404715, "grad_norm": 0.419921875, "learning_rate": 0.001, "loss": 0.7942, "step": 6570 }, { "epoch": 0.41398821218074655, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7965, "step": 6585 }, { "epoch": 0.4149312377210216, "grad_norm": 0.435546875, "learning_rate": 0.001, "loss": 0.7944, "step": 6600 }, { "epoch": 0.4158742632612967, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.8055, "step": 6615 }, { "epoch": 0.4168172888015717, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.8083, "step": 6630 }, { "epoch": 0.41776031434184674, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.8151, "step": 6645 }, { "epoch": 0.4187033398821218, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.8093, "step": 6660 }, { "epoch": 0.41964636542239686, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.807, "step": 6675 }, { "epoch": 0.4205893909626719, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7884, "step": 6690 }, { "epoch": 0.4215324165029469, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7958, "step": 6705 }, { "epoch": 0.422475442043222, "grad_norm": 0.73046875, "learning_rate": 0.001, "loss": 0.8029, "step": 6720 }, { "epoch": 0.42341846758349705, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 0.804, "step": 6735 }, { "epoch": 0.4243614931237721, "grad_norm": 0.6953125, "learning_rate": 0.001, "loss": 0.8235, "step": 6750 }, { "epoch": 0.42530451866404717, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.8105, "step": 6765 }, { "epoch": 0.4262475442043222, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.8028, "step": 6780 }, { "epoch": 0.42719056974459724, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.8017, "step": 6795 }, { "epoch": 0.4281335952848723, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7998, "step": 6810 }, { "epoch": 0.42907662082514736, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.8083, "step": 6825 }, { "epoch": 0.4300196463654224, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.7701, "step": 6840 }, { "epoch": 0.4309626719056974, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7922, "step": 6855 }, { "epoch": 0.4319056974459725, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7971, "step": 6870 }, { "epoch": 0.43284872298624755, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.795, "step": 6885 }, { "epoch": 0.4337917485265226, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.8004, "step": 6900 }, { "epoch": 0.43473477406679767, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7965, "step": 6915 }, { "epoch": 0.4356777996070727, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7937, "step": 6930 }, { "epoch": 0.43662082514734774, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.8007, "step": 6945 }, { "epoch": 0.43756385068762277, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7935, "step": 6960 }, { "epoch": 0.43850687622789786, "grad_norm": 0.404296875, "learning_rate": 0.001, "loss": 0.8045, "step": 6975 }, { "epoch": 0.4394499017681729, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.8055, "step": 6990 }, { "epoch": 0.4403929273084479, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 0.8005, "step": 7005 }, { "epoch": 0.441335952848723, "grad_norm": 0.72265625, "learning_rate": 0.001, "loss": 0.7881, "step": 7020 }, { "epoch": 0.44227897838899805, "grad_norm": 0.73046875, "learning_rate": 0.001, "loss": 0.8212, "step": 7035 }, { "epoch": 0.4432220039292731, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7984, "step": 7050 }, { "epoch": 0.4441650294695481, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.8078, "step": 7065 }, { "epoch": 0.4451080550098232, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 0.7773, "step": 7080 }, { "epoch": 0.44605108055009823, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.7884, "step": 7095 }, { "epoch": 0.44699410609037327, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7842, "step": 7110 }, { "epoch": 0.44793713163064836, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7854, "step": 7125 }, { "epoch": 0.4488801571709234, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7913, "step": 7140 }, { "epoch": 0.4498231827111984, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7944, "step": 7155 }, { "epoch": 0.45076620825147345, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7935, "step": 7170 }, { "epoch": 0.45170923379174854, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 0.7915, "step": 7185 }, { "epoch": 0.4526522593320236, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.7893, "step": 7200 }, { "epoch": 0.4535952848722986, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.7749, "step": 7215 }, { "epoch": 0.4545383104125737, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 0.7738, "step": 7230 }, { "epoch": 0.45548133595284873, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7832, "step": 7245 }, { "epoch": 0.45642436149312376, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.7935, "step": 7260 }, { "epoch": 0.4573673870333988, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7969, "step": 7275 }, { "epoch": 0.4583104125736739, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7891, "step": 7290 }, { "epoch": 0.4592534381139489, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7854, "step": 7305 }, { "epoch": 0.46019646365422395, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.8013, "step": 7320 }, { "epoch": 0.46113948919449904, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7864, "step": 7335 }, { "epoch": 0.4620825147347741, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7932, "step": 7350 }, { "epoch": 0.4630255402750491, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7866, "step": 7365 }, { "epoch": 0.46396856581532414, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.8011, "step": 7380 }, { "epoch": 0.46491159135559923, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7743, "step": 7395 }, { "epoch": 0.46585461689587426, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.7784, "step": 7410 }, { "epoch": 0.4667976424361493, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7953, "step": 7425 }, { "epoch": 0.4677406679764244, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7807, "step": 7440 }, { "epoch": 0.4686836935166994, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7713, "step": 7455 }, { "epoch": 0.46962671905697445, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7636, "step": 7470 }, { "epoch": 0.4705697445972495, "grad_norm": 0.443359375, "learning_rate": 0.001, "loss": 0.7773, "step": 7485 }, { "epoch": 0.4715127701375246, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.8002, "step": 7500 }, { "epoch": 0.4724557956777996, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 0.7799, "step": 7515 }, { "epoch": 0.47339882121807464, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7776, "step": 7530 }, { "epoch": 0.47434184675834973, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7823, "step": 7545 }, { "epoch": 0.47528487229862476, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.8059, "step": 7560 }, { "epoch": 0.4762278978388998, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7908, "step": 7575 }, { "epoch": 0.4771709233791748, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7923, "step": 7590 }, { "epoch": 0.4781139489194499, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.778, "step": 7605 }, { "epoch": 0.47905697445972495, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.8007, "step": 7620 }, { "epoch": 0.48, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7842, "step": 7635 }, { "epoch": 0.48094302554027507, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 0.7968, "step": 7650 }, { "epoch": 0.4818860510805501, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7812, "step": 7665 }, { "epoch": 0.48282907662082514, "grad_norm": 1.5078125, "learning_rate": 0.001, "loss": 0.7832, "step": 7680 }, { "epoch": 0.48377210216110017, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7915, "step": 7695 }, { "epoch": 0.48471512770137526, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.8046, "step": 7710 }, { "epoch": 0.4856581532416503, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.7674, "step": 7725 }, { "epoch": 0.4866011787819253, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.7795, "step": 7740 }, { "epoch": 0.4875442043222004, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7983, "step": 7755 }, { "epoch": 0.48848722986247545, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7897, "step": 7770 }, { "epoch": 0.4894302554027505, "grad_norm": 0.73828125, "learning_rate": 0.001, "loss": 0.772, "step": 7785 }, { "epoch": 0.4903732809430255, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7795, "step": 7800 }, { "epoch": 0.4913163064833006, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7739, "step": 7815 }, { "epoch": 0.49225933202357564, "grad_norm": 0.66015625, "learning_rate": 0.001, "loss": 0.7891, "step": 7830 }, { "epoch": 0.49320235756385067, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7802, "step": 7845 }, { "epoch": 0.49414538310412576, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7843, "step": 7860 }, { "epoch": 0.4950884086444008, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7756, "step": 7875 }, { "epoch": 0.4960314341846758, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.77, "step": 7890 }, { "epoch": 0.49697445972495086, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7633, "step": 7905 }, { "epoch": 0.49791748526522595, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7842, "step": 7920 }, { "epoch": 0.498860510805501, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7742, "step": 7935 }, { "epoch": 0.499803536345776, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7608, "step": 7950 }, { "epoch": 0.499803536345776, "eval_loss": 0.9156466126441956, "eval_runtime": 9.6921, "eval_samples_per_second": 103.176, "eval_steps_per_second": 1.444, "step": 7950 }, { "epoch": 0.500746561886051, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7861, "step": 7965 }, { "epoch": 0.5016895874263261, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7726, "step": 7980 }, { "epoch": 0.5026326129666012, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7749, "step": 7995 }, { "epoch": 0.5035756385068763, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7686, "step": 8010 }, { "epoch": 0.5045186640471513, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7797, "step": 8025 }, { "epoch": 0.5054616895874263, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7622, "step": 8040 }, { "epoch": 0.5064047151277014, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.7753, "step": 8055 }, { "epoch": 0.5073477406679764, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7744, "step": 8070 }, { "epoch": 0.5082907662082514, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7659, "step": 8085 }, { "epoch": 0.5092337917485266, "grad_norm": 0.69140625, "learning_rate": 0.001, "loss": 0.7883, "step": 8100 }, { "epoch": 0.5101768172888016, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7809, "step": 8115 }, { "epoch": 0.5111198428290766, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.7701, "step": 8130 }, { "epoch": 0.5120628683693517, "grad_norm": 0.6953125, "learning_rate": 0.001, "loss": 0.7659, "step": 8145 }, { "epoch": 0.5130058939096267, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7772, "step": 8160 }, { "epoch": 0.5139489194499017, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.7769, "step": 8175 }, { "epoch": 0.5148919449901768, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7706, "step": 8190 }, { "epoch": 0.5158349705304519, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.7645, "step": 8205 }, { "epoch": 0.5167779960707269, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.7724, "step": 8220 }, { "epoch": 0.517721021611002, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7651, "step": 8235 }, { "epoch": 0.518664047151277, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7703, "step": 8250 }, { "epoch": 0.519607072691552, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7709, "step": 8265 }, { "epoch": 0.5205500982318271, "grad_norm": 0.6796875, "learning_rate": 0.001, "loss": 0.7759, "step": 8280 }, { "epoch": 0.5214931237721021, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7687, "step": 8295 }, { "epoch": 0.5224361493123773, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7735, "step": 8310 }, { "epoch": 0.5233791748526523, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7653, "step": 8325 }, { "epoch": 0.5243222003929273, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.766, "step": 8340 }, { "epoch": 0.5252652259332024, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.768, "step": 8355 }, { "epoch": 0.5262082514734774, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7651, "step": 8370 }, { "epoch": 0.5271512770137524, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.77, "step": 8385 }, { "epoch": 0.5280943025540275, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7671, "step": 8400 }, { "epoch": 0.5290373280943026, "grad_norm": 0.63671875, "learning_rate": 0.001, "loss": 0.7568, "step": 8415 }, { "epoch": 0.5299803536345776, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7719, "step": 8430 }, { "epoch": 0.5309233791748527, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7765, "step": 8445 }, { "epoch": 0.5318664047151277, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7713, "step": 8460 }, { "epoch": 0.5328094302554027, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.7779, "step": 8475 }, { "epoch": 0.5337524557956778, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7675, "step": 8490 }, { "epoch": 0.5346954813359528, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 0.7652, "step": 8505 }, { "epoch": 0.5356385068762279, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7692, "step": 8520 }, { "epoch": 0.536581532416503, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7781, "step": 8535 }, { "epoch": 0.537524557956778, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.765, "step": 8550 }, { "epoch": 0.538467583497053, "grad_norm": 0.84765625, "learning_rate": 0.001, "loss": 0.7549, "step": 8565 }, { "epoch": 0.5394106090373281, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.7709, "step": 8580 }, { "epoch": 0.5403536345776031, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7739, "step": 8595 }, { "epoch": 0.5412966601178782, "grad_norm": 0.76171875, "learning_rate": 0.001, "loss": 0.769, "step": 8610 }, { "epoch": 0.5422396856581533, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.7737, "step": 8625 }, { "epoch": 0.5431827111984283, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7638, "step": 8640 }, { "epoch": 0.5441257367387033, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 0.7392, "step": 8655 }, { "epoch": 0.5450687622789784, "grad_norm": 0.71484375, "learning_rate": 0.001, "loss": 0.7566, "step": 8670 }, { "epoch": 0.5460117878192534, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7592, "step": 8685 }, { "epoch": 0.5469548133595284, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 0.7485, "step": 8700 }, { "epoch": 0.5478978388998036, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7678, "step": 8715 }, { "epoch": 0.5488408644400786, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7634, "step": 8730 }, { "epoch": 0.5497838899803537, "grad_norm": 0.4375, "learning_rate": 0.001, "loss": 0.7471, "step": 8745 }, { "epoch": 0.5507269155206287, "grad_norm": 1.0234375, "learning_rate": 0.001, "loss": 0.7561, "step": 8760 }, { "epoch": 0.5516699410609037, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7622, "step": 8775 }, { "epoch": 0.5526129666011788, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 0.7701, "step": 8790 }, { "epoch": 0.5535559921414538, "grad_norm": 0.7109375, "learning_rate": 0.001, "loss": 0.7728, "step": 8805 }, { "epoch": 0.5544990176817289, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7813, "step": 8820 }, { "epoch": 0.555442043222004, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7614, "step": 8835 }, { "epoch": 0.556385068762279, "grad_norm": 0.7890625, "learning_rate": 0.001, "loss": 0.7766, "step": 8850 }, { "epoch": 0.557328094302554, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7735, "step": 8865 }, { "epoch": 0.5582711198428291, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7641, "step": 8880 }, { "epoch": 0.5592141453831041, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7798, "step": 8895 }, { "epoch": 0.5601571709233791, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7471, "step": 8910 }, { "epoch": 0.5611001964636543, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 0.7625, "step": 8925 }, { "epoch": 0.5620432220039293, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7631, "step": 8940 }, { "epoch": 0.5629862475442043, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7679, "step": 8955 }, { "epoch": 0.5639292730844794, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7647, "step": 8970 }, { "epoch": 0.5648722986247544, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.7674, "step": 8985 }, { "epoch": 0.5658153241650294, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7735, "step": 9000 }, { "epoch": 0.5667583497053045, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7826, "step": 9015 }, { "epoch": 0.5677013752455796, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 0.764, "step": 9030 }, { "epoch": 0.5686444007858547, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7535, "step": 9045 }, { "epoch": 0.5695874263261297, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7588, "step": 9060 }, { "epoch": 0.5705304518664047, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7622, "step": 9075 }, { "epoch": 0.5714734774066798, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7514, "step": 9090 }, { "epoch": 0.5724165029469548, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7593, "step": 9105 }, { "epoch": 0.5733595284872298, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.7677, "step": 9120 }, { "epoch": 0.574302554027505, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7539, "step": 9135 }, { "epoch": 0.57524557956778, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.7475, "step": 9150 }, { "epoch": 0.576188605108055, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.741, "step": 9165 }, { "epoch": 0.5771316306483301, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7533, "step": 9180 }, { "epoch": 0.5780746561886051, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.765, "step": 9195 }, { "epoch": 0.5790176817288801, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7741, "step": 9210 }, { "epoch": 0.5799607072691552, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7598, "step": 9225 }, { "epoch": 0.5809037328094303, "grad_norm": 0.453125, "learning_rate": 0.001, "loss": 0.7539, "step": 9240 }, { "epoch": 0.5818467583497053, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7455, "step": 9255 }, { "epoch": 0.5827897838899804, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.7506, "step": 9270 }, { "epoch": 0.5837328094302554, "grad_norm": 0.74609375, "learning_rate": 0.001, "loss": 0.7555, "step": 9285 }, { "epoch": 0.5846758349705304, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.7635, "step": 9300 }, { "epoch": 0.5856188605108055, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7351, "step": 9315 }, { "epoch": 0.5865618860510805, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7341, "step": 9330 }, { "epoch": 0.5875049115913556, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7525, "step": 9345 }, { "epoch": 0.5884479371316307, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7575, "step": 9360 }, { "epoch": 0.5893909626719057, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7608, "step": 9375 }, { "epoch": 0.5903339882121807, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7602, "step": 9390 }, { "epoch": 0.5912770137524558, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.7615, "step": 9405 }, { "epoch": 0.5922200392927308, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 0.762, "step": 9420 }, { "epoch": 0.5931630648330058, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7635, "step": 9435 }, { "epoch": 0.594106090373281, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 0.7556, "step": 9450 }, { "epoch": 0.595049115913556, "grad_norm": 0.443359375, "learning_rate": 0.001, "loss": 0.7497, "step": 9465 }, { "epoch": 0.5959921414538311, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7419, "step": 9480 }, { "epoch": 0.5969351669941061, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7562, "step": 9495 }, { "epoch": 0.5978781925343811, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7468, "step": 9510 }, { "epoch": 0.5988212180746562, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.7499, "step": 9525 }, { "epoch": 0.5997642436149312, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7683, "step": 9540 }, { "epoch": 0.5997642436149312, "eval_loss": 0.8865543603897095, "eval_runtime": 9.6786, "eval_samples_per_second": 103.32, "eval_steps_per_second": 1.446, "step": 9540 }, { "epoch": 0.6007072691552063, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7574, "step": 9555 }, { "epoch": 0.6016502946954814, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7518, "step": 9570 }, { "epoch": 0.6025933202357564, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7391, "step": 9585 }, { "epoch": 0.6035363457760314, "grad_norm": 0.38671875, "learning_rate": 0.001, "loss": 0.7425, "step": 9600 }, { "epoch": 0.6044793713163065, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7606, "step": 9615 }, { "epoch": 0.6054223968565815, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.7292, "step": 9630 }, { "epoch": 0.6063654223968565, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7356, "step": 9645 }, { "epoch": 0.6073084479371317, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7513, "step": 9660 }, { "epoch": 0.6082514734774067, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7522, "step": 9675 }, { "epoch": 0.6091944990176817, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7563, "step": 9690 }, { "epoch": 0.6101375245579568, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7473, "step": 9705 }, { "epoch": 0.6110805500982318, "grad_norm": 0.66796875, "learning_rate": 0.001, "loss": 0.76, "step": 9720 }, { "epoch": 0.6120235756385068, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7473, "step": 9735 }, { "epoch": 0.6129666011787819, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7416, "step": 9750 }, { "epoch": 0.613909626719057, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7449, "step": 9765 }, { "epoch": 0.614852652259332, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.7509, "step": 9780 }, { "epoch": 0.6157956777996071, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.7468, "step": 9795 }, { "epoch": 0.6167387033398821, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7632, "step": 9810 }, { "epoch": 0.6176817288801572, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7586, "step": 9825 }, { "epoch": 0.6186247544204322, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7495, "step": 9840 }, { "epoch": 0.6195677799607072, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7548, "step": 9855 }, { "epoch": 0.6205108055009824, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7484, "step": 9870 }, { "epoch": 0.6214538310412574, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7683, "step": 9885 }, { "epoch": 0.6223968565815324, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.7332, "step": 9900 }, { "epoch": 0.6233398821218075, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 0.743, "step": 9915 }, { "epoch": 0.6242829076620825, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7527, "step": 9930 }, { "epoch": 0.6252259332023575, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7407, "step": 9945 }, { "epoch": 0.6261689587426326, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 0.756, "step": 9960 }, { "epoch": 0.6271119842829077, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 0.7505, "step": 9975 }, { "epoch": 0.6280550098231827, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7517, "step": 9990 }, { "epoch": 0.6289980353634578, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.766, "step": 10005 }, { "epoch": 0.6299410609037328, "grad_norm": 0.66015625, "learning_rate": 0.001, "loss": 0.7385, "step": 10020 }, { "epoch": 0.6308840864440078, "grad_norm": 0.7265625, "learning_rate": 0.001, "loss": 0.7565, "step": 10035 }, { "epoch": 0.6318271119842829, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7508, "step": 10050 }, { "epoch": 0.6327701375245579, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7519, "step": 10065 }, { "epoch": 0.633713163064833, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.76, "step": 10080 }, { "epoch": 0.6346561886051081, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7326, "step": 10095 }, { "epoch": 0.6355992141453831, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7506, "step": 10110 }, { "epoch": 0.6365422396856582, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7419, "step": 10125 }, { "epoch": 0.6374852652259332, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.7309, "step": 10140 }, { "epoch": 0.6384282907662082, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7367, "step": 10155 }, { "epoch": 0.6393713163064833, "grad_norm": 0.66796875, "learning_rate": 0.001, "loss": 0.7472, "step": 10170 }, { "epoch": 0.6403143418467584, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7431, "step": 10185 }, { "epoch": 0.6412573673870334, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.7496, "step": 10200 }, { "epoch": 0.6422003929273085, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.741, "step": 10215 }, { "epoch": 0.6431434184675835, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7548, "step": 10230 }, { "epoch": 0.6440864440078585, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7615, "step": 10245 }, { "epoch": 0.6450294695481336, "grad_norm": 0.494140625, "learning_rate": 0.001, "loss": 0.764, "step": 10260 }, { "epoch": 0.6459724950884086, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7467, "step": 10275 }, { "epoch": 0.6469155206286837, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.752, "step": 10290 }, { "epoch": 0.6478585461689588, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.7238, "step": 10305 }, { "epoch": 0.6488015717092338, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.7464, "step": 10320 }, { "epoch": 0.6497445972495088, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 0.7376, "step": 10335 }, { "epoch": 0.6506876227897839, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.7378, "step": 10350 }, { "epoch": 0.6516306483300589, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.7536, "step": 10365 }, { "epoch": 0.6525736738703339, "grad_norm": 0.4921875, "learning_rate": 0.001, "loss": 0.732, "step": 10380 }, { "epoch": 0.6535166994106091, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7554, "step": 10395 }, { "epoch": 0.6544597249508841, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.7348, "step": 10410 }, { "epoch": 0.6554027504911591, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7446, "step": 10425 }, { "epoch": 0.6563457760314342, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7386, "step": 10440 }, { "epoch": 0.6572888015717092, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7456, "step": 10455 }, { "epoch": 0.6582318271119842, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7447, "step": 10470 }, { "epoch": 0.6591748526522593, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7466, "step": 10485 }, { "epoch": 0.6601178781925344, "grad_norm": 0.75390625, "learning_rate": 0.001, "loss": 0.7638, "step": 10500 }, { "epoch": 0.6610609037328095, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7454, "step": 10515 }, { "epoch": 0.6620039292730845, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.738, "step": 10530 }, { "epoch": 0.6629469548133595, "grad_norm": 0.66796875, "learning_rate": 0.001, "loss": 0.7443, "step": 10545 }, { "epoch": 0.6638899803536346, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7433, "step": 10560 }, { "epoch": 0.6648330058939096, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 0.7328, "step": 10575 }, { "epoch": 0.6657760314341846, "grad_norm": 0.66015625, "learning_rate": 0.001, "loss": 0.7419, "step": 10590 }, { "epoch": 0.6667190569744598, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7387, "step": 10605 }, { "epoch": 0.6676620825147348, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.7325, "step": 10620 }, { "epoch": 0.6686051080550098, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.737, "step": 10635 }, { "epoch": 0.6695481335952849, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7447, "step": 10650 }, { "epoch": 0.6704911591355599, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7332, "step": 10665 }, { "epoch": 0.6714341846758349, "grad_norm": 0.80078125, "learning_rate": 0.001, "loss": 0.7459, "step": 10680 }, { "epoch": 0.67237721021611, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7389, "step": 10695 }, { "epoch": 0.6733202357563851, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7362, "step": 10710 }, { "epoch": 0.6742632612966601, "grad_norm": 0.67578125, "learning_rate": 0.001, "loss": 0.7297, "step": 10725 }, { "epoch": 0.6752062868369352, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7506, "step": 10740 }, { "epoch": 0.6761493123772102, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7279, "step": 10755 }, { "epoch": 0.6770923379174852, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7329, "step": 10770 }, { "epoch": 0.6780353634577603, "grad_norm": 0.6953125, "learning_rate": 0.001, "loss": 0.736, "step": 10785 }, { "epoch": 0.6789783889980353, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7168, "step": 10800 }, { "epoch": 0.6799214145383105, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7394, "step": 10815 }, { "epoch": 0.6808644400785855, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7165, "step": 10830 }, { "epoch": 0.6818074656188605, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7249, "step": 10845 }, { "epoch": 0.6827504911591356, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.732, "step": 10860 }, { "epoch": 0.6836935166994106, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.747, "step": 10875 }, { "epoch": 0.6846365422396856, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7268, "step": 10890 }, { "epoch": 0.6855795677799607, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7334, "step": 10905 }, { "epoch": 0.6865225933202358, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7243, "step": 10920 }, { "epoch": 0.6874656188605108, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7402, "step": 10935 }, { "epoch": 0.6884086444007859, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.738, "step": 10950 }, { "epoch": 0.6893516699410609, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7309, "step": 10965 }, { "epoch": 0.6902946954813359, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7551, "step": 10980 }, { "epoch": 0.691237721021611, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7438, "step": 10995 }, { "epoch": 0.692180746561886, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7353, "step": 11010 }, { "epoch": 0.6931237721021611, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.728, "step": 11025 }, { "epoch": 0.6940667976424362, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7366, "step": 11040 }, { "epoch": 0.6950098231827112, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7424, "step": 11055 }, { "epoch": 0.6959528487229862, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7434, "step": 11070 }, { "epoch": 0.6968958742632613, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.7371, "step": 11085 }, { "epoch": 0.6978388998035363, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.7326, "step": 11100 }, { "epoch": 0.6987819253438114, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7272, "step": 11115 }, { "epoch": 0.6997249508840865, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.738, "step": 11130 }, { "epoch": 0.6997249508840865, "eval_loss": 0.8602269291877747, "eval_runtime": 9.6753, "eval_samples_per_second": 103.356, "eval_steps_per_second": 1.447, "step": 11130 }, { "epoch": 0.7006679764243615, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7375, "step": 11145 }, { "epoch": 0.7016110019646365, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7545, "step": 11160 }, { "epoch": 0.7025540275049116, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7482, "step": 11175 }, { "epoch": 0.7034970530451866, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7274, "step": 11190 }, { "epoch": 0.7044400785854616, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7241, "step": 11205 }, { "epoch": 0.7053831041257368, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7303, "step": 11220 }, { "epoch": 0.7063261296660118, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7267, "step": 11235 }, { "epoch": 0.7072691552062869, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7267, "step": 11250 }, { "epoch": 0.7082121807465619, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.7309, "step": 11265 }, { "epoch": 0.7091552062868369, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7377, "step": 11280 }, { "epoch": 0.710098231827112, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7306, "step": 11295 }, { "epoch": 0.711041257367387, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7341, "step": 11310 }, { "epoch": 0.7119842829076621, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7349, "step": 11325 }, { "epoch": 0.7129273084479372, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7407, "step": 11340 }, { "epoch": 0.7138703339882122, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7358, "step": 11355 }, { "epoch": 0.7148133595284872, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7254, "step": 11370 }, { "epoch": 0.7157563850687623, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7328, "step": 11385 }, { "epoch": 0.7166994106090373, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7304, "step": 11400 }, { "epoch": 0.7176424361493123, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7317, "step": 11415 }, { "epoch": 0.7185854616895875, "grad_norm": 0.703125, "learning_rate": 0.001, "loss": 0.732, "step": 11430 }, { "epoch": 0.7195284872298625, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7433, "step": 11445 }, { "epoch": 0.7204715127701375, "grad_norm": 0.703125, "learning_rate": 0.001, "loss": 0.7415, "step": 11460 }, { "epoch": 0.7214145383104126, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7285, "step": 11475 }, { "epoch": 0.7223575638506876, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7299, "step": 11490 }, { "epoch": 0.7233005893909626, "grad_norm": 0.72265625, "learning_rate": 0.001, "loss": 0.7314, "step": 11505 }, { "epoch": 0.7242436149312377, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7413, "step": 11520 }, { "epoch": 0.7251866404715128, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7266, "step": 11535 }, { "epoch": 0.7261296660117879, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.7104, "step": 11550 }, { "epoch": 0.7270726915520629, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7252, "step": 11565 }, { "epoch": 0.7280157170923379, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.726, "step": 11580 }, { "epoch": 0.728958742632613, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7242, "step": 11595 }, { "epoch": 0.729901768172888, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7313, "step": 11610 }, { "epoch": 0.730844793713163, "grad_norm": 0.74609375, "learning_rate": 0.001, "loss": 0.7379, "step": 11625 }, { "epoch": 0.7317878192534382, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7394, "step": 11640 }, { "epoch": 0.7327308447937132, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7332, "step": 11655 }, { "epoch": 0.7336738703339882, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7154, "step": 11670 }, { "epoch": 0.7346168958742633, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7351, "step": 11685 }, { "epoch": 0.7355599214145383, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7375, "step": 11700 }, { "epoch": 0.7365029469548133, "grad_norm": 0.6640625, "learning_rate": 0.001, "loss": 0.7363, "step": 11715 }, { "epoch": 0.7374459724950884, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7301, "step": 11730 }, { "epoch": 0.7383889980353635, "grad_norm": 0.7734375, "learning_rate": 0.001, "loss": 0.7287, "step": 11745 }, { "epoch": 0.7393320235756385, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7237, "step": 11760 }, { "epoch": 0.7402750491159136, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7242, "step": 11775 }, { "epoch": 0.7412180746561886, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.7242, "step": 11790 }, { "epoch": 0.7421611001964636, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 0.7171, "step": 11805 }, { "epoch": 0.7431041257367387, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7191, "step": 11820 }, { "epoch": 0.7440471512770137, "grad_norm": 0.439453125, "learning_rate": 0.001, "loss": 0.7323, "step": 11835 }, { "epoch": 0.7449901768172889, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7139, "step": 11850 }, { "epoch": 0.7459332023575639, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7237, "step": 11865 }, { "epoch": 0.7468762278978389, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7136, "step": 11880 }, { "epoch": 0.747819253438114, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7375, "step": 11895 }, { "epoch": 0.748762278978389, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7236, "step": 11910 }, { "epoch": 0.749705304518664, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7416, "step": 11925 }, { "epoch": 0.750648330058939, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.7376, "step": 11940 }, { "epoch": 0.7515913555992142, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7293, "step": 11955 }, { "epoch": 0.7525343811394892, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7274, "step": 11970 }, { "epoch": 0.7534774066797643, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7251, "step": 11985 }, { "epoch": 0.7544204322200393, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7221, "step": 12000 }, { "epoch": 0.7553634577603143, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7269, "step": 12015 }, { "epoch": 0.7563064833005894, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7229, "step": 12030 }, { "epoch": 0.7572495088408644, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7332, "step": 12045 }, { "epoch": 0.7581925343811395, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7425, "step": 12060 }, { "epoch": 0.7591355599214146, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7084, "step": 12075 }, { "epoch": 0.7600785854616896, "grad_norm": 0.453125, "learning_rate": 0.001, "loss": 0.7212, "step": 12090 }, { "epoch": 0.7610216110019646, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7221, "step": 12105 }, { "epoch": 0.7619646365422397, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7125, "step": 12120 }, { "epoch": 0.7629076620825147, "grad_norm": 0.63671875, "learning_rate": 0.001, "loss": 0.7214, "step": 12135 }, { "epoch": 0.7638506876227897, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7211, "step": 12150 }, { "epoch": 0.7647937131630649, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7126, "step": 12165 }, { "epoch": 0.7657367387033399, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.726, "step": 12180 }, { "epoch": 0.766679764243615, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7079, "step": 12195 }, { "epoch": 0.76762278978389, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7282, "step": 12210 }, { "epoch": 0.768565815324165, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 0.7293, "step": 12225 }, { "epoch": 0.76950884086444, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7245, "step": 12240 }, { "epoch": 0.7704518664047151, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7263, "step": 12255 }, { "epoch": 0.7713948919449902, "grad_norm": 0.76953125, "learning_rate": 0.001, "loss": 0.7483, "step": 12270 }, { "epoch": 0.7723379174852653, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7243, "step": 12285 }, { "epoch": 0.7732809430255403, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 0.72, "step": 12300 }, { "epoch": 0.7742239685658153, "grad_norm": 0.7421875, "learning_rate": 0.001, "loss": 0.7145, "step": 12315 }, { "epoch": 0.7751669941060904, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7264, "step": 12330 }, { "epoch": 0.7761100196463654, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7233, "step": 12345 }, { "epoch": 0.7770530451866404, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7132, "step": 12360 }, { "epoch": 0.7779960707269156, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7218, "step": 12375 }, { "epoch": 0.7789390962671906, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.7229, "step": 12390 }, { "epoch": 0.7798821218074656, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.7244, "step": 12405 }, { "epoch": 0.7808251473477407, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7133, "step": 12420 }, { "epoch": 0.7817681728880157, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7165, "step": 12435 }, { "epoch": 0.7827111984282907, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.7125, "step": 12450 }, { "epoch": 0.7836542239685658, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.7025, "step": 12465 }, { "epoch": 0.7845972495088409, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7143, "step": 12480 }, { "epoch": 0.7855402750491159, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7217, "step": 12495 }, { "epoch": 0.786483300589391, "grad_norm": 0.44921875, "learning_rate": 0.001, "loss": 0.7194, "step": 12510 }, { "epoch": 0.787426326129666, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7117, "step": 12525 }, { "epoch": 0.788369351669941, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7125, "step": 12540 }, { "epoch": 0.7893123772102161, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 0.7107, "step": 12555 }, { "epoch": 0.7902554027504911, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7392, "step": 12570 }, { "epoch": 0.7911984282907663, "grad_norm": 0.67578125, "learning_rate": 0.001, "loss": 0.7211, "step": 12585 }, { "epoch": 0.7921414538310413, "grad_norm": 0.9375, "learning_rate": 0.001, "loss": 0.7139, "step": 12600 }, { "epoch": 0.7930844793713163, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.721, "step": 12615 }, { "epoch": 0.7940275049115914, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 0.7258, "step": 12630 }, { "epoch": 0.7949705304518664, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7079, "step": 12645 }, { "epoch": 0.7959135559921414, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.712, "step": 12660 }, { "epoch": 0.7968565815324165, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7296, "step": 12675 }, { "epoch": 0.7977996070726916, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.7146, "step": 12690 }, { "epoch": 0.7987426326129666, "grad_norm": 0.67578125, "learning_rate": 0.001, "loss": 0.7202, "step": 12705 }, { "epoch": 0.7996856581532417, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7257, "step": 12720 }, { "epoch": 0.7996856581532417, "eval_loss": 0.8420960307121277, "eval_runtime": 9.6794, "eval_samples_per_second": 103.312, "eval_steps_per_second": 1.446, "step": 12720 }, { "epoch": 0.8006286836935167, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7087, "step": 12735 }, { "epoch": 0.8015717092337917, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7219, "step": 12750 }, { "epoch": 0.8025147347740668, "grad_norm": 0.48046875, "learning_rate": 0.001, "loss": 0.7241, "step": 12765 }, { "epoch": 0.8034577603143418, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.7211, "step": 12780 }, { "epoch": 0.8044007858546169, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7234, "step": 12795 }, { "epoch": 0.805343811394892, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7214, "step": 12810 }, { "epoch": 0.806286836935167, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7201, "step": 12825 }, { "epoch": 0.807229862475442, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7261, "step": 12840 }, { "epoch": 0.8081728880157171, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7301, "step": 12855 }, { "epoch": 0.8091159135559921, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7104, "step": 12870 }, { "epoch": 0.8100589390962671, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7094, "step": 12885 }, { "epoch": 0.8110019646365423, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7188, "step": 12900 }, { "epoch": 0.8119449901768173, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7296, "step": 12915 }, { "epoch": 0.8128880157170923, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.725, "step": 12930 }, { "epoch": 0.8138310412573674, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.716, "step": 12945 }, { "epoch": 0.8147740667976424, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7199, "step": 12960 }, { "epoch": 0.8157170923379174, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7251, "step": 12975 }, { "epoch": 0.8166601178781925, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.7153, "step": 12990 }, { "epoch": 0.8176031434184676, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.7172, "step": 13005 }, { "epoch": 0.8185461689587427, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7241, "step": 13020 }, { "epoch": 0.8194891944990177, "grad_norm": 0.671875, "learning_rate": 0.001, "loss": 0.7087, "step": 13035 }, { "epoch": 0.8204322200392927, "grad_norm": 0.4375, "learning_rate": 0.001, "loss": 0.7146, "step": 13050 }, { "epoch": 0.8213752455795678, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 0.7137, "step": 13065 }, { "epoch": 0.8223182711198428, "grad_norm": 0.61328125, "learning_rate": 0.001, "loss": 0.7309, "step": 13080 }, { "epoch": 0.8232612966601178, "grad_norm": 0.74609375, "learning_rate": 0.001, "loss": 0.7075, "step": 13095 }, { "epoch": 0.824204322200393, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7187, "step": 13110 }, { "epoch": 0.825147347740668, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7133, "step": 13125 }, { "epoch": 0.826090373280943, "grad_norm": 0.65234375, "learning_rate": 0.001, "loss": 0.7062, "step": 13140 }, { "epoch": 0.8270333988212181, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7139, "step": 13155 }, { "epoch": 0.8279764243614931, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 0.7122, "step": 13170 }, { "epoch": 0.8289194499017681, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7089, "step": 13185 }, { "epoch": 0.8298624754420432, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7148, "step": 13200 }, { "epoch": 0.8308055009823183, "grad_norm": 0.484375, "learning_rate": 0.001, "loss": 0.7165, "step": 13215 }, { "epoch": 0.8317485265225933, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.716, "step": 13230 }, { "epoch": 0.8326915520628684, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7324, "step": 13245 }, { "epoch": 0.8336345776031434, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7028, "step": 13260 }, { "epoch": 0.8345776031434184, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7162, "step": 13275 }, { "epoch": 0.8355206286836935, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7115, "step": 13290 }, { "epoch": 0.8364636542239685, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7323, "step": 13305 }, { "epoch": 0.8374066797642437, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7154, "step": 13320 }, { "epoch": 0.8383497053045187, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7231, "step": 13335 }, { "epoch": 0.8392927308447937, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7308, "step": 13350 }, { "epoch": 0.8402357563850688, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7169, "step": 13365 }, { "epoch": 0.8411787819253438, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7209, "step": 13380 }, { "epoch": 0.8421218074656188, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.706, "step": 13395 }, { "epoch": 0.8430648330058939, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7079, "step": 13410 }, { "epoch": 0.844007858546169, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7062, "step": 13425 }, { "epoch": 0.844950884086444, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7047, "step": 13440 }, { "epoch": 0.8458939096267191, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7179, "step": 13455 }, { "epoch": 0.8468369351669941, "grad_norm": 0.72265625, "learning_rate": 0.001, "loss": 0.7159, "step": 13470 }, { "epoch": 0.8477799607072691, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7152, "step": 13485 }, { "epoch": 0.8487229862475442, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.709, "step": 13500 }, { "epoch": 0.8496660117878193, "grad_norm": 0.498046875, "learning_rate": 0.001, "loss": 0.7158, "step": 13515 }, { "epoch": 0.8506090373280943, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7026, "step": 13530 }, { "epoch": 0.8515520628683694, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7197, "step": 13545 }, { "epoch": 0.8524950884086444, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7271, "step": 13560 }, { "epoch": 0.8534381139489194, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7241, "step": 13575 }, { "epoch": 0.8543811394891945, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7244, "step": 13590 }, { "epoch": 0.8553241650294695, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.7154, "step": 13605 }, { "epoch": 0.8562671905697447, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.7135, "step": 13620 }, { "epoch": 0.8572102161100197, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7095, "step": 13635 }, { "epoch": 0.8581532416502947, "grad_norm": 0.87109375, "learning_rate": 0.001, "loss": 0.7245, "step": 13650 }, { "epoch": 0.8590962671905698, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.7174, "step": 13665 }, { "epoch": 0.8600392927308448, "grad_norm": 0.6875, "learning_rate": 0.001, "loss": 0.7131, "step": 13680 }, { "epoch": 0.8609823182711198, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7227, "step": 13695 }, { "epoch": 0.8619253438113949, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7067, "step": 13710 }, { "epoch": 0.86286836935167, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.7013, "step": 13725 }, { "epoch": 0.863811394891945, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7046, "step": 13740 }, { "epoch": 0.8647544204322201, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7077, "step": 13755 }, { "epoch": 0.8656974459724951, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7064, "step": 13770 }, { "epoch": 0.8666404715127701, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.7177, "step": 13785 }, { "epoch": 0.8675834970530452, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7128, "step": 13800 }, { "epoch": 0.8685265225933202, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.7131, "step": 13815 }, { "epoch": 0.8694695481335953, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.7048, "step": 13830 }, { "epoch": 0.8704125736738704, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7183, "step": 13845 }, { "epoch": 0.8713555992141454, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7087, "step": 13860 }, { "epoch": 0.8722986247544204, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.7117, "step": 13875 }, { "epoch": 0.8732416502946955, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 0.7216, "step": 13890 }, { "epoch": 0.8741846758349705, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.7159, "step": 13905 }, { "epoch": 0.8751277013752455, "grad_norm": 0.75, "learning_rate": 0.001, "loss": 0.7096, "step": 13920 }, { "epoch": 0.8760707269155207, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.702, "step": 13935 }, { "epoch": 0.8770137524557957, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7101, "step": 13950 }, { "epoch": 0.8779567779960707, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7212, "step": 13965 }, { "epoch": 0.8788998035363458, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.7126, "step": 13980 }, { "epoch": 0.8798428290766208, "grad_norm": 0.5078125, "learning_rate": 0.001, "loss": 0.7036, "step": 13995 }, { "epoch": 0.8807858546168958, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7071, "step": 14010 }, { "epoch": 0.8817288801571709, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7051, "step": 14025 }, { "epoch": 0.882671905697446, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.7156, "step": 14040 }, { "epoch": 0.8836149312377211, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.709, "step": 14055 }, { "epoch": 0.8845579567779961, "grad_norm": 0.6875, "learning_rate": 0.001, "loss": 0.7062, "step": 14070 }, { "epoch": 0.8855009823182711, "grad_norm": 0.6015625, "learning_rate": 0.001, "loss": 0.7142, "step": 14085 }, { "epoch": 0.8864440078585462, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7143, "step": 14100 }, { "epoch": 0.8873870333988212, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7093, "step": 14115 }, { "epoch": 0.8883300589390962, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.712, "step": 14130 }, { "epoch": 0.8892730844793714, "grad_norm": 0.51171875, "learning_rate": 0.001, "loss": 0.7085, "step": 14145 }, { "epoch": 0.8902161100196464, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7197, "step": 14160 }, { "epoch": 0.8911591355599214, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7022, "step": 14175 }, { "epoch": 0.8921021611001965, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7019, "step": 14190 }, { "epoch": 0.8930451866404715, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7171, "step": 14205 }, { "epoch": 0.8939882121807465, "grad_norm": 0.7890625, "learning_rate": 0.001, "loss": 0.7052, "step": 14220 }, { "epoch": 0.8949312377210216, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.7029, "step": 14235 }, { "epoch": 0.8958742632612967, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7067, "step": 14250 }, { "epoch": 0.8968172888015717, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.6962, "step": 14265 }, { "epoch": 0.8977603143418468, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.702, "step": 14280 }, { "epoch": 0.8987033398821218, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7066, "step": 14295 }, { "epoch": 0.8996463654223968, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7141, "step": 14310 }, { "epoch": 0.8996463654223968, "eval_loss": 0.8242524266242981, "eval_runtime": 9.6736, "eval_samples_per_second": 103.374, "eval_steps_per_second": 1.447, "step": 14310 }, { "epoch": 0.9005893909626719, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7051, "step": 14325 }, { "epoch": 0.9015324165029469, "grad_norm": 0.6484375, "learning_rate": 0.001, "loss": 0.7161, "step": 14340 }, { "epoch": 0.902475442043222, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 0.6994, "step": 14355 }, { "epoch": 0.9034184675834971, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 0.7121, "step": 14370 }, { "epoch": 0.9043614931237721, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.7232, "step": 14385 }, { "epoch": 0.9053045186640472, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.7122, "step": 14400 }, { "epoch": 0.9062475442043222, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 0.7168, "step": 14415 }, { "epoch": 0.9071905697445972, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 0.6997, "step": 14430 }, { "epoch": 0.9081335952848723, "grad_norm": 0.56640625, "learning_rate": 0.001, "loss": 0.7124, "step": 14445 }, { "epoch": 0.9090766208251474, "grad_norm": 0.486328125, "learning_rate": 0.001, "loss": 0.6995, "step": 14460 }, { "epoch": 0.9100196463654224, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7087, "step": 14475 }, { "epoch": 0.9109626719056975, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.6991, "step": 14490 }, { "epoch": 0.9119056974459725, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.7069, "step": 14505 }, { "epoch": 0.9128487229862475, "grad_norm": 0.625, "learning_rate": 0.001, "loss": 0.701, "step": 14520 }, { "epoch": 0.9137917485265226, "grad_norm": 0.734375, "learning_rate": 0.001, "loss": 0.7111, "step": 14535 }, { "epoch": 0.9147347740667976, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.6989, "step": 14550 }, { "epoch": 0.9156777996070727, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.7243, "step": 14565 }, { "epoch": 0.9166208251473478, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7043, "step": 14580 }, { "epoch": 0.9175638506876228, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.6925, "step": 14595 }, { "epoch": 0.9185068762278978, "grad_norm": 0.7890625, "learning_rate": 0.001, "loss": 0.7129, "step": 14610 }, { "epoch": 0.9194499017681729, "grad_norm": 0.65625, "learning_rate": 0.001, "loss": 0.7064, "step": 14625 }, { "epoch": 0.9203929273084479, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 0.6876, "step": 14640 }, { "epoch": 0.9213359528487229, "grad_norm": 0.6328125, "learning_rate": 0.001, "loss": 0.6978, "step": 14655 }, { "epoch": 0.9222789783889981, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7003, "step": 14670 }, { "epoch": 0.9232220039292731, "grad_norm": 0.4765625, "learning_rate": 0.001, "loss": 0.7009, "step": 14685 }, { "epoch": 0.9241650294695481, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.7093, "step": 14700 }, { "epoch": 0.9251080550098232, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.6927, "step": 14715 }, { "epoch": 0.9260510805500982, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.6995, "step": 14730 }, { "epoch": 0.9269941060903732, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.711, "step": 14745 }, { "epoch": 0.9279371316306483, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.7156, "step": 14760 }, { "epoch": 0.9288801571709234, "grad_norm": 0.72265625, "learning_rate": 0.001, "loss": 0.7173, "step": 14775 }, { "epoch": 0.9298231827111985, "grad_norm": 0.7578125, "learning_rate": 0.001, "loss": 0.7132, "step": 14790 }, { "epoch": 0.9307662082514735, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.6983, "step": 14805 }, { "epoch": 0.9317092337917485, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7047, "step": 14820 }, { "epoch": 0.9326522593320236, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.7115, "step": 14835 }, { "epoch": 0.9335952848722986, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 0.7038, "step": 14850 }, { "epoch": 0.9345383104125736, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7066, "step": 14865 }, { "epoch": 0.9354813359528488, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 0.7062, "step": 14880 }, { "epoch": 0.9364243614931238, "grad_norm": 0.4140625, "learning_rate": 0.001, "loss": 0.6915, "step": 14895 }, { "epoch": 0.9373673870333988, "grad_norm": 0.64453125, "learning_rate": 0.001, "loss": 0.7031, "step": 14910 }, { "epoch": 0.9383104125736739, "grad_norm": 0.6875, "learning_rate": 0.001, "loss": 0.7072, "step": 14925 }, { "epoch": 0.9392534381139489, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.7012, "step": 14940 }, { "epoch": 0.9401964636542239, "grad_norm": 0.70703125, "learning_rate": 0.001, "loss": 0.7211, "step": 14955 }, { "epoch": 0.941139489194499, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 0.7048, "step": 14970 }, { "epoch": 0.9420825147347741, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7016, "step": 14985 }, { "epoch": 0.9430255402750491, "grad_norm": 0.490234375, "learning_rate": 0.001, "loss": 0.7095, "step": 15000 }, { "epoch": 0.9439685658153242, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 0.705, "step": 15015 }, { "epoch": 0.9449115913555992, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.6986, "step": 15030 }, { "epoch": 0.9458546168958742, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 0.7026, "step": 15045 }, { "epoch": 0.9467976424361493, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 0.709, "step": 15060 }, { "epoch": 0.9477406679764243, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 0.712, "step": 15075 }, { "epoch": 0.9486836935166995, "grad_norm": 0.5625, "learning_rate": 0.001, "loss": 0.7126, "step": 15090 }, { "epoch": 0.9496267190569745, "grad_norm": 0.75390625, "learning_rate": 0.001, "loss": 0.6879, "step": 15105 }, { "epoch": 0.9505697445972495, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7031, "step": 15120 }, { "epoch": 0.9515127701375246, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.7146, "step": 15135 }, { "epoch": 0.9524557956777996, "grad_norm": 0.48828125, "learning_rate": 0.001, "loss": 0.6882, "step": 15150 }, { "epoch": 0.9533988212180746, "grad_norm": 0.50390625, "learning_rate": 0.001, "loss": 0.6981, "step": 15165 }, { "epoch": 0.9543418467583497, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7011, "step": 15180 }, { "epoch": 0.9552848722986248, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.698, "step": 15195 }, { "epoch": 0.9562278978388998, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.6932, "step": 15210 }, { "epoch": 0.9571709233791749, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.69, "step": 15225 }, { "epoch": 0.9581139489194499, "grad_norm": 0.609375, "learning_rate": 0.001, "loss": 0.695, "step": 15240 }, { "epoch": 0.9590569744597249, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 0.7002, "step": 15255 }, { "epoch": 0.96, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 0.6943, "step": 15270 }, { "epoch": 0.960943025540275, "grad_norm": 0.58203125, "learning_rate": 0.001, "loss": 0.7044, "step": 15285 }, { "epoch": 0.9618860510805501, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.7069, "step": 15300 }, { "epoch": 0.9628290766208252, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.6985, "step": 15315 }, { "epoch": 0.9637721021611002, "grad_norm": 0.640625, "learning_rate": 0.001, "loss": 0.7049, "step": 15330 }, { "epoch": 0.9647151277013752, "grad_norm": 0.62890625, "learning_rate": 0.001, "loss": 0.7035, "step": 15345 }, { "epoch": 0.9656581532416503, "grad_norm": 0.5234375, "learning_rate": 0.001, "loss": 0.7016, "step": 15360 }, { "epoch": 0.9666011787819253, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.6954, "step": 15375 }, { "epoch": 0.9675442043222003, "grad_norm": 0.5859375, "learning_rate": 0.001, "loss": 0.7014, "step": 15390 }, { "epoch": 0.9684872298624755, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.7129, "step": 15405 }, { "epoch": 0.9694302554027505, "grad_norm": 0.515625, "learning_rate": 0.001, "loss": 0.6999, "step": 15420 }, { "epoch": 0.9703732809430256, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7017, "step": 15435 }, { "epoch": 0.9713163064833006, "grad_norm": 0.546875, "learning_rate": 0.001, "loss": 0.6893, "step": 15450 }, { "epoch": 0.9722593320235756, "grad_norm": 0.71484375, "learning_rate": 0.001, "loss": 0.6993, "step": 15465 }, { "epoch": 0.9732023575638507, "grad_norm": 0.6171875, "learning_rate": 0.001, "loss": 0.6999, "step": 15480 }, { "epoch": 0.9741453831041257, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.6864, "step": 15495 }, { "epoch": 0.9750884086444008, "grad_norm": 0.49609375, "learning_rate": 0.001, "loss": 0.7057, "step": 15510 }, { "epoch": 0.9760314341846759, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.6957, "step": 15525 }, { "epoch": 0.9769744597249509, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 0.709, "step": 15540 }, { "epoch": 0.9779174852652259, "grad_norm": 0.482421875, "learning_rate": 0.001, "loss": 0.6965, "step": 15555 }, { "epoch": 0.978860510805501, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.6989, "step": 15570 }, { "epoch": 0.979803536345776, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.6995, "step": 15585 }, { "epoch": 0.980746561886051, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.6894, "step": 15600 }, { "epoch": 0.9816895874263262, "grad_norm": 0.5390625, "learning_rate": 0.001, "loss": 0.7084, "step": 15615 }, { "epoch": 0.9826326129666012, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 0.7021, "step": 15630 }, { "epoch": 0.9835756385068762, "grad_norm": 0.87109375, "learning_rate": 0.001, "loss": 0.6892, "step": 15645 }, { "epoch": 0.9845186640471513, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 0.7147, "step": 15660 }, { "epoch": 0.9854616895874263, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 0.7007, "step": 15675 }, { "epoch": 0.9864047151277013, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.699, "step": 15690 }, { "epoch": 0.9873477406679764, "grad_norm": 0.875, "learning_rate": 0.001, "loss": 0.6943, "step": 15705 }, { "epoch": 0.9882907662082515, "grad_norm": 0.5546875, "learning_rate": 0.001, "loss": 0.6943, "step": 15720 }, { "epoch": 0.9892337917485265, "grad_norm": 0.466796875, "learning_rate": 0.001, "loss": 0.703, "step": 15735 }, { "epoch": 0.9901768172888016, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.6953, "step": 15750 }, { "epoch": 0.9911198428290766, "grad_norm": 0.71875, "learning_rate": 0.001, "loss": 0.6884, "step": 15765 }, { "epoch": 0.9920628683693516, "grad_norm": 0.68359375, "learning_rate": 0.001, "loss": 0.6972, "step": 15780 }, { "epoch": 0.9930058939096267, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 0.6929, "step": 15795 }, { "epoch": 0.9939489194499017, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 0.6849, "step": 15810 }, { "epoch": 0.9948919449901769, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 0.6932, "step": 15825 }, { "epoch": 0.9958349705304519, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 0.7042, "step": 15840 }, { "epoch": 0.9967779960707269, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 0.6924, "step": 15855 }, { "epoch": 0.997721021611002, "grad_norm": 0.578125, "learning_rate": 0.001, "loss": 0.7009, "step": 15870 }, { "epoch": 0.998664047151277, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 0.7059, "step": 15885 }, { "epoch": 0.999607072691552, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 0.691, "step": 15900 }, { "epoch": 0.999607072691552, "eval_loss": 0.8118711709976196, "eval_runtime": 9.6839, "eval_samples_per_second": 103.264, "eval_steps_per_second": 1.446, "step": 15900 } ], "logging_steps": 15, "max_steps": 15906, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1590, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.185992916964999e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }