diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999842829076621, + "eval_steps": 1590, + "global_step": 15906, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009430255402750491, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 5.5551, + "step": 15 + }, + { + "epoch": 0.0018860510805500982, + "grad_norm": 0.0791015625, + "learning_rate": 0.001, + "loss": 3.5038, + "step": 30 + }, + { + "epoch": 0.002829076620825147, + "grad_norm": 0.1298828125, + "learning_rate": 0.001, + "loss": 3.5068, + "step": 45 + }, + { + "epoch": 0.0037721021611001964, + "grad_norm": 0.0693359375, + "learning_rate": 0.001, + "loss": 3.4288, + "step": 60 + }, + { + "epoch": 0.004715127701375246, + "grad_norm": 0.12255859375, + "learning_rate": 0.001, + "loss": 3.3071, + "step": 75 + }, + { + "epoch": 0.005658153241650294, + "grad_norm": 0.091796875, + "learning_rate": 0.001, + "loss": 3.2653, + "step": 90 + }, + { + "epoch": 0.006601178781925344, + "grad_norm": 0.1318359375, + "learning_rate": 0.001, + "loss": 3.1297, + "step": 105 + }, + { + "epoch": 0.007544204322200393, + "grad_norm": 0.12451171875, + "learning_rate": 0.001, + "loss": 3.0482, + "step": 120 + }, + { + "epoch": 0.008487229862475442, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.9037, + "step": 135 + }, + { + "epoch": 0.009430255402750491, + "grad_norm": 0.1650390625, + "learning_rate": 0.001, + "loss": 2.8178, + "step": 150 + }, + { + "epoch": 0.01037328094302554, + "grad_norm": 0.111328125, + "learning_rate": 0.001, + "loss": 2.687, + "step": 165 + }, + { + "epoch": 0.011316306483300589, + "grad_norm": 0.1640625, + "learning_rate": 0.001, + "loss": 2.6247, + "step": 180 + }, + { + "epoch": 0.01225933202357564, + "grad_norm": 0.1298828125, + "learning_rate": 0.001, + "loss": 2.5556, + "step": 195 + }, + { + "epoch": 0.013202357563850688, + "grad_norm": 0.2451171875, + "learning_rate": 0.001, + "loss": 2.4524, + "step": 210 + }, + { + "epoch": 0.014145383104125737, + "grad_norm": 0.1083984375, + "learning_rate": 0.001, + "loss": 2.4904, + "step": 225 + }, + { + "epoch": 0.015088408644400786, + "grad_norm": 0.1904296875, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 240 + }, + { + "epoch": 0.016031434184675834, + "grad_norm": 0.2412109375, + "learning_rate": 0.001, + "loss": 2.419, + "step": 255 + }, + { + "epoch": 0.016974459724950885, + "grad_norm": 0.130859375, + "learning_rate": 0.001, + "loss": 2.3542, + "step": 270 + }, + { + "epoch": 0.017917485265225932, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2893, + "step": 285 + }, + { + "epoch": 0.018860510805500982, + "grad_norm": 0.1982421875, + "learning_rate": 0.001, + "loss": 2.2671, + "step": 300 + }, + { + "epoch": 0.019803536345776033, + "grad_norm": 0.2109375, + "learning_rate": 0.001, + "loss": 2.2644, + "step": 315 + }, + { + "epoch": 0.02074656188605108, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2669, + "step": 330 + }, + { + "epoch": 0.02168958742632613, + "grad_norm": 0.1962890625, + "learning_rate": 0.001, + "loss": 2.2009, + "step": 345 + }, + { + "epoch": 0.022632612966601177, + "grad_norm": 0.197265625, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 360 + }, + { + "epoch": 0.023575638506876228, + "grad_norm": 0.203125, + "learning_rate": 0.001, + "loss": 2.0607, + "step": 375 + }, + { + "epoch": 0.02451866404715128, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.1118, + "step": 390 + }, + { + "epoch": 0.025461689587426325, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.0465, + "step": 405 + }, + { + "epoch": 0.026404715127701376, + "grad_norm": 0.26953125, + "learning_rate": 0.001, + "loss": 2.0682, + "step": 420 + }, + { + "epoch": 0.027347740667976423, + "grad_norm": 0.158203125, + "learning_rate": 0.001, + "loss": 2.014, + "step": 435 + }, + { + "epoch": 0.028290766208251474, + "grad_norm": 0.2578125, + "learning_rate": 0.001, + "loss": 2.0251, + "step": 450 + }, + { + "epoch": 0.029233791748526524, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 1.991, + "step": 465 + }, + { + "epoch": 0.03017681728880157, + "grad_norm": 0.1884765625, + "learning_rate": 0.001, + "loss": 1.9579, + "step": 480 + }, + { + "epoch": 0.03111984282907662, + "grad_norm": 0.1630859375, + "learning_rate": 0.001, + "loss": 1.9253, + "step": 495 + }, + { + "epoch": 0.03206286836935167, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 1.9019, + "step": 510 + }, + { + "epoch": 0.033005893909626716, + "grad_norm": 0.15234375, + "learning_rate": 0.001, + "loss": 1.9208, + "step": 525 + }, + { + "epoch": 0.03394891944990177, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 1.9165, + "step": 540 + }, + { + "epoch": 0.03489194499017682, + "grad_norm": 0.2734375, + "learning_rate": 0.001, + "loss": 1.8541, + "step": 555 + }, + { + "epoch": 0.035834970530451864, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 1.8854, + "step": 570 + }, + { + "epoch": 0.03677799607072692, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 1.8651, + "step": 585 + }, + { + "epoch": 0.037721021611001965, + "grad_norm": 0.2734375, + "learning_rate": 0.001, + "loss": 1.8392, + "step": 600 + }, + { + "epoch": 0.03866404715127701, + "grad_norm": 0.23828125, + "learning_rate": 0.001, + "loss": 1.843, + "step": 615 + }, + { + "epoch": 0.039607072691552066, + "grad_norm": 0.2578125, + "learning_rate": 0.001, + "loss": 1.7958, + "step": 630 + }, + { + "epoch": 0.04055009823182711, + "grad_norm": 0.197265625, + "learning_rate": 0.001, + "loss": 1.7849, + "step": 645 + }, + { + "epoch": 0.04149312377210216, + "grad_norm": 0.1767578125, + "learning_rate": 0.001, + "loss": 1.7397, + "step": 660 + }, + { + "epoch": 0.04243614931237721, + "grad_norm": 0.255859375, + "learning_rate": 0.001, + "loss": 1.7396, + "step": 675 + }, + { + "epoch": 0.04337917485265226, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 1.7219, + "step": 690 + }, + { + "epoch": 0.04432220039292731, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 1.7536, + "step": 705 + }, + { + "epoch": 0.045265225933202355, + "grad_norm": 0.2021484375, + "learning_rate": 0.001, + "loss": 1.697, + "step": 720 + }, + { + "epoch": 0.04620825147347741, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 1.6725, + "step": 735 + }, + { + "epoch": 0.047151277013752456, + "grad_norm": 0.2080078125, + "learning_rate": 0.001, + "loss": 1.691, + "step": 750 + }, + { + "epoch": 0.0480943025540275, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 1.6721, + "step": 765 + }, + { + "epoch": 0.04903732809430256, + "grad_norm": 0.2451171875, + "learning_rate": 0.001, + "loss": 1.7221, + "step": 780 + }, + { + "epoch": 0.049980353634577604, + "grad_norm": 0.244140625, + "learning_rate": 0.001, + "loss": 1.6609, + "step": 795 + }, + { + "epoch": 0.05092337917485265, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 1.6805, + "step": 810 + }, + { + "epoch": 0.0518664047151277, + "grad_norm": 0.265625, + "learning_rate": 0.001, + "loss": 1.6157, + "step": 825 + }, + { + "epoch": 0.05280943025540275, + "grad_norm": 0.19921875, + "learning_rate": 0.001, + "loss": 1.5996, + "step": 840 + }, + { + "epoch": 0.0537524557956778, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 1.5686, + "step": 855 + }, + { + "epoch": 0.054695481335952846, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 1.6021, + "step": 870 + }, + { + "epoch": 0.0556385068762279, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 1.6159, + "step": 885 + }, + { + "epoch": 0.05658153241650295, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 1.5456, + "step": 900 + }, + { + "epoch": 0.057524557956777994, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 1.5764, + "step": 915 + }, + { + "epoch": 0.05846758349705305, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 1.5426, + "step": 930 + }, + { + "epoch": 0.059410609037328095, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 1.5535, + "step": 945 + }, + { + "epoch": 0.06035363457760314, + "grad_norm": 0.23828125, + "learning_rate": 0.001, + "loss": 1.505, + "step": 960 + }, + { + "epoch": 0.06129666011787819, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 1.5328, + "step": 975 + }, + { + "epoch": 0.06223968565815324, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 1.5274, + "step": 990 + }, + { + "epoch": 0.06318271119842829, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 1.5246, + "step": 1005 + }, + { + "epoch": 0.06412573673870334, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 1.4633, + "step": 1020 + }, + { + "epoch": 0.06506876227897838, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 1.487, + "step": 1035 + }, + { + "epoch": 0.06601178781925343, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 1.4582, + "step": 1050 + }, + { + "epoch": 0.06695481335952849, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 1.4586, + "step": 1065 + }, + { + "epoch": 0.06789783889980354, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 1.4322, + "step": 1080 + }, + { + "epoch": 0.06884086444007859, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 1.47, + "step": 1095 + }, + { + "epoch": 0.06978388998035363, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 1.4215, + "step": 1110 + }, + { + "epoch": 0.07072691552062868, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 1.4569, + "step": 1125 + }, + { + "epoch": 0.07166994106090373, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 1.4428, + "step": 1140 + }, + { + "epoch": 0.07261296660117879, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 1.3861, + "step": 1155 + }, + { + "epoch": 0.07355599214145384, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 1.4478, + "step": 1170 + }, + { + "epoch": 0.07449901768172888, + "grad_norm": 0.27734375, + "learning_rate": 0.001, + "loss": 1.406, + "step": 1185 + }, + { + "epoch": 0.07544204322200393, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 1.3944, + "step": 1200 + }, + { + "epoch": 0.07638506876227898, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 1.3884, + "step": 1215 + }, + { + "epoch": 0.07732809430255402, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 1.38, + "step": 1230 + }, + { + "epoch": 0.07827111984282907, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 1.3446, + "step": 1245 + }, + { + "epoch": 0.07921414538310413, + "grad_norm": 0.28515625, + "learning_rate": 0.001, + "loss": 1.351, + "step": 1260 + }, + { + "epoch": 0.08015717092337918, + "grad_norm": 0.6953125, + "learning_rate": 0.001, + "loss": 1.352, + "step": 1275 + }, + { + "epoch": 0.08110019646365423, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 1.3378, + "step": 1290 + }, + { + "epoch": 0.08204322200392927, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 1.3056, + "step": 1305 + }, + { + "epoch": 0.08298624754420432, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 1.3099, + "step": 1320 + }, + { + "epoch": 0.08392927308447937, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 1.3364, + "step": 1335 + }, + { + "epoch": 0.08487229862475441, + "grad_norm": 0.4296875, + "learning_rate": 0.001, + "loss": 1.2865, + "step": 1350 + }, + { + "epoch": 0.08581532416502947, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 1.3022, + "step": 1365 + }, + { + "epoch": 0.08675834970530452, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 1.2641, + "step": 1380 + }, + { + "epoch": 0.08770137524557957, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 1.291, + "step": 1395 + }, + { + "epoch": 0.08864440078585462, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 1.2947, + "step": 1410 + }, + { + "epoch": 0.08958742632612966, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 1.2626, + "step": 1425 + }, + { + "epoch": 0.09053045186640471, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 1.2719, + "step": 1440 + }, + { + "epoch": 0.09147347740667977, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 1.2817, + "step": 1455 + }, + { + "epoch": 0.09241650294695482, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 1.2678, + "step": 1470 + }, + { + "epoch": 0.09335952848722986, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 1.2336, + "step": 1485 + }, + { + "epoch": 0.09430255402750491, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 1.2415, + "step": 1500 + }, + { + "epoch": 0.09524557956777996, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 1.2478, + "step": 1515 + }, + { + "epoch": 0.096188605108055, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 1.2475, + "step": 1530 + }, + { + "epoch": 0.09713163064833005, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 1.2128, + "step": 1545 + }, + { + "epoch": 0.09807465618860511, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 1.2292, + "step": 1560 + }, + { + "epoch": 0.09901768172888016, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 1.2015, + "step": 1575 + }, + { + "epoch": 0.09996070726915521, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 1.2088, + "step": 1590 + }, + { + "epoch": 0.09996070726915521, + "eval_loss": 1.5537890195846558, + "eval_runtime": 9.6819, + "eval_samples_per_second": 103.285, + "eval_steps_per_second": 1.446, + "step": 1590 + }, + { + "epoch": 0.10090373280943025, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 1.2156, + "step": 1605 + }, + { + "epoch": 0.1018467583497053, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 1.2115, + "step": 1620 + }, + { + "epoch": 0.10278978388998035, + "grad_norm": 0.8359375, + "learning_rate": 0.001, + "loss": 1.2202, + "step": 1635 + }, + { + "epoch": 0.1037328094302554, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 1.2208, + "step": 1650 + }, + { + "epoch": 0.10467583497053046, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 1.1911, + "step": 1665 + }, + { + "epoch": 0.1056188605108055, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 1.2102, + "step": 1680 + }, + { + "epoch": 0.10656188605108055, + "grad_norm": 0.27734375, + "learning_rate": 0.001, + "loss": 1.1984, + "step": 1695 + }, + { + "epoch": 0.1075049115913556, + "grad_norm": 0.90625, + "learning_rate": 0.001, + "loss": 1.2012, + "step": 1710 + }, + { + "epoch": 0.10844793713163065, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 1.1869, + "step": 1725 + }, + { + "epoch": 0.10939096267190569, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 1.1948, + "step": 1740 + }, + { + "epoch": 0.11033398821218075, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 1.1783, + "step": 1755 + }, + { + "epoch": 0.1112770137524558, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 1.1893, + "step": 1770 + }, + { + "epoch": 0.11222003929273085, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 1.1495, + "step": 1785 + }, + { + "epoch": 0.1131630648330059, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 1.175, + "step": 1800 + }, + { + "epoch": 0.11410609037328094, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 1.1588, + "step": 1815 + }, + { + "epoch": 0.11504911591355599, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 1.1376, + "step": 1830 + }, + { + "epoch": 0.11599214145383104, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 1.1511, + "step": 1845 + }, + { + "epoch": 0.1169351669941061, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 1.1645, + "step": 1860 + }, + { + "epoch": 0.11787819253438114, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 1.1619, + "step": 1875 + }, + { + "epoch": 0.11882121807465619, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 1.1304, + "step": 1890 + }, + { + "epoch": 0.11976424361493124, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 1.1361, + "step": 1905 + }, + { + "epoch": 0.12070726915520628, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 1.1151, + "step": 1920 + }, + { + "epoch": 0.12165029469548133, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 1.1299, + "step": 1935 + }, + { + "epoch": 0.12259332023575638, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 1.1334, + "step": 1950 + }, + { + "epoch": 0.12353634577603144, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 1.112, + "step": 1965 + }, + { + "epoch": 0.12447937131630649, + "grad_norm": 0.7890625, + "learning_rate": 0.001, + "loss": 1.1034, + "step": 1980 + }, + { + "epoch": 0.12542239685658152, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 1.12, + "step": 1995 + }, + { + "epoch": 0.12636542239685658, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 1.0996, + "step": 2010 + }, + { + "epoch": 0.12730844793713164, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 1.1141, + "step": 2025 + }, + { + "epoch": 0.12825147347740667, + "grad_norm": 0.6640625, + "learning_rate": 0.001, + "loss": 1.1112, + "step": 2040 + }, + { + "epoch": 0.12919449901768174, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 1.1229, + "step": 2055 + }, + { + "epoch": 0.13013752455795677, + "grad_norm": 0.97265625, + "learning_rate": 0.001, + "loss": 1.074, + "step": 2070 + }, + { + "epoch": 0.13108055009823183, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 1.1199, + "step": 2085 + }, + { + "epoch": 0.13202357563850686, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 1.097, + "step": 2100 + }, + { + "epoch": 0.13296660117878192, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 1.0832, + "step": 2115 + }, + { + "epoch": 0.13390962671905698, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 1.0887, + "step": 2130 + }, + { + "epoch": 0.13485265225933202, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 1.066, + "step": 2145 + }, + { + "epoch": 0.13579567779960708, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 1.0979, + "step": 2160 + }, + { + "epoch": 0.1367387033398821, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 1.101, + "step": 2175 + }, + { + "epoch": 0.13768172888015717, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 1.0761, + "step": 2190 + }, + { + "epoch": 0.13862475442043223, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 1.0845, + "step": 2205 + }, + { + "epoch": 0.13956777996070727, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 1.0938, + "step": 2220 + }, + { + "epoch": 0.14051080550098233, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 1.0659, + "step": 2235 + }, + { + "epoch": 0.14145383104125736, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 1.0683, + "step": 2250 + }, + { + "epoch": 0.14239685658153242, + "grad_norm": 0.66015625, + "learning_rate": 0.001, + "loss": 1.0777, + "step": 2265 + }, + { + "epoch": 0.14333988212180745, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 1.0741, + "step": 2280 + }, + { + "epoch": 0.14428290766208252, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 1.0533, + "step": 2295 + }, + { + "epoch": 0.14522593320235758, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 1.0655, + "step": 2310 + }, + { + "epoch": 0.1461689587426326, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 1.0541, + "step": 2325 + }, + { + "epoch": 0.14711198428290767, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 1.0506, + "step": 2340 + }, + { + "epoch": 0.1480550098231827, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 1.0596, + "step": 2355 + }, + { + "epoch": 0.14899803536345776, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 1.0586, + "step": 2370 + }, + { + "epoch": 0.1499410609037328, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 1.0466, + "step": 2385 + }, + { + "epoch": 0.15088408644400786, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 1.0485, + "step": 2400 + }, + { + "epoch": 0.15182711198428292, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 1.011, + "step": 2415 + }, + { + "epoch": 0.15277013752455795, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 1.0434, + "step": 2430 + }, + { + "epoch": 0.153713163064833, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 1.0353, + "step": 2445 + }, + { + "epoch": 0.15465618860510805, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 1.0222, + "step": 2460 + }, + { + "epoch": 0.1555992141453831, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 1.0403, + "step": 2475 + }, + { + "epoch": 0.15654223968565814, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 1.0397, + "step": 2490 + }, + { + "epoch": 0.1574852652259332, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 1.0382, + "step": 2505 + }, + { + "epoch": 0.15842829076620826, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 1.0336, + "step": 2520 + }, + { + "epoch": 0.1593713163064833, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 1.0083, + "step": 2535 + }, + { + "epoch": 0.16031434184675836, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 1.0236, + "step": 2550 + }, + { + "epoch": 0.1612573673870334, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 1.0245, + "step": 2565 + }, + { + "epoch": 0.16220039292730845, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 1.026, + "step": 2580 + }, + { + "epoch": 0.16314341846758348, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 1.0276, + "step": 2595 + }, + { + "epoch": 0.16408644400785855, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.9937, + "step": 2610 + }, + { + "epoch": 0.1650294695481336, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 1.0249, + "step": 2625 + }, + { + "epoch": 0.16597249508840864, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 1.0096, + "step": 2640 + }, + { + "epoch": 0.1669155206286837, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 1.0195, + "step": 2655 + }, + { + "epoch": 0.16785854616895873, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 1.018, + "step": 2670 + }, + { + "epoch": 0.1688015717092338, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 1.0289, + "step": 2685 + }, + { + "epoch": 0.16974459724950883, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 0.9931, + "step": 2700 + }, + { + "epoch": 0.1706876227897839, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 1.0101, + "step": 2715 + }, + { + "epoch": 0.17163064833005895, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 1.0159, + "step": 2730 + }, + { + "epoch": 0.17257367387033398, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 1.0094, + "step": 2745 + }, + { + "epoch": 0.17351669941060904, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 1.0081, + "step": 2760 + }, + { + "epoch": 0.17445972495088408, + "grad_norm": 0.6640625, + "learning_rate": 0.001, + "loss": 0.9958, + "step": 2775 + }, + { + "epoch": 0.17540275049115914, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.9909, + "step": 2790 + }, + { + "epoch": 0.1763457760314342, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.9854, + "step": 2805 + }, + { + "epoch": 0.17728880157170923, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.9858, + "step": 2820 + }, + { + "epoch": 0.1782318271119843, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.9825, + "step": 2835 + }, + { + "epoch": 0.17917485265225933, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 1.0153, + "step": 2850 + }, + { + "epoch": 0.1801178781925344, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.9984, + "step": 2865 + }, + { + "epoch": 0.18106090373280942, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.9832, + "step": 2880 + }, + { + "epoch": 0.18200392927308448, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.9843, + "step": 2895 + }, + { + "epoch": 0.18294695481335954, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 0.9774, + "step": 2910 + }, + { + "epoch": 0.18388998035363457, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.9824, + "step": 2925 + }, + { + "epoch": 0.18483300589390964, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 0.9884, + "step": 2940 + }, + { + "epoch": 0.18577603143418467, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 0.9684, + "step": 2955 + }, + { + "epoch": 0.18671905697445973, + "grad_norm": 0.6640625, + "learning_rate": 0.001, + "loss": 0.9746, + "step": 2970 + }, + { + "epoch": 0.18766208251473476, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.9831, + "step": 2985 + }, + { + "epoch": 0.18860510805500982, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.9868, + "step": 3000 + }, + { + "epoch": 0.18954813359528488, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.9687, + "step": 3015 + }, + { + "epoch": 0.19049115913555992, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.9759, + "step": 3030 + }, + { + "epoch": 0.19143418467583498, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.9755, + "step": 3045 + }, + { + "epoch": 0.19237721021611, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.9784, + "step": 3060 + }, + { + "epoch": 0.19332023575638507, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.9691, + "step": 3075 + }, + { + "epoch": 0.1942632612966601, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.9851, + "step": 3090 + }, + { + "epoch": 0.19520628683693517, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.9695, + "step": 3105 + }, + { + "epoch": 0.19614931237721023, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.993, + "step": 3120 + }, + { + "epoch": 0.19709233791748526, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.9625, + "step": 3135 + }, + { + "epoch": 0.19803536345776032, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 0.9655, + "step": 3150 + }, + { + "epoch": 0.19897838899803535, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.9606, + "step": 3165 + }, + { + "epoch": 0.19992141453831042, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.9608, + "step": 3180 + }, + { + "epoch": 0.19992141453831042, + "eval_loss": 1.169226050376892, + "eval_runtime": 9.7503, + "eval_samples_per_second": 102.561, + "eval_steps_per_second": 1.436, + "step": 3180 + }, + { + "epoch": 0.20086444007858545, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.9741, + "step": 3195 + }, + { + "epoch": 0.2018074656188605, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.9608, + "step": 3210 + }, + { + "epoch": 0.20275049115913557, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 0.9464, + "step": 3225 + }, + { + "epoch": 0.2036935166994106, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.9683, + "step": 3240 + }, + { + "epoch": 0.20463654223968566, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.9308, + "step": 3255 + }, + { + "epoch": 0.2055795677799607, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 0.9541, + "step": 3270 + }, + { + "epoch": 0.20652259332023576, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.9452, + "step": 3285 + }, + { + "epoch": 0.2074656188605108, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.9673, + "step": 3300 + }, + { + "epoch": 0.20840864440078585, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.9508, + "step": 3315 + }, + { + "epoch": 0.2093516699410609, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.955, + "step": 3330 + }, + { + "epoch": 0.21029469548133595, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.9499, + "step": 3345 + }, + { + "epoch": 0.211237721021611, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.9441, + "step": 3360 + }, + { + "epoch": 0.21218074656188604, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.9476, + "step": 3375 + }, + { + "epoch": 0.2131237721021611, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 0.9506, + "step": 3390 + }, + { + "epoch": 0.21406679764243616, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.9546, + "step": 3405 + }, + { + "epoch": 0.2150098231827112, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.9488, + "step": 3420 + }, + { + "epoch": 0.21595284872298626, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.9473, + "step": 3435 + }, + { + "epoch": 0.2168958742632613, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 0.9491, + "step": 3450 + }, + { + "epoch": 0.21783889980353635, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 0.9304, + "step": 3465 + }, + { + "epoch": 0.21878192534381138, + "grad_norm": 0.80078125, + "learning_rate": 0.001, + "loss": 0.9482, + "step": 3480 + }, + { + "epoch": 0.21972495088408645, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.9418, + "step": 3495 + }, + { + "epoch": 0.2206679764243615, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.9226, + "step": 3510 + }, + { + "epoch": 0.22161100196463654, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 0.9427, + "step": 3525 + }, + { + "epoch": 0.2225540275049116, + "grad_norm": 0.76953125, + "learning_rate": 0.001, + "loss": 0.9261, + "step": 3540 + }, + { + "epoch": 0.22349705304518663, + "grad_norm": 0.69140625, + "learning_rate": 0.001, + "loss": 0.9418, + "step": 3555 + }, + { + "epoch": 0.2244400785854617, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.9382, + "step": 3570 + }, + { + "epoch": 0.22538310412573673, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 0.9353, + "step": 3585 + }, + { + "epoch": 0.2263261296660118, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 0.9138, + "step": 3600 + }, + { + "epoch": 0.22726915520628685, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.9033, + "step": 3615 + }, + { + "epoch": 0.22821218074656188, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.9337, + "step": 3630 + }, + { + "epoch": 0.22915520628683694, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.9188, + "step": 3645 + }, + { + "epoch": 0.23009823182711198, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.9407, + "step": 3660 + }, + { + "epoch": 0.23104125736738704, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 0.9068, + "step": 3675 + }, + { + "epoch": 0.23198428290766207, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.9079, + "step": 3690 + }, + { + "epoch": 0.23292730844793713, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.9095, + "step": 3705 + }, + { + "epoch": 0.2338703339882122, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.9148, + "step": 3720 + }, + { + "epoch": 0.23481335952848723, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.9044, + "step": 3735 + }, + { + "epoch": 0.2357563850687623, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.9401, + "step": 3750 + }, + { + "epoch": 0.23669941060903732, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.9228, + "step": 3765 + }, + { + "epoch": 0.23764243614931238, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 0.9071, + "step": 3780 + }, + { + "epoch": 0.2385854616895874, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.92, + "step": 3795 + }, + { + "epoch": 0.23952848722986247, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.9323, + "step": 3810 + }, + { + "epoch": 0.24047151277013754, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 0.9013, + "step": 3825 + }, + { + "epoch": 0.24141453831041257, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.9045, + "step": 3840 + }, + { + "epoch": 0.24235756385068763, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.9049, + "step": 3855 + }, + { + "epoch": 0.24330058939096266, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.8902, + "step": 3870 + }, + { + "epoch": 0.24424361493123772, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.911, + "step": 3885 + }, + { + "epoch": 0.24518664047151276, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.9092, + "step": 3900 + }, + { + "epoch": 0.24612966601178782, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 0.894, + "step": 3915 + }, + { + "epoch": 0.24707269155206288, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.9096, + "step": 3930 + }, + { + "epoch": 0.2480157170923379, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.9147, + "step": 3945 + }, + { + "epoch": 0.24895874263261297, + "grad_norm": 0.8359375, + "learning_rate": 0.001, + "loss": 0.9088, + "step": 3960 + }, + { + "epoch": 0.249901768172888, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.9116, + "step": 3975 + }, + { + "epoch": 0.25084479371316304, + "grad_norm": 0.7421875, + "learning_rate": 0.001, + "loss": 0.901, + "step": 3990 + }, + { + "epoch": 0.2517878192534381, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.9013, + "step": 4005 + }, + { + "epoch": 0.25273084479371316, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 0.903, + "step": 4020 + }, + { + "epoch": 0.2536738703339882, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.8916, + "step": 4035 + }, + { + "epoch": 0.2546168958742633, + "grad_norm": 0.69140625, + "learning_rate": 0.001, + "loss": 0.897, + "step": 4050 + }, + { + "epoch": 0.2555599214145383, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 0.9015, + "step": 4065 + }, + { + "epoch": 0.25650294695481335, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 0.897, + "step": 4080 + }, + { + "epoch": 0.2574459724950884, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.8936, + "step": 4095 + }, + { + "epoch": 0.25838899803536347, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 0.9048, + "step": 4110 + }, + { + "epoch": 0.2593320235756385, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.8973, + "step": 4125 + }, + { + "epoch": 0.26027504911591354, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 0.9053, + "step": 4140 + }, + { + "epoch": 0.2612180746561886, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.9121, + "step": 4155 + }, + { + "epoch": 0.26216110019646366, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 0.89, + "step": 4170 + }, + { + "epoch": 0.2631041257367387, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.9025, + "step": 4185 + }, + { + "epoch": 0.2640471512770137, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 0.899, + "step": 4200 + }, + { + "epoch": 0.2649901768172888, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.8793, + "step": 4215 + }, + { + "epoch": 0.26593320235756385, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.8964, + "step": 4230 + }, + { + "epoch": 0.2668762278978389, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.896, + "step": 4245 + }, + { + "epoch": 0.26781925343811397, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 0.886, + "step": 4260 + }, + { + "epoch": 0.268762278978389, + "grad_norm": 0.6796875, + "learning_rate": 0.001, + "loss": 0.8861, + "step": 4275 + }, + { + "epoch": 0.26970530451866404, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.8864, + "step": 4290 + }, + { + "epoch": 0.2706483300589391, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.8834, + "step": 4305 + }, + { + "epoch": 0.27159135559921416, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.8859, + "step": 4320 + }, + { + "epoch": 0.2725343811394892, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.8953, + "step": 4335 + }, + { + "epoch": 0.2734774066797642, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.8928, + "step": 4350 + }, + { + "epoch": 0.2744204322200393, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.8821, + "step": 4365 + }, + { + "epoch": 0.27536345776031435, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.8872, + "step": 4380 + }, + { + "epoch": 0.2763064833005894, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.8753, + "step": 4395 + }, + { + "epoch": 0.27724950884086447, + "grad_norm": 0.80859375, + "learning_rate": 0.001, + "loss": 0.9047, + "step": 4410 + }, + { + "epoch": 0.2781925343811395, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.8876, + "step": 4425 + }, + { + "epoch": 0.27913555992141453, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.864, + "step": 4440 + }, + { + "epoch": 0.28007858546168957, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 0.8863, + "step": 4455 + }, + { + "epoch": 0.28102161100196466, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.9028, + "step": 4470 + }, + { + "epoch": 0.2819646365422397, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8684, + "step": 4485 + }, + { + "epoch": 0.2829076620825147, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.8808, + "step": 4500 + }, + { + "epoch": 0.2838506876227898, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.8736, + "step": 4515 + }, + { + "epoch": 0.28479371316306484, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 0.8729, + "step": 4530 + }, + { + "epoch": 0.2857367387033399, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.8807, + "step": 4545 + }, + { + "epoch": 0.2866797642436149, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.8716, + "step": 4560 + }, + { + "epoch": 0.28762278978389, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.8754, + "step": 4575 + }, + { + "epoch": 0.28856581532416503, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.866, + "step": 4590 + }, + { + "epoch": 0.28950884086444006, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.8661, + "step": 4605 + }, + { + "epoch": 0.29045186640471515, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.8797, + "step": 4620 + }, + { + "epoch": 0.2913948919449902, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.8523, + "step": 4635 + }, + { + "epoch": 0.2923379174852652, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.8774, + "step": 4650 + }, + { + "epoch": 0.29328094302554025, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.8785, + "step": 4665 + }, + { + "epoch": 0.29422396856581534, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 0.8648, + "step": 4680 + }, + { + "epoch": 0.2951669941060904, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 0.8676, + "step": 4695 + }, + { + "epoch": 0.2961100196463654, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8557, + "step": 4710 + }, + { + "epoch": 0.2970530451866405, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 0.8694, + "step": 4725 + }, + { + "epoch": 0.29799607072691553, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.8459, + "step": 4740 + }, + { + "epoch": 0.29893909626719056, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 0.8551, + "step": 4755 + }, + { + "epoch": 0.2998821218074656, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.8717, + "step": 4770 + }, + { + "epoch": 0.2998821218074656, + "eval_loss": 1.035895824432373, + "eval_runtime": 9.7687, + "eval_samples_per_second": 102.368, + "eval_steps_per_second": 1.433, + "step": 4770 + }, + { + "epoch": 0.3008251473477407, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.8668, + "step": 4785 + }, + { + "epoch": 0.3017681728880157, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.8674, + "step": 4800 + }, + { + "epoch": 0.30271119842829075, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.8886, + "step": 4815 + }, + { + "epoch": 0.30365422396856584, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.854, + "step": 4830 + }, + { + "epoch": 0.3045972495088409, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8513, + "step": 4845 + }, + { + "epoch": 0.3055402750491159, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.8574, + "step": 4860 + }, + { + "epoch": 0.30648330058939094, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.8437, + "step": 4875 + }, + { + "epoch": 0.307426326129666, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.8604, + "step": 4890 + }, + { + "epoch": 0.30836935166994106, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.8544, + "step": 4905 + }, + { + "epoch": 0.3093123772102161, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.8607, + "step": 4920 + }, + { + "epoch": 0.3102554027504912, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.8454, + "step": 4935 + }, + { + "epoch": 0.3111984282907662, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 0.8575, + "step": 4950 + }, + { + "epoch": 0.31214145383104125, + "grad_norm": 0.73828125, + "learning_rate": 0.001, + "loss": 0.8401, + "step": 4965 + }, + { + "epoch": 0.3130844793713163, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.8592, + "step": 4980 + }, + { + "epoch": 0.31402750491159137, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.8376, + "step": 4995 + }, + { + "epoch": 0.3149705304518664, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.853, + "step": 5010 + }, + { + "epoch": 0.31591355599214144, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.8659, + "step": 5025 + }, + { + "epoch": 0.3168565815324165, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.8733, + "step": 5040 + }, + { + "epoch": 0.31779960707269156, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 0.8541, + "step": 5055 + }, + { + "epoch": 0.3187426326129666, + "grad_norm": 0.66015625, + "learning_rate": 0.001, + "loss": 0.8474, + "step": 5070 + }, + { + "epoch": 0.3196856581532416, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.8421, + "step": 5085 + }, + { + "epoch": 0.3206286836935167, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.8501, + "step": 5100 + }, + { + "epoch": 0.32157170923379175, + "grad_norm": 0.44140625, + "learning_rate": 0.001, + "loss": 0.8596, + "step": 5115 + }, + { + "epoch": 0.3225147347740668, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.8421, + "step": 5130 + }, + { + "epoch": 0.32345776031434187, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.8732, + "step": 5145 + }, + { + "epoch": 0.3244007858546169, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.8549, + "step": 5160 + }, + { + "epoch": 0.32534381139489194, + "grad_norm": 0.63671875, + "learning_rate": 0.001, + "loss": 0.8468, + "step": 5175 + }, + { + "epoch": 0.32628683693516697, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.8419, + "step": 5190 + }, + { + "epoch": 0.32722986247544206, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 0.8531, + "step": 5205 + }, + { + "epoch": 0.3281728880157171, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.848, + "step": 5220 + }, + { + "epoch": 0.3291159135559921, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 0.8367, + "step": 5235 + }, + { + "epoch": 0.3300589390962672, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 0.8405, + "step": 5250 + }, + { + "epoch": 0.33100196463654225, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.8567, + "step": 5265 + }, + { + "epoch": 0.3319449901768173, + "grad_norm": 0.66796875, + "learning_rate": 0.001, + "loss": 0.8572, + "step": 5280 + }, + { + "epoch": 0.3328880157170923, + "grad_norm": 0.78515625, + "learning_rate": 0.001, + "loss": 0.8505, + "step": 5295 + }, + { + "epoch": 0.3338310412573674, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.8398, + "step": 5310 + }, + { + "epoch": 0.33477406679764243, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.8475, + "step": 5325 + }, + { + "epoch": 0.33571709233791747, + "grad_norm": 0.6796875, + "learning_rate": 0.001, + "loss": 0.8267, + "step": 5340 + }, + { + "epoch": 0.33666011787819256, + "grad_norm": 0.6953125, + "learning_rate": 0.001, + "loss": 0.8442, + "step": 5355 + }, + { + "epoch": 0.3376031434184676, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.8605, + "step": 5370 + }, + { + "epoch": 0.3385461689587426, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.8458, + "step": 5385 + }, + { + "epoch": 0.33948919449901765, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.8474, + "step": 5400 + }, + { + "epoch": 0.34043222003929274, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.8507, + "step": 5415 + }, + { + "epoch": 0.3413752455795678, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.8449, + "step": 5430 + }, + { + "epoch": 0.3423182711198428, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.8456, + "step": 5445 + }, + { + "epoch": 0.3432612966601179, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.834, + "step": 5460 + }, + { + "epoch": 0.34420432220039293, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.8382, + "step": 5475 + }, + { + "epoch": 0.34514734774066796, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 0.8162, + "step": 5490 + }, + { + "epoch": 0.346090373280943, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.8331, + "step": 5505 + }, + { + "epoch": 0.3470333988212181, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.8461, + "step": 5520 + }, + { + "epoch": 0.3479764243614931, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.8277, + "step": 5535 + }, + { + "epoch": 0.34891944990176815, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.8261, + "step": 5550 + }, + { + "epoch": 0.34986247544204324, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8368, + "step": 5565 + }, + { + "epoch": 0.3508055009823183, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.829, + "step": 5580 + }, + { + "epoch": 0.3517485265225933, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.8356, + "step": 5595 + }, + { + "epoch": 0.3526915520628684, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.8404, + "step": 5610 + }, + { + "epoch": 0.35363457760314343, + "grad_norm": 0.66796875, + "learning_rate": 0.001, + "loss": 0.8221, + "step": 5625 + }, + { + "epoch": 0.35457760314341846, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.8336, + "step": 5640 + }, + { + "epoch": 0.3555206286836935, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.8118, + "step": 5655 + }, + { + "epoch": 0.3564636542239686, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 0.8288, + "step": 5670 + }, + { + "epoch": 0.3574066797642436, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.8376, + "step": 5685 + }, + { + "epoch": 0.35834970530451865, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.8426, + "step": 5700 + }, + { + "epoch": 0.35929273084479374, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.8437, + "step": 5715 + }, + { + "epoch": 0.3602357563850688, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.8469, + "step": 5730 + }, + { + "epoch": 0.3611787819253438, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.8274, + "step": 5745 + }, + { + "epoch": 0.36212180746561884, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.8306, + "step": 5760 + }, + { + "epoch": 0.3630648330058939, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.8315, + "step": 5775 + }, + { + "epoch": 0.36400785854616896, + "grad_norm": 0.6640625, + "learning_rate": 0.001, + "loss": 0.8379, + "step": 5790 + }, + { + "epoch": 0.364950884086444, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.8342, + "step": 5805 + }, + { + "epoch": 0.3658939096267191, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.8374, + "step": 5820 + }, + { + "epoch": 0.3668369351669941, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.8103, + "step": 5835 + }, + { + "epoch": 0.36777996070726915, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.8053, + "step": 5850 + }, + { + "epoch": 0.3687229862475442, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.8248, + "step": 5865 + }, + { + "epoch": 0.36966601178781927, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.8118, + "step": 5880 + }, + { + "epoch": 0.3706090373280943, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.8289, + "step": 5895 + }, + { + "epoch": 0.37155206286836934, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.8295, + "step": 5910 + }, + { + "epoch": 0.3724950884086444, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.8158, + "step": 5925 + }, + { + "epoch": 0.37343811394891946, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 0.8235, + "step": 5940 + }, + { + "epoch": 0.3743811394891945, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.8148, + "step": 5955 + }, + { + "epoch": 0.3753241650294695, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.8161, + "step": 5970 + }, + { + "epoch": 0.3762671905697446, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.812, + "step": 5985 + }, + { + "epoch": 0.37721021611001965, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.8154, + "step": 6000 + }, + { + "epoch": 0.3781532416502947, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.8248, + "step": 6015 + }, + { + "epoch": 0.37909626719056977, + "grad_norm": 0.7265625, + "learning_rate": 0.001, + "loss": 0.8104, + "step": 6030 + }, + { + "epoch": 0.3800392927308448, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.8228, + "step": 6045 + }, + { + "epoch": 0.38098231827111984, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.8392, + "step": 6060 + }, + { + "epoch": 0.38192534381139487, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.8352, + "step": 6075 + }, + { + "epoch": 0.38286836935166996, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.8271, + "step": 6090 + }, + { + "epoch": 0.383811394891945, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.8122, + "step": 6105 + }, + { + "epoch": 0.38475442043222, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8221, + "step": 6120 + }, + { + "epoch": 0.3856974459724951, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.8354, + "step": 6135 + }, + { + "epoch": 0.38664047151277015, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.8277, + "step": 6150 + }, + { + "epoch": 0.3875834970530452, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.8263, + "step": 6165 + }, + { + "epoch": 0.3885265225933202, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.8122, + "step": 6180 + }, + { + "epoch": 0.3894695481335953, + "grad_norm": 0.70703125, + "learning_rate": 0.001, + "loss": 0.8296, + "step": 6195 + }, + { + "epoch": 0.39041257367387033, + "grad_norm": 0.65234375, + "learning_rate": 0.001, + "loss": 0.8171, + "step": 6210 + }, + { + "epoch": 0.39135559921414537, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.8127, + "step": 6225 + }, + { + "epoch": 0.39229862475442046, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.806, + "step": 6240 + }, + { + "epoch": 0.3932416502946955, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 0.8157, + "step": 6255 + }, + { + "epoch": 0.3941846758349705, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.826, + "step": 6270 + }, + { + "epoch": 0.39512770137524555, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 0.8208, + "step": 6285 + }, + { + "epoch": 0.39607072691552064, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.8041, + "step": 6300 + }, + { + "epoch": 0.3970137524557957, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.8254, + "step": 6315 + }, + { + "epoch": 0.3979567779960707, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.8332, + "step": 6330 + }, + { + "epoch": 0.3988998035363458, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.8143, + "step": 6345 + }, + { + "epoch": 0.39984282907662083, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.8087, + "step": 6360 + }, + { + "epoch": 0.39984282907662083, + "eval_loss": 0.9629083871841431, + "eval_runtime": 9.6716, + "eval_samples_per_second": 103.395, + "eval_steps_per_second": 1.448, + "step": 6360 + }, + { + "epoch": 0.40078585461689586, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.8169, + "step": 6375 + }, + { + "epoch": 0.4017288801571709, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.8229, + "step": 6390 + }, + { + "epoch": 0.402671905697446, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.8108, + "step": 6405 + }, + { + "epoch": 0.403614931237721, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.814, + "step": 6420 + }, + { + "epoch": 0.40455795677799605, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.8077, + "step": 6435 + }, + { + "epoch": 0.40550098231827114, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.8103, + "step": 6450 + }, + { + "epoch": 0.4064440078585462, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.7904, + "step": 6465 + }, + { + "epoch": 0.4073870333988212, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.8006, + "step": 6480 + }, + { + "epoch": 0.40833005893909624, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.8112, + "step": 6495 + }, + { + "epoch": 0.40927308447937133, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.7984, + "step": 6510 + }, + { + "epoch": 0.41021611001964636, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7883, + "step": 6525 + }, + { + "epoch": 0.4111591355599214, + "grad_norm": 1.0625, + "learning_rate": 0.001, + "loss": 0.8196, + "step": 6540 + }, + { + "epoch": 0.4121021611001965, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.8274, + "step": 6555 + }, + { + "epoch": 0.4130451866404715, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 0.7942, + "step": 6570 + }, + { + "epoch": 0.41398821218074655, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7965, + "step": 6585 + }, + { + "epoch": 0.4149312377210216, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 0.7944, + "step": 6600 + }, + { + "epoch": 0.4158742632612967, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.8055, + "step": 6615 + }, + { + "epoch": 0.4168172888015717, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.8083, + "step": 6630 + }, + { + "epoch": 0.41776031434184674, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.8151, + "step": 6645 + }, + { + "epoch": 0.4187033398821218, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.8093, + "step": 6660 + }, + { + "epoch": 0.41964636542239686, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.807, + "step": 6675 + }, + { + "epoch": 0.4205893909626719, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7884, + "step": 6690 + }, + { + "epoch": 0.4215324165029469, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7958, + "step": 6705 + }, + { + "epoch": 0.422475442043222, + "grad_norm": 0.73046875, + "learning_rate": 0.001, + "loss": 0.8029, + "step": 6720 + }, + { + "epoch": 0.42341846758349705, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 0.804, + "step": 6735 + }, + { + "epoch": 0.4243614931237721, + "grad_norm": 0.6953125, + "learning_rate": 0.001, + "loss": 0.8235, + "step": 6750 + }, + { + "epoch": 0.42530451866404717, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.8105, + "step": 6765 + }, + { + "epoch": 0.4262475442043222, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.8028, + "step": 6780 + }, + { + "epoch": 0.42719056974459724, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.8017, + "step": 6795 + }, + { + "epoch": 0.4281335952848723, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7998, + "step": 6810 + }, + { + "epoch": 0.42907662082514736, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.8083, + "step": 6825 + }, + { + "epoch": 0.4300196463654224, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.7701, + "step": 6840 + }, + { + "epoch": 0.4309626719056974, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7922, + "step": 6855 + }, + { + "epoch": 0.4319056974459725, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7971, + "step": 6870 + }, + { + "epoch": 0.43284872298624755, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.795, + "step": 6885 + }, + { + "epoch": 0.4337917485265226, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.8004, + "step": 6900 + }, + { + "epoch": 0.43473477406679767, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7965, + "step": 6915 + }, + { + "epoch": 0.4356777996070727, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7937, + "step": 6930 + }, + { + "epoch": 0.43662082514734774, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.8007, + "step": 6945 + }, + { + "epoch": 0.43756385068762277, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7935, + "step": 6960 + }, + { + "epoch": 0.43850687622789786, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 0.8045, + "step": 6975 + }, + { + "epoch": 0.4394499017681729, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.8055, + "step": 6990 + }, + { + "epoch": 0.4403929273084479, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 0.8005, + "step": 7005 + }, + { + "epoch": 0.441335952848723, + "grad_norm": 0.72265625, + "learning_rate": 0.001, + "loss": 0.7881, + "step": 7020 + }, + { + "epoch": 0.44227897838899805, + "grad_norm": 0.73046875, + "learning_rate": 0.001, + "loss": 0.8212, + "step": 7035 + }, + { + "epoch": 0.4432220039292731, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7984, + "step": 7050 + }, + { + "epoch": 0.4441650294695481, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.8078, + "step": 7065 + }, + { + "epoch": 0.4451080550098232, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 0.7773, + "step": 7080 + }, + { + "epoch": 0.44605108055009823, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.7884, + "step": 7095 + }, + { + "epoch": 0.44699410609037327, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7842, + "step": 7110 + }, + { + "epoch": 0.44793713163064836, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7854, + "step": 7125 + }, + { + "epoch": 0.4488801571709234, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7913, + "step": 7140 + }, + { + "epoch": 0.4498231827111984, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7944, + "step": 7155 + }, + { + "epoch": 0.45076620825147345, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7935, + "step": 7170 + }, + { + "epoch": 0.45170923379174854, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 0.7915, + "step": 7185 + }, + { + "epoch": 0.4526522593320236, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.7893, + "step": 7200 + }, + { + "epoch": 0.4535952848722986, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.7749, + "step": 7215 + }, + { + "epoch": 0.4545383104125737, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 0.7738, + "step": 7230 + }, + { + "epoch": 0.45548133595284873, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7832, + "step": 7245 + }, + { + "epoch": 0.45642436149312376, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.7935, + "step": 7260 + }, + { + "epoch": 0.4573673870333988, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7969, + "step": 7275 + }, + { + "epoch": 0.4583104125736739, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7891, + "step": 7290 + }, + { + "epoch": 0.4592534381139489, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7854, + "step": 7305 + }, + { + "epoch": 0.46019646365422395, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.8013, + "step": 7320 + }, + { + "epoch": 0.46113948919449904, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7864, + "step": 7335 + }, + { + "epoch": 0.4620825147347741, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7932, + "step": 7350 + }, + { + "epoch": 0.4630255402750491, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7866, + "step": 7365 + }, + { + "epoch": 0.46396856581532414, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.8011, + "step": 7380 + }, + { + "epoch": 0.46491159135559923, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7743, + "step": 7395 + }, + { + "epoch": 0.46585461689587426, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.7784, + "step": 7410 + }, + { + "epoch": 0.4667976424361493, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7953, + "step": 7425 + }, + { + "epoch": 0.4677406679764244, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7807, + "step": 7440 + }, + { + "epoch": 0.4686836935166994, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7713, + "step": 7455 + }, + { + "epoch": 0.46962671905697445, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7636, + "step": 7470 + }, + { + "epoch": 0.4705697445972495, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 0.7773, + "step": 7485 + }, + { + "epoch": 0.4715127701375246, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.8002, + "step": 7500 + }, + { + "epoch": 0.4724557956777996, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 0.7799, + "step": 7515 + }, + { + "epoch": 0.47339882121807464, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7776, + "step": 7530 + }, + { + "epoch": 0.47434184675834973, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7823, + "step": 7545 + }, + { + "epoch": 0.47528487229862476, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.8059, + "step": 7560 + }, + { + "epoch": 0.4762278978388998, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7908, + "step": 7575 + }, + { + "epoch": 0.4771709233791748, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7923, + "step": 7590 + }, + { + "epoch": 0.4781139489194499, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.778, + "step": 7605 + }, + { + "epoch": 0.47905697445972495, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.8007, + "step": 7620 + }, + { + "epoch": 0.48, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7842, + "step": 7635 + }, + { + "epoch": 0.48094302554027507, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 0.7968, + "step": 7650 + }, + { + "epoch": 0.4818860510805501, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7812, + "step": 7665 + }, + { + "epoch": 0.48282907662082514, + "grad_norm": 1.5078125, + "learning_rate": 0.001, + "loss": 0.7832, + "step": 7680 + }, + { + "epoch": 0.48377210216110017, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7915, + "step": 7695 + }, + { + "epoch": 0.48471512770137526, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.8046, + "step": 7710 + }, + { + "epoch": 0.4856581532416503, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.7674, + "step": 7725 + }, + { + "epoch": 0.4866011787819253, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.7795, + "step": 7740 + }, + { + "epoch": 0.4875442043222004, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7983, + "step": 7755 + }, + { + "epoch": 0.48848722986247545, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7897, + "step": 7770 + }, + { + "epoch": 0.4894302554027505, + "grad_norm": 0.73828125, + "learning_rate": 0.001, + "loss": 0.772, + "step": 7785 + }, + { + "epoch": 0.4903732809430255, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7795, + "step": 7800 + }, + { + "epoch": 0.4913163064833006, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7739, + "step": 7815 + }, + { + "epoch": 0.49225933202357564, + "grad_norm": 0.66015625, + "learning_rate": 0.001, + "loss": 0.7891, + "step": 7830 + }, + { + "epoch": 0.49320235756385067, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7802, + "step": 7845 + }, + { + "epoch": 0.49414538310412576, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7843, + "step": 7860 + }, + { + "epoch": 0.4950884086444008, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7756, + "step": 7875 + }, + { + "epoch": 0.4960314341846758, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.77, + "step": 7890 + }, + { + "epoch": 0.49697445972495086, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7633, + "step": 7905 + }, + { + "epoch": 0.49791748526522595, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7842, + "step": 7920 + }, + { + "epoch": 0.498860510805501, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7742, + "step": 7935 + }, + { + "epoch": 0.499803536345776, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7608, + "step": 7950 + }, + { + "epoch": 0.499803536345776, + "eval_loss": 0.9156466126441956, + "eval_runtime": 9.6921, + "eval_samples_per_second": 103.176, + "eval_steps_per_second": 1.444, + "step": 7950 + }, + { + "epoch": 0.500746561886051, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7861, + "step": 7965 + }, + { + "epoch": 0.5016895874263261, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7726, + "step": 7980 + }, + { + "epoch": 0.5026326129666012, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7749, + "step": 7995 + }, + { + "epoch": 0.5035756385068763, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7686, + "step": 8010 + }, + { + "epoch": 0.5045186640471513, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7797, + "step": 8025 + }, + { + "epoch": 0.5054616895874263, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7622, + "step": 8040 + }, + { + "epoch": 0.5064047151277014, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.7753, + "step": 8055 + }, + { + "epoch": 0.5073477406679764, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7744, + "step": 8070 + }, + { + "epoch": 0.5082907662082514, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7659, + "step": 8085 + }, + { + "epoch": 0.5092337917485266, + "grad_norm": 0.69140625, + "learning_rate": 0.001, + "loss": 0.7883, + "step": 8100 + }, + { + "epoch": 0.5101768172888016, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7809, + "step": 8115 + }, + { + "epoch": 0.5111198428290766, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.7701, + "step": 8130 + }, + { + "epoch": 0.5120628683693517, + "grad_norm": 0.6953125, + "learning_rate": 0.001, + "loss": 0.7659, + "step": 8145 + }, + { + "epoch": 0.5130058939096267, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7772, + "step": 8160 + }, + { + "epoch": 0.5139489194499017, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.7769, + "step": 8175 + }, + { + "epoch": 0.5148919449901768, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7706, + "step": 8190 + }, + { + "epoch": 0.5158349705304519, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.7645, + "step": 8205 + }, + { + "epoch": 0.5167779960707269, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.7724, + "step": 8220 + }, + { + "epoch": 0.517721021611002, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7651, + "step": 8235 + }, + { + "epoch": 0.518664047151277, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7703, + "step": 8250 + }, + { + "epoch": 0.519607072691552, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7709, + "step": 8265 + }, + { + "epoch": 0.5205500982318271, + "grad_norm": 0.6796875, + "learning_rate": 0.001, + "loss": 0.7759, + "step": 8280 + }, + { + "epoch": 0.5214931237721021, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7687, + "step": 8295 + }, + { + "epoch": 0.5224361493123773, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7735, + "step": 8310 + }, + { + "epoch": 0.5233791748526523, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7653, + "step": 8325 + }, + { + "epoch": 0.5243222003929273, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.766, + "step": 8340 + }, + { + "epoch": 0.5252652259332024, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.768, + "step": 8355 + }, + { + "epoch": 0.5262082514734774, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7651, + "step": 8370 + }, + { + "epoch": 0.5271512770137524, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.77, + "step": 8385 + }, + { + "epoch": 0.5280943025540275, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7671, + "step": 8400 + }, + { + "epoch": 0.5290373280943026, + "grad_norm": 0.63671875, + "learning_rate": 0.001, + "loss": 0.7568, + "step": 8415 + }, + { + "epoch": 0.5299803536345776, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7719, + "step": 8430 + }, + { + "epoch": 0.5309233791748527, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7765, + "step": 8445 + }, + { + "epoch": 0.5318664047151277, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7713, + "step": 8460 + }, + { + "epoch": 0.5328094302554027, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.7779, + "step": 8475 + }, + { + "epoch": 0.5337524557956778, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7675, + "step": 8490 + }, + { + "epoch": 0.5346954813359528, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 0.7652, + "step": 8505 + }, + { + "epoch": 0.5356385068762279, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7692, + "step": 8520 + }, + { + "epoch": 0.536581532416503, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7781, + "step": 8535 + }, + { + "epoch": 0.537524557956778, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.765, + "step": 8550 + }, + { + "epoch": 0.538467583497053, + "grad_norm": 0.84765625, + "learning_rate": 0.001, + "loss": 0.7549, + "step": 8565 + }, + { + "epoch": 0.5394106090373281, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.7709, + "step": 8580 + }, + { + "epoch": 0.5403536345776031, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7739, + "step": 8595 + }, + { + "epoch": 0.5412966601178782, + "grad_norm": 0.76171875, + "learning_rate": 0.001, + "loss": 0.769, + "step": 8610 + }, + { + "epoch": 0.5422396856581533, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.7737, + "step": 8625 + }, + { + "epoch": 0.5431827111984283, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7638, + "step": 8640 + }, + { + "epoch": 0.5441257367387033, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 0.7392, + "step": 8655 + }, + { + "epoch": 0.5450687622789784, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 0.7566, + "step": 8670 + }, + { + "epoch": 0.5460117878192534, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7592, + "step": 8685 + }, + { + "epoch": 0.5469548133595284, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 0.7485, + "step": 8700 + }, + { + "epoch": 0.5478978388998036, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7678, + "step": 8715 + }, + { + "epoch": 0.5488408644400786, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7634, + "step": 8730 + }, + { + "epoch": 0.5497838899803537, + "grad_norm": 0.4375, + "learning_rate": 0.001, + "loss": 0.7471, + "step": 8745 + }, + { + "epoch": 0.5507269155206287, + "grad_norm": 1.0234375, + "learning_rate": 0.001, + "loss": 0.7561, + "step": 8760 + }, + { + "epoch": 0.5516699410609037, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7622, + "step": 8775 + }, + { + "epoch": 0.5526129666011788, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 0.7701, + "step": 8790 + }, + { + "epoch": 0.5535559921414538, + "grad_norm": 0.7109375, + "learning_rate": 0.001, + "loss": 0.7728, + "step": 8805 + }, + { + "epoch": 0.5544990176817289, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7813, + "step": 8820 + }, + { + "epoch": 0.555442043222004, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7614, + "step": 8835 + }, + { + "epoch": 0.556385068762279, + "grad_norm": 0.7890625, + "learning_rate": 0.001, + "loss": 0.7766, + "step": 8850 + }, + { + "epoch": 0.557328094302554, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7735, + "step": 8865 + }, + { + "epoch": 0.5582711198428291, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7641, + "step": 8880 + }, + { + "epoch": 0.5592141453831041, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7798, + "step": 8895 + }, + { + "epoch": 0.5601571709233791, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7471, + "step": 8910 + }, + { + "epoch": 0.5611001964636543, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 0.7625, + "step": 8925 + }, + { + "epoch": 0.5620432220039293, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7631, + "step": 8940 + }, + { + "epoch": 0.5629862475442043, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7679, + "step": 8955 + }, + { + "epoch": 0.5639292730844794, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7647, + "step": 8970 + }, + { + "epoch": 0.5648722986247544, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.7674, + "step": 8985 + }, + { + "epoch": 0.5658153241650294, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7735, + "step": 9000 + }, + { + "epoch": 0.5667583497053045, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7826, + "step": 9015 + }, + { + "epoch": 0.5677013752455796, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 0.764, + "step": 9030 + }, + { + "epoch": 0.5686444007858547, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7535, + "step": 9045 + }, + { + "epoch": 0.5695874263261297, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7588, + "step": 9060 + }, + { + "epoch": 0.5705304518664047, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7622, + "step": 9075 + }, + { + "epoch": 0.5714734774066798, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7514, + "step": 9090 + }, + { + "epoch": 0.5724165029469548, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7593, + "step": 9105 + }, + { + "epoch": 0.5733595284872298, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.7677, + "step": 9120 + }, + { + "epoch": 0.574302554027505, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7539, + "step": 9135 + }, + { + "epoch": 0.57524557956778, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.7475, + "step": 9150 + }, + { + "epoch": 0.576188605108055, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.741, + "step": 9165 + }, + { + "epoch": 0.5771316306483301, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7533, + "step": 9180 + }, + { + "epoch": 0.5780746561886051, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.765, + "step": 9195 + }, + { + "epoch": 0.5790176817288801, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7741, + "step": 9210 + }, + { + "epoch": 0.5799607072691552, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7598, + "step": 9225 + }, + { + "epoch": 0.5809037328094303, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 0.7539, + "step": 9240 + }, + { + "epoch": 0.5818467583497053, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7455, + "step": 9255 + }, + { + "epoch": 0.5827897838899804, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.7506, + "step": 9270 + }, + { + "epoch": 0.5837328094302554, + "grad_norm": 0.74609375, + "learning_rate": 0.001, + "loss": 0.7555, + "step": 9285 + }, + { + "epoch": 0.5846758349705304, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.7635, + "step": 9300 + }, + { + "epoch": 0.5856188605108055, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7351, + "step": 9315 + }, + { + "epoch": 0.5865618860510805, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7341, + "step": 9330 + }, + { + "epoch": 0.5875049115913556, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7525, + "step": 9345 + }, + { + "epoch": 0.5884479371316307, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7575, + "step": 9360 + }, + { + "epoch": 0.5893909626719057, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7608, + "step": 9375 + }, + { + "epoch": 0.5903339882121807, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7602, + "step": 9390 + }, + { + "epoch": 0.5912770137524558, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.7615, + "step": 9405 + }, + { + "epoch": 0.5922200392927308, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 0.762, + "step": 9420 + }, + { + "epoch": 0.5931630648330058, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7635, + "step": 9435 + }, + { + "epoch": 0.594106090373281, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 0.7556, + "step": 9450 + }, + { + "epoch": 0.595049115913556, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 0.7497, + "step": 9465 + }, + { + "epoch": 0.5959921414538311, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7419, + "step": 9480 + }, + { + "epoch": 0.5969351669941061, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7562, + "step": 9495 + }, + { + "epoch": 0.5978781925343811, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7468, + "step": 9510 + }, + { + "epoch": 0.5988212180746562, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.7499, + "step": 9525 + }, + { + "epoch": 0.5997642436149312, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7683, + "step": 9540 + }, + { + "epoch": 0.5997642436149312, + "eval_loss": 0.8865543603897095, + "eval_runtime": 9.6786, + "eval_samples_per_second": 103.32, + "eval_steps_per_second": 1.446, + "step": 9540 + }, + { + "epoch": 0.6007072691552063, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7574, + "step": 9555 + }, + { + "epoch": 0.6016502946954814, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7518, + "step": 9570 + }, + { + "epoch": 0.6025933202357564, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7391, + "step": 9585 + }, + { + "epoch": 0.6035363457760314, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 0.7425, + "step": 9600 + }, + { + "epoch": 0.6044793713163065, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7606, + "step": 9615 + }, + { + "epoch": 0.6054223968565815, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.7292, + "step": 9630 + }, + { + "epoch": 0.6063654223968565, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7356, + "step": 9645 + }, + { + "epoch": 0.6073084479371317, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7513, + "step": 9660 + }, + { + "epoch": 0.6082514734774067, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7522, + "step": 9675 + }, + { + "epoch": 0.6091944990176817, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7563, + "step": 9690 + }, + { + "epoch": 0.6101375245579568, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7473, + "step": 9705 + }, + { + "epoch": 0.6110805500982318, + "grad_norm": 0.66796875, + "learning_rate": 0.001, + "loss": 0.76, + "step": 9720 + }, + { + "epoch": 0.6120235756385068, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7473, + "step": 9735 + }, + { + "epoch": 0.6129666011787819, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7416, + "step": 9750 + }, + { + "epoch": 0.613909626719057, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7449, + "step": 9765 + }, + { + "epoch": 0.614852652259332, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.7509, + "step": 9780 + }, + { + "epoch": 0.6157956777996071, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.7468, + "step": 9795 + }, + { + "epoch": 0.6167387033398821, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7632, + "step": 9810 + }, + { + "epoch": 0.6176817288801572, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7586, + "step": 9825 + }, + { + "epoch": 0.6186247544204322, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7495, + "step": 9840 + }, + { + "epoch": 0.6195677799607072, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7548, + "step": 9855 + }, + { + "epoch": 0.6205108055009824, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7484, + "step": 9870 + }, + { + "epoch": 0.6214538310412574, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7683, + "step": 9885 + }, + { + "epoch": 0.6223968565815324, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.7332, + "step": 9900 + }, + { + "epoch": 0.6233398821218075, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 0.743, + "step": 9915 + }, + { + "epoch": 0.6242829076620825, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7527, + "step": 9930 + }, + { + "epoch": 0.6252259332023575, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7407, + "step": 9945 + }, + { + "epoch": 0.6261689587426326, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 0.756, + "step": 9960 + }, + { + "epoch": 0.6271119842829077, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 0.7505, + "step": 9975 + }, + { + "epoch": 0.6280550098231827, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7517, + "step": 9990 + }, + { + "epoch": 0.6289980353634578, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.766, + "step": 10005 + }, + { + "epoch": 0.6299410609037328, + "grad_norm": 0.66015625, + "learning_rate": 0.001, + "loss": 0.7385, + "step": 10020 + }, + { + "epoch": 0.6308840864440078, + "grad_norm": 0.7265625, + "learning_rate": 0.001, + "loss": 0.7565, + "step": 10035 + }, + { + "epoch": 0.6318271119842829, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7508, + "step": 10050 + }, + { + "epoch": 0.6327701375245579, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7519, + "step": 10065 + }, + { + "epoch": 0.633713163064833, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.76, + "step": 10080 + }, + { + "epoch": 0.6346561886051081, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7326, + "step": 10095 + }, + { + "epoch": 0.6355992141453831, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7506, + "step": 10110 + }, + { + "epoch": 0.6365422396856582, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7419, + "step": 10125 + }, + { + "epoch": 0.6374852652259332, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.7309, + "step": 10140 + }, + { + "epoch": 0.6384282907662082, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7367, + "step": 10155 + }, + { + "epoch": 0.6393713163064833, + "grad_norm": 0.66796875, + "learning_rate": 0.001, + "loss": 0.7472, + "step": 10170 + }, + { + "epoch": 0.6403143418467584, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7431, + "step": 10185 + }, + { + "epoch": 0.6412573673870334, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.7496, + "step": 10200 + }, + { + "epoch": 0.6422003929273085, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.741, + "step": 10215 + }, + { + "epoch": 0.6431434184675835, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7548, + "step": 10230 + }, + { + "epoch": 0.6440864440078585, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7615, + "step": 10245 + }, + { + "epoch": 0.6450294695481336, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 0.764, + "step": 10260 + }, + { + "epoch": 0.6459724950884086, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7467, + "step": 10275 + }, + { + "epoch": 0.6469155206286837, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.752, + "step": 10290 + }, + { + "epoch": 0.6478585461689588, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.7238, + "step": 10305 + }, + { + "epoch": 0.6488015717092338, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.7464, + "step": 10320 + }, + { + "epoch": 0.6497445972495088, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 0.7376, + "step": 10335 + }, + { + "epoch": 0.6506876227897839, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.7378, + "step": 10350 + }, + { + "epoch": 0.6516306483300589, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.7536, + "step": 10365 + }, + { + "epoch": 0.6525736738703339, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 0.732, + "step": 10380 + }, + { + "epoch": 0.6535166994106091, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7554, + "step": 10395 + }, + { + "epoch": 0.6544597249508841, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.7348, + "step": 10410 + }, + { + "epoch": 0.6554027504911591, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7446, + "step": 10425 + }, + { + "epoch": 0.6563457760314342, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7386, + "step": 10440 + }, + { + "epoch": 0.6572888015717092, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7456, + "step": 10455 + }, + { + "epoch": 0.6582318271119842, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7447, + "step": 10470 + }, + { + "epoch": 0.6591748526522593, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7466, + "step": 10485 + }, + { + "epoch": 0.6601178781925344, + "grad_norm": 0.75390625, + "learning_rate": 0.001, + "loss": 0.7638, + "step": 10500 + }, + { + "epoch": 0.6610609037328095, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7454, + "step": 10515 + }, + { + "epoch": 0.6620039292730845, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.738, + "step": 10530 + }, + { + "epoch": 0.6629469548133595, + "grad_norm": 0.66796875, + "learning_rate": 0.001, + "loss": 0.7443, + "step": 10545 + }, + { + "epoch": 0.6638899803536346, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7433, + "step": 10560 + }, + { + "epoch": 0.6648330058939096, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 0.7328, + "step": 10575 + }, + { + "epoch": 0.6657760314341846, + "grad_norm": 0.66015625, + "learning_rate": 0.001, + "loss": 0.7419, + "step": 10590 + }, + { + "epoch": 0.6667190569744598, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7387, + "step": 10605 + }, + { + "epoch": 0.6676620825147348, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.7325, + "step": 10620 + }, + { + "epoch": 0.6686051080550098, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.737, + "step": 10635 + }, + { + "epoch": 0.6695481335952849, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7447, + "step": 10650 + }, + { + "epoch": 0.6704911591355599, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7332, + "step": 10665 + }, + { + "epoch": 0.6714341846758349, + "grad_norm": 0.80078125, + "learning_rate": 0.001, + "loss": 0.7459, + "step": 10680 + }, + { + "epoch": 0.67237721021611, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7389, + "step": 10695 + }, + { + "epoch": 0.6733202357563851, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7362, + "step": 10710 + }, + { + "epoch": 0.6742632612966601, + "grad_norm": 0.67578125, + "learning_rate": 0.001, + "loss": 0.7297, + "step": 10725 + }, + { + "epoch": 0.6752062868369352, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7506, + "step": 10740 + }, + { + "epoch": 0.6761493123772102, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7279, + "step": 10755 + }, + { + "epoch": 0.6770923379174852, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7329, + "step": 10770 + }, + { + "epoch": 0.6780353634577603, + "grad_norm": 0.6953125, + "learning_rate": 0.001, + "loss": 0.736, + "step": 10785 + }, + { + "epoch": 0.6789783889980353, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7168, + "step": 10800 + }, + { + "epoch": 0.6799214145383105, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7394, + "step": 10815 + }, + { + "epoch": 0.6808644400785855, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7165, + "step": 10830 + }, + { + "epoch": 0.6818074656188605, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7249, + "step": 10845 + }, + { + "epoch": 0.6827504911591356, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.732, + "step": 10860 + }, + { + "epoch": 0.6836935166994106, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.747, + "step": 10875 + }, + { + "epoch": 0.6846365422396856, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7268, + "step": 10890 + }, + { + "epoch": 0.6855795677799607, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7334, + "step": 10905 + }, + { + "epoch": 0.6865225933202358, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7243, + "step": 10920 + }, + { + "epoch": 0.6874656188605108, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7402, + "step": 10935 + }, + { + "epoch": 0.6884086444007859, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.738, + "step": 10950 + }, + { + "epoch": 0.6893516699410609, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7309, + "step": 10965 + }, + { + "epoch": 0.6902946954813359, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7551, + "step": 10980 + }, + { + "epoch": 0.691237721021611, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7438, + "step": 10995 + }, + { + "epoch": 0.692180746561886, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7353, + "step": 11010 + }, + { + "epoch": 0.6931237721021611, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.728, + "step": 11025 + }, + { + "epoch": 0.6940667976424362, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7366, + "step": 11040 + }, + { + "epoch": 0.6950098231827112, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7424, + "step": 11055 + }, + { + "epoch": 0.6959528487229862, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7434, + "step": 11070 + }, + { + "epoch": 0.6968958742632613, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.7371, + "step": 11085 + }, + { + "epoch": 0.6978388998035363, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.7326, + "step": 11100 + }, + { + "epoch": 0.6987819253438114, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7272, + "step": 11115 + }, + { + "epoch": 0.6997249508840865, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.738, + "step": 11130 + }, + { + "epoch": 0.6997249508840865, + "eval_loss": 0.8602269291877747, + "eval_runtime": 9.6753, + "eval_samples_per_second": 103.356, + "eval_steps_per_second": 1.447, + "step": 11130 + }, + { + "epoch": 0.7006679764243615, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7375, + "step": 11145 + }, + { + "epoch": 0.7016110019646365, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7545, + "step": 11160 + }, + { + "epoch": 0.7025540275049116, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7482, + "step": 11175 + }, + { + "epoch": 0.7034970530451866, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7274, + "step": 11190 + }, + { + "epoch": 0.7044400785854616, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7241, + "step": 11205 + }, + { + "epoch": 0.7053831041257368, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7303, + "step": 11220 + }, + { + "epoch": 0.7063261296660118, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7267, + "step": 11235 + }, + { + "epoch": 0.7072691552062869, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7267, + "step": 11250 + }, + { + "epoch": 0.7082121807465619, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.7309, + "step": 11265 + }, + { + "epoch": 0.7091552062868369, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7377, + "step": 11280 + }, + { + "epoch": 0.710098231827112, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7306, + "step": 11295 + }, + { + "epoch": 0.711041257367387, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7341, + "step": 11310 + }, + { + "epoch": 0.7119842829076621, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7349, + "step": 11325 + }, + { + "epoch": 0.7129273084479372, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7407, + "step": 11340 + }, + { + "epoch": 0.7138703339882122, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7358, + "step": 11355 + }, + { + "epoch": 0.7148133595284872, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7254, + "step": 11370 + }, + { + "epoch": 0.7157563850687623, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7328, + "step": 11385 + }, + { + "epoch": 0.7166994106090373, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7304, + "step": 11400 + }, + { + "epoch": 0.7176424361493123, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7317, + "step": 11415 + }, + { + "epoch": 0.7185854616895875, + "grad_norm": 0.703125, + "learning_rate": 0.001, + "loss": 0.732, + "step": 11430 + }, + { + "epoch": 0.7195284872298625, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7433, + "step": 11445 + }, + { + "epoch": 0.7204715127701375, + "grad_norm": 0.703125, + "learning_rate": 0.001, + "loss": 0.7415, + "step": 11460 + }, + { + "epoch": 0.7214145383104126, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7285, + "step": 11475 + }, + { + "epoch": 0.7223575638506876, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7299, + "step": 11490 + }, + { + "epoch": 0.7233005893909626, + "grad_norm": 0.72265625, + "learning_rate": 0.001, + "loss": 0.7314, + "step": 11505 + }, + { + "epoch": 0.7242436149312377, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7413, + "step": 11520 + }, + { + "epoch": 0.7251866404715128, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7266, + "step": 11535 + }, + { + "epoch": 0.7261296660117879, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.7104, + "step": 11550 + }, + { + "epoch": 0.7270726915520629, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7252, + "step": 11565 + }, + { + "epoch": 0.7280157170923379, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.726, + "step": 11580 + }, + { + "epoch": 0.728958742632613, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7242, + "step": 11595 + }, + { + "epoch": 0.729901768172888, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7313, + "step": 11610 + }, + { + "epoch": 0.730844793713163, + "grad_norm": 0.74609375, + "learning_rate": 0.001, + "loss": 0.7379, + "step": 11625 + }, + { + "epoch": 0.7317878192534382, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7394, + "step": 11640 + }, + { + "epoch": 0.7327308447937132, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7332, + "step": 11655 + }, + { + "epoch": 0.7336738703339882, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7154, + "step": 11670 + }, + { + "epoch": 0.7346168958742633, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7351, + "step": 11685 + }, + { + "epoch": 0.7355599214145383, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7375, + "step": 11700 + }, + { + "epoch": 0.7365029469548133, + "grad_norm": 0.6640625, + "learning_rate": 0.001, + "loss": 0.7363, + "step": 11715 + }, + { + "epoch": 0.7374459724950884, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7301, + "step": 11730 + }, + { + "epoch": 0.7383889980353635, + "grad_norm": 0.7734375, + "learning_rate": 0.001, + "loss": 0.7287, + "step": 11745 + }, + { + "epoch": 0.7393320235756385, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7237, + "step": 11760 + }, + { + "epoch": 0.7402750491159136, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7242, + "step": 11775 + }, + { + "epoch": 0.7412180746561886, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.7242, + "step": 11790 + }, + { + "epoch": 0.7421611001964636, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 0.7171, + "step": 11805 + }, + { + "epoch": 0.7431041257367387, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7191, + "step": 11820 + }, + { + "epoch": 0.7440471512770137, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 0.7323, + "step": 11835 + }, + { + "epoch": 0.7449901768172889, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7139, + "step": 11850 + }, + { + "epoch": 0.7459332023575639, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7237, + "step": 11865 + }, + { + "epoch": 0.7468762278978389, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7136, + "step": 11880 + }, + { + "epoch": 0.747819253438114, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7375, + "step": 11895 + }, + { + "epoch": 0.748762278978389, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7236, + "step": 11910 + }, + { + "epoch": 0.749705304518664, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7416, + "step": 11925 + }, + { + "epoch": 0.750648330058939, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.7376, + "step": 11940 + }, + { + "epoch": 0.7515913555992142, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7293, + "step": 11955 + }, + { + "epoch": 0.7525343811394892, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7274, + "step": 11970 + }, + { + "epoch": 0.7534774066797643, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7251, + "step": 11985 + }, + { + "epoch": 0.7544204322200393, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7221, + "step": 12000 + }, + { + "epoch": 0.7553634577603143, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7269, + "step": 12015 + }, + { + "epoch": 0.7563064833005894, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7229, + "step": 12030 + }, + { + "epoch": 0.7572495088408644, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7332, + "step": 12045 + }, + { + "epoch": 0.7581925343811395, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7425, + "step": 12060 + }, + { + "epoch": 0.7591355599214146, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7084, + "step": 12075 + }, + { + "epoch": 0.7600785854616896, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 0.7212, + "step": 12090 + }, + { + "epoch": 0.7610216110019646, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7221, + "step": 12105 + }, + { + "epoch": 0.7619646365422397, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7125, + "step": 12120 + }, + { + "epoch": 0.7629076620825147, + "grad_norm": 0.63671875, + "learning_rate": 0.001, + "loss": 0.7214, + "step": 12135 + }, + { + "epoch": 0.7638506876227897, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7211, + "step": 12150 + }, + { + "epoch": 0.7647937131630649, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7126, + "step": 12165 + }, + { + "epoch": 0.7657367387033399, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.726, + "step": 12180 + }, + { + "epoch": 0.766679764243615, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7079, + "step": 12195 + }, + { + "epoch": 0.76762278978389, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7282, + "step": 12210 + }, + { + "epoch": 0.768565815324165, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 0.7293, + "step": 12225 + }, + { + "epoch": 0.76950884086444, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7245, + "step": 12240 + }, + { + "epoch": 0.7704518664047151, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7263, + "step": 12255 + }, + { + "epoch": 0.7713948919449902, + "grad_norm": 0.76953125, + "learning_rate": 0.001, + "loss": 0.7483, + "step": 12270 + }, + { + "epoch": 0.7723379174852653, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7243, + "step": 12285 + }, + { + "epoch": 0.7732809430255403, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 0.72, + "step": 12300 + }, + { + "epoch": 0.7742239685658153, + "grad_norm": 0.7421875, + "learning_rate": 0.001, + "loss": 0.7145, + "step": 12315 + }, + { + "epoch": 0.7751669941060904, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7264, + "step": 12330 + }, + { + "epoch": 0.7761100196463654, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7233, + "step": 12345 + }, + { + "epoch": 0.7770530451866404, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7132, + "step": 12360 + }, + { + "epoch": 0.7779960707269156, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7218, + "step": 12375 + }, + { + "epoch": 0.7789390962671906, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.7229, + "step": 12390 + }, + { + "epoch": 0.7798821218074656, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.7244, + "step": 12405 + }, + { + "epoch": 0.7808251473477407, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7133, + "step": 12420 + }, + { + "epoch": 0.7817681728880157, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7165, + "step": 12435 + }, + { + "epoch": 0.7827111984282907, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.7125, + "step": 12450 + }, + { + "epoch": 0.7836542239685658, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.7025, + "step": 12465 + }, + { + "epoch": 0.7845972495088409, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7143, + "step": 12480 + }, + { + "epoch": 0.7855402750491159, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7217, + "step": 12495 + }, + { + "epoch": 0.786483300589391, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 0.7194, + "step": 12510 + }, + { + "epoch": 0.787426326129666, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7117, + "step": 12525 + }, + { + "epoch": 0.788369351669941, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7125, + "step": 12540 + }, + { + "epoch": 0.7893123772102161, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 0.7107, + "step": 12555 + }, + { + "epoch": 0.7902554027504911, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7392, + "step": 12570 + }, + { + "epoch": 0.7911984282907663, + "grad_norm": 0.67578125, + "learning_rate": 0.001, + "loss": 0.7211, + "step": 12585 + }, + { + "epoch": 0.7921414538310413, + "grad_norm": 0.9375, + "learning_rate": 0.001, + "loss": 0.7139, + "step": 12600 + }, + { + "epoch": 0.7930844793713163, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.721, + "step": 12615 + }, + { + "epoch": 0.7940275049115914, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 0.7258, + "step": 12630 + }, + { + "epoch": 0.7949705304518664, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7079, + "step": 12645 + }, + { + "epoch": 0.7959135559921414, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.712, + "step": 12660 + }, + { + "epoch": 0.7968565815324165, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7296, + "step": 12675 + }, + { + "epoch": 0.7977996070726916, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.7146, + "step": 12690 + }, + { + "epoch": 0.7987426326129666, + "grad_norm": 0.67578125, + "learning_rate": 0.001, + "loss": 0.7202, + "step": 12705 + }, + { + "epoch": 0.7996856581532417, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7257, + "step": 12720 + }, + { + "epoch": 0.7996856581532417, + "eval_loss": 0.8420960307121277, + "eval_runtime": 9.6794, + "eval_samples_per_second": 103.312, + "eval_steps_per_second": 1.446, + "step": 12720 + }, + { + "epoch": 0.8006286836935167, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7087, + "step": 12735 + }, + { + "epoch": 0.8015717092337917, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7219, + "step": 12750 + }, + { + "epoch": 0.8025147347740668, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 0.7241, + "step": 12765 + }, + { + "epoch": 0.8034577603143418, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.7211, + "step": 12780 + }, + { + "epoch": 0.8044007858546169, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7234, + "step": 12795 + }, + { + "epoch": 0.805343811394892, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7214, + "step": 12810 + }, + { + "epoch": 0.806286836935167, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7201, + "step": 12825 + }, + { + "epoch": 0.807229862475442, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7261, + "step": 12840 + }, + { + "epoch": 0.8081728880157171, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7301, + "step": 12855 + }, + { + "epoch": 0.8091159135559921, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7104, + "step": 12870 + }, + { + "epoch": 0.8100589390962671, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7094, + "step": 12885 + }, + { + "epoch": 0.8110019646365423, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7188, + "step": 12900 + }, + { + "epoch": 0.8119449901768173, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7296, + "step": 12915 + }, + { + "epoch": 0.8128880157170923, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.725, + "step": 12930 + }, + { + "epoch": 0.8138310412573674, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.716, + "step": 12945 + }, + { + "epoch": 0.8147740667976424, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7199, + "step": 12960 + }, + { + "epoch": 0.8157170923379174, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7251, + "step": 12975 + }, + { + "epoch": 0.8166601178781925, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.7153, + "step": 12990 + }, + { + "epoch": 0.8176031434184676, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.7172, + "step": 13005 + }, + { + "epoch": 0.8185461689587427, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7241, + "step": 13020 + }, + { + "epoch": 0.8194891944990177, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 0.7087, + "step": 13035 + }, + { + "epoch": 0.8204322200392927, + "grad_norm": 0.4375, + "learning_rate": 0.001, + "loss": 0.7146, + "step": 13050 + }, + { + "epoch": 0.8213752455795678, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 0.7137, + "step": 13065 + }, + { + "epoch": 0.8223182711198428, + "grad_norm": 0.61328125, + "learning_rate": 0.001, + "loss": 0.7309, + "step": 13080 + }, + { + "epoch": 0.8232612966601178, + "grad_norm": 0.74609375, + "learning_rate": 0.001, + "loss": 0.7075, + "step": 13095 + }, + { + "epoch": 0.824204322200393, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7187, + "step": 13110 + }, + { + "epoch": 0.825147347740668, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7133, + "step": 13125 + }, + { + "epoch": 0.826090373280943, + "grad_norm": 0.65234375, + "learning_rate": 0.001, + "loss": 0.7062, + "step": 13140 + }, + { + "epoch": 0.8270333988212181, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7139, + "step": 13155 + }, + { + "epoch": 0.8279764243614931, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 0.7122, + "step": 13170 + }, + { + "epoch": 0.8289194499017681, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7089, + "step": 13185 + }, + { + "epoch": 0.8298624754420432, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7148, + "step": 13200 + }, + { + "epoch": 0.8308055009823183, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 0.7165, + "step": 13215 + }, + { + "epoch": 0.8317485265225933, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.716, + "step": 13230 + }, + { + "epoch": 0.8326915520628684, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7324, + "step": 13245 + }, + { + "epoch": 0.8336345776031434, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7028, + "step": 13260 + }, + { + "epoch": 0.8345776031434184, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7162, + "step": 13275 + }, + { + "epoch": 0.8355206286836935, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7115, + "step": 13290 + }, + { + "epoch": 0.8364636542239685, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7323, + "step": 13305 + }, + { + "epoch": 0.8374066797642437, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7154, + "step": 13320 + }, + { + "epoch": 0.8383497053045187, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7231, + "step": 13335 + }, + { + "epoch": 0.8392927308447937, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7308, + "step": 13350 + }, + { + "epoch": 0.8402357563850688, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7169, + "step": 13365 + }, + { + "epoch": 0.8411787819253438, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7209, + "step": 13380 + }, + { + "epoch": 0.8421218074656188, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.706, + "step": 13395 + }, + { + "epoch": 0.8430648330058939, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7079, + "step": 13410 + }, + { + "epoch": 0.844007858546169, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7062, + "step": 13425 + }, + { + "epoch": 0.844950884086444, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7047, + "step": 13440 + }, + { + "epoch": 0.8458939096267191, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7179, + "step": 13455 + }, + { + "epoch": 0.8468369351669941, + "grad_norm": 0.72265625, + "learning_rate": 0.001, + "loss": 0.7159, + "step": 13470 + }, + { + "epoch": 0.8477799607072691, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7152, + "step": 13485 + }, + { + "epoch": 0.8487229862475442, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.709, + "step": 13500 + }, + { + "epoch": 0.8496660117878193, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 0.7158, + "step": 13515 + }, + { + "epoch": 0.8506090373280943, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7026, + "step": 13530 + }, + { + "epoch": 0.8515520628683694, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7197, + "step": 13545 + }, + { + "epoch": 0.8524950884086444, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7271, + "step": 13560 + }, + { + "epoch": 0.8534381139489194, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7241, + "step": 13575 + }, + { + "epoch": 0.8543811394891945, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7244, + "step": 13590 + }, + { + "epoch": 0.8553241650294695, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.7154, + "step": 13605 + }, + { + "epoch": 0.8562671905697447, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.7135, + "step": 13620 + }, + { + "epoch": 0.8572102161100197, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7095, + "step": 13635 + }, + { + "epoch": 0.8581532416502947, + "grad_norm": 0.87109375, + "learning_rate": 0.001, + "loss": 0.7245, + "step": 13650 + }, + { + "epoch": 0.8590962671905698, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.7174, + "step": 13665 + }, + { + "epoch": 0.8600392927308448, + "grad_norm": 0.6875, + "learning_rate": 0.001, + "loss": 0.7131, + "step": 13680 + }, + { + "epoch": 0.8609823182711198, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7227, + "step": 13695 + }, + { + "epoch": 0.8619253438113949, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7067, + "step": 13710 + }, + { + "epoch": 0.86286836935167, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.7013, + "step": 13725 + }, + { + "epoch": 0.863811394891945, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7046, + "step": 13740 + }, + { + "epoch": 0.8647544204322201, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7077, + "step": 13755 + }, + { + "epoch": 0.8656974459724951, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7064, + "step": 13770 + }, + { + "epoch": 0.8666404715127701, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.7177, + "step": 13785 + }, + { + "epoch": 0.8675834970530452, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7128, + "step": 13800 + }, + { + "epoch": 0.8685265225933202, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.7131, + "step": 13815 + }, + { + "epoch": 0.8694695481335953, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.7048, + "step": 13830 + }, + { + "epoch": 0.8704125736738704, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7183, + "step": 13845 + }, + { + "epoch": 0.8713555992141454, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7087, + "step": 13860 + }, + { + "epoch": 0.8722986247544204, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.7117, + "step": 13875 + }, + { + "epoch": 0.8732416502946955, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 0.7216, + "step": 13890 + }, + { + "epoch": 0.8741846758349705, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.7159, + "step": 13905 + }, + { + "epoch": 0.8751277013752455, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 0.7096, + "step": 13920 + }, + { + "epoch": 0.8760707269155207, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.702, + "step": 13935 + }, + { + "epoch": 0.8770137524557957, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7101, + "step": 13950 + }, + { + "epoch": 0.8779567779960707, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7212, + "step": 13965 + }, + { + "epoch": 0.8788998035363458, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.7126, + "step": 13980 + }, + { + "epoch": 0.8798428290766208, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 0.7036, + "step": 13995 + }, + { + "epoch": 0.8807858546168958, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7071, + "step": 14010 + }, + { + "epoch": 0.8817288801571709, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7051, + "step": 14025 + }, + { + "epoch": 0.882671905697446, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.7156, + "step": 14040 + }, + { + "epoch": 0.8836149312377211, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.709, + "step": 14055 + }, + { + "epoch": 0.8845579567779961, + "grad_norm": 0.6875, + "learning_rate": 0.001, + "loss": 0.7062, + "step": 14070 + }, + { + "epoch": 0.8855009823182711, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 0.7142, + "step": 14085 + }, + { + "epoch": 0.8864440078585462, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7143, + "step": 14100 + }, + { + "epoch": 0.8873870333988212, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7093, + "step": 14115 + }, + { + "epoch": 0.8883300589390962, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.712, + "step": 14130 + }, + { + "epoch": 0.8892730844793714, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 0.7085, + "step": 14145 + }, + { + "epoch": 0.8902161100196464, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7197, + "step": 14160 + }, + { + "epoch": 0.8911591355599214, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7022, + "step": 14175 + }, + { + "epoch": 0.8921021611001965, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7019, + "step": 14190 + }, + { + "epoch": 0.8930451866404715, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7171, + "step": 14205 + }, + { + "epoch": 0.8939882121807465, + "grad_norm": 0.7890625, + "learning_rate": 0.001, + "loss": 0.7052, + "step": 14220 + }, + { + "epoch": 0.8949312377210216, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.7029, + "step": 14235 + }, + { + "epoch": 0.8958742632612967, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7067, + "step": 14250 + }, + { + "epoch": 0.8968172888015717, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.6962, + "step": 14265 + }, + { + "epoch": 0.8977603143418468, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.702, + "step": 14280 + }, + { + "epoch": 0.8987033398821218, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7066, + "step": 14295 + }, + { + "epoch": 0.8996463654223968, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7141, + "step": 14310 + }, + { + "epoch": 0.8996463654223968, + "eval_loss": 0.8242524266242981, + "eval_runtime": 9.6736, + "eval_samples_per_second": 103.374, + "eval_steps_per_second": 1.447, + "step": 14310 + }, + { + "epoch": 0.9005893909626719, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7051, + "step": 14325 + }, + { + "epoch": 0.9015324165029469, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 0.7161, + "step": 14340 + }, + { + "epoch": 0.902475442043222, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 0.6994, + "step": 14355 + }, + { + "epoch": 0.9034184675834971, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 0.7121, + "step": 14370 + }, + { + "epoch": 0.9043614931237721, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.7232, + "step": 14385 + }, + { + "epoch": 0.9053045186640472, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.7122, + "step": 14400 + }, + { + "epoch": 0.9062475442043222, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 0.7168, + "step": 14415 + }, + { + "epoch": 0.9071905697445972, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 0.6997, + "step": 14430 + }, + { + "epoch": 0.9081335952848723, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 0.7124, + "step": 14445 + }, + { + "epoch": 0.9090766208251474, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 0.6995, + "step": 14460 + }, + { + "epoch": 0.9100196463654224, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7087, + "step": 14475 + }, + { + "epoch": 0.9109626719056975, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.6991, + "step": 14490 + }, + { + "epoch": 0.9119056974459725, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.7069, + "step": 14505 + }, + { + "epoch": 0.9128487229862475, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 0.701, + "step": 14520 + }, + { + "epoch": 0.9137917485265226, + "grad_norm": 0.734375, + "learning_rate": 0.001, + "loss": 0.7111, + "step": 14535 + }, + { + "epoch": 0.9147347740667976, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.6989, + "step": 14550 + }, + { + "epoch": 0.9156777996070727, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.7243, + "step": 14565 + }, + { + "epoch": 0.9166208251473478, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7043, + "step": 14580 + }, + { + "epoch": 0.9175638506876228, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.6925, + "step": 14595 + }, + { + "epoch": 0.9185068762278978, + "grad_norm": 0.7890625, + "learning_rate": 0.001, + "loss": 0.7129, + "step": 14610 + }, + { + "epoch": 0.9194499017681729, + "grad_norm": 0.65625, + "learning_rate": 0.001, + "loss": 0.7064, + "step": 14625 + }, + { + "epoch": 0.9203929273084479, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 0.6876, + "step": 14640 + }, + { + "epoch": 0.9213359528487229, + "grad_norm": 0.6328125, + "learning_rate": 0.001, + "loss": 0.6978, + "step": 14655 + }, + { + "epoch": 0.9222789783889981, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7003, + "step": 14670 + }, + { + "epoch": 0.9232220039292731, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 0.7009, + "step": 14685 + }, + { + "epoch": 0.9241650294695481, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.7093, + "step": 14700 + }, + { + "epoch": 0.9251080550098232, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.6927, + "step": 14715 + }, + { + "epoch": 0.9260510805500982, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.6995, + "step": 14730 + }, + { + "epoch": 0.9269941060903732, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.711, + "step": 14745 + }, + { + "epoch": 0.9279371316306483, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.7156, + "step": 14760 + }, + { + "epoch": 0.9288801571709234, + "grad_norm": 0.72265625, + "learning_rate": 0.001, + "loss": 0.7173, + "step": 14775 + }, + { + "epoch": 0.9298231827111985, + "grad_norm": 0.7578125, + "learning_rate": 0.001, + "loss": 0.7132, + "step": 14790 + }, + { + "epoch": 0.9307662082514735, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.6983, + "step": 14805 + }, + { + "epoch": 0.9317092337917485, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7047, + "step": 14820 + }, + { + "epoch": 0.9326522593320236, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.7115, + "step": 14835 + }, + { + "epoch": 0.9335952848722986, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 0.7038, + "step": 14850 + }, + { + "epoch": 0.9345383104125736, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7066, + "step": 14865 + }, + { + "epoch": 0.9354813359528488, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 0.7062, + "step": 14880 + }, + { + "epoch": 0.9364243614931238, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 0.6915, + "step": 14895 + }, + { + "epoch": 0.9373673870333988, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 0.7031, + "step": 14910 + }, + { + "epoch": 0.9383104125736739, + "grad_norm": 0.6875, + "learning_rate": 0.001, + "loss": 0.7072, + "step": 14925 + }, + { + "epoch": 0.9392534381139489, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.7012, + "step": 14940 + }, + { + "epoch": 0.9401964636542239, + "grad_norm": 0.70703125, + "learning_rate": 0.001, + "loss": 0.7211, + "step": 14955 + }, + { + "epoch": 0.941139489194499, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 0.7048, + "step": 14970 + }, + { + "epoch": 0.9420825147347741, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7016, + "step": 14985 + }, + { + "epoch": 0.9430255402750491, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 0.7095, + "step": 15000 + }, + { + "epoch": 0.9439685658153242, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 0.705, + "step": 15015 + }, + { + "epoch": 0.9449115913555992, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.6986, + "step": 15030 + }, + { + "epoch": 0.9458546168958742, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 0.7026, + "step": 15045 + }, + { + "epoch": 0.9467976424361493, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 0.709, + "step": 15060 + }, + { + "epoch": 0.9477406679764243, + "grad_norm": 0.59765625, + "learning_rate": 0.001, + "loss": 0.712, + "step": 15075 + }, + { + "epoch": 0.9486836935166995, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 0.7126, + "step": 15090 + }, + { + "epoch": 0.9496267190569745, + "grad_norm": 0.75390625, + "learning_rate": 0.001, + "loss": 0.6879, + "step": 15105 + }, + { + "epoch": 0.9505697445972495, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7031, + "step": 15120 + }, + { + "epoch": 0.9515127701375246, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.7146, + "step": 15135 + }, + { + "epoch": 0.9524557956777996, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 0.6882, + "step": 15150 + }, + { + "epoch": 0.9533988212180746, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 0.6981, + "step": 15165 + }, + { + "epoch": 0.9543418467583497, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7011, + "step": 15180 + }, + { + "epoch": 0.9552848722986248, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.698, + "step": 15195 + }, + { + "epoch": 0.9562278978388998, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.6932, + "step": 15210 + }, + { + "epoch": 0.9571709233791749, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.69, + "step": 15225 + }, + { + "epoch": 0.9581139489194499, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 0.695, + "step": 15240 + }, + { + "epoch": 0.9590569744597249, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 0.7002, + "step": 15255 + }, + { + "epoch": 0.96, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 0.6943, + "step": 15270 + }, + { + "epoch": 0.960943025540275, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 0.7044, + "step": 15285 + }, + { + "epoch": 0.9618860510805501, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.7069, + "step": 15300 + }, + { + "epoch": 0.9628290766208252, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.6985, + "step": 15315 + }, + { + "epoch": 0.9637721021611002, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 0.7049, + "step": 15330 + }, + { + "epoch": 0.9647151277013752, + "grad_norm": 0.62890625, + "learning_rate": 0.001, + "loss": 0.7035, + "step": 15345 + }, + { + "epoch": 0.9656581532416503, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 0.7016, + "step": 15360 + }, + { + "epoch": 0.9666011787819253, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.6954, + "step": 15375 + }, + { + "epoch": 0.9675442043222003, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 0.7014, + "step": 15390 + }, + { + "epoch": 0.9684872298624755, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.7129, + "step": 15405 + }, + { + "epoch": 0.9694302554027505, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 0.6999, + "step": 15420 + }, + { + "epoch": 0.9703732809430256, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7017, + "step": 15435 + }, + { + "epoch": 0.9713163064833006, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 0.6893, + "step": 15450 + }, + { + "epoch": 0.9722593320235756, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 0.6993, + "step": 15465 + }, + { + "epoch": 0.9732023575638507, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 0.6999, + "step": 15480 + }, + { + "epoch": 0.9741453831041257, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.6864, + "step": 15495 + }, + { + "epoch": 0.9750884086444008, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 0.7057, + "step": 15510 + }, + { + "epoch": 0.9760314341846759, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.6957, + "step": 15525 + }, + { + "epoch": 0.9769744597249509, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 0.709, + "step": 15540 + }, + { + "epoch": 0.9779174852652259, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 0.6965, + "step": 15555 + }, + { + "epoch": 0.978860510805501, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.6989, + "step": 15570 + }, + { + "epoch": 0.979803536345776, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.6995, + "step": 15585 + }, + { + "epoch": 0.980746561886051, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.6894, + "step": 15600 + }, + { + "epoch": 0.9816895874263262, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 0.7084, + "step": 15615 + }, + { + "epoch": 0.9826326129666012, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 0.7021, + "step": 15630 + }, + { + "epoch": 0.9835756385068762, + "grad_norm": 0.87109375, + "learning_rate": 0.001, + "loss": 0.6892, + "step": 15645 + }, + { + "epoch": 0.9845186640471513, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 0.7147, + "step": 15660 + }, + { + "epoch": 0.9854616895874263, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 0.7007, + "step": 15675 + }, + { + "epoch": 0.9864047151277013, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.699, + "step": 15690 + }, + { + "epoch": 0.9873477406679764, + "grad_norm": 0.875, + "learning_rate": 0.001, + "loss": 0.6943, + "step": 15705 + }, + { + "epoch": 0.9882907662082515, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 0.6943, + "step": 15720 + }, + { + "epoch": 0.9892337917485265, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 0.703, + "step": 15735 + }, + { + "epoch": 0.9901768172888016, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.6953, + "step": 15750 + }, + { + "epoch": 0.9911198428290766, + "grad_norm": 0.71875, + "learning_rate": 0.001, + "loss": 0.6884, + "step": 15765 + }, + { + "epoch": 0.9920628683693516, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 0.6972, + "step": 15780 + }, + { + "epoch": 0.9930058939096267, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 0.6929, + "step": 15795 + }, + { + "epoch": 0.9939489194499017, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 0.6849, + "step": 15810 + }, + { + "epoch": 0.9948919449901769, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 0.6932, + "step": 15825 + }, + { + "epoch": 0.9958349705304519, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 0.7042, + "step": 15840 + }, + { + "epoch": 0.9967779960707269, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 0.6924, + "step": 15855 + }, + { + "epoch": 0.997721021611002, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 0.7009, + "step": 15870 + }, + { + "epoch": 0.998664047151277, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 0.7059, + "step": 15885 + }, + { + "epoch": 0.999607072691552, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 0.691, + "step": 15900 + }, + { + "epoch": 0.999607072691552, + "eval_loss": 0.8118711709976196, + "eval_runtime": 9.6839, + "eval_samples_per_second": 103.264, + "eval_steps_per_second": 1.446, + "step": 15900 + } + ], + "logging_steps": 15, + "max_steps": 15906, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1590, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.185992916964999e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}