{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 10780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.361328125, "learning_rate": 3.710575139146568e-07, "loss": 2.6427, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.34765625, "learning_rate": 1.855287569573284e-06, "loss": 2.6431, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.380859375, "learning_rate": 3.710575139146568e-06, "loss": 2.6904, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.369140625, "learning_rate": 5.565862708719852e-06, "loss": 2.643, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.359375, "learning_rate": 7.421150278293136e-06, "loss": 2.6806, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.337890625, "learning_rate": 9.276437847866419e-06, "loss": 2.6886, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.40234375, "learning_rate": 1.1131725417439704e-05, "loss": 2.6287, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.36328125, "learning_rate": 1.2987012987012986e-05, "loss": 2.5784, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.376953125, "learning_rate": 1.4842300556586271e-05, "loss": 2.5996, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.3046875, "learning_rate": 1.6697588126159555e-05, "loss": 2.5966, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.31640625, "learning_rate": 1.8552875695732837e-05, "loss": 2.5816, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.314453125, "learning_rate": 2.0408163265306123e-05, "loss": 2.5522, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.3046875, "learning_rate": 2.2263450834879408e-05, "loss": 2.5633, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.328125, "learning_rate": 2.4118738404452693e-05, "loss": 2.5333, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.26171875, "learning_rate": 2.5974025974025972e-05, "loss": 2.4653, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.2431640625, "learning_rate": 2.782931354359926e-05, "loss": 2.4254, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.2333984375, "learning_rate": 2.9684601113172543e-05, "loss": 2.4339, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.248046875, "learning_rate": 3.1539888682745825e-05, "loss": 2.4517, "step": 85 }, { "epoch": 0.02, "grad_norm": 0.2138671875, "learning_rate": 3.339517625231911e-05, "loss": 2.4258, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.220703125, "learning_rate": 3.5250463821892396e-05, "loss": 2.4284, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.23046875, "learning_rate": 3.7105751391465674e-05, "loss": 2.3756, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.21484375, "learning_rate": 3.8961038961038966e-05, "loss": 2.3719, "step": 105 }, { "epoch": 0.02, "grad_norm": 0.1845703125, "learning_rate": 4.0816326530612245e-05, "loss": 2.3452, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.203125, "learning_rate": 4.267161410018553e-05, "loss": 2.389, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.2080078125, "learning_rate": 4.4526901669758816e-05, "loss": 2.3233, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.201171875, "learning_rate": 4.6382189239332094e-05, "loss": 2.3368, "step": 125 }, { "epoch": 0.02, "grad_norm": 0.1826171875, "learning_rate": 4.823747680890539e-05, "loss": 2.2901, "step": 130 }, { "epoch": 0.03, "grad_norm": 0.212890625, "learning_rate": 5.0092764378478665e-05, "loss": 2.3386, "step": 135 }, { "epoch": 0.03, "grad_norm": 0.1943359375, "learning_rate": 5.1948051948051944e-05, "loss": 2.3245, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.1943359375, "learning_rate": 5.380333951762524e-05, "loss": 2.3099, "step": 145 }, { "epoch": 0.03, "grad_norm": 0.1787109375, "learning_rate": 5.565862708719852e-05, "loss": 2.2899, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.1796875, "learning_rate": 5.751391465677181e-05, "loss": 2.2857, "step": 155 }, { "epoch": 0.03, "grad_norm": 0.185546875, "learning_rate": 5.9369202226345086e-05, "loss": 2.2968, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.197265625, "learning_rate": 6.122448979591838e-05, "loss": 2.2657, "step": 165 }, { "epoch": 0.03, "grad_norm": 0.1884765625, "learning_rate": 6.307977736549165e-05, "loss": 2.3, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.1875, "learning_rate": 6.493506493506494e-05, "loss": 2.2807, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.208984375, "learning_rate": 6.679035250463822e-05, "loss": 2.257, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.1962890625, "learning_rate": 6.86456400742115e-05, "loss": 2.2708, "step": 185 }, { "epoch": 0.04, "grad_norm": 0.1787109375, "learning_rate": 7.050092764378479e-05, "loss": 2.273, "step": 190 }, { "epoch": 0.04, "grad_norm": 0.1875, "learning_rate": 7.235621521335806e-05, "loss": 2.2603, "step": 195 }, { "epoch": 0.04, "grad_norm": 0.1904296875, "learning_rate": 7.421150278293135e-05, "loss": 2.2514, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.2119140625, "learning_rate": 7.606679035250465e-05, "loss": 2.2908, "step": 205 }, { "epoch": 0.04, "grad_norm": 0.1767578125, "learning_rate": 7.792207792207793e-05, "loss": 2.2708, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.1982421875, "learning_rate": 7.977736549165122e-05, "loss": 2.2279, "step": 215 }, { "epoch": 0.04, "grad_norm": 0.19921875, "learning_rate": 8.163265306122449e-05, "loss": 2.2627, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.2099609375, "learning_rate": 8.348794063079778e-05, "loss": 2.2642, "step": 225 }, { "epoch": 0.04, "grad_norm": 0.2041015625, "learning_rate": 8.534322820037106e-05, "loss": 2.2373, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.2001953125, "learning_rate": 8.719851576994435e-05, "loss": 2.2572, "step": 235 }, { "epoch": 0.04, "grad_norm": 0.1953125, "learning_rate": 8.905380333951763e-05, "loss": 2.2105, "step": 240 }, { "epoch": 0.05, "grad_norm": 0.1953125, "learning_rate": 9.090909090909092e-05, "loss": 2.2398, "step": 245 }, { "epoch": 0.05, "grad_norm": 0.203125, "learning_rate": 9.276437847866419e-05, "loss": 2.2448, "step": 250 }, { "epoch": 0.05, "grad_norm": 0.203125, "learning_rate": 9.461966604823747e-05, "loss": 2.2113, "step": 255 }, { "epoch": 0.05, "grad_norm": 0.2021484375, "learning_rate": 9.647495361781077e-05, "loss": 2.2631, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.19921875, "learning_rate": 9.833024118738406e-05, "loss": 2.2716, "step": 265 }, { "epoch": 0.05, "grad_norm": 0.201171875, "learning_rate": 0.00010018552875695733, "loss": 2.2368, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.1982421875, "learning_rate": 0.00010204081632653062, "loss": 2.2297, "step": 275 }, { "epoch": 0.05, "grad_norm": 0.201171875, "learning_rate": 0.00010389610389610389, "loss": 2.2706, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.205078125, "learning_rate": 0.00010575139146567719, "loss": 2.254, "step": 285 }, { "epoch": 0.05, "grad_norm": 0.1962890625, "learning_rate": 0.00010760667903525049, "loss": 2.2576, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.2060546875, "learning_rate": 0.00010946196660482376, "loss": 2.2456, "step": 295 }, { "epoch": 0.06, "grad_norm": 0.19921875, "learning_rate": 0.00011131725417439704, "loss": 2.2665, "step": 300 }, { "epoch": 0.06, "grad_norm": 0.2080078125, "learning_rate": 0.00011317254174397031, "loss": 2.2529, "step": 305 }, { "epoch": 0.06, "grad_norm": 0.2119140625, "learning_rate": 0.00011502782931354361, "loss": 2.2427, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.201171875, "learning_rate": 0.00011688311688311689, "loss": 2.2489, "step": 315 }, { "epoch": 0.06, "grad_norm": 0.203125, "learning_rate": 0.00011873840445269017, "loss": 2.2153, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.193359375, "learning_rate": 0.00012059369202226344, "loss": 2.2204, "step": 325 }, { "epoch": 0.06, "grad_norm": 0.1962890625, "learning_rate": 0.00012244897959183676, "loss": 2.2397, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.19921875, "learning_rate": 0.00012430426716141001, "loss": 2.2586, "step": 335 }, { "epoch": 0.06, "grad_norm": 0.1943359375, "learning_rate": 0.0001261595547309833, "loss": 2.2376, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.197265625, "learning_rate": 0.0001280148423005566, "loss": 2.1859, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.216796875, "learning_rate": 0.00012987012987012987, "loss": 2.2297, "step": 350 }, { "epoch": 0.07, "grad_norm": 0.203125, "learning_rate": 0.00013172541743970318, "loss": 2.2543, "step": 355 }, { "epoch": 0.07, "grad_norm": 0.2041015625, "learning_rate": 0.00013358070500927644, "loss": 2.2614, "step": 360 }, { "epoch": 0.07, "grad_norm": 0.1953125, "learning_rate": 0.00013543599257884973, "loss": 2.2152, "step": 365 }, { "epoch": 0.07, "grad_norm": 0.2080078125, "learning_rate": 0.000137291280148423, "loss": 2.2712, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.185546875, "learning_rate": 0.0001391465677179963, "loss": 2.2363, "step": 375 }, { "epoch": 0.07, "grad_norm": 0.205078125, "learning_rate": 0.00014100185528756958, "loss": 2.2156, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.1953125, "learning_rate": 0.00014285714285714287, "loss": 2.2721, "step": 385 }, { "epoch": 0.07, "grad_norm": 0.1923828125, "learning_rate": 0.00014471243042671613, "loss": 2.2148, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.1943359375, "learning_rate": 0.00014656771799628944, "loss": 2.1739, "step": 395 }, { "epoch": 0.07, "grad_norm": 0.2021484375, "learning_rate": 0.0001484230055658627, "loss": 2.2193, "step": 400 }, { "epoch": 0.08, "grad_norm": 0.1923828125, "learning_rate": 0.000150278293135436, "loss": 2.1901, "step": 405 }, { "epoch": 0.08, "grad_norm": 0.1982421875, "learning_rate": 0.0001521335807050093, "loss": 2.2271, "step": 410 }, { "epoch": 0.08, "grad_norm": 0.197265625, "learning_rate": 0.00015398886827458255, "loss": 2.2502, "step": 415 }, { "epoch": 0.08, "grad_norm": 0.19921875, "learning_rate": 0.00015584415584415587, "loss": 2.2148, "step": 420 }, { "epoch": 0.08, "grad_norm": 0.197265625, "learning_rate": 0.00015769944341372912, "loss": 2.1976, "step": 425 }, { "epoch": 0.08, "grad_norm": 0.1943359375, "learning_rate": 0.00015955473098330244, "loss": 2.1779, "step": 430 }, { "epoch": 0.08, "grad_norm": 0.1962890625, "learning_rate": 0.0001614100185528757, "loss": 2.2163, "step": 435 }, { "epoch": 0.08, "grad_norm": 0.1962890625, "learning_rate": 0.00016326530612244898, "loss": 2.2278, "step": 440 }, { "epoch": 0.08, "grad_norm": 0.1904296875, "learning_rate": 0.00016512059369202227, "loss": 2.2018, "step": 445 }, { "epoch": 0.08, "grad_norm": 0.1884765625, "learning_rate": 0.00016697588126159555, "loss": 2.2086, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.185546875, "learning_rate": 0.00016883116883116884, "loss": 2.2486, "step": 455 }, { "epoch": 0.09, "grad_norm": 0.1923828125, "learning_rate": 0.00017068645640074212, "loss": 2.2295, "step": 460 }, { "epoch": 0.09, "grad_norm": 0.1962890625, "learning_rate": 0.0001725417439703154, "loss": 2.2318, "step": 465 }, { "epoch": 0.09, "grad_norm": 0.1865234375, "learning_rate": 0.0001743970315398887, "loss": 2.2289, "step": 470 }, { "epoch": 0.09, "grad_norm": 0.1953125, "learning_rate": 0.00017625231910946198, "loss": 2.2378, "step": 475 }, { "epoch": 0.09, "grad_norm": 0.18359375, "learning_rate": 0.00017810760667903526, "loss": 2.1831, "step": 480 }, { "epoch": 0.09, "grad_norm": 0.2041015625, "learning_rate": 0.00017996289424860855, "loss": 2.2105, "step": 485 }, { "epoch": 0.09, "grad_norm": 0.1845703125, "learning_rate": 0.00018181818181818183, "loss": 2.2156, "step": 490 }, { "epoch": 0.09, "grad_norm": 0.1884765625, "learning_rate": 0.00018367346938775512, "loss": 2.2045, "step": 495 }, { "epoch": 0.09, "grad_norm": 0.197265625, "learning_rate": 0.00018552875695732838, "loss": 2.2298, "step": 500 }, { "epoch": 0.09, "grad_norm": 0.1884765625, "learning_rate": 0.0001873840445269017, "loss": 2.2342, "step": 505 }, { "epoch": 0.09, "grad_norm": 0.1953125, "learning_rate": 0.00018923933209647495, "loss": 2.2121, "step": 510 }, { "epoch": 0.1, "grad_norm": 0.1884765625, "learning_rate": 0.00019109461966604823, "loss": 2.2169, "step": 515 }, { "epoch": 0.1, "grad_norm": 0.1796875, "learning_rate": 0.00019294990723562155, "loss": 2.2174, "step": 520 }, { "epoch": 0.1, "grad_norm": 0.1826171875, "learning_rate": 0.0001948051948051948, "loss": 2.2328, "step": 525 }, { "epoch": 0.1, "grad_norm": 0.18359375, "learning_rate": 0.00019666048237476812, "loss": 2.2081, "step": 530 }, { "epoch": 0.1, "grad_norm": 0.189453125, "learning_rate": 0.00019851576994434138, "loss": 2.2411, "step": 535 }, { "epoch": 0.1, "grad_norm": 0.173828125, "learning_rate": 0.00020037105751391466, "loss": 2.208, "step": 540 }, { "epoch": 0.1, "grad_norm": 0.1796875, "learning_rate": 0.00020222634508348795, "loss": 2.1872, "step": 545 }, { "epoch": 0.1, "grad_norm": 0.185546875, "learning_rate": 0.00020408163265306123, "loss": 2.1829, "step": 550 }, { "epoch": 0.1, "grad_norm": 0.197265625, "learning_rate": 0.00020593692022263454, "loss": 2.2194, "step": 555 }, { "epoch": 0.1, "grad_norm": 0.185546875, "learning_rate": 0.00020779220779220778, "loss": 2.2283, "step": 560 }, { "epoch": 0.1, "grad_norm": 0.1884765625, "learning_rate": 0.0002096474953617811, "loss": 2.2119, "step": 565 }, { "epoch": 0.11, "grad_norm": 0.1826171875, "learning_rate": 0.00021150278293135437, "loss": 2.2196, "step": 570 }, { "epoch": 0.11, "grad_norm": 0.189453125, "learning_rate": 0.00021335807050092766, "loss": 2.2378, "step": 575 }, { "epoch": 0.11, "grad_norm": 0.1845703125, "learning_rate": 0.00021521335807050097, "loss": 2.2361, "step": 580 }, { "epoch": 0.11, "grad_norm": 0.185546875, "learning_rate": 0.0002170686456400742, "loss": 2.2253, "step": 585 }, { "epoch": 0.11, "grad_norm": 0.173828125, "learning_rate": 0.00021892393320964752, "loss": 2.2111, "step": 590 }, { "epoch": 0.11, "grad_norm": 0.1796875, "learning_rate": 0.0002207792207792208, "loss": 2.2417, "step": 595 }, { "epoch": 0.11, "grad_norm": 0.1806640625, "learning_rate": 0.00022263450834879409, "loss": 2.1814, "step": 600 }, { "epoch": 0.11, "grad_norm": 0.177734375, "learning_rate": 0.00022448979591836734, "loss": 2.248, "step": 605 }, { "epoch": 0.11, "grad_norm": 0.177734375, "learning_rate": 0.00022634508348794063, "loss": 2.191, "step": 610 }, { "epoch": 0.11, "grad_norm": 0.1787109375, "learning_rate": 0.00022820037105751392, "loss": 2.2258, "step": 615 }, { "epoch": 0.12, "grad_norm": 0.181640625, "learning_rate": 0.00023005565862708723, "loss": 2.226, "step": 620 }, { "epoch": 0.12, "grad_norm": 0.189453125, "learning_rate": 0.00023191094619666046, "loss": 2.231, "step": 625 }, { "epoch": 0.12, "grad_norm": 0.1826171875, "learning_rate": 0.00023376623376623377, "loss": 2.2119, "step": 630 }, { "epoch": 0.12, "grad_norm": 0.173828125, "learning_rate": 0.00023562152133580706, "loss": 2.2206, "step": 635 }, { "epoch": 0.12, "grad_norm": 0.177734375, "learning_rate": 0.00023747680890538034, "loss": 2.2083, "step": 640 }, { "epoch": 0.12, "grad_norm": 0.1748046875, "learning_rate": 0.00023933209647495365, "loss": 2.2206, "step": 645 }, { "epoch": 0.12, "grad_norm": 0.1728515625, "learning_rate": 0.00024118738404452689, "loss": 2.1931, "step": 650 }, { "epoch": 0.12, "grad_norm": 0.1767578125, "learning_rate": 0.0002430426716141002, "loss": 2.1753, "step": 655 }, { "epoch": 0.12, "grad_norm": 0.1796875, "learning_rate": 0.0002448979591836735, "loss": 2.1885, "step": 660 }, { "epoch": 0.12, "grad_norm": 0.181640625, "learning_rate": 0.0002467532467532468, "loss": 2.2343, "step": 665 }, { "epoch": 0.12, "grad_norm": 0.1865234375, "learning_rate": 0.00024860853432282003, "loss": 2.2425, "step": 670 }, { "epoch": 0.13, "grad_norm": 0.1796875, "learning_rate": 0.0002504638218923933, "loss": 2.1929, "step": 675 }, { "epoch": 0.13, "grad_norm": 0.1767578125, "learning_rate": 0.0002523191094619666, "loss": 2.2569, "step": 680 }, { "epoch": 0.13, "grad_norm": 0.173828125, "learning_rate": 0.00025417439703153994, "loss": 2.2311, "step": 685 }, { "epoch": 0.13, "grad_norm": 0.181640625, "learning_rate": 0.0002560296846011132, "loss": 2.2218, "step": 690 }, { "epoch": 0.13, "grad_norm": 0.1708984375, "learning_rate": 0.00025788497217068645, "loss": 2.2148, "step": 695 }, { "epoch": 0.13, "grad_norm": 0.1796875, "learning_rate": 0.00025974025974025974, "loss": 2.1973, "step": 700 }, { "epoch": 0.13, "grad_norm": 0.1826171875, "learning_rate": 0.000261595547309833, "loss": 2.197, "step": 705 }, { "epoch": 0.13, "grad_norm": 0.1796875, "learning_rate": 0.00026345083487940637, "loss": 2.2127, "step": 710 }, { "epoch": 0.13, "grad_norm": 0.169921875, "learning_rate": 0.0002653061224489796, "loss": 2.1964, "step": 715 }, { "epoch": 0.13, "grad_norm": 0.171875, "learning_rate": 0.0002671614100185529, "loss": 2.2008, "step": 720 }, { "epoch": 0.13, "grad_norm": 0.185546875, "learning_rate": 0.00026901669758812617, "loss": 2.1945, "step": 725 }, { "epoch": 0.14, "grad_norm": 0.173828125, "learning_rate": 0.00027087198515769945, "loss": 2.1954, "step": 730 }, { "epoch": 0.14, "grad_norm": 0.1748046875, "learning_rate": 0.00027272727272727274, "loss": 2.2016, "step": 735 }, { "epoch": 0.14, "grad_norm": 0.1787109375, "learning_rate": 0.000274582560296846, "loss": 2.257, "step": 740 }, { "epoch": 0.14, "grad_norm": 0.1767578125, "learning_rate": 0.0002764378478664193, "loss": 2.2091, "step": 745 }, { "epoch": 0.14, "grad_norm": 0.173828125, "learning_rate": 0.0002782931354359926, "loss": 2.2242, "step": 750 }, { "epoch": 0.14, "grad_norm": 0.1748046875, "learning_rate": 0.0002801484230055659, "loss": 2.2002, "step": 755 }, { "epoch": 0.14, "grad_norm": 0.1767578125, "learning_rate": 0.00028200371057513916, "loss": 2.2441, "step": 760 }, { "epoch": 0.14, "grad_norm": 0.171875, "learning_rate": 0.00028385899814471245, "loss": 2.1937, "step": 765 }, { "epoch": 0.14, "grad_norm": 0.173828125, "learning_rate": 0.00028571428571428574, "loss": 2.2132, "step": 770 }, { "epoch": 0.14, "grad_norm": 0.1708984375, "learning_rate": 0.000287569573283859, "loss": 2.2231, "step": 775 }, { "epoch": 0.14, "grad_norm": 0.177734375, "learning_rate": 0.00028942486085343225, "loss": 2.2246, "step": 780 }, { "epoch": 0.15, "grad_norm": 0.1787109375, "learning_rate": 0.0002912801484230056, "loss": 2.2031, "step": 785 }, { "epoch": 0.15, "grad_norm": 0.1728515625, "learning_rate": 0.0002931354359925789, "loss": 2.2084, "step": 790 }, { "epoch": 0.15, "grad_norm": 0.171875, "learning_rate": 0.00029499072356215216, "loss": 2.2021, "step": 795 }, { "epoch": 0.15, "grad_norm": 0.1748046875, "learning_rate": 0.0002968460111317254, "loss": 2.1771, "step": 800 }, { "epoch": 0.15, "grad_norm": 0.1796875, "learning_rate": 0.0002987012987012987, "loss": 2.2068, "step": 805 }, { "epoch": 0.15, "grad_norm": 0.1689453125, "learning_rate": 0.000300556586270872, "loss": 2.1828, "step": 810 }, { "epoch": 0.15, "grad_norm": 0.1708984375, "learning_rate": 0.0003024118738404453, "loss": 2.2225, "step": 815 }, { "epoch": 0.15, "grad_norm": 0.1767578125, "learning_rate": 0.0003042671614100186, "loss": 2.2214, "step": 820 }, { "epoch": 0.15, "grad_norm": 0.1689453125, "learning_rate": 0.0003061224489795918, "loss": 2.2285, "step": 825 }, { "epoch": 0.15, "grad_norm": 0.1875, "learning_rate": 0.0003079777365491651, "loss": 2.1673, "step": 830 }, { "epoch": 0.15, "grad_norm": 0.173828125, "learning_rate": 0.00030983302411873845, "loss": 2.2316, "step": 835 }, { "epoch": 0.16, "grad_norm": 0.1787109375, "learning_rate": 0.00031168831168831173, "loss": 2.211, "step": 840 }, { "epoch": 0.16, "grad_norm": 0.1875, "learning_rate": 0.00031354359925788496, "loss": 2.2433, "step": 845 }, { "epoch": 0.16, "grad_norm": 0.1728515625, "learning_rate": 0.00031539888682745825, "loss": 2.2117, "step": 850 }, { "epoch": 0.16, "grad_norm": 0.1748046875, "learning_rate": 0.00031725417439703153, "loss": 2.2104, "step": 855 }, { "epoch": 0.16, "grad_norm": 0.181640625, "learning_rate": 0.0003191094619666049, "loss": 2.2543, "step": 860 }, { "epoch": 0.16, "grad_norm": 0.1728515625, "learning_rate": 0.00032096474953617816, "loss": 2.202, "step": 865 }, { "epoch": 0.16, "grad_norm": 0.1748046875, "learning_rate": 0.0003228200371057514, "loss": 2.2002, "step": 870 }, { "epoch": 0.16, "grad_norm": 0.169921875, "learning_rate": 0.0003246753246753247, "loss": 2.2332, "step": 875 }, { "epoch": 0.16, "grad_norm": 0.1767578125, "learning_rate": 0.00032653061224489796, "loss": 2.1782, "step": 880 }, { "epoch": 0.16, "grad_norm": 0.169921875, "learning_rate": 0.0003283858998144713, "loss": 2.2428, "step": 885 }, { "epoch": 0.17, "grad_norm": 0.171875, "learning_rate": 0.00033024118738404453, "loss": 2.2064, "step": 890 }, { "epoch": 0.17, "grad_norm": 0.1884765625, "learning_rate": 0.0003320964749536178, "loss": 2.2444, "step": 895 }, { "epoch": 0.17, "grad_norm": 0.181640625, "learning_rate": 0.0003339517625231911, "loss": 2.2078, "step": 900 }, { "epoch": 0.17, "grad_norm": 0.169921875, "learning_rate": 0.0003358070500927644, "loss": 2.2068, "step": 905 }, { "epoch": 0.17, "grad_norm": 0.1796875, "learning_rate": 0.00033766233766233767, "loss": 2.1913, "step": 910 }, { "epoch": 0.17, "grad_norm": 0.1767578125, "learning_rate": 0.00033951762523191096, "loss": 2.2534, "step": 915 }, { "epoch": 0.17, "grad_norm": 0.17578125, "learning_rate": 0.00034137291280148424, "loss": 2.1718, "step": 920 }, { "epoch": 0.17, "grad_norm": 0.1796875, "learning_rate": 0.00034322820037105753, "loss": 2.2004, "step": 925 }, { "epoch": 0.17, "grad_norm": 0.16796875, "learning_rate": 0.0003450834879406308, "loss": 2.2119, "step": 930 }, { "epoch": 0.17, "grad_norm": 0.16796875, "learning_rate": 0.0003469387755102041, "loss": 2.2186, "step": 935 }, { "epoch": 0.17, "grad_norm": 0.1787109375, "learning_rate": 0.0003487940630797774, "loss": 2.2306, "step": 940 }, { "epoch": 0.18, "grad_norm": 0.173828125, "learning_rate": 0.00035064935064935067, "loss": 2.234, "step": 945 }, { "epoch": 0.18, "grad_norm": 0.1748046875, "learning_rate": 0.00035250463821892396, "loss": 2.1975, "step": 950 }, { "epoch": 0.18, "grad_norm": 0.177734375, "learning_rate": 0.00035435992578849724, "loss": 2.2149, "step": 955 }, { "epoch": 0.18, "grad_norm": 0.17578125, "learning_rate": 0.0003562152133580705, "loss": 2.1836, "step": 960 }, { "epoch": 0.18, "grad_norm": 0.1728515625, "learning_rate": 0.0003580705009276438, "loss": 2.1872, "step": 965 }, { "epoch": 0.18, "grad_norm": 0.17578125, "learning_rate": 0.0003599257884972171, "loss": 2.1991, "step": 970 }, { "epoch": 0.18, "grad_norm": 0.181640625, "learning_rate": 0.0003617810760667904, "loss": 2.2465, "step": 975 }, { "epoch": 0.18, "grad_norm": 0.166015625, "learning_rate": 0.00036363636363636367, "loss": 2.2186, "step": 980 }, { "epoch": 0.18, "grad_norm": 0.1728515625, "learning_rate": 0.00036549165120593695, "loss": 2.2318, "step": 985 }, { "epoch": 0.18, "grad_norm": 0.173828125, "learning_rate": 0.00036734693877551024, "loss": 2.1858, "step": 990 }, { "epoch": 0.18, "grad_norm": 0.169921875, "learning_rate": 0.0003692022263450835, "loss": 2.1947, "step": 995 }, { "epoch": 0.19, "grad_norm": 0.1787109375, "learning_rate": 0.00037105751391465676, "loss": 2.2203, "step": 1000 }, { "epoch": 0.19, "grad_norm": 0.1884765625, "learning_rate": 0.00037291280148423004, "loss": 2.2217, "step": 1005 }, { "epoch": 0.19, "grad_norm": 0.1826171875, "learning_rate": 0.0003747680890538034, "loss": 2.2001, "step": 1010 }, { "epoch": 0.19, "grad_norm": 0.1748046875, "learning_rate": 0.00037662337662337667, "loss": 2.2013, "step": 1015 }, { "epoch": 0.19, "grad_norm": 0.1748046875, "learning_rate": 0.0003784786641929499, "loss": 2.2198, "step": 1020 }, { "epoch": 0.19, "grad_norm": 0.1767578125, "learning_rate": 0.0003803339517625232, "loss": 2.2236, "step": 1025 }, { "epoch": 0.19, "grad_norm": 0.1787109375, "learning_rate": 0.00038218923933209647, "loss": 2.2227, "step": 1030 }, { "epoch": 0.19, "grad_norm": 0.1904296875, "learning_rate": 0.0003840445269016698, "loss": 2.2275, "step": 1035 }, { "epoch": 0.19, "grad_norm": 0.1796875, "learning_rate": 0.0003858998144712431, "loss": 2.2241, "step": 1040 }, { "epoch": 0.19, "grad_norm": 0.1708984375, "learning_rate": 0.0003877551020408163, "loss": 2.2066, "step": 1045 }, { "epoch": 0.19, "grad_norm": 0.1787109375, "learning_rate": 0.0003896103896103896, "loss": 2.1842, "step": 1050 }, { "epoch": 0.2, "grad_norm": 0.1748046875, "learning_rate": 0.0003914656771799629, "loss": 2.1279, "step": 1055 }, { "epoch": 0.2, "grad_norm": 0.1748046875, "learning_rate": 0.00039332096474953624, "loss": 2.1712, "step": 1060 }, { "epoch": 0.2, "grad_norm": 0.1904296875, "learning_rate": 0.00039517625231910947, "loss": 2.1972, "step": 1065 }, { "epoch": 0.2, "grad_norm": 0.173828125, "learning_rate": 0.00039703153988868275, "loss": 2.2082, "step": 1070 }, { "epoch": 0.2, "grad_norm": 0.1767578125, "learning_rate": 0.00039888682745825604, "loss": 2.21, "step": 1075 }, { "epoch": 0.2, "grad_norm": 0.1728515625, "learning_rate": 0.0003999999580591546, "loss": 2.2126, "step": 1080 }, { "epoch": 0.2, "grad_norm": 0.1806640625, "learning_rate": 0.00039999948622484506, "loss": 2.1857, "step": 1085 }, { "epoch": 0.2, "grad_norm": 0.1748046875, "learning_rate": 0.0003999984901314102, "loss": 2.1972, "step": 1090 }, { "epoch": 0.2, "grad_norm": 0.17578125, "learning_rate": 0.0003999969697814611, "loss": 2.1948, "step": 1095 }, { "epoch": 0.2, "grad_norm": 0.1708984375, "learning_rate": 0.00039999492517898294, "loss": 2.2061, "step": 1100 }, { "epoch": 0.21, "grad_norm": 0.177734375, "learning_rate": 0.00039999235632933523, "loss": 2.2225, "step": 1105 }, { "epoch": 0.21, "grad_norm": 0.1748046875, "learning_rate": 0.0003999892632392519, "loss": 2.2043, "step": 1110 }, { "epoch": 0.21, "grad_norm": 0.1728515625, "learning_rate": 0.00039998564591684063, "loss": 2.1753, "step": 1115 }, { "epoch": 0.21, "grad_norm": 0.1806640625, "learning_rate": 0.00039998150437158366, "loss": 2.1665, "step": 1120 }, { "epoch": 0.21, "grad_norm": 0.181640625, "learning_rate": 0.00039997683861433723, "loss": 2.2314, "step": 1125 }, { "epoch": 0.21, "grad_norm": 0.1728515625, "learning_rate": 0.0003999716486573316, "loss": 2.1757, "step": 1130 }, { "epoch": 0.21, "grad_norm": 0.1728515625, "learning_rate": 0.0003999659345141714, "loss": 2.2133, "step": 1135 }, { "epoch": 0.21, "grad_norm": 0.18359375, "learning_rate": 0.0003999596961998349, "loss": 2.204, "step": 1140 }, { "epoch": 0.21, "grad_norm": 0.1826171875, "learning_rate": 0.0003999529337306748, "loss": 2.1927, "step": 1145 }, { "epoch": 0.21, "grad_norm": 0.1787109375, "learning_rate": 0.0003999456471244174, "loss": 2.2271, "step": 1150 }, { "epoch": 0.21, "grad_norm": 0.1826171875, "learning_rate": 0.00039993783640016327, "loss": 2.1898, "step": 1155 }, { "epoch": 0.22, "grad_norm": 0.171875, "learning_rate": 0.0003999295015783866, "loss": 2.1957, "step": 1160 }, { "epoch": 0.22, "grad_norm": 0.1806640625, "learning_rate": 0.00039992064268093544, "loss": 2.1883, "step": 1165 }, { "epoch": 0.22, "grad_norm": 0.1767578125, "learning_rate": 0.00039991125973103174, "loss": 2.2178, "step": 1170 }, { "epoch": 0.22, "grad_norm": 0.1796875, "learning_rate": 0.00039990135275327096, "loss": 2.2018, "step": 1175 }, { "epoch": 0.22, "grad_norm": 0.1787109375, "learning_rate": 0.0003998909217736223, "loss": 2.2102, "step": 1180 }, { "epoch": 0.22, "grad_norm": 0.1845703125, "learning_rate": 0.0003998799668194285, "loss": 2.1726, "step": 1185 }, { "epoch": 0.22, "grad_norm": 0.1865234375, "learning_rate": 0.0003998684879194059, "loss": 2.2063, "step": 1190 }, { "epoch": 0.22, "grad_norm": 0.1748046875, "learning_rate": 0.0003998564851036441, "loss": 2.2208, "step": 1195 }, { "epoch": 0.22, "grad_norm": 0.1826171875, "learning_rate": 0.00039984395840360603, "loss": 2.1942, "step": 1200 }, { "epoch": 0.22, "grad_norm": 0.1796875, "learning_rate": 0.0003998309078521281, "loss": 2.2295, "step": 1205 }, { "epoch": 0.22, "grad_norm": 0.173828125, "learning_rate": 0.00039981733348341966, "loss": 2.1758, "step": 1210 }, { "epoch": 0.23, "grad_norm": 0.17578125, "learning_rate": 0.00039980323533306327, "loss": 2.2272, "step": 1215 }, { "epoch": 0.23, "grad_norm": 0.1767578125, "learning_rate": 0.00039978861343801446, "loss": 2.1481, "step": 1220 }, { "epoch": 0.23, "grad_norm": 0.1796875, "learning_rate": 0.0003997734678366016, "loss": 2.2496, "step": 1225 }, { "epoch": 0.23, "grad_norm": 0.1708984375, "learning_rate": 0.00039975779856852596, "loss": 2.1578, "step": 1230 }, { "epoch": 0.23, "grad_norm": 0.1787109375, "learning_rate": 0.0003997416056748613, "loss": 2.1974, "step": 1235 }, { "epoch": 0.23, "grad_norm": 0.177734375, "learning_rate": 0.0003997248891980542, "loss": 2.195, "step": 1240 }, { "epoch": 0.23, "grad_norm": 0.1806640625, "learning_rate": 0.00039970764918192356, "loss": 2.1602, "step": 1245 }, { "epoch": 0.23, "grad_norm": 0.1806640625, "learning_rate": 0.0003996898856716607, "loss": 2.1616, "step": 1250 }, { "epoch": 0.23, "grad_norm": 0.17578125, "learning_rate": 0.00039967159871382915, "loss": 2.2104, "step": 1255 }, { "epoch": 0.23, "grad_norm": 0.18359375, "learning_rate": 0.0003996527883563645, "loss": 2.2346, "step": 1260 }, { "epoch": 0.23, "grad_norm": 0.1767578125, "learning_rate": 0.0003996334546485744, "loss": 2.1678, "step": 1265 }, { "epoch": 0.24, "grad_norm": 0.1767578125, "learning_rate": 0.00039961359764113845, "loss": 2.2538, "step": 1270 }, { "epoch": 0.24, "grad_norm": 0.171875, "learning_rate": 0.00039959321738610777, "loss": 2.1512, "step": 1275 }, { "epoch": 0.24, "grad_norm": 0.1796875, "learning_rate": 0.0003995723139369052, "loss": 2.1887, "step": 1280 }, { "epoch": 0.24, "grad_norm": 0.18359375, "learning_rate": 0.00039955088734832485, "loss": 2.2243, "step": 1285 }, { "epoch": 0.24, "grad_norm": 0.1826171875, "learning_rate": 0.00039952893767653257, "loss": 2.182, "step": 1290 }, { "epoch": 0.24, "grad_norm": 0.173828125, "learning_rate": 0.0003995064649790649, "loss": 2.2166, "step": 1295 }, { "epoch": 0.24, "grad_norm": 0.1708984375, "learning_rate": 0.00039948346931482963, "loss": 2.2192, "step": 1300 }, { "epoch": 0.24, "grad_norm": 0.17578125, "learning_rate": 0.0003994599507441053, "loss": 2.2279, "step": 1305 }, { "epoch": 0.24, "grad_norm": 0.1787109375, "learning_rate": 0.00039943590932854124, "loss": 2.1521, "step": 1310 }, { "epoch": 0.24, "grad_norm": 0.1806640625, "learning_rate": 0.00039941134513115734, "loss": 2.1969, "step": 1315 }, { "epoch": 0.24, "grad_norm": 0.1767578125, "learning_rate": 0.00039938625821634365, "loss": 2.1979, "step": 1320 }, { "epoch": 0.25, "grad_norm": 0.18359375, "learning_rate": 0.00039936064864986063, "loss": 2.1844, "step": 1325 }, { "epoch": 0.25, "grad_norm": 0.1767578125, "learning_rate": 0.00039933451649883865, "loss": 2.2703, "step": 1330 }, { "epoch": 0.25, "grad_norm": 0.1787109375, "learning_rate": 0.0003993078618317781, "loss": 2.218, "step": 1335 }, { "epoch": 0.25, "grad_norm": 0.1796875, "learning_rate": 0.00039928068471854875, "loss": 2.1867, "step": 1340 }, { "epoch": 0.25, "grad_norm": 0.1767578125, "learning_rate": 0.00039925298523039017, "loss": 2.1961, "step": 1345 }, { "epoch": 0.25, "grad_norm": 0.1787109375, "learning_rate": 0.000399224763439911, "loss": 2.1803, "step": 1350 }, { "epoch": 0.25, "grad_norm": 0.1787109375, "learning_rate": 0.0003991960194210892, "loss": 2.2029, "step": 1355 }, { "epoch": 0.25, "grad_norm": 0.1748046875, "learning_rate": 0.0003991667532492714, "loss": 2.1686, "step": 1360 }, { "epoch": 0.25, "grad_norm": 0.1787109375, "learning_rate": 0.0003991369650011731, "loss": 2.18, "step": 1365 }, { "epoch": 0.25, "grad_norm": 0.1826171875, "learning_rate": 0.0003991066547548785, "loss": 2.1813, "step": 1370 }, { "epoch": 0.26, "grad_norm": 0.17578125, "learning_rate": 0.00039907582258983965, "loss": 2.1894, "step": 1375 }, { "epoch": 0.26, "grad_norm": 0.1767578125, "learning_rate": 0.00039904446858687713, "loss": 2.2217, "step": 1380 }, { "epoch": 0.26, "grad_norm": 0.1806640625, "learning_rate": 0.0003990125928281793, "loss": 2.1899, "step": 1385 }, { "epoch": 0.26, "grad_norm": 0.1767578125, "learning_rate": 0.00039898019539730197, "loss": 2.202, "step": 1390 }, { "epoch": 0.26, "grad_norm": 0.177734375, "learning_rate": 0.0003989472763791688, "loss": 2.1604, "step": 1395 }, { "epoch": 0.26, "grad_norm": 0.181640625, "learning_rate": 0.00039891383586007043, "loss": 2.1963, "step": 1400 }, { "epoch": 0.26, "grad_norm": 0.1943359375, "learning_rate": 0.00039887987392766454, "loss": 2.1794, "step": 1405 }, { "epoch": 0.26, "grad_norm": 0.17578125, "learning_rate": 0.0003988453906709756, "loss": 2.1875, "step": 1410 }, { "epoch": 0.26, "grad_norm": 0.185546875, "learning_rate": 0.0003988103861803948, "loss": 2.1734, "step": 1415 }, { "epoch": 0.26, "grad_norm": 0.1748046875, "learning_rate": 0.0003987748605476793, "loss": 2.1798, "step": 1420 }, { "epoch": 0.26, "grad_norm": 0.177734375, "learning_rate": 0.0003987388138659526, "loss": 2.2137, "step": 1425 }, { "epoch": 0.27, "grad_norm": 0.1865234375, "learning_rate": 0.000398702246229704, "loss": 2.1868, "step": 1430 }, { "epoch": 0.27, "grad_norm": 0.177734375, "learning_rate": 0.00039866515773478826, "loss": 2.1732, "step": 1435 }, { "epoch": 0.27, "grad_norm": 0.1796875, "learning_rate": 0.00039862754847842563, "loss": 2.1935, "step": 1440 }, { "epoch": 0.27, "grad_norm": 0.181640625, "learning_rate": 0.0003985894185592012, "loss": 2.1878, "step": 1445 }, { "epoch": 0.27, "grad_norm": 0.185546875, "learning_rate": 0.00039855076807706523, "loss": 2.1381, "step": 1450 }, { "epoch": 0.27, "grad_norm": 0.1796875, "learning_rate": 0.0003985115971333321, "loss": 2.1796, "step": 1455 }, { "epoch": 0.27, "grad_norm": 0.1796875, "learning_rate": 0.0003984719058306808, "loss": 2.2372, "step": 1460 }, { "epoch": 0.27, "grad_norm": 0.1826171875, "learning_rate": 0.00039843169427315425, "loss": 2.2127, "step": 1465 }, { "epoch": 0.27, "grad_norm": 0.1796875, "learning_rate": 0.0003983909625661591, "loss": 2.1515, "step": 1470 }, { "epoch": 0.27, "grad_norm": 0.1845703125, "learning_rate": 0.0003983497108164654, "loss": 2.2013, "step": 1475 }, { "epoch": 0.27, "grad_norm": 0.17578125, "learning_rate": 0.0003983079391322065, "loss": 2.1768, "step": 1480 }, { "epoch": 0.28, "grad_norm": 0.1796875, "learning_rate": 0.0003982656476228787, "loss": 2.2009, "step": 1485 }, { "epoch": 0.28, "grad_norm": 0.1767578125, "learning_rate": 0.0003982228363993406, "loss": 2.24, "step": 1490 }, { "epoch": 0.28, "grad_norm": 0.181640625, "learning_rate": 0.0003981795055738137, "loss": 2.2217, "step": 1495 }, { "epoch": 0.28, "grad_norm": 0.1767578125, "learning_rate": 0.00039813565525988084, "loss": 2.1766, "step": 1500 }, { "epoch": 0.28, "grad_norm": 0.1875, "learning_rate": 0.00039809128557248726, "loss": 2.1778, "step": 1505 }, { "epoch": 0.28, "grad_norm": 0.1826171875, "learning_rate": 0.00039804639662793914, "loss": 2.2053, "step": 1510 }, { "epoch": 0.28, "grad_norm": 0.1826171875, "learning_rate": 0.000398000988543904, "loss": 2.2288, "step": 1515 }, { "epoch": 0.28, "grad_norm": 0.181640625, "learning_rate": 0.00039795506143941017, "loss": 2.1998, "step": 1520 }, { "epoch": 0.28, "grad_norm": 0.1787109375, "learning_rate": 0.0003979086154348465, "loss": 2.2061, "step": 1525 }, { "epoch": 0.28, "grad_norm": 0.1796875, "learning_rate": 0.00039786165065196205, "loss": 2.1373, "step": 1530 }, { "epoch": 0.28, "grad_norm": 0.181640625, "learning_rate": 0.00039781416721386566, "loss": 2.2074, "step": 1535 }, { "epoch": 0.29, "grad_norm": 0.1796875, "learning_rate": 0.0003977661652450257, "loss": 2.2157, "step": 1540 }, { "epoch": 0.29, "grad_norm": 0.1884765625, "learning_rate": 0.00039771764487127, "loss": 2.2054, "step": 1545 }, { "epoch": 0.29, "grad_norm": 0.1826171875, "learning_rate": 0.00039766860621978504, "loss": 2.1908, "step": 1550 }, { "epoch": 0.29, "grad_norm": 0.1826171875, "learning_rate": 0.00039761904941911603, "loss": 2.1985, "step": 1555 }, { "epoch": 0.29, "grad_norm": 0.1875, "learning_rate": 0.0003975689745991662, "loss": 2.1675, "step": 1560 }, { "epoch": 0.29, "grad_norm": 0.185546875, "learning_rate": 0.0003975183818911969, "loss": 2.178, "step": 1565 }, { "epoch": 0.29, "grad_norm": 0.1826171875, "learning_rate": 0.00039746727142782686, "loss": 2.1903, "step": 1570 }, { "epoch": 0.29, "grad_norm": 0.181640625, "learning_rate": 0.0003974156433430321, "loss": 2.2173, "step": 1575 }, { "epoch": 0.29, "grad_norm": 0.1796875, "learning_rate": 0.0003973634977721454, "loss": 2.235, "step": 1580 }, { "epoch": 0.29, "grad_norm": 0.1796875, "learning_rate": 0.00039731083485185605, "loss": 2.159, "step": 1585 }, { "epoch": 0.29, "grad_norm": 0.1806640625, "learning_rate": 0.0003972576547202096, "loss": 2.2094, "step": 1590 }, { "epoch": 0.3, "grad_norm": 0.181640625, "learning_rate": 0.0003972039575166071, "loss": 2.2054, "step": 1595 }, { "epoch": 0.3, "grad_norm": 0.1875, "learning_rate": 0.0003971497433818053, "loss": 2.166, "step": 1600 }, { "epoch": 0.3, "grad_norm": 0.1865234375, "learning_rate": 0.00039709501245791575, "loss": 2.216, "step": 1605 }, { "epoch": 0.3, "grad_norm": 0.1865234375, "learning_rate": 0.0003970397648884048, "loss": 2.1989, "step": 1610 }, { "epoch": 0.3, "grad_norm": 0.1806640625, "learning_rate": 0.000396984000818093, "loss": 2.159, "step": 1615 }, { "epoch": 0.3, "grad_norm": 0.1796875, "learning_rate": 0.00039692772039315484, "loss": 2.2085, "step": 1620 }, { "epoch": 0.3, "grad_norm": 0.1845703125, "learning_rate": 0.0003968709237611183, "loss": 2.1399, "step": 1625 }, { "epoch": 0.3, "grad_norm": 0.18359375, "learning_rate": 0.00039681361107086463, "loss": 2.1798, "step": 1630 }, { "epoch": 0.3, "grad_norm": 0.1787109375, "learning_rate": 0.0003967557824726276, "loss": 2.1858, "step": 1635 }, { "epoch": 0.3, "grad_norm": 0.181640625, "learning_rate": 0.00039669743811799354, "loss": 2.1729, "step": 1640 }, { "epoch": 0.31, "grad_norm": 0.1865234375, "learning_rate": 0.0003966385781599006, "loss": 2.2067, "step": 1645 }, { "epoch": 0.31, "grad_norm": 0.1806640625, "learning_rate": 0.00039657920275263856, "loss": 2.1663, "step": 1650 }, { "epoch": 0.31, "grad_norm": 0.185546875, "learning_rate": 0.00039651931205184824, "loss": 2.1732, "step": 1655 }, { "epoch": 0.31, "grad_norm": 0.1875, "learning_rate": 0.00039645890621452137, "loss": 2.1602, "step": 1660 }, { "epoch": 0.31, "grad_norm": 0.1923828125, "learning_rate": 0.0003963979853989999, "loss": 2.2557, "step": 1665 }, { "epoch": 0.31, "grad_norm": 0.1953125, "learning_rate": 0.00039633654976497563, "loss": 2.149, "step": 1670 }, { "epoch": 0.31, "grad_norm": 0.1806640625, "learning_rate": 0.00039627459947349, "loss": 2.172, "step": 1675 }, { "epoch": 0.31, "grad_norm": 0.1826171875, "learning_rate": 0.00039621213468693343, "loss": 2.1722, "step": 1680 }, { "epoch": 0.31, "grad_norm": 0.1875, "learning_rate": 0.000396149155569045, "loss": 2.177, "step": 1685 }, { "epoch": 0.31, "grad_norm": 0.181640625, "learning_rate": 0.00039608566228491204, "loss": 2.1349, "step": 1690 }, { "epoch": 0.31, "grad_norm": 0.1865234375, "learning_rate": 0.00039602165500096973, "loss": 2.2292, "step": 1695 }, { "epoch": 0.32, "grad_norm": 0.1845703125, "learning_rate": 0.00039595713388500037, "loss": 2.1504, "step": 1700 }, { "epoch": 0.32, "grad_norm": 0.1875, "learning_rate": 0.00039589209910613336, "loss": 2.153, "step": 1705 }, { "epoch": 0.32, "grad_norm": 0.18359375, "learning_rate": 0.00039582655083484454, "loss": 2.2195, "step": 1710 }, { "epoch": 0.32, "grad_norm": 0.1787109375, "learning_rate": 0.00039576048924295576, "loss": 2.1511, "step": 1715 }, { "epoch": 0.32, "grad_norm": 0.1904296875, "learning_rate": 0.0003956939145036344, "loss": 2.1825, "step": 1720 }, { "epoch": 0.32, "grad_norm": 0.18359375, "learning_rate": 0.000395626826791393, "loss": 2.2075, "step": 1725 }, { "epoch": 0.32, "grad_norm": 0.18359375, "learning_rate": 0.00039555922628208874, "loss": 2.2107, "step": 1730 }, { "epoch": 0.32, "grad_norm": 0.1845703125, "learning_rate": 0.00039549111315292294, "loss": 2.2175, "step": 1735 }, { "epoch": 0.32, "grad_norm": 0.1826171875, "learning_rate": 0.00039542248758244077, "loss": 2.1779, "step": 1740 }, { "epoch": 0.32, "grad_norm": 0.1953125, "learning_rate": 0.0003953533497505306, "loss": 2.1802, "step": 1745 }, { "epoch": 0.32, "grad_norm": 0.1787109375, "learning_rate": 0.00039528369983842356, "loss": 2.1587, "step": 1750 }, { "epoch": 0.33, "grad_norm": 0.1904296875, "learning_rate": 0.0003952135380286931, "loss": 2.1503, "step": 1755 }, { "epoch": 0.33, "grad_norm": 0.18359375, "learning_rate": 0.00039514286450525457, "loss": 2.2139, "step": 1760 }, { "epoch": 0.33, "grad_norm": 0.1904296875, "learning_rate": 0.0003950716794533647, "loss": 2.1928, "step": 1765 }, { "epoch": 0.33, "grad_norm": 0.1787109375, "learning_rate": 0.000394999983059621, "loss": 2.1583, "step": 1770 }, { "epoch": 0.33, "grad_norm": 0.177734375, "learning_rate": 0.00039492777551196134, "loss": 2.2097, "step": 1775 }, { "epoch": 0.33, "grad_norm": 0.1845703125, "learning_rate": 0.00039485505699966356, "loss": 2.175, "step": 1780 }, { "epoch": 0.33, "grad_norm": 0.18359375, "learning_rate": 0.00039478182771334494, "loss": 2.1728, "step": 1785 }, { "epoch": 0.33, "grad_norm": 0.18359375, "learning_rate": 0.0003947080878449615, "loss": 2.1756, "step": 1790 }, { "epoch": 0.33, "grad_norm": 0.1826171875, "learning_rate": 0.0003946338375878078, "loss": 2.2138, "step": 1795 }, { "epoch": 0.33, "grad_norm": 0.1884765625, "learning_rate": 0.00039455907713651614, "loss": 2.1444, "step": 1800 }, { "epoch": 0.33, "grad_norm": 0.1884765625, "learning_rate": 0.0003944838066870563, "loss": 2.1863, "step": 1805 }, { "epoch": 0.34, "grad_norm": 0.1826171875, "learning_rate": 0.00039440802643673486, "loss": 2.1562, "step": 1810 }, { "epoch": 0.34, "grad_norm": 0.1845703125, "learning_rate": 0.00039433173658419483, "loss": 2.2186, "step": 1815 }, { "epoch": 0.34, "grad_norm": 0.18359375, "learning_rate": 0.0003942549373294149, "loss": 2.1931, "step": 1820 }, { "epoch": 0.34, "grad_norm": 0.1796875, "learning_rate": 0.00039417762887370924, "loss": 2.1274, "step": 1825 }, { "epoch": 0.34, "grad_norm": 0.1845703125, "learning_rate": 0.0003940998114197266, "loss": 2.2013, "step": 1830 }, { "epoch": 0.34, "grad_norm": 0.1787109375, "learning_rate": 0.0003940214851714501, "loss": 2.177, "step": 1835 }, { "epoch": 0.34, "grad_norm": 0.1875, "learning_rate": 0.0003939426503341965, "loss": 2.1978, "step": 1840 }, { "epoch": 0.34, "grad_norm": 0.185546875, "learning_rate": 0.0003938633071146158, "loss": 2.2461, "step": 1845 }, { "epoch": 0.34, "grad_norm": 0.1826171875, "learning_rate": 0.00039378345572069044, "loss": 2.1986, "step": 1850 }, { "epoch": 0.34, "grad_norm": 0.1826171875, "learning_rate": 0.00039370309636173513, "loss": 2.1906, "step": 1855 }, { "epoch": 0.35, "grad_norm": 0.1845703125, "learning_rate": 0.00039362222924839614, "loss": 2.1805, "step": 1860 }, { "epoch": 0.35, "grad_norm": 0.1845703125, "learning_rate": 0.00039354085459265055, "loss": 2.1797, "step": 1865 }, { "epoch": 0.35, "grad_norm": 0.1865234375, "learning_rate": 0.0003934589726078059, "loss": 2.1593, "step": 1870 }, { "epoch": 0.35, "grad_norm": 0.1845703125, "learning_rate": 0.00039337658350849973, "loss": 2.1646, "step": 1875 }, { "epoch": 0.35, "grad_norm": 0.1923828125, "learning_rate": 0.0003932936875106986, "loss": 2.215, "step": 1880 }, { "epoch": 0.35, "grad_norm": 0.1865234375, "learning_rate": 0.00039321028483169817, "loss": 2.1932, "step": 1885 }, { "epoch": 0.35, "grad_norm": 0.181640625, "learning_rate": 0.00039312637569012207, "loss": 2.1654, "step": 1890 }, { "epoch": 0.35, "grad_norm": 0.1806640625, "learning_rate": 0.0003930419603059214, "loss": 2.1686, "step": 1895 }, { "epoch": 0.35, "grad_norm": 0.1845703125, "learning_rate": 0.00039295703890037444, "loss": 2.1788, "step": 1900 }, { "epoch": 0.35, "grad_norm": 0.1865234375, "learning_rate": 0.00039287161169608597, "loss": 2.2414, "step": 1905 }, { "epoch": 0.35, "grad_norm": 0.185546875, "learning_rate": 0.0003927856789169865, "loss": 2.2244, "step": 1910 }, { "epoch": 0.36, "grad_norm": 0.189453125, "learning_rate": 0.0003926992407883317, "loss": 2.1363, "step": 1915 }, { "epoch": 0.36, "grad_norm": 0.1923828125, "learning_rate": 0.0003926122975367022, "loss": 2.204, "step": 1920 }, { "epoch": 0.36, "grad_norm": 0.18359375, "learning_rate": 0.0003925248493900024, "loss": 2.181, "step": 1925 }, { "epoch": 0.36, "grad_norm": 0.1953125, "learning_rate": 0.00039243689657746046, "loss": 2.1653, "step": 1930 }, { "epoch": 0.36, "grad_norm": 0.181640625, "learning_rate": 0.0003923484393296273, "loss": 2.1717, "step": 1935 }, { "epoch": 0.36, "grad_norm": 0.1962890625, "learning_rate": 0.000392259477878376, "loss": 2.1734, "step": 1940 }, { "epoch": 0.36, "grad_norm": 0.1943359375, "learning_rate": 0.0003921700124569015, "loss": 2.1973, "step": 1945 }, { "epoch": 0.36, "grad_norm": 0.189453125, "learning_rate": 0.0003920800432997197, "loss": 2.189, "step": 1950 }, { "epoch": 0.36, "grad_norm": 0.2080078125, "learning_rate": 0.000391989570642667, "loss": 2.2058, "step": 1955 }, { "epoch": 0.36, "grad_norm": 0.181640625, "learning_rate": 0.0003918985947228995, "loss": 2.174, "step": 1960 }, { "epoch": 0.36, "grad_norm": 0.1923828125, "learning_rate": 0.00039180711577889264, "loss": 2.2542, "step": 1965 }, { "epoch": 0.37, "grad_norm": 0.1884765625, "learning_rate": 0.0003917151340504405, "loss": 2.1754, "step": 1970 }, { "epoch": 0.37, "grad_norm": 0.193359375, "learning_rate": 0.0003916226497786548, "loss": 2.2225, "step": 1975 }, { "epoch": 0.37, "grad_norm": 0.181640625, "learning_rate": 0.0003915296632059649, "loss": 2.1529, "step": 1980 }, { "epoch": 0.37, "grad_norm": 0.1875, "learning_rate": 0.00039143617457611674, "loss": 2.1837, "step": 1985 }, { "epoch": 0.37, "grad_norm": 0.1845703125, "learning_rate": 0.0003913421841341723, "loss": 2.1773, "step": 1990 }, { "epoch": 0.37, "grad_norm": 0.1904296875, "learning_rate": 0.00039124769212650883, "loss": 2.1852, "step": 1995 }, { "epoch": 0.37, "grad_norm": 0.1953125, "learning_rate": 0.0003911526988008185, "loss": 2.2199, "step": 2000 }, { "epoch": 0.37, "grad_norm": 0.1904296875, "learning_rate": 0.00039105720440610765, "loss": 2.2312, "step": 2005 }, { "epoch": 0.37, "grad_norm": 0.1904296875, "learning_rate": 0.00039096120919269577, "loss": 2.2002, "step": 2010 }, { "epoch": 0.37, "grad_norm": 0.1826171875, "learning_rate": 0.0003908647134122156, "loss": 2.1313, "step": 2015 }, { "epoch": 0.37, "grad_norm": 0.1923828125, "learning_rate": 0.0003907677173176115, "loss": 2.1836, "step": 2020 }, { "epoch": 0.38, "grad_norm": 0.1923828125, "learning_rate": 0.00039067022116313964, "loss": 2.2171, "step": 2025 }, { "epoch": 0.38, "grad_norm": 0.1845703125, "learning_rate": 0.000390572225204367, "loss": 2.1877, "step": 2030 }, { "epoch": 0.38, "grad_norm": 0.18359375, "learning_rate": 0.00039047372969817044, "loss": 2.1615, "step": 2035 }, { "epoch": 0.38, "grad_norm": 0.1865234375, "learning_rate": 0.00039037473490273673, "loss": 2.1992, "step": 2040 }, { "epoch": 0.38, "grad_norm": 0.189453125, "learning_rate": 0.0003902752410775609, "loss": 2.1588, "step": 2045 }, { "epoch": 0.38, "grad_norm": 0.19140625, "learning_rate": 0.00039017524848344653, "loss": 2.1624, "step": 2050 }, { "epoch": 0.38, "grad_norm": 0.185546875, "learning_rate": 0.0003900747573825044, "loss": 2.1364, "step": 2055 }, { "epoch": 0.38, "grad_norm": 0.185546875, "learning_rate": 0.00038997376803815196, "loss": 2.1946, "step": 2060 }, { "epoch": 0.38, "grad_norm": 0.1845703125, "learning_rate": 0.0003898722807151129, "loss": 2.1941, "step": 2065 }, { "epoch": 0.38, "grad_norm": 0.197265625, "learning_rate": 0.0003897702956794163, "loss": 2.217, "step": 2070 }, { "epoch": 0.38, "grad_norm": 0.1904296875, "learning_rate": 0.0003896678131983956, "loss": 2.1456, "step": 2075 }, { "epoch": 0.39, "grad_norm": 0.2177734375, "learning_rate": 0.0003895648335406884, "loss": 2.1496, "step": 2080 }, { "epoch": 0.39, "grad_norm": 0.197265625, "learning_rate": 0.0003894613569762356, "loss": 2.1576, "step": 2085 }, { "epoch": 0.39, "grad_norm": 0.181640625, "learning_rate": 0.00038935738377628045, "loss": 2.1978, "step": 2090 }, { "epoch": 0.39, "grad_norm": 0.19140625, "learning_rate": 0.00038925291421336824, "loss": 2.221, "step": 2095 }, { "epoch": 0.39, "grad_norm": 0.1845703125, "learning_rate": 0.0003891479485613452, "loss": 2.1936, "step": 2100 }, { "epoch": 0.39, "grad_norm": 0.189453125, "learning_rate": 0.00038904248709535817, "loss": 2.2115, "step": 2105 }, { "epoch": 0.39, "grad_norm": 0.189453125, "learning_rate": 0.0003889365300918534, "loss": 2.1994, "step": 2110 }, { "epoch": 0.39, "grad_norm": 0.1865234375, "learning_rate": 0.00038883007782857627, "loss": 2.2152, "step": 2115 }, { "epoch": 0.39, "grad_norm": 0.1884765625, "learning_rate": 0.00038872313058457044, "loss": 2.1625, "step": 2120 }, { "epoch": 0.39, "grad_norm": 0.1943359375, "learning_rate": 0.0003886156886401768, "loss": 2.1987, "step": 2125 }, { "epoch": 0.4, "grad_norm": 0.1845703125, "learning_rate": 0.0003885077522770334, "loss": 2.1692, "step": 2130 }, { "epoch": 0.4, "grad_norm": 0.1875, "learning_rate": 0.00038839932177807385, "loss": 2.1872, "step": 2135 }, { "epoch": 0.4, "grad_norm": 0.1904296875, "learning_rate": 0.0003882903974275275, "loss": 2.2051, "step": 2140 }, { "epoch": 0.4, "grad_norm": 0.18359375, "learning_rate": 0.00038818097951091776, "loss": 2.2039, "step": 2145 }, { "epoch": 0.4, "grad_norm": 0.1884765625, "learning_rate": 0.0003880710683150622, "loss": 2.1854, "step": 2150 }, { "epoch": 0.4, "grad_norm": 0.1806640625, "learning_rate": 0.0003879606641280714, "loss": 2.2216, "step": 2155 }, { "epoch": 0.4, "grad_norm": 0.1865234375, "learning_rate": 0.00038784976723934796, "loss": 2.169, "step": 2160 }, { "epoch": 0.4, "grad_norm": 0.1923828125, "learning_rate": 0.00038773837793958625, "loss": 2.202, "step": 2165 }, { "epoch": 0.4, "grad_norm": 0.1865234375, "learning_rate": 0.0003876264965207712, "loss": 2.1729, "step": 2170 }, { "epoch": 0.4, "grad_norm": 0.1845703125, "learning_rate": 0.00038751412327617794, "loss": 2.1744, "step": 2175 }, { "epoch": 0.4, "grad_norm": 0.1875, "learning_rate": 0.0003874012585003707, "loss": 2.1703, "step": 2180 }, { "epoch": 0.41, "grad_norm": 0.1943359375, "learning_rate": 0.0003872879024892021, "loss": 2.2233, "step": 2185 }, { "epoch": 0.41, "grad_norm": 0.189453125, "learning_rate": 0.00038717405553981266, "loss": 2.2082, "step": 2190 }, { "epoch": 0.41, "grad_norm": 0.1904296875, "learning_rate": 0.00038705971795062954, "loss": 2.1604, "step": 2195 }, { "epoch": 0.41, "grad_norm": 0.1884765625, "learning_rate": 0.00038694489002136625, "loss": 2.1782, "step": 2200 }, { "epoch": 0.41, "grad_norm": 0.1826171875, "learning_rate": 0.00038682957205302137, "loss": 2.1704, "step": 2205 }, { "epoch": 0.41, "grad_norm": 0.1904296875, "learning_rate": 0.00038671376434787824, "loss": 2.1866, "step": 2210 }, { "epoch": 0.41, "grad_norm": 0.189453125, "learning_rate": 0.0003865974672095039, "loss": 2.1453, "step": 2215 }, { "epoch": 0.41, "grad_norm": 0.1845703125, "learning_rate": 0.00038648068094274823, "loss": 2.183, "step": 2220 }, { "epoch": 0.41, "grad_norm": 0.1845703125, "learning_rate": 0.0003863634058537434, "loss": 2.1609, "step": 2225 }, { "epoch": 0.41, "grad_norm": 0.1875, "learning_rate": 0.00038624564224990285, "loss": 2.2011, "step": 2230 }, { "epoch": 0.41, "grad_norm": 0.1953125, "learning_rate": 0.0003861273904399207, "loss": 2.1987, "step": 2235 }, { "epoch": 0.42, "grad_norm": 0.1865234375, "learning_rate": 0.0003860086507337705, "loss": 2.147, "step": 2240 }, { "epoch": 0.42, "grad_norm": 0.1943359375, "learning_rate": 0.00038588942344270504, "loss": 2.2012, "step": 2245 }, { "epoch": 0.42, "grad_norm": 0.1865234375, "learning_rate": 0.00038576970887925515, "loss": 2.1498, "step": 2250 }, { "epoch": 0.42, "grad_norm": 0.189453125, "learning_rate": 0.0003856495073572289, "loss": 2.1827, "step": 2255 }, { "epoch": 0.42, "grad_norm": 0.189453125, "learning_rate": 0.0003855288191917106, "loss": 2.2141, "step": 2260 }, { "epoch": 0.42, "grad_norm": 0.1904296875, "learning_rate": 0.00038540764469906073, "loss": 2.1551, "step": 2265 }, { "epoch": 0.42, "grad_norm": 0.1943359375, "learning_rate": 0.00038528598419691404, "loss": 2.1988, "step": 2270 }, { "epoch": 0.42, "grad_norm": 0.1884765625, "learning_rate": 0.0003851638380041796, "loss": 2.2131, "step": 2275 }, { "epoch": 0.42, "grad_norm": 0.1943359375, "learning_rate": 0.0003850412064410396, "loss": 2.2368, "step": 2280 }, { "epoch": 0.42, "grad_norm": 0.1884765625, "learning_rate": 0.0003849180898289482, "loss": 2.2131, "step": 2285 }, { "epoch": 0.42, "grad_norm": 0.189453125, "learning_rate": 0.00038479448849063145, "loss": 2.1753, "step": 2290 }, { "epoch": 0.43, "grad_norm": 0.1826171875, "learning_rate": 0.0003846704027500859, "loss": 2.125, "step": 2295 }, { "epoch": 0.43, "grad_norm": 0.1962890625, "learning_rate": 0.00038454583293257754, "loss": 2.1785, "step": 2300 }, { "epoch": 0.43, "grad_norm": 0.19140625, "learning_rate": 0.0003844207793646417, "loss": 2.1774, "step": 2305 }, { "epoch": 0.43, "grad_norm": 0.18359375, "learning_rate": 0.0003842952423740815, "loss": 2.201, "step": 2310 }, { "epoch": 0.43, "grad_norm": 0.1904296875, "learning_rate": 0.0003841692222899675, "loss": 2.1697, "step": 2315 }, { "epoch": 0.43, "grad_norm": 0.1962890625, "learning_rate": 0.00038404271944263635, "loss": 2.2196, "step": 2320 }, { "epoch": 0.43, "grad_norm": 0.189453125, "learning_rate": 0.0003839157341636903, "loss": 2.1834, "step": 2325 }, { "epoch": 0.43, "grad_norm": 0.19140625, "learning_rate": 0.0003837882667859961, "loss": 2.168, "step": 2330 }, { "epoch": 0.43, "grad_norm": 0.2001953125, "learning_rate": 0.0003836603176436842, "loss": 2.2153, "step": 2335 }, { "epoch": 0.43, "grad_norm": 0.1845703125, "learning_rate": 0.00038353188707214826, "loss": 2.1761, "step": 2340 }, { "epoch": 0.44, "grad_norm": 0.1904296875, "learning_rate": 0.0003834029754080435, "loss": 2.1737, "step": 2345 }, { "epoch": 0.44, "grad_norm": 0.1865234375, "learning_rate": 0.00038327358298928624, "loss": 2.152, "step": 2350 }, { "epoch": 0.44, "grad_norm": 0.1962890625, "learning_rate": 0.00038314371015505327, "loss": 2.2155, "step": 2355 }, { "epoch": 0.44, "grad_norm": 0.1943359375, "learning_rate": 0.00038301335724578057, "loss": 2.196, "step": 2360 }, { "epoch": 0.44, "grad_norm": 0.197265625, "learning_rate": 0.00038288252460316253, "loss": 2.1863, "step": 2365 }, { "epoch": 0.44, "grad_norm": 0.1875, "learning_rate": 0.000382751212570151, "loss": 2.1775, "step": 2370 }, { "epoch": 0.44, "grad_norm": 0.1865234375, "learning_rate": 0.0003826194214909545, "loss": 2.185, "step": 2375 }, { "epoch": 0.44, "grad_norm": 0.19921875, "learning_rate": 0.00038248715171103744, "loss": 2.1951, "step": 2380 }, { "epoch": 0.44, "grad_norm": 0.1953125, "learning_rate": 0.0003823544035771187, "loss": 2.143, "step": 2385 }, { "epoch": 0.44, "grad_norm": 0.19140625, "learning_rate": 0.0003822211774371715, "loss": 2.2101, "step": 2390 }, { "epoch": 0.44, "grad_norm": 0.1904296875, "learning_rate": 0.00038208747364042167, "loss": 2.1411, "step": 2395 }, { "epoch": 0.45, "grad_norm": 0.2001953125, "learning_rate": 0.00038195329253734735, "loss": 2.1732, "step": 2400 }, { "epoch": 0.45, "grad_norm": 0.1875, "learning_rate": 0.0003818186344796778, "loss": 2.1737, "step": 2405 }, { "epoch": 0.45, "grad_norm": 0.193359375, "learning_rate": 0.00038168349982039244, "loss": 2.185, "step": 2410 }, { "epoch": 0.45, "grad_norm": 0.1865234375, "learning_rate": 0.0003815478889137201, "loss": 2.1498, "step": 2415 }, { "epoch": 0.45, "grad_norm": 0.1865234375, "learning_rate": 0.000381411802115138, "loss": 2.1966, "step": 2420 }, { "epoch": 0.45, "grad_norm": 0.1904296875, "learning_rate": 0.0003812752397813708, "loss": 2.1869, "step": 2425 }, { "epoch": 0.45, "grad_norm": 0.1865234375, "learning_rate": 0.00038113820227038967, "loss": 2.1542, "step": 2430 }, { "epoch": 0.45, "grad_norm": 0.1884765625, "learning_rate": 0.0003810006899414113, "loss": 2.1937, "step": 2435 }, { "epoch": 0.45, "grad_norm": 0.1962890625, "learning_rate": 0.00038086270315489703, "loss": 2.178, "step": 2440 }, { "epoch": 0.45, "grad_norm": 0.1962890625, "learning_rate": 0.0003807242422725521, "loss": 2.199, "step": 2445 }, { "epoch": 0.45, "grad_norm": 0.193359375, "learning_rate": 0.0003805853076573243, "loss": 2.1818, "step": 2450 }, { "epoch": 0.46, "grad_norm": 0.1962890625, "learning_rate": 0.0003804458996734032, "loss": 2.1872, "step": 2455 }, { "epoch": 0.46, "grad_norm": 0.1953125, "learning_rate": 0.0003803060186862193, "loss": 2.1838, "step": 2460 }, { "epoch": 0.46, "grad_norm": 0.19140625, "learning_rate": 0.000380165665062443, "loss": 2.188, "step": 2465 }, { "epoch": 0.46, "grad_norm": 0.1875, "learning_rate": 0.0003800248391699836, "loss": 2.1769, "step": 2470 }, { "epoch": 0.46, "grad_norm": 0.1982421875, "learning_rate": 0.0003798835413779883, "loss": 2.1981, "step": 2475 }, { "epoch": 0.46, "grad_norm": 0.1953125, "learning_rate": 0.0003797417720568413, "loss": 2.1709, "step": 2480 }, { "epoch": 0.46, "grad_norm": 0.1865234375, "learning_rate": 0.0003795995315781629, "loss": 2.1286, "step": 2485 }, { "epoch": 0.46, "grad_norm": 0.201171875, "learning_rate": 0.00037945682031480845, "loss": 2.174, "step": 2490 }, { "epoch": 0.46, "grad_norm": 0.1923828125, "learning_rate": 0.0003793136386408673, "loss": 2.2026, "step": 2495 }, { "epoch": 0.46, "grad_norm": 0.1884765625, "learning_rate": 0.00037916998693166183, "loss": 2.1502, "step": 2500 }, { "epoch": 0.46, "grad_norm": 0.1884765625, "learning_rate": 0.00037902586556374666, "loss": 2.2136, "step": 2505 }, { "epoch": 0.47, "grad_norm": 0.1923828125, "learning_rate": 0.00037888127491490754, "loss": 2.1598, "step": 2510 }, { "epoch": 0.47, "grad_norm": 0.1923828125, "learning_rate": 0.00037873621536416017, "loss": 2.1932, "step": 2515 }, { "epoch": 0.47, "grad_norm": 0.189453125, "learning_rate": 0.00037859068729174955, "loss": 2.1698, "step": 2520 }, { "epoch": 0.47, "grad_norm": 0.19140625, "learning_rate": 0.00037844469107914874, "loss": 2.1692, "step": 2525 }, { "epoch": 0.47, "grad_norm": 0.1875, "learning_rate": 0.0003782982271090579, "loss": 2.2203, "step": 2530 }, { "epoch": 0.47, "grad_norm": 0.189453125, "learning_rate": 0.00037815129576540356, "loss": 2.1922, "step": 2535 }, { "epoch": 0.47, "grad_norm": 0.197265625, "learning_rate": 0.000378003897433337, "loss": 2.1406, "step": 2540 }, { "epoch": 0.47, "grad_norm": 0.1875, "learning_rate": 0.00037785603249923386, "loss": 2.1636, "step": 2545 }, { "epoch": 0.47, "grad_norm": 0.185546875, "learning_rate": 0.00037770770135069293, "loss": 2.1878, "step": 2550 }, { "epoch": 0.47, "grad_norm": 0.1826171875, "learning_rate": 0.000377558904376535, "loss": 2.1672, "step": 2555 }, { "epoch": 0.47, "grad_norm": 0.193359375, "learning_rate": 0.0003774096419668018, "loss": 2.1447, "step": 2560 }, { "epoch": 0.48, "grad_norm": 0.1953125, "learning_rate": 0.0003772599145127553, "loss": 2.1738, "step": 2565 }, { "epoch": 0.48, "grad_norm": 0.1904296875, "learning_rate": 0.00037710972240687654, "loss": 2.127, "step": 2570 }, { "epoch": 0.48, "grad_norm": 0.1865234375, "learning_rate": 0.00037695906604286427, "loss": 2.1835, "step": 2575 }, { "epoch": 0.48, "grad_norm": 0.19140625, "learning_rate": 0.0003768079458156344, "loss": 2.2312, "step": 2580 }, { "epoch": 0.48, "grad_norm": 0.1884765625, "learning_rate": 0.0003766563621213189, "loss": 2.1855, "step": 2585 }, { "epoch": 0.48, "grad_norm": 0.1923828125, "learning_rate": 0.0003765043153572643, "loss": 2.1818, "step": 2590 }, { "epoch": 0.48, "grad_norm": 0.189453125, "learning_rate": 0.0003763518059220311, "loss": 2.1717, "step": 2595 }, { "epoch": 0.48, "grad_norm": 0.1923828125, "learning_rate": 0.0003761988342153929, "loss": 2.1862, "step": 2600 }, { "epoch": 0.48, "grad_norm": 0.1875, "learning_rate": 0.0003760454006383345, "loss": 2.1539, "step": 2605 }, { "epoch": 0.48, "grad_norm": 0.1875, "learning_rate": 0.0003758915055930519, "loss": 2.1678, "step": 2610 }, { "epoch": 0.49, "grad_norm": 0.19140625, "learning_rate": 0.00037573714948295044, "loss": 2.1442, "step": 2615 }, { "epoch": 0.49, "grad_norm": 0.1962890625, "learning_rate": 0.00037558233271264423, "loss": 2.2111, "step": 2620 }, { "epoch": 0.49, "grad_norm": 0.1884765625, "learning_rate": 0.0003754270556879547, "loss": 2.1631, "step": 2625 }, { "epoch": 0.49, "grad_norm": 0.189453125, "learning_rate": 0.0003752713188159101, "loss": 2.1998, "step": 2630 }, { "epoch": 0.49, "grad_norm": 0.1904296875, "learning_rate": 0.00037511512250474363, "loss": 2.1674, "step": 2635 }, { "epoch": 0.49, "grad_norm": 0.1884765625, "learning_rate": 0.00037495846716389323, "loss": 2.1769, "step": 2640 }, { "epoch": 0.49, "grad_norm": 0.19140625, "learning_rate": 0.0003748013532039998, "loss": 2.1797, "step": 2645 }, { "epoch": 0.49, "grad_norm": 0.193359375, "learning_rate": 0.00037464378103690656, "loss": 2.1771, "step": 2650 }, { "epoch": 0.49, "grad_norm": 0.1904296875, "learning_rate": 0.00037448575107565786, "loss": 2.1965, "step": 2655 }, { "epoch": 0.49, "grad_norm": 0.1884765625, "learning_rate": 0.000374327263734498, "loss": 2.176, "step": 2660 }, { "epoch": 0.49, "grad_norm": 0.1943359375, "learning_rate": 0.0003741683194288701, "loss": 2.2076, "step": 2665 }, { "epoch": 0.5, "grad_norm": 0.1962890625, "learning_rate": 0.0003740089185754154, "loss": 2.1803, "step": 2670 }, { "epoch": 0.5, "grad_norm": 0.2001953125, "learning_rate": 0.0003738490615919716, "loss": 2.1576, "step": 2675 }, { "epoch": 0.5, "grad_norm": 0.2021484375, "learning_rate": 0.0003736887488975723, "loss": 2.2038, "step": 2680 }, { "epoch": 0.5, "grad_norm": 0.19140625, "learning_rate": 0.00037352798091244547, "loss": 2.1394, "step": 2685 }, { "epoch": 0.5, "grad_norm": 0.1953125, "learning_rate": 0.0003733667580580127, "loss": 2.2167, "step": 2690 }, { "epoch": 0.5, "grad_norm": 0.189453125, "learning_rate": 0.00037320508075688776, "loss": 2.1758, "step": 2695 }, { "epoch": 0.5, "grad_norm": 0.193359375, "learning_rate": 0.0003730429494328757, "loss": 2.1592, "step": 2700 }, { "epoch": 0.5, "grad_norm": 0.1865234375, "learning_rate": 0.0003728803645109719, "loss": 2.1819, "step": 2705 }, { "epoch": 0.5, "grad_norm": 0.1923828125, "learning_rate": 0.00037271732641736043, "loss": 2.2038, "step": 2710 }, { "epoch": 0.5, "grad_norm": 0.197265625, "learning_rate": 0.0003725538355794135, "loss": 2.191, "step": 2715 }, { "epoch": 0.5, "grad_norm": 0.2001953125, "learning_rate": 0.00037238989242569003, "loss": 2.1797, "step": 2720 }, { "epoch": 0.51, "grad_norm": 0.1923828125, "learning_rate": 0.0003722254973859346, "loss": 2.1828, "step": 2725 }, { "epoch": 0.51, "grad_norm": 0.1962890625, "learning_rate": 0.0003720606508910763, "loss": 2.1772, "step": 2730 }, { "epoch": 0.51, "grad_norm": 0.189453125, "learning_rate": 0.00037189535337322767, "loss": 2.1698, "step": 2735 }, { "epoch": 0.51, "grad_norm": 0.2080078125, "learning_rate": 0.0003717296052656835, "loss": 2.1274, "step": 2740 }, { "epoch": 0.51, "grad_norm": 0.189453125, "learning_rate": 0.0003715634070029196, "loss": 2.1501, "step": 2745 }, { "epoch": 0.51, "grad_norm": 0.1875, "learning_rate": 0.0003713967590205919, "loss": 2.1519, "step": 2750 }, { "epoch": 0.51, "grad_norm": 0.1884765625, "learning_rate": 0.00037122966175553524, "loss": 2.1676, "step": 2755 }, { "epoch": 0.51, "grad_norm": 0.2119140625, "learning_rate": 0.000371062115645762, "loss": 2.1744, "step": 2760 }, { "epoch": 0.51, "grad_norm": 0.1923828125, "learning_rate": 0.00037089412113046116, "loss": 2.1807, "step": 2765 }, { "epoch": 0.51, "grad_norm": 0.1904296875, "learning_rate": 0.00037072567864999723, "loss": 2.183, "step": 2770 }, { "epoch": 0.51, "grad_norm": 0.189453125, "learning_rate": 0.00037055678864590874, "loss": 2.1767, "step": 2775 }, { "epoch": 0.52, "grad_norm": 0.1865234375, "learning_rate": 0.00037038745156090766, "loss": 2.1606, "step": 2780 }, { "epoch": 0.52, "grad_norm": 0.1845703125, "learning_rate": 0.0003702176678388775, "loss": 2.149, "step": 2785 }, { "epoch": 0.52, "grad_norm": 0.185546875, "learning_rate": 0.0003700474379248728, "loss": 2.2374, "step": 2790 }, { "epoch": 0.52, "grad_norm": 0.189453125, "learning_rate": 0.0003698767622651178, "loss": 2.1584, "step": 2795 }, { "epoch": 0.52, "grad_norm": 0.1943359375, "learning_rate": 0.0003697056413070047, "loss": 2.1679, "step": 2800 }, { "epoch": 0.52, "grad_norm": 0.19140625, "learning_rate": 0.0003695340754990935, "loss": 2.1748, "step": 2805 }, { "epoch": 0.52, "grad_norm": 0.189453125, "learning_rate": 0.00036936206529110995, "loss": 2.2067, "step": 2810 }, { "epoch": 0.52, "grad_norm": 0.1875, "learning_rate": 0.0003691896111339449, "loss": 2.1876, "step": 2815 }, { "epoch": 0.52, "grad_norm": 0.1884765625, "learning_rate": 0.00036901671347965275, "loss": 2.1719, "step": 2820 }, { "epoch": 0.52, "grad_norm": 0.189453125, "learning_rate": 0.0003688433727814506, "loss": 2.1687, "step": 2825 }, { "epoch": 0.53, "grad_norm": 0.1904296875, "learning_rate": 0.00036866958949371677, "loss": 2.1702, "step": 2830 }, { "epoch": 0.53, "grad_norm": 0.19140625, "learning_rate": 0.0003684953640719899, "loss": 2.1444, "step": 2835 }, { "epoch": 0.53, "grad_norm": 0.203125, "learning_rate": 0.0003683206969729673, "loss": 2.2099, "step": 2840 }, { "epoch": 0.53, "grad_norm": 0.1884765625, "learning_rate": 0.0003681455886545045, "loss": 2.1858, "step": 2845 }, { "epoch": 0.53, "grad_norm": 0.1904296875, "learning_rate": 0.00036797003957561315, "loss": 2.1791, "step": 2850 }, { "epoch": 0.53, "grad_norm": 0.19140625, "learning_rate": 0.0003677940501964606, "loss": 2.1658, "step": 2855 }, { "epoch": 0.53, "grad_norm": 0.1953125, "learning_rate": 0.0003676176209783681, "loss": 2.1968, "step": 2860 }, { "epoch": 0.53, "grad_norm": 0.1884765625, "learning_rate": 0.00036744075238381017, "loss": 2.1729, "step": 2865 }, { "epoch": 0.53, "grad_norm": 0.1962890625, "learning_rate": 0.00036726344487641267, "loss": 2.1772, "step": 2870 }, { "epoch": 0.53, "grad_norm": 0.1923828125, "learning_rate": 0.00036708569892095227, "loss": 2.1572, "step": 2875 }, { "epoch": 0.53, "grad_norm": 0.193359375, "learning_rate": 0.00036690751498335487, "loss": 2.1488, "step": 2880 }, { "epoch": 0.54, "grad_norm": 0.1953125, "learning_rate": 0.0003667288935306944, "loss": 2.1995, "step": 2885 }, { "epoch": 0.54, "grad_norm": 0.1953125, "learning_rate": 0.0003665498350311918, "loss": 2.1854, "step": 2890 }, { "epoch": 0.54, "grad_norm": 0.2021484375, "learning_rate": 0.00036637033995421347, "loss": 2.1867, "step": 2895 }, { "epoch": 0.54, "grad_norm": 0.1923828125, "learning_rate": 0.0003661904087702702, "loss": 2.1902, "step": 2900 }, { "epoch": 0.54, "grad_norm": 0.189453125, "learning_rate": 0.0003660100419510161, "loss": 2.1654, "step": 2905 }, { "epoch": 0.54, "grad_norm": 0.1953125, "learning_rate": 0.00036582923996924724, "loss": 2.1987, "step": 2910 }, { "epoch": 0.54, "grad_norm": 0.193359375, "learning_rate": 0.0003656480032989001, "loss": 2.1442, "step": 2915 }, { "epoch": 0.54, "grad_norm": 0.1923828125, "learning_rate": 0.00036546633241505094, "loss": 2.1855, "step": 2920 }, { "epoch": 0.54, "grad_norm": 0.1904296875, "learning_rate": 0.000365284227793914, "loss": 2.2243, "step": 2925 }, { "epoch": 0.54, "grad_norm": 0.1904296875, "learning_rate": 0.0003651016899128406, "loss": 2.1589, "step": 2930 }, { "epoch": 0.54, "grad_norm": 0.1943359375, "learning_rate": 0.00036491871925031755, "loss": 2.1643, "step": 2935 }, { "epoch": 0.55, "grad_norm": 0.193359375, "learning_rate": 0.0003647353162859666, "loss": 2.1953, "step": 2940 }, { "epoch": 0.55, "grad_norm": 0.1904296875, "learning_rate": 0.0003645514815005421, "loss": 2.1838, "step": 2945 }, { "epoch": 0.55, "grad_norm": 0.1865234375, "learning_rate": 0.0003643672153759307, "loss": 2.2055, "step": 2950 }, { "epoch": 0.55, "grad_norm": 0.1923828125, "learning_rate": 0.00036418251839514956, "loss": 2.1623, "step": 2955 }, { "epoch": 0.55, "grad_norm": 0.18359375, "learning_rate": 0.00036399739104234544, "loss": 2.1533, "step": 2960 }, { "epoch": 0.55, "grad_norm": 0.2001953125, "learning_rate": 0.00036381183380279305, "loss": 2.1831, "step": 2965 }, { "epoch": 0.55, "grad_norm": 0.1962890625, "learning_rate": 0.00036362584716289405, "loss": 2.1655, "step": 2970 }, { "epoch": 0.55, "grad_norm": 0.1943359375, "learning_rate": 0.0003634394316101756, "loss": 2.1324, "step": 2975 }, { "epoch": 0.55, "grad_norm": 0.1982421875, "learning_rate": 0.0003632525876332892, "loss": 2.2055, "step": 2980 }, { "epoch": 0.55, "grad_norm": 0.1982421875, "learning_rate": 0.00036306531572200944, "loss": 2.1625, "step": 2985 }, { "epoch": 0.55, "grad_norm": 0.19921875, "learning_rate": 0.00036287761636723275, "loss": 2.1718, "step": 2990 }, { "epoch": 0.56, "grad_norm": 0.1923828125, "learning_rate": 0.00036268949006097566, "loss": 2.192, "step": 2995 }, { "epoch": 0.56, "grad_norm": 0.1904296875, "learning_rate": 0.00036250093729637433, "loss": 2.1969, "step": 3000 }, { "epoch": 0.56, "grad_norm": 0.1923828125, "learning_rate": 0.00036231195856768235, "loss": 2.1664, "step": 3005 }, { "epoch": 0.56, "grad_norm": 0.1923828125, "learning_rate": 0.0003621225543702703, "loss": 2.1199, "step": 3010 }, { "epoch": 0.56, "grad_norm": 0.193359375, "learning_rate": 0.00036193272520062376, "loss": 2.211, "step": 3015 }, { "epoch": 0.56, "grad_norm": 0.1904296875, "learning_rate": 0.00036174247155634233, "loss": 2.1411, "step": 3020 }, { "epoch": 0.56, "grad_norm": 0.1943359375, "learning_rate": 0.0003615517939361385, "loss": 2.1303, "step": 3025 }, { "epoch": 0.56, "grad_norm": 0.1923828125, "learning_rate": 0.00036136069283983577, "loss": 2.1731, "step": 3030 }, { "epoch": 0.56, "grad_norm": 0.193359375, "learning_rate": 0.00036116916876836804, "loss": 2.1537, "step": 3035 }, { "epoch": 0.56, "grad_norm": 0.193359375, "learning_rate": 0.00036097722222377775, "loss": 2.2165, "step": 3040 }, { "epoch": 0.56, "grad_norm": 0.1953125, "learning_rate": 0.00036078485370921476, "loss": 2.1514, "step": 3045 }, { "epoch": 0.57, "grad_norm": 0.193359375, "learning_rate": 0.00036059206372893523, "loss": 2.1642, "step": 3050 }, { "epoch": 0.57, "grad_norm": 0.1923828125, "learning_rate": 0.0003603988527883, "loss": 2.1888, "step": 3055 }, { "epoch": 0.57, "grad_norm": 0.2041015625, "learning_rate": 0.00036020522139377327, "loss": 2.1869, "step": 3060 }, { "epoch": 0.57, "grad_norm": 0.19140625, "learning_rate": 0.00036001117005292154, "loss": 2.1677, "step": 3065 }, { "epoch": 0.57, "grad_norm": 0.189453125, "learning_rate": 0.000359816699274412, "loss": 2.167, "step": 3070 }, { "epoch": 0.57, "grad_norm": 0.193359375, "learning_rate": 0.00035962180956801133, "loss": 2.1305, "step": 3075 }, { "epoch": 0.57, "grad_norm": 0.189453125, "learning_rate": 0.0003594265014445845, "loss": 2.1672, "step": 3080 }, { "epoch": 0.57, "grad_norm": 0.189453125, "learning_rate": 0.00035923077541609314, "loss": 2.1662, "step": 3085 }, { "epoch": 0.57, "grad_norm": 0.1962890625, "learning_rate": 0.0003590346319955942, "loss": 2.2116, "step": 3090 }, { "epoch": 0.57, "grad_norm": 0.1875, "learning_rate": 0.000358838071697239, "loss": 2.1816, "step": 3095 }, { "epoch": 0.58, "grad_norm": 0.1943359375, "learning_rate": 0.0003586410950362715, "loss": 2.182, "step": 3100 }, { "epoch": 0.58, "grad_norm": 0.1884765625, "learning_rate": 0.0003584437025290271, "loss": 2.1423, "step": 3105 }, { "epoch": 0.58, "grad_norm": 0.19140625, "learning_rate": 0.00035824589469293127, "loss": 2.1577, "step": 3110 }, { "epoch": 0.58, "grad_norm": 0.189453125, "learning_rate": 0.00035804767204649805, "loss": 2.2046, "step": 3115 }, { "epoch": 0.58, "grad_norm": 0.2001953125, "learning_rate": 0.00035784903510932905, "loss": 2.2101, "step": 3120 }, { "epoch": 0.58, "grad_norm": 0.1943359375, "learning_rate": 0.00035764998440211167, "loss": 2.1624, "step": 3125 }, { "epoch": 0.58, "grad_norm": 0.19140625, "learning_rate": 0.00035745052044661803, "loss": 2.213, "step": 3130 }, { "epoch": 0.58, "grad_norm": 0.1904296875, "learning_rate": 0.0003572506437657035, "loss": 2.197, "step": 3135 }, { "epoch": 0.58, "grad_norm": 0.1875, "learning_rate": 0.00035705035488330523, "loss": 2.1877, "step": 3140 }, { "epoch": 0.58, "grad_norm": 0.1923828125, "learning_rate": 0.00035684965432444094, "loss": 2.1717, "step": 3145 }, { "epoch": 0.58, "grad_norm": 0.193359375, "learning_rate": 0.00035664854261520753, "loss": 2.1887, "step": 3150 }, { "epoch": 0.59, "grad_norm": 0.19140625, "learning_rate": 0.00035644702028277955, "loss": 2.1668, "step": 3155 }, { "epoch": 0.59, "grad_norm": 0.20703125, "learning_rate": 0.00035624508785540805, "loss": 2.1618, "step": 3160 }, { "epoch": 0.59, "grad_norm": 0.1962890625, "learning_rate": 0.00035604274586241886, "loss": 2.2285, "step": 3165 }, { "epoch": 0.59, "grad_norm": 0.1982421875, "learning_rate": 0.0003558399948342116, "loss": 2.2208, "step": 3170 }, { "epoch": 0.59, "grad_norm": 0.193359375, "learning_rate": 0.00035563683530225797, "loss": 2.197, "step": 3175 }, { "epoch": 0.59, "grad_norm": 0.19140625, "learning_rate": 0.0003554332677991006, "loss": 2.2029, "step": 3180 }, { "epoch": 0.59, "grad_norm": 0.1962890625, "learning_rate": 0.0003552292928583514, "loss": 2.1402, "step": 3185 }, { "epoch": 0.59, "grad_norm": 0.2001953125, "learning_rate": 0.0003550249110146904, "loss": 2.2078, "step": 3190 }, { "epoch": 0.59, "grad_norm": 0.1943359375, "learning_rate": 0.0003548201228038642, "loss": 2.1611, "step": 3195 }, { "epoch": 0.59, "grad_norm": 0.1982421875, "learning_rate": 0.0003546149287626846, "loss": 2.1795, "step": 3200 }, { "epoch": 0.59, "grad_norm": 0.2041015625, "learning_rate": 0.00035440932942902727, "loss": 2.1683, "step": 3205 }, { "epoch": 0.6, "grad_norm": 0.201171875, "learning_rate": 0.00035420332534183023, "loss": 2.1783, "step": 3210 }, { "epoch": 0.6, "grad_norm": 0.1953125, "learning_rate": 0.0003539969170410924, "loss": 2.1756, "step": 3215 }, { "epoch": 0.6, "grad_norm": 0.1923828125, "learning_rate": 0.0003537901050678724, "loss": 2.1829, "step": 3220 }, { "epoch": 0.6, "grad_norm": 0.1953125, "learning_rate": 0.000353582889964287, "loss": 2.1947, "step": 3225 }, { "epoch": 0.6, "grad_norm": 0.1884765625, "learning_rate": 0.0003533752722735096, "loss": 2.1792, "step": 3230 }, { "epoch": 0.6, "grad_norm": 0.2001953125, "learning_rate": 0.00035316725253976887, "loss": 2.1858, "step": 3235 }, { "epoch": 0.6, "grad_norm": 0.1962890625, "learning_rate": 0.0003529588313083474, "loss": 2.1128, "step": 3240 }, { "epoch": 0.6, "grad_norm": 0.1884765625, "learning_rate": 0.0003527500091255805, "loss": 2.2005, "step": 3245 }, { "epoch": 0.6, "grad_norm": 0.185546875, "learning_rate": 0.000352540786538854, "loss": 2.1451, "step": 3250 }, { "epoch": 0.6, "grad_norm": 0.1982421875, "learning_rate": 0.0003523311640966037, "loss": 2.1474, "step": 3255 }, { "epoch": 0.6, "grad_norm": 0.1943359375, "learning_rate": 0.0003521211423483133, "loss": 2.1799, "step": 3260 }, { "epoch": 0.61, "grad_norm": 0.1923828125, "learning_rate": 0.0003519107218445134, "loss": 2.1957, "step": 3265 }, { "epoch": 0.61, "grad_norm": 0.19140625, "learning_rate": 0.00035169990313677974, "loss": 2.178, "step": 3270 }, { "epoch": 0.61, "grad_norm": 0.193359375, "learning_rate": 0.00035148868677773186, "loss": 2.2025, "step": 3275 }, { "epoch": 0.61, "grad_norm": 0.197265625, "learning_rate": 0.00035127707332103175, "loss": 2.1991, "step": 3280 }, { "epoch": 0.61, "grad_norm": 0.197265625, "learning_rate": 0.00035106506332138217, "loss": 2.1969, "step": 3285 }, { "epoch": 0.61, "grad_norm": 0.1884765625, "learning_rate": 0.00035085265733452554, "loss": 2.1569, "step": 3290 }, { "epoch": 0.61, "grad_norm": 0.1982421875, "learning_rate": 0.0003506398559172421, "loss": 2.2137, "step": 3295 }, { "epoch": 0.61, "grad_norm": 0.201171875, "learning_rate": 0.0003504266596273488, "loss": 2.1768, "step": 3300 }, { "epoch": 0.61, "grad_norm": 0.2021484375, "learning_rate": 0.00035021306902369745, "loss": 2.2186, "step": 3305 }, { "epoch": 0.61, "grad_norm": 0.1953125, "learning_rate": 0.0003499990846661737, "loss": 2.1161, "step": 3310 }, { "epoch": 0.62, "grad_norm": 0.1982421875, "learning_rate": 0.0003497847071156952, "loss": 2.1391, "step": 3315 }, { "epoch": 0.62, "grad_norm": 0.1884765625, "learning_rate": 0.0003495699369342104, "loss": 2.1398, "step": 3320 }, { "epoch": 0.62, "grad_norm": 0.189453125, "learning_rate": 0.0003493547746846968, "loss": 2.1858, "step": 3325 }, { "epoch": 0.62, "grad_norm": 0.189453125, "learning_rate": 0.0003491392209311597, "loss": 2.159, "step": 3330 }, { "epoch": 0.62, "grad_norm": 0.1943359375, "learning_rate": 0.00034892327623863077, "loss": 2.1631, "step": 3335 }, { "epoch": 0.62, "grad_norm": 0.2021484375, "learning_rate": 0.0003487069411731663, "loss": 2.1614, "step": 3340 }, { "epoch": 0.62, "grad_norm": 0.1943359375, "learning_rate": 0.00034849021630184587, "loss": 2.1665, "step": 3345 }, { "epoch": 0.62, "grad_norm": 0.1962890625, "learning_rate": 0.0003482731021927709, "loss": 2.1872, "step": 3350 }, { "epoch": 0.62, "grad_norm": 0.193359375, "learning_rate": 0.0003480555994150631, "loss": 2.1781, "step": 3355 }, { "epoch": 0.62, "grad_norm": 0.1943359375, "learning_rate": 0.0003478377085388631, "loss": 2.1907, "step": 3360 }, { "epoch": 0.62, "grad_norm": 0.1865234375, "learning_rate": 0.00034761943013532874, "loss": 2.1636, "step": 3365 }, { "epoch": 0.63, "grad_norm": 0.1904296875, "learning_rate": 0.0003474007647766336, "loss": 2.1246, "step": 3370 }, { "epoch": 0.63, "grad_norm": 0.1884765625, "learning_rate": 0.0003471817130359659, "loss": 2.1438, "step": 3375 }, { "epoch": 0.63, "grad_norm": 0.1943359375, "learning_rate": 0.0003469622754875263, "loss": 2.1563, "step": 3380 }, { "epoch": 0.63, "grad_norm": 0.1962890625, "learning_rate": 0.0003467424527065271, "loss": 2.1809, "step": 3385 }, { "epoch": 0.63, "grad_norm": 0.1943359375, "learning_rate": 0.00034652224526919014, "loss": 2.1731, "step": 3390 }, { "epoch": 0.63, "grad_norm": 0.1982421875, "learning_rate": 0.0003463016537527458, "loss": 2.1673, "step": 3395 }, { "epoch": 0.63, "grad_norm": 0.1962890625, "learning_rate": 0.00034608067873543116, "loss": 2.1539, "step": 3400 }, { "epoch": 0.63, "grad_norm": 0.19140625, "learning_rate": 0.0003458593207964885, "loss": 2.1642, "step": 3405 }, { "epoch": 0.63, "grad_norm": 0.197265625, "learning_rate": 0.0003456375805161638, "loss": 2.1938, "step": 3410 }, { "epoch": 0.63, "grad_norm": 0.1904296875, "learning_rate": 0.0003454154584757056, "loss": 2.1743, "step": 3415 }, { "epoch": 0.63, "grad_norm": 0.1875, "learning_rate": 0.0003451929552573629, "loss": 2.1621, "step": 3420 }, { "epoch": 0.64, "grad_norm": 0.1962890625, "learning_rate": 0.00034497007144438367, "loss": 2.1694, "step": 3425 }, { "epoch": 0.64, "grad_norm": 0.2041015625, "learning_rate": 0.00034474680762101406, "loss": 2.1555, "step": 3430 }, { "epoch": 0.64, "grad_norm": 0.1923828125, "learning_rate": 0.0003445231643724959, "loss": 2.1856, "step": 3435 }, { "epoch": 0.64, "grad_norm": 0.1962890625, "learning_rate": 0.0003442991422850658, "loss": 2.1824, "step": 3440 }, { "epoch": 0.64, "grad_norm": 0.19921875, "learning_rate": 0.0003440747419459534, "loss": 2.1838, "step": 3445 }, { "epoch": 0.64, "grad_norm": 0.19140625, "learning_rate": 0.0003438499639433798, "loss": 2.1467, "step": 3450 }, { "epoch": 0.64, "grad_norm": 0.19140625, "learning_rate": 0.00034362480886655615, "loss": 2.1592, "step": 3455 }, { "epoch": 0.64, "grad_norm": 0.1943359375, "learning_rate": 0.000343399277305682, "loss": 2.1921, "step": 3460 }, { "epoch": 0.64, "grad_norm": 0.19140625, "learning_rate": 0.0003431733698519437, "loss": 2.1951, "step": 3465 }, { "epoch": 0.64, "grad_norm": 0.189453125, "learning_rate": 0.0003429470870975131, "loss": 2.225, "step": 3470 }, { "epoch": 0.64, "grad_norm": 0.1943359375, "learning_rate": 0.00034272042963554554, "loss": 2.185, "step": 3475 }, { "epoch": 0.65, "grad_norm": 0.1962890625, "learning_rate": 0.0003424933980601789, "loss": 2.1617, "step": 3480 }, { "epoch": 0.65, "grad_norm": 0.1962890625, "learning_rate": 0.0003422659929665316, "loss": 2.1954, "step": 3485 }, { "epoch": 0.65, "grad_norm": 0.1923828125, "learning_rate": 0.00034203821495070103, "loss": 2.1607, "step": 3490 }, { "epoch": 0.65, "grad_norm": 0.1962890625, "learning_rate": 0.0003418100646097624, "loss": 2.1595, "step": 3495 }, { "epoch": 0.65, "grad_norm": 0.189453125, "learning_rate": 0.00034158154254176654, "loss": 2.1473, "step": 3500 }, { "epoch": 0.65, "grad_norm": 0.1962890625, "learning_rate": 0.0003413526493457391, "loss": 2.2123, "step": 3505 }, { "epoch": 0.65, "grad_norm": 0.189453125, "learning_rate": 0.0003411233856216781, "loss": 2.1788, "step": 3510 }, { "epoch": 0.65, "grad_norm": 0.1943359375, "learning_rate": 0.00034089375197055336, "loss": 2.1447, "step": 3515 }, { "epoch": 0.65, "grad_norm": 0.197265625, "learning_rate": 0.0003406637489943039, "loss": 2.1855, "step": 3520 }, { "epoch": 0.65, "grad_norm": 0.197265625, "learning_rate": 0.0003404333772958372, "loss": 2.1916, "step": 3525 }, { "epoch": 0.65, "grad_norm": 0.19140625, "learning_rate": 0.00034020263747902715, "loss": 2.1985, "step": 3530 }, { "epoch": 0.66, "grad_norm": 0.1943359375, "learning_rate": 0.00033997153014871237, "loss": 2.2015, "step": 3535 }, { "epoch": 0.66, "grad_norm": 0.197265625, "learning_rate": 0.0003397400559106953, "loss": 2.1476, "step": 3540 }, { "epoch": 0.66, "grad_norm": 0.1982421875, "learning_rate": 0.0003395082153717397, "loss": 2.159, "step": 3545 }, { "epoch": 0.66, "grad_norm": 0.197265625, "learning_rate": 0.00033927600913956986, "loss": 2.19, "step": 3550 }, { "epoch": 0.66, "grad_norm": 0.1953125, "learning_rate": 0.0003390434378228685, "loss": 2.1506, "step": 3555 }, { "epoch": 0.66, "grad_norm": 0.1923828125, "learning_rate": 0.00033881050203127527, "loss": 2.1569, "step": 3560 }, { "epoch": 0.66, "grad_norm": 0.203125, "learning_rate": 0.0003385772023753855, "loss": 2.1574, "step": 3565 }, { "epoch": 0.66, "grad_norm": 0.1953125, "learning_rate": 0.000338343539466748, "loss": 2.1559, "step": 3570 }, { "epoch": 0.66, "grad_norm": 0.19921875, "learning_rate": 0.00033810951391786384, "loss": 2.2004, "step": 3575 }, { "epoch": 0.66, "grad_norm": 0.1904296875, "learning_rate": 0.00033787512634218483, "loss": 2.1497, "step": 3580 }, { "epoch": 0.67, "grad_norm": 0.1962890625, "learning_rate": 0.0003376403773541117, "loss": 2.1442, "step": 3585 }, { "epoch": 0.67, "grad_norm": 0.19140625, "learning_rate": 0.0003374052675689925, "loss": 2.1432, "step": 3590 }, { "epoch": 0.67, "grad_norm": 0.1962890625, "learning_rate": 0.000337169797603121, "loss": 2.1708, "step": 3595 }, { "epoch": 0.67, "grad_norm": 0.19140625, "learning_rate": 0.00033693396807373536, "loss": 2.1471, "step": 3600 }, { "epoch": 0.67, "grad_norm": 0.193359375, "learning_rate": 0.00033669777959901583, "loss": 2.1216, "step": 3605 }, { "epoch": 0.67, "grad_norm": 0.1943359375, "learning_rate": 0.000336461232798084, "loss": 2.1942, "step": 3610 }, { "epoch": 0.67, "grad_norm": 0.19140625, "learning_rate": 0.0003362243282910005, "loss": 2.198, "step": 3615 }, { "epoch": 0.67, "grad_norm": 0.205078125, "learning_rate": 0.0003359870666987637, "loss": 2.1837, "step": 3620 }, { "epoch": 0.67, "grad_norm": 0.197265625, "learning_rate": 0.0003357494486433078, "loss": 2.1701, "step": 3625 }, { "epoch": 0.67, "grad_norm": 0.1962890625, "learning_rate": 0.0003355114747475019, "loss": 2.1814, "step": 3630 }, { "epoch": 0.67, "grad_norm": 0.1884765625, "learning_rate": 0.00033527314563514725, "loss": 2.1656, "step": 3635 }, { "epoch": 0.68, "grad_norm": 0.1943359375, "learning_rate": 0.0003350344619309767, "loss": 2.196, "step": 3640 }, { "epoch": 0.68, "grad_norm": 0.1923828125, "learning_rate": 0.0003347954242606522, "loss": 2.1617, "step": 3645 }, { "epoch": 0.68, "grad_norm": 0.1982421875, "learning_rate": 0.000334556033250764, "loss": 2.1825, "step": 3650 }, { "epoch": 0.68, "grad_norm": 0.1923828125, "learning_rate": 0.00033431628952882813, "loss": 2.132, "step": 3655 }, { "epoch": 0.68, "grad_norm": 0.193359375, "learning_rate": 0.00033407619372328545, "loss": 2.1444, "step": 3660 }, { "epoch": 0.68, "grad_norm": 0.197265625, "learning_rate": 0.00033383574646349973, "loss": 2.1785, "step": 3665 }, { "epoch": 0.68, "grad_norm": 0.197265625, "learning_rate": 0.0003335949483797558, "loss": 2.117, "step": 3670 }, { "epoch": 0.68, "grad_norm": 0.1923828125, "learning_rate": 0.0003333538001032583, "loss": 2.1793, "step": 3675 }, { "epoch": 0.68, "grad_norm": 0.201171875, "learning_rate": 0.00033311230226612987, "loss": 2.1872, "step": 3680 }, { "epoch": 0.68, "grad_norm": 0.1923828125, "learning_rate": 0.00033287045550140924, "loss": 2.1664, "step": 3685 }, { "epoch": 0.68, "grad_norm": 0.19140625, "learning_rate": 0.00033262826044304995, "loss": 2.1627, "step": 3690 }, { "epoch": 0.69, "grad_norm": 0.201171875, "learning_rate": 0.00033238571772591845, "loss": 2.1571, "step": 3695 }, { "epoch": 0.69, "grad_norm": 0.19140625, "learning_rate": 0.00033214282798579256, "loss": 2.1433, "step": 3700 }, { "epoch": 0.69, "grad_norm": 0.189453125, "learning_rate": 0.00033189959185935973, "loss": 2.127, "step": 3705 }, { "epoch": 0.69, "grad_norm": 0.2001953125, "learning_rate": 0.00033165600998421544, "loss": 2.1733, "step": 3710 }, { "epoch": 0.69, "grad_norm": 0.1962890625, "learning_rate": 0.0003314120829988613, "loss": 2.1758, "step": 3715 }, { "epoch": 0.69, "grad_norm": 0.19921875, "learning_rate": 0.0003311678115427039, "loss": 2.17, "step": 3720 }, { "epoch": 0.69, "grad_norm": 0.2041015625, "learning_rate": 0.0003309231962560524, "loss": 2.167, "step": 3725 }, { "epoch": 0.69, "grad_norm": 0.197265625, "learning_rate": 0.0003306782377801175, "loss": 2.172, "step": 3730 }, { "epoch": 0.69, "grad_norm": 0.1943359375, "learning_rate": 0.0003304329367570094, "loss": 2.1651, "step": 3735 }, { "epoch": 0.69, "grad_norm": 0.1982421875, "learning_rate": 0.0003301872938297362, "loss": 2.153, "step": 3740 }, { "epoch": 0.69, "grad_norm": 0.203125, "learning_rate": 0.0003299413096422025, "loss": 2.1491, "step": 3745 }, { "epoch": 0.7, "grad_norm": 0.2099609375, "learning_rate": 0.00032969498483920704, "loss": 2.1947, "step": 3750 }, { "epoch": 0.7, "grad_norm": 0.1923828125, "learning_rate": 0.00032944832006644165, "loss": 2.1276, "step": 3755 }, { "epoch": 0.7, "grad_norm": 0.2001953125, "learning_rate": 0.0003292013159704893, "loss": 2.1548, "step": 3760 }, { "epoch": 0.7, "grad_norm": 0.1953125, "learning_rate": 0.00032895397319882236, "loss": 2.1466, "step": 3765 }, { "epoch": 0.7, "grad_norm": 0.1904296875, "learning_rate": 0.0003287062923998011, "loss": 2.1091, "step": 3770 }, { "epoch": 0.7, "grad_norm": 0.1904296875, "learning_rate": 0.0003284582742226717, "loss": 2.1843, "step": 3775 }, { "epoch": 0.7, "grad_norm": 0.201171875, "learning_rate": 0.00032820991931756493, "loss": 2.2426, "step": 3780 }, { "epoch": 0.7, "grad_norm": 0.1904296875, "learning_rate": 0.00032796122833549394, "loss": 2.1489, "step": 3785 }, { "epoch": 0.7, "grad_norm": 0.189453125, "learning_rate": 0.000327712201928353, "loss": 2.1309, "step": 3790 }, { "epoch": 0.7, "grad_norm": 0.1884765625, "learning_rate": 0.00032746284074891573, "loss": 2.1768, "step": 3795 }, { "epoch": 0.71, "grad_norm": 0.189453125, "learning_rate": 0.0003272131454508331, "loss": 2.1846, "step": 3800 }, { "epoch": 0.71, "grad_norm": 0.2001953125, "learning_rate": 0.000326963116688632, "loss": 2.2, "step": 3805 }, { "epoch": 0.71, "grad_norm": 0.1904296875, "learning_rate": 0.00032671275511771337, "loss": 2.1608, "step": 3810 }, { "epoch": 0.71, "grad_norm": 0.193359375, "learning_rate": 0.00032646206139435067, "loss": 2.1848, "step": 3815 }, { "epoch": 0.71, "grad_norm": 0.1923828125, "learning_rate": 0.00032621103617568785, "loss": 2.1613, "step": 3820 }, { "epoch": 0.71, "grad_norm": 0.19140625, "learning_rate": 0.00032595968011973805, "loss": 2.1394, "step": 3825 }, { "epoch": 0.71, "grad_norm": 0.1943359375, "learning_rate": 0.00032570799388538133, "loss": 2.1516, "step": 3830 }, { "epoch": 0.71, "grad_norm": 0.2041015625, "learning_rate": 0.00032545597813236356, "loss": 2.214, "step": 3835 }, { "epoch": 0.71, "grad_norm": 0.1884765625, "learning_rate": 0.0003252036335212941, "loss": 2.1682, "step": 3840 }, { "epoch": 0.71, "grad_norm": 0.197265625, "learning_rate": 0.0003249509607136446, "loss": 2.1702, "step": 3845 }, { "epoch": 0.71, "grad_norm": 0.1943359375, "learning_rate": 0.00032469796037174674, "loss": 2.2195, "step": 3850 }, { "epoch": 0.72, "grad_norm": 0.1904296875, "learning_rate": 0.00032444463315879103, "loss": 2.0988, "step": 3855 }, { "epoch": 0.72, "grad_norm": 0.19921875, "learning_rate": 0.0003241909797388246, "loss": 2.1504, "step": 3860 }, { "epoch": 0.72, "grad_norm": 0.1982421875, "learning_rate": 0.00032393700077674987, "loss": 2.1767, "step": 3865 }, { "epoch": 0.72, "grad_norm": 0.201171875, "learning_rate": 0.0003236826969383224, "loss": 2.1628, "step": 3870 }, { "epoch": 0.72, "grad_norm": 0.201171875, "learning_rate": 0.0003234280688901495, "loss": 2.2204, "step": 3875 }, { "epoch": 0.72, "grad_norm": 0.197265625, "learning_rate": 0.00032317311729968825, "loss": 2.1606, "step": 3880 }, { "epoch": 0.72, "grad_norm": 0.2001953125, "learning_rate": 0.0003229178428352438, "loss": 2.1721, "step": 3885 }, { "epoch": 0.72, "grad_norm": 0.1923828125, "learning_rate": 0.00032266224616596785, "loss": 2.1741, "step": 3890 }, { "epoch": 0.72, "grad_norm": 0.2080078125, "learning_rate": 0.0003224063279618564, "loss": 2.1784, "step": 3895 }, { "epoch": 0.72, "grad_norm": 0.1953125, "learning_rate": 0.00032215008889374853, "loss": 2.1944, "step": 3900 }, { "epoch": 0.72, "grad_norm": 0.1962890625, "learning_rate": 0.00032189352963332425, "loss": 2.1608, "step": 3905 }, { "epoch": 0.73, "grad_norm": 0.1953125, "learning_rate": 0.0003216366508531031, "loss": 2.1662, "step": 3910 }, { "epoch": 0.73, "grad_norm": 0.2060546875, "learning_rate": 0.00032137945322644184, "loss": 2.1546, "step": 3915 }, { "epoch": 0.73, "grad_norm": 0.1962890625, "learning_rate": 0.00032112193742753333, "loss": 2.1698, "step": 3920 }, { "epoch": 0.73, "grad_norm": 0.203125, "learning_rate": 0.0003208641041314043, "loss": 2.2374, "step": 3925 }, { "epoch": 0.73, "grad_norm": 0.1953125, "learning_rate": 0.0003206059540139139, "loss": 2.1985, "step": 3930 }, { "epoch": 0.73, "grad_norm": 0.2041015625, "learning_rate": 0.0003203474877517514, "loss": 2.1759, "step": 3935 }, { "epoch": 0.73, "grad_norm": 0.1943359375, "learning_rate": 0.00032008870602243523, "loss": 2.1489, "step": 3940 }, { "epoch": 0.73, "grad_norm": 0.2021484375, "learning_rate": 0.0003198296095043104, "loss": 2.1875, "step": 3945 }, { "epoch": 0.73, "grad_norm": 0.19921875, "learning_rate": 0.00031957019887654737, "loss": 2.1403, "step": 3950 }, { "epoch": 0.73, "grad_norm": 0.2001953125, "learning_rate": 0.0003193104748191398, "loss": 2.1555, "step": 3955 }, { "epoch": 0.73, "grad_norm": 0.201171875, "learning_rate": 0.0003190504380129029, "loss": 2.178, "step": 3960 }, { "epoch": 0.74, "grad_norm": 0.19921875, "learning_rate": 0.0003187900891394718, "loss": 2.1751, "step": 3965 }, { "epoch": 0.74, "grad_norm": 0.1953125, "learning_rate": 0.0003185294288812996, "loss": 2.1864, "step": 3970 }, { "epoch": 0.74, "grad_norm": 0.1943359375, "learning_rate": 0.0003182684579216557, "loss": 2.1756, "step": 3975 }, { "epoch": 0.74, "grad_norm": 0.201171875, "learning_rate": 0.0003180071769446238, "loss": 2.1778, "step": 3980 }, { "epoch": 0.74, "grad_norm": 0.208984375, "learning_rate": 0.00031774558663510044, "loss": 2.1806, "step": 3985 }, { "epoch": 0.74, "grad_norm": 0.197265625, "learning_rate": 0.00031748368767879284, "loss": 2.1518, "step": 3990 }, { "epoch": 0.74, "grad_norm": 0.1962890625, "learning_rate": 0.0003172214807622175, "loss": 2.1643, "step": 3995 }, { "epoch": 0.74, "grad_norm": 0.197265625, "learning_rate": 0.00031695896657269785, "loss": 2.1502, "step": 4000 }, { "epoch": 0.74, "grad_norm": 0.1982421875, "learning_rate": 0.00031669614579836307, "loss": 2.1605, "step": 4005 }, { "epoch": 0.74, "grad_norm": 0.197265625, "learning_rate": 0.00031643301912814576, "loss": 2.1922, "step": 4010 }, { "epoch": 0.74, "grad_norm": 0.193359375, "learning_rate": 0.00031616958725178056, "loss": 2.146, "step": 4015 }, { "epoch": 0.75, "grad_norm": 0.19140625, "learning_rate": 0.0003159058508598021, "loss": 2.1565, "step": 4020 }, { "epoch": 0.75, "grad_norm": 0.193359375, "learning_rate": 0.00031564181064354315, "loss": 2.141, "step": 4025 }, { "epoch": 0.75, "grad_norm": 0.2041015625, "learning_rate": 0.000315377467295133, "loss": 2.1504, "step": 4030 }, { "epoch": 0.75, "grad_norm": 0.19921875, "learning_rate": 0.0003151128215074954, "loss": 2.1534, "step": 4035 }, { "epoch": 0.75, "grad_norm": 0.19921875, "learning_rate": 0.0003148478739743472, "loss": 2.1279, "step": 4040 }, { "epoch": 0.75, "grad_norm": 0.208984375, "learning_rate": 0.0003145826253901957, "loss": 2.1695, "step": 4045 }, { "epoch": 0.75, "grad_norm": 0.1953125, "learning_rate": 0.00031431707645033775, "loss": 2.2067, "step": 4050 }, { "epoch": 0.75, "grad_norm": 0.201171875, "learning_rate": 0.00031405122785085757, "loss": 2.1689, "step": 4055 }, { "epoch": 0.75, "grad_norm": 0.1953125, "learning_rate": 0.0003137850802886245, "loss": 2.1577, "step": 4060 }, { "epoch": 0.75, "grad_norm": 0.1904296875, "learning_rate": 0.00031351863446129187, "loss": 2.1237, "step": 4065 }, { "epoch": 0.76, "grad_norm": 0.1953125, "learning_rate": 0.0003132518910672948, "loss": 2.247, "step": 4070 }, { "epoch": 0.76, "grad_norm": 0.201171875, "learning_rate": 0.00031298485080584823, "loss": 2.1987, "step": 4075 }, { "epoch": 0.76, "grad_norm": 0.197265625, "learning_rate": 0.0003127175143769456, "loss": 2.1932, "step": 4080 }, { "epoch": 0.76, "grad_norm": 0.1904296875, "learning_rate": 0.00031244988248135645, "loss": 2.1826, "step": 4085 }, { "epoch": 0.76, "grad_norm": 0.1923828125, "learning_rate": 0.0003121819558206249, "loss": 2.1388, "step": 4090 }, { "epoch": 0.76, "grad_norm": 0.2001953125, "learning_rate": 0.00031191373509706767, "loss": 2.1869, "step": 4095 }, { "epoch": 0.76, "grad_norm": 0.193359375, "learning_rate": 0.00031164522101377254, "loss": 2.1403, "step": 4100 }, { "epoch": 0.76, "grad_norm": 0.203125, "learning_rate": 0.000311376414274596, "loss": 2.176, "step": 4105 }, { "epoch": 0.76, "grad_norm": 0.1943359375, "learning_rate": 0.00031110731558416175, "loss": 2.184, "step": 4110 }, { "epoch": 0.76, "grad_norm": 0.19921875, "learning_rate": 0.0003108379256478589, "loss": 2.1698, "step": 4115 }, { "epoch": 0.76, "grad_norm": 0.20703125, "learning_rate": 0.00031056824517183986, "loss": 2.1796, "step": 4120 }, { "epoch": 0.77, "grad_norm": 0.2001953125, "learning_rate": 0.0003102982748630188, "loss": 2.196, "step": 4125 }, { "epoch": 0.77, "grad_norm": 0.2041015625, "learning_rate": 0.00031002801542906943, "loss": 2.165, "step": 4130 }, { "epoch": 0.77, "grad_norm": 0.2080078125, "learning_rate": 0.00030975746757842354, "loss": 2.1812, "step": 4135 }, { "epoch": 0.77, "grad_norm": 0.1943359375, "learning_rate": 0.00030948663202026873, "loss": 2.2209, "step": 4140 }, { "epoch": 0.77, "grad_norm": 0.1982421875, "learning_rate": 0.00030921550946454694, "loss": 2.1573, "step": 4145 }, { "epoch": 0.77, "grad_norm": 0.19140625, "learning_rate": 0.0003089441006219524, "loss": 2.1966, "step": 4150 }, { "epoch": 0.77, "grad_norm": 0.1904296875, "learning_rate": 0.0003086724062039297, "loss": 2.1905, "step": 4155 }, { "epoch": 0.77, "grad_norm": 0.2001953125, "learning_rate": 0.00030840042692267214, "loss": 2.1838, "step": 4160 }, { "epoch": 0.77, "grad_norm": 0.2177734375, "learning_rate": 0.00030812816349111953, "loss": 2.1487, "step": 4165 }, { "epoch": 0.77, "grad_norm": 0.1943359375, "learning_rate": 0.00030785561662295674, "loss": 2.1751, "step": 4170 }, { "epoch": 0.77, "grad_norm": 0.1962890625, "learning_rate": 0.0003075827870326115, "loss": 2.1752, "step": 4175 }, { "epoch": 0.78, "grad_norm": 0.1923828125, "learning_rate": 0.0003073096754352526, "loss": 2.1844, "step": 4180 }, { "epoch": 0.78, "grad_norm": 0.1943359375, "learning_rate": 0.0003070362825467882, "loss": 2.1858, "step": 4185 }, { "epoch": 0.78, "grad_norm": 0.1904296875, "learning_rate": 0.00030676260908386367, "loss": 2.1701, "step": 4190 }, { "epoch": 0.78, "grad_norm": 0.19921875, "learning_rate": 0.00030648865576385993, "loss": 2.1694, "step": 4195 }, { "epoch": 0.78, "grad_norm": 0.1943359375, "learning_rate": 0.00030621442330489143, "loss": 2.1362, "step": 4200 }, { "epoch": 0.78, "grad_norm": 0.1943359375, "learning_rate": 0.0003059399124258044, "loss": 2.1407, "step": 4205 }, { "epoch": 0.78, "grad_norm": 0.1962890625, "learning_rate": 0.00030566512384617485, "loss": 2.1593, "step": 4210 }, { "epoch": 0.78, "grad_norm": 0.1982421875, "learning_rate": 0.0003053900582863068, "loss": 2.1579, "step": 4215 }, { "epoch": 0.78, "grad_norm": 0.1953125, "learning_rate": 0.0003051147164672301, "loss": 2.2069, "step": 4220 }, { "epoch": 0.78, "grad_norm": 0.19921875, "learning_rate": 0.000304839099110699, "loss": 2.1785, "step": 4225 }, { "epoch": 0.78, "grad_norm": 0.1943359375, "learning_rate": 0.0003045632069391899, "loss": 2.1446, "step": 4230 }, { "epoch": 0.79, "grad_norm": 0.193359375, "learning_rate": 0.0003042870406758996, "loss": 2.1558, "step": 4235 }, { "epoch": 0.79, "grad_norm": 0.2109375, "learning_rate": 0.0003040106010447435, "loss": 2.1839, "step": 4240 }, { "epoch": 0.79, "grad_norm": 0.1943359375, "learning_rate": 0.0003037338887703532, "loss": 2.1597, "step": 4245 }, { "epoch": 0.79, "grad_norm": 0.1982421875, "learning_rate": 0.0003034569045780755, "loss": 2.146, "step": 4250 }, { "epoch": 0.79, "grad_norm": 0.1904296875, "learning_rate": 0.0003031796491939695, "loss": 2.1618, "step": 4255 }, { "epoch": 0.79, "grad_norm": 0.19140625, "learning_rate": 0.0003029021233448056, "loss": 2.1484, "step": 4260 }, { "epoch": 0.79, "grad_norm": 0.1953125, "learning_rate": 0.0003026243277580628, "loss": 2.1625, "step": 4265 }, { "epoch": 0.79, "grad_norm": 0.1943359375, "learning_rate": 0.0003023462631619274, "loss": 2.1474, "step": 4270 }, { "epoch": 0.79, "grad_norm": 0.193359375, "learning_rate": 0.0003020679302852908, "loss": 2.1733, "step": 4275 }, { "epoch": 0.79, "grad_norm": 0.203125, "learning_rate": 0.00030178932985774757, "loss": 2.1369, "step": 4280 }, { "epoch": 0.79, "grad_norm": 0.2021484375, "learning_rate": 0.00030151046260959376, "loss": 2.1609, "step": 4285 }, { "epoch": 0.8, "grad_norm": 0.197265625, "learning_rate": 0.0003012313292718246, "loss": 2.1482, "step": 4290 }, { "epoch": 0.8, "grad_norm": 0.1982421875, "learning_rate": 0.00030095193057613306, "loss": 2.1889, "step": 4295 }, { "epoch": 0.8, "grad_norm": 0.1953125, "learning_rate": 0.0003006722672549076, "loss": 2.1691, "step": 4300 }, { "epoch": 0.8, "grad_norm": 0.197265625, "learning_rate": 0.0003003923400412304, "loss": 2.1902, "step": 4305 }, { "epoch": 0.8, "grad_norm": 0.1982421875, "learning_rate": 0.0003001121496688752, "loss": 2.1412, "step": 4310 }, { "epoch": 0.8, "grad_norm": 0.1982421875, "learning_rate": 0.00029983169687230576, "loss": 2.2059, "step": 4315 }, { "epoch": 0.8, "grad_norm": 0.19921875, "learning_rate": 0.0002995509823866736, "loss": 2.1333, "step": 4320 }, { "epoch": 0.8, "grad_norm": 0.1982421875, "learning_rate": 0.00029927000694781637, "loss": 2.159, "step": 4325 }, { "epoch": 0.8, "grad_norm": 0.203125, "learning_rate": 0.0002989887712922555, "loss": 2.1684, "step": 4330 }, { "epoch": 0.8, "grad_norm": 0.193359375, "learning_rate": 0.0002987072761571948, "loss": 2.1985, "step": 4335 }, { "epoch": 0.81, "grad_norm": 0.201171875, "learning_rate": 0.0002984255222805182, "loss": 2.1664, "step": 4340 }, { "epoch": 0.81, "grad_norm": 0.193359375, "learning_rate": 0.0002981435104007876, "loss": 2.1755, "step": 4345 }, { "epoch": 0.81, "grad_norm": 0.1943359375, "learning_rate": 0.0002978612412572416, "loss": 2.1208, "step": 4350 }, { "epoch": 0.81, "grad_norm": 0.2021484375, "learning_rate": 0.0002975787155897927, "loss": 2.1604, "step": 4355 }, { "epoch": 0.81, "grad_norm": 0.19921875, "learning_rate": 0.00029729593413902643, "loss": 2.1972, "step": 4360 }, { "epoch": 0.81, "grad_norm": 0.1962890625, "learning_rate": 0.00029701289764619824, "loss": 2.179, "step": 4365 }, { "epoch": 0.81, "grad_norm": 0.1962890625, "learning_rate": 0.0002967296068532325, "loss": 2.1741, "step": 4370 }, { "epoch": 0.81, "grad_norm": 0.193359375, "learning_rate": 0.0002964460625027199, "loss": 2.1915, "step": 4375 }, { "epoch": 0.81, "grad_norm": 0.2001953125, "learning_rate": 0.00029616226533791607, "loss": 2.1776, "step": 4380 }, { "epoch": 0.81, "grad_norm": 0.19921875, "learning_rate": 0.000295878216102739, "loss": 2.1422, "step": 4385 }, { "epoch": 0.81, "grad_norm": 0.1953125, "learning_rate": 0.0002955939155417678, "loss": 2.1374, "step": 4390 }, { "epoch": 0.82, "grad_norm": 0.1923828125, "learning_rate": 0.0002953093644002402, "loss": 2.1876, "step": 4395 }, { "epoch": 0.82, "grad_norm": 0.1953125, "learning_rate": 0.0002950245634240506, "loss": 2.1888, "step": 4400 }, { "epoch": 0.82, "grad_norm": 0.193359375, "learning_rate": 0.00029473951335974856, "loss": 2.1611, "step": 4405 }, { "epoch": 0.82, "grad_norm": 0.197265625, "learning_rate": 0.0002944542149545366, "loss": 2.1848, "step": 4410 }, { "epoch": 0.82, "grad_norm": 0.1953125, "learning_rate": 0.0002941686689562679, "loss": 2.1544, "step": 4415 }, { "epoch": 0.82, "grad_norm": 0.1943359375, "learning_rate": 0.00029388287611344506, "loss": 2.1715, "step": 4420 }, { "epoch": 0.82, "grad_norm": 0.19921875, "learning_rate": 0.0002935968371752174, "loss": 2.1448, "step": 4425 }, { "epoch": 0.82, "grad_norm": 0.1962890625, "learning_rate": 0.0002933105528913795, "loss": 2.1661, "step": 4430 }, { "epoch": 0.82, "grad_norm": 0.2060546875, "learning_rate": 0.00029302402401236904, "loss": 2.1847, "step": 4435 }, { "epoch": 0.82, "grad_norm": 0.1953125, "learning_rate": 0.00029273725128926484, "loss": 2.1442, "step": 4440 }, { "epoch": 0.82, "grad_norm": 0.193359375, "learning_rate": 0.00029245023547378493, "loss": 2.1806, "step": 4445 }, { "epoch": 0.83, "grad_norm": 0.201171875, "learning_rate": 0.0002921629773182845, "loss": 2.1519, "step": 4450 }, { "epoch": 0.83, "grad_norm": 0.1962890625, "learning_rate": 0.0002918754775757541, "loss": 2.1223, "step": 4455 }, { "epoch": 0.83, "grad_norm": 0.189453125, "learning_rate": 0.0002915877369998174, "loss": 2.1435, "step": 4460 }, { "epoch": 0.83, "grad_norm": 0.1884765625, "learning_rate": 0.0002912997563447296, "loss": 2.1613, "step": 4465 }, { "epoch": 0.83, "grad_norm": 0.1923828125, "learning_rate": 0.0002910115363653749, "loss": 2.1497, "step": 4470 }, { "epoch": 0.83, "grad_norm": 0.1953125, "learning_rate": 0.000290723077817265, "loss": 2.1734, "step": 4475 }, { "epoch": 0.83, "grad_norm": 0.2001953125, "learning_rate": 0.00029043438145653715, "loss": 2.1742, "step": 4480 }, { "epoch": 0.83, "grad_norm": 0.1982421875, "learning_rate": 0.0002901454480399517, "loss": 2.1759, "step": 4485 }, { "epoch": 0.83, "grad_norm": 0.1943359375, "learning_rate": 0.00028985627832489044, "loss": 2.1722, "step": 4490 }, { "epoch": 0.83, "grad_norm": 0.1923828125, "learning_rate": 0.0002895668730693548, "loss": 2.1574, "step": 4495 }, { "epoch": 0.83, "grad_norm": 0.1962890625, "learning_rate": 0.0002892772330319633, "loss": 2.1523, "step": 4500 }, { "epoch": 0.84, "grad_norm": 0.19921875, "learning_rate": 0.0002889873589719501, "loss": 2.1746, "step": 4505 }, { "epoch": 0.84, "grad_norm": 0.193359375, "learning_rate": 0.0002886972516491627, "loss": 2.1642, "step": 4510 }, { "epoch": 0.84, "grad_norm": 0.203125, "learning_rate": 0.0002884069118240602, "loss": 2.1204, "step": 4515 }, { "epoch": 0.84, "grad_norm": 0.193359375, "learning_rate": 0.0002881163402577111, "loss": 2.1641, "step": 4520 }, { "epoch": 0.84, "grad_norm": 0.20703125, "learning_rate": 0.00028782553771179123, "loss": 2.1502, "step": 4525 }, { "epoch": 0.84, "grad_norm": 0.1953125, "learning_rate": 0.000287534504948582, "loss": 2.1846, "step": 4530 }, { "epoch": 0.84, "grad_norm": 0.2001953125, "learning_rate": 0.00028724324273096837, "loss": 2.2048, "step": 4535 }, { "epoch": 0.84, "grad_norm": 0.208984375, "learning_rate": 0.0002869517518224366, "loss": 2.1565, "step": 4540 }, { "epoch": 0.84, "grad_norm": 0.2001953125, "learning_rate": 0.0002866600329870725, "loss": 2.1622, "step": 4545 }, { "epoch": 0.84, "grad_norm": 0.1943359375, "learning_rate": 0.00028636808698955933, "loss": 2.1589, "step": 4550 }, { "epoch": 0.85, "grad_norm": 0.1953125, "learning_rate": 0.00028607591459517596, "loss": 2.1666, "step": 4555 }, { "epoch": 0.85, "grad_norm": 0.1962890625, "learning_rate": 0.0002857835165697944, "loss": 2.1468, "step": 4560 }, { "epoch": 0.85, "grad_norm": 0.1982421875, "learning_rate": 0.0002854908936798783, "loss": 2.1915, "step": 4565 }, { "epoch": 0.85, "grad_norm": 0.19921875, "learning_rate": 0.00028519804669248084, "loss": 2.2123, "step": 4570 }, { "epoch": 0.85, "grad_norm": 0.197265625, "learning_rate": 0.0002849049763752424, "loss": 2.1614, "step": 4575 }, { "epoch": 0.85, "grad_norm": 0.2060546875, "learning_rate": 0.00028461168349638903, "loss": 2.1514, "step": 4580 }, { "epoch": 0.85, "grad_norm": 0.2001953125, "learning_rate": 0.00028431816882473, "loss": 2.1957, "step": 4585 }, { "epoch": 0.85, "grad_norm": 0.19140625, "learning_rate": 0.00028402443312965596, "loss": 2.172, "step": 4590 }, { "epoch": 0.85, "grad_norm": 0.19921875, "learning_rate": 0.000283730477181137, "loss": 2.1331, "step": 4595 }, { "epoch": 0.85, "grad_norm": 0.1982421875, "learning_rate": 0.0002834363017497205, "loss": 2.1357, "step": 4600 }, { "epoch": 0.85, "grad_norm": 0.2001953125, "learning_rate": 0.0002831419076065293, "loss": 2.1649, "step": 4605 }, { "epoch": 0.86, "grad_norm": 0.193359375, "learning_rate": 0.0002828472955232595, "loss": 2.1565, "step": 4610 }, { "epoch": 0.86, "grad_norm": 0.2021484375, "learning_rate": 0.00028255246627217824, "loss": 2.1964, "step": 4615 }, { "epoch": 0.86, "grad_norm": 0.201171875, "learning_rate": 0.00028225742062612237, "loss": 2.1533, "step": 4620 }, { "epoch": 0.86, "grad_norm": 0.2021484375, "learning_rate": 0.00028196215935849555, "loss": 2.1384, "step": 4625 }, { "epoch": 0.86, "grad_norm": 0.2021484375, "learning_rate": 0.00028166668324326695, "loss": 2.1632, "step": 4630 }, { "epoch": 0.86, "grad_norm": 0.185546875, "learning_rate": 0.0002813709930549688, "loss": 2.1184, "step": 4635 }, { "epoch": 0.86, "grad_norm": 0.197265625, "learning_rate": 0.0002810750895686944, "loss": 2.1813, "step": 4640 }, { "epoch": 0.86, "grad_norm": 0.197265625, "learning_rate": 0.0002807789735600964, "loss": 2.1603, "step": 4645 }, { "epoch": 0.86, "grad_norm": 0.197265625, "learning_rate": 0.00028048264580538435, "loss": 2.1534, "step": 4650 }, { "epoch": 0.86, "grad_norm": 0.1962890625, "learning_rate": 0.00028018610708132274, "loss": 2.191, "step": 4655 }, { "epoch": 0.86, "grad_norm": 0.205078125, "learning_rate": 0.0002798893581652295, "loss": 2.1827, "step": 4660 }, { "epoch": 0.87, "grad_norm": 0.1962890625, "learning_rate": 0.0002795923998349729, "loss": 2.1759, "step": 4665 }, { "epoch": 0.87, "grad_norm": 0.1982421875, "learning_rate": 0.0002792952328689709, "loss": 2.15, "step": 4670 }, { "epoch": 0.87, "grad_norm": 0.203125, "learning_rate": 0.0002789978580461877, "loss": 2.1464, "step": 4675 }, { "epoch": 0.87, "grad_norm": 0.193359375, "learning_rate": 0.0002787002761461328, "loss": 2.1575, "step": 4680 }, { "epoch": 0.87, "grad_norm": 0.1962890625, "learning_rate": 0.00027840248794885826, "loss": 2.1241, "step": 4685 }, { "epoch": 0.87, "grad_norm": 0.2060546875, "learning_rate": 0.0002781044942349569, "loss": 2.1715, "step": 4690 }, { "epoch": 0.87, "grad_norm": 0.1943359375, "learning_rate": 0.00027780629578556045, "loss": 2.1551, "step": 4695 }, { "epoch": 0.87, "grad_norm": 0.1982421875, "learning_rate": 0.0002775078933823372, "loss": 2.1945, "step": 4700 }, { "epoch": 0.87, "grad_norm": 0.201171875, "learning_rate": 0.00027720928780749, "loss": 2.1742, "step": 4705 }, { "epoch": 0.87, "grad_norm": 0.2001953125, "learning_rate": 0.0002769104798437546, "loss": 2.1535, "step": 4710 }, { "epoch": 0.87, "grad_norm": 0.2001953125, "learning_rate": 0.00027661147027439664, "loss": 2.1172, "step": 4715 }, { "epoch": 0.88, "grad_norm": 0.1943359375, "learning_rate": 0.00027631225988321084, "loss": 2.1983, "step": 4720 }, { "epoch": 0.88, "grad_norm": 0.1962890625, "learning_rate": 0.0002760128494545181, "loss": 2.1574, "step": 4725 }, { "epoch": 0.88, "grad_norm": 0.2001953125, "learning_rate": 0.0002757132397731636, "loss": 2.1594, "step": 4730 }, { "epoch": 0.88, "grad_norm": 0.1943359375, "learning_rate": 0.00027541343162451495, "loss": 2.1266, "step": 4735 }, { "epoch": 0.88, "grad_norm": 0.20703125, "learning_rate": 0.00027511342579446, "loss": 2.1707, "step": 4740 }, { "epoch": 0.88, "grad_norm": 0.1953125, "learning_rate": 0.0002748132230694047, "loss": 2.1271, "step": 4745 }, { "epoch": 0.88, "grad_norm": 0.205078125, "learning_rate": 0.0002745128242362711, "loss": 2.1612, "step": 4750 }, { "epoch": 0.88, "grad_norm": 0.2021484375, "learning_rate": 0.00027421223008249545, "loss": 2.1779, "step": 4755 }, { "epoch": 0.88, "grad_norm": 0.2041015625, "learning_rate": 0.00027391144139602596, "loss": 2.1914, "step": 4760 }, { "epoch": 0.88, "grad_norm": 0.2001953125, "learning_rate": 0.00027361045896532053, "loss": 2.1352, "step": 4765 }, { "epoch": 0.88, "grad_norm": 0.197265625, "learning_rate": 0.0002733092835793454, "loss": 2.1487, "step": 4770 }, { "epoch": 0.89, "grad_norm": 0.2001953125, "learning_rate": 0.0002730079160275721, "loss": 2.1512, "step": 4775 }, { "epoch": 0.89, "grad_norm": 0.2041015625, "learning_rate": 0.00027270635709997616, "loss": 2.1593, "step": 4780 }, { "epoch": 0.89, "grad_norm": 0.1962890625, "learning_rate": 0.0002724046075870348, "loss": 2.1361, "step": 4785 }, { "epoch": 0.89, "grad_norm": 0.197265625, "learning_rate": 0.0002721026682797245, "loss": 2.1535, "step": 4790 }, { "epoch": 0.89, "grad_norm": 0.2021484375, "learning_rate": 0.0002718005399695197, "loss": 2.1602, "step": 4795 }, { "epoch": 0.89, "grad_norm": 0.19921875, "learning_rate": 0.00027149822344839006, "loss": 2.1593, "step": 4800 }, { "epoch": 0.89, "grad_norm": 0.193359375, "learning_rate": 0.00027119571950879847, "loss": 2.1754, "step": 4805 }, { "epoch": 0.89, "grad_norm": 0.1953125, "learning_rate": 0.00027089302894369924, "loss": 2.1668, "step": 4810 }, { "epoch": 0.89, "grad_norm": 0.197265625, "learning_rate": 0.00027059015254653586, "loss": 2.1684, "step": 4815 }, { "epoch": 0.89, "grad_norm": 0.1962890625, "learning_rate": 0.000270287091111239, "loss": 2.1496, "step": 4820 }, { "epoch": 0.9, "grad_norm": 0.1962890625, "learning_rate": 0.00026998384543222434, "loss": 2.1385, "step": 4825 }, { "epoch": 0.9, "grad_norm": 0.201171875, "learning_rate": 0.0002696804163043904, "loss": 2.1392, "step": 4830 }, { "epoch": 0.9, "grad_norm": 0.212890625, "learning_rate": 0.00026937680452311674, "loss": 2.2032, "step": 4835 }, { "epoch": 0.9, "grad_norm": 0.201171875, "learning_rate": 0.0002690730108842615, "loss": 2.1713, "step": 4840 }, { "epoch": 0.9, "grad_norm": 0.19921875, "learning_rate": 0.0002687690361841599, "loss": 2.1981, "step": 4845 }, { "epoch": 0.9, "grad_norm": 0.201171875, "learning_rate": 0.00026846488121962136, "loss": 2.1998, "step": 4850 }, { "epoch": 0.9, "grad_norm": 0.1943359375, "learning_rate": 0.00026816054678792807, "loss": 2.1832, "step": 4855 }, { "epoch": 0.9, "grad_norm": 0.197265625, "learning_rate": 0.00026785603368683253, "loss": 2.137, "step": 4860 }, { "epoch": 0.9, "grad_norm": 0.1982421875, "learning_rate": 0.0002675513427145558, "loss": 2.1652, "step": 4865 }, { "epoch": 0.9, "grad_norm": 0.201171875, "learning_rate": 0.00026724647466978493, "loss": 2.164, "step": 4870 }, { "epoch": 0.9, "grad_norm": 0.1904296875, "learning_rate": 0.0002669414303516712, "loss": 2.1208, "step": 4875 }, { "epoch": 0.91, "grad_norm": 0.197265625, "learning_rate": 0.0002666362105598281, "loss": 2.1669, "step": 4880 }, { "epoch": 0.91, "grad_norm": 0.1982421875, "learning_rate": 0.00026633081609432895, "loss": 2.1483, "step": 4885 }, { "epoch": 0.91, "grad_norm": 0.1962890625, "learning_rate": 0.000266025247755705, "loss": 2.1685, "step": 4890 }, { "epoch": 0.91, "grad_norm": 0.1953125, "learning_rate": 0.0002657195063449432, "loss": 2.1387, "step": 4895 }, { "epoch": 0.91, "grad_norm": 0.1962890625, "learning_rate": 0.00026541359266348436, "loss": 2.1927, "step": 4900 }, { "epoch": 0.91, "grad_norm": 0.2021484375, "learning_rate": 0.0002651075075132206, "loss": 2.1684, "step": 4905 }, { "epoch": 0.91, "grad_norm": 0.2021484375, "learning_rate": 0.0002648012516964937, "loss": 2.1658, "step": 4910 }, { "epoch": 0.91, "grad_norm": 0.1982421875, "learning_rate": 0.0002644948260160928, "loss": 2.1713, "step": 4915 }, { "epoch": 0.91, "grad_norm": 0.1962890625, "learning_rate": 0.00026418823127525215, "loss": 2.1661, "step": 4920 }, { "epoch": 0.91, "grad_norm": 0.2138671875, "learning_rate": 0.0002638814682776494, "loss": 2.2177, "step": 4925 }, { "epoch": 0.91, "grad_norm": 0.1962890625, "learning_rate": 0.000263574537827403, "loss": 2.145, "step": 4930 }, { "epoch": 0.92, "grad_norm": 0.1904296875, "learning_rate": 0.00026326744072907056, "loss": 2.1322, "step": 4935 }, { "epoch": 0.92, "grad_norm": 0.2099609375, "learning_rate": 0.00026296017778764633, "loss": 2.1891, "step": 4940 }, { "epoch": 0.92, "grad_norm": 0.2041015625, "learning_rate": 0.00026265274980855947, "loss": 2.2206, "step": 4945 }, { "epoch": 0.92, "grad_norm": 0.2041015625, "learning_rate": 0.00026234515759767166, "loss": 2.1822, "step": 4950 }, { "epoch": 0.92, "grad_norm": 0.205078125, "learning_rate": 0.00026203740196127504, "loss": 2.1469, "step": 4955 }, { "epoch": 0.92, "grad_norm": 0.193359375, "learning_rate": 0.0002617294837060902, "loss": 2.1701, "step": 4960 }, { "epoch": 0.92, "grad_norm": 0.201171875, "learning_rate": 0.000261421403639264, "loss": 2.1456, "step": 4965 }, { "epoch": 0.92, "grad_norm": 0.2041015625, "learning_rate": 0.00026111316256836745, "loss": 2.1366, "step": 4970 }, { "epoch": 0.92, "grad_norm": 0.205078125, "learning_rate": 0.0002608047613013936, "loss": 2.1981, "step": 4975 }, { "epoch": 0.92, "grad_norm": 0.197265625, "learning_rate": 0.0002604962006467555, "loss": 2.1793, "step": 4980 }, { "epoch": 0.92, "grad_norm": 0.1953125, "learning_rate": 0.0002601874814132837, "loss": 2.1473, "step": 4985 }, { "epoch": 0.93, "grad_norm": 0.1923828125, "learning_rate": 0.0002598786044102249, "loss": 2.1915, "step": 4990 }, { "epoch": 0.93, "grad_norm": 0.1904296875, "learning_rate": 0.0002595695704472389, "loss": 2.1568, "step": 4995 }, { "epoch": 0.93, "grad_norm": 0.1982421875, "learning_rate": 0.0002592603803343973, "loss": 2.157, "step": 5000 }, { "epoch": 0.93, "grad_norm": 0.201171875, "learning_rate": 0.00025895103488218085, "loss": 2.1811, "step": 5005 }, { "epoch": 0.93, "grad_norm": 0.1943359375, "learning_rate": 0.0002586415349014775, "loss": 2.1387, "step": 5010 }, { "epoch": 0.93, "grad_norm": 0.19140625, "learning_rate": 0.0002583318812035803, "loss": 2.1382, "step": 5015 }, { "epoch": 0.93, "grad_norm": 0.2080078125, "learning_rate": 0.0002580220746001852, "loss": 2.1861, "step": 5020 }, { "epoch": 0.93, "grad_norm": 0.2001953125, "learning_rate": 0.000257712115903389, "loss": 2.1506, "step": 5025 }, { "epoch": 0.93, "grad_norm": 0.19921875, "learning_rate": 0.00025740200592568713, "loss": 2.1404, "step": 5030 }, { "epoch": 0.93, "grad_norm": 0.197265625, "learning_rate": 0.00025709174547997157, "loss": 2.1181, "step": 5035 }, { "epoch": 0.94, "grad_norm": 0.1953125, "learning_rate": 0.0002567813353795288, "loss": 2.1433, "step": 5040 }, { "epoch": 0.94, "grad_norm": 0.1962890625, "learning_rate": 0.00025647077643803763, "loss": 2.1527, "step": 5045 }, { "epoch": 0.94, "grad_norm": 0.2021484375, "learning_rate": 0.00025616006946956683, "loss": 2.1853, "step": 5050 }, { "epoch": 0.94, "grad_norm": 0.203125, "learning_rate": 0.0002558492152885735, "loss": 2.1775, "step": 5055 }, { "epoch": 0.94, "grad_norm": 0.2080078125, "learning_rate": 0.00025553821470990007, "loss": 2.1644, "step": 5060 }, { "epoch": 0.94, "grad_norm": 0.1962890625, "learning_rate": 0.00025522706854877344, "loss": 2.1959, "step": 5065 }, { "epoch": 0.94, "grad_norm": 0.1943359375, "learning_rate": 0.0002549157776208016, "loss": 2.2071, "step": 5070 }, { "epoch": 0.94, "grad_norm": 0.205078125, "learning_rate": 0.0002546043427419723, "loss": 2.1704, "step": 5075 }, { "epoch": 0.94, "grad_norm": 0.1962890625, "learning_rate": 0.0002542927647286505, "loss": 2.1489, "step": 5080 }, { "epoch": 0.94, "grad_norm": 0.201171875, "learning_rate": 0.00025398104439757624, "loss": 2.1703, "step": 5085 }, { "epoch": 0.94, "grad_norm": 0.1982421875, "learning_rate": 0.0002536691825658629, "loss": 2.1398, "step": 5090 }, { "epoch": 0.95, "grad_norm": 0.1904296875, "learning_rate": 0.00025335718005099474, "loss": 2.1478, "step": 5095 }, { "epoch": 0.95, "grad_norm": 0.1904296875, "learning_rate": 0.0002530450376708244, "loss": 2.137, "step": 5100 }, { "epoch": 0.95, "grad_norm": 0.19921875, "learning_rate": 0.00025273275624357165, "loss": 2.1979, "step": 5105 }, { "epoch": 0.95, "grad_norm": 0.1962890625, "learning_rate": 0.00025242033658782044, "loss": 2.1232, "step": 5110 }, { "epoch": 0.95, "grad_norm": 0.2041015625, "learning_rate": 0.00025210777952251723, "loss": 2.1871, "step": 5115 }, { "epoch": 0.95, "grad_norm": 0.2041015625, "learning_rate": 0.00025179508586696856, "loss": 2.1422, "step": 5120 }, { "epoch": 0.95, "grad_norm": 0.205078125, "learning_rate": 0.000251482256440839, "loss": 2.1773, "step": 5125 }, { "epoch": 0.95, "grad_norm": 0.2099609375, "learning_rate": 0.0002511692920641491, "loss": 2.167, "step": 5130 }, { "epoch": 0.95, "grad_norm": 0.201171875, "learning_rate": 0.0002508561935572731, "loss": 2.1768, "step": 5135 }, { "epoch": 0.95, "grad_norm": 0.205078125, "learning_rate": 0.0002505429617409369, "loss": 2.1799, "step": 5140 }, { "epoch": 0.95, "grad_norm": 0.205078125, "learning_rate": 0.0002502295974362158, "loss": 2.153, "step": 5145 }, { "epoch": 0.96, "grad_norm": 0.1962890625, "learning_rate": 0.0002499161014645324, "loss": 2.1172, "step": 5150 }, { "epoch": 0.96, "grad_norm": 0.2021484375, "learning_rate": 0.00024960247464765443, "loss": 2.206, "step": 5155 }, { "epoch": 0.96, "grad_norm": 0.2001953125, "learning_rate": 0.0002492887178076924, "loss": 2.1683, "step": 5160 }, { "epoch": 0.96, "grad_norm": 0.197265625, "learning_rate": 0.0002489748317670982, "loss": 2.1484, "step": 5165 }, { "epoch": 0.96, "grad_norm": 0.201171875, "learning_rate": 0.0002486608173486618, "loss": 2.1443, "step": 5170 }, { "epoch": 0.96, "grad_norm": 0.197265625, "learning_rate": 0.00024834667537550995, "loss": 2.16, "step": 5175 }, { "epoch": 0.96, "grad_norm": 0.19921875, "learning_rate": 0.0002480324066711039, "loss": 2.1605, "step": 5180 }, { "epoch": 0.96, "grad_norm": 0.19921875, "learning_rate": 0.0002477180120592368, "loss": 2.1522, "step": 5185 }, { "epoch": 0.96, "grad_norm": 0.2001953125, "learning_rate": 0.0002474034923640321, "loss": 2.1217, "step": 5190 }, { "epoch": 0.96, "grad_norm": 0.1982421875, "learning_rate": 0.00024708884840994095, "loss": 2.2046, "step": 5195 }, { "epoch": 0.96, "grad_norm": 0.2001953125, "learning_rate": 0.00024677408102174027, "loss": 2.1732, "step": 5200 }, { "epoch": 0.97, "grad_norm": 0.2021484375, "learning_rate": 0.00024645919102453065, "loss": 2.1298, "step": 5205 }, { "epoch": 0.97, "grad_norm": 0.19140625, "learning_rate": 0.00024614417924373385, "loss": 2.1284, "step": 5210 }, { "epoch": 0.97, "grad_norm": 0.203125, "learning_rate": 0.0002458290465050911, "loss": 2.1746, "step": 5215 }, { "epoch": 0.97, "grad_norm": 0.2001953125, "learning_rate": 0.0002455137936346606, "loss": 2.1432, "step": 5220 }, { "epoch": 0.97, "grad_norm": 0.197265625, "learning_rate": 0.0002451984214588153, "loss": 2.1479, "step": 5225 }, { "epoch": 0.97, "grad_norm": 0.1982421875, "learning_rate": 0.0002448829308042412, "loss": 2.1422, "step": 5230 }, { "epoch": 0.97, "grad_norm": 0.1953125, "learning_rate": 0.0002445673224979347, "loss": 2.1339, "step": 5235 }, { "epoch": 0.97, "grad_norm": 0.1982421875, "learning_rate": 0.0002442515973672005, "loss": 2.1753, "step": 5240 }, { "epoch": 0.97, "grad_norm": 0.1953125, "learning_rate": 0.0002439357562396496, "loss": 2.1239, "step": 5245 }, { "epoch": 0.97, "grad_norm": 0.1982421875, "learning_rate": 0.00024361979994319715, "loss": 2.1903, "step": 5250 }, { "epoch": 0.97, "grad_norm": 0.2021484375, "learning_rate": 0.00024330372930606018, "loss": 2.1487, "step": 5255 }, { "epoch": 0.98, "grad_norm": 0.201171875, "learning_rate": 0.00024298754515675533, "loss": 2.1711, "step": 5260 }, { "epoch": 0.98, "grad_norm": 0.197265625, "learning_rate": 0.0002426712483240969, "loss": 2.1315, "step": 5265 }, { "epoch": 0.98, "grad_norm": 0.201171875, "learning_rate": 0.00024235483963719453, "loss": 2.191, "step": 5270 }, { "epoch": 0.98, "grad_norm": 0.20703125, "learning_rate": 0.00024203831992545096, "loss": 2.1441, "step": 5275 }, { "epoch": 0.98, "grad_norm": 0.2021484375, "learning_rate": 0.0002417216900185602, "loss": 2.1895, "step": 5280 }, { "epoch": 0.98, "grad_norm": 0.2021484375, "learning_rate": 0.00024140495074650487, "loss": 2.2274, "step": 5285 }, { "epoch": 0.98, "grad_norm": 0.197265625, "learning_rate": 0.00024108810293955438, "loss": 2.1568, "step": 5290 }, { "epoch": 0.98, "grad_norm": 0.201171875, "learning_rate": 0.0002407711474282627, "loss": 2.1499, "step": 5295 }, { "epoch": 0.98, "grad_norm": 0.2021484375, "learning_rate": 0.00024045408504346606, "loss": 2.1909, "step": 5300 }, { "epoch": 0.98, "grad_norm": 0.1962890625, "learning_rate": 0.00024013691661628074, "loss": 2.1499, "step": 5305 }, { "epoch": 0.99, "grad_norm": 0.2041015625, "learning_rate": 0.00023981964297810118, "loss": 2.1664, "step": 5310 }, { "epoch": 0.99, "grad_norm": 0.197265625, "learning_rate": 0.00023950226496059743, "loss": 2.1702, "step": 5315 }, { "epoch": 0.99, "grad_norm": 0.1982421875, "learning_rate": 0.00023918478339571335, "loss": 2.1252, "step": 5320 }, { "epoch": 0.99, "grad_norm": 0.2001953125, "learning_rate": 0.00023886719911566404, "loss": 2.1412, "step": 5325 }, { "epoch": 0.99, "grad_norm": 0.203125, "learning_rate": 0.0002385495129529339, "loss": 2.1895, "step": 5330 }, { "epoch": 0.99, "grad_norm": 0.1943359375, "learning_rate": 0.0002382317257402745, "loss": 2.1669, "step": 5335 }, { "epoch": 0.99, "grad_norm": 0.1982421875, "learning_rate": 0.0002379138383107021, "loss": 2.1341, "step": 5340 }, { "epoch": 0.99, "grad_norm": 0.1943359375, "learning_rate": 0.0002375958514974959, "loss": 2.1688, "step": 5345 }, { "epoch": 0.99, "grad_norm": 0.2001953125, "learning_rate": 0.00023727776613419543, "loss": 2.1949, "step": 5350 }, { "epoch": 0.99, "grad_norm": 0.1953125, "learning_rate": 0.00023695958305459854, "loss": 2.1693, "step": 5355 }, { "epoch": 0.99, "grad_norm": 0.203125, "learning_rate": 0.0002366413030927594, "loss": 2.1514, "step": 5360 }, { "epoch": 1.0, "grad_norm": 0.2021484375, "learning_rate": 0.00023632292708298587, "loss": 2.1296, "step": 5365 }, { "epoch": 1.0, "grad_norm": 0.201171875, "learning_rate": 0.00023600445585983791, "loss": 2.1054, "step": 5370 }, { "epoch": 1.0, "grad_norm": 0.19921875, "learning_rate": 0.0002356858902581248, "loss": 2.1498, "step": 5375 }, { "epoch": 1.0, "grad_norm": 0.1943359375, "learning_rate": 0.00023536723111290328, "loss": 2.1251, "step": 5380 }, { "epoch": 1.0, "grad_norm": 0.1953125, "learning_rate": 0.0002350484792594754, "loss": 2.1655, "step": 5385 }, { "epoch": 1.0, "grad_norm": 0.421875, "learning_rate": 0.00023472963553338613, "loss": 2.1002, "step": 5390 }, { "epoch": 1.0, "eval_loss": 2.155019521713257, "eval_runtime": 171.4248, "eval_samples_per_second": 28.286, "eval_steps_per_second": 3.541, "step": 5390 }, { "epoch": 1.0, "grad_norm": 0.2001953125, "learning_rate": 0.00023441070077042118, "loss": 2.0676, "step": 5395 }, { "epoch": 1.0, "grad_norm": 0.201171875, "learning_rate": 0.0002340916758066051, "loss": 2.0957, "step": 5400 }, { "epoch": 1.0, "grad_norm": 0.205078125, "learning_rate": 0.00023377256147819872, "loss": 2.0928, "step": 5405 }, { "epoch": 1.0, "grad_norm": 0.2041015625, "learning_rate": 0.00023345335862169723, "loss": 2.1057, "step": 5410 }, { "epoch": 1.0, "grad_norm": 0.2021484375, "learning_rate": 0.00023313406807382782, "loss": 2.0921, "step": 5415 }, { "epoch": 1.01, "grad_norm": 0.19921875, "learning_rate": 0.0002328146906715476, "loss": 2.0752, "step": 5420 }, { "epoch": 1.01, "grad_norm": 0.201171875, "learning_rate": 0.00023249522725204123, "loss": 2.0937, "step": 5425 }, { "epoch": 1.01, "grad_norm": 0.20703125, "learning_rate": 0.0002321756786527189, "loss": 2.09, "step": 5430 }, { "epoch": 1.01, "grad_norm": 0.2021484375, "learning_rate": 0.00023185604571121418, "loss": 2.1037, "step": 5435 }, { "epoch": 1.01, "grad_norm": 0.20703125, "learning_rate": 0.00023153632926538166, "loss": 2.1509, "step": 5440 }, { "epoch": 1.01, "grad_norm": 0.201171875, "learning_rate": 0.0002312165301532948, "loss": 2.1312, "step": 5445 }, { "epoch": 1.01, "grad_norm": 0.20703125, "learning_rate": 0.00023089664921324373, "loss": 2.1174, "step": 5450 }, { "epoch": 1.01, "grad_norm": 0.201171875, "learning_rate": 0.00023057668728373315, "loss": 2.1352, "step": 5455 }, { "epoch": 1.01, "grad_norm": 0.2041015625, "learning_rate": 0.00023025664520348005, "loss": 2.1088, "step": 5460 }, { "epoch": 1.01, "grad_norm": 0.203125, "learning_rate": 0.00022993652381141138, "loss": 2.1271, "step": 5465 }, { "epoch": 1.01, "grad_norm": 0.212890625, "learning_rate": 0.00022961632394666222, "loss": 2.1281, "step": 5470 }, { "epoch": 1.02, "grad_norm": 0.2041015625, "learning_rate": 0.0002292960464485732, "loss": 2.1294, "step": 5475 }, { "epoch": 1.02, "grad_norm": 0.2041015625, "learning_rate": 0.00022897569215668843, "loss": 2.1289, "step": 5480 }, { "epoch": 1.02, "grad_norm": 0.2080078125, "learning_rate": 0.00022865526191075347, "loss": 2.0997, "step": 5485 }, { "epoch": 1.02, "grad_norm": 0.20703125, "learning_rate": 0.00022833475655071274, "loss": 2.1106, "step": 5490 }, { "epoch": 1.02, "grad_norm": 0.21484375, "learning_rate": 0.0002280141769167078, "loss": 2.1005, "step": 5495 }, { "epoch": 1.02, "grad_norm": 0.203125, "learning_rate": 0.0002276935238490748, "loss": 2.0904, "step": 5500 }, { "epoch": 1.02, "grad_norm": 0.203125, "learning_rate": 0.00022737279818834237, "loss": 2.114, "step": 5505 }, { "epoch": 1.02, "grad_norm": 0.208984375, "learning_rate": 0.0002270520007752294, "loss": 2.1366, "step": 5510 }, { "epoch": 1.02, "grad_norm": 0.216796875, "learning_rate": 0.00022673113245064296, "loss": 2.1823, "step": 5515 }, { "epoch": 1.02, "grad_norm": 0.2080078125, "learning_rate": 0.0002264101940556759, "loss": 2.1347, "step": 5520 }, { "epoch": 1.03, "grad_norm": 0.2138671875, "learning_rate": 0.00022608918643160486, "loss": 2.1263, "step": 5525 }, { "epoch": 1.03, "grad_norm": 0.2041015625, "learning_rate": 0.0002257681104198878, "loss": 2.132, "step": 5530 }, { "epoch": 1.03, "grad_norm": 0.208984375, "learning_rate": 0.00022544696686216208, "loss": 2.1122, "step": 5535 }, { "epoch": 1.03, "grad_norm": 0.20703125, "learning_rate": 0.00022512575660024205, "loss": 2.1077, "step": 5540 }, { "epoch": 1.03, "grad_norm": 0.2099609375, "learning_rate": 0.00022480448047611695, "loss": 2.1157, "step": 5545 }, { "epoch": 1.03, "grad_norm": 0.2041015625, "learning_rate": 0.0002244831393319486, "loss": 2.0735, "step": 5550 }, { "epoch": 1.03, "grad_norm": 0.2080078125, "learning_rate": 0.00022416173401006932, "loss": 2.1343, "step": 5555 }, { "epoch": 1.03, "grad_norm": 0.2060546875, "learning_rate": 0.00022384026535297963, "loss": 2.1022, "step": 5560 }, { "epoch": 1.03, "grad_norm": 0.208984375, "learning_rate": 0.00022351873420334615, "loss": 2.0955, "step": 5565 }, { "epoch": 1.03, "grad_norm": 0.220703125, "learning_rate": 0.0002231971414039991, "loss": 2.13, "step": 5570 }, { "epoch": 1.03, "grad_norm": 0.201171875, "learning_rate": 0.0002228754877979306, "loss": 2.0688, "step": 5575 }, { "epoch": 1.04, "grad_norm": 0.2119140625, "learning_rate": 0.0002225537742282919, "loss": 2.1275, "step": 5580 }, { "epoch": 1.04, "grad_norm": 0.2109375, "learning_rate": 0.0002222320015383916, "loss": 2.0992, "step": 5585 }, { "epoch": 1.04, "grad_norm": 0.2119140625, "learning_rate": 0.00022191017057169318, "loss": 2.1206, "step": 5590 }, { "epoch": 1.04, "grad_norm": 0.2099609375, "learning_rate": 0.0002215882821718129, "loss": 2.148, "step": 5595 }, { "epoch": 1.04, "grad_norm": 0.2158203125, "learning_rate": 0.0002212663371825176, "loss": 2.1268, "step": 5600 }, { "epoch": 1.04, "grad_norm": 0.21875, "learning_rate": 0.00022094433644772248, "loss": 2.0779, "step": 5605 }, { "epoch": 1.04, "grad_norm": 0.20703125, "learning_rate": 0.00022062228081148874, "loss": 2.1113, "step": 5610 }, { "epoch": 1.04, "grad_norm": 0.208984375, "learning_rate": 0.00022030017111802165, "loss": 2.1148, "step": 5615 }, { "epoch": 1.04, "grad_norm": 0.203125, "learning_rate": 0.00021997800821166807, "loss": 2.1307, "step": 5620 }, { "epoch": 1.04, "grad_norm": 0.212890625, "learning_rate": 0.00021965579293691442, "loss": 2.1319, "step": 5625 }, { "epoch": 1.04, "grad_norm": 0.2099609375, "learning_rate": 0.00021933352613838435, "loss": 2.1281, "step": 5630 }, { "epoch": 1.05, "grad_norm": 0.21484375, "learning_rate": 0.00021901120866083651, "loss": 2.1421, "step": 5635 }, { "epoch": 1.05, "grad_norm": 0.2216796875, "learning_rate": 0.00021868884134916265, "loss": 2.1655, "step": 5640 }, { "epoch": 1.05, "grad_norm": 0.2119140625, "learning_rate": 0.00021836642504838473, "loss": 2.1082, "step": 5645 }, { "epoch": 1.05, "grad_norm": 0.2119140625, "learning_rate": 0.00021804396060365355, "loss": 2.1059, "step": 5650 }, { "epoch": 1.05, "grad_norm": 0.2109375, "learning_rate": 0.00021772144886024583, "loss": 2.1674, "step": 5655 }, { "epoch": 1.05, "grad_norm": 0.2109375, "learning_rate": 0.00021739889066356232, "loss": 2.1147, "step": 5660 }, { "epoch": 1.05, "grad_norm": 0.2060546875, "learning_rate": 0.00021707628685912572, "loss": 2.12, "step": 5665 }, { "epoch": 1.05, "grad_norm": 0.22265625, "learning_rate": 0.00021675363829257803, "loss": 2.0997, "step": 5670 }, { "epoch": 1.05, "grad_norm": 0.205078125, "learning_rate": 0.00021643094580967874, "loss": 2.0945, "step": 5675 }, { "epoch": 1.05, "grad_norm": 0.208984375, "learning_rate": 0.00021610821025630243, "loss": 2.1583, "step": 5680 }, { "epoch": 1.05, "grad_norm": 0.208984375, "learning_rate": 0.00021578543247843647, "loss": 2.1116, "step": 5685 }, { "epoch": 1.06, "grad_norm": 0.2060546875, "learning_rate": 0.00021546261332217918, "loss": 2.0938, "step": 5690 }, { "epoch": 1.06, "grad_norm": 0.2080078125, "learning_rate": 0.00021513975363373703, "loss": 2.086, "step": 5695 }, { "epoch": 1.06, "grad_norm": 0.2177734375, "learning_rate": 0.00021481685425942302, "loss": 2.1388, "step": 5700 }, { "epoch": 1.06, "grad_norm": 0.2099609375, "learning_rate": 0.00021449391604565392, "loss": 2.0998, "step": 5705 }, { "epoch": 1.06, "grad_norm": 0.2109375, "learning_rate": 0.00021417093983894844, "loss": 2.1114, "step": 5710 }, { "epoch": 1.06, "grad_norm": 0.216796875, "learning_rate": 0.0002138479264859249, "loss": 2.1571, "step": 5715 }, { "epoch": 1.06, "grad_norm": 0.2119140625, "learning_rate": 0.000213524876833299, "loss": 2.1065, "step": 5720 }, { "epoch": 1.06, "grad_norm": 0.20703125, "learning_rate": 0.00021320179172788155, "loss": 2.1273, "step": 5725 }, { "epoch": 1.06, "grad_norm": 0.2216796875, "learning_rate": 0.0002128786720165763, "loss": 2.1065, "step": 5730 }, { "epoch": 1.06, "grad_norm": 0.205078125, "learning_rate": 0.00021255551854637762, "loss": 2.1065, "step": 5735 }, { "epoch": 1.06, "grad_norm": 0.216796875, "learning_rate": 0.00021223233216436857, "loss": 2.0759, "step": 5740 }, { "epoch": 1.07, "grad_norm": 0.21875, "learning_rate": 0.0002119091137177183, "loss": 2.0995, "step": 5745 }, { "epoch": 1.07, "grad_norm": 0.2099609375, "learning_rate": 0.00021158586405368017, "loss": 2.1513, "step": 5750 }, { "epoch": 1.07, "grad_norm": 0.2158203125, "learning_rate": 0.0002112625840195893, "loss": 2.1142, "step": 5755 }, { "epoch": 1.07, "grad_norm": 0.216796875, "learning_rate": 0.0002109392744628603, "loss": 2.1145, "step": 5760 }, { "epoch": 1.07, "grad_norm": 0.212890625, "learning_rate": 0.00021061593623098533, "loss": 2.1038, "step": 5765 }, { "epoch": 1.07, "grad_norm": 0.2041015625, "learning_rate": 0.00021029257017153162, "loss": 2.1134, "step": 5770 }, { "epoch": 1.07, "grad_norm": 0.2109375, "learning_rate": 0.00020996917713213945, "loss": 2.0712, "step": 5775 }, { "epoch": 1.07, "grad_norm": 0.2158203125, "learning_rate": 0.00020964575796051974, "loss": 2.1444, "step": 5780 }, { "epoch": 1.07, "grad_norm": 0.2041015625, "learning_rate": 0.00020932231350445188, "loss": 2.1248, "step": 5785 }, { "epoch": 1.07, "grad_norm": 0.2109375, "learning_rate": 0.0002089988446117817, "loss": 2.1342, "step": 5790 }, { "epoch": 1.08, "grad_norm": 0.208984375, "learning_rate": 0.00020867535213041883, "loss": 2.1064, "step": 5795 }, { "epoch": 1.08, "grad_norm": 0.2109375, "learning_rate": 0.00020835183690833496, "loss": 2.1467, "step": 5800 }, { "epoch": 1.08, "grad_norm": 0.220703125, "learning_rate": 0.00020802829979356134, "loss": 2.093, "step": 5805 }, { "epoch": 1.08, "grad_norm": 0.203125, "learning_rate": 0.0002077047416341864, "loss": 2.0891, "step": 5810 }, { "epoch": 1.08, "grad_norm": 0.208984375, "learning_rate": 0.00020738116327835413, "loss": 2.1186, "step": 5815 }, { "epoch": 1.08, "grad_norm": 0.2158203125, "learning_rate": 0.00020705756557426108, "loss": 2.1244, "step": 5820 }, { "epoch": 1.08, "grad_norm": 0.2109375, "learning_rate": 0.00020673394937015477, "loss": 2.1047, "step": 5825 }, { "epoch": 1.08, "grad_norm": 0.212890625, "learning_rate": 0.0002064103155143311, "loss": 2.0977, "step": 5830 }, { "epoch": 1.08, "grad_norm": 0.2119140625, "learning_rate": 0.00020608666485513215, "loss": 2.1302, "step": 5835 }, { "epoch": 1.08, "grad_norm": 0.21484375, "learning_rate": 0.00020576299824094432, "loss": 2.1351, "step": 5840 }, { "epoch": 1.08, "grad_norm": 0.2177734375, "learning_rate": 0.00020543931652019555, "loss": 2.0831, "step": 5845 }, { "epoch": 1.09, "grad_norm": 0.20703125, "learning_rate": 0.00020511562054135354, "loss": 2.1272, "step": 5850 }, { "epoch": 1.09, "grad_norm": 0.2119140625, "learning_rate": 0.0002047919111529234, "loss": 2.1544, "step": 5855 }, { "epoch": 1.09, "grad_norm": 0.2119140625, "learning_rate": 0.0002044681892034452, "loss": 2.135, "step": 5860 }, { "epoch": 1.09, "grad_norm": 0.21484375, "learning_rate": 0.00020414445554149208, "loss": 2.1514, "step": 5865 }, { "epoch": 1.09, "grad_norm": 0.2080078125, "learning_rate": 0.00020382071101566788, "loss": 2.067, "step": 5870 }, { "epoch": 1.09, "grad_norm": 0.2197265625, "learning_rate": 0.00020349695647460485, "loss": 2.1237, "step": 5875 }, { "epoch": 1.09, "grad_norm": 0.2119140625, "learning_rate": 0.00020317319276696161, "loss": 2.1594, "step": 5880 }, { "epoch": 1.09, "grad_norm": 0.2119140625, "learning_rate": 0.00020284942074142066, "loss": 2.1181, "step": 5885 }, { "epoch": 1.09, "grad_norm": 0.208984375, "learning_rate": 0.0002025256412466864, "loss": 2.1467, "step": 5890 }, { "epoch": 1.09, "grad_norm": 0.2158203125, "learning_rate": 0.00020220185513148277, "loss": 2.1358, "step": 5895 }, { "epoch": 1.09, "grad_norm": 0.21484375, "learning_rate": 0.00020187806324455104, "loss": 2.1006, "step": 5900 }, { "epoch": 1.1, "grad_norm": 0.2177734375, "learning_rate": 0.00020155426643464773, "loss": 2.1606, "step": 5905 }, { "epoch": 1.1, "grad_norm": 0.21484375, "learning_rate": 0.00020123046555054215, "loss": 2.1072, "step": 5910 }, { "epoch": 1.1, "grad_norm": 0.2158203125, "learning_rate": 0.00020090666144101436, "loss": 2.0934, "step": 5915 }, { "epoch": 1.1, "grad_norm": 0.2099609375, "learning_rate": 0.00020058285495485275, "loss": 2.086, "step": 5920 }, { "epoch": 1.1, "grad_norm": 0.21484375, "learning_rate": 0.00020025904694085202, "loss": 2.1406, "step": 5925 }, { "epoch": 1.1, "grad_norm": 0.220703125, "learning_rate": 0.00019993523824781104, "loss": 2.1658, "step": 5930 }, { "epoch": 1.1, "grad_norm": 0.21484375, "learning_rate": 0.0001996114297245301, "loss": 2.0964, "step": 5935 }, { "epoch": 1.1, "grad_norm": 0.2099609375, "learning_rate": 0.0001992876222198094, "loss": 2.1084, "step": 5940 }, { "epoch": 1.1, "grad_norm": 0.2099609375, "learning_rate": 0.00019896381658244622, "loss": 2.0951, "step": 5945 }, { "epoch": 1.1, "grad_norm": 0.21484375, "learning_rate": 0.00019864001366123307, "loss": 2.1141, "step": 5950 }, { "epoch": 1.1, "grad_norm": 0.2138671875, "learning_rate": 0.00019831621430495532, "loss": 2.0948, "step": 5955 }, { "epoch": 1.11, "grad_norm": 0.21484375, "learning_rate": 0.00019799241936238908, "loss": 2.1407, "step": 5960 }, { "epoch": 1.11, "grad_norm": 0.2177734375, "learning_rate": 0.00019766862968229865, "loss": 2.1298, "step": 5965 }, { "epoch": 1.11, "grad_norm": 0.2216796875, "learning_rate": 0.00019734484611343467, "loss": 2.1166, "step": 5970 }, { "epoch": 1.11, "grad_norm": 0.2119140625, "learning_rate": 0.00019702106950453193, "loss": 2.0888, "step": 5975 }, { "epoch": 1.11, "grad_norm": 0.2158203125, "learning_rate": 0.00019669730070430663, "loss": 2.12, "step": 5980 }, { "epoch": 1.11, "grad_norm": 0.2060546875, "learning_rate": 0.0001963735405614549, "loss": 2.1181, "step": 5985 }, { "epoch": 1.11, "grad_norm": 0.2109375, "learning_rate": 0.00019604978992464976, "loss": 2.0892, "step": 5990 }, { "epoch": 1.11, "grad_norm": 0.21484375, "learning_rate": 0.00019572604964253972, "loss": 2.1449, "step": 5995 }, { "epoch": 1.11, "grad_norm": 0.2109375, "learning_rate": 0.00019540232056374578, "loss": 2.0995, "step": 6000 }, { "epoch": 1.11, "grad_norm": 0.216796875, "learning_rate": 0.0001950786035368598, "loss": 2.138, "step": 6005 }, { "epoch": 1.12, "grad_norm": 0.20703125, "learning_rate": 0.00019475489941044204, "loss": 2.1263, "step": 6010 }, { "epoch": 1.12, "grad_norm": 0.2060546875, "learning_rate": 0.00019443120903301871, "loss": 2.1197, "step": 6015 }, { "epoch": 1.12, "grad_norm": 0.22265625, "learning_rate": 0.00019410753325308042, "loss": 2.0856, "step": 6020 }, { "epoch": 1.12, "grad_norm": 0.2177734375, "learning_rate": 0.00019378387291907909, "loss": 2.1015, "step": 6025 }, { "epoch": 1.12, "grad_norm": 0.208984375, "learning_rate": 0.0001934602288794263, "loss": 2.0782, "step": 6030 }, { "epoch": 1.12, "grad_norm": 0.21875, "learning_rate": 0.00019313660198249107, "loss": 2.1392, "step": 6035 }, { "epoch": 1.12, "grad_norm": 0.212890625, "learning_rate": 0.00019281299307659713, "loss": 2.0854, "step": 6040 }, { "epoch": 1.12, "grad_norm": 0.2099609375, "learning_rate": 0.00019248940301002155, "loss": 2.1192, "step": 6045 }, { "epoch": 1.12, "grad_norm": 0.21484375, "learning_rate": 0.00019216583263099147, "loss": 2.1065, "step": 6050 }, { "epoch": 1.12, "grad_norm": 0.2197265625, "learning_rate": 0.0001918422827876829, "loss": 2.0768, "step": 6055 }, { "epoch": 1.12, "grad_norm": 0.2080078125, "learning_rate": 0.00019151875432821773, "loss": 2.0851, "step": 6060 }, { "epoch": 1.13, "grad_norm": 0.2138671875, "learning_rate": 0.00019119524810066175, "loss": 2.1027, "step": 6065 }, { "epoch": 1.13, "grad_norm": 0.2099609375, "learning_rate": 0.0001908717649530228, "loss": 2.1008, "step": 6070 }, { "epoch": 1.13, "grad_norm": 0.220703125, "learning_rate": 0.0001905483057332479, "loss": 2.0943, "step": 6075 }, { "epoch": 1.13, "grad_norm": 0.2138671875, "learning_rate": 0.00019022487128922148, "loss": 2.1203, "step": 6080 }, { "epoch": 1.13, "grad_norm": 0.2177734375, "learning_rate": 0.0001899014624687631, "loss": 2.1122, "step": 6085 }, { "epoch": 1.13, "grad_norm": 0.2099609375, "learning_rate": 0.00018957808011962486, "loss": 2.1437, "step": 6090 }, { "epoch": 1.13, "grad_norm": 0.2099609375, "learning_rate": 0.00018925472508948992, "loss": 2.073, "step": 6095 }, { "epoch": 1.13, "grad_norm": 0.2099609375, "learning_rate": 0.00018893139822596938, "loss": 2.1426, "step": 6100 }, { "epoch": 1.13, "grad_norm": 0.21484375, "learning_rate": 0.00018860810037660085, "loss": 2.0974, "step": 6105 }, { "epoch": 1.13, "grad_norm": 0.2109375, "learning_rate": 0.00018828483238884564, "loss": 2.1141, "step": 6110 }, { "epoch": 1.13, "grad_norm": 0.212890625, "learning_rate": 0.00018796159511008702, "loss": 2.1222, "step": 6115 }, { "epoch": 1.14, "grad_norm": 0.2138671875, "learning_rate": 0.00018763838938762756, "loss": 2.1244, "step": 6120 }, { "epoch": 1.14, "grad_norm": 0.2177734375, "learning_rate": 0.0001873152160686871, "loss": 2.141, "step": 6125 }, { "epoch": 1.14, "grad_norm": 0.216796875, "learning_rate": 0.00018699207600040077, "loss": 2.0931, "step": 6130 }, { "epoch": 1.14, "grad_norm": 0.2099609375, "learning_rate": 0.00018666897002981626, "loss": 2.104, "step": 6135 }, { "epoch": 1.14, "grad_norm": 0.212890625, "learning_rate": 0.00018634589900389217, "loss": 2.0929, "step": 6140 }, { "epoch": 1.14, "grad_norm": 0.2119140625, "learning_rate": 0.00018602286376949515, "loss": 2.08, "step": 6145 }, { "epoch": 1.14, "grad_norm": 0.2255859375, "learning_rate": 0.00018569986517339844, "loss": 2.0757, "step": 6150 }, { "epoch": 1.14, "grad_norm": 0.205078125, "learning_rate": 0.00018537690406227888, "loss": 2.1296, "step": 6155 }, { "epoch": 1.14, "grad_norm": 0.2177734375, "learning_rate": 0.00018505398128271515, "loss": 2.113, "step": 6160 }, { "epoch": 1.14, "grad_norm": 0.21484375, "learning_rate": 0.0001847310976811856, "loss": 2.1245, "step": 6165 }, { "epoch": 1.14, "grad_norm": 0.21484375, "learning_rate": 0.00018440825410406575, "loss": 2.1231, "step": 6170 }, { "epoch": 1.15, "grad_norm": 0.2109375, "learning_rate": 0.00018408545139762627, "loss": 2.1119, "step": 6175 }, { "epoch": 1.15, "grad_norm": 0.2119140625, "learning_rate": 0.00018376269040803057, "loss": 2.1047, "step": 6180 }, { "epoch": 1.15, "grad_norm": 0.2099609375, "learning_rate": 0.0001834399719813328, "loss": 2.0886, "step": 6185 }, { "epoch": 1.15, "grad_norm": 0.208984375, "learning_rate": 0.00018311729696347562, "loss": 2.1106, "step": 6190 }, { "epoch": 1.15, "grad_norm": 0.21484375, "learning_rate": 0.0001827946662002877, "loss": 2.0957, "step": 6195 }, { "epoch": 1.15, "grad_norm": 0.212890625, "learning_rate": 0.000182472080537482, "loss": 2.1069, "step": 6200 }, { "epoch": 1.15, "grad_norm": 0.216796875, "learning_rate": 0.00018214954082065282, "loss": 2.1129, "step": 6205 }, { "epoch": 1.15, "grad_norm": 0.2197265625, "learning_rate": 0.00018182704789527452, "loss": 2.147, "step": 6210 }, { "epoch": 1.15, "grad_norm": 0.2138671875, "learning_rate": 0.00018150460260669846, "loss": 2.1113, "step": 6215 }, { "epoch": 1.15, "grad_norm": 0.21484375, "learning_rate": 0.0001811822058001512, "loss": 2.0983, "step": 6220 }, { "epoch": 1.15, "grad_norm": 0.2158203125, "learning_rate": 0.00018085985832073237, "loss": 2.1165, "step": 6225 }, { "epoch": 1.16, "grad_norm": 0.212890625, "learning_rate": 0.00018053756101341206, "loss": 2.0929, "step": 6230 }, { "epoch": 1.16, "grad_norm": 0.2236328125, "learning_rate": 0.000180215314723029, "loss": 2.1439, "step": 6235 }, { "epoch": 1.16, "grad_norm": 0.2158203125, "learning_rate": 0.0001798931202942882, "loss": 2.1208, "step": 6240 }, { "epoch": 1.16, "grad_norm": 0.2080078125, "learning_rate": 0.0001795709785717586, "loss": 2.1192, "step": 6245 }, { "epoch": 1.16, "grad_norm": 0.2109375, "learning_rate": 0.00017924889039987117, "loss": 2.1322, "step": 6250 }, { "epoch": 1.16, "grad_norm": 0.216796875, "learning_rate": 0.00017892685662291622, "loss": 2.1014, "step": 6255 }, { "epoch": 1.16, "grad_norm": 0.2109375, "learning_rate": 0.0001786048780850418, "loss": 2.1299, "step": 6260 }, { "epoch": 1.16, "grad_norm": 0.2119140625, "learning_rate": 0.00017828295563025091, "loss": 2.0805, "step": 6265 }, { "epoch": 1.16, "grad_norm": 0.220703125, "learning_rate": 0.00017796109010239977, "loss": 2.1155, "step": 6270 }, { "epoch": 1.16, "grad_norm": 0.2119140625, "learning_rate": 0.00017763928234519518, "loss": 2.1166, "step": 6275 }, { "epoch": 1.17, "grad_norm": 0.2197265625, "learning_rate": 0.0001773175332021925, "loss": 2.1122, "step": 6280 }, { "epoch": 1.17, "grad_norm": 0.21484375, "learning_rate": 0.00017699584351679363, "loss": 2.1151, "step": 6285 }, { "epoch": 1.17, "grad_norm": 0.21484375, "learning_rate": 0.0001766742141322444, "loss": 2.1076, "step": 6290 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 0.00017635264589163275, "loss": 2.1442, "step": 6295 }, { "epoch": 1.17, "grad_norm": 0.2158203125, "learning_rate": 0.0001760311396378863, "loss": 2.1386, "step": 6300 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 0.00017570969621377003, "loss": 2.1099, "step": 6305 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 0.00017538831646188443, "loss": 2.1427, "step": 6310 }, { "epoch": 1.17, "grad_norm": 0.2177734375, "learning_rate": 0.00017506700122466297, "loss": 2.1288, "step": 6315 }, { "epoch": 1.17, "grad_norm": 0.220703125, "learning_rate": 0.00017474575134437007, "loss": 2.1197, "step": 6320 }, { "epoch": 1.17, "grad_norm": 0.220703125, "learning_rate": 0.00017442456766309877, "loss": 2.0971, "step": 6325 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 0.00017410345102276872, "loss": 2.1008, "step": 6330 }, { "epoch": 1.18, "grad_norm": 0.212890625, "learning_rate": 0.0001737824022651236, "loss": 2.102, "step": 6335 }, { "epoch": 1.18, "grad_norm": 0.2197265625, "learning_rate": 0.00017346142223172926, "loss": 2.1308, "step": 6340 }, { "epoch": 1.18, "grad_norm": 0.212890625, "learning_rate": 0.0001731405117639715, "loss": 2.1104, "step": 6345 }, { "epoch": 1.18, "grad_norm": 0.2138671875, "learning_rate": 0.0001728196717030536, "loss": 2.0971, "step": 6350 }, { "epoch": 1.18, "grad_norm": 0.21875, "learning_rate": 0.00017249890288999453, "loss": 2.0961, "step": 6355 }, { "epoch": 1.18, "grad_norm": 0.216796875, "learning_rate": 0.00017217820616562615, "loss": 2.0973, "step": 6360 }, { "epoch": 1.18, "grad_norm": 0.2158203125, "learning_rate": 0.00017185758237059172, "loss": 2.1169, "step": 6365 }, { "epoch": 1.18, "grad_norm": 0.2119140625, "learning_rate": 0.000171537032345343, "loss": 2.1253, "step": 6370 }, { "epoch": 1.18, "grad_norm": 0.2119140625, "learning_rate": 0.00017121655693013856, "loss": 2.1127, "step": 6375 }, { "epoch": 1.18, "grad_norm": 0.2138671875, "learning_rate": 0.0001708961569650414, "loss": 2.0914, "step": 6380 }, { "epoch": 1.18, "grad_norm": 0.2158203125, "learning_rate": 0.00017057583328991668, "loss": 2.0782, "step": 6385 }, { "epoch": 1.19, "grad_norm": 0.212890625, "learning_rate": 0.00017025558674442972, "loss": 2.1294, "step": 6390 }, { "epoch": 1.19, "grad_norm": 0.2119140625, "learning_rate": 0.00016993541816804334, "loss": 2.1139, "step": 6395 }, { "epoch": 1.19, "grad_norm": 0.216796875, "learning_rate": 0.0001696153284000163, "loss": 2.0751, "step": 6400 }, { "epoch": 1.19, "grad_norm": 0.2177734375, "learning_rate": 0.00016929531827940066, "loss": 2.11, "step": 6405 }, { "epoch": 1.19, "grad_norm": 0.21875, "learning_rate": 0.00016897538864503968, "loss": 2.1733, "step": 6410 }, { "epoch": 1.19, "grad_norm": 0.2158203125, "learning_rate": 0.00016865554033556574, "loss": 2.1227, "step": 6415 }, { "epoch": 1.19, "grad_norm": 0.2138671875, "learning_rate": 0.00016833577418939785, "loss": 2.1076, "step": 6420 }, { "epoch": 1.19, "grad_norm": 0.2197265625, "learning_rate": 0.00016801609104473986, "loss": 2.113, "step": 6425 }, { "epoch": 1.19, "grad_norm": 0.22265625, "learning_rate": 0.0001676964917395779, "loss": 2.1067, "step": 6430 }, { "epoch": 1.19, "grad_norm": 0.2158203125, "learning_rate": 0.00016737697711167836, "loss": 2.094, "step": 6435 }, { "epoch": 1.19, "grad_norm": 0.2158203125, "learning_rate": 0.00016705754799858585, "loss": 2.0662, "step": 6440 }, { "epoch": 1.2, "grad_norm": 0.220703125, "learning_rate": 0.0001667382052376204, "loss": 2.1005, "step": 6445 }, { "epoch": 1.2, "grad_norm": 0.212890625, "learning_rate": 0.00016641894966587618, "loss": 2.1408, "step": 6450 }, { "epoch": 1.2, "grad_norm": 0.2158203125, "learning_rate": 0.00016609978212021843, "loss": 2.104, "step": 6455 }, { "epoch": 1.2, "grad_norm": 0.21484375, "learning_rate": 0.00016578070343728181, "loss": 2.1289, "step": 6460 }, { "epoch": 1.2, "grad_norm": 0.2119140625, "learning_rate": 0.00016546171445346811, "loss": 2.0986, "step": 6465 }, { "epoch": 1.2, "grad_norm": 0.2158203125, "learning_rate": 0.00016514281600494378, "loss": 2.1056, "step": 6470 }, { "epoch": 1.2, "grad_norm": 0.2177734375, "learning_rate": 0.0001648240089276382, "loss": 2.1461, "step": 6475 }, { "epoch": 1.2, "grad_norm": 0.2216796875, "learning_rate": 0.00016450529405724097, "loss": 2.0915, "step": 6480 }, { "epoch": 1.2, "grad_norm": 0.2138671875, "learning_rate": 0.00016418667222920029, "loss": 2.086, "step": 6485 }, { "epoch": 1.2, "grad_norm": 0.220703125, "learning_rate": 0.00016386814427872025, "loss": 2.1414, "step": 6490 }, { "epoch": 1.21, "grad_norm": 0.224609375, "learning_rate": 0.00016354971104075888, "loss": 2.128, "step": 6495 }, { "epoch": 1.21, "grad_norm": 0.2138671875, "learning_rate": 0.000163231373350026, "loss": 2.1093, "step": 6500 }, { "epoch": 1.21, "grad_norm": 0.212890625, "learning_rate": 0.00016291313204098092, "loss": 2.1172, "step": 6505 }, { "epoch": 1.21, "grad_norm": 0.2177734375, "learning_rate": 0.00016259498794783043, "loss": 2.1398, "step": 6510 }, { "epoch": 1.21, "grad_norm": 0.2197265625, "learning_rate": 0.00016227694190452626, "loss": 2.1342, "step": 6515 }, { "epoch": 1.21, "grad_norm": 0.2158203125, "learning_rate": 0.00016195899474476345, "loss": 2.1023, "step": 6520 }, { "epoch": 1.21, "grad_norm": 0.220703125, "learning_rate": 0.00016164114730197744, "loss": 2.0859, "step": 6525 }, { "epoch": 1.21, "grad_norm": 0.2138671875, "learning_rate": 0.00016132340040934254, "loss": 2.1153, "step": 6530 }, { "epoch": 1.21, "grad_norm": 0.21875, "learning_rate": 0.00016100575489976947, "loss": 2.1159, "step": 6535 }, { "epoch": 1.21, "grad_norm": 0.2119140625, "learning_rate": 0.00016068821160590308, "loss": 2.08, "step": 6540 }, { "epoch": 1.21, "grad_norm": 0.2119140625, "learning_rate": 0.00016037077136012054, "loss": 2.1129, "step": 6545 }, { "epoch": 1.22, "grad_norm": 0.224609375, "learning_rate": 0.0001600534349945285, "loss": 2.1458, "step": 6550 }, { "epoch": 1.22, "grad_norm": 0.2197265625, "learning_rate": 0.00015973620334096159, "loss": 2.1361, "step": 6555 }, { "epoch": 1.22, "grad_norm": 0.2138671875, "learning_rate": 0.00015941907723097994, "loss": 2.1387, "step": 6560 }, { "epoch": 1.22, "grad_norm": 0.220703125, "learning_rate": 0.0001591020574958669, "loss": 2.1254, "step": 6565 }, { "epoch": 1.22, "grad_norm": 0.21875, "learning_rate": 0.00015878514496662715, "loss": 2.1272, "step": 6570 }, { "epoch": 1.22, "grad_norm": 0.216796875, "learning_rate": 0.0001584683404739841, "loss": 2.1448, "step": 6575 }, { "epoch": 1.22, "grad_norm": 0.2255859375, "learning_rate": 0.00015815164484837832, "loss": 2.1431, "step": 6580 }, { "epoch": 1.22, "grad_norm": 0.2177734375, "learning_rate": 0.00015783505891996466, "loss": 2.1139, "step": 6585 }, { "epoch": 1.22, "grad_norm": 0.2138671875, "learning_rate": 0.00015751858351861054, "loss": 2.1558, "step": 6590 }, { "epoch": 1.22, "grad_norm": 0.21875, "learning_rate": 0.0001572022194738938, "loss": 2.1249, "step": 6595 }, { "epoch": 1.22, "grad_norm": 0.2138671875, "learning_rate": 0.00015688596761510005, "loss": 2.1071, "step": 6600 }, { "epoch": 1.23, "grad_norm": 0.21875, "learning_rate": 0.00015656982877122134, "loss": 2.0894, "step": 6605 }, { "epoch": 1.23, "grad_norm": 0.212890625, "learning_rate": 0.0001562538037709529, "loss": 2.1351, "step": 6610 }, { "epoch": 1.23, "grad_norm": 0.2138671875, "learning_rate": 0.00015593789344269188, "loss": 2.1364, "step": 6615 }, { "epoch": 1.23, "grad_norm": 0.2197265625, "learning_rate": 0.00015562209861453487, "loss": 2.078, "step": 6620 }, { "epoch": 1.23, "grad_norm": 0.212890625, "learning_rate": 0.00015530642011427542, "loss": 2.1231, "step": 6625 }, { "epoch": 1.23, "grad_norm": 0.21484375, "learning_rate": 0.00015499085876940255, "loss": 2.1232, "step": 6630 }, { "epoch": 1.23, "grad_norm": 0.21875, "learning_rate": 0.00015467541540709772, "loss": 2.1206, "step": 6635 }, { "epoch": 1.23, "grad_norm": 0.216796875, "learning_rate": 0.00015436009085423354, "loss": 2.0971, "step": 6640 }, { "epoch": 1.23, "grad_norm": 0.220703125, "learning_rate": 0.000154044885937371, "loss": 2.0972, "step": 6645 }, { "epoch": 1.23, "grad_norm": 0.2177734375, "learning_rate": 0.0001537298014827573, "loss": 2.1167, "step": 6650 }, { "epoch": 1.23, "grad_norm": 0.2099609375, "learning_rate": 0.00015341483831632434, "loss": 2.0823, "step": 6655 }, { "epoch": 1.24, "grad_norm": 0.2177734375, "learning_rate": 0.00015309999726368555, "loss": 2.138, "step": 6660 }, { "epoch": 1.24, "grad_norm": 0.2197265625, "learning_rate": 0.0001527852791501347, "loss": 2.1009, "step": 6665 }, { "epoch": 1.24, "grad_norm": 0.2197265625, "learning_rate": 0.00015247068480064307, "loss": 2.0907, "step": 6670 }, { "epoch": 1.24, "grad_norm": 0.21875, "learning_rate": 0.00015215621503985758, "loss": 2.0969, "step": 6675 }, { "epoch": 1.24, "grad_norm": 0.2158203125, "learning_rate": 0.00015184187069209858, "loss": 2.1056, "step": 6680 }, { "epoch": 1.24, "grad_norm": 0.21484375, "learning_rate": 0.00015152765258135754, "loss": 2.0957, "step": 6685 }, { "epoch": 1.24, "grad_norm": 0.2216796875, "learning_rate": 0.00015121356153129526, "loss": 2.1561, "step": 6690 }, { "epoch": 1.24, "grad_norm": 0.2158203125, "learning_rate": 0.00015089959836523927, "loss": 2.1067, "step": 6695 }, { "epoch": 1.24, "grad_norm": 0.212890625, "learning_rate": 0.00015058576390618205, "loss": 2.1244, "step": 6700 }, { "epoch": 1.24, "grad_norm": 0.21484375, "learning_rate": 0.00015027205897677855, "loss": 2.1364, "step": 6705 }, { "epoch": 1.24, "grad_norm": 0.2138671875, "learning_rate": 0.00014995848439934418, "loss": 2.0978, "step": 6710 }, { "epoch": 1.25, "grad_norm": 0.21484375, "learning_rate": 0.00014964504099585283, "loss": 2.1216, "step": 6715 }, { "epoch": 1.25, "grad_norm": 0.21484375, "learning_rate": 0.00014933172958793436, "loss": 2.0968, "step": 6720 }, { "epoch": 1.25, "grad_norm": 0.2119140625, "learning_rate": 0.00014901855099687275, "loss": 2.0871, "step": 6725 }, { "epoch": 1.25, "grad_norm": 0.22265625, "learning_rate": 0.00014870550604360373, "loss": 2.1265, "step": 6730 }, { "epoch": 1.25, "grad_norm": 0.2138671875, "learning_rate": 0.0001483925955487129, "loss": 2.0967, "step": 6735 }, { "epoch": 1.25, "grad_norm": 0.220703125, "learning_rate": 0.00014807982033243313, "loss": 2.1202, "step": 6740 }, { "epoch": 1.25, "grad_norm": 0.2099609375, "learning_rate": 0.00014776718121464283, "loss": 2.0886, "step": 6745 }, { "epoch": 1.25, "grad_norm": 0.224609375, "learning_rate": 0.00014745467901486377, "loss": 2.112, "step": 6750 }, { "epoch": 1.25, "grad_norm": 0.2138671875, "learning_rate": 0.00014714231455225862, "loss": 2.1163, "step": 6755 }, { "epoch": 1.25, "grad_norm": 0.2236328125, "learning_rate": 0.00014683008864562917, "loss": 2.1304, "step": 6760 }, { "epoch": 1.26, "grad_norm": 0.212890625, "learning_rate": 0.00014651800211341385, "loss": 2.0945, "step": 6765 }, { "epoch": 1.26, "grad_norm": 0.216796875, "learning_rate": 0.0001462060557736858, "loss": 2.1094, "step": 6770 }, { "epoch": 1.26, "grad_norm": 0.21875, "learning_rate": 0.00014589425044415075, "loss": 2.0669, "step": 6775 }, { "epoch": 1.26, "grad_norm": 0.224609375, "learning_rate": 0.0001455825869421447, "loss": 2.1211, "step": 6780 }, { "epoch": 1.26, "grad_norm": 0.2138671875, "learning_rate": 0.00014527106608463206, "loss": 2.1007, "step": 6785 }, { "epoch": 1.26, "grad_norm": 0.22265625, "learning_rate": 0.000144959688688203, "loss": 2.1065, "step": 6790 }, { "epoch": 1.26, "grad_norm": 0.216796875, "learning_rate": 0.00014464845556907196, "loss": 2.089, "step": 6795 }, { "epoch": 1.26, "grad_norm": 0.2138671875, "learning_rate": 0.000144337367543075, "loss": 2.14, "step": 6800 }, { "epoch": 1.26, "grad_norm": 0.2197265625, "learning_rate": 0.00014402642542566782, "loss": 2.1439, "step": 6805 }, { "epoch": 1.26, "grad_norm": 0.20703125, "learning_rate": 0.00014371563003192392, "loss": 2.0581, "step": 6810 }, { "epoch": 1.26, "grad_norm": 0.21875, "learning_rate": 0.0001434049821765318, "loss": 2.1186, "step": 6815 }, { "epoch": 1.27, "grad_norm": 0.2197265625, "learning_rate": 0.00014309448267379353, "loss": 2.1526, "step": 6820 }, { "epoch": 1.27, "grad_norm": 0.2138671875, "learning_rate": 0.00014278413233762214, "loss": 2.071, "step": 6825 }, { "epoch": 1.27, "grad_norm": 0.21484375, "learning_rate": 0.00014247393198153974, "loss": 2.1107, "step": 6830 }, { "epoch": 1.27, "grad_norm": 0.2177734375, "learning_rate": 0.0001421638824186753, "loss": 2.1339, "step": 6835 }, { "epoch": 1.27, "grad_norm": 0.2099609375, "learning_rate": 0.0001418539844617623, "loss": 2.1236, "step": 6840 }, { "epoch": 1.27, "grad_norm": 0.22265625, "learning_rate": 0.00014154423892313712, "loss": 2.1086, "step": 6845 }, { "epoch": 1.27, "grad_norm": 0.2216796875, "learning_rate": 0.00014123464661473646, "loss": 2.0773, "step": 6850 }, { "epoch": 1.27, "grad_norm": 0.220703125, "learning_rate": 0.00014092520834809534, "loss": 2.1404, "step": 6855 }, { "epoch": 1.27, "grad_norm": 0.2177734375, "learning_rate": 0.0001406159249343451, "loss": 2.0654, "step": 6860 }, { "epoch": 1.27, "grad_norm": 0.21875, "learning_rate": 0.0001403067971842109, "loss": 2.0919, "step": 6865 }, { "epoch": 1.27, "grad_norm": 0.228515625, "learning_rate": 0.00013999782590801022, "loss": 2.0763, "step": 6870 }, { "epoch": 1.28, "grad_norm": 0.2109375, "learning_rate": 0.0001396890119156501, "loss": 2.109, "step": 6875 }, { "epoch": 1.28, "grad_norm": 0.2294921875, "learning_rate": 0.00013938035601662545, "loss": 2.117, "step": 6880 }, { "epoch": 1.28, "grad_norm": 0.2275390625, "learning_rate": 0.00013907185902001663, "loss": 2.1378, "step": 6885 }, { "epoch": 1.28, "grad_norm": 0.216796875, "learning_rate": 0.00013876352173448764, "loss": 2.1208, "step": 6890 }, { "epoch": 1.28, "grad_norm": 0.208984375, "learning_rate": 0.00013845534496828368, "loss": 2.1245, "step": 6895 }, { "epoch": 1.28, "grad_norm": 0.2177734375, "learning_rate": 0.00013814732952922918, "loss": 2.1393, "step": 6900 }, { "epoch": 1.28, "grad_norm": 0.22265625, "learning_rate": 0.00013783947622472587, "loss": 2.1493, "step": 6905 }, { "epoch": 1.28, "grad_norm": 0.21875, "learning_rate": 0.0001375317858617502, "loss": 2.1436, "step": 6910 }, { "epoch": 1.28, "grad_norm": 0.2158203125, "learning_rate": 0.0001372242592468518, "loss": 2.1092, "step": 6915 }, { "epoch": 1.28, "grad_norm": 0.21875, "learning_rate": 0.00013691689718615079, "loss": 2.1213, "step": 6920 }, { "epoch": 1.28, "grad_norm": 0.2119140625, "learning_rate": 0.00013660970048533607, "loss": 2.0764, "step": 6925 }, { "epoch": 1.29, "grad_norm": 0.2119140625, "learning_rate": 0.00013630266994966314, "loss": 2.124, "step": 6930 }, { "epoch": 1.29, "grad_norm": 0.2197265625, "learning_rate": 0.0001359958063839518, "loss": 2.1211, "step": 6935 }, { "epoch": 1.29, "grad_norm": 0.224609375, "learning_rate": 0.00013568911059258436, "loss": 2.0699, "step": 6940 }, { "epoch": 1.29, "grad_norm": 0.2138671875, "learning_rate": 0.00013538258337950302, "loss": 2.0943, "step": 6945 }, { "epoch": 1.29, "grad_norm": 0.2177734375, "learning_rate": 0.0001350762255482085, "loss": 2.1063, "step": 6950 }, { "epoch": 1.29, "grad_norm": 0.21875, "learning_rate": 0.0001347700379017572, "loss": 2.1212, "step": 6955 }, { "epoch": 1.29, "grad_norm": 0.2138671875, "learning_rate": 0.00013446402124275947, "loss": 2.0913, "step": 6960 }, { "epoch": 1.29, "grad_norm": 0.224609375, "learning_rate": 0.00013415817637337768, "loss": 2.1277, "step": 6965 }, { "epoch": 1.29, "grad_norm": 0.224609375, "learning_rate": 0.00013385250409532343, "loss": 2.1266, "step": 6970 }, { "epoch": 1.29, "grad_norm": 0.22265625, "learning_rate": 0.0001335470052098565, "loss": 2.0951, "step": 6975 }, { "epoch": 1.29, "grad_norm": 0.2177734375, "learning_rate": 0.00013324168051778162, "loss": 2.1073, "step": 6980 }, { "epoch": 1.3, "grad_norm": 0.2138671875, "learning_rate": 0.00013293653081944716, "loss": 2.1286, "step": 6985 }, { "epoch": 1.3, "grad_norm": 0.21484375, "learning_rate": 0.00013263155691474286, "loss": 2.0843, "step": 6990 }, { "epoch": 1.3, "grad_norm": 0.21875, "learning_rate": 0.0001323267596030973, "loss": 2.1228, "step": 6995 }, { "epoch": 1.3, "grad_norm": 0.2216796875, "learning_rate": 0.0001320221396834767, "loss": 2.126, "step": 7000 }, { "epoch": 1.3, "grad_norm": 0.2177734375, "learning_rate": 0.0001317176979543817, "loss": 2.0865, "step": 7005 }, { "epoch": 1.3, "grad_norm": 0.21875, "learning_rate": 0.00013141343521384634, "loss": 2.1284, "step": 7010 }, { "epoch": 1.3, "grad_norm": 0.228515625, "learning_rate": 0.00013110935225943516, "loss": 2.0905, "step": 7015 }, { "epoch": 1.3, "grad_norm": 0.21484375, "learning_rate": 0.0001308054498882414, "loss": 2.0852, "step": 7020 }, { "epoch": 1.3, "grad_norm": 0.212890625, "learning_rate": 0.00013050172889688536, "loss": 2.1067, "step": 7025 }, { "epoch": 1.3, "grad_norm": 0.21484375, "learning_rate": 0.00013019819008151128, "loss": 2.0757, "step": 7030 }, { "epoch": 1.31, "grad_norm": 0.2138671875, "learning_rate": 0.0001298948342377864, "loss": 2.1301, "step": 7035 }, { "epoch": 1.31, "grad_norm": 0.2177734375, "learning_rate": 0.00012959166216089797, "loss": 2.0895, "step": 7040 }, { "epoch": 1.31, "grad_norm": 0.21484375, "learning_rate": 0.00012928867464555177, "loss": 2.0816, "step": 7045 }, { "epoch": 1.31, "grad_norm": 0.216796875, "learning_rate": 0.0001289858724859697, "loss": 2.1101, "step": 7050 }, { "epoch": 1.31, "grad_norm": 0.21875, "learning_rate": 0.0001286832564758876, "loss": 2.1058, "step": 7055 }, { "epoch": 1.31, "grad_norm": 0.2177734375, "learning_rate": 0.00012838082740855371, "loss": 2.1263, "step": 7060 }, { "epoch": 1.31, "grad_norm": 0.212890625, "learning_rate": 0.00012807858607672597, "loss": 2.0942, "step": 7065 }, { "epoch": 1.31, "grad_norm": 0.2216796875, "learning_rate": 0.00012777653327267037, "loss": 2.1127, "step": 7070 }, { "epoch": 1.31, "grad_norm": 0.220703125, "learning_rate": 0.00012747466978815862, "loss": 2.0978, "step": 7075 }, { "epoch": 1.31, "grad_norm": 0.2197265625, "learning_rate": 0.00012717299641446612, "loss": 2.1021, "step": 7080 }, { "epoch": 1.31, "grad_norm": 0.21484375, "learning_rate": 0.00012687151394237005, "loss": 2.1147, "step": 7085 }, { "epoch": 1.32, "grad_norm": 0.2119140625, "learning_rate": 0.00012657022316214705, "loss": 2.0918, "step": 7090 }, { "epoch": 1.32, "grad_norm": 0.216796875, "learning_rate": 0.00012626912486357147, "loss": 2.1089, "step": 7095 }, { "epoch": 1.32, "grad_norm": 0.216796875, "learning_rate": 0.00012596821983591289, "loss": 2.1268, "step": 7100 }, { "epoch": 1.32, "grad_norm": 0.21875, "learning_rate": 0.0001256675088679345, "loss": 2.1026, "step": 7105 }, { "epoch": 1.32, "grad_norm": 0.2177734375, "learning_rate": 0.00012536699274789059, "loss": 2.0627, "step": 7110 }, { "epoch": 1.32, "grad_norm": 0.2255859375, "learning_rate": 0.0001250666722635247, "loss": 2.0675, "step": 7115 }, { "epoch": 1.32, "grad_norm": 0.2197265625, "learning_rate": 0.00012476654820206773, "loss": 2.1035, "step": 7120 }, { "epoch": 1.32, "grad_norm": 0.224609375, "learning_rate": 0.0001244666213502355, "loss": 2.1104, "step": 7125 }, { "epoch": 1.32, "grad_norm": 0.21875, "learning_rate": 0.00012416689249422714, "loss": 2.0844, "step": 7130 }, { "epoch": 1.32, "grad_norm": 0.22265625, "learning_rate": 0.00012386736241972246, "loss": 2.1405, "step": 7135 }, { "epoch": 1.32, "grad_norm": 0.2109375, "learning_rate": 0.00012356803191188034, "loss": 2.1525, "step": 7140 }, { "epoch": 1.33, "grad_norm": 0.216796875, "learning_rate": 0.0001232689017553366, "loss": 2.1443, "step": 7145 }, { "epoch": 1.33, "grad_norm": 0.216796875, "learning_rate": 0.00012296997273420184, "loss": 2.1317, "step": 7150 }, { "epoch": 1.33, "grad_norm": 0.224609375, "learning_rate": 0.00012267124563205946, "loss": 2.0839, "step": 7155 }, { "epoch": 1.33, "grad_norm": 0.2177734375, "learning_rate": 0.0001223727212319633, "loss": 2.0955, "step": 7160 }, { "epoch": 1.33, "grad_norm": 0.2236328125, "learning_rate": 0.00012207440031643637, "loss": 2.1137, "step": 7165 }, { "epoch": 1.33, "grad_norm": 0.2294921875, "learning_rate": 0.0001217762836674678, "loss": 2.0972, "step": 7170 }, { "epoch": 1.33, "grad_norm": 0.21875, "learning_rate": 0.00012147837206651148, "loss": 2.1198, "step": 7175 }, { "epoch": 1.33, "grad_norm": 0.2197265625, "learning_rate": 0.00012118066629448388, "loss": 2.0673, "step": 7180 }, { "epoch": 1.33, "grad_norm": 0.2177734375, "learning_rate": 0.00012088316713176166, "loss": 2.1093, "step": 7185 }, { "epoch": 1.33, "grad_norm": 0.21484375, "learning_rate": 0.00012058587535818036, "loss": 2.0892, "step": 7190 }, { "epoch": 1.33, "grad_norm": 0.21875, "learning_rate": 0.00012028879175303137, "loss": 2.1102, "step": 7195 }, { "epoch": 1.34, "grad_norm": 0.220703125, "learning_rate": 0.00011999191709506072, "loss": 2.1176, "step": 7200 }, { "epoch": 1.34, "grad_norm": 0.21875, "learning_rate": 0.00011969525216246673, "loss": 2.0999, "step": 7205 }, { "epoch": 1.34, "grad_norm": 0.212890625, "learning_rate": 0.00011939879773289768, "loss": 2.0787, "step": 7210 }, { "epoch": 1.34, "grad_norm": 0.2265625, "learning_rate": 0.00011910255458345055, "loss": 2.1507, "step": 7215 }, { "epoch": 1.34, "grad_norm": 0.2216796875, "learning_rate": 0.00011880652349066798, "loss": 2.1248, "step": 7220 }, { "epoch": 1.34, "grad_norm": 0.2236328125, "learning_rate": 0.00011851070523053707, "loss": 2.0911, "step": 7225 }, { "epoch": 1.34, "grad_norm": 0.2138671875, "learning_rate": 0.00011821510057848695, "loss": 2.1235, "step": 7230 }, { "epoch": 1.34, "grad_norm": 0.2138671875, "learning_rate": 0.00011791971030938662, "loss": 2.1035, "step": 7235 }, { "epoch": 1.34, "grad_norm": 0.21875, "learning_rate": 0.00011762453519754357, "loss": 2.1228, "step": 7240 }, { "epoch": 1.34, "grad_norm": 0.212890625, "learning_rate": 0.00011732957601670076, "loss": 2.0994, "step": 7245 }, { "epoch": 1.35, "grad_norm": 0.2197265625, "learning_rate": 0.00011703483354003553, "loss": 2.1336, "step": 7250 }, { "epoch": 1.35, "grad_norm": 0.21875, "learning_rate": 0.00011674030854015696, "loss": 2.108, "step": 7255 }, { "epoch": 1.35, "grad_norm": 0.220703125, "learning_rate": 0.00011644600178910421, "loss": 2.1178, "step": 7260 }, { "epoch": 1.35, "grad_norm": 0.21875, "learning_rate": 0.0001161519140583442, "loss": 2.1162, "step": 7265 }, { "epoch": 1.35, "grad_norm": 0.2119140625, "learning_rate": 0.0001158580461187698, "loss": 2.0816, "step": 7270 }, { "epoch": 1.35, "grad_norm": 0.21484375, "learning_rate": 0.00011556439874069773, "loss": 2.1076, "step": 7275 }, { "epoch": 1.35, "grad_norm": 0.216796875, "learning_rate": 0.00011527097269386655, "loss": 2.1465, "step": 7280 }, { "epoch": 1.35, "grad_norm": 0.2216796875, "learning_rate": 0.00011497776874743471, "loss": 2.1064, "step": 7285 }, { "epoch": 1.35, "grad_norm": 0.2158203125, "learning_rate": 0.00011468478766997832, "loss": 2.0849, "step": 7290 }, { "epoch": 1.35, "grad_norm": 0.208984375, "learning_rate": 0.00011439203022948935, "loss": 2.1352, "step": 7295 }, { "epoch": 1.35, "grad_norm": 0.2177734375, "learning_rate": 0.00011409949719337376, "loss": 2.1146, "step": 7300 }, { "epoch": 1.36, "grad_norm": 0.220703125, "learning_rate": 0.00011380718932844882, "loss": 2.1264, "step": 7305 }, { "epoch": 1.36, "grad_norm": 0.216796875, "learning_rate": 0.00011351510740094205, "loss": 2.1164, "step": 7310 }, { "epoch": 1.36, "grad_norm": 0.220703125, "learning_rate": 0.00011322325217648839, "loss": 2.104, "step": 7315 }, { "epoch": 1.36, "grad_norm": 0.2138671875, "learning_rate": 0.00011293162442012866, "loss": 2.0908, "step": 7320 }, { "epoch": 1.36, "grad_norm": 0.2158203125, "learning_rate": 0.00011264022489630737, "loss": 2.1208, "step": 7325 }, { "epoch": 1.36, "grad_norm": 0.2177734375, "learning_rate": 0.00011234905436887078, "loss": 2.1089, "step": 7330 }, { "epoch": 1.36, "grad_norm": 0.22265625, "learning_rate": 0.0001120581136010649, "loss": 2.1016, "step": 7335 }, { "epoch": 1.36, "grad_norm": 0.21875, "learning_rate": 0.00011176740335553333, "loss": 2.1079, "step": 7340 }, { "epoch": 1.36, "grad_norm": 0.2197265625, "learning_rate": 0.00011147692439431572, "loss": 2.1365, "step": 7345 }, { "epoch": 1.36, "grad_norm": 0.2119140625, "learning_rate": 0.00011118667747884517, "loss": 2.0991, "step": 7350 }, { "epoch": 1.36, "grad_norm": 0.2275390625, "learning_rate": 0.0001108966633699466, "loss": 2.1581, "step": 7355 }, { "epoch": 1.37, "grad_norm": 0.216796875, "learning_rate": 0.00011060688282783469, "loss": 2.1253, "step": 7360 }, { "epoch": 1.37, "grad_norm": 0.2158203125, "learning_rate": 0.00011031733661211184, "loss": 2.0904, "step": 7365 }, { "epoch": 1.37, "grad_norm": 0.216796875, "learning_rate": 0.00011002802548176623, "loss": 2.0793, "step": 7370 }, { "epoch": 1.37, "grad_norm": 0.2138671875, "learning_rate": 0.00010973895019516974, "loss": 2.1235, "step": 7375 }, { "epoch": 1.37, "grad_norm": 0.21875, "learning_rate": 0.00010945011151007634, "loss": 2.1639, "step": 7380 }, { "epoch": 1.37, "grad_norm": 0.21484375, "learning_rate": 0.00010916151018361929, "loss": 2.106, "step": 7385 }, { "epoch": 1.37, "grad_norm": 0.2197265625, "learning_rate": 0.00010887314697230997, "loss": 2.1463, "step": 7390 }, { "epoch": 1.37, "grad_norm": 0.2158203125, "learning_rate": 0.0001085850226320357, "loss": 2.1063, "step": 7395 }, { "epoch": 1.37, "grad_norm": 0.216796875, "learning_rate": 0.00010829713791805738, "loss": 2.1395, "step": 7400 }, { "epoch": 1.37, "grad_norm": 0.2138671875, "learning_rate": 0.00010800949358500794, "loss": 2.1373, "step": 7405 }, { "epoch": 1.37, "grad_norm": 0.2216796875, "learning_rate": 0.00010772209038689003, "loss": 2.1083, "step": 7410 }, { "epoch": 1.38, "grad_norm": 0.216796875, "learning_rate": 0.00010743492907707455, "loss": 2.1363, "step": 7415 }, { "epoch": 1.38, "grad_norm": 0.216796875, "learning_rate": 0.00010714801040829796, "loss": 2.1342, "step": 7420 }, { "epoch": 1.38, "grad_norm": 0.2216796875, "learning_rate": 0.00010686133513266079, "loss": 2.1208, "step": 7425 }, { "epoch": 1.38, "grad_norm": 0.2138671875, "learning_rate": 0.00010657490400162584, "loss": 2.118, "step": 7430 }, { "epoch": 1.38, "grad_norm": 0.2177734375, "learning_rate": 0.00010628871776601542, "loss": 2.1088, "step": 7435 }, { "epoch": 1.38, "grad_norm": 0.2197265625, "learning_rate": 0.00010600277717601042, "loss": 2.1007, "step": 7440 }, { "epoch": 1.38, "grad_norm": 0.220703125, "learning_rate": 0.00010571708298114751, "loss": 2.1492, "step": 7445 }, { "epoch": 1.38, "grad_norm": 0.22265625, "learning_rate": 0.00010543163593031753, "loss": 2.1433, "step": 7450 }, { "epoch": 1.38, "grad_norm": 0.212890625, "learning_rate": 0.00010514643677176354, "loss": 2.1069, "step": 7455 }, { "epoch": 1.38, "grad_norm": 0.2197265625, "learning_rate": 0.00010486148625307868, "loss": 2.1463, "step": 7460 }, { "epoch": 1.38, "grad_norm": 0.216796875, "learning_rate": 0.00010457678512120463, "loss": 2.0955, "step": 7465 }, { "epoch": 1.39, "grad_norm": 0.2294921875, "learning_rate": 0.00010429233412242887, "loss": 2.0892, "step": 7470 }, { "epoch": 1.39, "grad_norm": 0.2099609375, "learning_rate": 0.00010400813400238368, "loss": 2.0786, "step": 7475 }, { "epoch": 1.39, "grad_norm": 0.2158203125, "learning_rate": 0.00010372418550604351, "loss": 2.1339, "step": 7480 }, { "epoch": 1.39, "grad_norm": 0.21875, "learning_rate": 0.00010344048937772297, "loss": 2.1183, "step": 7485 }, { "epoch": 1.39, "grad_norm": 0.2255859375, "learning_rate": 0.00010315704636107564, "loss": 2.1244, "step": 7490 }, { "epoch": 1.39, "grad_norm": 0.216796875, "learning_rate": 0.0001028738571990913, "loss": 2.0664, "step": 7495 }, { "epoch": 1.39, "grad_norm": 0.216796875, "learning_rate": 0.0001025909226340944, "loss": 2.1083, "step": 7500 }, { "epoch": 1.39, "grad_norm": 0.216796875, "learning_rate": 0.0001023082434077419, "loss": 2.1205, "step": 7505 }, { "epoch": 1.39, "grad_norm": 0.2138671875, "learning_rate": 0.00010202582026102153, "loss": 2.1116, "step": 7510 }, { "epoch": 1.39, "grad_norm": 0.2236328125, "learning_rate": 0.00010174365393424992, "loss": 2.1348, "step": 7515 }, { "epoch": 1.4, "grad_norm": 0.2177734375, "learning_rate": 0.00010146174516707005, "loss": 2.1105, "step": 7520 }, { "epoch": 1.4, "grad_norm": 0.21875, "learning_rate": 0.00010118009469845027, "loss": 2.1106, "step": 7525 }, { "epoch": 1.4, "grad_norm": 0.216796875, "learning_rate": 0.00010089870326668154, "loss": 2.1576, "step": 7530 }, { "epoch": 1.4, "grad_norm": 0.21875, "learning_rate": 0.00010061757160937586, "loss": 2.1146, "step": 7535 }, { "epoch": 1.4, "grad_norm": 0.2177734375, "learning_rate": 0.00010033670046346433, "loss": 2.0823, "step": 7540 }, { "epoch": 1.4, "grad_norm": 0.216796875, "learning_rate": 0.00010005609056519513, "loss": 2.0989, "step": 7545 }, { "epoch": 1.4, "grad_norm": 0.2255859375, "learning_rate": 9.97757426501317e-05, "loss": 2.1095, "step": 7550 }, { "epoch": 1.4, "grad_norm": 0.2177734375, "learning_rate": 9.949565745315055e-05, "loss": 2.1397, "step": 7555 }, { "epoch": 1.4, "grad_norm": 0.2158203125, "learning_rate": 9.921583570843986e-05, "loss": 2.0911, "step": 7560 }, { "epoch": 1.4, "grad_norm": 0.220703125, "learning_rate": 9.893627814949693e-05, "loss": 2.1112, "step": 7565 }, { "epoch": 1.4, "grad_norm": 0.2216796875, "learning_rate": 9.865698550912667e-05, "loss": 2.0844, "step": 7570 }, { "epoch": 1.41, "grad_norm": 0.216796875, "learning_rate": 9.837795851943954e-05, "loss": 2.1082, "step": 7575 }, { "epoch": 1.41, "grad_norm": 0.2138671875, "learning_rate": 9.809919791184963e-05, "loss": 2.1108, "step": 7580 }, { "epoch": 1.41, "grad_norm": 0.216796875, "learning_rate": 9.782070441707276e-05, "loss": 2.1201, "step": 7585 }, { "epoch": 1.41, "grad_norm": 0.212890625, "learning_rate": 9.754247876512457e-05, "loss": 2.098, "step": 7590 }, { "epoch": 1.41, "grad_norm": 0.2236328125, "learning_rate": 9.726452168531879e-05, "loss": 2.1319, "step": 7595 }, { "epoch": 1.41, "grad_norm": 0.2265625, "learning_rate": 9.698683390626476e-05, "loss": 2.145, "step": 7600 }, { "epoch": 1.41, "grad_norm": 0.2216796875, "learning_rate": 9.67094161558661e-05, "loss": 2.1141, "step": 7605 }, { "epoch": 1.41, "grad_norm": 0.216796875, "learning_rate": 9.64322691613188e-05, "loss": 2.0464, "step": 7610 }, { "epoch": 1.41, "grad_norm": 0.21875, "learning_rate": 9.615539364910881e-05, "loss": 2.1155, "step": 7615 }, { "epoch": 1.41, "grad_norm": 0.22265625, "learning_rate": 9.587879034501062e-05, "loss": 2.1043, "step": 7620 }, { "epoch": 1.41, "grad_norm": 0.228515625, "learning_rate": 9.5602459974085e-05, "loss": 2.1154, "step": 7625 }, { "epoch": 1.42, "grad_norm": 0.2138671875, "learning_rate": 9.532640326067763e-05, "loss": 2.0965, "step": 7630 }, { "epoch": 1.42, "grad_norm": 0.2216796875, "learning_rate": 9.505062092841644e-05, "loss": 2.0938, "step": 7635 }, { "epoch": 1.42, "grad_norm": 0.2236328125, "learning_rate": 9.477511370021026e-05, "loss": 2.0867, "step": 7640 }, { "epoch": 1.42, "grad_norm": 0.2158203125, "learning_rate": 9.44998822982471e-05, "loss": 2.0947, "step": 7645 }, { "epoch": 1.42, "grad_norm": 0.216796875, "learning_rate": 9.422492744399137e-05, "loss": 2.1152, "step": 7650 }, { "epoch": 1.42, "grad_norm": 0.2255859375, "learning_rate": 9.395024985818309e-05, "loss": 2.0936, "step": 7655 }, { "epoch": 1.42, "grad_norm": 0.212890625, "learning_rate": 9.367585026083518e-05, "loss": 2.1191, "step": 7660 }, { "epoch": 1.42, "grad_norm": 0.2197265625, "learning_rate": 9.340172937123201e-05, "loss": 2.1362, "step": 7665 }, { "epoch": 1.42, "grad_norm": 0.2197265625, "learning_rate": 9.312788790792728e-05, "loss": 2.1423, "step": 7670 }, { "epoch": 1.42, "grad_norm": 0.21875, "learning_rate": 9.285432658874216e-05, "loss": 2.1184, "step": 7675 }, { "epoch": 1.42, "grad_norm": 0.21484375, "learning_rate": 9.258104613076385e-05, "loss": 2.1006, "step": 7680 }, { "epoch": 1.43, "grad_norm": 0.2216796875, "learning_rate": 9.230804725034274e-05, "loss": 2.0857, "step": 7685 }, { "epoch": 1.43, "grad_norm": 0.2275390625, "learning_rate": 9.203533066309168e-05, "loss": 2.1165, "step": 7690 }, { "epoch": 1.43, "grad_norm": 0.228515625, "learning_rate": 9.176289708388329e-05, "loss": 2.1702, "step": 7695 }, { "epoch": 1.43, "grad_norm": 0.2275390625, "learning_rate": 9.149074722684814e-05, "loss": 2.1102, "step": 7700 }, { "epoch": 1.43, "grad_norm": 0.2177734375, "learning_rate": 9.121888180537348e-05, "loss": 2.084, "step": 7705 }, { "epoch": 1.43, "grad_norm": 0.2138671875, "learning_rate": 9.094730153210076e-05, "loss": 2.1463, "step": 7710 }, { "epoch": 1.43, "grad_norm": 0.2177734375, "learning_rate": 9.067600711892396e-05, "loss": 2.1166, "step": 7715 }, { "epoch": 1.43, "grad_norm": 0.220703125, "learning_rate": 9.04049992769877e-05, "loss": 2.1383, "step": 7720 }, { "epoch": 1.43, "grad_norm": 0.216796875, "learning_rate": 9.013427871668562e-05, "loss": 2.11, "step": 7725 }, { "epoch": 1.43, "grad_norm": 0.2177734375, "learning_rate": 8.986384614765817e-05, "loss": 2.1083, "step": 7730 }, { "epoch": 1.44, "grad_norm": 0.21875, "learning_rate": 8.959370227879067e-05, "loss": 2.1375, "step": 7735 }, { "epoch": 1.44, "grad_norm": 0.2177734375, "learning_rate": 8.932384781821208e-05, "loss": 2.111, "step": 7740 }, { "epoch": 1.44, "grad_norm": 0.2177734375, "learning_rate": 8.905428347329245e-05, "loss": 2.1026, "step": 7745 }, { "epoch": 1.44, "grad_norm": 0.21875, "learning_rate": 8.878500995064148e-05, "loss": 2.1506, "step": 7750 }, { "epoch": 1.44, "grad_norm": 0.2177734375, "learning_rate": 8.851602795610646e-05, "loss": 2.1432, "step": 7755 }, { "epoch": 1.44, "grad_norm": 0.2158203125, "learning_rate": 8.824733819477051e-05, "loss": 2.0726, "step": 7760 }, { "epoch": 1.44, "grad_norm": 0.2109375, "learning_rate": 8.797894137095077e-05, "loss": 2.0853, "step": 7765 }, { "epoch": 1.44, "grad_norm": 0.2236328125, "learning_rate": 8.77108381881964e-05, "loss": 2.1539, "step": 7770 }, { "epoch": 1.44, "grad_norm": 0.220703125, "learning_rate": 8.744302934928701e-05, "loss": 2.0791, "step": 7775 }, { "epoch": 1.44, "grad_norm": 0.2119140625, "learning_rate": 8.717551555623051e-05, "loss": 2.0752, "step": 7780 }, { "epoch": 1.44, "grad_norm": 0.220703125, "learning_rate": 8.690829751026141e-05, "loss": 2.1238, "step": 7785 }, { "epoch": 1.45, "grad_norm": 0.2333984375, "learning_rate": 8.664137591183901e-05, "loss": 2.1237, "step": 7790 }, { "epoch": 1.45, "grad_norm": 0.2119140625, "learning_rate": 8.637475146064554e-05, "loss": 2.1183, "step": 7795 }, { "epoch": 1.45, "grad_norm": 0.220703125, "learning_rate": 8.610842485558428e-05, "loss": 2.1416, "step": 7800 }, { "epoch": 1.45, "grad_norm": 0.2255859375, "learning_rate": 8.584239679477775e-05, "loss": 2.1213, "step": 7805 }, { "epoch": 1.45, "grad_norm": 0.2177734375, "learning_rate": 8.557666797556612e-05, "loss": 2.1557, "step": 7810 }, { "epoch": 1.45, "grad_norm": 0.228515625, "learning_rate": 8.531123909450476e-05, "loss": 2.0979, "step": 7815 }, { "epoch": 1.45, "grad_norm": 0.21875, "learning_rate": 8.504611084736305e-05, "loss": 2.1238, "step": 7820 }, { "epoch": 1.45, "grad_norm": 0.2138671875, "learning_rate": 8.478128392912239e-05, "loss": 2.1331, "step": 7825 }, { "epoch": 1.45, "grad_norm": 0.2119140625, "learning_rate": 8.451675903397416e-05, "loss": 2.114, "step": 7830 }, { "epoch": 1.45, "grad_norm": 0.21875, "learning_rate": 8.425253685531809e-05, "loss": 2.152, "step": 7835 }, { "epoch": 1.45, "grad_norm": 0.2109375, "learning_rate": 8.39886180857604e-05, "loss": 2.1003, "step": 7840 }, { "epoch": 1.46, "grad_norm": 0.220703125, "learning_rate": 8.372500341711199e-05, "loss": 2.1093, "step": 7845 }, { "epoch": 1.46, "grad_norm": 0.212890625, "learning_rate": 8.346169354038657e-05, "loss": 2.1132, "step": 7850 }, { "epoch": 1.46, "grad_norm": 0.2099609375, "learning_rate": 8.319868914579894e-05, "loss": 2.0899, "step": 7855 }, { "epoch": 1.46, "grad_norm": 0.2236328125, "learning_rate": 8.293599092276332e-05, "loss": 2.1106, "step": 7860 }, { "epoch": 1.46, "grad_norm": 0.224609375, "learning_rate": 8.267359955989087e-05, "loss": 2.1119, "step": 7865 }, { "epoch": 1.46, "grad_norm": 0.2197265625, "learning_rate": 8.241151574498896e-05, "loss": 2.1188, "step": 7870 }, { "epoch": 1.46, "grad_norm": 0.220703125, "learning_rate": 8.214974016505838e-05, "loss": 2.0889, "step": 7875 }, { "epoch": 1.46, "grad_norm": 0.216796875, "learning_rate": 8.188827350629213e-05, "loss": 2.1212, "step": 7880 }, { "epoch": 1.46, "grad_norm": 0.2236328125, "learning_rate": 8.162711645407335e-05, "loss": 2.111, "step": 7885 }, { "epoch": 1.46, "grad_norm": 0.2177734375, "learning_rate": 8.136626969297365e-05, "loss": 2.1206, "step": 7890 }, { "epoch": 1.46, "grad_norm": 0.2138671875, "learning_rate": 8.110573390675125e-05, "loss": 2.0869, "step": 7895 }, { "epoch": 1.47, "grad_norm": 0.2265625, "learning_rate": 8.084550977834915e-05, "loss": 2.1158, "step": 7900 }, { "epoch": 1.47, "grad_norm": 0.220703125, "learning_rate": 8.058559798989362e-05, "loss": 2.1113, "step": 7905 }, { "epoch": 1.47, "grad_norm": 0.220703125, "learning_rate": 8.032599922269206e-05, "loss": 2.1123, "step": 7910 }, { "epoch": 1.47, "grad_norm": 0.216796875, "learning_rate": 8.006671415723108e-05, "loss": 2.1223, "step": 7915 }, { "epoch": 1.47, "grad_norm": 0.2177734375, "learning_rate": 7.980774347317548e-05, "loss": 2.0928, "step": 7920 }, { "epoch": 1.47, "grad_norm": 0.21875, "learning_rate": 7.954908784936566e-05, "loss": 2.0741, "step": 7925 }, { "epoch": 1.47, "grad_norm": 0.21875, "learning_rate": 7.929074796381618e-05, "loss": 2.1044, "step": 7930 }, { "epoch": 1.47, "grad_norm": 0.21484375, "learning_rate": 7.903272449371395e-05, "loss": 2.119, "step": 7935 }, { "epoch": 1.47, "grad_norm": 0.2236328125, "learning_rate": 7.87750181154167e-05, "loss": 2.1355, "step": 7940 }, { "epoch": 1.47, "grad_norm": 0.2177734375, "learning_rate": 7.851762950445057e-05, "loss": 2.133, "step": 7945 }, { "epoch": 1.47, "grad_norm": 0.2216796875, "learning_rate": 7.826055933550891e-05, "loss": 2.1551, "step": 7950 }, { "epoch": 1.48, "grad_norm": 0.2158203125, "learning_rate": 7.800380828245051e-05, "loss": 2.1216, "step": 7955 }, { "epoch": 1.48, "grad_norm": 0.220703125, "learning_rate": 7.774737701829747e-05, "loss": 2.0914, "step": 7960 }, { "epoch": 1.48, "grad_norm": 0.2255859375, "learning_rate": 7.749126621523363e-05, "loss": 2.1388, "step": 7965 }, { "epoch": 1.48, "grad_norm": 0.2236328125, "learning_rate": 7.723547654460285e-05, "loss": 2.0935, "step": 7970 }, { "epoch": 1.48, "grad_norm": 0.21484375, "learning_rate": 7.698000867690724e-05, "loss": 2.091, "step": 7975 }, { "epoch": 1.48, "grad_norm": 0.216796875, "learning_rate": 7.672486328180529e-05, "loss": 2.1153, "step": 7980 }, { "epoch": 1.48, "grad_norm": 0.2265625, "learning_rate": 7.647004102811013e-05, "loss": 2.1339, "step": 7985 }, { "epoch": 1.48, "grad_norm": 0.216796875, "learning_rate": 7.621554258378818e-05, "loss": 2.1211, "step": 7990 }, { "epoch": 1.48, "grad_norm": 0.2216796875, "learning_rate": 7.596136861595651e-05, "loss": 2.1512, "step": 7995 }, { "epoch": 1.48, "grad_norm": 0.2216796875, "learning_rate": 7.57075197908822e-05, "loss": 2.1514, "step": 8000 }, { "epoch": 1.49, "grad_norm": 0.212890625, "learning_rate": 7.545399677397964e-05, "loss": 2.1236, "step": 8005 }, { "epoch": 1.49, "grad_norm": 0.216796875, "learning_rate": 7.520080022980935e-05, "loss": 2.103, "step": 8010 }, { "epoch": 1.49, "grad_norm": 0.2138671875, "learning_rate": 7.494793082207605e-05, "loss": 2.1457, "step": 8015 }, { "epoch": 1.49, "grad_norm": 0.2197265625, "learning_rate": 7.46953892136268e-05, "loss": 2.1162, "step": 8020 }, { "epoch": 1.49, "grad_norm": 0.216796875, "learning_rate": 7.444317606644973e-05, "loss": 2.1181, "step": 8025 }, { "epoch": 1.49, "grad_norm": 0.220703125, "learning_rate": 7.419129204167151e-05, "loss": 2.1427, "step": 8030 }, { "epoch": 1.49, "grad_norm": 0.2236328125, "learning_rate": 7.39397377995565e-05, "loss": 2.1352, "step": 8035 }, { "epoch": 1.49, "grad_norm": 0.21875, "learning_rate": 7.368851399950447e-05, "loss": 2.0876, "step": 8040 }, { "epoch": 1.49, "grad_norm": 0.2138671875, "learning_rate": 7.343762130004872e-05, "loss": 2.1071, "step": 8045 }, { "epoch": 1.49, "grad_norm": 0.2216796875, "learning_rate": 7.318706035885507e-05, "loss": 2.0977, "step": 8050 }, { "epoch": 1.49, "grad_norm": 0.216796875, "learning_rate": 7.29368318327195e-05, "loss": 2.1158, "step": 8055 }, { "epoch": 1.5, "grad_norm": 0.21875, "learning_rate": 7.268693637756658e-05, "loss": 2.0564, "step": 8060 }, { "epoch": 1.5, "grad_norm": 0.21875, "learning_rate": 7.243737464844787e-05, "loss": 2.1404, "step": 8065 }, { "epoch": 1.5, "grad_norm": 0.2197265625, "learning_rate": 7.218814729954005e-05, "loss": 2.1203, "step": 8070 }, { "epoch": 1.5, "grad_norm": 0.220703125, "learning_rate": 7.193925498414357e-05, "loss": 2.1112, "step": 8075 }, { "epoch": 1.5, "grad_norm": 0.2265625, "learning_rate": 7.169069835468017e-05, "loss": 2.1359, "step": 8080 }, { "epoch": 1.5, "grad_norm": 0.2265625, "learning_rate": 7.144247806269213e-05, "loss": 2.0982, "step": 8085 }, { "epoch": 1.5, "grad_norm": 0.220703125, "learning_rate": 7.119459475883983e-05, "loss": 2.0748, "step": 8090 }, { "epoch": 1.5, "grad_norm": 0.2158203125, "learning_rate": 7.094704909290036e-05, "loss": 2.1213, "step": 8095 }, { "epoch": 1.5, "grad_norm": 0.220703125, "learning_rate": 7.069984171376571e-05, "loss": 2.1405, "step": 8100 }, { "epoch": 1.5, "grad_norm": 0.2197265625, "learning_rate": 7.045297326944125e-05, "loss": 2.1144, "step": 8105 }, { "epoch": 1.5, "grad_norm": 0.21875, "learning_rate": 7.020644440704376e-05, "loss": 2.0758, "step": 8110 }, { "epoch": 1.51, "grad_norm": 0.2392578125, "learning_rate": 6.996025577279986e-05, "loss": 2.1157, "step": 8115 }, { "epoch": 1.51, "grad_norm": 0.2158203125, "learning_rate": 6.971440801204454e-05, "loss": 2.1075, "step": 8120 }, { "epoch": 1.51, "grad_norm": 0.2255859375, "learning_rate": 6.946890176921915e-05, "loss": 2.1304, "step": 8125 }, { "epoch": 1.51, "grad_norm": 0.220703125, "learning_rate": 6.922373768786954e-05, "loss": 2.1381, "step": 8130 }, { "epoch": 1.51, "grad_norm": 0.2138671875, "learning_rate": 6.897891641064511e-05, "loss": 2.078, "step": 8135 }, { "epoch": 1.51, "grad_norm": 0.2216796875, "learning_rate": 6.873443857929638e-05, "loss": 2.1653, "step": 8140 }, { "epoch": 1.51, "grad_norm": 0.2265625, "learning_rate": 6.849030483467367e-05, "loss": 2.1208, "step": 8145 }, { "epoch": 1.51, "grad_norm": 0.21484375, "learning_rate": 6.82465158167253e-05, "loss": 2.0856, "step": 8150 }, { "epoch": 1.51, "grad_norm": 0.220703125, "learning_rate": 6.800307216449615e-05, "loss": 2.1454, "step": 8155 }, { "epoch": 1.51, "grad_norm": 0.2197265625, "learning_rate": 6.775997451612548e-05, "loss": 2.1249, "step": 8160 }, { "epoch": 1.51, "grad_norm": 0.2197265625, "learning_rate": 6.75172235088457e-05, "loss": 2.1168, "step": 8165 }, { "epoch": 1.52, "grad_norm": 0.2138671875, "learning_rate": 6.727481977898076e-05, "loss": 2.1398, "step": 8170 }, { "epoch": 1.52, "grad_norm": 0.2138671875, "learning_rate": 6.703276396194404e-05, "loss": 2.0943, "step": 8175 }, { "epoch": 1.52, "grad_norm": 0.2119140625, "learning_rate": 6.679105669223704e-05, "loss": 2.1336, "step": 8180 }, { "epoch": 1.52, "grad_norm": 0.224609375, "learning_rate": 6.654969860344757e-05, "loss": 2.0909, "step": 8185 }, { "epoch": 1.52, "grad_norm": 0.2109375, "learning_rate": 6.630869032824821e-05, "loss": 2.1254, "step": 8190 }, { "epoch": 1.52, "grad_norm": 0.2158203125, "learning_rate": 6.606803249839448e-05, "loss": 2.1044, "step": 8195 }, { "epoch": 1.52, "grad_norm": 0.2216796875, "learning_rate": 6.582772574472325e-05, "loss": 2.1604, "step": 8200 }, { "epoch": 1.52, "grad_norm": 0.2373046875, "learning_rate": 6.558777069715138e-05, "loss": 2.1012, "step": 8205 }, { "epoch": 1.52, "grad_norm": 0.21875, "learning_rate": 6.534816798467338e-05, "loss": 2.0951, "step": 8210 }, { "epoch": 1.52, "grad_norm": 0.2177734375, "learning_rate": 6.510891823536054e-05, "loss": 2.0685, "step": 8215 }, { "epoch": 1.53, "grad_norm": 0.21484375, "learning_rate": 6.487002207635877e-05, "loss": 2.1161, "step": 8220 }, { "epoch": 1.53, "grad_norm": 0.2236328125, "learning_rate": 6.463148013388713e-05, "loss": 2.1549, "step": 8225 }, { "epoch": 1.53, "grad_norm": 0.21875, "learning_rate": 6.439329303323616e-05, "loss": 2.1182, "step": 8230 }, { "epoch": 1.53, "grad_norm": 0.2158203125, "learning_rate": 6.415546139876629e-05, "loss": 2.0894, "step": 8235 }, { "epoch": 1.53, "grad_norm": 0.216796875, "learning_rate": 6.391798585390614e-05, "loss": 2.1289, "step": 8240 }, { "epoch": 1.53, "grad_norm": 0.2119140625, "learning_rate": 6.368086702115081e-05, "loss": 2.0734, "step": 8245 }, { "epoch": 1.53, "grad_norm": 0.216796875, "learning_rate": 6.344410552206066e-05, "loss": 2.1034, "step": 8250 }, { "epoch": 1.53, "grad_norm": 0.224609375, "learning_rate": 6.320770197725911e-05, "loss": 2.1449, "step": 8255 }, { "epoch": 1.53, "grad_norm": 0.2177734375, "learning_rate": 6.297165700643117e-05, "loss": 2.0802, "step": 8260 }, { "epoch": 1.53, "grad_norm": 0.2158203125, "learning_rate": 6.273597122832224e-05, "loss": 2.1051, "step": 8265 }, { "epoch": 1.53, "grad_norm": 0.212890625, "learning_rate": 6.250064526073598e-05, "loss": 2.1103, "step": 8270 }, { "epoch": 1.54, "grad_norm": 0.224609375, "learning_rate": 6.226567972053286e-05, "loss": 2.0815, "step": 8275 }, { "epoch": 1.54, "grad_norm": 0.21875, "learning_rate": 6.203107522362863e-05, "loss": 2.1086, "step": 8280 }, { "epoch": 1.54, "grad_norm": 0.2177734375, "learning_rate": 6.179683238499263e-05, "loss": 2.0913, "step": 8285 }, { "epoch": 1.54, "grad_norm": 0.2197265625, "learning_rate": 6.156295181864613e-05, "loss": 2.085, "step": 8290 }, { "epoch": 1.54, "grad_norm": 0.220703125, "learning_rate": 6.132943413766077e-05, "loss": 2.0987, "step": 8295 }, { "epoch": 1.54, "grad_norm": 0.21875, "learning_rate": 6.109627995415712e-05, "loss": 2.1141, "step": 8300 }, { "epoch": 1.54, "grad_norm": 0.2158203125, "learning_rate": 6.086348987930273e-05, "loss": 2.1072, "step": 8305 }, { "epoch": 1.54, "grad_norm": 0.220703125, "learning_rate": 6.0631064523310756e-05, "loss": 2.1292, "step": 8310 }, { "epoch": 1.54, "grad_norm": 0.2216796875, "learning_rate": 6.039900449543836e-05, "loss": 2.1378, "step": 8315 }, { "epoch": 1.54, "grad_norm": 0.21875, "learning_rate": 6.016731040398502e-05, "loss": 2.128, "step": 8320 }, { "epoch": 1.54, "grad_norm": 0.2158203125, "learning_rate": 5.9935982856291005e-05, "loss": 2.0829, "step": 8325 }, { "epoch": 1.55, "grad_norm": 0.2138671875, "learning_rate": 5.970502245873573e-05, "loss": 2.0905, "step": 8330 }, { "epoch": 1.55, "grad_norm": 0.2216796875, "learning_rate": 5.947442981673637e-05, "loss": 2.1405, "step": 8335 }, { "epoch": 1.55, "grad_norm": 0.22265625, "learning_rate": 5.924420553474581e-05, "loss": 2.122, "step": 8340 }, { "epoch": 1.55, "grad_norm": 0.2138671875, "learning_rate": 5.90143502162515e-05, "loss": 2.1289, "step": 8345 }, { "epoch": 1.55, "grad_norm": 0.2216796875, "learning_rate": 5.8784864463773824e-05, "loss": 2.1152, "step": 8350 }, { "epoch": 1.55, "grad_norm": 0.22265625, "learning_rate": 5.85557488788643e-05, "loss": 2.1293, "step": 8355 }, { "epoch": 1.55, "grad_norm": 0.21875, "learning_rate": 5.832700406210414e-05, "loss": 2.1264, "step": 8360 }, { "epoch": 1.55, "grad_norm": 0.21484375, "learning_rate": 5.809863061310261e-05, "loss": 2.0865, "step": 8365 }, { "epoch": 1.55, "grad_norm": 0.2177734375, "learning_rate": 5.7870629130495746e-05, "loss": 2.0962, "step": 8370 }, { "epoch": 1.55, "grad_norm": 0.21875, "learning_rate": 5.76430002119442e-05, "loss": 2.123, "step": 8375 }, { "epoch": 1.55, "grad_norm": 0.2158203125, "learning_rate": 5.741574445413218e-05, "loss": 2.105, "step": 8380 }, { "epoch": 1.56, "grad_norm": 0.2177734375, "learning_rate": 5.718886245276589e-05, "loss": 2.0846, "step": 8385 }, { "epoch": 1.56, "grad_norm": 0.2197265625, "learning_rate": 5.6962354802571574e-05, "loss": 2.0829, "step": 8390 }, { "epoch": 1.56, "grad_norm": 0.2158203125, "learning_rate": 5.673622209729426e-05, "loss": 2.1085, "step": 8395 }, { "epoch": 1.56, "grad_norm": 0.2216796875, "learning_rate": 5.651046492969616e-05, "loss": 2.1053, "step": 8400 }, { "epoch": 1.56, "grad_norm": 0.21484375, "learning_rate": 5.628508389155507e-05, "loss": 2.1162, "step": 8405 }, { "epoch": 1.56, "grad_norm": 0.224609375, "learning_rate": 5.606007957366284e-05, "loss": 2.1035, "step": 8410 }, { "epoch": 1.56, "grad_norm": 0.2216796875, "learning_rate": 5.583545256582374e-05, "loss": 2.1368, "step": 8415 }, { "epoch": 1.56, "grad_norm": 0.2294921875, "learning_rate": 5.5611203456853267e-05, "loss": 2.1249, "step": 8420 }, { "epoch": 1.56, "grad_norm": 0.224609375, "learning_rate": 5.538733283457591e-05, "loss": 2.145, "step": 8425 }, { "epoch": 1.56, "grad_norm": 0.21875, "learning_rate": 5.516384128582444e-05, "loss": 2.1136, "step": 8430 }, { "epoch": 1.56, "grad_norm": 0.2197265625, "learning_rate": 5.4940729396437704e-05, "loss": 2.117, "step": 8435 }, { "epoch": 1.57, "grad_norm": 0.224609375, "learning_rate": 5.471799775125943e-05, "loss": 2.1447, "step": 8440 }, { "epoch": 1.57, "grad_norm": 0.2177734375, "learning_rate": 5.4495646934136625e-05, "loss": 2.1128, "step": 8445 }, { "epoch": 1.57, "grad_norm": 0.2177734375, "learning_rate": 5.4273677527917966e-05, "loss": 2.1114, "step": 8450 }, { "epoch": 1.57, "grad_norm": 0.2197265625, "learning_rate": 5.405209011445242e-05, "loss": 2.1023, "step": 8455 }, { "epoch": 1.57, "grad_norm": 0.2177734375, "learning_rate": 5.383088527458753e-05, "loss": 2.1129, "step": 8460 }, { "epoch": 1.57, "grad_norm": 0.216796875, "learning_rate": 5.361006358816818e-05, "loss": 2.1446, "step": 8465 }, { "epoch": 1.57, "grad_norm": 0.2099609375, "learning_rate": 5.338962563403478e-05, "loss": 2.0962, "step": 8470 }, { "epoch": 1.57, "grad_norm": 0.2236328125, "learning_rate": 5.3169571990021684e-05, "loss": 2.0977, "step": 8475 }, { "epoch": 1.57, "grad_norm": 0.220703125, "learning_rate": 5.294990323295621e-05, "loss": 2.0961, "step": 8480 }, { "epoch": 1.57, "grad_norm": 0.224609375, "learning_rate": 5.273061993865651e-05, "loss": 2.1561, "step": 8485 }, { "epoch": 1.58, "grad_norm": 0.224609375, "learning_rate": 5.251172268193041e-05, "loss": 2.15, "step": 8490 }, { "epoch": 1.58, "grad_norm": 0.2236328125, "learning_rate": 5.229321203657382e-05, "loss": 2.1232, "step": 8495 }, { "epoch": 1.58, "grad_norm": 0.220703125, "learning_rate": 5.20750885753692e-05, "loss": 2.1288, "step": 8500 }, { "epoch": 1.58, "grad_norm": 0.21875, "learning_rate": 5.1857352870084086e-05, "loss": 2.1265, "step": 8505 }, { "epoch": 1.58, "grad_norm": 0.2216796875, "learning_rate": 5.164000549146954e-05, "loss": 2.1376, "step": 8510 }, { "epoch": 1.58, "grad_norm": 0.21875, "learning_rate": 5.1423047009258904e-05, "loss": 2.1105, "step": 8515 }, { "epoch": 1.58, "grad_norm": 0.2138671875, "learning_rate": 5.120647799216587e-05, "loss": 2.1161, "step": 8520 }, { "epoch": 1.58, "grad_norm": 0.2177734375, "learning_rate": 5.0990299007883304e-05, "loss": 2.1267, "step": 8525 }, { "epoch": 1.58, "grad_norm": 0.2216796875, "learning_rate": 5.077451062308174e-05, "loss": 2.104, "step": 8530 }, { "epoch": 1.58, "grad_norm": 0.216796875, "learning_rate": 5.055911340340771e-05, "loss": 2.0874, "step": 8535 }, { "epoch": 1.58, "grad_norm": 0.2255859375, "learning_rate": 5.0344107913482516e-05, "loss": 2.1245, "step": 8540 }, { "epoch": 1.59, "grad_norm": 0.2177734375, "learning_rate": 5.012949471690045e-05, "loss": 2.0561, "step": 8545 }, { "epoch": 1.59, "grad_norm": 0.2216796875, "learning_rate": 4.9915274376227805e-05, "loss": 2.1394, "step": 8550 }, { "epoch": 1.59, "grad_norm": 0.21875, "learning_rate": 4.970144745300063e-05, "loss": 2.1378, "step": 8555 }, { "epoch": 1.59, "grad_norm": 0.2216796875, "learning_rate": 4.948801450772409e-05, "loss": 2.1072, "step": 8560 }, { "epoch": 1.59, "grad_norm": 0.2216796875, "learning_rate": 4.9274976099870415e-05, "loss": 2.1504, "step": 8565 }, { "epoch": 1.59, "grad_norm": 0.220703125, "learning_rate": 4.9062332787877705e-05, "loss": 2.0927, "step": 8570 }, { "epoch": 1.59, "grad_norm": 0.21484375, "learning_rate": 4.885008512914837e-05, "loss": 2.0769, "step": 8575 }, { "epoch": 1.59, "grad_norm": 0.21484375, "learning_rate": 4.863823368004763e-05, "loss": 2.0962, "step": 8580 }, { "epoch": 1.59, "grad_norm": 0.220703125, "learning_rate": 4.842677899590238e-05, "loss": 2.0833, "step": 8585 }, { "epoch": 1.59, "grad_norm": 0.21484375, "learning_rate": 4.8215721630999075e-05, "loss": 2.0788, "step": 8590 }, { "epoch": 1.59, "grad_norm": 0.2197265625, "learning_rate": 4.800506213858293e-05, "loss": 2.0668, "step": 8595 }, { "epoch": 1.6, "grad_norm": 0.224609375, "learning_rate": 4.779480107085632e-05, "loss": 2.1316, "step": 8600 }, { "epoch": 1.6, "grad_norm": 0.2158203125, "learning_rate": 4.7584938978976845e-05, "loss": 2.0993, "step": 8605 }, { "epoch": 1.6, "grad_norm": 0.216796875, "learning_rate": 4.737547641305668e-05, "loss": 2.083, "step": 8610 }, { "epoch": 1.6, "grad_norm": 0.224609375, "learning_rate": 4.716641392216048e-05, "loss": 2.0819, "step": 8615 }, { "epoch": 1.6, "grad_norm": 0.2216796875, "learning_rate": 4.695775205430426e-05, "loss": 2.0953, "step": 8620 }, { "epoch": 1.6, "grad_norm": 0.21484375, "learning_rate": 4.674949135645383e-05, "loss": 2.1031, "step": 8625 }, { "epoch": 1.6, "grad_norm": 0.2158203125, "learning_rate": 4.654163237452345e-05, "loss": 2.0836, "step": 8630 }, { "epoch": 1.6, "grad_norm": 0.2138671875, "learning_rate": 4.6334175653374476e-05, "loss": 2.0672, "step": 8635 }, { "epoch": 1.6, "grad_norm": 0.216796875, "learning_rate": 4.612712173681353e-05, "loss": 2.0927, "step": 8640 }, { "epoch": 1.6, "grad_norm": 0.2294921875, "learning_rate": 4.592047116759164e-05, "loss": 2.0775, "step": 8645 }, { "epoch": 1.6, "grad_norm": 0.2119140625, "learning_rate": 4.571422448740246e-05, "loss": 2.1157, "step": 8650 }, { "epoch": 1.61, "grad_norm": 0.2197265625, "learning_rate": 4.550838223688074e-05, "loss": 2.128, "step": 8655 }, { "epoch": 1.61, "grad_norm": 0.2236328125, "learning_rate": 4.530294495560141e-05, "loss": 2.1186, "step": 8660 }, { "epoch": 1.61, "grad_norm": 0.216796875, "learning_rate": 4.5097913182077656e-05, "loss": 2.1303, "step": 8665 }, { "epoch": 1.61, "grad_norm": 0.2265625, "learning_rate": 4.4893287453759755e-05, "loss": 2.124, "step": 8670 }, { "epoch": 1.61, "grad_norm": 0.2158203125, "learning_rate": 4.4689068307033544e-05, "loss": 2.1581, "step": 8675 }, { "epoch": 1.61, "grad_norm": 0.2158203125, "learning_rate": 4.4485256277219246e-05, "loss": 2.1182, "step": 8680 }, { "epoch": 1.61, "grad_norm": 0.21875, "learning_rate": 4.428185189856986e-05, "loss": 2.0936, "step": 8685 }, { "epoch": 1.61, "grad_norm": 0.2158203125, "learning_rate": 4.4078855704269575e-05, "loss": 2.0769, "step": 8690 }, { "epoch": 1.61, "grad_norm": 0.2158203125, "learning_rate": 4.387626822643294e-05, "loss": 2.1441, "step": 8695 }, { "epoch": 1.61, "grad_norm": 0.21484375, "learning_rate": 4.3674089996102966e-05, "loss": 2.1038, "step": 8700 }, { "epoch": 1.62, "grad_norm": 0.2158203125, "learning_rate": 4.347232154324992e-05, "loss": 2.1426, "step": 8705 }, { "epoch": 1.62, "grad_norm": 0.2138671875, "learning_rate": 4.32709633967699e-05, "loss": 2.0974, "step": 8710 }, { "epoch": 1.62, "grad_norm": 0.220703125, "learning_rate": 4.307001608448353e-05, "loss": 2.0787, "step": 8715 }, { "epoch": 1.62, "grad_norm": 0.212890625, "learning_rate": 4.2869480133134435e-05, "loss": 2.0808, "step": 8720 }, { "epoch": 1.62, "grad_norm": 0.224609375, "learning_rate": 4.266935606838796e-05, "loss": 2.1249, "step": 8725 }, { "epoch": 1.62, "grad_norm": 0.2197265625, "learning_rate": 4.246964441482986e-05, "loss": 2.1447, "step": 8730 }, { "epoch": 1.62, "grad_norm": 0.2177734375, "learning_rate": 4.2270345695964734e-05, "loss": 2.1202, "step": 8735 }, { "epoch": 1.62, "grad_norm": 0.2216796875, "learning_rate": 4.207146043421477e-05, "loss": 2.1282, "step": 8740 }, { "epoch": 1.62, "grad_norm": 0.21875, "learning_rate": 4.1872989150918375e-05, "loss": 2.1001, "step": 8745 }, { "epoch": 1.62, "grad_norm": 0.220703125, "learning_rate": 4.16749323663288e-05, "loss": 2.1221, "step": 8750 }, { "epoch": 1.62, "grad_norm": 0.21875, "learning_rate": 4.147729059961278e-05, "loss": 2.1309, "step": 8755 }, { "epoch": 1.63, "grad_norm": 0.216796875, "learning_rate": 4.128006436884906e-05, "loss": 2.152, "step": 8760 }, { "epoch": 1.63, "grad_norm": 0.2216796875, "learning_rate": 4.1083254191027384e-05, "loss": 2.1294, "step": 8765 }, { "epoch": 1.63, "grad_norm": 0.22265625, "learning_rate": 4.088686058204656e-05, "loss": 2.1232, "step": 8770 }, { "epoch": 1.63, "grad_norm": 0.2158203125, "learning_rate": 4.069088405671375e-05, "loss": 2.0924, "step": 8775 }, { "epoch": 1.63, "grad_norm": 0.21875, "learning_rate": 4.049532512874261e-05, "loss": 2.1028, "step": 8780 }, { "epoch": 1.63, "grad_norm": 0.2265625, "learning_rate": 4.0300184310752265e-05, "loss": 2.1369, "step": 8785 }, { "epoch": 1.63, "grad_norm": 0.2255859375, "learning_rate": 4.0105462114265754e-05, "loss": 2.0931, "step": 8790 }, { "epoch": 1.63, "grad_norm": 0.2236328125, "learning_rate": 3.991115904970888e-05, "loss": 2.1279, "step": 8795 }, { "epoch": 1.63, "grad_norm": 0.21875, "learning_rate": 3.97172756264087e-05, "loss": 2.0991, "step": 8800 }, { "epoch": 1.63, "grad_norm": 0.220703125, "learning_rate": 3.952381235259228e-05, "loss": 2.1481, "step": 8805 }, { "epoch": 1.63, "grad_norm": 0.2158203125, "learning_rate": 3.933076973538532e-05, "loss": 2.0896, "step": 8810 }, { "epoch": 1.64, "grad_norm": 0.2236328125, "learning_rate": 3.9138148280811014e-05, "loss": 2.1299, "step": 8815 }, { "epoch": 1.64, "grad_norm": 0.220703125, "learning_rate": 3.894594849378828e-05, "loss": 2.1204, "step": 8820 }, { "epoch": 1.64, "grad_norm": 0.212890625, "learning_rate": 3.8754170878131e-05, "loss": 2.0644, "step": 8825 }, { "epoch": 1.64, "grad_norm": 0.21875, "learning_rate": 3.856281593654623e-05, "loss": 2.1133, "step": 8830 }, { "epoch": 1.64, "grad_norm": 0.2197265625, "learning_rate": 3.8371884170633134e-05, "loss": 2.0851, "step": 8835 }, { "epoch": 1.64, "grad_norm": 0.212890625, "learning_rate": 3.818137608088161e-05, "loss": 2.1087, "step": 8840 }, { "epoch": 1.64, "grad_norm": 0.21875, "learning_rate": 3.7991292166670966e-05, "loss": 2.1384, "step": 8845 }, { "epoch": 1.64, "grad_norm": 0.2158203125, "learning_rate": 3.780163292626859e-05, "loss": 2.1075, "step": 8850 }, { "epoch": 1.64, "grad_norm": 0.2158203125, "learning_rate": 3.76123988568287e-05, "loss": 2.0926, "step": 8855 }, { "epoch": 1.64, "grad_norm": 0.2177734375, "learning_rate": 3.742359045439105e-05, "loss": 2.1276, "step": 8860 }, { "epoch": 1.64, "grad_norm": 0.216796875, "learning_rate": 3.723520821387958e-05, "loss": 2.1149, "step": 8865 }, { "epoch": 1.65, "grad_norm": 0.2158203125, "learning_rate": 3.704725262910094e-05, "loss": 2.0676, "step": 8870 }, { "epoch": 1.65, "grad_norm": 0.2275390625, "learning_rate": 3.6859724192743704e-05, "loss": 2.1569, "step": 8875 }, { "epoch": 1.65, "grad_norm": 0.2138671875, "learning_rate": 3.6672623396376584e-05, "loss": 2.0912, "step": 8880 }, { "epoch": 1.65, "grad_norm": 0.224609375, "learning_rate": 3.648595073044729e-05, "loss": 2.0974, "step": 8885 }, { "epoch": 1.65, "grad_norm": 0.2177734375, "learning_rate": 3.629970668428129e-05, "loss": 2.1083, "step": 8890 }, { "epoch": 1.65, "grad_norm": 0.212890625, "learning_rate": 3.611389174608068e-05, "loss": 2.1057, "step": 8895 }, { "epoch": 1.65, "grad_norm": 0.2177734375, "learning_rate": 3.592850640292249e-05, "loss": 2.0982, "step": 8900 }, { "epoch": 1.65, "grad_norm": 0.2236328125, "learning_rate": 3.574355114075773e-05, "loss": 2.0863, "step": 8905 }, { "epoch": 1.65, "grad_norm": 0.220703125, "learning_rate": 3.555902644441016e-05, "loss": 2.1439, "step": 8910 }, { "epoch": 1.65, "grad_norm": 0.224609375, "learning_rate": 3.5374932797574734e-05, "loss": 2.1372, "step": 8915 }, { "epoch": 1.65, "grad_norm": 0.220703125, "learning_rate": 3.5191270682816604e-05, "loss": 2.1334, "step": 8920 }, { "epoch": 1.66, "grad_norm": 0.2177734375, "learning_rate": 3.5008040581569634e-05, "loss": 2.1059, "step": 8925 }, { "epoch": 1.66, "grad_norm": 0.220703125, "learning_rate": 3.4825242974135474e-05, "loss": 2.1059, "step": 8930 }, { "epoch": 1.66, "grad_norm": 0.2197265625, "learning_rate": 3.464287833968176e-05, "loss": 2.1372, "step": 8935 }, { "epoch": 1.66, "grad_norm": 0.2177734375, "learning_rate": 3.4460947156241376e-05, "loss": 2.1008, "step": 8940 }, { "epoch": 1.66, "grad_norm": 0.2216796875, "learning_rate": 3.427944990071108e-05, "loss": 2.0944, "step": 8945 }, { "epoch": 1.66, "grad_norm": 0.2197265625, "learning_rate": 3.409838704884984e-05, "loss": 2.1204, "step": 8950 }, { "epoch": 1.66, "grad_norm": 0.2197265625, "learning_rate": 3.391775907527834e-05, "loss": 2.1195, "step": 8955 }, { "epoch": 1.66, "grad_norm": 0.2265625, "learning_rate": 3.373756645347703e-05, "loss": 2.1051, "step": 8960 }, { "epoch": 1.66, "grad_norm": 0.2138671875, "learning_rate": 3.355780965578526e-05, "loss": 2.0931, "step": 8965 }, { "epoch": 1.66, "grad_norm": 0.21484375, "learning_rate": 3.337848915339994e-05, "loss": 2.1239, "step": 8970 }, { "epoch": 1.67, "grad_norm": 0.2236328125, "learning_rate": 3.31996054163743e-05, "loss": 2.0869, "step": 8975 }, { "epoch": 1.67, "grad_norm": 0.21484375, "learning_rate": 3.302115891361683e-05, "loss": 2.082, "step": 8980 }, { "epoch": 1.67, "grad_norm": 0.2216796875, "learning_rate": 3.2843150112889564e-05, "loss": 2.085, "step": 8985 }, { "epoch": 1.67, "grad_norm": 0.2177734375, "learning_rate": 3.266557948080757e-05, "loss": 2.0749, "step": 8990 }, { "epoch": 1.67, "grad_norm": 0.21875, "learning_rate": 3.2488447482837146e-05, "loss": 2.0904, "step": 8995 }, { "epoch": 1.67, "grad_norm": 0.212890625, "learning_rate": 3.231175458329465e-05, "loss": 2.1296, "step": 9000 }, { "epoch": 1.67, "grad_norm": 0.216796875, "learning_rate": 3.213550124534579e-05, "loss": 2.1353, "step": 9005 }, { "epoch": 1.67, "grad_norm": 0.220703125, "learning_rate": 3.1959687931003765e-05, "loss": 2.1098, "step": 9010 }, { "epoch": 1.67, "grad_norm": 0.216796875, "learning_rate": 3.178431510112845e-05, "loss": 2.095, "step": 9015 }, { "epoch": 1.67, "grad_norm": 0.21484375, "learning_rate": 3.160938321542506e-05, "loss": 2.1216, "step": 9020 }, { "epoch": 1.67, "grad_norm": 0.2177734375, "learning_rate": 3.143489273244291e-05, "loss": 2.1403, "step": 9025 }, { "epoch": 1.68, "grad_norm": 0.216796875, "learning_rate": 3.126084410957446e-05, "loss": 2.0983, "step": 9030 }, { "epoch": 1.68, "grad_norm": 0.2158203125, "learning_rate": 3.1087237803053584e-05, "loss": 2.0944, "step": 9035 }, { "epoch": 1.68, "grad_norm": 0.2197265625, "learning_rate": 3.091407426795503e-05, "loss": 2.1514, "step": 9040 }, { "epoch": 1.68, "grad_norm": 0.22265625, "learning_rate": 3.0741353958192755e-05, "loss": 2.0635, "step": 9045 }, { "epoch": 1.68, "grad_norm": 0.22265625, "learning_rate": 3.0569077326518904e-05, "loss": 2.104, "step": 9050 }, { "epoch": 1.68, "grad_norm": 0.2138671875, "learning_rate": 3.0397244824522618e-05, "loss": 2.0914, "step": 9055 }, { "epoch": 1.68, "grad_norm": 0.22265625, "learning_rate": 3.0225856902628847e-05, "loss": 2.1016, "step": 9060 }, { "epoch": 1.68, "grad_norm": 0.2177734375, "learning_rate": 3.0054914010097145e-05, "loss": 2.0724, "step": 9065 }, { "epoch": 1.68, "grad_norm": 0.2158203125, "learning_rate": 2.9884416595020505e-05, "loss": 2.1027, "step": 9070 }, { "epoch": 1.68, "grad_norm": 0.216796875, "learning_rate": 2.971436510432424e-05, "loss": 2.1249, "step": 9075 }, { "epoch": 1.68, "grad_norm": 0.2177734375, "learning_rate": 2.9544759983764736e-05, "loss": 2.1062, "step": 9080 }, { "epoch": 1.69, "grad_norm": 0.2197265625, "learning_rate": 2.9375601677928254e-05, "loss": 2.0937, "step": 9085 }, { "epoch": 1.69, "grad_norm": 0.2197265625, "learning_rate": 2.9206890630229876e-05, "loss": 2.1203, "step": 9090 }, { "epoch": 1.69, "grad_norm": 0.2255859375, "learning_rate": 2.9038627282912268e-05, "loss": 2.1202, "step": 9095 }, { "epoch": 1.69, "grad_norm": 0.212890625, "learning_rate": 2.887081207704454e-05, "loss": 2.0738, "step": 9100 }, { "epoch": 1.69, "grad_norm": 0.220703125, "learning_rate": 2.870344545252106e-05, "loss": 2.1016, "step": 9105 }, { "epoch": 1.69, "grad_norm": 0.2236328125, "learning_rate": 2.8536527848060446e-05, "loss": 2.0854, "step": 9110 }, { "epoch": 1.69, "grad_norm": 0.21484375, "learning_rate": 2.8370059701204122e-05, "loss": 2.1124, "step": 9115 }, { "epoch": 1.69, "grad_norm": 0.220703125, "learning_rate": 2.820404144831541e-05, "loss": 2.0718, "step": 9120 }, { "epoch": 1.69, "grad_norm": 0.22265625, "learning_rate": 2.8038473524578447e-05, "loss": 2.0919, "step": 9125 }, { "epoch": 1.69, "grad_norm": 0.2177734375, "learning_rate": 2.787335636399675e-05, "loss": 2.1262, "step": 9130 }, { "epoch": 1.69, "grad_norm": 0.21484375, "learning_rate": 2.7708690399392366e-05, "loss": 2.1434, "step": 9135 }, { "epoch": 1.7, "grad_norm": 0.216796875, "learning_rate": 2.7544476062404557e-05, "loss": 2.1098, "step": 9140 }, { "epoch": 1.7, "grad_norm": 0.2197265625, "learning_rate": 2.738071378348872e-05, "loss": 2.1332, "step": 9145 }, { "epoch": 1.7, "grad_norm": 0.216796875, "learning_rate": 2.7217403991915368e-05, "loss": 2.1317, "step": 9150 }, { "epoch": 1.7, "grad_norm": 0.2138671875, "learning_rate": 2.7054547115768735e-05, "loss": 2.0972, "step": 9155 }, { "epoch": 1.7, "grad_norm": 0.2265625, "learning_rate": 2.6892143581946116e-05, "loss": 2.0682, "step": 9160 }, { "epoch": 1.7, "grad_norm": 0.220703125, "learning_rate": 2.673019381615609e-05, "loss": 2.1369, "step": 9165 }, { "epoch": 1.7, "grad_norm": 0.2177734375, "learning_rate": 2.6568698242918055e-05, "loss": 2.068, "step": 9170 }, { "epoch": 1.7, "grad_norm": 0.2138671875, "learning_rate": 2.640765728556074e-05, "loss": 2.134, "step": 9175 }, { "epoch": 1.7, "grad_norm": 0.2177734375, "learning_rate": 2.6247071366221175e-05, "loss": 2.1242, "step": 9180 }, { "epoch": 1.7, "grad_norm": 0.224609375, "learning_rate": 2.6086940905843606e-05, "loss": 2.1113, "step": 9185 }, { "epoch": 1.71, "grad_norm": 0.22265625, "learning_rate": 2.5927266324178345e-05, "loss": 2.151, "step": 9190 }, { "epoch": 1.71, "grad_norm": 0.2265625, "learning_rate": 2.5768048039780858e-05, "loss": 2.1334, "step": 9195 }, { "epoch": 1.71, "grad_norm": 0.2216796875, "learning_rate": 2.5609286470010262e-05, "loss": 2.0881, "step": 9200 }, { "epoch": 1.71, "grad_norm": 0.2158203125, "learning_rate": 2.545098203102876e-05, "loss": 2.0929, "step": 9205 }, { "epoch": 1.71, "grad_norm": 0.21484375, "learning_rate": 2.529313513780016e-05, "loss": 2.0808, "step": 9210 }, { "epoch": 1.71, "grad_norm": 0.2216796875, "learning_rate": 2.513574620408874e-05, "loss": 2.1191, "step": 9215 }, { "epoch": 1.71, "grad_norm": 0.2158203125, "learning_rate": 2.4978815642458654e-05, "loss": 2.0935, "step": 9220 }, { "epoch": 1.71, "grad_norm": 0.212890625, "learning_rate": 2.482234386427227e-05, "loss": 2.106, "step": 9225 }, { "epoch": 1.71, "grad_norm": 0.212890625, "learning_rate": 2.4666331279689425e-05, "loss": 2.1415, "step": 9230 }, { "epoch": 1.71, "grad_norm": 0.2177734375, "learning_rate": 2.4510778297666282e-05, "loss": 2.1146, "step": 9235 }, { "epoch": 1.71, "grad_norm": 0.2177734375, "learning_rate": 2.435568532595427e-05, "loss": 2.1097, "step": 9240 }, { "epoch": 1.72, "grad_norm": 0.2158203125, "learning_rate": 2.4201052771099008e-05, "loss": 2.1454, "step": 9245 }, { "epoch": 1.72, "grad_norm": 0.220703125, "learning_rate": 2.404688103843902e-05, "loss": 2.1059, "step": 9250 }, { "epoch": 1.72, "grad_norm": 0.2197265625, "learning_rate": 2.389317053210518e-05, "loss": 2.116, "step": 9255 }, { "epoch": 1.72, "grad_norm": 0.2197265625, "learning_rate": 2.3739921655019147e-05, "loss": 2.0831, "step": 9260 }, { "epoch": 1.72, "grad_norm": 0.2177734375, "learning_rate": 2.358713480889254e-05, "loss": 2.1126, "step": 9265 }, { "epoch": 1.72, "grad_norm": 0.2216796875, "learning_rate": 2.3434810394225927e-05, "loss": 2.08, "step": 9270 }, { "epoch": 1.72, "grad_norm": 0.21484375, "learning_rate": 2.3282948810307637e-05, "loss": 2.0732, "step": 9275 }, { "epoch": 1.72, "grad_norm": 0.2265625, "learning_rate": 2.313155045521278e-05, "loss": 2.1507, "step": 9280 }, { "epoch": 1.72, "grad_norm": 0.2138671875, "learning_rate": 2.2980615725802213e-05, "loss": 2.1155, "step": 9285 }, { "epoch": 1.72, "grad_norm": 0.216796875, "learning_rate": 2.283014501772154e-05, "loss": 2.152, "step": 9290 }, { "epoch": 1.72, "grad_norm": 0.2138671875, "learning_rate": 2.268013872539998e-05, "loss": 2.1221, "step": 9295 }, { "epoch": 1.73, "grad_norm": 0.21875, "learning_rate": 2.2530597242049378e-05, "loss": 2.1292, "step": 9300 }, { "epoch": 1.73, "grad_norm": 0.224609375, "learning_rate": 2.238152095966315e-05, "loss": 2.0971, "step": 9305 }, { "epoch": 1.73, "grad_norm": 0.2265625, "learning_rate": 2.223291026901533e-05, "loss": 2.1177, "step": 9310 }, { "epoch": 1.73, "grad_norm": 0.220703125, "learning_rate": 2.208476555965946e-05, "loss": 2.088, "step": 9315 }, { "epoch": 1.73, "grad_norm": 0.2216796875, "learning_rate": 2.1937087219927576e-05, "loss": 2.1734, "step": 9320 }, { "epoch": 1.73, "grad_norm": 0.2158203125, "learning_rate": 2.178987563692938e-05, "loss": 2.1248, "step": 9325 }, { "epoch": 1.73, "grad_norm": 0.22265625, "learning_rate": 2.1643131196550835e-05, "loss": 2.1097, "step": 9330 }, { "epoch": 1.73, "grad_norm": 0.236328125, "learning_rate": 2.1496854283453472e-05, "loss": 2.0857, "step": 9335 }, { "epoch": 1.73, "grad_norm": 0.212890625, "learning_rate": 2.1351045281073412e-05, "loss": 2.0775, "step": 9340 }, { "epoch": 1.73, "grad_norm": 0.22265625, "learning_rate": 2.1205704571620076e-05, "loss": 2.1096, "step": 9345 }, { "epoch": 1.73, "grad_norm": 0.2119140625, "learning_rate": 2.1060832536075403e-05, "loss": 2.1098, "step": 9350 }, { "epoch": 1.74, "grad_norm": 0.216796875, "learning_rate": 2.0916429554192818e-05, "loss": 2.0878, "step": 9355 }, { "epoch": 1.74, "grad_norm": 0.216796875, "learning_rate": 2.07724960044962e-05, "loss": 2.138, "step": 9360 }, { "epoch": 1.74, "grad_norm": 0.220703125, "learning_rate": 2.0629032264278904e-05, "loss": 2.1298, "step": 9365 }, { "epoch": 1.74, "grad_norm": 0.21484375, "learning_rate": 2.0486038709602706e-05, "loss": 2.0918, "step": 9370 }, { "epoch": 1.74, "grad_norm": 0.220703125, "learning_rate": 2.034351571529709e-05, "loss": 2.1409, "step": 9375 }, { "epoch": 1.74, "grad_norm": 0.2119140625, "learning_rate": 2.0201463654957766e-05, "loss": 2.1113, "step": 9380 }, { "epoch": 1.74, "grad_norm": 0.22265625, "learning_rate": 2.0059882900946227e-05, "loss": 2.1025, "step": 9385 }, { "epoch": 1.74, "grad_norm": 0.21875, "learning_rate": 1.9918773824388405e-05, "loss": 2.0689, "step": 9390 }, { "epoch": 1.74, "grad_norm": 0.224609375, "learning_rate": 1.977813679517386e-05, "loss": 2.1106, "step": 9395 }, { "epoch": 1.74, "grad_norm": 0.21875, "learning_rate": 1.96379721819548e-05, "loss": 2.1136, "step": 9400 }, { "epoch": 1.74, "grad_norm": 0.2216796875, "learning_rate": 1.9498280352145004e-05, "loss": 2.143, "step": 9405 }, { "epoch": 1.75, "grad_norm": 0.2275390625, "learning_rate": 1.9359061671919032e-05, "loss": 2.1268, "step": 9410 }, { "epoch": 1.75, "grad_norm": 0.21875, "learning_rate": 1.9220316506211077e-05, "loss": 2.0934, "step": 9415 }, { "epoch": 1.75, "grad_norm": 0.21484375, "learning_rate": 1.9082045218714262e-05, "loss": 2.1128, "step": 9420 }, { "epoch": 1.75, "grad_norm": 0.2236328125, "learning_rate": 1.8944248171879453e-05, "loss": 2.1024, "step": 9425 }, { "epoch": 1.75, "grad_norm": 0.21875, "learning_rate": 1.8806925726914225e-05, "loss": 2.122, "step": 9430 }, { "epoch": 1.75, "grad_norm": 0.2138671875, "learning_rate": 1.8670078243782353e-05, "loss": 2.0871, "step": 9435 }, { "epoch": 1.75, "grad_norm": 0.21875, "learning_rate": 1.853370608120244e-05, "loss": 2.0643, "step": 9440 }, { "epoch": 1.75, "grad_norm": 0.2197265625, "learning_rate": 1.839780959664714e-05, "loss": 2.1121, "step": 9445 }, { "epoch": 1.75, "grad_norm": 0.21875, "learning_rate": 1.8262389146342217e-05, "loss": 2.0914, "step": 9450 }, { "epoch": 1.75, "grad_norm": 0.220703125, "learning_rate": 1.8127445085265716e-05, "loss": 2.1003, "step": 9455 }, { "epoch": 1.76, "grad_norm": 0.2197265625, "learning_rate": 1.79929777671467e-05, "loss": 2.0827, "step": 9460 }, { "epoch": 1.76, "grad_norm": 0.216796875, "learning_rate": 1.785898754446469e-05, "loss": 2.1114, "step": 9465 }, { "epoch": 1.76, "grad_norm": 0.2158203125, "learning_rate": 1.7725474768448636e-05, "loss": 2.0928, "step": 9470 }, { "epoch": 1.76, "grad_norm": 0.2236328125, "learning_rate": 1.759243978907583e-05, "loss": 2.0907, "step": 9475 }, { "epoch": 1.76, "grad_norm": 0.216796875, "learning_rate": 1.7459882955071237e-05, "loss": 2.1334, "step": 9480 }, { "epoch": 1.76, "grad_norm": 0.22265625, "learning_rate": 1.732780461390635e-05, "loss": 2.096, "step": 9485 }, { "epoch": 1.76, "grad_norm": 0.21484375, "learning_rate": 1.7196205111798446e-05, "loss": 2.1169, "step": 9490 }, { "epoch": 1.76, "grad_norm": 0.216796875, "learning_rate": 1.7065084793709607e-05, "loss": 2.1197, "step": 9495 }, { "epoch": 1.76, "grad_norm": 0.2158203125, "learning_rate": 1.693444400334583e-05, "loss": 2.0802, "step": 9500 }, { "epoch": 1.76, "grad_norm": 0.23046875, "learning_rate": 1.680428308315618e-05, "loss": 2.079, "step": 9505 }, { "epoch": 1.76, "grad_norm": 0.216796875, "learning_rate": 1.6674602374331693e-05, "loss": 2.0876, "step": 9510 }, { "epoch": 1.77, "grad_norm": 0.2265625, "learning_rate": 1.6545402216804783e-05, "loss": 2.1172, "step": 9515 }, { "epoch": 1.77, "grad_norm": 0.2177734375, "learning_rate": 1.6416682949248142e-05, "loss": 2.0708, "step": 9520 }, { "epoch": 1.77, "grad_norm": 0.2158203125, "learning_rate": 1.628844490907384e-05, "loss": 2.1358, "step": 9525 }, { "epoch": 1.77, "grad_norm": 0.2216796875, "learning_rate": 1.616068843243257e-05, "loss": 2.1241, "step": 9530 }, { "epoch": 1.77, "grad_norm": 0.2158203125, "learning_rate": 1.6033413854212643e-05, "loss": 2.1311, "step": 9535 }, { "epoch": 1.77, "grad_norm": 0.2119140625, "learning_rate": 1.5906621508039342e-05, "loss": 2.0832, "step": 9540 }, { "epoch": 1.77, "grad_norm": 0.2197265625, "learning_rate": 1.5780311726273634e-05, "loss": 2.091, "step": 9545 }, { "epoch": 1.77, "grad_norm": 0.21875, "learning_rate": 1.5654484840011617e-05, "loss": 2.0968, "step": 9550 }, { "epoch": 1.77, "grad_norm": 0.220703125, "learning_rate": 1.552914117908375e-05, "loss": 2.1037, "step": 9555 }, { "epoch": 1.77, "grad_norm": 0.216796875, "learning_rate": 1.5404281072053517e-05, "loss": 2.1281, "step": 9560 }, { "epoch": 1.77, "grad_norm": 0.216796875, "learning_rate": 1.5279904846217085e-05, "loss": 2.1145, "step": 9565 }, { "epoch": 1.78, "grad_norm": 0.220703125, "learning_rate": 1.515601282760215e-05, "loss": 2.1418, "step": 9570 }, { "epoch": 1.78, "grad_norm": 0.2177734375, "learning_rate": 1.5032605340967132e-05, "loss": 2.0932, "step": 9575 }, { "epoch": 1.78, "grad_norm": 0.220703125, "learning_rate": 1.4909682709800355e-05, "loss": 2.1146, "step": 9580 }, { "epoch": 1.78, "grad_norm": 0.2236328125, "learning_rate": 1.4787245256319227e-05, "loss": 2.104, "step": 9585 }, { "epoch": 1.78, "grad_norm": 0.2177734375, "learning_rate": 1.4665293301469374e-05, "loss": 2.0964, "step": 9590 }, { "epoch": 1.78, "grad_norm": 0.2216796875, "learning_rate": 1.4543827164923619e-05, "loss": 2.1167, "step": 9595 }, { "epoch": 1.78, "grad_norm": 0.2197265625, "learning_rate": 1.4422847165081555e-05, "loss": 2.0994, "step": 9600 }, { "epoch": 1.78, "grad_norm": 0.220703125, "learning_rate": 1.4302353619068309e-05, "loss": 2.0851, "step": 9605 }, { "epoch": 1.78, "grad_norm": 0.216796875, "learning_rate": 1.4182346842733873e-05, "loss": 2.1048, "step": 9610 }, { "epoch": 1.78, "grad_norm": 0.2177734375, "learning_rate": 1.4062827150652302e-05, "loss": 2.0988, "step": 9615 }, { "epoch": 1.78, "grad_norm": 0.2177734375, "learning_rate": 1.394379485612085e-05, "loss": 2.1497, "step": 9620 }, { "epoch": 1.79, "grad_norm": 0.224609375, "learning_rate": 1.3825250271159173e-05, "loss": 2.1561, "step": 9625 }, { "epoch": 1.79, "grad_norm": 0.2265625, "learning_rate": 1.3707193706508391e-05, "loss": 2.0875, "step": 9630 }, { "epoch": 1.79, "grad_norm": 0.2138671875, "learning_rate": 1.3589625471630562e-05, "loss": 2.0693, "step": 9635 }, { "epoch": 1.79, "grad_norm": 0.2216796875, "learning_rate": 1.3472545874707565e-05, "loss": 2.1013, "step": 9640 }, { "epoch": 1.79, "grad_norm": 0.2197265625, "learning_rate": 1.3355955222640326e-05, "loss": 2.1421, "step": 9645 }, { "epoch": 1.79, "grad_norm": 0.216796875, "learning_rate": 1.3239853821048287e-05, "loss": 2.0988, "step": 9650 }, { "epoch": 1.79, "grad_norm": 0.2216796875, "learning_rate": 1.3124241974268291e-05, "loss": 2.1146, "step": 9655 }, { "epoch": 1.79, "grad_norm": 0.2216796875, "learning_rate": 1.3009119985353969e-05, "loss": 2.1093, "step": 9660 }, { "epoch": 1.79, "grad_norm": 0.2216796875, "learning_rate": 1.2894488156074813e-05, "loss": 2.1261, "step": 9665 }, { "epoch": 1.79, "grad_norm": 0.21484375, "learning_rate": 1.2780346786915598e-05, "loss": 2.1356, "step": 9670 }, { "epoch": 1.79, "grad_norm": 0.2236328125, "learning_rate": 1.2666696177075276e-05, "loss": 2.1216, "step": 9675 }, { "epoch": 1.8, "grad_norm": 0.224609375, "learning_rate": 1.2553536624466456e-05, "loss": 2.137, "step": 9680 }, { "epoch": 1.8, "grad_norm": 0.224609375, "learning_rate": 1.2440868425714613e-05, "loss": 2.1244, "step": 9685 }, { "epoch": 1.8, "grad_norm": 0.216796875, "learning_rate": 1.2328691876157128e-05, "loss": 2.1037, "step": 9690 }, { "epoch": 1.8, "grad_norm": 0.2216796875, "learning_rate": 1.2217007269842651e-05, "loss": 2.0922, "step": 9695 }, { "epoch": 1.8, "grad_norm": 0.2197265625, "learning_rate": 1.2105814899530288e-05, "loss": 2.1027, "step": 9700 }, { "epoch": 1.8, "grad_norm": 0.2294921875, "learning_rate": 1.1995115056688889e-05, "loss": 2.1215, "step": 9705 }, { "epoch": 1.8, "grad_norm": 0.220703125, "learning_rate": 1.18849080314962e-05, "loss": 2.1065, "step": 9710 }, { "epoch": 1.8, "grad_norm": 0.224609375, "learning_rate": 1.177519411283814e-05, "loss": 2.112, "step": 9715 }, { "epoch": 1.8, "grad_norm": 0.2197265625, "learning_rate": 1.1665973588308122e-05, "loss": 2.1123, "step": 9720 }, { "epoch": 1.8, "grad_norm": 0.22265625, "learning_rate": 1.1557246744206084e-05, "loss": 2.1194, "step": 9725 }, { "epoch": 1.81, "grad_norm": 0.2333984375, "learning_rate": 1.1449013865538027e-05, "loss": 2.0927, "step": 9730 }, { "epoch": 1.81, "grad_norm": 0.2138671875, "learning_rate": 1.1341275236015003e-05, "loss": 2.1198, "step": 9735 }, { "epoch": 1.81, "grad_norm": 0.2138671875, "learning_rate": 1.1234031138052592e-05, "loss": 2.0599, "step": 9740 }, { "epoch": 1.81, "grad_norm": 0.224609375, "learning_rate": 1.1127281852769944e-05, "loss": 2.1762, "step": 9745 }, { "epoch": 1.81, "grad_norm": 0.2255859375, "learning_rate": 1.1021027659989225e-05, "loss": 2.106, "step": 9750 }, { "epoch": 1.81, "grad_norm": 0.2236328125, "learning_rate": 1.0915268838234838e-05, "loss": 2.1149, "step": 9755 }, { "epoch": 1.81, "grad_norm": 0.22265625, "learning_rate": 1.0810005664732558e-05, "loss": 2.1219, "step": 9760 }, { "epoch": 1.81, "grad_norm": 0.2177734375, "learning_rate": 1.0705238415409068e-05, "loss": 2.1004, "step": 9765 }, { "epoch": 1.81, "grad_norm": 0.224609375, "learning_rate": 1.0600967364891001e-05, "loss": 2.0973, "step": 9770 }, { "epoch": 1.81, "grad_norm": 0.21875, "learning_rate": 1.0497192786504228e-05, "loss": 2.1278, "step": 9775 }, { "epoch": 1.81, "grad_norm": 0.2216796875, "learning_rate": 1.0393914952273398e-05, "loss": 2.1386, "step": 9780 }, { "epoch": 1.82, "grad_norm": 0.2158203125, "learning_rate": 1.0291134132920866e-05, "loss": 2.1055, "step": 9785 }, { "epoch": 1.82, "grad_norm": 0.2177734375, "learning_rate": 1.0188850597866273e-05, "loss": 2.1044, "step": 9790 }, { "epoch": 1.82, "grad_norm": 0.2255859375, "learning_rate": 1.0087064615225683e-05, "loss": 2.1127, "step": 9795 }, { "epoch": 1.82, "grad_norm": 0.2177734375, "learning_rate": 9.985776451810936e-06, "loss": 2.1143, "step": 9800 }, { "epoch": 1.82, "grad_norm": 0.2138671875, "learning_rate": 9.884986373128934e-06, "loss": 2.1134, "step": 9805 }, { "epoch": 1.82, "grad_norm": 0.2197265625, "learning_rate": 9.78469464338092e-06, "loss": 2.1277, "step": 9810 }, { "epoch": 1.82, "grad_norm": 0.2197265625, "learning_rate": 9.684901525461865e-06, "loss": 2.1005, "step": 9815 }, { "epoch": 1.82, "grad_norm": 0.2216796875, "learning_rate": 9.58560728095974e-06, "loss": 2.1352, "step": 9820 }, { "epoch": 1.82, "grad_norm": 0.224609375, "learning_rate": 9.486812170154724e-06, "loss": 2.1596, "step": 9825 }, { "epoch": 1.82, "grad_norm": 0.21875, "learning_rate": 9.388516452018702e-06, "loss": 2.1258, "step": 9830 }, { "epoch": 1.82, "grad_norm": 0.2158203125, "learning_rate": 9.290720384214479e-06, "loss": 2.159, "step": 9835 }, { "epoch": 1.83, "grad_norm": 0.22265625, "learning_rate": 9.193424223095103e-06, "loss": 2.093, "step": 9840 }, { "epoch": 1.83, "grad_norm": 0.2177734375, "learning_rate": 9.096628223703207e-06, "loss": 2.0915, "step": 9845 }, { "epoch": 1.83, "grad_norm": 0.212890625, "learning_rate": 9.00033263977047e-06, "loss": 2.1096, "step": 9850 }, { "epoch": 1.83, "grad_norm": 0.2177734375, "learning_rate": 8.904537723716621e-06, "loss": 2.0945, "step": 9855 }, { "epoch": 1.83, "grad_norm": 0.2216796875, "learning_rate": 8.809243726649107e-06, "loss": 2.111, "step": 9860 }, { "epoch": 1.83, "grad_norm": 0.216796875, "learning_rate": 8.71445089836238e-06, "loss": 2.1174, "step": 9865 }, { "epoch": 1.83, "grad_norm": 0.21875, "learning_rate": 8.620159487337076e-06, "loss": 2.1279, "step": 9870 }, { "epoch": 1.83, "grad_norm": 0.2158203125, "learning_rate": 8.526369740739481e-06, "loss": 2.1237, "step": 9875 }, { "epoch": 1.83, "grad_norm": 0.2314453125, "learning_rate": 8.43308190442087e-06, "loss": 2.1158, "step": 9880 }, { "epoch": 1.83, "grad_norm": 0.228515625, "learning_rate": 8.340296222916921e-06, "loss": 2.1349, "step": 9885 }, { "epoch": 1.83, "grad_norm": 0.2216796875, "learning_rate": 8.24801293944688e-06, "loss": 2.0741, "step": 9890 }, { "epoch": 1.84, "grad_norm": 0.2138671875, "learning_rate": 8.15623229591318e-06, "loss": 2.1205, "step": 9895 }, { "epoch": 1.84, "grad_norm": 0.21875, "learning_rate": 8.064954532900659e-06, "loss": 2.1016, "step": 9900 }, { "epoch": 1.84, "grad_norm": 0.2158203125, "learning_rate": 7.97417988967588e-06, "loss": 2.1264, "step": 9905 }, { "epoch": 1.84, "grad_norm": 0.2177734375, "learning_rate": 7.883908604186685e-06, "loss": 2.0931, "step": 9910 }, { "epoch": 1.84, "grad_norm": 0.2177734375, "learning_rate": 7.794140913061366e-06, "loss": 2.125, "step": 9915 }, { "epoch": 1.84, "grad_norm": 0.21875, "learning_rate": 7.704877051608206e-06, "loss": 2.0809, "step": 9920 }, { "epoch": 1.84, "grad_norm": 0.2265625, "learning_rate": 7.61611725381477e-06, "loss": 2.1238, "step": 9925 }, { "epoch": 1.84, "grad_norm": 0.21484375, "learning_rate": 7.5278617523472985e-06, "loss": 2.0693, "step": 9930 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 7.440110778550224e-06, "loss": 2.1027, "step": 9935 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 7.352864562445283e-06, "loss": 2.1164, "step": 9940 }, { "epoch": 1.85, "grad_norm": 0.216796875, "learning_rate": 7.266123332731267e-06, "loss": 2.1191, "step": 9945 }, { "epoch": 1.85, "grad_norm": 0.2158203125, "learning_rate": 7.17988731678314e-06, "loss": 2.1598, "step": 9950 }, { "epoch": 1.85, "grad_norm": 0.2158203125, "learning_rate": 7.094156740651525e-06, "loss": 2.0958, "step": 9955 }, { "epoch": 1.85, "grad_norm": 0.220703125, "learning_rate": 7.0089318290622375e-06, "loss": 2.1188, "step": 9960 }, { "epoch": 1.85, "grad_norm": 0.2216796875, "learning_rate": 6.924212805415553e-06, "loss": 2.0864, "step": 9965 }, { "epoch": 1.85, "grad_norm": 0.2255859375, "learning_rate": 6.839999891785609e-06, "loss": 2.1835, "step": 9970 }, { "epoch": 1.85, "grad_norm": 0.22265625, "learning_rate": 6.756293308919892e-06, "loss": 2.091, "step": 9975 }, { "epoch": 1.85, "grad_norm": 0.2177734375, "learning_rate": 6.673093276238751e-06, "loss": 2.1132, "step": 9980 }, { "epoch": 1.85, "grad_norm": 0.224609375, "learning_rate": 6.5904000118345745e-06, "loss": 2.1281, "step": 9985 }, { "epoch": 1.85, "grad_norm": 0.21875, "learning_rate": 6.508213732471391e-06, "loss": 2.1066, "step": 9990 }, { "epoch": 1.85, "grad_norm": 0.2236328125, "learning_rate": 6.426534653584337e-06, "loss": 2.1207, "step": 9995 }, { "epoch": 1.86, "grad_norm": 0.2236328125, "learning_rate": 6.345362989278947e-06, "loss": 2.0985, "step": 10000 }, { "epoch": 1.86, "grad_norm": 0.2236328125, "learning_rate": 6.264698952330705e-06, "loss": 2.1387, "step": 10005 }, { "epoch": 1.86, "grad_norm": 0.21875, "learning_rate": 6.18454275418443e-06, "loss": 2.1239, "step": 10010 }, { "epoch": 1.86, "grad_norm": 0.22265625, "learning_rate": 6.104894604953759e-06, "loss": 2.0651, "step": 10015 }, { "epoch": 1.86, "grad_norm": 0.2255859375, "learning_rate": 6.0257547134205725e-06, "loss": 2.1373, "step": 10020 }, { "epoch": 1.86, "grad_norm": 0.216796875, "learning_rate": 5.947123287034439e-06, "loss": 2.098, "step": 10025 }, { "epoch": 1.86, "grad_norm": 0.2177734375, "learning_rate": 5.86900053191215e-06, "loss": 2.0951, "step": 10030 }, { "epoch": 1.86, "grad_norm": 0.2216796875, "learning_rate": 5.791386652837027e-06, "loss": 2.1572, "step": 10035 }, { "epoch": 1.86, "grad_norm": 0.2294921875, "learning_rate": 5.7142818532585515e-06, "loss": 2.1361, "step": 10040 }, { "epoch": 1.86, "grad_norm": 0.220703125, "learning_rate": 5.63768633529167e-06, "loss": 2.1041, "step": 10045 }, { "epoch": 1.86, "grad_norm": 0.2119140625, "learning_rate": 5.5616002997164185e-06, "loss": 2.1299, "step": 10050 }, { "epoch": 1.87, "grad_norm": 0.2294921875, "learning_rate": 5.486023945977304e-06, "loss": 2.1305, "step": 10055 }, { "epoch": 1.87, "grad_norm": 0.2138671875, "learning_rate": 5.4109574721827646e-06, "loss": 2.0973, "step": 10060 }, { "epoch": 1.87, "grad_norm": 0.2138671875, "learning_rate": 5.336401075104825e-06, "loss": 2.0838, "step": 10065 }, { "epoch": 1.87, "grad_norm": 0.21875, "learning_rate": 5.262354950178217e-06, "loss": 2.1095, "step": 10070 }, { "epoch": 1.87, "grad_norm": 0.2255859375, "learning_rate": 5.188819291500302e-06, "loss": 2.1235, "step": 10075 }, { "epoch": 1.87, "grad_norm": 0.220703125, "learning_rate": 5.115794291830245e-06, "loss": 2.1142, "step": 10080 }, { "epoch": 1.87, "grad_norm": 0.21484375, "learning_rate": 5.04328014258868e-06, "loss": 2.09, "step": 10085 }, { "epoch": 1.87, "grad_norm": 0.22265625, "learning_rate": 4.971277033857092e-06, "loss": 2.1228, "step": 10090 }, { "epoch": 1.87, "grad_norm": 0.2138671875, "learning_rate": 4.89978515437739e-06, "loss": 2.0957, "step": 10095 }, { "epoch": 1.87, "grad_norm": 0.2216796875, "learning_rate": 4.828804691551448e-06, "loss": 2.1217, "step": 10100 }, { "epoch": 1.87, "grad_norm": 0.2197265625, "learning_rate": 4.758335831440497e-06, "loss": 2.133, "step": 10105 }, { "epoch": 1.88, "grad_norm": 0.216796875, "learning_rate": 4.688378758764689e-06, "loss": 2.1422, "step": 10110 }, { "epoch": 1.88, "grad_norm": 0.2177734375, "learning_rate": 4.618933656902758e-06, "loss": 2.1195, "step": 10115 }, { "epoch": 1.88, "grad_norm": 0.22265625, "learning_rate": 4.5500007078911996e-06, "loss": 2.0933, "step": 10120 }, { "epoch": 1.88, "grad_norm": 0.2138671875, "learning_rate": 4.481580092424187e-06, "loss": 2.1086, "step": 10125 }, { "epoch": 1.88, "grad_norm": 0.21484375, "learning_rate": 4.41367198985283e-06, "loss": 2.09, "step": 10130 }, { "epoch": 1.88, "grad_norm": 0.2216796875, "learning_rate": 4.3462765781848045e-06, "loss": 2.107, "step": 10135 }, { "epoch": 1.88, "grad_norm": 0.2197265625, "learning_rate": 4.279394034083839e-06, "loss": 2.0996, "step": 10140 }, { "epoch": 1.88, "grad_norm": 0.2197265625, "learning_rate": 4.213024532869314e-06, "loss": 2.1024, "step": 10145 }, { "epoch": 1.88, "grad_norm": 0.21875, "learning_rate": 4.147168248515798e-06, "loss": 2.1083, "step": 10150 }, { "epoch": 1.88, "grad_norm": 0.22265625, "learning_rate": 4.081825353652424e-06, "loss": 2.1142, "step": 10155 }, { "epoch": 1.88, "grad_norm": 0.22265625, "learning_rate": 4.01699601956278e-06, "loss": 2.1095, "step": 10160 }, { "epoch": 1.89, "grad_norm": 0.2373046875, "learning_rate": 3.952680416184151e-06, "loss": 2.097, "step": 10165 }, { "epoch": 1.89, "grad_norm": 0.2275390625, "learning_rate": 3.88887871210708e-06, "loss": 2.0742, "step": 10170 }, { "epoch": 1.89, "grad_norm": 0.220703125, "learning_rate": 3.825591074575208e-06, "loss": 2.098, "step": 10175 }, { "epoch": 1.89, "grad_norm": 0.2138671875, "learning_rate": 3.762817669484564e-06, "loss": 2.1232, "step": 10180 }, { "epoch": 1.89, "grad_norm": 0.2294921875, "learning_rate": 3.700558661383191e-06, "loss": 2.1437, "step": 10185 }, { "epoch": 1.89, "grad_norm": 0.2158203125, "learning_rate": 3.638814213470787e-06, "loss": 2.0822, "step": 10190 }, { "epoch": 1.89, "grad_norm": 0.2333984375, "learning_rate": 3.577584487598218e-06, "loss": 2.1089, "step": 10195 }, { "epoch": 1.89, "grad_norm": 0.2197265625, "learning_rate": 3.51686964426714e-06, "loss": 2.1156, "step": 10200 }, { "epoch": 1.89, "grad_norm": 0.22265625, "learning_rate": 3.4566698426294674e-06, "loss": 2.1033, "step": 10205 }, { "epoch": 1.89, "grad_norm": 0.2158203125, "learning_rate": 3.396985240487105e-06, "loss": 2.0581, "step": 10210 }, { "epoch": 1.9, "grad_norm": 0.22265625, "learning_rate": 3.3378159942914376e-06, "loss": 2.1041, "step": 10215 }, { "epoch": 1.9, "grad_norm": 0.21484375, "learning_rate": 3.2791622591429536e-06, "loss": 2.089, "step": 10220 }, { "epoch": 1.9, "grad_norm": 0.220703125, "learning_rate": 3.2210241887908444e-06, "loss": 2.1456, "step": 10225 }, { "epoch": 1.9, "grad_norm": 0.22265625, "learning_rate": 3.163401935632537e-06, "loss": 2.1347, "step": 10230 }, { "epoch": 1.9, "grad_norm": 0.2158203125, "learning_rate": 3.1062956507133867e-06, "loss": 2.0823, "step": 10235 }, { "epoch": 1.9, "grad_norm": 0.2236328125, "learning_rate": 3.0497054837262506e-06, "loss": 2.0742, "step": 10240 }, { "epoch": 1.9, "grad_norm": 0.2255859375, "learning_rate": 2.9936315830110473e-06, "loss": 2.1019, "step": 10245 }, { "epoch": 1.9, "grad_norm": 0.21875, "learning_rate": 2.938074095554444e-06, "loss": 2.0905, "step": 10250 }, { "epoch": 1.9, "grad_norm": 0.21875, "learning_rate": 2.883033166989413e-06, "loss": 2.0628, "step": 10255 }, { "epoch": 1.9, "grad_norm": 0.2197265625, "learning_rate": 2.828508941594854e-06, "loss": 2.1302, "step": 10260 }, { "epoch": 1.9, "grad_norm": 0.2177734375, "learning_rate": 2.774501562295262e-06, "loss": 2.0996, "step": 10265 }, { "epoch": 1.91, "grad_norm": 0.2197265625, "learning_rate": 2.7210111706603036e-06, "loss": 2.113, "step": 10270 }, { "epoch": 1.91, "grad_norm": 0.220703125, "learning_rate": 2.6680379069044416e-06, "loss": 2.1421, "step": 10275 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 2.6155819098866664e-06, "loss": 2.1139, "step": 10280 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 2.563643317109965e-06, "loss": 2.1383, "step": 10285 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 2.5122222647211424e-06, "loss": 2.1012, "step": 10290 }, { "epoch": 1.91, "grad_norm": 0.224609375, "learning_rate": 2.4613188875102667e-06, "loss": 2.1486, "step": 10295 }, { "epoch": 1.91, "grad_norm": 0.2216796875, "learning_rate": 2.410933318910513e-06, "loss": 2.1413, "step": 10300 }, { "epoch": 1.91, "grad_norm": 0.22265625, "learning_rate": 2.3610656909976993e-06, "loss": 2.1081, "step": 10305 }, { "epoch": 1.91, "grad_norm": 0.224609375, "learning_rate": 2.3117161344899274e-06, "loss": 2.1173, "step": 10310 }, { "epoch": 1.91, "grad_norm": 0.220703125, "learning_rate": 2.2628847787473427e-06, "loss": 2.1125, "step": 10315 }, { "epoch": 1.91, "grad_norm": 0.216796875, "learning_rate": 2.2145717517716437e-06, "loss": 2.1233, "step": 10320 }, { "epoch": 1.92, "grad_norm": 0.2177734375, "learning_rate": 2.1667771802059255e-06, "loss": 2.1439, "step": 10325 }, { "epoch": 1.92, "grad_norm": 0.2197265625, "learning_rate": 2.1195011893341945e-06, "loss": 2.1062, "step": 10330 }, { "epoch": 1.92, "grad_norm": 0.2158203125, "learning_rate": 2.07274390308112e-06, "loss": 2.1253, "step": 10335 }, { "epoch": 1.92, "grad_norm": 0.22265625, "learning_rate": 2.026505444011684e-06, "loss": 2.1364, "step": 10340 }, { "epoch": 1.92, "grad_norm": 0.21875, "learning_rate": 1.9807859333308865e-06, "loss": 2.1224, "step": 10345 }, { "epoch": 1.92, "grad_norm": 0.2255859375, "learning_rate": 1.9355854908833514e-06, "loss": 2.1277, "step": 10350 }, { "epoch": 1.92, "grad_norm": 0.220703125, "learning_rate": 1.8909042351531459e-06, "loss": 2.1251, "step": 10355 }, { "epoch": 1.92, "grad_norm": 0.22265625, "learning_rate": 1.8467422832633142e-06, "loss": 2.1136, "step": 10360 }, { "epoch": 1.92, "grad_norm": 0.2265625, "learning_rate": 1.8030997509757007e-06, "loss": 2.1412, "step": 10365 }, { "epoch": 1.92, "grad_norm": 0.21484375, "learning_rate": 1.7599767526905953e-06, "loss": 2.04, "step": 10370 }, { "epoch": 1.92, "grad_norm": 0.2177734375, "learning_rate": 1.717373401446376e-06, "loss": 2.0922, "step": 10375 }, { "epoch": 1.93, "grad_norm": 0.212890625, "learning_rate": 1.675289808919378e-06, "loss": 2.0762, "step": 10380 }, { "epoch": 1.93, "grad_norm": 0.2158203125, "learning_rate": 1.633726085423337e-06, "loss": 2.138, "step": 10385 }, { "epoch": 1.93, "grad_norm": 0.2236328125, "learning_rate": 1.5926823399093905e-06, "loss": 2.1341, "step": 10390 }, { "epoch": 1.93, "grad_norm": 0.220703125, "learning_rate": 1.5521586799655874e-06, "loss": 2.0983, "step": 10395 }, { "epoch": 1.93, "grad_norm": 0.224609375, "learning_rate": 1.5121552118167125e-06, "loss": 2.1025, "step": 10400 }, { "epoch": 1.93, "grad_norm": 0.2138671875, "learning_rate": 1.472672040323908e-06, "loss": 2.099, "step": 10405 }, { "epoch": 1.93, "grad_norm": 0.2236328125, "learning_rate": 1.4337092689845844e-06, "loss": 2.1013, "step": 10410 }, { "epoch": 1.93, "grad_norm": 0.2177734375, "learning_rate": 1.3952669999318657e-06, "loss": 2.1162, "step": 10415 }, { "epoch": 1.93, "grad_norm": 0.2177734375, "learning_rate": 1.3573453339345898e-06, "loss": 2.1213, "step": 10420 }, { "epoch": 1.93, "grad_norm": 0.2216796875, "learning_rate": 1.3199443703969083e-06, "loss": 2.1008, "step": 10425 }, { "epoch": 1.94, "grad_norm": 0.2216796875, "learning_rate": 1.2830642073580645e-06, "loss": 2.1289, "step": 10430 }, { "epoch": 1.94, "grad_norm": 0.2294921875, "learning_rate": 1.2467049414921273e-06, "loss": 2.1208, "step": 10435 }, { "epoch": 1.94, "grad_norm": 0.2236328125, "learning_rate": 1.2108666681076796e-06, "loss": 2.1018, "step": 10440 }, { "epoch": 1.94, "grad_norm": 0.228515625, "learning_rate": 1.175549481147753e-06, "loss": 2.1416, "step": 10445 }, { "epoch": 1.94, "grad_norm": 0.220703125, "learning_rate": 1.1407534731892933e-06, "loss": 2.1281, "step": 10450 }, { "epoch": 1.94, "grad_norm": 0.2138671875, "learning_rate": 1.106478735443184e-06, "loss": 2.0847, "step": 10455 }, { "epoch": 1.94, "grad_norm": 0.2138671875, "learning_rate": 1.07272535775389e-06, "loss": 2.078, "step": 10460 }, { "epoch": 1.94, "grad_norm": 0.216796875, "learning_rate": 1.039493428599192e-06, "loss": 2.1367, "step": 10465 }, { "epoch": 1.94, "grad_norm": 0.21875, "learning_rate": 1.0067830350900532e-06, "loss": 2.12, "step": 10470 }, { "epoch": 1.94, "grad_norm": 0.220703125, "learning_rate": 9.745942629703075e-07, "loss": 2.0917, "step": 10475 }, { "epoch": 1.94, "grad_norm": 0.2197265625, "learning_rate": 9.429271966164388e-07, "loss": 2.1365, "step": 10480 }, { "epoch": 1.95, "grad_norm": 0.2177734375, "learning_rate": 9.117819190374022e-07, "loss": 2.1, "step": 10485 }, { "epoch": 1.95, "grad_norm": 0.2236328125, "learning_rate": 8.811585118744026e-07, "loss": 2.0986, "step": 10490 }, { "epoch": 1.95, "grad_norm": 0.2265625, "learning_rate": 8.510570554006502e-07, "loss": 2.1679, "step": 10495 }, { "epoch": 1.95, "grad_norm": 0.212890625, "learning_rate": 8.21477628521139e-07, "loss": 2.1362, "step": 10500 }, { "epoch": 1.95, "grad_norm": 0.220703125, "learning_rate": 7.924203087725124e-07, "loss": 2.0937, "step": 10505 }, { "epoch": 1.95, "grad_norm": 0.21875, "learning_rate": 7.63885172322798e-07, "loss": 2.1098, "step": 10510 }, { "epoch": 1.95, "grad_norm": 0.216796875, "learning_rate": 7.35872293971207e-07, "loss": 2.1114, "step": 10515 }, { "epoch": 1.95, "grad_norm": 0.2158203125, "learning_rate": 7.083817471479349e-07, "loss": 2.1369, "step": 10520 }, { "epoch": 1.95, "grad_norm": 0.212890625, "learning_rate": 6.814136039140717e-07, "loss": 2.0751, "step": 10525 }, { "epoch": 1.95, "grad_norm": 0.2197265625, "learning_rate": 6.54967934961248e-07, "loss": 2.1256, "step": 10530 }, { "epoch": 1.95, "grad_norm": 0.2294921875, "learning_rate": 6.290448096115453e-07, "loss": 2.0886, "step": 10535 }, { "epoch": 1.96, "grad_norm": 0.216796875, "learning_rate": 6.036442958173183e-07, "loss": 2.1488, "step": 10540 }, { "epoch": 1.96, "grad_norm": 0.22265625, "learning_rate": 5.787664601609954e-07, "loss": 2.1258, "step": 10545 }, { "epoch": 1.96, "grad_norm": 0.216796875, "learning_rate": 5.544113678549235e-07, "loss": 2.0979, "step": 10550 }, { "epoch": 1.96, "grad_norm": 0.2177734375, "learning_rate": 5.30579082741145e-07, "loss": 2.0995, "step": 10555 }, { "epoch": 1.96, "grad_norm": 0.220703125, "learning_rate": 5.072696672913102e-07, "loss": 2.1074, "step": 10560 }, { "epoch": 1.96, "grad_norm": 0.2158203125, "learning_rate": 4.8448318260641e-07, "loss": 2.0968, "step": 10565 }, { "epoch": 1.96, "grad_norm": 0.22265625, "learning_rate": 4.622196884167318e-07, "loss": 2.1238, "step": 10570 }, { "epoch": 1.96, "grad_norm": 0.21875, "learning_rate": 4.4047924308161516e-07, "loss": 2.0813, "step": 10575 }, { "epoch": 1.96, "grad_norm": 0.2236328125, "learning_rate": 4.1926190358934083e-07, "loss": 2.1372, "step": 10580 }, { "epoch": 1.96, "grad_norm": 0.216796875, "learning_rate": 3.985677255569753e-07, "loss": 2.0857, "step": 10585 }, { "epoch": 1.96, "grad_norm": 0.2177734375, "learning_rate": 3.7839676323023765e-07, "loss": 2.1184, "step": 10590 }, { "epoch": 1.97, "grad_norm": 0.2197265625, "learning_rate": 3.5874906948327737e-07, "loss": 2.1057, "step": 10595 }, { "epoch": 1.97, "grad_norm": 0.2138671875, "learning_rate": 3.396246958186744e-07, "loss": 2.1105, "step": 10600 }, { "epoch": 1.97, "grad_norm": 0.21875, "learning_rate": 3.210236923671728e-07, "loss": 2.1313, "step": 10605 }, { "epoch": 1.97, "grad_norm": 0.2255859375, "learning_rate": 3.029461078876361e-07, "loss": 2.158, "step": 10610 }, { "epoch": 1.97, "grad_norm": 0.2294921875, "learning_rate": 2.8539198976686997e-07, "loss": 2.1015, "step": 10615 }, { "epoch": 1.97, "grad_norm": 0.21484375, "learning_rate": 2.6836138401955534e-07, "loss": 2.1259, "step": 10620 }, { "epoch": 1.97, "grad_norm": 0.22265625, "learning_rate": 2.518543352880265e-07, "loss": 2.1453, "step": 10625 }, { "epoch": 1.97, "grad_norm": 0.2197265625, "learning_rate": 2.35870886842271e-07, "loss": 2.1135, "step": 10630 }, { "epoch": 1.97, "grad_norm": 0.2177734375, "learning_rate": 2.204110805797077e-07, "loss": 2.0939, "step": 10635 }, { "epoch": 1.97, "grad_norm": 0.2177734375, "learning_rate": 2.0547495702518682e-07, "loss": 2.1107, "step": 10640 }, { "epoch": 1.97, "grad_norm": 0.2265625, "learning_rate": 1.9106255533083428e-07, "loss": 2.0952, "step": 10645 }, { "epoch": 1.98, "grad_norm": 0.21875, "learning_rate": 1.7717391327585208e-07, "loss": 2.1414, "step": 10650 }, { "epoch": 1.98, "grad_norm": 0.220703125, "learning_rate": 1.6380906726660705e-07, "loss": 2.1247, "step": 10655 }, { "epoch": 1.98, "grad_norm": 0.2197265625, "learning_rate": 1.5096805233638654e-07, "loss": 2.1159, "step": 10660 }, { "epoch": 1.98, "grad_norm": 0.2158203125, "learning_rate": 1.3865090214539856e-07, "loss": 2.0838, "step": 10665 }, { "epoch": 1.98, "grad_norm": 0.220703125, "learning_rate": 1.2685764898059393e-07, "loss": 2.0818, "step": 10670 }, { "epoch": 1.98, "grad_norm": 0.21875, "learning_rate": 1.1558832375566653e-07, "loss": 2.0904, "step": 10675 }, { "epoch": 1.98, "grad_norm": 0.2197265625, "learning_rate": 1.0484295601089767e-07, "loss": 2.137, "step": 10680 }, { "epoch": 1.98, "grad_norm": 0.2294921875, "learning_rate": 9.462157391317838e-08, "loss": 2.1282, "step": 10685 }, { "epoch": 1.98, "grad_norm": 0.2255859375, "learning_rate": 8.492420425583181e-08, "loss": 2.1308, "step": 10690 }, { "epoch": 1.98, "grad_norm": 0.21484375, "learning_rate": 7.575087245861313e-08, "loss": 2.0737, "step": 10695 }, { "epoch": 1.99, "grad_norm": 0.21484375, "learning_rate": 6.710160256755416e-08, "loss": 2.102, "step": 10700 }, { "epoch": 1.99, "grad_norm": 0.2236328125, "learning_rate": 5.897641725505221e-08, "loss": 2.0975, "step": 10705 }, { "epoch": 1.99, "grad_norm": 0.2177734375, "learning_rate": 5.137533781964798e-08, "loss": 2.0821, "step": 10710 }, { "epoch": 1.99, "grad_norm": 0.220703125, "learning_rate": 4.429838418607002e-08, "loss": 2.1072, "step": 10715 }, { "epoch": 1.99, "grad_norm": 0.2158203125, "learning_rate": 3.774557490516806e-08, "loss": 2.1058, "step": 10720 }, { "epoch": 1.99, "grad_norm": 0.224609375, "learning_rate": 3.171692715382424e-08, "loss": 2.1674, "step": 10725 }, { "epoch": 1.99, "grad_norm": 0.2197265625, "learning_rate": 2.6212456734953094e-08, "loss": 2.1061, "step": 10730 }, { "epoch": 1.99, "grad_norm": 0.2216796875, "learning_rate": 2.123217807743494e-08, "loss": 2.1099, "step": 10735 }, { "epoch": 1.99, "grad_norm": 0.2216796875, "learning_rate": 1.6776104236071455e-08, "loss": 2.1166, "step": 10740 }, { "epoch": 1.99, "grad_norm": 0.220703125, "learning_rate": 1.2844246891607904e-08, "loss": 2.1206, "step": 10745 }, { "epoch": 1.99, "grad_norm": 0.2177734375, "learning_rate": 9.436616350622096e-09, "loss": 2.1129, "step": 10750 }, { "epoch": 2.0, "grad_norm": 0.220703125, "learning_rate": 6.5532215455244015e-09, "loss": 2.0892, "step": 10755 }, { "epoch": 2.0, "grad_norm": 0.228515625, "learning_rate": 4.194070034579944e-09, "loss": 2.139, "step": 10760 }, { "epoch": 2.0, "grad_norm": 0.2255859375, "learning_rate": 2.3591680018419935e-09, "loss": 2.1614, "step": 10765 }, { "epoch": 2.0, "grad_norm": 0.2177734375, "learning_rate": 1.0485202571297593e-09, "loss": 2.118, "step": 10770 }, { "epoch": 2.0, "grad_norm": 0.2177734375, "learning_rate": 2.621302360727995e-10, "loss": 2.111, "step": 10775 }, { "epoch": 2.0, "grad_norm": 0.427734375, "learning_rate": 0.0, "loss": 2.0952, "step": 10780 }, { "epoch": 2.0, "eval_loss": 2.1485531330108643, "eval_runtime": 171.3478, "eval_samples_per_second": 28.299, "eval_steps_per_second": 3.543, "step": 10780 }, { "epoch": 2.0, "step": 10780, "total_flos": 6.122136059783414e+17, "train_loss": 2.151964877841645, "train_runtime": 21635.5911, "train_samples_per_second": 7.971, "train_steps_per_second": 0.498 } ], "logging_steps": 5, "max_steps": 10780, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 6.122136059783414e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }