{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1858450698535056, "eval_steps": 500, "global_step": 23000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.080220428413287e-05, "grad_norm": 120784.5234375, "learning_rate": 4.040404040404041e-08, "loss": 6344.0191, "step": 10 }, { "epoch": 0.00016160440856826573, "grad_norm": 246101.4375, "learning_rate": 8.080808080808082e-08, "loss": 7230.2391, "step": 20 }, { "epoch": 0.00024240661285239863, "grad_norm": 220140.828125, "learning_rate": 1.2121212121212122e-07, "loss": 7248.1844, "step": 30 }, { "epoch": 0.00032320881713653147, "grad_norm": 468224.40625, "learning_rate": 1.6161616161616163e-07, "loss": 8245.5844, "step": 40 }, { "epoch": 0.00040401102142066436, "grad_norm": 129730.1171875, "learning_rate": 2.0202020202020202e-07, "loss": 5464.3164, "step": 50 }, { "epoch": 0.00048481322570479725, "grad_norm": 37984.21484375, "learning_rate": 2.4242424242424244e-07, "loss": 7551.0562, "step": 60 }, { "epoch": 0.0005656154299889301, "grad_norm": 170187.078125, "learning_rate": 2.8282828282828283e-07, "loss": 6856.2141, "step": 70 }, { "epoch": 0.0006464176342730629, "grad_norm": 64738.05078125, "learning_rate": 3.2323232323232327e-07, "loss": 7045.2477, "step": 80 }, { "epoch": 0.0007272198385571959, "grad_norm": 114463.8515625, "learning_rate": 3.6363636363636366e-07, "loss": 5803.459, "step": 90 }, { "epoch": 0.0008080220428413287, "grad_norm": 102587.6796875, "learning_rate": 4.0404040404040405e-07, "loss": 3976.1211, "step": 100 }, { "epoch": 0.0008888242471254616, "grad_norm": 78112.53125, "learning_rate": 4.444444444444445e-07, "loss": 3535.432, "step": 110 }, { "epoch": 0.0009696264514095945, "grad_norm": 100944.5703125, "learning_rate": 4.848484848484849e-07, "loss": 4244.4645, "step": 120 }, { "epoch": 0.0010504286556937273, "grad_norm": 178465.359375, "learning_rate": 5.252525252525253e-07, "loss": 5655.2863, "step": 130 }, { "epoch": 0.0011312308599778602, "grad_norm": 70585.359375, "learning_rate": 5.656565656565657e-07, "loss": 3637.5656, "step": 140 }, { "epoch": 0.001212033064261993, "grad_norm": 51223.41015625, "learning_rate": 6.060606060606061e-07, "loss": 2629.8635, "step": 150 }, { "epoch": 0.0012928352685461259, "grad_norm": 53755.01953125, "learning_rate": 6.464646464646465e-07, "loss": 4522.2559, "step": 160 }, { "epoch": 0.0013736374728302587, "grad_norm": 72124.0625, "learning_rate": 6.868686868686869e-07, "loss": 3395.8777, "step": 170 }, { "epoch": 0.0014544396771143918, "grad_norm": 11780.193359375, "learning_rate": 7.272727272727273e-07, "loss": 1784.9486, "step": 180 }, { "epoch": 0.0015352418813985246, "grad_norm": 62803.8828125, "learning_rate": 7.676767676767678e-07, "loss": 2220.277, "step": 190 }, { "epoch": 0.0016160440856826574, "grad_norm": 118829.34375, "learning_rate": 8.080808080808081e-07, "loss": 2133.5133, "step": 200 }, { "epoch": 0.0016968462899667903, "grad_norm": 7391.849609375, "learning_rate": 8.484848484848486e-07, "loss": 1618.4662, "step": 210 }, { "epoch": 0.0017776484942509231, "grad_norm": 7889.263671875, "learning_rate": 8.88888888888889e-07, "loss": 1578.3401, "step": 220 }, { "epoch": 0.001858450698535056, "grad_norm": 10828.140625, "learning_rate": 9.292929292929294e-07, "loss": 1327.2748, "step": 230 }, { "epoch": 0.001939252902819189, "grad_norm": 15991.2119140625, "learning_rate": 9.696969696969698e-07, "loss": 1342.3516, "step": 240 }, { "epoch": 0.0020200551071033216, "grad_norm": 7872.46484375, "learning_rate": 1.0101010101010103e-06, "loss": 1077.598, "step": 250 }, { "epoch": 0.0021008573113874547, "grad_norm": 3977.63623046875, "learning_rate": 1.0505050505050506e-06, "loss": 835.4583, "step": 260 }, { "epoch": 0.0021816595156715873, "grad_norm": 3596.71923828125, "learning_rate": 1.090909090909091e-06, "loss": 766.0804, "step": 270 }, { "epoch": 0.0022624617199557204, "grad_norm": 10194.791015625, "learning_rate": 1.1313131313131313e-06, "loss": 777.5695, "step": 280 }, { "epoch": 0.0023432639242398534, "grad_norm": 7022.6103515625, "learning_rate": 1.1717171717171719e-06, "loss": 639.281, "step": 290 }, { "epoch": 0.002424066128523986, "grad_norm": 2920.81787109375, "learning_rate": 1.2121212121212122e-06, "loss": 703.318, "step": 300 }, { "epoch": 0.002504868332808119, "grad_norm": 4137.4970703125, "learning_rate": 1.2525252525252527e-06, "loss": 766.8718, "step": 310 }, { "epoch": 0.0025856705370922517, "grad_norm": 8379.064453125, "learning_rate": 1.292929292929293e-06, "loss": 625.7259, "step": 320 }, { "epoch": 0.002666472741376385, "grad_norm": 2632.939697265625, "learning_rate": 1.3333333333333334e-06, "loss": 529.0049, "step": 330 }, { "epoch": 0.0027472749456605174, "grad_norm": 3072.3486328125, "learning_rate": 1.3737373737373738e-06, "loss": 557.5195, "step": 340 }, { "epoch": 0.0028280771499446505, "grad_norm": 2430.994384765625, "learning_rate": 1.4141414141414143e-06, "loss": 548.776, "step": 350 }, { "epoch": 0.0029088793542287835, "grad_norm": 2531.90771484375, "learning_rate": 1.4545454545454546e-06, "loss": 602.2291, "step": 360 }, { "epoch": 0.002989681558512916, "grad_norm": 5374.16552734375, "learning_rate": 1.4949494949494952e-06, "loss": 447.6404, "step": 370 }, { "epoch": 0.003070483762797049, "grad_norm": 1357.3726806640625, "learning_rate": 1.5353535353535355e-06, "loss": 342.7574, "step": 380 }, { "epoch": 0.003151285967081182, "grad_norm": 1249.0936279296875, "learning_rate": 1.5757575757575759e-06, "loss": 562.0835, "step": 390 }, { "epoch": 0.003232088171365315, "grad_norm": 1270.3609619140625, "learning_rate": 1.6161616161616162e-06, "loss": 467.299, "step": 400 }, { "epoch": 0.0033128903756494475, "grad_norm": 2576.8291015625, "learning_rate": 1.6565656565656565e-06, "loss": 448.897, "step": 410 }, { "epoch": 0.0033936925799335806, "grad_norm": 1376.2908935546875, "learning_rate": 1.6969696969696973e-06, "loss": 382.242, "step": 420 }, { "epoch": 0.0034744947842177136, "grad_norm": 1212.802490234375, "learning_rate": 1.7373737373737376e-06, "loss": 486.5346, "step": 430 }, { "epoch": 0.0035552969885018462, "grad_norm": 801.7883911132812, "learning_rate": 1.777777777777778e-06, "loss": 452.1537, "step": 440 }, { "epoch": 0.0036360991927859793, "grad_norm": 1590.736572265625, "learning_rate": 1.818181818181818e-06, "loss": 449.5157, "step": 450 }, { "epoch": 0.003716901397070112, "grad_norm": 3368.974853515625, "learning_rate": 1.8585858585858588e-06, "loss": 445.9489, "step": 460 }, { "epoch": 0.003797703601354245, "grad_norm": 2164.530517578125, "learning_rate": 1.8989898989898992e-06, "loss": 449.8674, "step": 470 }, { "epoch": 0.003878505805638378, "grad_norm": 1234.2860107421875, "learning_rate": 1.9393939393939395e-06, "loss": 375.5616, "step": 480 }, { "epoch": 0.003959308009922511, "grad_norm": 1191.9912109375, "learning_rate": 1.9797979797979796e-06, "loss": 394.9626, "step": 490 }, { "epoch": 0.004040110214206643, "grad_norm": 2829.901611328125, "learning_rate": 2.0202020202020206e-06, "loss": 398.194, "step": 500 }, { "epoch": 0.004120912418490777, "grad_norm": 3573.80322265625, "learning_rate": 2.0606060606060607e-06, "loss": 440.5374, "step": 510 }, { "epoch": 0.004201714622774909, "grad_norm": 991.9559326171875, "learning_rate": 2.1010101010101013e-06, "loss": 552.4083, "step": 520 }, { "epoch": 0.004282516827059042, "grad_norm": 2654.274658203125, "learning_rate": 2.1414141414141414e-06, "loss": 414.1354, "step": 530 }, { "epoch": 0.004363319031343175, "grad_norm": 1416.258056640625, "learning_rate": 2.181818181818182e-06, "loss": 443.3496, "step": 540 }, { "epoch": 0.004444121235627308, "grad_norm": 1433.0880126953125, "learning_rate": 2.2222222222222225e-06, "loss": 476.5702, "step": 550 }, { "epoch": 0.004524923439911441, "grad_norm": 852.6603393554688, "learning_rate": 2.2626262626262626e-06, "loss": 425.1744, "step": 560 }, { "epoch": 0.004605725644195573, "grad_norm": 1662.8759765625, "learning_rate": 2.303030303030303e-06, "loss": 432.2859, "step": 570 }, { "epoch": 0.004686527848479707, "grad_norm": 1177.7154541015625, "learning_rate": 2.3434343434343437e-06, "loss": 472.836, "step": 580 }, { "epoch": 0.0047673300527638395, "grad_norm": 1203.2510986328125, "learning_rate": 2.383838383838384e-06, "loss": 400.3267, "step": 590 }, { "epoch": 0.004848132257047972, "grad_norm": 806.4482421875, "learning_rate": 2.4242424242424244e-06, "loss": 381.2281, "step": 600 }, { "epoch": 0.004928934461332105, "grad_norm": 1223.798095703125, "learning_rate": 2.4646464646464645e-06, "loss": 314.522, "step": 610 }, { "epoch": 0.005009736665616238, "grad_norm": 1220.419189453125, "learning_rate": 2.5050505050505055e-06, "loss": 391.7079, "step": 620 }, { "epoch": 0.005090538869900371, "grad_norm": 1846.539306640625, "learning_rate": 2.5454545454545456e-06, "loss": 415.6819, "step": 630 }, { "epoch": 0.0051713410741845035, "grad_norm": 1243.7620849609375, "learning_rate": 2.585858585858586e-06, "loss": 437.1002, "step": 640 }, { "epoch": 0.005252143278468637, "grad_norm": 1140.9453125, "learning_rate": 2.6262626262626263e-06, "loss": 415.1159, "step": 650 }, { "epoch": 0.00533294548275277, "grad_norm": 4821.37060546875, "learning_rate": 2.666666666666667e-06, "loss": 464.5417, "step": 660 }, { "epoch": 0.005413747687036902, "grad_norm": 1713.6087646484375, "learning_rate": 2.7070707070707074e-06, "loss": 375.5785, "step": 670 }, { "epoch": 0.005494549891321035, "grad_norm": 2620.644287109375, "learning_rate": 2.7474747474747475e-06, "loss": 299.2374, "step": 680 }, { "epoch": 0.005575352095605168, "grad_norm": 1759.0950927734375, "learning_rate": 2.787878787878788e-06, "loss": 415.4854, "step": 690 }, { "epoch": 0.005656154299889301, "grad_norm": 1536.8719482421875, "learning_rate": 2.8282828282828286e-06, "loss": 449.5622, "step": 700 }, { "epoch": 0.0057369565041734336, "grad_norm": 2095.785400390625, "learning_rate": 2.8686868686868687e-06, "loss": 380.9472, "step": 710 }, { "epoch": 0.005817758708457567, "grad_norm": 1478.4825439453125, "learning_rate": 2.9090909090909093e-06, "loss": 390.7729, "step": 720 }, { "epoch": 0.0058985609127417, "grad_norm": 1876.679443359375, "learning_rate": 2.9494949494949494e-06, "loss": 391.9601, "step": 730 }, { "epoch": 0.005979363117025832, "grad_norm": 749.0634765625, "learning_rate": 2.9898989898989904e-06, "loss": 253.9756, "step": 740 }, { "epoch": 0.006060165321309965, "grad_norm": 906.431396484375, "learning_rate": 3.0303030303030305e-06, "loss": 347.2193, "step": 750 }, { "epoch": 0.006140967525594098, "grad_norm": 1034.1180419921875, "learning_rate": 3.070707070707071e-06, "loss": 311.8948, "step": 760 }, { "epoch": 0.006221769729878231, "grad_norm": 2706.747802734375, "learning_rate": 3.111111111111111e-06, "loss": 329.9375, "step": 770 }, { "epoch": 0.006302571934162364, "grad_norm": 1195.091064453125, "learning_rate": 3.1515151515151517e-06, "loss": 303.2004, "step": 780 }, { "epoch": 0.006383374138446497, "grad_norm": 1622.099609375, "learning_rate": 3.191919191919192e-06, "loss": 359.3405, "step": 790 }, { "epoch": 0.00646417634273063, "grad_norm": 1255.0582275390625, "learning_rate": 3.2323232323232324e-06, "loss": 362.8006, "step": 800 }, { "epoch": 0.006544978547014762, "grad_norm": 1571.966552734375, "learning_rate": 3.2727272727272733e-06, "loss": 372.003, "step": 810 }, { "epoch": 0.006625780751298895, "grad_norm": 1350.624755859375, "learning_rate": 3.313131313131313e-06, "loss": 391.2471, "step": 820 }, { "epoch": 0.0067065829555830285, "grad_norm": 1288.0430908203125, "learning_rate": 3.3535353535353536e-06, "loss": 381.0373, "step": 830 }, { "epoch": 0.006787385159867161, "grad_norm": 1756.347900390625, "learning_rate": 3.3939393939393946e-06, "loss": 366.8195, "step": 840 }, { "epoch": 0.006868187364151294, "grad_norm": 1164.05224609375, "learning_rate": 3.4343434343434343e-06, "loss": 340.2402, "step": 850 }, { "epoch": 0.006948989568435427, "grad_norm": 6004.3291015625, "learning_rate": 3.4747474747474752e-06, "loss": 397.7737, "step": 860 }, { "epoch": 0.00702979177271956, "grad_norm": 1949.6370849609375, "learning_rate": 3.515151515151515e-06, "loss": 352.9498, "step": 870 }, { "epoch": 0.0071105939770036925, "grad_norm": 823.8360595703125, "learning_rate": 3.555555555555556e-06, "loss": 432.1595, "step": 880 }, { "epoch": 0.007191396181287825, "grad_norm": 3512.72607421875, "learning_rate": 3.5959595959595965e-06, "loss": 342.5901, "step": 890 }, { "epoch": 0.007272198385571959, "grad_norm": 1352.2506103515625, "learning_rate": 3.636363636363636e-06, "loss": 306.887, "step": 900 }, { "epoch": 0.007353000589856091, "grad_norm": 2983.867919921875, "learning_rate": 3.676767676767677e-06, "loss": 381.7238, "step": 910 }, { "epoch": 0.007433802794140224, "grad_norm": 2423.1806640625, "learning_rate": 3.7171717171717177e-06, "loss": 303.3808, "step": 920 }, { "epoch": 0.007514604998424357, "grad_norm": 953.1580810546875, "learning_rate": 3.757575757575758e-06, "loss": 380.6255, "step": 930 }, { "epoch": 0.00759540720270849, "grad_norm": 5818.9609375, "learning_rate": 3.7979797979797984e-06, "loss": 372.4149, "step": 940 }, { "epoch": 0.007676209406992623, "grad_norm": 1317.5467529296875, "learning_rate": 3.8383838383838385e-06, "loss": 369.7815, "step": 950 }, { "epoch": 0.007757011611276756, "grad_norm": 929.1298217773438, "learning_rate": 3.878787878787879e-06, "loss": 395.9197, "step": 960 }, { "epoch": 0.007837813815560889, "grad_norm": 1342.6861572265625, "learning_rate": 3.9191919191919196e-06, "loss": 384.7229, "step": 970 }, { "epoch": 0.007918616019845021, "grad_norm": 1253.3720703125, "learning_rate": 3.959595959595959e-06, "loss": 302.0778, "step": 980 }, { "epoch": 0.007999418224129154, "grad_norm": 966.1517333984375, "learning_rate": 4.000000000000001e-06, "loss": 407.2742, "step": 990 }, { "epoch": 0.008080220428413287, "grad_norm": 949.992919921875, "learning_rate": 4.040404040404041e-06, "loss": 333.9852, "step": 1000 }, { "epoch": 0.00816102263269742, "grad_norm": 2160.320068359375, "learning_rate": 4.080808080808081e-06, "loss": 320.1485, "step": 1010 }, { "epoch": 0.008241824836981554, "grad_norm": 1784.5238037109375, "learning_rate": 4.1212121212121215e-06, "loss": 388.0309, "step": 1020 }, { "epoch": 0.008322627041265686, "grad_norm": 11426.04296875, "learning_rate": 4.161616161616161e-06, "loss": 342.7975, "step": 1030 }, { "epoch": 0.008403429245549819, "grad_norm": 1648.6806640625, "learning_rate": 4.2020202020202026e-06, "loss": 377.397, "step": 1040 }, { "epoch": 0.008484231449833951, "grad_norm": 929.9481811523438, "learning_rate": 4.242424242424243e-06, "loss": 316.3978, "step": 1050 }, { "epoch": 0.008565033654118084, "grad_norm": 1129.7996826171875, "learning_rate": 4.282828282828283e-06, "loss": 299.9879, "step": 1060 }, { "epoch": 0.008645835858402217, "grad_norm": 1400.36376953125, "learning_rate": 4.323232323232323e-06, "loss": 350.1171, "step": 1070 }, { "epoch": 0.00872663806268635, "grad_norm": 1833.9061279296875, "learning_rate": 4.363636363636364e-06, "loss": 325.1123, "step": 1080 }, { "epoch": 0.008807440266970484, "grad_norm": 1330.033203125, "learning_rate": 4.4040404040404044e-06, "loss": 325.3823, "step": 1090 }, { "epoch": 0.008888242471254616, "grad_norm": 910.3088989257812, "learning_rate": 4.444444444444445e-06, "loss": 301.4958, "step": 1100 }, { "epoch": 0.008969044675538749, "grad_norm": 1182.816162109375, "learning_rate": 4.484848484848485e-06, "loss": 277.6404, "step": 1110 }, { "epoch": 0.009049846879822881, "grad_norm": 4707.03857421875, "learning_rate": 4.525252525252525e-06, "loss": 424.5438, "step": 1120 }, { "epoch": 0.009130649084107014, "grad_norm": 2059.0185546875, "learning_rate": 4.565656565656566e-06, "loss": 324.0304, "step": 1130 }, { "epoch": 0.009211451288391147, "grad_norm": 3679.1044921875, "learning_rate": 4.606060606060606e-06, "loss": 303.365, "step": 1140 }, { "epoch": 0.00929225349267528, "grad_norm": 2518.44970703125, "learning_rate": 4.646464646464647e-06, "loss": 257.8196, "step": 1150 }, { "epoch": 0.009373055696959414, "grad_norm": 1017.4381713867188, "learning_rate": 4.6868686868686874e-06, "loss": 308.7409, "step": 1160 }, { "epoch": 0.009453857901243546, "grad_norm": 1016.4625244140625, "learning_rate": 4.727272727272727e-06, "loss": 309.7148, "step": 1170 }, { "epoch": 0.009534660105527679, "grad_norm": 2000.1339111328125, "learning_rate": 4.767676767676768e-06, "loss": 416.5233, "step": 1180 }, { "epoch": 0.009615462309811812, "grad_norm": 1349.24755859375, "learning_rate": 4.808080808080808e-06, "loss": 312.0022, "step": 1190 }, { "epoch": 0.009696264514095944, "grad_norm": 605.5498046875, "learning_rate": 4.848484848484849e-06, "loss": 373.616, "step": 1200 }, { "epoch": 0.009777066718380077, "grad_norm": 1045.7061767578125, "learning_rate": 4.888888888888889e-06, "loss": 263.9121, "step": 1210 }, { "epoch": 0.00985786892266421, "grad_norm": 997.4849243164062, "learning_rate": 4.929292929292929e-06, "loss": 299.2633, "step": 1220 }, { "epoch": 0.009938671126948344, "grad_norm": 690.6622314453125, "learning_rate": 4.96969696969697e-06, "loss": 216.8853, "step": 1230 }, { "epoch": 0.010019473331232476, "grad_norm": 1591.2972412109375, "learning_rate": 5.010101010101011e-06, "loss": 339.3444, "step": 1240 }, { "epoch": 0.010100275535516609, "grad_norm": 749.3450317382812, "learning_rate": 5.050505050505051e-06, "loss": 315.4824, "step": 1250 }, { "epoch": 0.010181077739800742, "grad_norm": 923.2822265625, "learning_rate": 5.090909090909091e-06, "loss": 265.8641, "step": 1260 }, { "epoch": 0.010261879944084874, "grad_norm": 1161.5048828125, "learning_rate": 5.131313131313131e-06, "loss": 255.6592, "step": 1270 }, { "epoch": 0.010342682148369007, "grad_norm": 949.8252563476562, "learning_rate": 5.171717171717172e-06, "loss": 350.1398, "step": 1280 }, { "epoch": 0.01042348435265314, "grad_norm": 1018.5875854492188, "learning_rate": 5.212121212121213e-06, "loss": 313.8918, "step": 1290 }, { "epoch": 0.010504286556937274, "grad_norm": 3170.640869140625, "learning_rate": 5.2525252525252526e-06, "loss": 305.4674, "step": 1300 }, { "epoch": 0.010585088761221407, "grad_norm": 1600.813720703125, "learning_rate": 5.292929292929293e-06, "loss": 331.4024, "step": 1310 }, { "epoch": 0.01066589096550554, "grad_norm": 1063.93408203125, "learning_rate": 5.333333333333334e-06, "loss": 373.562, "step": 1320 }, { "epoch": 0.010746693169789672, "grad_norm": 665.6146240234375, "learning_rate": 5.373737373737374e-06, "loss": 235.0867, "step": 1330 }, { "epoch": 0.010827495374073804, "grad_norm": 1380.6151123046875, "learning_rate": 5.414141414141415e-06, "loss": 324.2148, "step": 1340 }, { "epoch": 0.010908297578357937, "grad_norm": 3627.843994140625, "learning_rate": 5.4545454545454545e-06, "loss": 364.2515, "step": 1350 }, { "epoch": 0.01098909978264207, "grad_norm": 850.1503295898438, "learning_rate": 5.494949494949495e-06, "loss": 408.481, "step": 1360 }, { "epoch": 0.011069901986926204, "grad_norm": 2091.482421875, "learning_rate": 5.5353535353535355e-06, "loss": 267.4172, "step": 1370 }, { "epoch": 0.011150704191210337, "grad_norm": 1496.604248046875, "learning_rate": 5.575757575757576e-06, "loss": 295.7435, "step": 1380 }, { "epoch": 0.01123150639549447, "grad_norm": 2673.033203125, "learning_rate": 5.616161616161617e-06, "loss": 251.5225, "step": 1390 }, { "epoch": 0.011312308599778602, "grad_norm": 1121.48779296875, "learning_rate": 5.656565656565657e-06, "loss": 318.3079, "step": 1400 }, { "epoch": 0.011393110804062734, "grad_norm": 866.069091796875, "learning_rate": 5.696969696969697e-06, "loss": 283.861, "step": 1410 }, { "epoch": 0.011473913008346867, "grad_norm": 1132.9764404296875, "learning_rate": 5.7373737373737374e-06, "loss": 312.9168, "step": 1420 }, { "epoch": 0.011554715212631, "grad_norm": 1029.2037353515625, "learning_rate": 5.777777777777778e-06, "loss": 307.1583, "step": 1430 }, { "epoch": 0.011635517416915134, "grad_norm": 1138.4461669921875, "learning_rate": 5.8181818181818185e-06, "loss": 266.8528, "step": 1440 }, { "epoch": 0.011716319621199267, "grad_norm": 1278.30224609375, "learning_rate": 5.858585858585859e-06, "loss": 339.6735, "step": 1450 }, { "epoch": 0.0117971218254834, "grad_norm": 908.4046630859375, "learning_rate": 5.898989898989899e-06, "loss": 331.0705, "step": 1460 }, { "epoch": 0.011877924029767532, "grad_norm": 1165.220947265625, "learning_rate": 5.93939393939394e-06, "loss": 267.2519, "step": 1470 }, { "epoch": 0.011958726234051665, "grad_norm": 3219.732177734375, "learning_rate": 5.979797979797981e-06, "loss": 485.5872, "step": 1480 }, { "epoch": 0.012039528438335797, "grad_norm": 1050.6778564453125, "learning_rate": 6.0202020202020204e-06, "loss": 325.8213, "step": 1490 }, { "epoch": 0.01212033064261993, "grad_norm": 1031.156005859375, "learning_rate": 6.060606060606061e-06, "loss": 302.1713, "step": 1500 }, { "epoch": 0.012201132846904064, "grad_norm": 1457.485107421875, "learning_rate": 6.101010101010101e-06, "loss": 267.4604, "step": 1510 }, { "epoch": 0.012281935051188197, "grad_norm": 1509.94091796875, "learning_rate": 6.141414141414142e-06, "loss": 368.4667, "step": 1520 }, { "epoch": 0.01236273725547233, "grad_norm": 1406.15673828125, "learning_rate": 6.181818181818183e-06, "loss": 349.9011, "step": 1530 }, { "epoch": 0.012443539459756462, "grad_norm": 1459.2613525390625, "learning_rate": 6.222222222222222e-06, "loss": 325.6512, "step": 1540 }, { "epoch": 0.012524341664040595, "grad_norm": 3242.78271484375, "learning_rate": 6.262626262626263e-06, "loss": 375.0032, "step": 1550 }, { "epoch": 0.012605143868324727, "grad_norm": 2443.515625, "learning_rate": 6.303030303030303e-06, "loss": 379.789, "step": 1560 }, { "epoch": 0.01268594607260886, "grad_norm": 1770.516845703125, "learning_rate": 6.343434343434344e-06, "loss": 251.652, "step": 1570 }, { "epoch": 0.012766748276892994, "grad_norm": 1482.888671875, "learning_rate": 6.383838383838384e-06, "loss": 324.2996, "step": 1580 }, { "epoch": 0.012847550481177127, "grad_norm": 1663.3935546875, "learning_rate": 6.424242424242424e-06, "loss": 301.8472, "step": 1590 }, { "epoch": 0.01292835268546126, "grad_norm": 23524.42578125, "learning_rate": 6.464646464646465e-06, "loss": 335.0892, "step": 1600 }, { "epoch": 0.013009154889745392, "grad_norm": 862.7949829101562, "learning_rate": 6.505050505050505e-06, "loss": 306.2031, "step": 1610 }, { "epoch": 0.013089957094029525, "grad_norm": 2391.976806640625, "learning_rate": 6.545454545454547e-06, "loss": 310.1344, "step": 1620 }, { "epoch": 0.013170759298313657, "grad_norm": 1260.0029296875, "learning_rate": 6.5858585858585856e-06, "loss": 372.1161, "step": 1630 }, { "epoch": 0.01325156150259779, "grad_norm": 1587.1514892578125, "learning_rate": 6.626262626262626e-06, "loss": 351.7325, "step": 1640 }, { "epoch": 0.013332363706881924, "grad_norm": 1152.1556396484375, "learning_rate": 6.666666666666667e-06, "loss": 373.3706, "step": 1650 }, { "epoch": 0.013413165911166057, "grad_norm": 1005.15771484375, "learning_rate": 6.707070707070707e-06, "loss": 317.8097, "step": 1660 }, { "epoch": 0.01349396811545019, "grad_norm": 1090.3779296875, "learning_rate": 6.747474747474749e-06, "loss": 315.5722, "step": 1670 }, { "epoch": 0.013574770319734322, "grad_norm": 1011.2723388671875, "learning_rate": 6.787878787878789e-06, "loss": 231.6832, "step": 1680 }, { "epoch": 0.013655572524018455, "grad_norm": 798.0405883789062, "learning_rate": 6.828282828282828e-06, "loss": 306.8582, "step": 1690 }, { "epoch": 0.013736374728302587, "grad_norm": 855.2308959960938, "learning_rate": 6.8686868686868685e-06, "loss": 282.2944, "step": 1700 }, { "epoch": 0.01381717693258672, "grad_norm": 1325.1092529296875, "learning_rate": 6.909090909090909e-06, "loss": 427.1749, "step": 1710 }, { "epoch": 0.013897979136870854, "grad_norm": 1027.2860107421875, "learning_rate": 6.9494949494949505e-06, "loss": 232.57, "step": 1720 }, { "epoch": 0.013978781341154987, "grad_norm": 1047.0118408203125, "learning_rate": 6.989898989898991e-06, "loss": 272.4593, "step": 1730 }, { "epoch": 0.01405958354543912, "grad_norm": 1453.4931640625, "learning_rate": 7.03030303030303e-06, "loss": 287.978, "step": 1740 }, { "epoch": 0.014140385749723252, "grad_norm": 1631.84521484375, "learning_rate": 7.0707070707070704e-06, "loss": 222.9585, "step": 1750 }, { "epoch": 0.014221187954007385, "grad_norm": 1109.9012451171875, "learning_rate": 7.111111111111112e-06, "loss": 263.4153, "step": 1760 }, { "epoch": 0.014301990158291518, "grad_norm": 1374.5731201171875, "learning_rate": 7.151515151515152e-06, "loss": 224.7149, "step": 1770 }, { "epoch": 0.01438279236257565, "grad_norm": 700.2552490234375, "learning_rate": 7.191919191919193e-06, "loss": 243.2667, "step": 1780 }, { "epoch": 0.014463594566859785, "grad_norm": 689.7608032226562, "learning_rate": 7.232323232323232e-06, "loss": 209.3831, "step": 1790 }, { "epoch": 0.014544396771143917, "grad_norm": 1072.593994140625, "learning_rate": 7.272727272727272e-06, "loss": 233.4359, "step": 1800 }, { "epoch": 0.01462519897542805, "grad_norm": 783.2555541992188, "learning_rate": 7.313131313131314e-06, "loss": 259.1145, "step": 1810 }, { "epoch": 0.014706001179712182, "grad_norm": 1364.049560546875, "learning_rate": 7.353535353535354e-06, "loss": 244.7944, "step": 1820 }, { "epoch": 0.014786803383996315, "grad_norm": 1947.3690185546875, "learning_rate": 7.393939393939395e-06, "loss": 276.0544, "step": 1830 }, { "epoch": 0.014867605588280448, "grad_norm": 1572.007568359375, "learning_rate": 7.434343434343435e-06, "loss": 310.638, "step": 1840 }, { "epoch": 0.01494840779256458, "grad_norm": 1419.4361572265625, "learning_rate": 7.474747474747475e-06, "loss": 409.634, "step": 1850 }, { "epoch": 0.015029209996848715, "grad_norm": 1355.0137939453125, "learning_rate": 7.515151515151516e-06, "loss": 253.9533, "step": 1860 }, { "epoch": 0.015110012201132847, "grad_norm": 768.96923828125, "learning_rate": 7.555555555555556e-06, "loss": 304.4757, "step": 1870 }, { "epoch": 0.01519081440541698, "grad_norm": 959.4989624023438, "learning_rate": 7.595959595959597e-06, "loss": 345.032, "step": 1880 }, { "epoch": 0.015271616609701113, "grad_norm": 1339.84228515625, "learning_rate": 7.636363636363638e-06, "loss": 296.0798, "step": 1890 }, { "epoch": 0.015352418813985245, "grad_norm": 1036.2491455078125, "learning_rate": 7.676767676767677e-06, "loss": 300.197, "step": 1900 }, { "epoch": 0.015433221018269378, "grad_norm": 707.27587890625, "learning_rate": 7.717171717171717e-06, "loss": 277.1603, "step": 1910 }, { "epoch": 0.015514023222553512, "grad_norm": 1203.453857421875, "learning_rate": 7.757575757575758e-06, "loss": 303.6785, "step": 1920 }, { "epoch": 0.015594825426837645, "grad_norm": 1172.7025146484375, "learning_rate": 7.797979797979799e-06, "loss": 246.9346, "step": 1930 }, { "epoch": 0.015675627631121777, "grad_norm": 1082.4605712890625, "learning_rate": 7.838383838383839e-06, "loss": 296.1259, "step": 1940 }, { "epoch": 0.01575642983540591, "grad_norm": 1456.42529296875, "learning_rate": 7.878787878787878e-06, "loss": 269.7506, "step": 1950 }, { "epoch": 0.015837232039690043, "grad_norm": 1794.72119140625, "learning_rate": 7.919191919191919e-06, "loss": 246.411, "step": 1960 }, { "epoch": 0.015918034243974175, "grad_norm": 3157.114990234375, "learning_rate": 7.959595959595959e-06, "loss": 256.127, "step": 1970 }, { "epoch": 0.015998836448258308, "grad_norm": 1361.6929931640625, "learning_rate": 8.000000000000001e-06, "loss": 308.1984, "step": 1980 }, { "epoch": 0.01607963865254244, "grad_norm": 1006.965087890625, "learning_rate": 8.040404040404042e-06, "loss": 341.8693, "step": 1990 }, { "epoch": 0.016160440856826573, "grad_norm": 1541.38720703125, "learning_rate": 8.080808080808082e-06, "loss": 274.4633, "step": 2000 }, { "epoch": 0.016241243061110706, "grad_norm": 1735.104248046875, "learning_rate": 8.121212121212121e-06, "loss": 258.4104, "step": 2010 }, { "epoch": 0.01632204526539484, "grad_norm": 2176.154052734375, "learning_rate": 8.161616161616162e-06, "loss": 314.5508, "step": 2020 }, { "epoch": 0.016402847469678974, "grad_norm": 1052.533447265625, "learning_rate": 8.202020202020202e-06, "loss": 312.927, "step": 2030 }, { "epoch": 0.016483649673963107, "grad_norm": 1208.69189453125, "learning_rate": 8.242424242424243e-06, "loss": 254.359, "step": 2040 }, { "epoch": 0.01656445187824724, "grad_norm": 1978.5025634765625, "learning_rate": 8.282828282828283e-06, "loss": 299.8744, "step": 2050 }, { "epoch": 0.016645254082531372, "grad_norm": 3605.6494140625, "learning_rate": 8.323232323232322e-06, "loss": 298.715, "step": 2060 }, { "epoch": 0.016726056286815505, "grad_norm": 1599.973876953125, "learning_rate": 8.363636363636365e-06, "loss": 258.9758, "step": 2070 }, { "epoch": 0.016806858491099638, "grad_norm": 1183.451904296875, "learning_rate": 8.404040404040405e-06, "loss": 352.8176, "step": 2080 }, { "epoch": 0.01688766069538377, "grad_norm": 1582.7120361328125, "learning_rate": 8.444444444444446e-06, "loss": 309.3484, "step": 2090 }, { "epoch": 0.016968462899667903, "grad_norm": 932.9716796875, "learning_rate": 8.484848484848486e-06, "loss": 255.265, "step": 2100 }, { "epoch": 0.017049265103952035, "grad_norm": 922.5059814453125, "learning_rate": 8.525252525252525e-06, "loss": 253.6583, "step": 2110 }, { "epoch": 0.017130067308236168, "grad_norm": 1196.361083984375, "learning_rate": 8.565656565656566e-06, "loss": 323.4844, "step": 2120 }, { "epoch": 0.0172108695125203, "grad_norm": 1005.9546508789062, "learning_rate": 8.606060606060606e-06, "loss": 351.4896, "step": 2130 }, { "epoch": 0.017291671716804433, "grad_norm": 1585.8636474609375, "learning_rate": 8.646464646464647e-06, "loss": 325.0701, "step": 2140 }, { "epoch": 0.017372473921088566, "grad_norm": 3758.6982421875, "learning_rate": 8.686868686868687e-06, "loss": 203.9509, "step": 2150 }, { "epoch": 0.0174532761253727, "grad_norm": 1602.480224609375, "learning_rate": 8.727272727272728e-06, "loss": 294.8089, "step": 2160 }, { "epoch": 0.017534078329656835, "grad_norm": 1571.1812744140625, "learning_rate": 8.767676767676768e-06, "loss": 220.656, "step": 2170 }, { "epoch": 0.017614880533940967, "grad_norm": 1073.2261962890625, "learning_rate": 8.808080808080809e-06, "loss": 221.6837, "step": 2180 }, { "epoch": 0.0176956827382251, "grad_norm": 1125.1983642578125, "learning_rate": 8.84848484848485e-06, "loss": 286.0061, "step": 2190 }, { "epoch": 0.017776484942509233, "grad_norm": 1008.795654296875, "learning_rate": 8.88888888888889e-06, "loss": 282.5281, "step": 2200 }, { "epoch": 0.017857287146793365, "grad_norm": 1810.7894287109375, "learning_rate": 8.92929292929293e-06, "loss": 226.4816, "step": 2210 }, { "epoch": 0.017938089351077498, "grad_norm": 894.2589721679688, "learning_rate": 8.96969696969697e-06, "loss": 251.6401, "step": 2220 }, { "epoch": 0.01801889155536163, "grad_norm": 1232.9827880859375, "learning_rate": 9.01010101010101e-06, "loss": 248.7544, "step": 2230 }, { "epoch": 0.018099693759645763, "grad_norm": 1993.9267578125, "learning_rate": 9.05050505050505e-06, "loss": 256.8296, "step": 2240 }, { "epoch": 0.018180495963929896, "grad_norm": 967.433837890625, "learning_rate": 9.090909090909091e-06, "loss": 245.6321, "step": 2250 }, { "epoch": 0.018261298168214028, "grad_norm": 2560.1728515625, "learning_rate": 9.131313131313132e-06, "loss": 234.9478, "step": 2260 }, { "epoch": 0.01834210037249816, "grad_norm": 590.1747436523438, "learning_rate": 9.171717171717172e-06, "loss": 236.6796, "step": 2270 }, { "epoch": 0.018422902576782293, "grad_norm": 1504.942626953125, "learning_rate": 9.212121212121213e-06, "loss": 367.3362, "step": 2280 }, { "epoch": 0.018503704781066426, "grad_norm": 977.7069091796875, "learning_rate": 9.252525252525253e-06, "loss": 276.6067, "step": 2290 }, { "epoch": 0.01858450698535056, "grad_norm": 1384.83203125, "learning_rate": 9.292929292929294e-06, "loss": 340.7896, "step": 2300 }, { "epoch": 0.018665309189634695, "grad_norm": 1302.6343994140625, "learning_rate": 9.333333333333334e-06, "loss": 198.1527, "step": 2310 }, { "epoch": 0.018746111393918827, "grad_norm": 836.4732666015625, "learning_rate": 9.373737373737375e-06, "loss": 209.2565, "step": 2320 }, { "epoch": 0.01882691359820296, "grad_norm": 1400.0604248046875, "learning_rate": 9.414141414141414e-06, "loss": 305.2968, "step": 2330 }, { "epoch": 0.018907715802487093, "grad_norm": 1454.78125, "learning_rate": 9.454545454545454e-06, "loss": 260.4474, "step": 2340 }, { "epoch": 0.018988518006771225, "grad_norm": 4679.6923828125, "learning_rate": 9.494949494949495e-06, "loss": 247.9314, "step": 2350 }, { "epoch": 0.019069320211055358, "grad_norm": 6640.1201171875, "learning_rate": 9.535353535353535e-06, "loss": 259.6065, "step": 2360 }, { "epoch": 0.01915012241533949, "grad_norm": 1564.781982421875, "learning_rate": 9.575757575757578e-06, "loss": 245.3016, "step": 2370 }, { "epoch": 0.019230924619623623, "grad_norm": 2078.74267578125, "learning_rate": 9.616161616161616e-06, "loss": 284.7133, "step": 2380 }, { "epoch": 0.019311726823907756, "grad_norm": 1441.0360107421875, "learning_rate": 9.656565656565657e-06, "loss": 270.4787, "step": 2390 }, { "epoch": 0.01939252902819189, "grad_norm": 744.0514526367188, "learning_rate": 9.696969696969698e-06, "loss": 200.2359, "step": 2400 }, { "epoch": 0.01947333123247602, "grad_norm": 1168.913818359375, "learning_rate": 9.737373737373738e-06, "loss": 269.4149, "step": 2410 }, { "epoch": 0.019554133436760154, "grad_norm": 3848.146484375, "learning_rate": 9.777777777777779e-06, "loss": 250.6479, "step": 2420 }, { "epoch": 0.019634935641044286, "grad_norm": 798.60595703125, "learning_rate": 9.818181818181818e-06, "loss": 245.7341, "step": 2430 }, { "epoch": 0.01971573784532842, "grad_norm": 2139.72265625, "learning_rate": 9.858585858585858e-06, "loss": 275.653, "step": 2440 }, { "epoch": 0.019796540049612555, "grad_norm": 1223.6392822265625, "learning_rate": 9.898989898989899e-06, "loss": 271.2685, "step": 2450 }, { "epoch": 0.019877342253896688, "grad_norm": 1007.25439453125, "learning_rate": 9.93939393939394e-06, "loss": 235.147, "step": 2460 }, { "epoch": 0.01995814445818082, "grad_norm": 1195.012939453125, "learning_rate": 9.979797979797981e-06, "loss": 302.4475, "step": 2470 }, { "epoch": 0.020038946662464953, "grad_norm": 1530.9473876953125, "learning_rate": 1.0020202020202022e-05, "loss": 259.1597, "step": 2480 }, { "epoch": 0.020119748866749085, "grad_norm": 1403.546142578125, "learning_rate": 1.006060606060606e-05, "loss": 348.2433, "step": 2490 }, { "epoch": 0.020200551071033218, "grad_norm": 1328.4873046875, "learning_rate": 1.0101010101010101e-05, "loss": 289.0594, "step": 2500 }, { "epoch": 0.02028135327531735, "grad_norm": 1171.5048828125, "learning_rate": 1.0141414141414142e-05, "loss": 197.3946, "step": 2510 }, { "epoch": 0.020362155479601483, "grad_norm": 1274.6544189453125, "learning_rate": 1.0181818181818182e-05, "loss": 295.2945, "step": 2520 }, { "epoch": 0.020442957683885616, "grad_norm": 2023.71337890625, "learning_rate": 1.0222222222222223e-05, "loss": 271.9707, "step": 2530 }, { "epoch": 0.02052375988816975, "grad_norm": 1765.538818359375, "learning_rate": 1.0262626262626262e-05, "loss": 246.6141, "step": 2540 }, { "epoch": 0.02060456209245388, "grad_norm": 859.3914794921875, "learning_rate": 1.0303030303030304e-05, "loss": 276.1375, "step": 2550 }, { "epoch": 0.020685364296738014, "grad_norm": 1024.8955078125, "learning_rate": 1.0343434343434345e-05, "loss": 239.6529, "step": 2560 }, { "epoch": 0.020766166501022146, "grad_norm": 1012.91455078125, "learning_rate": 1.0383838383838385e-05, "loss": 199.6656, "step": 2570 }, { "epoch": 0.02084696870530628, "grad_norm": 1371.8551025390625, "learning_rate": 1.0424242424242426e-05, "loss": 247.0821, "step": 2580 }, { "epoch": 0.020927770909590415, "grad_norm": 1338.343017578125, "learning_rate": 1.0464646464646465e-05, "loss": 260.7634, "step": 2590 }, { "epoch": 0.021008573113874548, "grad_norm": 917.8023071289062, "learning_rate": 1.0505050505050505e-05, "loss": 255.2869, "step": 2600 }, { "epoch": 0.02108937531815868, "grad_norm": 1167.3427734375, "learning_rate": 1.0545454545454546e-05, "loss": 255.108, "step": 2610 }, { "epoch": 0.021170177522442813, "grad_norm": 1662.1556396484375, "learning_rate": 1.0585858585858586e-05, "loss": 304.7596, "step": 2620 }, { "epoch": 0.021250979726726946, "grad_norm": 1393.7713623046875, "learning_rate": 1.0626262626262627e-05, "loss": 298.9948, "step": 2630 }, { "epoch": 0.02133178193101108, "grad_norm": 4169.89306640625, "learning_rate": 1.0666666666666667e-05, "loss": 256.329, "step": 2640 }, { "epoch": 0.02141258413529521, "grad_norm": 961.8526000976562, "learning_rate": 1.0707070707070708e-05, "loss": 268.2124, "step": 2650 }, { "epoch": 0.021493386339579344, "grad_norm": 2141.90869140625, "learning_rate": 1.0747474747474748e-05, "loss": 279.6773, "step": 2660 }, { "epoch": 0.021574188543863476, "grad_norm": 1454.77392578125, "learning_rate": 1.0787878787878789e-05, "loss": 250.6374, "step": 2670 }, { "epoch": 0.02165499074814761, "grad_norm": 1119.782958984375, "learning_rate": 1.082828282828283e-05, "loss": 299.24, "step": 2680 }, { "epoch": 0.02173579295243174, "grad_norm": 1507.88916015625, "learning_rate": 1.086868686868687e-05, "loss": 304.945, "step": 2690 }, { "epoch": 0.021816595156715874, "grad_norm": 1235.4326171875, "learning_rate": 1.0909090909090909e-05, "loss": 329.6972, "step": 2700 }, { "epoch": 0.021897397361000007, "grad_norm": 1516.6436767578125, "learning_rate": 1.094949494949495e-05, "loss": 276.193, "step": 2710 }, { "epoch": 0.02197819956528414, "grad_norm": 1332.3309326171875, "learning_rate": 1.098989898989899e-05, "loss": 309.7016, "step": 2720 }, { "epoch": 0.022059001769568275, "grad_norm": 1349.0360107421875, "learning_rate": 1.103030303030303e-05, "loss": 423.0254, "step": 2730 }, { "epoch": 0.022139803973852408, "grad_norm": 2262.348876953125, "learning_rate": 1.1070707070707071e-05, "loss": 260.9447, "step": 2740 }, { "epoch": 0.02222060617813654, "grad_norm": 1374.3009033203125, "learning_rate": 1.1111111111111112e-05, "loss": 283.2908, "step": 2750 }, { "epoch": 0.022301408382420673, "grad_norm": 768.2625732421875, "learning_rate": 1.1151515151515152e-05, "loss": 243.0171, "step": 2760 }, { "epoch": 0.022382210586704806, "grad_norm": 4175.06396484375, "learning_rate": 1.1191919191919193e-05, "loss": 236.5765, "step": 2770 }, { "epoch": 0.02246301279098894, "grad_norm": 1220.933837890625, "learning_rate": 1.1232323232323233e-05, "loss": 254.5181, "step": 2780 }, { "epoch": 0.02254381499527307, "grad_norm": 1589.7581787109375, "learning_rate": 1.1272727272727274e-05, "loss": 183.8809, "step": 2790 }, { "epoch": 0.022624617199557204, "grad_norm": 1031.3692626953125, "learning_rate": 1.1313131313131314e-05, "loss": 246.612, "step": 2800 }, { "epoch": 0.022705419403841336, "grad_norm": 1396.37744140625, "learning_rate": 1.1353535353535353e-05, "loss": 224.2803, "step": 2810 }, { "epoch": 0.02278622160812547, "grad_norm": 1879.5634765625, "learning_rate": 1.1393939393939394e-05, "loss": 287.104, "step": 2820 }, { "epoch": 0.0228670238124096, "grad_norm": 1376.0625, "learning_rate": 1.1434343434343434e-05, "loss": 284.0272, "step": 2830 }, { "epoch": 0.022947826016693734, "grad_norm": 682.961181640625, "learning_rate": 1.1474747474747475e-05, "loss": 210.3876, "step": 2840 }, { "epoch": 0.023028628220977867, "grad_norm": 1677.2169189453125, "learning_rate": 1.1515151515151517e-05, "loss": 326.8289, "step": 2850 }, { "epoch": 0.023109430425262, "grad_norm": 733.2987060546875, "learning_rate": 1.1555555555555556e-05, "loss": 230.43, "step": 2860 }, { "epoch": 0.023190232629546136, "grad_norm": 630.212890625, "learning_rate": 1.1595959595959597e-05, "loss": 205.1732, "step": 2870 }, { "epoch": 0.023271034833830268, "grad_norm": 1535.36572265625, "learning_rate": 1.1636363636363637e-05, "loss": 304.9371, "step": 2880 }, { "epoch": 0.0233518370381144, "grad_norm": 1065.3255615234375, "learning_rate": 1.1676767676767678e-05, "loss": 219.3441, "step": 2890 }, { "epoch": 0.023432639242398533, "grad_norm": 2319.004638671875, "learning_rate": 1.1717171717171718e-05, "loss": 274.9734, "step": 2900 }, { "epoch": 0.023513441446682666, "grad_norm": 1362.914794921875, "learning_rate": 1.1757575757575757e-05, "loss": 214.0071, "step": 2910 }, { "epoch": 0.0235942436509668, "grad_norm": 1748.42333984375, "learning_rate": 1.1797979797979798e-05, "loss": 247.8715, "step": 2920 }, { "epoch": 0.02367504585525093, "grad_norm": 1357.4864501953125, "learning_rate": 1.1838383838383838e-05, "loss": 367.5896, "step": 2930 }, { "epoch": 0.023755848059535064, "grad_norm": 1028.1129150390625, "learning_rate": 1.187878787878788e-05, "loss": 248.4116, "step": 2940 }, { "epoch": 0.023836650263819197, "grad_norm": 1497.2218017578125, "learning_rate": 1.1919191919191921e-05, "loss": 309.2206, "step": 2950 }, { "epoch": 0.02391745246810333, "grad_norm": 830.5894775390625, "learning_rate": 1.1959595959595961e-05, "loss": 203.645, "step": 2960 }, { "epoch": 0.023998254672387462, "grad_norm": 1067.825439453125, "learning_rate": 1.2e-05, "loss": 226.1753, "step": 2970 }, { "epoch": 0.024079056876671594, "grad_norm": 1088.949462890625, "learning_rate": 1.2040404040404041e-05, "loss": 235.4068, "step": 2980 }, { "epoch": 0.024159859080955727, "grad_norm": 1378.7232666015625, "learning_rate": 1.2080808080808081e-05, "loss": 258.9663, "step": 2990 }, { "epoch": 0.02424066128523986, "grad_norm": 1665.1182861328125, "learning_rate": 1.2121212121212122e-05, "loss": 178.7374, "step": 3000 }, { "epoch": 0.024321463489523996, "grad_norm": 1254.1650390625, "learning_rate": 1.2161616161616162e-05, "loss": 196.0946, "step": 3010 }, { "epoch": 0.02440226569380813, "grad_norm": 1680.0679931640625, "learning_rate": 1.2202020202020201e-05, "loss": 232.687, "step": 3020 }, { "epoch": 0.02448306789809226, "grad_norm": 1063.8585205078125, "learning_rate": 1.2242424242424242e-05, "loss": 247.0947, "step": 3030 }, { "epoch": 0.024563870102376394, "grad_norm": 3708.090087890625, "learning_rate": 1.2282828282828284e-05, "loss": 252.8591, "step": 3040 }, { "epoch": 0.024644672306660526, "grad_norm": 1465.7147216796875, "learning_rate": 1.2323232323232325e-05, "loss": 262.5838, "step": 3050 }, { "epoch": 0.02472547451094466, "grad_norm": 1327.937255859375, "learning_rate": 1.2363636363636365e-05, "loss": 363.5593, "step": 3060 }, { "epoch": 0.02480627671522879, "grad_norm": 1344.0130615234375, "learning_rate": 1.2404040404040404e-05, "loss": 257.7762, "step": 3070 }, { "epoch": 0.024887078919512924, "grad_norm": 1054.320068359375, "learning_rate": 1.2444444444444445e-05, "loss": 247.01, "step": 3080 }, { "epoch": 0.024967881123797057, "grad_norm": 1068.103515625, "learning_rate": 1.2484848484848485e-05, "loss": 248.3725, "step": 3090 }, { "epoch": 0.02504868332808119, "grad_norm": 1697.66259765625, "learning_rate": 1.2525252525252526e-05, "loss": 258.5855, "step": 3100 }, { "epoch": 0.025129485532365322, "grad_norm": 2154.507080078125, "learning_rate": 1.2565656565656566e-05, "loss": 324.4505, "step": 3110 }, { "epoch": 0.025210287736649455, "grad_norm": 1405.111083984375, "learning_rate": 1.2606060606060607e-05, "loss": 201.9885, "step": 3120 }, { "epoch": 0.025291089940933587, "grad_norm": 1943.2344970703125, "learning_rate": 1.2646464646464647e-05, "loss": 279.3126, "step": 3130 }, { "epoch": 0.02537189214521772, "grad_norm": 1883.3538818359375, "learning_rate": 1.2686868686868688e-05, "loss": 305.4725, "step": 3140 }, { "epoch": 0.025452694349501856, "grad_norm": 2943.758544921875, "learning_rate": 1.2727272727272727e-05, "loss": 282.1084, "step": 3150 }, { "epoch": 0.02553349655378599, "grad_norm": 1242.160400390625, "learning_rate": 1.2767676767676767e-05, "loss": 220.8162, "step": 3160 }, { "epoch": 0.02561429875807012, "grad_norm": 2627.211181640625, "learning_rate": 1.2808080808080808e-05, "loss": 303.4214, "step": 3170 }, { "epoch": 0.025695100962354254, "grad_norm": 1310.1988525390625, "learning_rate": 1.2848484848484848e-05, "loss": 259.6753, "step": 3180 }, { "epoch": 0.025775903166638386, "grad_norm": 1910.4666748046875, "learning_rate": 1.2888888888888889e-05, "loss": 273.7454, "step": 3190 }, { "epoch": 0.02585670537092252, "grad_norm": 740.5687255859375, "learning_rate": 1.292929292929293e-05, "loss": 292.6491, "step": 3200 }, { "epoch": 0.02593750757520665, "grad_norm": 1202.99462890625, "learning_rate": 1.296969696969697e-05, "loss": 243.2686, "step": 3210 }, { "epoch": 0.026018309779490784, "grad_norm": 2174.3525390625, "learning_rate": 1.301010101010101e-05, "loss": 276.3461, "step": 3220 }, { "epoch": 0.026099111983774917, "grad_norm": 1177.9141845703125, "learning_rate": 1.3050505050505051e-05, "loss": 246.7616, "step": 3230 }, { "epoch": 0.02617991418805905, "grad_norm": 1337.3155517578125, "learning_rate": 1.3090909090909093e-05, "loss": 279.5415, "step": 3240 }, { "epoch": 0.026260716392343182, "grad_norm": 970.5191040039062, "learning_rate": 1.3131313131313134e-05, "loss": 240.0207, "step": 3250 }, { "epoch": 0.026341518596627315, "grad_norm": 1119.1689453125, "learning_rate": 1.3171717171717171e-05, "loss": 242.7696, "step": 3260 }, { "epoch": 0.026422320800911447, "grad_norm": 4005.26318359375, "learning_rate": 1.3212121212121212e-05, "loss": 288.4158, "step": 3270 }, { "epoch": 0.02650312300519558, "grad_norm": 2148.187255859375, "learning_rate": 1.3252525252525252e-05, "loss": 224.5919, "step": 3280 }, { "epoch": 0.026583925209479716, "grad_norm": 1367.2222900390625, "learning_rate": 1.3292929292929293e-05, "loss": 212.8641, "step": 3290 }, { "epoch": 0.02666472741376385, "grad_norm": 1278.0506591796875, "learning_rate": 1.3333333333333333e-05, "loss": 222.3518, "step": 3300 }, { "epoch": 0.02674552961804798, "grad_norm": 1361.965087890625, "learning_rate": 1.3373737373737374e-05, "loss": 257.0285, "step": 3310 }, { "epoch": 0.026826331822332114, "grad_norm": 1330.8619384765625, "learning_rate": 1.3414141414141414e-05, "loss": 293.7213, "step": 3320 }, { "epoch": 0.026907134026616247, "grad_norm": 1107.8526611328125, "learning_rate": 1.3454545454545457e-05, "loss": 198.6048, "step": 3330 }, { "epoch": 0.02698793623090038, "grad_norm": 1048.5009765625, "learning_rate": 1.3494949494949497e-05, "loss": 197.4301, "step": 3340 }, { "epoch": 0.027068738435184512, "grad_norm": 2446.47119140625, "learning_rate": 1.3535353535353538e-05, "loss": 287.2842, "step": 3350 }, { "epoch": 0.027149540639468644, "grad_norm": 1396.544921875, "learning_rate": 1.3575757575757578e-05, "loss": 258.2231, "step": 3360 }, { "epoch": 0.027230342843752777, "grad_norm": 904.5388793945312, "learning_rate": 1.3616161616161615e-05, "loss": 376.3808, "step": 3370 }, { "epoch": 0.02731114504803691, "grad_norm": 1247.5994873046875, "learning_rate": 1.3656565656565656e-05, "loss": 318.9619, "step": 3380 }, { "epoch": 0.027391947252321042, "grad_norm": 1123.76220703125, "learning_rate": 1.3696969696969697e-05, "loss": 235.3794, "step": 3390 }, { "epoch": 0.027472749456605175, "grad_norm": 2435.860107421875, "learning_rate": 1.3737373737373737e-05, "loss": 290.7648, "step": 3400 }, { "epoch": 0.027553551660889308, "grad_norm": 1895.08642578125, "learning_rate": 1.3777777777777778e-05, "loss": 228.9499, "step": 3410 }, { "epoch": 0.02763435386517344, "grad_norm": 1057.9578857421875, "learning_rate": 1.3818181818181818e-05, "loss": 199.0401, "step": 3420 }, { "epoch": 0.027715156069457576, "grad_norm": 1296.09130859375, "learning_rate": 1.385858585858586e-05, "loss": 238.6514, "step": 3430 }, { "epoch": 0.02779595827374171, "grad_norm": 3411.5341796875, "learning_rate": 1.3898989898989901e-05, "loss": 299.6798, "step": 3440 }, { "epoch": 0.02787676047802584, "grad_norm": 1198.8946533203125, "learning_rate": 1.3939393939393942e-05, "loss": 258.3361, "step": 3450 }, { "epoch": 0.027957562682309974, "grad_norm": 839.223388671875, "learning_rate": 1.3979797979797982e-05, "loss": 222.8733, "step": 3460 }, { "epoch": 0.028038364886594107, "grad_norm": 1462.1505126953125, "learning_rate": 1.402020202020202e-05, "loss": 206.9738, "step": 3470 }, { "epoch": 0.02811916709087824, "grad_norm": 1066.7890625, "learning_rate": 1.406060606060606e-05, "loss": 190.5506, "step": 3480 }, { "epoch": 0.028199969295162372, "grad_norm": 2753.951171875, "learning_rate": 1.41010101010101e-05, "loss": 210.7065, "step": 3490 }, { "epoch": 0.028280771499446505, "grad_norm": 1700.187744140625, "learning_rate": 1.4141414141414141e-05, "loss": 239.0851, "step": 3500 }, { "epoch": 0.028361573703730637, "grad_norm": 1913.965576171875, "learning_rate": 1.4181818181818181e-05, "loss": 244.5272, "step": 3510 }, { "epoch": 0.02844237590801477, "grad_norm": 1347.6934814453125, "learning_rate": 1.4222222222222224e-05, "loss": 219.5849, "step": 3520 }, { "epoch": 0.028523178112298903, "grad_norm": 894.3698120117188, "learning_rate": 1.4262626262626264e-05, "loss": 261.3107, "step": 3530 }, { "epoch": 0.028603980316583035, "grad_norm": 649.42236328125, "learning_rate": 1.4303030303030305e-05, "loss": 202.5557, "step": 3540 }, { "epoch": 0.028684782520867168, "grad_norm": 824.8812255859375, "learning_rate": 1.4343434343434345e-05, "loss": 245.8003, "step": 3550 }, { "epoch": 0.0287655847251513, "grad_norm": 828.0931396484375, "learning_rate": 1.4383838383838386e-05, "loss": 260.2875, "step": 3560 }, { "epoch": 0.028846386929435437, "grad_norm": 1334.4947509765625, "learning_rate": 1.4424242424242426e-05, "loss": 232.7898, "step": 3570 }, { "epoch": 0.02892718913371957, "grad_norm": 1371.1171875, "learning_rate": 1.4464646464646464e-05, "loss": 418.4771, "step": 3580 }, { "epoch": 0.029007991338003702, "grad_norm": 18497.5234375, "learning_rate": 1.4505050505050504e-05, "loss": 303.3979, "step": 3590 }, { "epoch": 0.029088793542287834, "grad_norm": 1640.417724609375, "learning_rate": 1.4545454545454545e-05, "loss": 246.1203, "step": 3600 }, { "epoch": 0.029169595746571967, "grad_norm": 866.4635620117188, "learning_rate": 1.4585858585858587e-05, "loss": 227.0032, "step": 3610 }, { "epoch": 0.0292503979508561, "grad_norm": 1206.3389892578125, "learning_rate": 1.4626262626262627e-05, "loss": 240.7797, "step": 3620 }, { "epoch": 0.029331200155140232, "grad_norm": 1930.5679931640625, "learning_rate": 1.4666666666666668e-05, "loss": 244.9207, "step": 3630 }, { "epoch": 0.029412002359424365, "grad_norm": 1362.0755615234375, "learning_rate": 1.4707070707070709e-05, "loss": 223.5896, "step": 3640 }, { "epoch": 0.029492804563708497, "grad_norm": 1778.240478515625, "learning_rate": 1.4747474747474749e-05, "loss": 233.0804, "step": 3650 }, { "epoch": 0.02957360676799263, "grad_norm": 1185.7432861328125, "learning_rate": 1.478787878787879e-05, "loss": 269.5211, "step": 3660 }, { "epoch": 0.029654408972276763, "grad_norm": 1272.7274169921875, "learning_rate": 1.482828282828283e-05, "loss": 256.0854, "step": 3670 }, { "epoch": 0.029735211176560895, "grad_norm": 3724.482421875, "learning_rate": 1.486868686868687e-05, "loss": 220.0564, "step": 3680 }, { "epoch": 0.029816013380845028, "grad_norm": 1362.2408447265625, "learning_rate": 1.4909090909090908e-05, "loss": 196.9579, "step": 3690 }, { "epoch": 0.02989681558512916, "grad_norm": 1142.985107421875, "learning_rate": 1.494949494949495e-05, "loss": 298.3712, "step": 3700 }, { "epoch": 0.029977617789413297, "grad_norm": 1711.4461669921875, "learning_rate": 1.498989898989899e-05, "loss": 248.673, "step": 3710 }, { "epoch": 0.03005841999369743, "grad_norm": 1854.973876953125, "learning_rate": 1.5030303030303031e-05, "loss": 178.4528, "step": 3720 }, { "epoch": 0.030139222197981562, "grad_norm": 2415.3564453125, "learning_rate": 1.5070707070707072e-05, "loss": 279.0313, "step": 3730 }, { "epoch": 0.030220024402265695, "grad_norm": 1113.0447998046875, "learning_rate": 1.5111111111111112e-05, "loss": 263.2642, "step": 3740 }, { "epoch": 0.030300826606549827, "grad_norm": 1523.1632080078125, "learning_rate": 1.5151515151515153e-05, "loss": 292.6833, "step": 3750 }, { "epoch": 0.03038162881083396, "grad_norm": 1810.5382080078125, "learning_rate": 1.5191919191919193e-05, "loss": 260.2465, "step": 3760 }, { "epoch": 0.030462431015118092, "grad_norm": 2051.318115234375, "learning_rate": 1.5232323232323234e-05, "loss": 249.5686, "step": 3770 }, { "epoch": 0.030543233219402225, "grad_norm": 1145.482421875, "learning_rate": 1.5272727272727276e-05, "loss": 217.0, "step": 3780 }, { "epoch": 0.030624035423686358, "grad_norm": 1456.9969482421875, "learning_rate": 1.531313131313131e-05, "loss": 247.1355, "step": 3790 }, { "epoch": 0.03070483762797049, "grad_norm": 2063.9072265625, "learning_rate": 1.5353535353535354e-05, "loss": 317.8373, "step": 3800 }, { "epoch": 0.030785639832254623, "grad_norm": 1188.59130859375, "learning_rate": 1.5393939393939393e-05, "loss": 251.0659, "step": 3810 }, { "epoch": 0.030866442036538756, "grad_norm": 542.1653442382812, "learning_rate": 1.5434343434343435e-05, "loss": 205.6288, "step": 3820 }, { "epoch": 0.030947244240822888, "grad_norm": 858.66552734375, "learning_rate": 1.5474747474747474e-05, "loss": 261.1724, "step": 3830 }, { "epoch": 0.031028046445107024, "grad_norm": 1392.4208984375, "learning_rate": 1.5515151515151516e-05, "loss": 263.898, "step": 3840 }, { "epoch": 0.031108848649391157, "grad_norm": 1089.10888671875, "learning_rate": 1.5555555555555555e-05, "loss": 263.4895, "step": 3850 }, { "epoch": 0.03118965085367529, "grad_norm": 1323.1083984375, "learning_rate": 1.5595959595959597e-05, "loss": 224.5914, "step": 3860 }, { "epoch": 0.03127045305795942, "grad_norm": 748.7206420898438, "learning_rate": 1.563636363636364e-05, "loss": 185.8181, "step": 3870 }, { "epoch": 0.031351255262243555, "grad_norm": 1530.072021484375, "learning_rate": 1.5676767676767678e-05, "loss": 295.1081, "step": 3880 }, { "epoch": 0.031432057466527684, "grad_norm": 1390.1978759765625, "learning_rate": 1.571717171717172e-05, "loss": 221.7333, "step": 3890 }, { "epoch": 0.03151285967081182, "grad_norm": 1188.6934814453125, "learning_rate": 1.5757575757575756e-05, "loss": 231.4922, "step": 3900 }, { "epoch": 0.031593661875095956, "grad_norm": 1810.8616943359375, "learning_rate": 1.5797979797979798e-05, "loss": 226.2008, "step": 3910 }, { "epoch": 0.031674464079380085, "grad_norm": 1351.2021484375, "learning_rate": 1.5838383838383837e-05, "loss": 213.8082, "step": 3920 }, { "epoch": 0.03175526628366422, "grad_norm": 1504.8511962890625, "learning_rate": 1.587878787878788e-05, "loss": 237.731, "step": 3930 }, { "epoch": 0.03183606848794835, "grad_norm": 3990.205810546875, "learning_rate": 1.5919191919191918e-05, "loss": 308.5875, "step": 3940 }, { "epoch": 0.03191687069223249, "grad_norm": 1052.4140625, "learning_rate": 1.595959595959596e-05, "loss": 173.6135, "step": 3950 }, { "epoch": 0.031997672896516616, "grad_norm": 818.5986328125, "learning_rate": 1.6000000000000003e-05, "loss": 255.928, "step": 3960 }, { "epoch": 0.03207847510080075, "grad_norm": 3015.482666015625, "learning_rate": 1.604040404040404e-05, "loss": 246.6157, "step": 3970 }, { "epoch": 0.03215927730508488, "grad_norm": 1520.350341796875, "learning_rate": 1.6080808080808084e-05, "loss": 294.5478, "step": 3980 }, { "epoch": 0.03224007950936902, "grad_norm": 1362.8385009765625, "learning_rate": 1.6121212121212123e-05, "loss": 263.2382, "step": 3990 }, { "epoch": 0.032320881713653146, "grad_norm": 1330.2135009765625, "learning_rate": 1.6161616161616165e-05, "loss": 227.9952, "step": 4000 }, { "epoch": 0.03240168391793728, "grad_norm": 2001.479248046875, "learning_rate": 1.62020202020202e-05, "loss": 373.7298, "step": 4010 }, { "epoch": 0.03248248612222141, "grad_norm": 670.8789672851562, "learning_rate": 1.6242424242424243e-05, "loss": 252.2481, "step": 4020 }, { "epoch": 0.03256328832650555, "grad_norm": 1504.35205078125, "learning_rate": 1.628282828282828e-05, "loss": 259.7328, "step": 4030 }, { "epoch": 0.03264409053078968, "grad_norm": 1177.47509765625, "learning_rate": 1.6323232323232324e-05, "loss": 220.6592, "step": 4040 }, { "epoch": 0.03272489273507381, "grad_norm": 889.9537353515625, "learning_rate": 1.6363636363636366e-05, "loss": 210.0868, "step": 4050 }, { "epoch": 0.03280569493935795, "grad_norm": 1655.767333984375, "learning_rate": 1.6404040404040405e-05, "loss": 247.7082, "step": 4060 }, { "epoch": 0.03288649714364208, "grad_norm": 1741.26416015625, "learning_rate": 1.6444444444444447e-05, "loss": 213.1305, "step": 4070 }, { "epoch": 0.032967299347926214, "grad_norm": 1701.3470458984375, "learning_rate": 1.6484848484848486e-05, "loss": 197.4172, "step": 4080 }, { "epoch": 0.03304810155221034, "grad_norm": 1241.48876953125, "learning_rate": 1.6525252525252528e-05, "loss": 200.895, "step": 4090 }, { "epoch": 0.03312890375649448, "grad_norm": 4305.5234375, "learning_rate": 1.6565656565656567e-05, "loss": 270.1561, "step": 4100 }, { "epoch": 0.03320970596077861, "grad_norm": 1233.9559326171875, "learning_rate": 1.6606060606060606e-05, "loss": 234.697, "step": 4110 }, { "epoch": 0.033290508165062745, "grad_norm": 1864.9722900390625, "learning_rate": 1.6646464646464645e-05, "loss": 207.7519, "step": 4120 }, { "epoch": 0.033371310369346874, "grad_norm": 696.45654296875, "learning_rate": 1.6686868686868687e-05, "loss": 260.9977, "step": 4130 }, { "epoch": 0.03345211257363101, "grad_norm": 1083.8914794921875, "learning_rate": 1.672727272727273e-05, "loss": 296.5648, "step": 4140 }, { "epoch": 0.03353291477791514, "grad_norm": 787.8980102539062, "learning_rate": 1.6767676767676768e-05, "loss": 252.0068, "step": 4150 }, { "epoch": 0.033613716982199275, "grad_norm": 3963.899658203125, "learning_rate": 1.680808080808081e-05, "loss": 239.0976, "step": 4160 }, { "epoch": 0.033694519186483404, "grad_norm": 1345.8841552734375, "learning_rate": 1.684848484848485e-05, "loss": 200.5568, "step": 4170 }, { "epoch": 0.03377532139076754, "grad_norm": 1667.1441650390625, "learning_rate": 1.688888888888889e-05, "loss": 246.528, "step": 4180 }, { "epoch": 0.033856123595051676, "grad_norm": 941.3829956054688, "learning_rate": 1.692929292929293e-05, "loss": 188.0032, "step": 4190 }, { "epoch": 0.033936925799335806, "grad_norm": 2224.00048828125, "learning_rate": 1.6969696969696972e-05, "loss": 233.2688, "step": 4200 }, { "epoch": 0.03401772800361994, "grad_norm": 990.577880859375, "learning_rate": 1.701010101010101e-05, "loss": 229.0408, "step": 4210 }, { "epoch": 0.03409853020790407, "grad_norm": 1741.591064453125, "learning_rate": 1.705050505050505e-05, "loss": 210.8973, "step": 4220 }, { "epoch": 0.03417933241218821, "grad_norm": 1565.2149658203125, "learning_rate": 1.7090909090909092e-05, "loss": 172.9691, "step": 4230 }, { "epoch": 0.034260134616472336, "grad_norm": 1411.6668701171875, "learning_rate": 1.713131313131313e-05, "loss": 223.8018, "step": 4240 }, { "epoch": 0.03434093682075647, "grad_norm": 849.447998046875, "learning_rate": 1.7171717171717173e-05, "loss": 277.72, "step": 4250 }, { "epoch": 0.0344217390250406, "grad_norm": 1456.3353271484375, "learning_rate": 1.7212121212121212e-05, "loss": 269.1795, "step": 4260 }, { "epoch": 0.03450254122932474, "grad_norm": 2039.048583984375, "learning_rate": 1.7252525252525255e-05, "loss": 203.6644, "step": 4270 }, { "epoch": 0.03458334343360887, "grad_norm": 1037.1063232421875, "learning_rate": 1.7292929292929293e-05, "loss": 268.1442, "step": 4280 }, { "epoch": 0.034664145637893, "grad_norm": 1481.98095703125, "learning_rate": 1.7333333333333336e-05, "loss": 246.1609, "step": 4290 }, { "epoch": 0.03474494784217713, "grad_norm": 1042.147216796875, "learning_rate": 1.7373737373737375e-05, "loss": 360.0711, "step": 4300 }, { "epoch": 0.03482575004646127, "grad_norm": 1008.8258666992188, "learning_rate": 1.7414141414141417e-05, "loss": 254.1684, "step": 4310 }, { "epoch": 0.0349065522507454, "grad_norm": 1818.73681640625, "learning_rate": 1.7454545454545456e-05, "loss": 248.8469, "step": 4320 }, { "epoch": 0.03498735445502953, "grad_norm": 2598.832763671875, "learning_rate": 1.7494949494949494e-05, "loss": 215.4962, "step": 4330 }, { "epoch": 0.03506815665931367, "grad_norm": 5505.1572265625, "learning_rate": 1.7535353535353537e-05, "loss": 206.1825, "step": 4340 }, { "epoch": 0.0351489588635978, "grad_norm": 872.9111328125, "learning_rate": 1.7575757575757576e-05, "loss": 226.9096, "step": 4350 }, { "epoch": 0.035229761067881935, "grad_norm": 1309.483154296875, "learning_rate": 1.7616161616161618e-05, "loss": 347.4825, "step": 4360 }, { "epoch": 0.035310563272166064, "grad_norm": 1847.357666015625, "learning_rate": 1.7656565656565657e-05, "loss": 283.7126, "step": 4370 }, { "epoch": 0.0353913654764502, "grad_norm": 1132.7510986328125, "learning_rate": 1.76969696969697e-05, "loss": 238.7522, "step": 4380 }, { "epoch": 0.03547216768073433, "grad_norm": 1338.4906005859375, "learning_rate": 1.7737373737373738e-05, "loss": 206.4677, "step": 4390 }, { "epoch": 0.035552969885018465, "grad_norm": 889.9144897460938, "learning_rate": 1.777777777777778e-05, "loss": 218.258, "step": 4400 }, { "epoch": 0.035633772089302594, "grad_norm": 1081.747314453125, "learning_rate": 1.781818181818182e-05, "loss": 227.4267, "step": 4410 }, { "epoch": 0.03571457429358673, "grad_norm": 1337.2747802734375, "learning_rate": 1.785858585858586e-05, "loss": 216.5905, "step": 4420 }, { "epoch": 0.03579537649787086, "grad_norm": 1070.0733642578125, "learning_rate": 1.78989898989899e-05, "loss": 244.2413, "step": 4430 }, { "epoch": 0.035876178702154995, "grad_norm": 2713.52392578125, "learning_rate": 1.793939393939394e-05, "loss": 214.338, "step": 4440 }, { "epoch": 0.035956980906439125, "grad_norm": 1579.244873046875, "learning_rate": 1.797979797979798e-05, "loss": 236.9733, "step": 4450 }, { "epoch": 0.03603778311072326, "grad_norm": 1429.0421142578125, "learning_rate": 1.802020202020202e-05, "loss": 210.1912, "step": 4460 }, { "epoch": 0.0361185853150074, "grad_norm": 1236.484375, "learning_rate": 1.8060606060606062e-05, "loss": 209.4003, "step": 4470 }, { "epoch": 0.036199387519291526, "grad_norm": 2510.634521484375, "learning_rate": 1.81010101010101e-05, "loss": 261.3127, "step": 4480 }, { "epoch": 0.03628018972357566, "grad_norm": 697.04345703125, "learning_rate": 1.8141414141414143e-05, "loss": 202.3308, "step": 4490 }, { "epoch": 0.03636099192785979, "grad_norm": 1664.605712890625, "learning_rate": 1.8181818181818182e-05, "loss": 230.7549, "step": 4500 }, { "epoch": 0.03644179413214393, "grad_norm": 1968.6279296875, "learning_rate": 1.8222222222222224e-05, "loss": 222.0125, "step": 4510 }, { "epoch": 0.036522596336428056, "grad_norm": 1813.247314453125, "learning_rate": 1.8262626262626263e-05, "loss": 206.1146, "step": 4520 }, { "epoch": 0.03660339854071219, "grad_norm": 1681.3162841796875, "learning_rate": 1.8303030303030305e-05, "loss": 281.5203, "step": 4530 }, { "epoch": 0.03668420074499632, "grad_norm": 813.0327758789062, "learning_rate": 1.8343434343434344e-05, "loss": 241.7396, "step": 4540 }, { "epoch": 0.03676500294928046, "grad_norm": 1714.4927978515625, "learning_rate": 1.8383838383838383e-05, "loss": 229.0337, "step": 4550 }, { "epoch": 0.03684580515356459, "grad_norm": 1173.26318359375, "learning_rate": 1.8424242424242425e-05, "loss": 167.8814, "step": 4560 }, { "epoch": 0.03692660735784872, "grad_norm": 1044.22509765625, "learning_rate": 1.8464646464646464e-05, "loss": 181.4134, "step": 4570 }, { "epoch": 0.03700740956213285, "grad_norm": 1544.4964599609375, "learning_rate": 1.8505050505050506e-05, "loss": 264.1711, "step": 4580 }, { "epoch": 0.03708821176641699, "grad_norm": 3204.8271484375, "learning_rate": 1.8545454545454545e-05, "loss": 209.0515, "step": 4590 }, { "epoch": 0.03716901397070112, "grad_norm": 1948.9998779296875, "learning_rate": 1.8585858585858588e-05, "loss": 204.1481, "step": 4600 }, { "epoch": 0.037249816174985254, "grad_norm": 985.3388671875, "learning_rate": 1.8626262626262626e-05, "loss": 251.0652, "step": 4610 }, { "epoch": 0.03733061837926939, "grad_norm": 4716.29833984375, "learning_rate": 1.866666666666667e-05, "loss": 234.005, "step": 4620 }, { "epoch": 0.03741142058355352, "grad_norm": 2745.129150390625, "learning_rate": 1.8707070707070707e-05, "loss": 222.8053, "step": 4630 }, { "epoch": 0.037492222787837655, "grad_norm": 852.2494506835938, "learning_rate": 1.874747474747475e-05, "loss": 244.6, "step": 4640 }, { "epoch": 0.037573024992121784, "grad_norm": 1276.906494140625, "learning_rate": 1.878787878787879e-05, "loss": 243.4739, "step": 4650 }, { "epoch": 0.03765382719640592, "grad_norm": 2488.490478515625, "learning_rate": 1.8828282828282827e-05, "loss": 241.5105, "step": 4660 }, { "epoch": 0.03773462940069005, "grad_norm": 1208.5731201171875, "learning_rate": 1.886868686868687e-05, "loss": 266.4298, "step": 4670 }, { "epoch": 0.037815431604974185, "grad_norm": 1110.9935302734375, "learning_rate": 1.890909090909091e-05, "loss": 220.2013, "step": 4680 }, { "epoch": 0.037896233809258315, "grad_norm": 966.4763793945312, "learning_rate": 1.894949494949495e-05, "loss": 213.4089, "step": 4690 }, { "epoch": 0.03797703601354245, "grad_norm": 888.4136352539062, "learning_rate": 1.898989898989899e-05, "loss": 192.6133, "step": 4700 }, { "epoch": 0.03805783821782658, "grad_norm": 1441.930419921875, "learning_rate": 1.9030303030303032e-05, "loss": 210.6855, "step": 4710 }, { "epoch": 0.038138640422110716, "grad_norm": 1268.2919921875, "learning_rate": 1.907070707070707e-05, "loss": 196.9399, "step": 4720 }, { "epoch": 0.038219442626394845, "grad_norm": 714.101318359375, "learning_rate": 1.9111111111111113e-05, "loss": 236.1493, "step": 4730 }, { "epoch": 0.03830024483067898, "grad_norm": 1360.3662109375, "learning_rate": 1.9151515151515155e-05, "loss": 277.1614, "step": 4740 }, { "epoch": 0.03838104703496312, "grad_norm": 857.1802368164062, "learning_rate": 1.919191919191919e-05, "loss": 233.6975, "step": 4750 }, { "epoch": 0.038461849239247246, "grad_norm": 1430.3370361328125, "learning_rate": 1.9232323232323233e-05, "loss": 206.9375, "step": 4760 }, { "epoch": 0.03854265144353138, "grad_norm": 999.745849609375, "learning_rate": 1.9272727272727272e-05, "loss": 177.6682, "step": 4770 }, { "epoch": 0.03862345364781551, "grad_norm": 1979.0234375, "learning_rate": 1.9313131313131314e-05, "loss": 237.5471, "step": 4780 }, { "epoch": 0.03870425585209965, "grad_norm": 1399.9544677734375, "learning_rate": 1.9353535353535353e-05, "loss": 209.8267, "step": 4790 }, { "epoch": 0.03878505805638378, "grad_norm": 1058.5128173828125, "learning_rate": 1.9393939393939395e-05, "loss": 206.1269, "step": 4800 }, { "epoch": 0.03886586026066791, "grad_norm": 1852.674072265625, "learning_rate": 1.9434343434343434e-05, "loss": 192.6013, "step": 4810 }, { "epoch": 0.03894666246495204, "grad_norm": 1104.2967529296875, "learning_rate": 1.9474747474747476e-05, "loss": 252.5522, "step": 4820 }, { "epoch": 0.03902746466923618, "grad_norm": 1426.0396728515625, "learning_rate": 1.951515151515152e-05, "loss": 250.6448, "step": 4830 }, { "epoch": 0.03910826687352031, "grad_norm": 1632.4510498046875, "learning_rate": 1.9555555555555557e-05, "loss": 163.3638, "step": 4840 }, { "epoch": 0.03918906907780444, "grad_norm": 700.0907592773438, "learning_rate": 1.95959595959596e-05, "loss": 236.7388, "step": 4850 }, { "epoch": 0.03926987128208857, "grad_norm": 1205.572265625, "learning_rate": 1.9636363636363635e-05, "loss": 272.2705, "step": 4860 }, { "epoch": 0.03935067348637271, "grad_norm": 799.412353515625, "learning_rate": 1.9676767676767677e-05, "loss": 171.4291, "step": 4870 }, { "epoch": 0.03943147569065684, "grad_norm": 1350.2025146484375, "learning_rate": 1.9717171717171716e-05, "loss": 233.9921, "step": 4880 }, { "epoch": 0.039512277894940974, "grad_norm": 976.219970703125, "learning_rate": 1.975757575757576e-05, "loss": 189.0711, "step": 4890 }, { "epoch": 0.03959308009922511, "grad_norm": 947.8401489257812, "learning_rate": 1.9797979797979797e-05, "loss": 207.2786, "step": 4900 }, { "epoch": 0.03967388230350924, "grad_norm": 1402.2440185546875, "learning_rate": 1.983838383838384e-05, "loss": 233.6717, "step": 4910 }, { "epoch": 0.039754684507793375, "grad_norm": 2319.2314453125, "learning_rate": 1.987878787878788e-05, "loss": 268.4254, "step": 4920 }, { "epoch": 0.039835486712077504, "grad_norm": 1344.019775390625, "learning_rate": 1.991919191919192e-05, "loss": 215.5304, "step": 4930 }, { "epoch": 0.03991628891636164, "grad_norm": 1209.1622314453125, "learning_rate": 1.9959595959595963e-05, "loss": 202.8059, "step": 4940 }, { "epoch": 0.03999709112064577, "grad_norm": 1872.3892822265625, "learning_rate": 2e-05, "loss": 193.5764, "step": 4950 }, { "epoch": 0.040077893324929906, "grad_norm": 1944.2449951171875, "learning_rate": 2.0040404040404044e-05, "loss": 273.8487, "step": 4960 }, { "epoch": 0.040158695529214035, "grad_norm": 988.1495361328125, "learning_rate": 2.008080808080808e-05, "loss": 202.3245, "step": 4970 }, { "epoch": 0.04023949773349817, "grad_norm": 1082.6280517578125, "learning_rate": 2.012121212121212e-05, "loss": 190.6009, "step": 4980 }, { "epoch": 0.0403202999377823, "grad_norm": 1510.5738525390625, "learning_rate": 2.016161616161616e-05, "loss": 262.4282, "step": 4990 }, { "epoch": 0.040401102142066436, "grad_norm": 1080.0328369140625, "learning_rate": 2.0202020202020203e-05, "loss": 179.5178, "step": 5000 }, { "epoch": 0.040481904346350565, "grad_norm": 1204.5341796875, "learning_rate": 2.0242424242424245e-05, "loss": 208.4234, "step": 5010 }, { "epoch": 0.0405627065506347, "grad_norm": 788.6203002929688, "learning_rate": 2.0282828282828284e-05, "loss": 222.1854, "step": 5020 }, { "epoch": 0.04064350875491884, "grad_norm": 2447.934326171875, "learning_rate": 2.0323232323232326e-05, "loss": 183.1969, "step": 5030 }, { "epoch": 0.04072431095920297, "grad_norm": 1879.5914306640625, "learning_rate": 2.0363636363636365e-05, "loss": 235.8428, "step": 5040 }, { "epoch": 0.0408051131634871, "grad_norm": 859.5083618164062, "learning_rate": 2.0404040404040407e-05, "loss": 223.1974, "step": 5050 }, { "epoch": 0.04088591536777123, "grad_norm": 591.982421875, "learning_rate": 2.0444444444444446e-05, "loss": 195.905, "step": 5060 }, { "epoch": 0.04096671757205537, "grad_norm": 2516.256103515625, "learning_rate": 2.0484848484848485e-05, "loss": 224.4586, "step": 5070 }, { "epoch": 0.0410475197763395, "grad_norm": 1155.78271484375, "learning_rate": 2.0525252525252524e-05, "loss": 237.8034, "step": 5080 }, { "epoch": 0.04112832198062363, "grad_norm": 760.8511962890625, "learning_rate": 2.0565656565656566e-05, "loss": 213.4372, "step": 5090 }, { "epoch": 0.04120912418490776, "grad_norm": 746.3182983398438, "learning_rate": 2.0606060606060608e-05, "loss": 246.9279, "step": 5100 }, { "epoch": 0.0412899263891919, "grad_norm": 1112.6119384765625, "learning_rate": 2.0646464646464647e-05, "loss": 215.4636, "step": 5110 }, { "epoch": 0.04137072859347603, "grad_norm": 1308.880126953125, "learning_rate": 2.068686868686869e-05, "loss": 184.3576, "step": 5120 }, { "epoch": 0.041451530797760164, "grad_norm": 1182.3695068359375, "learning_rate": 2.0727272727272728e-05, "loss": 251.3663, "step": 5130 }, { "epoch": 0.04153233300204429, "grad_norm": 3545.449951171875, "learning_rate": 2.076767676767677e-05, "loss": 221.7183, "step": 5140 }, { "epoch": 0.04161313520632843, "grad_norm": 1155.616455078125, "learning_rate": 2.080808080808081e-05, "loss": 181.7703, "step": 5150 }, { "epoch": 0.04169393741061256, "grad_norm": 927.0892333984375, "learning_rate": 2.084848484848485e-05, "loss": 242.7771, "step": 5160 }, { "epoch": 0.041774739614896694, "grad_norm": 1621.09326171875, "learning_rate": 2.088888888888889e-05, "loss": 168.8398, "step": 5170 }, { "epoch": 0.04185554181918083, "grad_norm": 1823.0281982421875, "learning_rate": 2.092929292929293e-05, "loss": 226.3993, "step": 5180 }, { "epoch": 0.04193634402346496, "grad_norm": 1904.581298828125, "learning_rate": 2.096969696969697e-05, "loss": 274.6904, "step": 5190 }, { "epoch": 0.042017146227749096, "grad_norm": 1195.8973388671875, "learning_rate": 2.101010101010101e-05, "loss": 193.929, "step": 5200 }, { "epoch": 0.042097948432033225, "grad_norm": 809.5712890625, "learning_rate": 2.1050505050505052e-05, "loss": 183.259, "step": 5210 }, { "epoch": 0.04217875063631736, "grad_norm": 1392.5491943359375, "learning_rate": 2.109090909090909e-05, "loss": 220.0326, "step": 5220 }, { "epoch": 0.04225955284060149, "grad_norm": 1818.6051025390625, "learning_rate": 2.1131313131313134e-05, "loss": 209.3423, "step": 5230 }, { "epoch": 0.042340355044885626, "grad_norm": 756.583740234375, "learning_rate": 2.1171717171717172e-05, "loss": 152.79, "step": 5240 }, { "epoch": 0.042421157249169755, "grad_norm": 1358.5194091796875, "learning_rate": 2.1212121212121215e-05, "loss": 223.6846, "step": 5250 }, { "epoch": 0.04250195945345389, "grad_norm": 2302.727783203125, "learning_rate": 2.1252525252525254e-05, "loss": 206.5412, "step": 5260 }, { "epoch": 0.04258276165773802, "grad_norm": 1090.666259765625, "learning_rate": 2.1292929292929296e-05, "loss": 197.9379, "step": 5270 }, { "epoch": 0.04266356386202216, "grad_norm": 1535.5264892578125, "learning_rate": 2.1333333333333335e-05, "loss": 172.5529, "step": 5280 }, { "epoch": 0.042744366066306286, "grad_norm": 1242.1055908203125, "learning_rate": 2.1373737373737373e-05, "loss": 182.2667, "step": 5290 }, { "epoch": 0.04282516827059042, "grad_norm": 1571.221923828125, "learning_rate": 2.1414141414141416e-05, "loss": 206.9152, "step": 5300 }, { "epoch": 0.04290597047487456, "grad_norm": 1733.92578125, "learning_rate": 2.1454545454545455e-05, "loss": 253.6228, "step": 5310 }, { "epoch": 0.04298677267915869, "grad_norm": 1736.4722900390625, "learning_rate": 2.1494949494949497e-05, "loss": 209.5105, "step": 5320 }, { "epoch": 0.04306757488344282, "grad_norm": 846.6854248046875, "learning_rate": 2.1535353535353536e-05, "loss": 227.6331, "step": 5330 }, { "epoch": 0.04314837708772695, "grad_norm": 793.491943359375, "learning_rate": 2.1575757575757578e-05, "loss": 190.9206, "step": 5340 }, { "epoch": 0.04322917929201109, "grad_norm": 1314.4940185546875, "learning_rate": 2.1616161616161617e-05, "loss": 278.1586, "step": 5350 }, { "epoch": 0.04330998149629522, "grad_norm": 1807.1669921875, "learning_rate": 2.165656565656566e-05, "loss": 246.6954, "step": 5360 }, { "epoch": 0.043390783700579354, "grad_norm": 1456.6739501953125, "learning_rate": 2.1696969696969698e-05, "loss": 198.1051, "step": 5370 }, { "epoch": 0.04347158590486348, "grad_norm": 2645.863037109375, "learning_rate": 2.173737373737374e-05, "loss": 263.7012, "step": 5380 }, { "epoch": 0.04355238810914762, "grad_norm": 890.2818603515625, "learning_rate": 2.177777777777778e-05, "loss": 206.08, "step": 5390 }, { "epoch": 0.04363319031343175, "grad_norm": 1066.948974609375, "learning_rate": 2.1818181818181818e-05, "loss": 203.3024, "step": 5400 }, { "epoch": 0.043713992517715884, "grad_norm": 1678.3651123046875, "learning_rate": 2.185858585858586e-05, "loss": 287.4994, "step": 5410 }, { "epoch": 0.04379479472200001, "grad_norm": 1427.133544921875, "learning_rate": 2.18989898989899e-05, "loss": 236.3808, "step": 5420 }, { "epoch": 0.04387559692628415, "grad_norm": 993.3723754882812, "learning_rate": 2.193939393939394e-05, "loss": 221.2247, "step": 5430 }, { "epoch": 0.04395639913056828, "grad_norm": 919.2279663085938, "learning_rate": 2.197979797979798e-05, "loss": 232.8961, "step": 5440 }, { "epoch": 0.044037201334852415, "grad_norm": 1196.51904296875, "learning_rate": 2.2020202020202022e-05, "loss": 208.7773, "step": 5450 }, { "epoch": 0.04411800353913655, "grad_norm": 937.6903076171875, "learning_rate": 2.206060606060606e-05, "loss": 159.7425, "step": 5460 }, { "epoch": 0.04419880574342068, "grad_norm": 2946.419921875, "learning_rate": 2.2101010101010103e-05, "loss": 201.0844, "step": 5470 }, { "epoch": 0.044279607947704816, "grad_norm": 1663.4422607421875, "learning_rate": 2.2141414141414142e-05, "loss": 140.8333, "step": 5480 }, { "epoch": 0.044360410151988945, "grad_norm": 1202.589599609375, "learning_rate": 2.2181818181818184e-05, "loss": 210.6169, "step": 5490 }, { "epoch": 0.04444121235627308, "grad_norm": 1676.0555419921875, "learning_rate": 2.2222222222222223e-05, "loss": 300.5972, "step": 5500 }, { "epoch": 0.04452201456055721, "grad_norm": 1122.7333984375, "learning_rate": 2.2262626262626262e-05, "loss": 223.7688, "step": 5510 }, { "epoch": 0.044602816764841346, "grad_norm": 842.3754272460938, "learning_rate": 2.2303030303030304e-05, "loss": 231.1573, "step": 5520 }, { "epoch": 0.044683618969125476, "grad_norm": 912.3519897460938, "learning_rate": 2.2343434343434343e-05, "loss": 161.5479, "step": 5530 }, { "epoch": 0.04476442117340961, "grad_norm": 2117.377197265625, "learning_rate": 2.2383838383838385e-05, "loss": 178.251, "step": 5540 }, { "epoch": 0.04484522337769374, "grad_norm": 1402.2164306640625, "learning_rate": 2.2424242424242424e-05, "loss": 209.2086, "step": 5550 }, { "epoch": 0.04492602558197788, "grad_norm": 1458.323974609375, "learning_rate": 2.2464646464646467e-05, "loss": 243.9479, "step": 5560 }, { "epoch": 0.045006827786262006, "grad_norm": 2175.216796875, "learning_rate": 2.2505050505050505e-05, "loss": 189.8892, "step": 5570 }, { "epoch": 0.04508762999054614, "grad_norm": 1899.4354248046875, "learning_rate": 2.2545454545454548e-05, "loss": 335.0552, "step": 5580 }, { "epoch": 0.04516843219483028, "grad_norm": 1230.814697265625, "learning_rate": 2.2585858585858587e-05, "loss": 194.9335, "step": 5590 }, { "epoch": 0.04524923439911441, "grad_norm": 2101.527587890625, "learning_rate": 2.262626262626263e-05, "loss": 257.3806, "step": 5600 }, { "epoch": 0.045330036603398544, "grad_norm": 1695.30810546875, "learning_rate": 2.2666666666666668e-05, "loss": 219.7137, "step": 5610 }, { "epoch": 0.04541083880768267, "grad_norm": 1386.2855224609375, "learning_rate": 2.2707070707070706e-05, "loss": 236.2214, "step": 5620 }, { "epoch": 0.04549164101196681, "grad_norm": 1138.779052734375, "learning_rate": 2.274747474747475e-05, "loss": 192.9845, "step": 5630 }, { "epoch": 0.04557244321625094, "grad_norm": 2650.991943359375, "learning_rate": 2.2787878787878788e-05, "loss": 233.3904, "step": 5640 }, { "epoch": 0.045653245420535074, "grad_norm": 1309.0333251953125, "learning_rate": 2.282828282828283e-05, "loss": 225.3846, "step": 5650 }, { "epoch": 0.0457340476248192, "grad_norm": 930.385009765625, "learning_rate": 2.286868686868687e-05, "loss": 236.1336, "step": 5660 }, { "epoch": 0.04581484982910334, "grad_norm": 1646.2891845703125, "learning_rate": 2.290909090909091e-05, "loss": 227.3526, "step": 5670 }, { "epoch": 0.04589565203338747, "grad_norm": 2285.751708984375, "learning_rate": 2.294949494949495e-05, "loss": 236.6346, "step": 5680 }, { "epoch": 0.045976454237671605, "grad_norm": 3180.75537109375, "learning_rate": 2.2989898989898992e-05, "loss": 177.5457, "step": 5690 }, { "epoch": 0.046057256441955734, "grad_norm": 1423.35009765625, "learning_rate": 2.3030303030303034e-05, "loss": 194.2139, "step": 5700 }, { "epoch": 0.04613805864623987, "grad_norm": 1577.701171875, "learning_rate": 2.307070707070707e-05, "loss": 183.8717, "step": 5710 }, { "epoch": 0.046218860850524, "grad_norm": 1255.1485595703125, "learning_rate": 2.3111111111111112e-05, "loss": 213.8492, "step": 5720 }, { "epoch": 0.046299663054808135, "grad_norm": 1154.9453125, "learning_rate": 2.315151515151515e-05, "loss": 219.6154, "step": 5730 }, { "epoch": 0.04638046525909227, "grad_norm": 3208.9140625, "learning_rate": 2.3191919191919193e-05, "loss": 212.2527, "step": 5740 }, { "epoch": 0.0464612674633764, "grad_norm": 826.8831787109375, "learning_rate": 2.3232323232323232e-05, "loss": 193.0573, "step": 5750 }, { "epoch": 0.046542069667660536, "grad_norm": 953.578369140625, "learning_rate": 2.3272727272727274e-05, "loss": 200.1285, "step": 5760 }, { "epoch": 0.046622871871944666, "grad_norm": 948.6517944335938, "learning_rate": 2.3313131313131313e-05, "loss": 226.3946, "step": 5770 }, { "epoch": 0.0467036740762288, "grad_norm": 1502.9415283203125, "learning_rate": 2.3353535353535355e-05, "loss": 301.4247, "step": 5780 }, { "epoch": 0.04678447628051293, "grad_norm": 592.7190551757812, "learning_rate": 2.3393939393939397e-05, "loss": 171.6613, "step": 5790 }, { "epoch": 0.04686527848479707, "grad_norm": 774.3163452148438, "learning_rate": 2.3434343434343436e-05, "loss": 174.7567, "step": 5800 }, { "epoch": 0.046946080689081196, "grad_norm": 1000.3840942382812, "learning_rate": 2.347474747474748e-05, "loss": 140.1143, "step": 5810 }, { "epoch": 0.04702688289336533, "grad_norm": 1050.761474609375, "learning_rate": 2.3515151515151514e-05, "loss": 234.2542, "step": 5820 }, { "epoch": 0.04710768509764946, "grad_norm": 1076.979248046875, "learning_rate": 2.3555555555555556e-05, "loss": 170.2877, "step": 5830 }, { "epoch": 0.0471884873019336, "grad_norm": 1528.865478515625, "learning_rate": 2.3595959595959595e-05, "loss": 280.3715, "step": 5840 }, { "epoch": 0.047269289506217727, "grad_norm": 1554.0205078125, "learning_rate": 2.3636363636363637e-05, "loss": 258.9206, "step": 5850 }, { "epoch": 0.04735009171050186, "grad_norm": 969.7879028320312, "learning_rate": 2.3676767676767676e-05, "loss": 173.4592, "step": 5860 }, { "epoch": 0.047430893914786, "grad_norm": 1271.55322265625, "learning_rate": 2.371717171717172e-05, "loss": 187.7373, "step": 5870 }, { "epoch": 0.04751169611907013, "grad_norm": 757.3799438476562, "learning_rate": 2.375757575757576e-05, "loss": 206.0978, "step": 5880 }, { "epoch": 0.047592498323354264, "grad_norm": 1099.2119140625, "learning_rate": 2.37979797979798e-05, "loss": 191.4486, "step": 5890 }, { "epoch": 0.04767330052763839, "grad_norm": 895.0558471679688, "learning_rate": 2.3838383838383842e-05, "loss": 197.1677, "step": 5900 }, { "epoch": 0.04775410273192253, "grad_norm": 900.752685546875, "learning_rate": 2.387878787878788e-05, "loss": 209.3482, "step": 5910 }, { "epoch": 0.04783490493620666, "grad_norm": 865.3425903320312, "learning_rate": 2.3919191919191923e-05, "loss": 211.704, "step": 5920 }, { "epoch": 0.047915707140490794, "grad_norm": 1376.961181640625, "learning_rate": 2.395959595959596e-05, "loss": 197.2012, "step": 5930 }, { "epoch": 0.047996509344774924, "grad_norm": 2671.92236328125, "learning_rate": 2.4e-05, "loss": 262.8319, "step": 5940 }, { "epoch": 0.04807731154905906, "grad_norm": 4328.66552734375, "learning_rate": 2.404040404040404e-05, "loss": 263.4226, "step": 5950 }, { "epoch": 0.04815811375334319, "grad_norm": 1454.4398193359375, "learning_rate": 2.4080808080808082e-05, "loss": 173.7909, "step": 5960 }, { "epoch": 0.048238915957627325, "grad_norm": 1238.2913818359375, "learning_rate": 2.4121212121212124e-05, "loss": 190.8571, "step": 5970 }, { "epoch": 0.048319718161911454, "grad_norm": 1106.6146240234375, "learning_rate": 2.4161616161616163e-05, "loss": 252.6962, "step": 5980 }, { "epoch": 0.04840052036619559, "grad_norm": 1612.1171875, "learning_rate": 2.4202020202020205e-05, "loss": 176.1714, "step": 5990 }, { "epoch": 0.04848132257047972, "grad_norm": 684.4707641601562, "learning_rate": 2.4242424242424244e-05, "loss": 236.6299, "step": 6000 }, { "epoch": 0.048562124774763855, "grad_norm": 5278.638671875, "learning_rate": 2.4282828282828286e-05, "loss": 200.9588, "step": 6010 }, { "epoch": 0.04864292697904799, "grad_norm": 2136.859375, "learning_rate": 2.4323232323232325e-05, "loss": 249.8048, "step": 6020 }, { "epoch": 0.04872372918333212, "grad_norm": 704.8456420898438, "learning_rate": 2.4363636363636364e-05, "loss": 210.816, "step": 6030 }, { "epoch": 0.04880453138761626, "grad_norm": 2405.291259765625, "learning_rate": 2.4404040404040403e-05, "loss": 180.7068, "step": 6040 }, { "epoch": 0.048885333591900386, "grad_norm": 1121.5928955078125, "learning_rate": 2.4444444444444445e-05, "loss": 268.1764, "step": 6050 }, { "epoch": 0.04896613579618452, "grad_norm": 1185.4925537109375, "learning_rate": 2.4484848484848484e-05, "loss": 252.5901, "step": 6060 }, { "epoch": 0.04904693800046865, "grad_norm": 1037.7261962890625, "learning_rate": 2.4525252525252526e-05, "loss": 217.7089, "step": 6070 }, { "epoch": 0.04912774020475279, "grad_norm": 3574.91943359375, "learning_rate": 2.4565656565656568e-05, "loss": 248.0757, "step": 6080 }, { "epoch": 0.049208542409036916, "grad_norm": 1335.7510986328125, "learning_rate": 2.4606060606060607e-05, "loss": 243.5903, "step": 6090 }, { "epoch": 0.04928934461332105, "grad_norm": 1548.2281494140625, "learning_rate": 2.464646464646465e-05, "loss": 204.2808, "step": 6100 }, { "epoch": 0.04937014681760518, "grad_norm": 1327.641357421875, "learning_rate": 2.4686868686868688e-05, "loss": 175.3226, "step": 6110 }, { "epoch": 0.04945094902188932, "grad_norm": 1096.567626953125, "learning_rate": 2.472727272727273e-05, "loss": 251.3891, "step": 6120 }, { "epoch": 0.04953175122617345, "grad_norm": 916.0780639648438, "learning_rate": 2.476767676767677e-05, "loss": 265.5964, "step": 6130 }, { "epoch": 0.04961255343045758, "grad_norm": 3319.821533203125, "learning_rate": 2.4808080808080808e-05, "loss": 199.656, "step": 6140 }, { "epoch": 0.04969335563474172, "grad_norm": 804.5398559570312, "learning_rate": 2.4848484848484847e-05, "loss": 176.793, "step": 6150 }, { "epoch": 0.04977415783902585, "grad_norm": 1266.6590576171875, "learning_rate": 2.488888888888889e-05, "loss": 172.6065, "step": 6160 }, { "epoch": 0.049854960043309984, "grad_norm": 953.1856689453125, "learning_rate": 2.492929292929293e-05, "loss": 259.3056, "step": 6170 }, { "epoch": 0.049935762247594113, "grad_norm": 1643.679443359375, "learning_rate": 2.496969696969697e-05, "loss": 227.5671, "step": 6180 }, { "epoch": 0.05001656445187825, "grad_norm": 2092.3837890625, "learning_rate": 2.5010101010101013e-05, "loss": 231.7141, "step": 6190 }, { "epoch": 0.05009736665616238, "grad_norm": 5872.7822265625, "learning_rate": 2.505050505050505e-05, "loss": 307.3282, "step": 6200 }, { "epoch": 0.050178168860446515, "grad_norm": 1653.10888671875, "learning_rate": 2.5090909090909094e-05, "loss": 290.71, "step": 6210 }, { "epoch": 0.050258971064730644, "grad_norm": 5940.2861328125, "learning_rate": 2.5131313131313133e-05, "loss": 298.1718, "step": 6220 }, { "epoch": 0.05033977326901478, "grad_norm": 1056.6617431640625, "learning_rate": 2.5171717171717175e-05, "loss": 167.573, "step": 6230 }, { "epoch": 0.05042057547329891, "grad_norm": 1492.5479736328125, "learning_rate": 2.5212121212121214e-05, "loss": 209.9481, "step": 6240 }, { "epoch": 0.050501377677583045, "grad_norm": 764.5651245117188, "learning_rate": 2.5252525252525256e-05, "loss": 189.9945, "step": 6250 }, { "epoch": 0.050582179881867174, "grad_norm": 2933.18603515625, "learning_rate": 2.5292929292929295e-05, "loss": 228.2252, "step": 6260 }, { "epoch": 0.05066298208615131, "grad_norm": 2692.583740234375, "learning_rate": 2.5333333333333337e-05, "loss": 217.1123, "step": 6270 }, { "epoch": 0.05074378429043544, "grad_norm": 1611.5694580078125, "learning_rate": 2.5373737373737376e-05, "loss": 199.2745, "step": 6280 }, { "epoch": 0.050824586494719576, "grad_norm": 638.3251953125, "learning_rate": 2.5414141414141418e-05, "loss": 283.4336, "step": 6290 }, { "epoch": 0.05090538869900371, "grad_norm": 960.7551879882812, "learning_rate": 2.5454545454545454e-05, "loss": 199.2895, "step": 6300 }, { "epoch": 0.05098619090328784, "grad_norm": 1416.6865234375, "learning_rate": 2.5494949494949492e-05, "loss": 247.6437, "step": 6310 }, { "epoch": 0.05106699310757198, "grad_norm": 962.5587158203125, "learning_rate": 2.5535353535353535e-05, "loss": 222.514, "step": 6320 }, { "epoch": 0.051147795311856106, "grad_norm": 1019.0704956054688, "learning_rate": 2.5575757575757573e-05, "loss": 233.7968, "step": 6330 }, { "epoch": 0.05122859751614024, "grad_norm": 1380.1087646484375, "learning_rate": 2.5616161616161616e-05, "loss": 203.472, "step": 6340 }, { "epoch": 0.05130939972042437, "grad_norm": 765.1551513671875, "learning_rate": 2.5656565656565658e-05, "loss": 202.9591, "step": 6350 }, { "epoch": 0.05139020192470851, "grad_norm": 854.4512329101562, "learning_rate": 2.5696969696969697e-05, "loss": 152.8654, "step": 6360 }, { "epoch": 0.05147100412899264, "grad_norm": 1366.1529541015625, "learning_rate": 2.573737373737374e-05, "loss": 202.4912, "step": 6370 }, { "epoch": 0.05155180633327677, "grad_norm": 812.153564453125, "learning_rate": 2.5777777777777778e-05, "loss": 190.2283, "step": 6380 }, { "epoch": 0.0516326085375609, "grad_norm": 2072.30029296875, "learning_rate": 2.581818181818182e-05, "loss": 250.4601, "step": 6390 }, { "epoch": 0.05171341074184504, "grad_norm": 1064.25732421875, "learning_rate": 2.585858585858586e-05, "loss": 243.8253, "step": 6400 }, { "epoch": 0.05179421294612917, "grad_norm": 1004.585205078125, "learning_rate": 2.58989898989899e-05, "loss": 233.6981, "step": 6410 }, { "epoch": 0.0518750151504133, "grad_norm": 781.0443115234375, "learning_rate": 2.593939393939394e-05, "loss": 231.5708, "step": 6420 }, { "epoch": 0.05195581735469744, "grad_norm": 1038.6923828125, "learning_rate": 2.5979797979797982e-05, "loss": 166.9408, "step": 6430 }, { "epoch": 0.05203661955898157, "grad_norm": 1369.49560546875, "learning_rate": 2.602020202020202e-05, "loss": 212.7086, "step": 6440 }, { "epoch": 0.052117421763265705, "grad_norm": 1065.5115966796875, "learning_rate": 2.6060606060606063e-05, "loss": 197.232, "step": 6450 }, { "epoch": 0.052198223967549834, "grad_norm": 1192.5135498046875, "learning_rate": 2.6101010101010102e-05, "loss": 210.3559, "step": 6460 }, { "epoch": 0.05227902617183397, "grad_norm": 2817.4658203125, "learning_rate": 2.6141414141414145e-05, "loss": 218.5129, "step": 6470 }, { "epoch": 0.0523598283761181, "grad_norm": 1661.2547607421875, "learning_rate": 2.6181818181818187e-05, "loss": 251.3214, "step": 6480 }, { "epoch": 0.052440630580402235, "grad_norm": 1465.83251953125, "learning_rate": 2.6222222222222226e-05, "loss": 167.4487, "step": 6490 }, { "epoch": 0.052521432784686364, "grad_norm": 1172.0814208984375, "learning_rate": 2.6262626262626268e-05, "loss": 202.9199, "step": 6500 }, { "epoch": 0.0526022349889705, "grad_norm": 845.3886108398438, "learning_rate": 2.63030303030303e-05, "loss": 245.5335, "step": 6510 }, { "epoch": 0.05268303719325463, "grad_norm": 1505.1903076171875, "learning_rate": 2.6343434343434342e-05, "loss": 189.6107, "step": 6520 }, { "epoch": 0.052763839397538766, "grad_norm": 855.8611450195312, "learning_rate": 2.6383838383838384e-05, "loss": 168.8279, "step": 6530 }, { "epoch": 0.052844641601822895, "grad_norm": 1719.1915283203125, "learning_rate": 2.6424242424242423e-05, "loss": 266.6213, "step": 6540 }, { "epoch": 0.05292544380610703, "grad_norm": 1334.455322265625, "learning_rate": 2.6464646464646466e-05, "loss": 154.8023, "step": 6550 }, { "epoch": 0.05300624601039116, "grad_norm": 1549.58154296875, "learning_rate": 2.6505050505050504e-05, "loss": 188.1264, "step": 6560 }, { "epoch": 0.053087048214675296, "grad_norm": 740.02587890625, "learning_rate": 2.6545454545454547e-05, "loss": 241.7192, "step": 6570 }, { "epoch": 0.05316785041895943, "grad_norm": 1759.15869140625, "learning_rate": 2.6585858585858585e-05, "loss": 249.789, "step": 6580 }, { "epoch": 0.05324865262324356, "grad_norm": 1615.3770751953125, "learning_rate": 2.6626262626262628e-05, "loss": 256.1343, "step": 6590 }, { "epoch": 0.0533294548275277, "grad_norm": 1187.103515625, "learning_rate": 2.6666666666666667e-05, "loss": 153.0139, "step": 6600 }, { "epoch": 0.05341025703181183, "grad_norm": 835.982177734375, "learning_rate": 2.670707070707071e-05, "loss": 240.5858, "step": 6610 }, { "epoch": 0.05349105923609596, "grad_norm": 658.9365234375, "learning_rate": 2.6747474747474748e-05, "loss": 212.2474, "step": 6620 }, { "epoch": 0.05357186144038009, "grad_norm": 836.15185546875, "learning_rate": 2.678787878787879e-05, "loss": 203.4595, "step": 6630 }, { "epoch": 0.05365266364466423, "grad_norm": 1312.960205078125, "learning_rate": 2.682828282828283e-05, "loss": 178.5055, "step": 6640 }, { "epoch": 0.05373346584894836, "grad_norm": 2402.58642578125, "learning_rate": 2.686868686868687e-05, "loss": 204.5425, "step": 6650 }, { "epoch": 0.05381426805323249, "grad_norm": 743.2178344726562, "learning_rate": 2.6909090909090913e-05, "loss": 151.0765, "step": 6660 }, { "epoch": 0.05389507025751662, "grad_norm": 2009.14599609375, "learning_rate": 2.6949494949494952e-05, "loss": 255.5723, "step": 6670 }, { "epoch": 0.05397587246180076, "grad_norm": 1129.924560546875, "learning_rate": 2.6989898989898994e-05, "loss": 249.5275, "step": 6680 }, { "epoch": 0.05405667466608489, "grad_norm": 1877.5682373046875, "learning_rate": 2.7030303030303033e-05, "loss": 201.4787, "step": 6690 }, { "epoch": 0.054137476870369024, "grad_norm": 1205.5860595703125, "learning_rate": 2.7070707070707075e-05, "loss": 165.1917, "step": 6700 }, { "epoch": 0.05421827907465316, "grad_norm": 833.5079956054688, "learning_rate": 2.7111111111111114e-05, "loss": 222.6354, "step": 6710 }, { "epoch": 0.05429908127893729, "grad_norm": 1644.57470703125, "learning_rate": 2.7151515151515157e-05, "loss": 183.0618, "step": 6720 }, { "epoch": 0.054379883483221425, "grad_norm": 1261.3482666015625, "learning_rate": 2.7191919191919192e-05, "loss": 204.8876, "step": 6730 }, { "epoch": 0.054460685687505554, "grad_norm": 1064.4910888671875, "learning_rate": 2.723232323232323e-05, "loss": 228.8735, "step": 6740 }, { "epoch": 0.05454148789178969, "grad_norm": 1227.28369140625, "learning_rate": 2.7272727272727273e-05, "loss": 244.5206, "step": 6750 }, { "epoch": 0.05462229009607382, "grad_norm": 747.6671142578125, "learning_rate": 2.7313131313131312e-05, "loss": 171.5991, "step": 6760 }, { "epoch": 0.054703092300357956, "grad_norm": 1191.174560546875, "learning_rate": 2.7353535353535354e-05, "loss": 183.062, "step": 6770 }, { "epoch": 0.054783894504642085, "grad_norm": 1179.271484375, "learning_rate": 2.7393939393939393e-05, "loss": 203.4914, "step": 6780 }, { "epoch": 0.05486469670892622, "grad_norm": 1980.94287109375, "learning_rate": 2.7434343434343435e-05, "loss": 190.7682, "step": 6790 }, { "epoch": 0.05494549891321035, "grad_norm": 1313.760498046875, "learning_rate": 2.7474747474747474e-05, "loss": 179.4395, "step": 6800 }, { "epoch": 0.055026301117494486, "grad_norm": 818.7135620117188, "learning_rate": 2.7515151515151516e-05, "loss": 240.7207, "step": 6810 }, { "epoch": 0.055107103321778615, "grad_norm": 1303.9735107421875, "learning_rate": 2.7555555555555555e-05, "loss": 197.5866, "step": 6820 }, { "epoch": 0.05518790552606275, "grad_norm": 4817.638671875, "learning_rate": 2.7595959595959597e-05, "loss": 236.2139, "step": 6830 }, { "epoch": 0.05526870773034688, "grad_norm": 1369.7080078125, "learning_rate": 2.7636363636363636e-05, "loss": 155.043, "step": 6840 }, { "epoch": 0.05534950993463102, "grad_norm": 1351.29150390625, "learning_rate": 2.767676767676768e-05, "loss": 193.5722, "step": 6850 }, { "epoch": 0.05543031213891515, "grad_norm": 1340.113525390625, "learning_rate": 2.771717171717172e-05, "loss": 196.4928, "step": 6860 }, { "epoch": 0.05551111434319928, "grad_norm": 1829.1298828125, "learning_rate": 2.775757575757576e-05, "loss": 207.2559, "step": 6870 }, { "epoch": 0.05559191654748342, "grad_norm": 1614.317138671875, "learning_rate": 2.7797979797979802e-05, "loss": 191.8481, "step": 6880 }, { "epoch": 0.05567271875176755, "grad_norm": 1409.754150390625, "learning_rate": 2.783838383838384e-05, "loss": 172.1698, "step": 6890 }, { "epoch": 0.05575352095605168, "grad_norm": 1008.7220458984375, "learning_rate": 2.7878787878787883e-05, "loss": 204.197, "step": 6900 }, { "epoch": 0.05583432316033581, "grad_norm": 1213.98291015625, "learning_rate": 2.7919191919191922e-05, "loss": 176.116, "step": 6910 }, { "epoch": 0.05591512536461995, "grad_norm": 1919.146484375, "learning_rate": 2.7959595959595964e-05, "loss": 205.5731, "step": 6920 }, { "epoch": 0.05599592756890408, "grad_norm": 1582.1240234375, "learning_rate": 2.8000000000000003e-05, "loss": 204.0053, "step": 6930 }, { "epoch": 0.056076729773188214, "grad_norm": 1082.2257080078125, "learning_rate": 2.804040404040404e-05, "loss": 216.7668, "step": 6940 }, { "epoch": 0.05615753197747234, "grad_norm": 1451.9715576171875, "learning_rate": 2.808080808080808e-05, "loss": 128.3858, "step": 6950 }, { "epoch": 0.05623833418175648, "grad_norm": 2580.067138671875, "learning_rate": 2.812121212121212e-05, "loss": 187.2646, "step": 6960 }, { "epoch": 0.05631913638604061, "grad_norm": 1153.5308837890625, "learning_rate": 2.8161616161616162e-05, "loss": 170.1935, "step": 6970 }, { "epoch": 0.056399938590324744, "grad_norm": 842.653076171875, "learning_rate": 2.82020202020202e-05, "loss": 229.306, "step": 6980 }, { "epoch": 0.05648074079460888, "grad_norm": 1086.96337890625, "learning_rate": 2.8242424242424243e-05, "loss": 180.9517, "step": 6990 }, { "epoch": 0.05656154299889301, "grad_norm": 963.1438598632812, "learning_rate": 2.8282828282828282e-05, "loss": 186.2078, "step": 7000 }, { "epoch": 0.056642345203177145, "grad_norm": 1010.3299560546875, "learning_rate": 2.8323232323232324e-05, "loss": 223.6001, "step": 7010 }, { "epoch": 0.056723147407461275, "grad_norm": 1217.844482421875, "learning_rate": 2.8363636363636363e-05, "loss": 179.5198, "step": 7020 }, { "epoch": 0.05680394961174541, "grad_norm": 1364.8577880859375, "learning_rate": 2.8404040404040405e-05, "loss": 212.5286, "step": 7030 }, { "epoch": 0.05688475181602954, "grad_norm": 804.541748046875, "learning_rate": 2.8444444444444447e-05, "loss": 201.4965, "step": 7040 }, { "epoch": 0.056965554020313676, "grad_norm": 2093.808349609375, "learning_rate": 2.8484848484848486e-05, "loss": 202.5039, "step": 7050 }, { "epoch": 0.057046356224597805, "grad_norm": 1088.9471435546875, "learning_rate": 2.852525252525253e-05, "loss": 164.6322, "step": 7060 }, { "epoch": 0.05712715842888194, "grad_norm": 1510.014404296875, "learning_rate": 2.8565656565656567e-05, "loss": 246.0487, "step": 7070 }, { "epoch": 0.05720796063316607, "grad_norm": 617.3926391601562, "learning_rate": 2.860606060606061e-05, "loss": 166.5255, "step": 7080 }, { "epoch": 0.057288762837450206, "grad_norm": 1088.094482421875, "learning_rate": 2.864646464646465e-05, "loss": 180.0012, "step": 7090 }, { "epoch": 0.057369565041734336, "grad_norm": 754.35400390625, "learning_rate": 2.868686868686869e-05, "loss": 165.6718, "step": 7100 }, { "epoch": 0.05745036724601847, "grad_norm": 847.3502197265625, "learning_rate": 2.872727272727273e-05, "loss": 150.3254, "step": 7110 }, { "epoch": 0.0575311694503026, "grad_norm": 3462.79541015625, "learning_rate": 2.876767676767677e-05, "loss": 206.9913, "step": 7120 }, { "epoch": 0.05761197165458674, "grad_norm": 1302.846923828125, "learning_rate": 2.880808080808081e-05, "loss": 218.2749, "step": 7130 }, { "epoch": 0.05769277385887087, "grad_norm": 1508.3194580078125, "learning_rate": 2.8848484848484853e-05, "loss": 198.5009, "step": 7140 }, { "epoch": 0.057773576063155, "grad_norm": 1260.8990478515625, "learning_rate": 2.8888888888888888e-05, "loss": 287.7319, "step": 7150 }, { "epoch": 0.05785437826743914, "grad_norm": 2510.641357421875, "learning_rate": 2.8929292929292927e-05, "loss": 212.435, "step": 7160 }, { "epoch": 0.05793518047172327, "grad_norm": 1610.3782958984375, "learning_rate": 2.896969696969697e-05, "loss": 195.9904, "step": 7170 }, { "epoch": 0.058015982676007403, "grad_norm": 2051.1611328125, "learning_rate": 2.9010101010101008e-05, "loss": 230.746, "step": 7180 }, { "epoch": 0.05809678488029153, "grad_norm": 1708.345703125, "learning_rate": 2.905050505050505e-05, "loss": 202.1169, "step": 7190 }, { "epoch": 0.05817758708457567, "grad_norm": 991.0370483398438, "learning_rate": 2.909090909090909e-05, "loss": 182.8259, "step": 7200 }, { "epoch": 0.0582583892888598, "grad_norm": 1151.1380615234375, "learning_rate": 2.913131313131313e-05, "loss": 241.0473, "step": 7210 }, { "epoch": 0.058339191493143934, "grad_norm": 1103.3897705078125, "learning_rate": 2.9171717171717174e-05, "loss": 151.7667, "step": 7220 }, { "epoch": 0.05841999369742806, "grad_norm": 1151.0849609375, "learning_rate": 2.9212121212121213e-05, "loss": 197.9749, "step": 7230 }, { "epoch": 0.0585007959017122, "grad_norm": 983.3527221679688, "learning_rate": 2.9252525252525255e-05, "loss": 186.8989, "step": 7240 }, { "epoch": 0.05858159810599633, "grad_norm": 669.5452880859375, "learning_rate": 2.9292929292929294e-05, "loss": 179.281, "step": 7250 }, { "epoch": 0.058662400310280464, "grad_norm": 1186.9957275390625, "learning_rate": 2.9333333333333336e-05, "loss": 170.5622, "step": 7260 }, { "epoch": 0.0587432025145646, "grad_norm": 1314.4376220703125, "learning_rate": 2.9373737373737375e-05, "loss": 175.4961, "step": 7270 }, { "epoch": 0.05882400471884873, "grad_norm": 1278.834716796875, "learning_rate": 2.9414141414141417e-05, "loss": 183.9097, "step": 7280 }, { "epoch": 0.058904806923132866, "grad_norm": 1116.2734375, "learning_rate": 2.9454545454545456e-05, "loss": 143.516, "step": 7290 }, { "epoch": 0.058985609127416995, "grad_norm": 1352.628173828125, "learning_rate": 2.9494949494949498e-05, "loss": 204.3025, "step": 7300 }, { "epoch": 0.05906641133170113, "grad_norm": 1091.3201904296875, "learning_rate": 2.9535353535353537e-05, "loss": 181.4761, "step": 7310 }, { "epoch": 0.05914721353598526, "grad_norm": 1040.334716796875, "learning_rate": 2.957575757575758e-05, "loss": 170.6319, "step": 7320 }, { "epoch": 0.059228015740269396, "grad_norm": 1476.125732421875, "learning_rate": 2.9616161616161618e-05, "loss": 161.3749, "step": 7330 }, { "epoch": 0.059308817944553525, "grad_norm": 1488.0325927734375, "learning_rate": 2.965656565656566e-05, "loss": 183.3941, "step": 7340 }, { "epoch": 0.05938962014883766, "grad_norm": 481.60833740234375, "learning_rate": 2.96969696969697e-05, "loss": 199.2278, "step": 7350 }, { "epoch": 0.05947042235312179, "grad_norm": 1610.34521484375, "learning_rate": 2.973737373737374e-05, "loss": 201.723, "step": 7360 }, { "epoch": 0.05955122455740593, "grad_norm": 1576.0423583984375, "learning_rate": 2.9777777777777777e-05, "loss": 222.0852, "step": 7370 }, { "epoch": 0.059632026761690056, "grad_norm": 889.7515258789062, "learning_rate": 2.9818181818181816e-05, "loss": 193.5616, "step": 7380 }, { "epoch": 0.05971282896597419, "grad_norm": 746.6514282226562, "learning_rate": 2.9858585858585858e-05, "loss": 166.2696, "step": 7390 }, { "epoch": 0.05979363117025832, "grad_norm": 1730.69580078125, "learning_rate": 2.98989898989899e-05, "loss": 209.8799, "step": 7400 }, { "epoch": 0.05987443337454246, "grad_norm": 690.6642456054688, "learning_rate": 2.993939393939394e-05, "loss": 230.9101, "step": 7410 }, { "epoch": 0.05995523557882659, "grad_norm": 863.1697387695312, "learning_rate": 2.997979797979798e-05, "loss": 150.7177, "step": 7420 }, { "epoch": 0.06003603778311072, "grad_norm": 1267.2069091796875, "learning_rate": 3.002020202020202e-05, "loss": 210.8308, "step": 7430 }, { "epoch": 0.06011683998739486, "grad_norm": 1010.417724609375, "learning_rate": 3.0060606060606062e-05, "loss": 191.3645, "step": 7440 }, { "epoch": 0.06019764219167899, "grad_norm": 689.7382202148438, "learning_rate": 3.01010101010101e-05, "loss": 187.7134, "step": 7450 }, { "epoch": 0.060278444395963124, "grad_norm": 1864.760986328125, "learning_rate": 3.0141414141414144e-05, "loss": 214.7331, "step": 7460 }, { "epoch": 0.06035924660024725, "grad_norm": 1038.37353515625, "learning_rate": 3.0181818181818182e-05, "loss": 217.9106, "step": 7470 }, { "epoch": 0.06044004880453139, "grad_norm": 622.6604614257812, "learning_rate": 3.0222222222222225e-05, "loss": 155.4263, "step": 7480 }, { "epoch": 0.06052085100881552, "grad_norm": 878.7538452148438, "learning_rate": 3.0262626262626263e-05, "loss": 231.1667, "step": 7490 }, { "epoch": 0.060601653213099654, "grad_norm": 1581.2225341796875, "learning_rate": 3.0303030303030306e-05, "loss": 163.4888, "step": 7500 }, { "epoch": 0.060682455417383784, "grad_norm": 1152.7149658203125, "learning_rate": 3.0343434343434345e-05, "loss": 182.3645, "step": 7510 }, { "epoch": 0.06076325762166792, "grad_norm": 1109.6708984375, "learning_rate": 3.0383838383838387e-05, "loss": 175.0838, "step": 7520 }, { "epoch": 0.06084405982595205, "grad_norm": 1053.8270263671875, "learning_rate": 3.0424242424242426e-05, "loss": 181.691, "step": 7530 }, { "epoch": 0.060924862030236185, "grad_norm": 2113.046875, "learning_rate": 3.0464646464646468e-05, "loss": 224.7368, "step": 7540 }, { "epoch": 0.06100566423452032, "grad_norm": 1166.90478515625, "learning_rate": 3.050505050505051e-05, "loss": 206.4759, "step": 7550 }, { "epoch": 0.06108646643880445, "grad_norm": 1273.3836669921875, "learning_rate": 3.054545454545455e-05, "loss": 171.2801, "step": 7560 }, { "epoch": 0.061167268643088586, "grad_norm": 2534.885498046875, "learning_rate": 3.058585858585859e-05, "loss": 159.7586, "step": 7570 }, { "epoch": 0.061248070847372715, "grad_norm": 3763.103515625, "learning_rate": 3.062626262626262e-05, "loss": 323.4677, "step": 7580 }, { "epoch": 0.06132887305165685, "grad_norm": 1977.9522705078125, "learning_rate": 3.066666666666667e-05, "loss": 227.4736, "step": 7590 }, { "epoch": 0.06140967525594098, "grad_norm": 1690.8280029296875, "learning_rate": 3.070707070707071e-05, "loss": 192.4362, "step": 7600 }, { "epoch": 0.06149047746022512, "grad_norm": 1523.7828369140625, "learning_rate": 3.074747474747475e-05, "loss": 234.8535, "step": 7610 }, { "epoch": 0.061571279664509246, "grad_norm": 1146.36865234375, "learning_rate": 3.0787878787878786e-05, "loss": 142.9756, "step": 7620 }, { "epoch": 0.06165208186879338, "grad_norm": 895.3403930664062, "learning_rate": 3.082828282828283e-05, "loss": 201.5379, "step": 7630 }, { "epoch": 0.06173288407307751, "grad_norm": 1039.900634765625, "learning_rate": 3.086868686868687e-05, "loss": 230.2974, "step": 7640 }, { "epoch": 0.06181368627736165, "grad_norm": 1130.9986572265625, "learning_rate": 3.090909090909091e-05, "loss": 189.1531, "step": 7650 }, { "epoch": 0.061894488481645776, "grad_norm": 1224.142822265625, "learning_rate": 3.094949494949495e-05, "loss": 204.0206, "step": 7660 }, { "epoch": 0.06197529068592991, "grad_norm": 2115.472412109375, "learning_rate": 3.098989898989899e-05, "loss": 180.536, "step": 7670 }, { "epoch": 0.06205609289021405, "grad_norm": 779.9313354492188, "learning_rate": 3.103030303030303e-05, "loss": 158.623, "step": 7680 }, { "epoch": 0.06213689509449818, "grad_norm": 1337.7568359375, "learning_rate": 3.107070707070707e-05, "loss": 159.0383, "step": 7690 }, { "epoch": 0.062217697298782314, "grad_norm": 1851.648193359375, "learning_rate": 3.111111111111111e-05, "loss": 179.8161, "step": 7700 }, { "epoch": 0.06229849950306644, "grad_norm": 1469.6453857421875, "learning_rate": 3.1151515151515156e-05, "loss": 187.596, "step": 7710 }, { "epoch": 0.06237930170735058, "grad_norm": 1624.6527099609375, "learning_rate": 3.1191919191919194e-05, "loss": 214.8479, "step": 7720 }, { "epoch": 0.06246010391163471, "grad_norm": 1006.6346435546875, "learning_rate": 3.123232323232323e-05, "loss": 154.5748, "step": 7730 }, { "epoch": 0.06254090611591884, "grad_norm": 1002.5286254882812, "learning_rate": 3.127272727272728e-05, "loss": 184.4432, "step": 7740 }, { "epoch": 0.06262170832020297, "grad_norm": 1352.4193115234375, "learning_rate": 3.131313131313132e-05, "loss": 237.0036, "step": 7750 }, { "epoch": 0.06270251052448711, "grad_norm": 1084.147216796875, "learning_rate": 3.1353535353535357e-05, "loss": 164.8318, "step": 7760 }, { "epoch": 0.06278331272877125, "grad_norm": 1302.1048583984375, "learning_rate": 3.1393939393939395e-05, "loss": 164.3788, "step": 7770 }, { "epoch": 0.06286411493305537, "grad_norm": 1383.396484375, "learning_rate": 3.143434343434344e-05, "loss": 175.5805, "step": 7780 }, { "epoch": 0.0629449171373395, "grad_norm": 1246.53857421875, "learning_rate": 3.147474747474747e-05, "loss": 210.3966, "step": 7790 }, { "epoch": 0.06302571934162364, "grad_norm": 1285.145263671875, "learning_rate": 3.151515151515151e-05, "loss": 246.1903, "step": 7800 }, { "epoch": 0.06310652154590778, "grad_norm": 1620.3326416015625, "learning_rate": 3.155555555555556e-05, "loss": 196.0127, "step": 7810 }, { "epoch": 0.06318732375019191, "grad_norm": 1016.9979858398438, "learning_rate": 3.1595959595959596e-05, "loss": 210.5301, "step": 7820 }, { "epoch": 0.06326812595447603, "grad_norm": 1945.8780517578125, "learning_rate": 3.1636363636363635e-05, "loss": 239.3266, "step": 7830 }, { "epoch": 0.06334892815876017, "grad_norm": 1864.5794677734375, "learning_rate": 3.1676767676767674e-05, "loss": 193.7567, "step": 7840 }, { "epoch": 0.0634297303630443, "grad_norm": 1095.450927734375, "learning_rate": 3.171717171717172e-05, "loss": 191.1735, "step": 7850 }, { "epoch": 0.06351053256732844, "grad_norm": 1031.504150390625, "learning_rate": 3.175757575757576e-05, "loss": 185.8655, "step": 7860 }, { "epoch": 0.06359133477161256, "grad_norm": 1385.5076904296875, "learning_rate": 3.17979797979798e-05, "loss": 177.6908, "step": 7870 }, { "epoch": 0.0636721369758967, "grad_norm": 1074.5181884765625, "learning_rate": 3.1838383838383836e-05, "loss": 204.0031, "step": 7880 }, { "epoch": 0.06375293918018084, "grad_norm": 953.3314208984375, "learning_rate": 3.187878787878788e-05, "loss": 180.9185, "step": 7890 }, { "epoch": 0.06383374138446497, "grad_norm": 868.8043823242188, "learning_rate": 3.191919191919192e-05, "loss": 220.1422, "step": 7900 }, { "epoch": 0.0639145435887491, "grad_norm": 5921.494140625, "learning_rate": 3.195959595959596e-05, "loss": 167.039, "step": 7910 }, { "epoch": 0.06399534579303323, "grad_norm": 1500.1710205078125, "learning_rate": 3.2000000000000005e-05, "loss": 138.7559, "step": 7920 }, { "epoch": 0.06407614799731737, "grad_norm": 1143.7266845703125, "learning_rate": 3.2040404040404044e-05, "loss": 195.0978, "step": 7930 }, { "epoch": 0.0641569502016015, "grad_norm": 523.0445556640625, "learning_rate": 3.208080808080808e-05, "loss": 151.1692, "step": 7940 }, { "epoch": 0.06423775240588563, "grad_norm": 2158.39013671875, "learning_rate": 3.212121212121212e-05, "loss": 236.7984, "step": 7950 }, { "epoch": 0.06431855461016976, "grad_norm": 659.3209228515625, "learning_rate": 3.216161616161617e-05, "loss": 181.1136, "step": 7960 }, { "epoch": 0.0643993568144539, "grad_norm": 608.638671875, "learning_rate": 3.2202020202020206e-05, "loss": 194.2183, "step": 7970 }, { "epoch": 0.06448015901873803, "grad_norm": 1122.7078857421875, "learning_rate": 3.2242424242424245e-05, "loss": 160.3627, "step": 7980 }, { "epoch": 0.06456096122302217, "grad_norm": 1686.80810546875, "learning_rate": 3.2282828282828284e-05, "loss": 223.456, "step": 7990 }, { "epoch": 0.06464176342730629, "grad_norm": 1573.1317138671875, "learning_rate": 3.232323232323233e-05, "loss": 224.0322, "step": 8000 }, { "epoch": 0.06472256563159043, "grad_norm": 1321.1458740234375, "learning_rate": 3.236363636363636e-05, "loss": 252.9104, "step": 8010 }, { "epoch": 0.06480336783587456, "grad_norm": 1179.701171875, "learning_rate": 3.24040404040404e-05, "loss": 223.4346, "step": 8020 }, { "epoch": 0.0648841700401587, "grad_norm": 977.9105224609375, "learning_rate": 3.2444444444444446e-05, "loss": 152.0468, "step": 8030 }, { "epoch": 0.06496497224444282, "grad_norm": 2066.90380859375, "learning_rate": 3.2484848484848485e-05, "loss": 190.8667, "step": 8040 }, { "epoch": 0.06504577444872696, "grad_norm": 3095.08935546875, "learning_rate": 3.2525252525252524e-05, "loss": 192.0303, "step": 8050 }, { "epoch": 0.0651265766530111, "grad_norm": 2343.95947265625, "learning_rate": 3.256565656565656e-05, "loss": 157.5384, "step": 8060 }, { "epoch": 0.06520737885729523, "grad_norm": 1510.8023681640625, "learning_rate": 3.260606060606061e-05, "loss": 239.6892, "step": 8070 }, { "epoch": 0.06528818106157935, "grad_norm": 1445.597900390625, "learning_rate": 3.264646464646465e-05, "loss": 177.4803, "step": 8080 }, { "epoch": 0.06536898326586349, "grad_norm": 1667.5521240234375, "learning_rate": 3.2686868686868686e-05, "loss": 190.0459, "step": 8090 }, { "epoch": 0.06544978547014763, "grad_norm": 925.2418212890625, "learning_rate": 3.272727272727273e-05, "loss": 190.8257, "step": 8100 }, { "epoch": 0.06553058767443176, "grad_norm": 1247.4376220703125, "learning_rate": 3.276767676767677e-05, "loss": 203.9773, "step": 8110 }, { "epoch": 0.0656113898787159, "grad_norm": 1212.892822265625, "learning_rate": 3.280808080808081e-05, "loss": 203.7381, "step": 8120 }, { "epoch": 0.06569219208300002, "grad_norm": 1091.890380859375, "learning_rate": 3.284848484848485e-05, "loss": 194.8187, "step": 8130 }, { "epoch": 0.06577299428728416, "grad_norm": 2029.2864990234375, "learning_rate": 3.2888888888888894e-05, "loss": 246.8937, "step": 8140 }, { "epoch": 0.06585379649156829, "grad_norm": 920.1378784179688, "learning_rate": 3.292929292929293e-05, "loss": 215.9934, "step": 8150 }, { "epoch": 0.06593459869585243, "grad_norm": 1521.0574951171875, "learning_rate": 3.296969696969697e-05, "loss": 167.3099, "step": 8160 }, { "epoch": 0.06601540090013655, "grad_norm": 1420.7525634765625, "learning_rate": 3.301010101010101e-05, "loss": 206.7512, "step": 8170 }, { "epoch": 0.06609620310442069, "grad_norm": 840.5839233398438, "learning_rate": 3.3050505050505056e-05, "loss": 202.7185, "step": 8180 }, { "epoch": 0.06617700530870482, "grad_norm": 1193.502197265625, "learning_rate": 3.3090909090909095e-05, "loss": 160.3612, "step": 8190 }, { "epoch": 0.06625780751298896, "grad_norm": 2222.778564453125, "learning_rate": 3.3131313131313134e-05, "loss": 150.92, "step": 8200 }, { "epoch": 0.06633860971727308, "grad_norm": 776.4454956054688, "learning_rate": 3.317171717171717e-05, "loss": 159.6749, "step": 8210 }, { "epoch": 0.06641941192155722, "grad_norm": 1179.86279296875, "learning_rate": 3.321212121212121e-05, "loss": 147.1537, "step": 8220 }, { "epoch": 0.06650021412584135, "grad_norm": 1168.2757568359375, "learning_rate": 3.325252525252525e-05, "loss": 163.5715, "step": 8230 }, { "epoch": 0.06658101633012549, "grad_norm": 996.3876953125, "learning_rate": 3.329292929292929e-05, "loss": 156.5557, "step": 8240 }, { "epoch": 0.06666181853440963, "grad_norm": 1006.9996337890625, "learning_rate": 3.3333333333333335e-05, "loss": 176.6802, "step": 8250 }, { "epoch": 0.06674262073869375, "grad_norm": 877.4000854492188, "learning_rate": 3.3373737373737374e-05, "loss": 182.8363, "step": 8260 }, { "epoch": 0.06682342294297788, "grad_norm": 2153.091552734375, "learning_rate": 3.341414141414141e-05, "loss": 184.7595, "step": 8270 }, { "epoch": 0.06690422514726202, "grad_norm": 1884.7989501953125, "learning_rate": 3.345454545454546e-05, "loss": 197.672, "step": 8280 }, { "epoch": 0.06698502735154616, "grad_norm": 1494.185791015625, "learning_rate": 3.34949494949495e-05, "loss": 178.7865, "step": 8290 }, { "epoch": 0.06706582955583028, "grad_norm": 2600.398193359375, "learning_rate": 3.3535353535353536e-05, "loss": 222.7885, "step": 8300 }, { "epoch": 0.06714663176011441, "grad_norm": 1300.013671875, "learning_rate": 3.3575757575757575e-05, "loss": 182.5449, "step": 8310 }, { "epoch": 0.06722743396439855, "grad_norm": 2145.218505859375, "learning_rate": 3.361616161616162e-05, "loss": 281.7514, "step": 8320 }, { "epoch": 0.06730823616868269, "grad_norm": 1519.411865234375, "learning_rate": 3.365656565656566e-05, "loss": 167.7319, "step": 8330 }, { "epoch": 0.06738903837296681, "grad_norm": 750.0274047851562, "learning_rate": 3.36969696969697e-05, "loss": 173.5281, "step": 8340 }, { "epoch": 0.06746984057725094, "grad_norm": 1222.1435546875, "learning_rate": 3.373737373737374e-05, "loss": 162.324, "step": 8350 }, { "epoch": 0.06755064278153508, "grad_norm": 955.3302001953125, "learning_rate": 3.377777777777778e-05, "loss": 184.845, "step": 8360 }, { "epoch": 0.06763144498581922, "grad_norm": 1112.2943115234375, "learning_rate": 3.381818181818182e-05, "loss": 160.8671, "step": 8370 }, { "epoch": 0.06771224719010335, "grad_norm": 1163.462646484375, "learning_rate": 3.385858585858586e-05, "loss": 147.4451, "step": 8380 }, { "epoch": 0.06779304939438748, "grad_norm": 925.3172607421875, "learning_rate": 3.38989898989899e-05, "loss": 175.2849, "step": 8390 }, { "epoch": 0.06787385159867161, "grad_norm": 1990.568359375, "learning_rate": 3.3939393939393945e-05, "loss": 199.808, "step": 8400 }, { "epoch": 0.06795465380295575, "grad_norm": 2128.471923828125, "learning_rate": 3.3979797979797984e-05, "loss": 205.3155, "step": 8410 }, { "epoch": 0.06803545600723988, "grad_norm": 2691.37353515625, "learning_rate": 3.402020202020202e-05, "loss": 216.1895, "step": 8420 }, { "epoch": 0.068116258211524, "grad_norm": 3363.869140625, "learning_rate": 3.406060606060606e-05, "loss": 201.8374, "step": 8430 }, { "epoch": 0.06819706041580814, "grad_norm": 1438.0633544921875, "learning_rate": 3.41010101010101e-05, "loss": 222.5396, "step": 8440 }, { "epoch": 0.06827786262009228, "grad_norm": 1703.8653564453125, "learning_rate": 3.414141414141414e-05, "loss": 213.8756, "step": 8450 }, { "epoch": 0.06835866482437641, "grad_norm": 1938.7177734375, "learning_rate": 3.4181818181818185e-05, "loss": 215.6468, "step": 8460 }, { "epoch": 0.06843946702866054, "grad_norm": 851.2493896484375, "learning_rate": 3.4222222222222224e-05, "loss": 182.1204, "step": 8470 }, { "epoch": 0.06852026923294467, "grad_norm": 1202.3365478515625, "learning_rate": 3.426262626262626e-05, "loss": 183.2559, "step": 8480 }, { "epoch": 0.06860107143722881, "grad_norm": 1543.7257080078125, "learning_rate": 3.43030303030303e-05, "loss": 239.1145, "step": 8490 }, { "epoch": 0.06868187364151294, "grad_norm": 748.701171875, "learning_rate": 3.434343434343435e-05, "loss": 285.6954, "step": 8500 }, { "epoch": 0.06876267584579707, "grad_norm": 5747.30224609375, "learning_rate": 3.4383838383838386e-05, "loss": 194.7742, "step": 8510 }, { "epoch": 0.0688434780500812, "grad_norm": 1587.478271484375, "learning_rate": 3.4424242424242425e-05, "loss": 213.0056, "step": 8520 }, { "epoch": 0.06892428025436534, "grad_norm": 907.9869995117188, "learning_rate": 3.4464646464646463e-05, "loss": 183.2854, "step": 8530 }, { "epoch": 0.06900508245864947, "grad_norm": 1065.7462158203125, "learning_rate": 3.450505050505051e-05, "loss": 147.3901, "step": 8540 }, { "epoch": 0.06908588466293361, "grad_norm": 1654.4375, "learning_rate": 3.454545454545455e-05, "loss": 152.4811, "step": 8550 }, { "epoch": 0.06916668686721773, "grad_norm": 1075.144775390625, "learning_rate": 3.458585858585859e-05, "loss": 217.3591, "step": 8560 }, { "epoch": 0.06924748907150187, "grad_norm": 1216.4287109375, "learning_rate": 3.4626262626262626e-05, "loss": 184.7878, "step": 8570 }, { "epoch": 0.069328291275786, "grad_norm": 1143.3253173828125, "learning_rate": 3.466666666666667e-05, "loss": 185.8244, "step": 8580 }, { "epoch": 0.06940909348007014, "grad_norm": 2943.891357421875, "learning_rate": 3.470707070707071e-05, "loss": 203.6546, "step": 8590 }, { "epoch": 0.06948989568435426, "grad_norm": 562.7566528320312, "learning_rate": 3.474747474747475e-05, "loss": 224.7354, "step": 8600 }, { "epoch": 0.0695706978886384, "grad_norm": 1995.734130859375, "learning_rate": 3.4787878787878795e-05, "loss": 178.8969, "step": 8610 }, { "epoch": 0.06965150009292254, "grad_norm": 1593.8944091796875, "learning_rate": 3.4828282828282834e-05, "loss": 164.9677, "step": 8620 }, { "epoch": 0.06973230229720667, "grad_norm": 715.6300048828125, "learning_rate": 3.486868686868687e-05, "loss": 147.6377, "step": 8630 }, { "epoch": 0.0698131045014908, "grad_norm": 3263.21044921875, "learning_rate": 3.490909090909091e-05, "loss": 235.9829, "step": 8640 }, { "epoch": 0.06989390670577493, "grad_norm": 3928.5576171875, "learning_rate": 3.494949494949495e-05, "loss": 220.889, "step": 8650 }, { "epoch": 0.06997470891005907, "grad_norm": 1176.6265869140625, "learning_rate": 3.498989898989899e-05, "loss": 173.9785, "step": 8660 }, { "epoch": 0.0700555111143432, "grad_norm": 1472.33349609375, "learning_rate": 3.503030303030303e-05, "loss": 166.185, "step": 8670 }, { "epoch": 0.07013631331862734, "grad_norm": 943.4843139648438, "learning_rate": 3.5070707070707073e-05, "loss": 196.9754, "step": 8680 }, { "epoch": 0.07021711552291146, "grad_norm": 1376.169189453125, "learning_rate": 3.511111111111111e-05, "loss": 249.9044, "step": 8690 }, { "epoch": 0.0702979177271956, "grad_norm": 862.705078125, "learning_rate": 3.515151515151515e-05, "loss": 134.6473, "step": 8700 }, { "epoch": 0.07037871993147973, "grad_norm": 1661.3258056640625, "learning_rate": 3.519191919191919e-05, "loss": 260.4335, "step": 8710 }, { "epoch": 0.07045952213576387, "grad_norm": 858.2864379882812, "learning_rate": 3.5232323232323236e-05, "loss": 156.1466, "step": 8720 }, { "epoch": 0.07054032434004799, "grad_norm": 1033.8033447265625, "learning_rate": 3.5272727272727274e-05, "loss": 158.8132, "step": 8730 }, { "epoch": 0.07062112654433213, "grad_norm": 2244.4833984375, "learning_rate": 3.531313131313131e-05, "loss": 185.2664, "step": 8740 }, { "epoch": 0.07070192874861626, "grad_norm": 828.0194091796875, "learning_rate": 3.535353535353535e-05, "loss": 189.655, "step": 8750 }, { "epoch": 0.0707827309529004, "grad_norm": 764.8339233398438, "learning_rate": 3.53939393939394e-05, "loss": 169.7833, "step": 8760 }, { "epoch": 0.07086353315718452, "grad_norm": 1434.6533203125, "learning_rate": 3.543434343434344e-05, "loss": 169.9032, "step": 8770 }, { "epoch": 0.07094433536146866, "grad_norm": 1811.5740966796875, "learning_rate": 3.5474747474747475e-05, "loss": 256.6501, "step": 8780 }, { "epoch": 0.0710251375657528, "grad_norm": 923.5958251953125, "learning_rate": 3.551515151515152e-05, "loss": 157.0084, "step": 8790 }, { "epoch": 0.07110593977003693, "grad_norm": 1671.8385009765625, "learning_rate": 3.555555555555556e-05, "loss": 206.8505, "step": 8800 }, { "epoch": 0.07118674197432107, "grad_norm": 2508.17626953125, "learning_rate": 3.55959595959596e-05, "loss": 204.866, "step": 8810 }, { "epoch": 0.07126754417860519, "grad_norm": 852.1519775390625, "learning_rate": 3.563636363636364e-05, "loss": 147.0554, "step": 8820 }, { "epoch": 0.07134834638288932, "grad_norm": 925.072021484375, "learning_rate": 3.567676767676768e-05, "loss": 205.7363, "step": 8830 }, { "epoch": 0.07142914858717346, "grad_norm": 1310.513916015625, "learning_rate": 3.571717171717172e-05, "loss": 252.9427, "step": 8840 }, { "epoch": 0.0715099507914576, "grad_norm": 1795.476806640625, "learning_rate": 3.575757575757576e-05, "loss": 184.422, "step": 8850 }, { "epoch": 0.07159075299574172, "grad_norm": 1071.3101806640625, "learning_rate": 3.57979797979798e-05, "loss": 165.8845, "step": 8860 }, { "epoch": 0.07167155520002585, "grad_norm": 724.8527221679688, "learning_rate": 3.583838383838384e-05, "loss": 180.9637, "step": 8870 }, { "epoch": 0.07175235740430999, "grad_norm": 999.9872436523438, "learning_rate": 3.587878787878788e-05, "loss": 186.5519, "step": 8880 }, { "epoch": 0.07183315960859413, "grad_norm": 1380.1075439453125, "learning_rate": 3.5919191919191916e-05, "loss": 224.0065, "step": 8890 }, { "epoch": 0.07191396181287825, "grad_norm": 1093.9498291015625, "learning_rate": 3.595959595959596e-05, "loss": 165.8288, "step": 8900 }, { "epoch": 0.07199476401716239, "grad_norm": 2711.7353515625, "learning_rate": 3.6e-05, "loss": 163.5886, "step": 8910 }, { "epoch": 0.07207556622144652, "grad_norm": 1537.253662109375, "learning_rate": 3.604040404040404e-05, "loss": 165.1541, "step": 8920 }, { "epoch": 0.07215636842573066, "grad_norm": 1100.548095703125, "learning_rate": 3.608080808080808e-05, "loss": 169.8857, "step": 8930 }, { "epoch": 0.0722371706300148, "grad_norm": 1088.0587158203125, "learning_rate": 3.6121212121212124e-05, "loss": 186.5488, "step": 8940 }, { "epoch": 0.07231797283429892, "grad_norm": 1160.9769287109375, "learning_rate": 3.616161616161616e-05, "loss": 190.4372, "step": 8950 }, { "epoch": 0.07239877503858305, "grad_norm": 1117.179443359375, "learning_rate": 3.62020202020202e-05, "loss": 134.2678, "step": 8960 }, { "epoch": 0.07247957724286719, "grad_norm": 1956.5089111328125, "learning_rate": 3.624242424242425e-05, "loss": 172.5972, "step": 8970 }, { "epoch": 0.07256037944715132, "grad_norm": 2694.300537109375, "learning_rate": 3.6282828282828286e-05, "loss": 200.7171, "step": 8980 }, { "epoch": 0.07264118165143545, "grad_norm": 1112.63720703125, "learning_rate": 3.6323232323232325e-05, "loss": 151.1802, "step": 8990 }, { "epoch": 0.07272198385571958, "grad_norm": 2176.663330078125, "learning_rate": 3.6363636363636364e-05, "loss": 162.1838, "step": 9000 }, { "epoch": 0.07280278606000372, "grad_norm": 1126.674072265625, "learning_rate": 3.640404040404041e-05, "loss": 180.7603, "step": 9010 }, { "epoch": 0.07288358826428785, "grad_norm": 1244.1241455078125, "learning_rate": 3.644444444444445e-05, "loss": 114.9891, "step": 9020 }, { "epoch": 0.07296439046857198, "grad_norm": 2782.807373046875, "learning_rate": 3.648484848484849e-05, "loss": 199.3326, "step": 9030 }, { "epoch": 0.07304519267285611, "grad_norm": 1218.721435546875, "learning_rate": 3.6525252525252526e-05, "loss": 143.516, "step": 9040 }, { "epoch": 0.07312599487714025, "grad_norm": 799.7451782226562, "learning_rate": 3.656565656565657e-05, "loss": 131.4932, "step": 9050 }, { "epoch": 0.07320679708142439, "grad_norm": 547.2342529296875, "learning_rate": 3.660606060606061e-05, "loss": 151.0718, "step": 9060 }, { "epoch": 0.07328759928570851, "grad_norm": 1480.14501953125, "learning_rate": 3.664646464646464e-05, "loss": 201.2714, "step": 9070 }, { "epoch": 0.07336840148999264, "grad_norm": 516.29931640625, "learning_rate": 3.668686868686869e-05, "loss": 174.2223, "step": 9080 }, { "epoch": 0.07344920369427678, "grad_norm": 912.0347900390625, "learning_rate": 3.672727272727273e-05, "loss": 180.7495, "step": 9090 }, { "epoch": 0.07353000589856092, "grad_norm": 1243.236083984375, "learning_rate": 3.6767676767676766e-05, "loss": 189.5693, "step": 9100 }, { "epoch": 0.07361080810284505, "grad_norm": 742.5632934570312, "learning_rate": 3.6808080808080805e-05, "loss": 194.6184, "step": 9110 }, { "epoch": 0.07369161030712917, "grad_norm": 1145.2069091796875, "learning_rate": 3.684848484848485e-05, "loss": 185.7565, "step": 9120 }, { "epoch": 0.07377241251141331, "grad_norm": 1086.181640625, "learning_rate": 3.688888888888889e-05, "loss": 167.5214, "step": 9130 }, { "epoch": 0.07385321471569745, "grad_norm": 1321.85400390625, "learning_rate": 3.692929292929293e-05, "loss": 221.075, "step": 9140 }, { "epoch": 0.07393401691998158, "grad_norm": 2168.907958984375, "learning_rate": 3.6969696969696974e-05, "loss": 219.8519, "step": 9150 }, { "epoch": 0.0740148191242657, "grad_norm": 1007.1217041015625, "learning_rate": 3.701010101010101e-05, "loss": 194.1429, "step": 9160 }, { "epoch": 0.07409562132854984, "grad_norm": 1099.997802734375, "learning_rate": 3.705050505050505e-05, "loss": 220.093, "step": 9170 }, { "epoch": 0.07417642353283398, "grad_norm": 745.4526977539062, "learning_rate": 3.709090909090909e-05, "loss": 143.6606, "step": 9180 }, { "epoch": 0.07425722573711811, "grad_norm": 2050.18212890625, "learning_rate": 3.7131313131313136e-05, "loss": 222.1462, "step": 9190 }, { "epoch": 0.07433802794140223, "grad_norm": 2366.43896484375, "learning_rate": 3.7171717171717175e-05, "loss": 191.5783, "step": 9200 }, { "epoch": 0.07441883014568637, "grad_norm": 905.9826049804688, "learning_rate": 3.7212121212121214e-05, "loss": 168.3594, "step": 9210 }, { "epoch": 0.07449963234997051, "grad_norm": 1320.197509765625, "learning_rate": 3.725252525252525e-05, "loss": 140.7596, "step": 9220 }, { "epoch": 0.07458043455425464, "grad_norm": 3290.6748046875, "learning_rate": 3.72929292929293e-05, "loss": 161.625, "step": 9230 }, { "epoch": 0.07466123675853878, "grad_norm": 1262.2984619140625, "learning_rate": 3.733333333333334e-05, "loss": 163.4713, "step": 9240 }, { "epoch": 0.0747420389628229, "grad_norm": 1434.2923583984375, "learning_rate": 3.7373737373737376e-05, "loss": 168.6003, "step": 9250 }, { "epoch": 0.07482284116710704, "grad_norm": 716.86083984375, "learning_rate": 3.7414141414141415e-05, "loss": 165.9515, "step": 9260 }, { "epoch": 0.07490364337139117, "grad_norm": 4190.2119140625, "learning_rate": 3.745454545454546e-05, "loss": 201.2678, "step": 9270 }, { "epoch": 0.07498444557567531, "grad_norm": 1093.3636474609375, "learning_rate": 3.74949494949495e-05, "loss": 145.2475, "step": 9280 }, { "epoch": 0.07506524777995943, "grad_norm": 1217.2969970703125, "learning_rate": 3.753535353535353e-05, "loss": 219.3316, "step": 9290 }, { "epoch": 0.07514604998424357, "grad_norm": 1122.0589599609375, "learning_rate": 3.757575757575758e-05, "loss": 201.7304, "step": 9300 }, { "epoch": 0.0752268521885277, "grad_norm": 779.33447265625, "learning_rate": 3.7616161616161616e-05, "loss": 199.0558, "step": 9310 }, { "epoch": 0.07530765439281184, "grad_norm": 1110.2554931640625, "learning_rate": 3.7656565656565655e-05, "loss": 190.4773, "step": 9320 }, { "epoch": 0.07538845659709596, "grad_norm": 1762.0330810546875, "learning_rate": 3.76969696969697e-05, "loss": 151.1484, "step": 9330 }, { "epoch": 0.0754692588013801, "grad_norm": 1020.6593627929688, "learning_rate": 3.773737373737374e-05, "loss": 169.2854, "step": 9340 }, { "epoch": 0.07555006100566423, "grad_norm": 1202.7464599609375, "learning_rate": 3.777777777777778e-05, "loss": 164.9156, "step": 9350 }, { "epoch": 0.07563086320994837, "grad_norm": 1249.380126953125, "learning_rate": 3.781818181818182e-05, "loss": 227.8868, "step": 9360 }, { "epoch": 0.0757116654142325, "grad_norm": 1847.7193603515625, "learning_rate": 3.785858585858586e-05, "loss": 176.2305, "step": 9370 }, { "epoch": 0.07579246761851663, "grad_norm": 1446.559814453125, "learning_rate": 3.78989898989899e-05, "loss": 147.3983, "step": 9380 }, { "epoch": 0.07587326982280077, "grad_norm": 1216.9862060546875, "learning_rate": 3.793939393939394e-05, "loss": 165.7989, "step": 9390 }, { "epoch": 0.0759540720270849, "grad_norm": 2848.1337890625, "learning_rate": 3.797979797979798e-05, "loss": 157.2326, "step": 9400 }, { "epoch": 0.07603487423136904, "grad_norm": 940.07177734375, "learning_rate": 3.8020202020202025e-05, "loss": 221.2202, "step": 9410 }, { "epoch": 0.07611567643565316, "grad_norm": 1160.6881103515625, "learning_rate": 3.8060606060606064e-05, "loss": 158.6488, "step": 9420 }, { "epoch": 0.0761964786399373, "grad_norm": 2879.822998046875, "learning_rate": 3.81010101010101e-05, "loss": 223.0843, "step": 9430 }, { "epoch": 0.07627728084422143, "grad_norm": 1498.8753662109375, "learning_rate": 3.814141414141414e-05, "loss": 161.5878, "step": 9440 }, { "epoch": 0.07635808304850557, "grad_norm": 1229.455078125, "learning_rate": 3.818181818181819e-05, "loss": 198.7026, "step": 9450 }, { "epoch": 0.07643888525278969, "grad_norm": 929.4089965820312, "learning_rate": 3.8222222222222226e-05, "loss": 251.8946, "step": 9460 }, { "epoch": 0.07651968745707383, "grad_norm": 934.8760375976562, "learning_rate": 3.8262626262626265e-05, "loss": 162.8998, "step": 9470 }, { "epoch": 0.07660048966135796, "grad_norm": 915.323974609375, "learning_rate": 3.830303030303031e-05, "loss": 171.6096, "step": 9480 }, { "epoch": 0.0766812918656421, "grad_norm": 1115.416748046875, "learning_rate": 3.834343434343435e-05, "loss": 163.5847, "step": 9490 }, { "epoch": 0.07676209406992623, "grad_norm": 2073.390869140625, "learning_rate": 3.838383838383838e-05, "loss": 261.5975, "step": 9500 }, { "epoch": 0.07684289627421036, "grad_norm": 1260.9718017578125, "learning_rate": 3.842424242424243e-05, "loss": 131.5614, "step": 9510 }, { "epoch": 0.07692369847849449, "grad_norm": 1239.3568115234375, "learning_rate": 3.8464646464646466e-05, "loss": 164.9494, "step": 9520 }, { "epoch": 0.07700450068277863, "grad_norm": 1221.8023681640625, "learning_rate": 3.8505050505050505e-05, "loss": 245.5353, "step": 9530 }, { "epoch": 0.07708530288706276, "grad_norm": 533.4956665039062, "learning_rate": 3.8545454545454544e-05, "loss": 149.1819, "step": 9540 }, { "epoch": 0.07716610509134689, "grad_norm": 1410.545166015625, "learning_rate": 3.858585858585859e-05, "loss": 172.351, "step": 9550 }, { "epoch": 0.07724690729563102, "grad_norm": 927.8252563476562, "learning_rate": 3.862626262626263e-05, "loss": 152.7654, "step": 9560 }, { "epoch": 0.07732770949991516, "grad_norm": 1244.0257568359375, "learning_rate": 3.866666666666667e-05, "loss": 171.8292, "step": 9570 }, { "epoch": 0.0774085117041993, "grad_norm": 664.2005615234375, "learning_rate": 3.8707070707070706e-05, "loss": 109.5876, "step": 9580 }, { "epoch": 0.07748931390848342, "grad_norm": 1334.42626953125, "learning_rate": 3.874747474747475e-05, "loss": 129.1926, "step": 9590 }, { "epoch": 0.07757011611276755, "grad_norm": 1236.3963623046875, "learning_rate": 3.878787878787879e-05, "loss": 195.8564, "step": 9600 }, { "epoch": 0.07765091831705169, "grad_norm": 517.2808227539062, "learning_rate": 3.882828282828283e-05, "loss": 213.7913, "step": 9610 }, { "epoch": 0.07773172052133583, "grad_norm": 1618.37890625, "learning_rate": 3.886868686868687e-05, "loss": 187.1561, "step": 9620 }, { "epoch": 0.07781252272561995, "grad_norm": 1312.72705078125, "learning_rate": 3.8909090909090914e-05, "loss": 192.0711, "step": 9630 }, { "epoch": 0.07789332492990408, "grad_norm": 970.0208129882812, "learning_rate": 3.894949494949495e-05, "loss": 150.2422, "step": 9640 }, { "epoch": 0.07797412713418822, "grad_norm": 1302.1982421875, "learning_rate": 3.898989898989899e-05, "loss": 149.0684, "step": 9650 }, { "epoch": 0.07805492933847236, "grad_norm": 3663.638427734375, "learning_rate": 3.903030303030304e-05, "loss": 197.8055, "step": 9660 }, { "epoch": 0.07813573154275649, "grad_norm": 1510.8233642578125, "learning_rate": 3.9070707070707076e-05, "loss": 217.5386, "step": 9670 }, { "epoch": 0.07821653374704061, "grad_norm": 4103.01904296875, "learning_rate": 3.9111111111111115e-05, "loss": 198.1855, "step": 9680 }, { "epoch": 0.07829733595132475, "grad_norm": 1483.061279296875, "learning_rate": 3.9151515151515153e-05, "loss": 200.777, "step": 9690 }, { "epoch": 0.07837813815560889, "grad_norm": 1666.9429931640625, "learning_rate": 3.91919191919192e-05, "loss": 233.7256, "step": 9700 }, { "epoch": 0.07845894035989302, "grad_norm": 1422.605224609375, "learning_rate": 3.923232323232323e-05, "loss": 198.8265, "step": 9710 }, { "epoch": 0.07853974256417715, "grad_norm": 955.90869140625, "learning_rate": 3.927272727272727e-05, "loss": 178.4951, "step": 9720 }, { "epoch": 0.07862054476846128, "grad_norm": 768.09228515625, "learning_rate": 3.9313131313131316e-05, "loss": 169.9865, "step": 9730 }, { "epoch": 0.07870134697274542, "grad_norm": 2241.572998046875, "learning_rate": 3.9353535353535355e-05, "loss": 171.3657, "step": 9740 }, { "epoch": 0.07878214917702955, "grad_norm": 828.8177490234375, "learning_rate": 3.939393939393939e-05, "loss": 212.0979, "step": 9750 }, { "epoch": 0.07886295138131368, "grad_norm": 1248.2691650390625, "learning_rate": 3.943434343434343e-05, "loss": 144.412, "step": 9760 }, { "epoch": 0.07894375358559781, "grad_norm": 1106.5013427734375, "learning_rate": 3.947474747474748e-05, "loss": 170.0778, "step": 9770 }, { "epoch": 0.07902455578988195, "grad_norm": 1183.6558837890625, "learning_rate": 3.951515151515152e-05, "loss": 183.0114, "step": 9780 }, { "epoch": 0.07910535799416608, "grad_norm": 790.3275146484375, "learning_rate": 3.9555555555555556e-05, "loss": 148.8441, "step": 9790 }, { "epoch": 0.07918616019845022, "grad_norm": 1040.2529296875, "learning_rate": 3.9595959595959594e-05, "loss": 181.9307, "step": 9800 }, { "epoch": 0.07926696240273434, "grad_norm": 1023.8417358398438, "learning_rate": 3.963636363636364e-05, "loss": 160.768, "step": 9810 }, { "epoch": 0.07934776460701848, "grad_norm": 1530.9327392578125, "learning_rate": 3.967676767676768e-05, "loss": 189.5177, "step": 9820 }, { "epoch": 0.07942856681130261, "grad_norm": 1020.0157470703125, "learning_rate": 3.971717171717172e-05, "loss": 175.2649, "step": 9830 }, { "epoch": 0.07950936901558675, "grad_norm": 1115.394287109375, "learning_rate": 3.975757575757576e-05, "loss": 180.9754, "step": 9840 }, { "epoch": 0.07959017121987087, "grad_norm": 596.3128051757812, "learning_rate": 3.97979797979798e-05, "loss": 190.0906, "step": 9850 }, { "epoch": 0.07967097342415501, "grad_norm": 1229.2056884765625, "learning_rate": 3.983838383838384e-05, "loss": 153.1082, "step": 9860 }, { "epoch": 0.07975177562843914, "grad_norm": 1460.8936767578125, "learning_rate": 3.987878787878788e-05, "loss": 176.2776, "step": 9870 }, { "epoch": 0.07983257783272328, "grad_norm": 4583.7373046875, "learning_rate": 3.9919191919191926e-05, "loss": 163.6398, "step": 9880 }, { "epoch": 0.0799133800370074, "grad_norm": 1115.4329833984375, "learning_rate": 3.9959595959595964e-05, "loss": 174.1616, "step": 9890 }, { "epoch": 0.07999418224129154, "grad_norm": 2389.8232421875, "learning_rate": 4e-05, "loss": 163.833, "step": 9900 }, { "epoch": 0.08007498444557568, "grad_norm": 2180.552978515625, "learning_rate": 4.004040404040404e-05, "loss": 189.1826, "step": 9910 }, { "epoch": 0.08015578664985981, "grad_norm": 2478.71533203125, "learning_rate": 4.008080808080809e-05, "loss": 262.7207, "step": 9920 }, { "epoch": 0.08023658885414395, "grad_norm": 952.7739868164062, "learning_rate": 4.012121212121212e-05, "loss": 152.5153, "step": 9930 }, { "epoch": 0.08031739105842807, "grad_norm": 1591.5555419921875, "learning_rate": 4.016161616161616e-05, "loss": 161.1086, "step": 9940 }, { "epoch": 0.0803981932627122, "grad_norm": 2502.85400390625, "learning_rate": 4.0202020202020204e-05, "loss": 180.6969, "step": 9950 }, { "epoch": 0.08047899546699634, "grad_norm": 1107.453857421875, "learning_rate": 4.024242424242424e-05, "loss": 245.7, "step": 9960 }, { "epoch": 0.08055979767128048, "grad_norm": 732.3422241210938, "learning_rate": 4.028282828282828e-05, "loss": 128.3301, "step": 9970 }, { "epoch": 0.0806405998755646, "grad_norm": 852.4077758789062, "learning_rate": 4.032323232323232e-05, "loss": 171.862, "step": 9980 }, { "epoch": 0.08072140207984874, "grad_norm": 974.1900634765625, "learning_rate": 4.0363636363636367e-05, "loss": 245.3339, "step": 9990 }, { "epoch": 0.08080220428413287, "grad_norm": 917.6868286132812, "learning_rate": 4.0404040404040405e-05, "loss": 151.893, "step": 10000 }, { "epoch": 0.08088300648841701, "grad_norm": 1009.7120971679688, "learning_rate": 4.0444444444444444e-05, "loss": 196.4933, "step": 10010 }, { "epoch": 0.08096380869270113, "grad_norm": 2075.980224609375, "learning_rate": 4.048484848484849e-05, "loss": 190.6988, "step": 10020 }, { "epoch": 0.08104461089698527, "grad_norm": 2236.189697265625, "learning_rate": 4.052525252525253e-05, "loss": 173.7965, "step": 10030 }, { "epoch": 0.0811254131012694, "grad_norm": 1387.155517578125, "learning_rate": 4.056565656565657e-05, "loss": 287.3005, "step": 10040 }, { "epoch": 0.08120621530555354, "grad_norm": 1775.0162353515625, "learning_rate": 4.0606060606060606e-05, "loss": 161.304, "step": 10050 }, { "epoch": 0.08128701750983768, "grad_norm": 2991.034423828125, "learning_rate": 4.064646464646465e-05, "loss": 183.5822, "step": 10060 }, { "epoch": 0.0813678197141218, "grad_norm": 2393.831298828125, "learning_rate": 4.068686868686869e-05, "loss": 216.6813, "step": 10070 }, { "epoch": 0.08144862191840593, "grad_norm": 753.2874755859375, "learning_rate": 4.072727272727273e-05, "loss": 176.5726, "step": 10080 }, { "epoch": 0.08152942412269007, "grad_norm": 917.6944580078125, "learning_rate": 4.076767676767677e-05, "loss": 182.8443, "step": 10090 }, { "epoch": 0.0816102263269742, "grad_norm": 1497.6976318359375, "learning_rate": 4.0808080808080814e-05, "loss": 179.7216, "step": 10100 }, { "epoch": 0.08169102853125833, "grad_norm": 896.7678833007812, "learning_rate": 4.084848484848485e-05, "loss": 171.1982, "step": 10110 }, { "epoch": 0.08177183073554246, "grad_norm": 2560.443115234375, "learning_rate": 4.088888888888889e-05, "loss": 242.9725, "step": 10120 }, { "epoch": 0.0818526329398266, "grad_norm": 914.347900390625, "learning_rate": 4.092929292929293e-05, "loss": 166.4831, "step": 10130 }, { "epoch": 0.08193343514411074, "grad_norm": 1105.142822265625, "learning_rate": 4.096969696969697e-05, "loss": 182.9737, "step": 10140 }, { "epoch": 0.08201423734839486, "grad_norm": 1321.1329345703125, "learning_rate": 4.101010101010101e-05, "loss": 152.9314, "step": 10150 }, { "epoch": 0.082095039552679, "grad_norm": 924.438720703125, "learning_rate": 4.105050505050505e-05, "loss": 187.9318, "step": 10160 }, { "epoch": 0.08217584175696313, "grad_norm": 834.27685546875, "learning_rate": 4.109090909090909e-05, "loss": 177.3875, "step": 10170 }, { "epoch": 0.08225664396124727, "grad_norm": 996.1378173828125, "learning_rate": 4.113131313131313e-05, "loss": 163.1871, "step": 10180 }, { "epoch": 0.08233744616553139, "grad_norm": 1657.6314697265625, "learning_rate": 4.117171717171717e-05, "loss": 195.5306, "step": 10190 }, { "epoch": 0.08241824836981552, "grad_norm": 1040.5526123046875, "learning_rate": 4.1212121212121216e-05, "loss": 179.7722, "step": 10200 }, { "epoch": 0.08249905057409966, "grad_norm": 1405.0408935546875, "learning_rate": 4.1252525252525255e-05, "loss": 177.9881, "step": 10210 }, { "epoch": 0.0825798527783838, "grad_norm": 1484.392333984375, "learning_rate": 4.1292929292929294e-05, "loss": 170.4384, "step": 10220 }, { "epoch": 0.08266065498266793, "grad_norm": 533.9537963867188, "learning_rate": 4.133333333333333e-05, "loss": 166.7928, "step": 10230 }, { "epoch": 0.08274145718695206, "grad_norm": 1133.5531005859375, "learning_rate": 4.137373737373738e-05, "loss": 185.8696, "step": 10240 }, { "epoch": 0.08282225939123619, "grad_norm": 1964.5546875, "learning_rate": 4.141414141414142e-05, "loss": 247.6705, "step": 10250 }, { "epoch": 0.08290306159552033, "grad_norm": 1816.8203125, "learning_rate": 4.1454545454545456e-05, "loss": 188.6205, "step": 10260 }, { "epoch": 0.08298386379980446, "grad_norm": 809.494873046875, "learning_rate": 4.1494949494949495e-05, "loss": 186.0191, "step": 10270 }, { "epoch": 0.08306466600408859, "grad_norm": 1444.1771240234375, "learning_rate": 4.153535353535354e-05, "loss": 171.1266, "step": 10280 }, { "epoch": 0.08314546820837272, "grad_norm": 1594.9212646484375, "learning_rate": 4.157575757575758e-05, "loss": 152.8558, "step": 10290 }, { "epoch": 0.08322627041265686, "grad_norm": 1367.26318359375, "learning_rate": 4.161616161616162e-05, "loss": 215.3903, "step": 10300 }, { "epoch": 0.083307072616941, "grad_norm": 1395.453857421875, "learning_rate": 4.165656565656566e-05, "loss": 219.8608, "step": 10310 }, { "epoch": 0.08338787482122512, "grad_norm": 823.7109375, "learning_rate": 4.16969696969697e-05, "loss": 198.1472, "step": 10320 }, { "epoch": 0.08346867702550925, "grad_norm": 4030.11083984375, "learning_rate": 4.173737373737374e-05, "loss": 191.6869, "step": 10330 }, { "epoch": 0.08354947922979339, "grad_norm": 1047.6395263671875, "learning_rate": 4.177777777777778e-05, "loss": 198.535, "step": 10340 }, { "epoch": 0.08363028143407752, "grad_norm": 1213.222412109375, "learning_rate": 4.181818181818182e-05, "loss": 209.4711, "step": 10350 }, { "epoch": 0.08371108363836166, "grad_norm": 862.3009643554688, "learning_rate": 4.185858585858586e-05, "loss": 197.0447, "step": 10360 }, { "epoch": 0.08379188584264578, "grad_norm": 7555.03271484375, "learning_rate": 4.18989898989899e-05, "loss": 257.4577, "step": 10370 }, { "epoch": 0.08387268804692992, "grad_norm": 1448.51806640625, "learning_rate": 4.193939393939394e-05, "loss": 145.9192, "step": 10380 }, { "epoch": 0.08395349025121406, "grad_norm": 563.4491577148438, "learning_rate": 4.197979797979798e-05, "loss": 185.379, "step": 10390 }, { "epoch": 0.08403429245549819, "grad_norm": 2999.943603515625, "learning_rate": 4.202020202020202e-05, "loss": 207.6808, "step": 10400 }, { "epoch": 0.08411509465978231, "grad_norm": 1272.3822021484375, "learning_rate": 4.206060606060606e-05, "loss": 203.2208, "step": 10410 }, { "epoch": 0.08419589686406645, "grad_norm": 1877.1287841796875, "learning_rate": 4.2101010101010105e-05, "loss": 208.5024, "step": 10420 }, { "epoch": 0.08427669906835059, "grad_norm": 880.4778442382812, "learning_rate": 4.2141414141414144e-05, "loss": 114.9268, "step": 10430 }, { "epoch": 0.08435750127263472, "grad_norm": 2282.030517578125, "learning_rate": 4.218181818181818e-05, "loss": 185.7249, "step": 10440 }, { "epoch": 0.08443830347691884, "grad_norm": 1241.139892578125, "learning_rate": 4.222222222222222e-05, "loss": 171.4185, "step": 10450 }, { "epoch": 0.08451910568120298, "grad_norm": 1106.116943359375, "learning_rate": 4.226262626262627e-05, "loss": 233.517, "step": 10460 }, { "epoch": 0.08459990788548712, "grad_norm": 1352.7723388671875, "learning_rate": 4.2303030303030306e-05, "loss": 164.0137, "step": 10470 }, { "epoch": 0.08468071008977125, "grad_norm": 824.0071411132812, "learning_rate": 4.2343434343434345e-05, "loss": 149.2571, "step": 10480 }, { "epoch": 0.08476151229405539, "grad_norm": 1494.287841796875, "learning_rate": 4.2383838383838384e-05, "loss": 158.6048, "step": 10490 }, { "epoch": 0.08484231449833951, "grad_norm": 928.8883056640625, "learning_rate": 4.242424242424243e-05, "loss": 160.3929, "step": 10500 }, { "epoch": 0.08492311670262365, "grad_norm": 3102.35791015625, "learning_rate": 4.246464646464647e-05, "loss": 262.7529, "step": 10510 }, { "epoch": 0.08500391890690778, "grad_norm": 753.5589599609375, "learning_rate": 4.250505050505051e-05, "loss": 147.7691, "step": 10520 }, { "epoch": 0.08508472111119192, "grad_norm": 1001.4116821289062, "learning_rate": 4.254545454545455e-05, "loss": 137.3231, "step": 10530 }, { "epoch": 0.08516552331547604, "grad_norm": 813.2144775390625, "learning_rate": 4.258585858585859e-05, "loss": 201.1259, "step": 10540 }, { "epoch": 0.08524632551976018, "grad_norm": 1098.597900390625, "learning_rate": 4.262626262626263e-05, "loss": 165.8937, "step": 10550 }, { "epoch": 0.08532712772404431, "grad_norm": 1298.8853759765625, "learning_rate": 4.266666666666667e-05, "loss": 198.1985, "step": 10560 }, { "epoch": 0.08540792992832845, "grad_norm": 1016.0570678710938, "learning_rate": 4.270707070707071e-05, "loss": 206.1271, "step": 10570 }, { "epoch": 0.08548873213261257, "grad_norm": 2057.573974609375, "learning_rate": 4.274747474747475e-05, "loss": 172.7861, "step": 10580 }, { "epoch": 0.08556953433689671, "grad_norm": 1261.774169921875, "learning_rate": 4.2787878787878786e-05, "loss": 149.1268, "step": 10590 }, { "epoch": 0.08565033654118084, "grad_norm": 1344.2037353515625, "learning_rate": 4.282828282828283e-05, "loss": 195.2029, "step": 10600 }, { "epoch": 0.08573113874546498, "grad_norm": 3034.935546875, "learning_rate": 4.286868686868687e-05, "loss": 222.7399, "step": 10610 }, { "epoch": 0.08581194094974912, "grad_norm": 970.0159912109375, "learning_rate": 4.290909090909091e-05, "loss": 168.5838, "step": 10620 }, { "epoch": 0.08589274315403324, "grad_norm": 953.5883178710938, "learning_rate": 4.294949494949495e-05, "loss": 162.8468, "step": 10630 }, { "epoch": 0.08597354535831737, "grad_norm": 2465.924072265625, "learning_rate": 4.2989898989898994e-05, "loss": 204.2057, "step": 10640 }, { "epoch": 0.08605434756260151, "grad_norm": 1194.1285400390625, "learning_rate": 4.303030303030303e-05, "loss": 197.6671, "step": 10650 }, { "epoch": 0.08613514976688565, "grad_norm": 881.1695556640625, "learning_rate": 4.307070707070707e-05, "loss": 163.8398, "step": 10660 }, { "epoch": 0.08621595197116977, "grad_norm": 3846.036376953125, "learning_rate": 4.311111111111111e-05, "loss": 178.4458, "step": 10670 }, { "epoch": 0.0862967541754539, "grad_norm": 1206.0906982421875, "learning_rate": 4.3151515151515156e-05, "loss": 153.2348, "step": 10680 }, { "epoch": 0.08637755637973804, "grad_norm": 898.4664306640625, "learning_rate": 4.3191919191919195e-05, "loss": 143.3541, "step": 10690 }, { "epoch": 0.08645835858402218, "grad_norm": 2398.2255859375, "learning_rate": 4.3232323232323234e-05, "loss": 212.5595, "step": 10700 }, { "epoch": 0.0865391607883063, "grad_norm": 1220.5733642578125, "learning_rate": 4.327272727272728e-05, "loss": 179.1572, "step": 10710 }, { "epoch": 0.08661996299259044, "grad_norm": 1974.5457763671875, "learning_rate": 4.331313131313132e-05, "loss": 190.0654, "step": 10720 }, { "epoch": 0.08670076519687457, "grad_norm": 2125.98583984375, "learning_rate": 4.335353535353536e-05, "loss": 168.3283, "step": 10730 }, { "epoch": 0.08678156740115871, "grad_norm": 1389.6546630859375, "learning_rate": 4.3393939393939396e-05, "loss": 159.5211, "step": 10740 }, { "epoch": 0.08686236960544283, "grad_norm": 807.5780029296875, "learning_rate": 4.343434343434344e-05, "loss": 192.4806, "step": 10750 }, { "epoch": 0.08694317180972697, "grad_norm": 1139.113037109375, "learning_rate": 4.347474747474748e-05, "loss": 155.3915, "step": 10760 }, { "epoch": 0.0870239740140111, "grad_norm": 1003.0131225585938, "learning_rate": 4.351515151515152e-05, "loss": 150.0795, "step": 10770 }, { "epoch": 0.08710477621829524, "grad_norm": 962.0811767578125, "learning_rate": 4.355555555555556e-05, "loss": 194.0887, "step": 10780 }, { "epoch": 0.08718557842257937, "grad_norm": 626.3510131835938, "learning_rate": 4.35959595959596e-05, "loss": 148.0051, "step": 10790 }, { "epoch": 0.0872663806268635, "grad_norm": 2664.0517578125, "learning_rate": 4.3636363636363636e-05, "loss": 199.106, "step": 10800 }, { "epoch": 0.08734718283114763, "grad_norm": 895.5698852539062, "learning_rate": 4.3676767676767674e-05, "loss": 172.3583, "step": 10810 }, { "epoch": 0.08742798503543177, "grad_norm": 1103.621826171875, "learning_rate": 4.371717171717172e-05, "loss": 203.0746, "step": 10820 }, { "epoch": 0.0875087872397159, "grad_norm": 1323.6517333984375, "learning_rate": 4.375757575757576e-05, "loss": 158.2676, "step": 10830 }, { "epoch": 0.08758958944400003, "grad_norm": 1170.48779296875, "learning_rate": 4.37979797979798e-05, "loss": 151.4179, "step": 10840 }, { "epoch": 0.08767039164828416, "grad_norm": 1171.2379150390625, "learning_rate": 4.383838383838384e-05, "loss": 181.4644, "step": 10850 }, { "epoch": 0.0877511938525683, "grad_norm": 1495.01025390625, "learning_rate": 4.387878787878788e-05, "loss": 167.6518, "step": 10860 }, { "epoch": 0.08783199605685243, "grad_norm": 1283.5498046875, "learning_rate": 4.391919191919192e-05, "loss": 138.4191, "step": 10870 }, { "epoch": 0.08791279826113656, "grad_norm": 1028.198974609375, "learning_rate": 4.395959595959596e-05, "loss": 167.4624, "step": 10880 }, { "epoch": 0.0879936004654207, "grad_norm": 958.3167114257812, "learning_rate": 4.4000000000000006e-05, "loss": 187.0603, "step": 10890 }, { "epoch": 0.08807440266970483, "grad_norm": 1248.95556640625, "learning_rate": 4.4040404040404044e-05, "loss": 194.5838, "step": 10900 }, { "epoch": 0.08815520487398897, "grad_norm": 1088.8775634765625, "learning_rate": 4.408080808080808e-05, "loss": 137.5682, "step": 10910 }, { "epoch": 0.0882360070782731, "grad_norm": 1130.275146484375, "learning_rate": 4.412121212121212e-05, "loss": 180.1215, "step": 10920 }, { "epoch": 0.08831680928255722, "grad_norm": 1201.9453125, "learning_rate": 4.416161616161617e-05, "loss": 157.9624, "step": 10930 }, { "epoch": 0.08839761148684136, "grad_norm": 1291.0989990234375, "learning_rate": 4.420202020202021e-05, "loss": 159.1208, "step": 10940 }, { "epoch": 0.0884784136911255, "grad_norm": 946.185546875, "learning_rate": 4.4242424242424246e-05, "loss": 200.6266, "step": 10950 }, { "epoch": 0.08855921589540963, "grad_norm": 2330.45361328125, "learning_rate": 4.4282828282828284e-05, "loss": 163.2997, "step": 10960 }, { "epoch": 0.08864001809969375, "grad_norm": 959.3818359375, "learning_rate": 4.432323232323233e-05, "loss": 200.18, "step": 10970 }, { "epoch": 0.08872082030397789, "grad_norm": 1215.0078125, "learning_rate": 4.436363636363637e-05, "loss": 194.7184, "step": 10980 }, { "epoch": 0.08880162250826203, "grad_norm": 731.89501953125, "learning_rate": 4.44040404040404e-05, "loss": 183.9504, "step": 10990 }, { "epoch": 0.08888242471254616, "grad_norm": 751.2623291015625, "learning_rate": 4.4444444444444447e-05, "loss": 164.3441, "step": 11000 }, { "epoch": 0.08896322691683028, "grad_norm": 1424.0635986328125, "learning_rate": 4.4484848484848485e-05, "loss": 202.2657, "step": 11010 }, { "epoch": 0.08904402912111442, "grad_norm": 1039.608642578125, "learning_rate": 4.4525252525252524e-05, "loss": 177.8795, "step": 11020 }, { "epoch": 0.08912483132539856, "grad_norm": 1338.993408203125, "learning_rate": 4.456565656565656e-05, "loss": 167.3348, "step": 11030 }, { "epoch": 0.08920563352968269, "grad_norm": 2591.8984375, "learning_rate": 4.460606060606061e-05, "loss": 165.2155, "step": 11040 }, { "epoch": 0.08928643573396683, "grad_norm": 931.5535888671875, "learning_rate": 4.464646464646465e-05, "loss": 198.5493, "step": 11050 }, { "epoch": 0.08936723793825095, "grad_norm": 1028.25927734375, "learning_rate": 4.4686868686868686e-05, "loss": 169.1925, "step": 11060 }, { "epoch": 0.08944804014253509, "grad_norm": 1156.0810546875, "learning_rate": 4.472727272727273e-05, "loss": 172.742, "step": 11070 }, { "epoch": 0.08952884234681922, "grad_norm": 1409.868408203125, "learning_rate": 4.476767676767677e-05, "loss": 152.6085, "step": 11080 }, { "epoch": 0.08960964455110336, "grad_norm": 1091.4266357421875, "learning_rate": 4.480808080808081e-05, "loss": 149.4833, "step": 11090 }, { "epoch": 0.08969044675538748, "grad_norm": 1274.9849853515625, "learning_rate": 4.484848484848485e-05, "loss": 239.724, "step": 11100 }, { "epoch": 0.08977124895967162, "grad_norm": 2974.341552734375, "learning_rate": 4.4888888888888894e-05, "loss": 184.0551, "step": 11110 }, { "epoch": 0.08985205116395575, "grad_norm": 893.275390625, "learning_rate": 4.492929292929293e-05, "loss": 163.8494, "step": 11120 }, { "epoch": 0.08993285336823989, "grad_norm": 1236.2047119140625, "learning_rate": 4.496969696969697e-05, "loss": 223.2573, "step": 11130 }, { "epoch": 0.09001365557252401, "grad_norm": 560.5221557617188, "learning_rate": 4.501010101010101e-05, "loss": 143.0442, "step": 11140 }, { "epoch": 0.09009445777680815, "grad_norm": 1671.4537353515625, "learning_rate": 4.5050505050505056e-05, "loss": 230.4551, "step": 11150 }, { "epoch": 0.09017525998109228, "grad_norm": 2295.0419921875, "learning_rate": 4.5090909090909095e-05, "loss": 191.5067, "step": 11160 }, { "epoch": 0.09025606218537642, "grad_norm": 1291.7230224609375, "learning_rate": 4.5131313131313134e-05, "loss": 165.677, "step": 11170 }, { "epoch": 0.09033686438966056, "grad_norm": 817.165771484375, "learning_rate": 4.517171717171717e-05, "loss": 145.0678, "step": 11180 }, { "epoch": 0.09041766659394468, "grad_norm": 938.4746704101562, "learning_rate": 4.521212121212122e-05, "loss": 168.7315, "step": 11190 }, { "epoch": 0.09049846879822881, "grad_norm": 820.7261352539062, "learning_rate": 4.525252525252526e-05, "loss": 174.4509, "step": 11200 }, { "epoch": 0.09057927100251295, "grad_norm": 899.671875, "learning_rate": 4.529292929292929e-05, "loss": 153.883, "step": 11210 }, { "epoch": 0.09066007320679709, "grad_norm": 2744.694091796875, "learning_rate": 4.5333333333333335e-05, "loss": 152.8033, "step": 11220 }, { "epoch": 0.09074087541108121, "grad_norm": 1821.427734375, "learning_rate": 4.5373737373737374e-05, "loss": 203.8027, "step": 11230 }, { "epoch": 0.09082167761536535, "grad_norm": 937.9207763671875, "learning_rate": 4.541414141414141e-05, "loss": 183.0782, "step": 11240 }, { "epoch": 0.09090247981964948, "grad_norm": 1872.185546875, "learning_rate": 4.545454545454546e-05, "loss": 153.9125, "step": 11250 }, { "epoch": 0.09098328202393362, "grad_norm": 772.832275390625, "learning_rate": 4.54949494949495e-05, "loss": 143.9607, "step": 11260 }, { "epoch": 0.09106408422821774, "grad_norm": 2842.652587890625, "learning_rate": 4.5535353535353536e-05, "loss": 189.6786, "step": 11270 }, { "epoch": 0.09114488643250188, "grad_norm": 1738.2589111328125, "learning_rate": 4.5575757575757575e-05, "loss": 163.1939, "step": 11280 }, { "epoch": 0.09122568863678601, "grad_norm": 1521.9814453125, "learning_rate": 4.561616161616162e-05, "loss": 208.5482, "step": 11290 }, { "epoch": 0.09130649084107015, "grad_norm": 1132.692138671875, "learning_rate": 4.565656565656566e-05, "loss": 161.8789, "step": 11300 }, { "epoch": 0.09138729304535427, "grad_norm": 1395.38671875, "learning_rate": 4.56969696969697e-05, "loss": 164.5156, "step": 11310 }, { "epoch": 0.0914680952496384, "grad_norm": 661.7669067382812, "learning_rate": 4.573737373737374e-05, "loss": 163.1058, "step": 11320 }, { "epoch": 0.09154889745392254, "grad_norm": 1353.42578125, "learning_rate": 4.577777777777778e-05, "loss": 178.0265, "step": 11330 }, { "epoch": 0.09162969965820668, "grad_norm": 758.9215087890625, "learning_rate": 4.581818181818182e-05, "loss": 186.8539, "step": 11340 }, { "epoch": 0.09171050186249081, "grad_norm": 926.5440673828125, "learning_rate": 4.585858585858586e-05, "loss": 174.9289, "step": 11350 }, { "epoch": 0.09179130406677494, "grad_norm": 1032.7493896484375, "learning_rate": 4.58989898989899e-05, "loss": 157.8875, "step": 11360 }, { "epoch": 0.09187210627105907, "grad_norm": 1104.685302734375, "learning_rate": 4.5939393939393945e-05, "loss": 180.8059, "step": 11370 }, { "epoch": 0.09195290847534321, "grad_norm": 807.3258056640625, "learning_rate": 4.5979797979797984e-05, "loss": 133.7691, "step": 11380 }, { "epoch": 0.09203371067962735, "grad_norm": 1296.3505859375, "learning_rate": 4.602020202020202e-05, "loss": 168.8188, "step": 11390 }, { "epoch": 0.09211451288391147, "grad_norm": 717.891357421875, "learning_rate": 4.606060606060607e-05, "loss": 143.9607, "step": 11400 }, { "epoch": 0.0921953150881956, "grad_norm": 841.2793579101562, "learning_rate": 4.610101010101011e-05, "loss": 185.6871, "step": 11410 }, { "epoch": 0.09227611729247974, "grad_norm": 1319.2064208984375, "learning_rate": 4.614141414141414e-05, "loss": 163.8516, "step": 11420 }, { "epoch": 0.09235691949676388, "grad_norm": 1566.4168701171875, "learning_rate": 4.618181818181818e-05, "loss": 165.3434, "step": 11430 }, { "epoch": 0.092437721701048, "grad_norm": 919.2448120117188, "learning_rate": 4.6222222222222224e-05, "loss": 133.6576, "step": 11440 }, { "epoch": 0.09251852390533213, "grad_norm": 976.5399780273438, "learning_rate": 4.626262626262626e-05, "loss": 172.4401, "step": 11450 }, { "epoch": 0.09259932610961627, "grad_norm": 1343.291015625, "learning_rate": 4.63030303030303e-05, "loss": 203.6782, "step": 11460 }, { "epoch": 0.0926801283139004, "grad_norm": 839.6242065429688, "learning_rate": 4.634343434343435e-05, "loss": 175.6062, "step": 11470 }, { "epoch": 0.09276093051818454, "grad_norm": 909.182861328125, "learning_rate": 4.6383838383838386e-05, "loss": 159.7895, "step": 11480 }, { "epoch": 0.09284173272246866, "grad_norm": 2260.60107421875, "learning_rate": 4.6424242424242425e-05, "loss": 168.3035, "step": 11490 }, { "epoch": 0.0929225349267528, "grad_norm": 1016.5995483398438, "learning_rate": 4.6464646464646464e-05, "loss": 149.6134, "step": 11500 }, { "epoch": 0.09300333713103694, "grad_norm": 1180.2608642578125, "learning_rate": 4.650505050505051e-05, "loss": 123.0571, "step": 11510 }, { "epoch": 0.09308413933532107, "grad_norm": 1054.5244140625, "learning_rate": 4.654545454545455e-05, "loss": 180.0858, "step": 11520 }, { "epoch": 0.0931649415396052, "grad_norm": 1064.1981201171875, "learning_rate": 4.658585858585859e-05, "loss": 305.0233, "step": 11530 }, { "epoch": 0.09324574374388933, "grad_norm": 1455.9857177734375, "learning_rate": 4.6626262626262626e-05, "loss": 174.1713, "step": 11540 }, { "epoch": 0.09332654594817347, "grad_norm": 801.254150390625, "learning_rate": 4.666666666666667e-05, "loss": 175.7632, "step": 11550 }, { "epoch": 0.0934073481524576, "grad_norm": 811.2909545898438, "learning_rate": 4.670707070707071e-05, "loss": 152.5526, "step": 11560 }, { "epoch": 0.09348815035674173, "grad_norm": 860.8350830078125, "learning_rate": 4.674747474747475e-05, "loss": 159.9338, "step": 11570 }, { "epoch": 0.09356895256102586, "grad_norm": 1820.26318359375, "learning_rate": 4.6787878787878795e-05, "loss": 205.222, "step": 11580 }, { "epoch": 0.09364975476531, "grad_norm": 1027.9521484375, "learning_rate": 4.6828282828282834e-05, "loss": 167.9507, "step": 11590 }, { "epoch": 0.09373055696959413, "grad_norm": 3551.64599609375, "learning_rate": 4.686868686868687e-05, "loss": 194.2529, "step": 11600 }, { "epoch": 0.09381135917387827, "grad_norm": 957.3357543945312, "learning_rate": 4.690909090909091e-05, "loss": 185.5651, "step": 11610 }, { "epoch": 0.09389216137816239, "grad_norm": 587.98828125, "learning_rate": 4.694949494949496e-05, "loss": 208.2654, "step": 11620 }, { "epoch": 0.09397296358244653, "grad_norm": 824.1953735351562, "learning_rate": 4.698989898989899e-05, "loss": 118.9412, "step": 11630 }, { "epoch": 0.09405376578673066, "grad_norm": 997.8128051757812, "learning_rate": 4.703030303030303e-05, "loss": 144.5133, "step": 11640 }, { "epoch": 0.0941345679910148, "grad_norm": 825.6588745117188, "learning_rate": 4.7070707070707074e-05, "loss": 130.3822, "step": 11650 }, { "epoch": 0.09421537019529892, "grad_norm": 1590.2271728515625, "learning_rate": 4.711111111111111e-05, "loss": 147.783, "step": 11660 }, { "epoch": 0.09429617239958306, "grad_norm": 831.8695068359375, "learning_rate": 4.715151515151515e-05, "loss": 150.0897, "step": 11670 }, { "epoch": 0.0943769746038672, "grad_norm": 879.4678955078125, "learning_rate": 4.719191919191919e-05, "loss": 134.1156, "step": 11680 }, { "epoch": 0.09445777680815133, "grad_norm": 1955.2484130859375, "learning_rate": 4.7232323232323236e-05, "loss": 155.8984, "step": 11690 }, { "epoch": 0.09453857901243545, "grad_norm": 1074.5509033203125, "learning_rate": 4.7272727272727275e-05, "loss": 186.7514, "step": 11700 }, { "epoch": 0.09461938121671959, "grad_norm": 992.748046875, "learning_rate": 4.7313131313131314e-05, "loss": 155.1787, "step": 11710 }, { "epoch": 0.09470018342100373, "grad_norm": 990.5753784179688, "learning_rate": 4.735353535353535e-05, "loss": 140.0309, "step": 11720 }, { "epoch": 0.09478098562528786, "grad_norm": 792.1434936523438, "learning_rate": 4.73939393939394e-05, "loss": 138.4607, "step": 11730 }, { "epoch": 0.094861787829572, "grad_norm": 1121.582763671875, "learning_rate": 4.743434343434344e-05, "loss": 121.2844, "step": 11740 }, { "epoch": 0.09494259003385612, "grad_norm": 1351.6878662109375, "learning_rate": 4.7474747474747476e-05, "loss": 198.7702, "step": 11750 }, { "epoch": 0.09502339223814026, "grad_norm": 2031.31494140625, "learning_rate": 4.751515151515152e-05, "loss": 164.1413, "step": 11760 }, { "epoch": 0.09510419444242439, "grad_norm": 761.006103515625, "learning_rate": 4.755555555555556e-05, "loss": 143.7779, "step": 11770 }, { "epoch": 0.09518499664670853, "grad_norm": 1228.6676025390625, "learning_rate": 4.75959595959596e-05, "loss": 177.4899, "step": 11780 }, { "epoch": 0.09526579885099265, "grad_norm": 817.8377075195312, "learning_rate": 4.763636363636364e-05, "loss": 158.3827, "step": 11790 }, { "epoch": 0.09534660105527679, "grad_norm": 1481.2501220703125, "learning_rate": 4.7676767676767684e-05, "loss": 144.1627, "step": 11800 }, { "epoch": 0.09542740325956092, "grad_norm": 899.5111083984375, "learning_rate": 4.771717171717172e-05, "loss": 188.1043, "step": 11810 }, { "epoch": 0.09550820546384506, "grad_norm": 766.15869140625, "learning_rate": 4.775757575757576e-05, "loss": 190.8986, "step": 11820 }, { "epoch": 0.09558900766812918, "grad_norm": 818.2703857421875, "learning_rate": 4.77979797979798e-05, "loss": 226.6272, "step": 11830 }, { "epoch": 0.09566980987241332, "grad_norm": 1306.8607177734375, "learning_rate": 4.7838383838383846e-05, "loss": 179.7575, "step": 11840 }, { "epoch": 0.09575061207669745, "grad_norm": 1371.1048583984375, "learning_rate": 4.787878787878788e-05, "loss": 184.5432, "step": 11850 }, { "epoch": 0.09583141428098159, "grad_norm": 1219.8428955078125, "learning_rate": 4.791919191919192e-05, "loss": 256.0484, "step": 11860 }, { "epoch": 0.09591221648526572, "grad_norm": 1769.408935546875, "learning_rate": 4.795959595959596e-05, "loss": 216.8512, "step": 11870 }, { "epoch": 0.09599301868954985, "grad_norm": 1021.0985107421875, "learning_rate": 4.8e-05, "loss": 150.8418, "step": 11880 }, { "epoch": 0.09607382089383398, "grad_norm": 789.3172607421875, "learning_rate": 4.804040404040404e-05, "loss": 168.713, "step": 11890 }, { "epoch": 0.09615462309811812, "grad_norm": 1145.4168701171875, "learning_rate": 4.808080808080808e-05, "loss": 248.8971, "step": 11900 }, { "epoch": 0.09623542530240226, "grad_norm": 1330.3175048828125, "learning_rate": 4.8121212121212125e-05, "loss": 143.9111, "step": 11910 }, { "epoch": 0.09631622750668638, "grad_norm": 1145.8402099609375, "learning_rate": 4.8161616161616163e-05, "loss": 196.2298, "step": 11920 }, { "epoch": 0.09639702971097051, "grad_norm": 3719.409423828125, "learning_rate": 4.82020202020202e-05, "loss": 141.981, "step": 11930 }, { "epoch": 0.09647783191525465, "grad_norm": 783.9396362304688, "learning_rate": 4.824242424242425e-05, "loss": 139.5111, "step": 11940 }, { "epoch": 0.09655863411953879, "grad_norm": 817.3587036132812, "learning_rate": 4.828282828282829e-05, "loss": 193.1281, "step": 11950 }, { "epoch": 0.09663943632382291, "grad_norm": 2150.12939453125, "learning_rate": 4.8323232323232326e-05, "loss": 173.1304, "step": 11960 }, { "epoch": 0.09672023852810704, "grad_norm": 1299.8162841796875, "learning_rate": 4.8363636363636364e-05, "loss": 181.7705, "step": 11970 }, { "epoch": 0.09680104073239118, "grad_norm": 936.7069091796875, "learning_rate": 4.840404040404041e-05, "loss": 173.4242, "step": 11980 }, { "epoch": 0.09688184293667532, "grad_norm": 613.501708984375, "learning_rate": 4.844444444444445e-05, "loss": 153.117, "step": 11990 }, { "epoch": 0.09696264514095944, "grad_norm": 1406.6314697265625, "learning_rate": 4.848484848484849e-05, "loss": 185.4404, "step": 12000 }, { "epoch": 0.09704344734524357, "grad_norm": 1215.42138671875, "learning_rate": 4.852525252525253e-05, "loss": 209.0183, "step": 12010 }, { "epoch": 0.09712424954952771, "grad_norm": 1375.722412109375, "learning_rate": 4.856565656565657e-05, "loss": 201.9598, "step": 12020 }, { "epoch": 0.09720505175381185, "grad_norm": 987.8963623046875, "learning_rate": 4.860606060606061e-05, "loss": 151.2616, "step": 12030 }, { "epoch": 0.09728585395809598, "grad_norm": 1280.0211181640625, "learning_rate": 4.864646464646465e-05, "loss": 137.8948, "step": 12040 }, { "epoch": 0.0973666561623801, "grad_norm": 1247.3785400390625, "learning_rate": 4.868686868686869e-05, "loss": 154.3858, "step": 12050 }, { "epoch": 0.09744745836666424, "grad_norm": 1069.8919677734375, "learning_rate": 4.872727272727273e-05, "loss": 154.6211, "step": 12060 }, { "epoch": 0.09752826057094838, "grad_norm": 1880.9874267578125, "learning_rate": 4.8767676767676767e-05, "loss": 149.5197, "step": 12070 }, { "epoch": 0.09760906277523251, "grad_norm": 1230.8167724609375, "learning_rate": 4.8808080808080805e-05, "loss": 149.4375, "step": 12080 }, { "epoch": 0.09768986497951664, "grad_norm": 1191.7757568359375, "learning_rate": 4.884848484848485e-05, "loss": 170.0373, "step": 12090 }, { "epoch": 0.09777066718380077, "grad_norm": 1369.3248291015625, "learning_rate": 4.888888888888889e-05, "loss": 161.5461, "step": 12100 }, { "epoch": 0.09785146938808491, "grad_norm": 864.84423828125, "learning_rate": 4.892929292929293e-05, "loss": 185.1635, "step": 12110 }, { "epoch": 0.09793227159236904, "grad_norm": 1059.955810546875, "learning_rate": 4.896969696969697e-05, "loss": 190.465, "step": 12120 }, { "epoch": 0.09801307379665317, "grad_norm": 1468.4659423828125, "learning_rate": 4.901010101010101e-05, "loss": 209.5102, "step": 12130 }, { "epoch": 0.0980938760009373, "grad_norm": 1922.64404296875, "learning_rate": 4.905050505050505e-05, "loss": 136.4074, "step": 12140 }, { "epoch": 0.09817467820522144, "grad_norm": 731.4076538085938, "learning_rate": 4.909090909090909e-05, "loss": 132.6805, "step": 12150 }, { "epoch": 0.09825548040950557, "grad_norm": 1013.0802612304688, "learning_rate": 4.9131313131313137e-05, "loss": 169.2464, "step": 12160 }, { "epoch": 0.09833628261378971, "grad_norm": 1180.7650146484375, "learning_rate": 4.9171717171717175e-05, "loss": 134.0749, "step": 12170 }, { "epoch": 0.09841708481807383, "grad_norm": 1976.8056640625, "learning_rate": 4.9212121212121214e-05, "loss": 173.6256, "step": 12180 }, { "epoch": 0.09849788702235797, "grad_norm": 1068.174560546875, "learning_rate": 4.925252525252525e-05, "loss": 172.0746, "step": 12190 }, { "epoch": 0.0985786892266421, "grad_norm": 707.3936157226562, "learning_rate": 4.92929292929293e-05, "loss": 186.8521, "step": 12200 }, { "epoch": 0.09865949143092624, "grad_norm": 1194.782470703125, "learning_rate": 4.933333333333334e-05, "loss": 218.8846, "step": 12210 }, { "epoch": 0.09874029363521036, "grad_norm": 1251.564453125, "learning_rate": 4.9373737373737376e-05, "loss": 163.8027, "step": 12220 }, { "epoch": 0.0988210958394945, "grad_norm": 1061.1964111328125, "learning_rate": 4.9414141414141415e-05, "loss": 189.2458, "step": 12230 }, { "epoch": 0.09890189804377864, "grad_norm": 946.7278442382812, "learning_rate": 4.945454545454546e-05, "loss": 147.6934, "step": 12240 }, { "epoch": 0.09898270024806277, "grad_norm": 1294.2952880859375, "learning_rate": 4.94949494949495e-05, "loss": 224.1921, "step": 12250 }, { "epoch": 0.0990635024523469, "grad_norm": 1992.6143798828125, "learning_rate": 4.953535353535354e-05, "loss": 168.6858, "step": 12260 }, { "epoch": 0.09914430465663103, "grad_norm": 931.4608764648438, "learning_rate": 4.957575757575758e-05, "loss": 164.2498, "step": 12270 }, { "epoch": 0.09922510686091517, "grad_norm": 928.4422607421875, "learning_rate": 4.9616161616161616e-05, "loss": 157.498, "step": 12280 }, { "epoch": 0.0993059090651993, "grad_norm": 834.3302612304688, "learning_rate": 4.9656565656565655e-05, "loss": 195.0842, "step": 12290 }, { "epoch": 0.09938671126948344, "grad_norm": 1873.80517578125, "learning_rate": 4.9696969696969694e-05, "loss": 224.0035, "step": 12300 }, { "epoch": 0.09946751347376756, "grad_norm": 1335.8856201171875, "learning_rate": 4.973737373737374e-05, "loss": 128.0538, "step": 12310 }, { "epoch": 0.0995483156780517, "grad_norm": 887.349853515625, "learning_rate": 4.977777777777778e-05, "loss": 140.0356, "step": 12320 }, { "epoch": 0.09962911788233583, "grad_norm": 1496.796630859375, "learning_rate": 4.981818181818182e-05, "loss": 143.5405, "step": 12330 }, { "epoch": 0.09970992008661997, "grad_norm": 1359.33544921875, "learning_rate": 4.985858585858586e-05, "loss": 163.7442, "step": 12340 }, { "epoch": 0.09979072229090409, "grad_norm": 762.21923828125, "learning_rate": 4.98989898989899e-05, "loss": 159.0385, "step": 12350 }, { "epoch": 0.09987152449518823, "grad_norm": 1305.80615234375, "learning_rate": 4.993939393939394e-05, "loss": 191.1846, "step": 12360 }, { "epoch": 0.09995232669947236, "grad_norm": 918.177978515625, "learning_rate": 4.997979797979798e-05, "loss": 162.5437, "step": 12370 }, { "epoch": 0.1000331289037565, "grad_norm": 620.300048828125, "learning_rate": 4.9999999751358095e-05, "loss": 183.1986, "step": 12380 }, { "epoch": 0.10011393110804062, "grad_norm": 1200.64794921875, "learning_rate": 4.999999776222285e-05, "loss": 125.1974, "step": 12390 }, { "epoch": 0.10019473331232476, "grad_norm": 1327.4674072265625, "learning_rate": 4.9999993783952516e-05, "loss": 203.2498, "step": 12400 }, { "epoch": 0.1002755355166089, "grad_norm": 1414.9403076171875, "learning_rate": 4.999998781654741e-05, "loss": 183.1395, "step": 12410 }, { "epoch": 0.10035633772089303, "grad_norm": 1874.2359619140625, "learning_rate": 4.9999979860008006e-05, "loss": 238.1009, "step": 12420 }, { "epoch": 0.10043713992517717, "grad_norm": 1188.5574951171875, "learning_rate": 4.9999969914334944e-05, "loss": 181.5796, "step": 12430 }, { "epoch": 0.10051794212946129, "grad_norm": 1656.1356201171875, "learning_rate": 4.999995797952901e-05, "loss": 192.1247, "step": 12440 }, { "epoch": 0.10059874433374542, "grad_norm": 636.4390869140625, "learning_rate": 4.9999944055591154e-05, "loss": 144.699, "step": 12450 }, { "epoch": 0.10067954653802956, "grad_norm": 958.838134765625, "learning_rate": 4.999992814252249e-05, "loss": 157.3529, "step": 12460 }, { "epoch": 0.1007603487423137, "grad_norm": 1260.2628173828125, "learning_rate": 4.999991024032426e-05, "loss": 146.3447, "step": 12470 }, { "epoch": 0.10084115094659782, "grad_norm": 1077.9141845703125, "learning_rate": 4.9999890348997925e-05, "loss": 143.8297, "step": 12480 }, { "epoch": 0.10092195315088195, "grad_norm": 1348.9508056640625, "learning_rate": 4.999986846854504e-05, "loss": 173.3844, "step": 12490 }, { "epoch": 0.10100275535516609, "grad_norm": 736.1795654296875, "learning_rate": 4.999984459896735e-05, "loss": 173.8864, "step": 12500 }, { "epoch": 0.10108355755945023, "grad_norm": 2159.8544921875, "learning_rate": 4.9999818740266766e-05, "loss": 262.527, "step": 12510 }, { "epoch": 0.10116435976373435, "grad_norm": 1445.3455810546875, "learning_rate": 4.999979089244534e-05, "loss": 127.1179, "step": 12520 }, { "epoch": 0.10124516196801848, "grad_norm": 1050.8997802734375, "learning_rate": 4.999976105550528e-05, "loss": 171.1639, "step": 12530 }, { "epoch": 0.10132596417230262, "grad_norm": 593.4418334960938, "learning_rate": 4.9999729229448975e-05, "loss": 148.7333, "step": 12540 }, { "epoch": 0.10140676637658676, "grad_norm": 1813.0040283203125, "learning_rate": 4.9999695414278944e-05, "loss": 169.583, "step": 12550 }, { "epoch": 0.10148756858087088, "grad_norm": 1632.8077392578125, "learning_rate": 4.9999659609997875e-05, "loss": 178.7264, "step": 12560 }, { "epoch": 0.10156837078515502, "grad_norm": 1241.4422607421875, "learning_rate": 4.9999621816608634e-05, "loss": 195.7228, "step": 12570 }, { "epoch": 0.10164917298943915, "grad_norm": 963.0867309570312, "learning_rate": 4.999958203411421e-05, "loss": 196.8765, "step": 12580 }, { "epoch": 0.10172997519372329, "grad_norm": 879.3704833984375, "learning_rate": 4.999954026251778e-05, "loss": 172.8657, "step": 12590 }, { "epoch": 0.10181077739800742, "grad_norm": 1103.077880859375, "learning_rate": 4.999949650182266e-05, "loss": 168.6242, "step": 12600 }, { "epoch": 0.10189157960229155, "grad_norm": 679.7638549804688, "learning_rate": 4.9999450752032345e-05, "loss": 130.0992, "step": 12610 }, { "epoch": 0.10197238180657568, "grad_norm": 3675.49951171875, "learning_rate": 4.999940301315046e-05, "loss": 209.7475, "step": 12620 }, { "epoch": 0.10205318401085982, "grad_norm": 862.3643188476562, "learning_rate": 4.999935328518081e-05, "loss": 160.7178, "step": 12630 }, { "epoch": 0.10213398621514395, "grad_norm": 894.45068359375, "learning_rate": 4.999930156812734e-05, "loss": 165.6423, "step": 12640 }, { "epoch": 0.10221478841942808, "grad_norm": 856.2412719726562, "learning_rate": 4.9999247861994194e-05, "loss": 140.6881, "step": 12650 }, { "epoch": 0.10229559062371221, "grad_norm": 1286.828125, "learning_rate": 4.999919216678561e-05, "loss": 177.9921, "step": 12660 }, { "epoch": 0.10237639282799635, "grad_norm": 1610.512939453125, "learning_rate": 4.999913448250605e-05, "loss": 197.9213, "step": 12670 }, { "epoch": 0.10245719503228048, "grad_norm": 690.97998046875, "learning_rate": 4.999907480916009e-05, "loss": 148.6186, "step": 12680 }, { "epoch": 0.10253799723656461, "grad_norm": 1106.3350830078125, "learning_rate": 4.999901314675246e-05, "loss": 209.407, "step": 12690 }, { "epoch": 0.10261879944084874, "grad_norm": 1395.5164794921875, "learning_rate": 4.99989494952881e-05, "loss": 182.4703, "step": 12700 }, { "epoch": 0.10269960164513288, "grad_norm": 1877.5772705078125, "learning_rate": 4.999888385477205e-05, "loss": 156.0251, "step": 12710 }, { "epoch": 0.10278040384941702, "grad_norm": 1271.5089111328125, "learning_rate": 4.9998816225209544e-05, "loss": 154.951, "step": 12720 }, { "epoch": 0.10286120605370115, "grad_norm": 1295.865966796875, "learning_rate": 4.9998746606605964e-05, "loss": 217.4705, "step": 12730 }, { "epoch": 0.10294200825798527, "grad_norm": 1123.7264404296875, "learning_rate": 4.999867499896684e-05, "loss": 162.33, "step": 12740 }, { "epoch": 0.10302281046226941, "grad_norm": 1871.114990234375, "learning_rate": 4.999860140229788e-05, "loss": 166.8098, "step": 12750 }, { "epoch": 0.10310361266655355, "grad_norm": 1002.0478515625, "learning_rate": 4.999852581660493e-05, "loss": 163.5526, "step": 12760 }, { "epoch": 0.10318441487083768, "grad_norm": 1245.7467041015625, "learning_rate": 4.9998448241894006e-05, "loss": 216.9302, "step": 12770 }, { "epoch": 0.1032652170751218, "grad_norm": 1028.039306640625, "learning_rate": 4.999836867817129e-05, "loss": 203.9537, "step": 12780 }, { "epoch": 0.10334601927940594, "grad_norm": 562.0195922851562, "learning_rate": 4.99982871254431e-05, "loss": 129.1438, "step": 12790 }, { "epoch": 0.10342682148369008, "grad_norm": 849.0570068359375, "learning_rate": 4.999820358371593e-05, "loss": 169.0486, "step": 12800 }, { "epoch": 0.10350762368797421, "grad_norm": 1529.56103515625, "learning_rate": 4.999811805299643e-05, "loss": 141.6357, "step": 12810 }, { "epoch": 0.10358842589225833, "grad_norm": 1446.593505859375, "learning_rate": 4.999803053329141e-05, "loss": 195.0215, "step": 12820 }, { "epoch": 0.10366922809654247, "grad_norm": 1069.249755859375, "learning_rate": 4.999794102460781e-05, "loss": 188.44, "step": 12830 }, { "epoch": 0.1037500303008266, "grad_norm": 1103.1290283203125, "learning_rate": 4.999784952695278e-05, "loss": 131.1704, "step": 12840 }, { "epoch": 0.10383083250511074, "grad_norm": 3719.030029296875, "learning_rate": 4.999775604033358e-05, "loss": 186.096, "step": 12850 }, { "epoch": 0.10391163470939488, "grad_norm": 934.4454345703125, "learning_rate": 4.9997660564757654e-05, "loss": 162.9967, "step": 12860 }, { "epoch": 0.103992436913679, "grad_norm": 881.9823608398438, "learning_rate": 4.999756310023261e-05, "loss": 189.1937, "step": 12870 }, { "epoch": 0.10407323911796314, "grad_norm": 4495.25634765625, "learning_rate": 4.9997463646766185e-05, "loss": 167.4466, "step": 12880 }, { "epoch": 0.10415404132224727, "grad_norm": 981.443359375, "learning_rate": 4.99973622043663e-05, "loss": 145.4228, "step": 12890 }, { "epoch": 0.10423484352653141, "grad_norm": 1170.6580810546875, "learning_rate": 4.9997258773041036e-05, "loss": 194.973, "step": 12900 }, { "epoch": 0.10431564573081553, "grad_norm": 909.8919677734375, "learning_rate": 4.999715335279861e-05, "loss": 173.4462, "step": 12910 }, { "epoch": 0.10439644793509967, "grad_norm": 756.8289184570312, "learning_rate": 4.9997045943647415e-05, "loss": 177.1729, "step": 12920 }, { "epoch": 0.1044772501393838, "grad_norm": 1074.613037109375, "learning_rate": 4.9996936545595986e-05, "loss": 152.6241, "step": 12930 }, { "epoch": 0.10455805234366794, "grad_norm": 948.0105590820312, "learning_rate": 4.999682515865304e-05, "loss": 160.605, "step": 12940 }, { "epoch": 0.10463885454795206, "grad_norm": 986.80419921875, "learning_rate": 4.999671178282744e-05, "loss": 195.0648, "step": 12950 }, { "epoch": 0.1047196567522362, "grad_norm": 1015.398193359375, "learning_rate": 4.99965964181282e-05, "loss": 152.1088, "step": 12960 }, { "epoch": 0.10480045895652033, "grad_norm": 867.7134399414062, "learning_rate": 4.999647906456451e-05, "loss": 158.7183, "step": 12970 }, { "epoch": 0.10488126116080447, "grad_norm": 1498.0318603515625, "learning_rate": 4.9996359722145694e-05, "loss": 178.2639, "step": 12980 }, { "epoch": 0.1049620633650886, "grad_norm": 1281.1707763671875, "learning_rate": 4.999623839088125e-05, "loss": 167.5088, "step": 12990 }, { "epoch": 0.10504286556937273, "grad_norm": 1084.959228515625, "learning_rate": 4.999611507078083e-05, "loss": 157.0286, "step": 13000 }, { "epoch": 0.10512366777365686, "grad_norm": 1000.6400756835938, "learning_rate": 4.9995989761854254e-05, "loss": 146.6623, "step": 13010 }, { "epoch": 0.105204469977941, "grad_norm": 1509.530029296875, "learning_rate": 4.99958624641115e-05, "loss": 177.9203, "step": 13020 }, { "epoch": 0.10528527218222514, "grad_norm": 1420.79345703125, "learning_rate": 4.999573317756267e-05, "loss": 148.3134, "step": 13030 }, { "epoch": 0.10536607438650926, "grad_norm": 1180.6531982421875, "learning_rate": 4.999560190221807e-05, "loss": 170.9963, "step": 13040 }, { "epoch": 0.1054468765907934, "grad_norm": 1092.0330810546875, "learning_rate": 4.999546863808815e-05, "loss": 133.3349, "step": 13050 }, { "epoch": 0.10552767879507753, "grad_norm": 1312.984619140625, "learning_rate": 4.99953333851835e-05, "loss": 122.4938, "step": 13060 }, { "epoch": 0.10560848099936167, "grad_norm": 1187.057861328125, "learning_rate": 4.999519614351488e-05, "loss": 137.0068, "step": 13070 }, { "epoch": 0.10568928320364579, "grad_norm": 1474.0391845703125, "learning_rate": 4.999505691309322e-05, "loss": 215.5605, "step": 13080 }, { "epoch": 0.10577008540792993, "grad_norm": 739.2951049804688, "learning_rate": 4.9994915693929586e-05, "loss": 153.0441, "step": 13090 }, { "epoch": 0.10585088761221406, "grad_norm": 576.087890625, "learning_rate": 4.9994772486035225e-05, "loss": 151.3807, "step": 13100 }, { "epoch": 0.1059316898164982, "grad_norm": 1152.282958984375, "learning_rate": 4.9994627289421534e-05, "loss": 190.3949, "step": 13110 }, { "epoch": 0.10601249202078232, "grad_norm": 2059.785888671875, "learning_rate": 4.999448010410005e-05, "loss": 140.3872, "step": 13120 }, { "epoch": 0.10609329422506646, "grad_norm": 784.2423095703125, "learning_rate": 4.99943309300825e-05, "loss": 103.543, "step": 13130 }, { "epoch": 0.10617409642935059, "grad_norm": 1098.57958984375, "learning_rate": 4.9994179767380746e-05, "loss": 150.3387, "step": 13140 }, { "epoch": 0.10625489863363473, "grad_norm": 1574.0692138671875, "learning_rate": 4.999402661600682e-05, "loss": 192.4264, "step": 13150 }, { "epoch": 0.10633570083791886, "grad_norm": 2210.006591796875, "learning_rate": 4.9993871475972895e-05, "loss": 125.3871, "step": 13160 }, { "epoch": 0.10641650304220299, "grad_norm": 1684.879150390625, "learning_rate": 4.999371434729132e-05, "loss": 205.1452, "step": 13170 }, { "epoch": 0.10649730524648712, "grad_norm": 1359.27490234375, "learning_rate": 4.999355522997461e-05, "loss": 203.7965, "step": 13180 }, { "epoch": 0.10657810745077126, "grad_norm": 916.1061401367188, "learning_rate": 4.999339412403541e-05, "loss": 162.6433, "step": 13190 }, { "epoch": 0.1066589096550554, "grad_norm": 1082.037109375, "learning_rate": 4.9993231029486544e-05, "loss": 175.1734, "step": 13200 }, { "epoch": 0.10673971185933952, "grad_norm": 1730.4796142578125, "learning_rate": 4.9993065946340986e-05, "loss": 206.4621, "step": 13210 }, { "epoch": 0.10682051406362365, "grad_norm": 1461.9183349609375, "learning_rate": 4.999289887461188e-05, "loss": 180.7627, "step": 13220 }, { "epoch": 0.10690131626790779, "grad_norm": 1014.8677368164062, "learning_rate": 4.999272981431251e-05, "loss": 179.8561, "step": 13230 }, { "epoch": 0.10698211847219193, "grad_norm": 1612.005859375, "learning_rate": 4.9992558765456334e-05, "loss": 142.1859, "step": 13240 }, { "epoch": 0.10706292067647605, "grad_norm": 958.360107421875, "learning_rate": 4.999238572805694e-05, "loss": 185.2089, "step": 13250 }, { "epoch": 0.10714372288076018, "grad_norm": 810.4140625, "learning_rate": 4.999221070212813e-05, "loss": 154.9561, "step": 13260 }, { "epoch": 0.10722452508504432, "grad_norm": 1180.060546875, "learning_rate": 4.999203368768381e-05, "loss": 152.1395, "step": 13270 }, { "epoch": 0.10730532728932846, "grad_norm": 820.518310546875, "learning_rate": 4.9991854684738066e-05, "loss": 162.0506, "step": 13280 }, { "epoch": 0.10738612949361259, "grad_norm": 654.80078125, "learning_rate": 4.999167369330514e-05, "loss": 227.117, "step": 13290 }, { "epoch": 0.10746693169789671, "grad_norm": 2003.2425537109375, "learning_rate": 4.9991490713399436e-05, "loss": 192.4266, "step": 13300 }, { "epoch": 0.10754773390218085, "grad_norm": 598.6465454101562, "learning_rate": 4.9991305745035514e-05, "loss": 153.8487, "step": 13310 }, { "epoch": 0.10762853610646499, "grad_norm": 1834.0950927734375, "learning_rate": 4.9991118788228084e-05, "loss": 177.134, "step": 13320 }, { "epoch": 0.10770933831074912, "grad_norm": 1758.453125, "learning_rate": 4.9990929842992026e-05, "loss": 170.0616, "step": 13330 }, { "epoch": 0.10779014051503324, "grad_norm": 1039.255859375, "learning_rate": 4.9990738909342384e-05, "loss": 146.2972, "step": 13340 }, { "epoch": 0.10787094271931738, "grad_norm": 1003.7833862304688, "learning_rate": 4.9990545987294324e-05, "loss": 151.9346, "step": 13350 }, { "epoch": 0.10795174492360152, "grad_norm": 5873.03857421875, "learning_rate": 4.999035107686322e-05, "loss": 144.1906, "step": 13360 }, { "epoch": 0.10803254712788565, "grad_norm": 2187.155517578125, "learning_rate": 4.999015417806457e-05, "loss": 196.4291, "step": 13370 }, { "epoch": 0.10811334933216978, "grad_norm": 1240.0516357421875, "learning_rate": 4.998995529091404e-05, "loss": 155.5472, "step": 13380 }, { "epoch": 0.10819415153645391, "grad_norm": 1135.625, "learning_rate": 4.998975441542745e-05, "loss": 197.4514, "step": 13390 }, { "epoch": 0.10827495374073805, "grad_norm": 1170.193603515625, "learning_rate": 4.9989551551620794e-05, "loss": 166.823, "step": 13400 }, { "epoch": 0.10835575594502218, "grad_norm": 1421.60791015625, "learning_rate": 4.998934669951021e-05, "loss": 154.3157, "step": 13410 }, { "epoch": 0.10843655814930632, "grad_norm": 927.9348754882812, "learning_rate": 4.9989139859111995e-05, "loss": 168.5901, "step": 13420 }, { "epoch": 0.10851736035359044, "grad_norm": 1038.8873291015625, "learning_rate": 4.99889310304426e-05, "loss": 152.3316, "step": 13430 }, { "epoch": 0.10859816255787458, "grad_norm": 947.6807250976562, "learning_rate": 4.9988720213518655e-05, "loss": 135.1435, "step": 13440 }, { "epoch": 0.10867896476215871, "grad_norm": 1577.244384765625, "learning_rate": 4.998850740835692e-05, "loss": 178.216, "step": 13450 }, { "epoch": 0.10875976696644285, "grad_norm": 885.4010620117188, "learning_rate": 4.998829261497433e-05, "loss": 203.6382, "step": 13460 }, { "epoch": 0.10884056917072697, "grad_norm": 847.8015747070312, "learning_rate": 4.998807583338798e-05, "loss": 153.3626, "step": 13470 }, { "epoch": 0.10892137137501111, "grad_norm": 1210.1986083984375, "learning_rate": 4.998785706361512e-05, "loss": 145.362, "step": 13480 }, { "epoch": 0.10900217357929524, "grad_norm": 1167.423095703125, "learning_rate": 4.9987636305673144e-05, "loss": 162.2581, "step": 13490 }, { "epoch": 0.10908297578357938, "grad_norm": 1180.3104248046875, "learning_rate": 4.9987413559579636e-05, "loss": 149.3035, "step": 13500 }, { "epoch": 0.1091637779878635, "grad_norm": 1402.9918212890625, "learning_rate": 4.9987188825352294e-05, "loss": 169.6259, "step": 13510 }, { "epoch": 0.10924458019214764, "grad_norm": 2665.6416015625, "learning_rate": 4.998696210300902e-05, "loss": 192.3846, "step": 13520 }, { "epoch": 0.10932538239643178, "grad_norm": 725.3150634765625, "learning_rate": 4.9986733392567845e-05, "loss": 197.2868, "step": 13530 }, { "epoch": 0.10940618460071591, "grad_norm": 974.5929565429688, "learning_rate": 4.998650269404697e-05, "loss": 178.5823, "step": 13540 }, { "epoch": 0.10948698680500005, "grad_norm": 1542.777587890625, "learning_rate": 4.998627000746475e-05, "loss": 200.5336, "step": 13550 }, { "epoch": 0.10956778900928417, "grad_norm": 1063.777099609375, "learning_rate": 4.9986035332839694e-05, "loss": 164.4375, "step": 13560 }, { "epoch": 0.1096485912135683, "grad_norm": 1302.0216064453125, "learning_rate": 4.998579867019048e-05, "loss": 135.7528, "step": 13570 }, { "epoch": 0.10972939341785244, "grad_norm": 807.3289794921875, "learning_rate": 4.998556001953593e-05, "loss": 156.5582, "step": 13580 }, { "epoch": 0.10981019562213658, "grad_norm": 727.23974609375, "learning_rate": 4.998531938089503e-05, "loss": 180.1156, "step": 13590 }, { "epoch": 0.1098909978264207, "grad_norm": 1151.44189453125, "learning_rate": 4.998507675428695e-05, "loss": 138.4998, "step": 13600 }, { "epoch": 0.10997180003070484, "grad_norm": 1855.6485595703125, "learning_rate": 4.998483213973098e-05, "loss": 166.6201, "step": 13610 }, { "epoch": 0.11005260223498897, "grad_norm": 1196.5108642578125, "learning_rate": 4.9984585537246566e-05, "loss": 135.4699, "step": 13620 }, { "epoch": 0.11013340443927311, "grad_norm": 1511.992919921875, "learning_rate": 4.998433694685335e-05, "loss": 136.3776, "step": 13630 }, { "epoch": 0.11021420664355723, "grad_norm": 1018.0720825195312, "learning_rate": 4.998408636857111e-05, "loss": 142.9319, "step": 13640 }, { "epoch": 0.11029500884784137, "grad_norm": 1056.8865966796875, "learning_rate": 4.998383380241978e-05, "loss": 163.6253, "step": 13650 }, { "epoch": 0.1103758110521255, "grad_norm": 568.1683959960938, "learning_rate": 4.9983579248419445e-05, "loss": 156.528, "step": 13660 }, { "epoch": 0.11045661325640964, "grad_norm": 2031.0875244140625, "learning_rate": 4.998332270659037e-05, "loss": 188.1777, "step": 13670 }, { "epoch": 0.11053741546069376, "grad_norm": 682.0467529296875, "learning_rate": 4.9983064176952976e-05, "loss": 214.7207, "step": 13680 }, { "epoch": 0.1106182176649779, "grad_norm": 1406.7552490234375, "learning_rate": 4.998280365952782e-05, "loss": 177.2549, "step": 13690 }, { "epoch": 0.11069901986926203, "grad_norm": 922.1409301757812, "learning_rate": 4.998254115433563e-05, "loss": 174.0214, "step": 13700 }, { "epoch": 0.11077982207354617, "grad_norm": 3660.348388671875, "learning_rate": 4.9982276661397286e-05, "loss": 184.1588, "step": 13710 }, { "epoch": 0.1108606242778303, "grad_norm": 1597.2998046875, "learning_rate": 4.998201018073385e-05, "loss": 152.5763, "step": 13720 }, { "epoch": 0.11094142648211443, "grad_norm": 1081.3948974609375, "learning_rate": 4.9981741712366515e-05, "loss": 179.4719, "step": 13730 }, { "epoch": 0.11102222868639856, "grad_norm": 1722.0684814453125, "learning_rate": 4.9981471256316645e-05, "loss": 166.4671, "step": 13740 }, { "epoch": 0.1111030308906827, "grad_norm": 1164.61083984375, "learning_rate": 4.998119881260576e-05, "loss": 168.6889, "step": 13750 }, { "epoch": 0.11118383309496684, "grad_norm": 798.0043334960938, "learning_rate": 4.998092438125552e-05, "loss": 120.9161, "step": 13760 }, { "epoch": 0.11126463529925096, "grad_norm": 1977.904052734375, "learning_rate": 4.998064796228779e-05, "loss": 138.4463, "step": 13770 }, { "epoch": 0.1113454375035351, "grad_norm": 784.3284301757812, "learning_rate": 4.998036955572453e-05, "loss": 160.8799, "step": 13780 }, { "epoch": 0.11142623970781923, "grad_norm": 1237.744873046875, "learning_rate": 4.9980089161587916e-05, "loss": 161.2431, "step": 13790 }, { "epoch": 0.11150704191210337, "grad_norm": 1126.8914794921875, "learning_rate": 4.9979806779900255e-05, "loss": 188.3983, "step": 13800 }, { "epoch": 0.11158784411638749, "grad_norm": 1386.9434814453125, "learning_rate": 4.997952241068401e-05, "loss": 192.1466, "step": 13810 }, { "epoch": 0.11166864632067162, "grad_norm": 680.4060668945312, "learning_rate": 4.99792360539618e-05, "loss": 120.4083, "step": 13820 }, { "epoch": 0.11174944852495576, "grad_norm": 1295.848388671875, "learning_rate": 4.997894770975643e-05, "loss": 166.7768, "step": 13830 }, { "epoch": 0.1118302507292399, "grad_norm": 1008.2403564453125, "learning_rate": 4.9978657378090814e-05, "loss": 144.6575, "step": 13840 }, { "epoch": 0.11191105293352403, "grad_norm": 1428.162353515625, "learning_rate": 4.997836505898807e-05, "loss": 154.2511, "step": 13850 }, { "epoch": 0.11199185513780816, "grad_norm": 825.681640625, "learning_rate": 4.997807075247146e-05, "loss": 121.8282, "step": 13860 }, { "epoch": 0.11207265734209229, "grad_norm": 815.547119140625, "learning_rate": 4.997777445856439e-05, "loss": 161.0432, "step": 13870 }, { "epoch": 0.11215345954637643, "grad_norm": 715.5890502929688, "learning_rate": 4.997747617729044e-05, "loss": 152.8754, "step": 13880 }, { "epoch": 0.11223426175066056, "grad_norm": 965.2642822265625, "learning_rate": 4.997717590867335e-05, "loss": 119.2692, "step": 13890 }, { "epoch": 0.11231506395494469, "grad_norm": 2111.7919921875, "learning_rate": 4.997687365273699e-05, "loss": 158.9874, "step": 13900 }, { "epoch": 0.11239586615922882, "grad_norm": 879.88037109375, "learning_rate": 4.9976569409505424e-05, "loss": 176.3235, "step": 13910 }, { "epoch": 0.11247666836351296, "grad_norm": 1380.322998046875, "learning_rate": 4.997626317900286e-05, "loss": 137.2757, "step": 13920 }, { "epoch": 0.1125574705677971, "grad_norm": 718.0297241210938, "learning_rate": 4.997595496125366e-05, "loss": 191.654, "step": 13930 }, { "epoch": 0.11263827277208122, "grad_norm": 1362.7103271484375, "learning_rate": 4.997564475628234e-05, "loss": 134.5998, "step": 13940 }, { "epoch": 0.11271907497636535, "grad_norm": 842.7426147460938, "learning_rate": 4.99753325641136e-05, "loss": 143.3749, "step": 13950 }, { "epoch": 0.11279987718064949, "grad_norm": 1636.0693359375, "learning_rate": 4.997501838477226e-05, "loss": 132.4023, "step": 13960 }, { "epoch": 0.11288067938493362, "grad_norm": 1336.4649658203125, "learning_rate": 4.997470221828334e-05, "loss": 136.2148, "step": 13970 }, { "epoch": 0.11296148158921776, "grad_norm": 906.9151000976562, "learning_rate": 4.997438406467197e-05, "loss": 175.0376, "step": 13980 }, { "epoch": 0.11304228379350188, "grad_norm": 1826.1988525390625, "learning_rate": 4.997406392396349e-05, "loss": 152.145, "step": 13990 }, { "epoch": 0.11312308599778602, "grad_norm": 1275.584228515625, "learning_rate": 4.997374179618335e-05, "loss": 190.0866, "step": 14000 }, { "epoch": 0.11320388820207015, "grad_norm": 937.593017578125, "learning_rate": 4.997341768135719e-05, "loss": 141.1656, "step": 14010 }, { "epoch": 0.11328469040635429, "grad_norm": 945.1254272460938, "learning_rate": 4.99730915795108e-05, "loss": 180.2019, "step": 14020 }, { "epoch": 0.11336549261063841, "grad_norm": 867.3080444335938, "learning_rate": 4.9972763490670116e-05, "loss": 161.3901, "step": 14030 }, { "epoch": 0.11344629481492255, "grad_norm": 1019.8523559570312, "learning_rate": 4.997243341486127e-05, "loss": 198.7109, "step": 14040 }, { "epoch": 0.11352709701920669, "grad_norm": 1486.674072265625, "learning_rate": 4.9972101352110476e-05, "loss": 223.1289, "step": 14050 }, { "epoch": 0.11360789922349082, "grad_norm": 1696.5440673828125, "learning_rate": 4.9971767302444204e-05, "loss": 145.1186, "step": 14060 }, { "epoch": 0.11368870142777494, "grad_norm": 673.3330688476562, "learning_rate": 4.9971431265889014e-05, "loss": 140.5065, "step": 14070 }, { "epoch": 0.11376950363205908, "grad_norm": 3967.926513671875, "learning_rate": 4.997109324247163e-05, "loss": 163.7622, "step": 14080 }, { "epoch": 0.11385030583634322, "grad_norm": 1310.7440185546875, "learning_rate": 4.997075323221897e-05, "loss": 165.0551, "step": 14090 }, { "epoch": 0.11393110804062735, "grad_norm": 887.0441284179688, "learning_rate": 4.9970411235158066e-05, "loss": 164.2679, "step": 14100 }, { "epoch": 0.11401191024491149, "grad_norm": 1011.033935546875, "learning_rate": 4.997006725131615e-05, "loss": 126.0271, "step": 14110 }, { "epoch": 0.11409271244919561, "grad_norm": 1387.5933837890625, "learning_rate": 4.996972128072057e-05, "loss": 159.5976, "step": 14120 }, { "epoch": 0.11417351465347975, "grad_norm": 1430.6734619140625, "learning_rate": 4.996937332339887e-05, "loss": 142.3303, "step": 14130 }, { "epoch": 0.11425431685776388, "grad_norm": 1413.59814453125, "learning_rate": 4.9969023379378724e-05, "loss": 212.5765, "step": 14140 }, { "epoch": 0.11433511906204802, "grad_norm": 981.312744140625, "learning_rate": 4.996867144868798e-05, "loss": 123.0717, "step": 14150 }, { "epoch": 0.11441592126633214, "grad_norm": 1553.310546875, "learning_rate": 4.996831753135464e-05, "loss": 187.7103, "step": 14160 }, { "epoch": 0.11449672347061628, "grad_norm": 1003.1934204101562, "learning_rate": 4.996796162740686e-05, "loss": 140.4176, "step": 14170 }, { "epoch": 0.11457752567490041, "grad_norm": 916.1061401367188, "learning_rate": 4.996760373687297e-05, "loss": 162.5518, "step": 14180 }, { "epoch": 0.11465832787918455, "grad_norm": 946.1961669921875, "learning_rate": 4.9967243859781426e-05, "loss": 156.3168, "step": 14190 }, { "epoch": 0.11473913008346867, "grad_norm": 775.9445190429688, "learning_rate": 4.9966881996160876e-05, "loss": 156.1551, "step": 14200 }, { "epoch": 0.11481993228775281, "grad_norm": 1236.88427734375, "learning_rate": 4.996651814604011e-05, "loss": 174.8899, "step": 14210 }, { "epoch": 0.11490073449203694, "grad_norm": 1576.7642822265625, "learning_rate": 4.9966152309448076e-05, "loss": 158.9026, "step": 14220 }, { "epoch": 0.11498153669632108, "grad_norm": 1118.7216796875, "learning_rate": 4.996578448641388e-05, "loss": 138.2308, "step": 14230 }, { "epoch": 0.1150623389006052, "grad_norm": 1023.9251708984375, "learning_rate": 4.9965414676966796e-05, "loss": 129.1291, "step": 14240 }, { "epoch": 0.11514314110488934, "grad_norm": 1101.9742431640625, "learning_rate": 4.996504288113624e-05, "loss": 196.3162, "step": 14250 }, { "epoch": 0.11522394330917347, "grad_norm": 796.50048828125, "learning_rate": 4.9964669098951786e-05, "loss": 147.224, "step": 14260 }, { "epoch": 0.11530474551345761, "grad_norm": 1557.001708984375, "learning_rate": 4.996429333044319e-05, "loss": 134.4548, "step": 14270 }, { "epoch": 0.11538554771774175, "grad_norm": 1414.4412841796875, "learning_rate": 4.996391557564035e-05, "loss": 153.0477, "step": 14280 }, { "epoch": 0.11546634992202587, "grad_norm": 950.5034790039062, "learning_rate": 4.996353583457331e-05, "loss": 151.1424, "step": 14290 }, { "epoch": 0.11554715212631, "grad_norm": 5219.78955078125, "learning_rate": 4.9963154107272295e-05, "loss": 194.4314, "step": 14300 }, { "epoch": 0.11562795433059414, "grad_norm": 915.6011962890625, "learning_rate": 4.996277039376767e-05, "loss": 173.4856, "step": 14310 }, { "epoch": 0.11570875653487828, "grad_norm": 817.5308837890625, "learning_rate": 4.996238469408997e-05, "loss": 108.486, "step": 14320 }, { "epoch": 0.1157895587391624, "grad_norm": 2541.830078125, "learning_rate": 4.996199700826988e-05, "loss": 220.6018, "step": 14330 }, { "epoch": 0.11587036094344653, "grad_norm": 1460.224853515625, "learning_rate": 4.996160733633824e-05, "loss": 185.1798, "step": 14340 }, { "epoch": 0.11595116314773067, "grad_norm": 1051.0694580078125, "learning_rate": 4.996121567832608e-05, "loss": 148.2247, "step": 14350 }, { "epoch": 0.11603196535201481, "grad_norm": 1205.9510498046875, "learning_rate": 4.9960822034264534e-05, "loss": 169.8678, "step": 14360 }, { "epoch": 0.11611276755629893, "grad_norm": 1407.5836181640625, "learning_rate": 4.996042640418494e-05, "loss": 158.6641, "step": 14370 }, { "epoch": 0.11619356976058307, "grad_norm": 1719.08349609375, "learning_rate": 4.996002878811876e-05, "loss": 112.0756, "step": 14380 }, { "epoch": 0.1162743719648672, "grad_norm": 1174.4710693359375, "learning_rate": 4.995962918609766e-05, "loss": 168.0695, "step": 14390 }, { "epoch": 0.11635517416915134, "grad_norm": 1517.498291015625, "learning_rate": 4.995922759815339e-05, "loss": 190.5525, "step": 14400 }, { "epoch": 0.11643597637343547, "grad_norm": 920.9449462890625, "learning_rate": 4.995882402431794e-05, "loss": 165.8069, "step": 14410 }, { "epoch": 0.1165167785777196, "grad_norm": 741.5789794921875, "learning_rate": 4.995841846462341e-05, "loss": 148.2098, "step": 14420 }, { "epoch": 0.11659758078200373, "grad_norm": 995.2381591796875, "learning_rate": 4.995801091910206e-05, "loss": 138.0731, "step": 14430 }, { "epoch": 0.11667838298628787, "grad_norm": 877.2784423828125, "learning_rate": 4.995760138778633e-05, "loss": 148.9934, "step": 14440 }, { "epoch": 0.116759185190572, "grad_norm": 1467.7303466796875, "learning_rate": 4.995718987070879e-05, "loss": 228.5873, "step": 14450 }, { "epoch": 0.11683998739485613, "grad_norm": 1128.6104736328125, "learning_rate": 4.99567763679022e-05, "loss": 149.8821, "step": 14460 }, { "epoch": 0.11692078959914026, "grad_norm": 932.7119140625, "learning_rate": 4.9956360879399444e-05, "loss": 129.9151, "step": 14470 }, { "epoch": 0.1170015918034244, "grad_norm": 1253.4259033203125, "learning_rate": 4.9955943405233584e-05, "loss": 205.2673, "step": 14480 }, { "epoch": 0.11708239400770853, "grad_norm": 2555.6826171875, "learning_rate": 4.995552394543784e-05, "loss": 143.3954, "step": 14490 }, { "epoch": 0.11716319621199266, "grad_norm": 855.253662109375, "learning_rate": 4.995510250004559e-05, "loss": 154.8423, "step": 14500 }, { "epoch": 0.11724399841627679, "grad_norm": 2018.5384521484375, "learning_rate": 4.9954679069090364e-05, "loss": 234.7133, "step": 14510 }, { "epoch": 0.11732480062056093, "grad_norm": 1564.721435546875, "learning_rate": 4.995425365260585e-05, "loss": 139.8708, "step": 14520 }, { "epoch": 0.11740560282484507, "grad_norm": 688.4728393554688, "learning_rate": 4.9953826250625896e-05, "loss": 149.8579, "step": 14530 }, { "epoch": 0.1174864050291292, "grad_norm": 2625.092529296875, "learning_rate": 4.995339686318451e-05, "loss": 158.0993, "step": 14540 }, { "epoch": 0.11756720723341332, "grad_norm": 1898.60546875, "learning_rate": 4.995296549031585e-05, "loss": 132.3351, "step": 14550 }, { "epoch": 0.11764800943769746, "grad_norm": 911.2061767578125, "learning_rate": 4.995253213205425e-05, "loss": 141.7144, "step": 14560 }, { "epoch": 0.1177288116419816, "grad_norm": 1163.8433837890625, "learning_rate": 4.9952096788434186e-05, "loss": 130.6005, "step": 14570 }, { "epoch": 0.11780961384626573, "grad_norm": 1011.4424438476562, "learning_rate": 4.9951659459490294e-05, "loss": 150.9337, "step": 14580 }, { "epoch": 0.11789041605054985, "grad_norm": 845.7263793945312, "learning_rate": 4.9951220145257374e-05, "loss": 286.6193, "step": 14590 }, { "epoch": 0.11797121825483399, "grad_norm": 1084.8226318359375, "learning_rate": 4.9950778845770376e-05, "loss": 188.6081, "step": 14600 }, { "epoch": 0.11805202045911813, "grad_norm": 1993.1123046875, "learning_rate": 4.9950335561064423e-05, "loss": 177.9271, "step": 14610 }, { "epoch": 0.11813282266340226, "grad_norm": 2498.151611328125, "learning_rate": 4.994989029117476e-05, "loss": 185.4942, "step": 14620 }, { "epoch": 0.11821362486768638, "grad_norm": 1064.155517578125, "learning_rate": 4.994944303613684e-05, "loss": 157.5091, "step": 14630 }, { "epoch": 0.11829442707197052, "grad_norm": 880.9674072265625, "learning_rate": 4.994899379598623e-05, "loss": 180.4067, "step": 14640 }, { "epoch": 0.11837522927625466, "grad_norm": 1241.9085693359375, "learning_rate": 4.99485425707587e-05, "loss": 203.0391, "step": 14650 }, { "epoch": 0.11845603148053879, "grad_norm": 803.6256713867188, "learning_rate": 4.994808936049013e-05, "loss": 183.2648, "step": 14660 }, { "epoch": 0.11853683368482293, "grad_norm": 1581.531005859375, "learning_rate": 4.9947634165216584e-05, "loss": 186.4468, "step": 14670 }, { "epoch": 0.11861763588910705, "grad_norm": 791.8948974609375, "learning_rate": 4.994717698497428e-05, "loss": 179.2291, "step": 14680 }, { "epoch": 0.11869843809339119, "grad_norm": 1237.831298828125, "learning_rate": 4.994671781979959e-05, "loss": 139.9062, "step": 14690 }, { "epoch": 0.11877924029767532, "grad_norm": 835.0073852539062, "learning_rate": 4.994625666972906e-05, "loss": 156.4611, "step": 14700 }, { "epoch": 0.11886004250195946, "grad_norm": 1219.32958984375, "learning_rate": 4.994579353479938e-05, "loss": 160.4347, "step": 14710 }, { "epoch": 0.11894084470624358, "grad_norm": 1276.5611572265625, "learning_rate": 4.9945328415047385e-05, "loss": 167.3067, "step": 14720 }, { "epoch": 0.11902164691052772, "grad_norm": 1150.7158203125, "learning_rate": 4.994486131051009e-05, "loss": 143.2454, "step": 14730 }, { "epoch": 0.11910244911481185, "grad_norm": 1355.6668701171875, "learning_rate": 4.994439222122468e-05, "loss": 156.0403, "step": 14740 }, { "epoch": 0.11918325131909599, "grad_norm": 1088.11865234375, "learning_rate": 4.994392114722845e-05, "loss": 182.5412, "step": 14750 }, { "epoch": 0.11926405352338011, "grad_norm": 896.38232421875, "learning_rate": 4.994344808855888e-05, "loss": 170.0941, "step": 14760 }, { "epoch": 0.11934485572766425, "grad_norm": 1636.67333984375, "learning_rate": 4.994297304525363e-05, "loss": 192.0554, "step": 14770 }, { "epoch": 0.11942565793194838, "grad_norm": 1122.7872314453125, "learning_rate": 4.994249601735049e-05, "loss": 139.6245, "step": 14780 }, { "epoch": 0.11950646013623252, "grad_norm": 999.4352416992188, "learning_rate": 4.994201700488741e-05, "loss": 157.1384, "step": 14790 }, { "epoch": 0.11958726234051664, "grad_norm": 1360.1336669921875, "learning_rate": 4.99415360079025e-05, "loss": 126.2787, "step": 14800 }, { "epoch": 0.11966806454480078, "grad_norm": 954.704345703125, "learning_rate": 4.994105302643404e-05, "loss": 168.1486, "step": 14810 }, { "epoch": 0.11974886674908491, "grad_norm": 817.1705932617188, "learning_rate": 4.994056806052046e-05, "loss": 133.3082, "step": 14820 }, { "epoch": 0.11982966895336905, "grad_norm": 878.759521484375, "learning_rate": 4.994008111020033e-05, "loss": 188.8091, "step": 14830 }, { "epoch": 0.11991047115765319, "grad_norm": 2776.26220703125, "learning_rate": 4.993959217551242e-05, "loss": 159.8074, "step": 14840 }, { "epoch": 0.11999127336193731, "grad_norm": 1184.76171875, "learning_rate": 4.993910125649561e-05, "loss": 192.4007, "step": 14850 }, { "epoch": 0.12007207556622145, "grad_norm": 2491.531982421875, "learning_rate": 4.9938608353188966e-05, "loss": 209.929, "step": 14860 }, { "epoch": 0.12015287777050558, "grad_norm": 1279.2767333984375, "learning_rate": 4.993811346563171e-05, "loss": 159.6929, "step": 14870 }, { "epoch": 0.12023367997478972, "grad_norm": 783.5645751953125, "learning_rate": 4.993761659386322e-05, "loss": 164.0424, "step": 14880 }, { "epoch": 0.12031448217907384, "grad_norm": 1100.378662109375, "learning_rate": 4.993711773792302e-05, "loss": 152.5349, "step": 14890 }, { "epoch": 0.12039528438335798, "grad_norm": 2542.6484375, "learning_rate": 4.993661689785081e-05, "loss": 131.4937, "step": 14900 }, { "epoch": 0.12047608658764211, "grad_norm": 1040.3431396484375, "learning_rate": 4.9936114073686435e-05, "loss": 135.133, "step": 14910 }, { "epoch": 0.12055688879192625, "grad_norm": 1289.4642333984375, "learning_rate": 4.9935609265469905e-05, "loss": 138.4034, "step": 14920 }, { "epoch": 0.12063769099621037, "grad_norm": 712.0211181640625, "learning_rate": 4.993510247324139e-05, "loss": 120.3609, "step": 14930 }, { "epoch": 0.1207184932004945, "grad_norm": 872.5120849609375, "learning_rate": 4.99345936970412e-05, "loss": 130.8901, "step": 14940 }, { "epoch": 0.12079929540477864, "grad_norm": 887.6231079101562, "learning_rate": 4.993408293690983e-05, "loss": 168.3752, "step": 14950 }, { "epoch": 0.12088009760906278, "grad_norm": 883.7645874023438, "learning_rate": 4.993357019288791e-05, "loss": 178.7722, "step": 14960 }, { "epoch": 0.12096089981334691, "grad_norm": 1578.703369140625, "learning_rate": 4.9933055465016245e-05, "loss": 226.0479, "step": 14970 }, { "epoch": 0.12104170201763104, "grad_norm": 4105.41455078125, "learning_rate": 4.9932538753335776e-05, "loss": 205.1221, "step": 14980 }, { "epoch": 0.12112250422191517, "grad_norm": 1194.9835205078125, "learning_rate": 4.9932020057887625e-05, "loss": 161.8187, "step": 14990 }, { "epoch": 0.12120330642619931, "grad_norm": 1620.5238037109375, "learning_rate": 4.9931499378713064e-05, "loss": 166.9932, "step": 15000 }, { "epoch": 0.12128410863048344, "grad_norm": 1244.3077392578125, "learning_rate": 4.993097671585352e-05, "loss": 163.8744, "step": 15010 }, { "epoch": 0.12136491083476757, "grad_norm": 639.976318359375, "learning_rate": 4.9930452069350566e-05, "loss": 191.1954, "step": 15020 }, { "epoch": 0.1214457130390517, "grad_norm": 2938.000244140625, "learning_rate": 4.992992543924596e-05, "loss": 200.1803, "step": 15030 }, { "epoch": 0.12152651524333584, "grad_norm": 2456.51953125, "learning_rate": 4.99293968255816e-05, "loss": 178.8182, "step": 15040 }, { "epoch": 0.12160731744761998, "grad_norm": 1298.4368896484375, "learning_rate": 4.992886622839955e-05, "loss": 141.6572, "step": 15050 }, { "epoch": 0.1216881196519041, "grad_norm": 930.0308227539062, "learning_rate": 4.9928333647742024e-05, "loss": 147.9233, "step": 15060 }, { "epoch": 0.12176892185618823, "grad_norm": 1263.79345703125, "learning_rate": 4.9927799083651385e-05, "loss": 130.5263, "step": 15070 }, { "epoch": 0.12184972406047237, "grad_norm": 1646.8670654296875, "learning_rate": 4.9927262536170183e-05, "loss": 167.601, "step": 15080 }, { "epoch": 0.1219305262647565, "grad_norm": 618.90234375, "learning_rate": 4.9926724005341095e-05, "loss": 168.1324, "step": 15090 }, { "epoch": 0.12201132846904064, "grad_norm": 724.7408447265625, "learning_rate": 4.992618349120698e-05, "loss": 134.3964, "step": 15100 }, { "epoch": 0.12209213067332476, "grad_norm": 1000.2115478515625, "learning_rate": 4.992564099381084e-05, "loss": 135.418, "step": 15110 }, { "epoch": 0.1221729328776089, "grad_norm": 1031.4586181640625, "learning_rate": 4.9925096513195846e-05, "loss": 151.0433, "step": 15120 }, { "epoch": 0.12225373508189304, "grad_norm": 1462.1236572265625, "learning_rate": 4.992455004940531e-05, "loss": 131.2349, "step": 15130 }, { "epoch": 0.12233453728617717, "grad_norm": 1075.0465087890625, "learning_rate": 4.9924001602482705e-05, "loss": 94.1535, "step": 15140 }, { "epoch": 0.1224153394904613, "grad_norm": 1040.51025390625, "learning_rate": 4.992345117247169e-05, "loss": 137.572, "step": 15150 }, { "epoch": 0.12249614169474543, "grad_norm": 1554.9383544921875, "learning_rate": 4.9922898759416046e-05, "loss": 188.1611, "step": 15160 }, { "epoch": 0.12257694389902957, "grad_norm": 1242.22705078125, "learning_rate": 4.992234436335972e-05, "loss": 175.7165, "step": 15170 }, { "epoch": 0.1226577461033137, "grad_norm": 1831.4871826171875, "learning_rate": 4.9921787984346846e-05, "loss": 166.0569, "step": 15180 }, { "epoch": 0.12273854830759783, "grad_norm": 775.4189453125, "learning_rate": 4.992122962242167e-05, "loss": 161.268, "step": 15190 }, { "epoch": 0.12281935051188196, "grad_norm": 1125.548095703125, "learning_rate": 4.992066927762862e-05, "loss": 149.2984, "step": 15200 }, { "epoch": 0.1229001527161661, "grad_norm": 732.4982299804688, "learning_rate": 4.992010695001229e-05, "loss": 108.5541, "step": 15210 }, { "epoch": 0.12298095492045023, "grad_norm": 980.6292724609375, "learning_rate": 4.9919542639617425e-05, "loss": 149.5483, "step": 15220 }, { "epoch": 0.12306175712473437, "grad_norm": 1520.5997314453125, "learning_rate": 4.991897634648891e-05, "loss": 135.0774, "step": 15230 }, { "epoch": 0.12314255932901849, "grad_norm": 1217.220458984375, "learning_rate": 4.991840807067181e-05, "loss": 187.3661, "step": 15240 }, { "epoch": 0.12322336153330263, "grad_norm": 821.0241088867188, "learning_rate": 4.991783781221134e-05, "loss": 190.1053, "step": 15250 }, { "epoch": 0.12330416373758676, "grad_norm": 1041.9854736328125, "learning_rate": 4.9917265571152875e-05, "loss": 157.6719, "step": 15260 }, { "epoch": 0.1233849659418709, "grad_norm": 1650.58984375, "learning_rate": 4.9916691347541946e-05, "loss": 148.1537, "step": 15270 }, { "epoch": 0.12346576814615502, "grad_norm": 1302.0968017578125, "learning_rate": 4.9916115141424235e-05, "loss": 146.3277, "step": 15280 }, { "epoch": 0.12354657035043916, "grad_norm": 1479.9271240234375, "learning_rate": 4.991553695284559e-05, "loss": 135.4141, "step": 15290 }, { "epoch": 0.1236273725547233, "grad_norm": 992.5084838867188, "learning_rate": 4.991495678185202e-05, "loss": 145.0812, "step": 15300 }, { "epoch": 0.12370817475900743, "grad_norm": 1374.5244140625, "learning_rate": 4.991437462848968e-05, "loss": 199.361, "step": 15310 }, { "epoch": 0.12378897696329155, "grad_norm": 1825.560791015625, "learning_rate": 4.991379049280489e-05, "loss": 199.0117, "step": 15320 }, { "epoch": 0.12386977916757569, "grad_norm": 1683.6776123046875, "learning_rate": 4.991320437484414e-05, "loss": 132.4031, "step": 15330 }, { "epoch": 0.12395058137185982, "grad_norm": 1264.279052734375, "learning_rate": 4.991261627465404e-05, "loss": 157.1223, "step": 15340 }, { "epoch": 0.12403138357614396, "grad_norm": 1643.26171875, "learning_rate": 4.99120261922814e-05, "loss": 170.2161, "step": 15350 }, { "epoch": 0.1241121857804281, "grad_norm": 5203.81591796875, "learning_rate": 4.9911434127773176e-05, "loss": 145.5372, "step": 15360 }, { "epoch": 0.12419298798471222, "grad_norm": 1837.298828125, "learning_rate": 4.991084008117646e-05, "loss": 135.7863, "step": 15370 }, { "epoch": 0.12427379018899636, "grad_norm": 699.8900756835938, "learning_rate": 4.991024405253852e-05, "loss": 112.2561, "step": 15380 }, { "epoch": 0.12435459239328049, "grad_norm": 970.3558959960938, "learning_rate": 4.990964604190679e-05, "loss": 133.0993, "step": 15390 }, { "epoch": 0.12443539459756463, "grad_norm": 1397.918701171875, "learning_rate": 4.9909046049328846e-05, "loss": 201.8272, "step": 15400 }, { "epoch": 0.12451619680184875, "grad_norm": 939.9736328125, "learning_rate": 4.990844407485242e-05, "loss": 127.3693, "step": 15410 }, { "epoch": 0.12459699900613289, "grad_norm": 875.4765625, "learning_rate": 4.9907840118525415e-05, "loss": 160.38, "step": 15420 }, { "epoch": 0.12467780121041702, "grad_norm": 1410.2291259765625, "learning_rate": 4.990723418039588e-05, "loss": 152.7454, "step": 15430 }, { "epoch": 0.12475860341470116, "grad_norm": 1288.0728759765625, "learning_rate": 4.9906626260512036e-05, "loss": 150.0691, "step": 15440 }, { "epoch": 0.12483940561898528, "grad_norm": 1040.0684814453125, "learning_rate": 4.9906016358922246e-05, "loss": 176.1957, "step": 15450 }, { "epoch": 0.12492020782326942, "grad_norm": 944.9694213867188, "learning_rate": 4.990540447567503e-05, "loss": 129.6401, "step": 15460 }, { "epoch": 0.12500101002755354, "grad_norm": 947.33642578125, "learning_rate": 4.990479061081908e-05, "loss": 168.0807, "step": 15470 }, { "epoch": 0.12508181223183767, "grad_norm": 1530.232666015625, "learning_rate": 4.9904174764403255e-05, "loss": 192.025, "step": 15480 }, { "epoch": 0.1251626144361218, "grad_norm": 1547.114013671875, "learning_rate": 4.9903556936476524e-05, "loss": 137.4716, "step": 15490 }, { "epoch": 0.12524341664040595, "grad_norm": 1204.82177734375, "learning_rate": 4.9902937127088065e-05, "loss": 163.7098, "step": 15500 }, { "epoch": 0.12532421884469008, "grad_norm": 1420.2991943359375, "learning_rate": 4.9902315336287184e-05, "loss": 159.6538, "step": 15510 }, { "epoch": 0.12540502104897422, "grad_norm": 1532.7950439453125, "learning_rate": 4.990169156412336e-05, "loss": 190.6898, "step": 15520 }, { "epoch": 0.12548582325325836, "grad_norm": 4929.6416015625, "learning_rate": 4.990106581064622e-05, "loss": 149.6133, "step": 15530 }, { "epoch": 0.1255666254575425, "grad_norm": 1571.04736328125, "learning_rate": 4.9900438075905555e-05, "loss": 148.4885, "step": 15540 }, { "epoch": 0.12564742766182663, "grad_norm": 724.03564453125, "learning_rate": 4.9899808359951314e-05, "loss": 172.4704, "step": 15550 }, { "epoch": 0.12572822986611074, "grad_norm": 860.8834838867188, "learning_rate": 4.989917666283359e-05, "loss": 128.226, "step": 15560 }, { "epoch": 0.12580903207039487, "grad_norm": 1097.7288818359375, "learning_rate": 4.9898542984602656e-05, "loss": 144.4699, "step": 15570 }, { "epoch": 0.125889834274679, "grad_norm": 2218.3935546875, "learning_rate": 4.989790732530892e-05, "loss": 148.3845, "step": 15580 }, { "epoch": 0.12597063647896314, "grad_norm": 548.2178955078125, "learning_rate": 4.989726968500297e-05, "loss": 122.0915, "step": 15590 }, { "epoch": 0.12605143868324728, "grad_norm": 1305.503173828125, "learning_rate": 4.989663006373553e-05, "loss": 135.4369, "step": 15600 }, { "epoch": 0.12613224088753142, "grad_norm": 1803.6846923828125, "learning_rate": 4.9895988461557494e-05, "loss": 174.447, "step": 15610 }, { "epoch": 0.12621304309181555, "grad_norm": 844.888671875, "learning_rate": 4.989534487851992e-05, "loss": 143.4764, "step": 15620 }, { "epoch": 0.1262938452960997, "grad_norm": 2333.817626953125, "learning_rate": 4.9894699314674006e-05, "loss": 192.7881, "step": 15630 }, { "epoch": 0.12637464750038382, "grad_norm": 1151.9573974609375, "learning_rate": 4.9894051770071113e-05, "loss": 106.1197, "step": 15640 }, { "epoch": 0.12645544970466793, "grad_norm": 803.3490600585938, "learning_rate": 4.989340224476278e-05, "loss": 168.2896, "step": 15650 }, { "epoch": 0.12653625190895207, "grad_norm": 1133.505126953125, "learning_rate": 4.9892750738800664e-05, "loss": 156.3058, "step": 15660 }, { "epoch": 0.1266170541132362, "grad_norm": 1112.0980224609375, "learning_rate": 4.989209725223662e-05, "loss": 99.6213, "step": 15670 }, { "epoch": 0.12669785631752034, "grad_norm": 1125.371337890625, "learning_rate": 4.989144178512263e-05, "loss": 120.4679, "step": 15680 }, { "epoch": 0.12677865852180448, "grad_norm": 1283.361083984375, "learning_rate": 4.9890784337510865e-05, "loss": 177.7757, "step": 15690 }, { "epoch": 0.1268594607260886, "grad_norm": 982.0589599609375, "learning_rate": 4.9890124909453615e-05, "loss": 154.7183, "step": 15700 }, { "epoch": 0.12694026293037275, "grad_norm": 810.0673828125, "learning_rate": 4.988946350100336e-05, "loss": 186.0556, "step": 15710 }, { "epoch": 0.12702106513465689, "grad_norm": 1088.3094482421875, "learning_rate": 4.988880011221272e-05, "loss": 139.6886, "step": 15720 }, { "epoch": 0.127101867338941, "grad_norm": 1257.6068115234375, "learning_rate": 4.9888134743134484e-05, "loss": 174.5065, "step": 15730 }, { "epoch": 0.12718266954322513, "grad_norm": 1530.7177734375, "learning_rate": 4.988746739382158e-05, "loss": 166.6874, "step": 15740 }, { "epoch": 0.12726347174750927, "grad_norm": 1231.8719482421875, "learning_rate": 4.988679806432712e-05, "loss": 170.5947, "step": 15750 }, { "epoch": 0.1273442739517934, "grad_norm": 919.1317138671875, "learning_rate": 4.988612675470435e-05, "loss": 158.5722, "step": 15760 }, { "epoch": 0.12742507615607754, "grad_norm": 691.2894897460938, "learning_rate": 4.988545346500668e-05, "loss": 170.0254, "step": 15770 }, { "epoch": 0.12750587836036167, "grad_norm": 698.147705078125, "learning_rate": 4.9884778195287695e-05, "loss": 163.0237, "step": 15780 }, { "epoch": 0.1275866805646458, "grad_norm": 1487.1177978515625, "learning_rate": 4.988410094560111e-05, "loss": 186.4031, "step": 15790 }, { "epoch": 0.12766748276892995, "grad_norm": 1065.00537109375, "learning_rate": 4.988342171600082e-05, "loss": 127.2453, "step": 15800 }, { "epoch": 0.12774828497321408, "grad_norm": 1005.9658203125, "learning_rate": 4.988274050654086e-05, "loss": 141.061, "step": 15810 }, { "epoch": 0.1278290871774982, "grad_norm": 1164.0673828125, "learning_rate": 4.988205731727544e-05, "loss": 165.4086, "step": 15820 }, { "epoch": 0.12790988938178233, "grad_norm": 7027.51513671875, "learning_rate": 4.988137214825891e-05, "loss": 220.1764, "step": 15830 }, { "epoch": 0.12799069158606646, "grad_norm": 1196.4290771484375, "learning_rate": 4.988068499954578e-05, "loss": 147.7958, "step": 15840 }, { "epoch": 0.1280714937903506, "grad_norm": 932.5397338867188, "learning_rate": 4.9879995871190743e-05, "loss": 190.3842, "step": 15850 }, { "epoch": 0.12815229599463474, "grad_norm": 864.31884765625, "learning_rate": 4.9879304763248615e-05, "loss": 149.0053, "step": 15860 }, { "epoch": 0.12823309819891887, "grad_norm": 2883.14306640625, "learning_rate": 4.9878611675774375e-05, "loss": 176.4589, "step": 15870 }, { "epoch": 0.128313900403203, "grad_norm": 950.1251831054688, "learning_rate": 4.9877916608823196e-05, "loss": 172.9278, "step": 15880 }, { "epoch": 0.12839470260748714, "grad_norm": 1128.0977783203125, "learning_rate": 4.9877219562450364e-05, "loss": 164.3597, "step": 15890 }, { "epoch": 0.12847550481177125, "grad_norm": 952.9031982421875, "learning_rate": 4.987652053671134e-05, "loss": 159.8279, "step": 15900 }, { "epoch": 0.1285563070160554, "grad_norm": 3416.225341796875, "learning_rate": 4.987581953166175e-05, "loss": 160.0613, "step": 15910 }, { "epoch": 0.12863710922033952, "grad_norm": 807.5946044921875, "learning_rate": 4.9875116547357356e-05, "loss": 150.7103, "step": 15920 }, { "epoch": 0.12871791142462366, "grad_norm": 688.88427734375, "learning_rate": 4.9874411583854106e-05, "loss": 148.7598, "step": 15930 }, { "epoch": 0.1287987136289078, "grad_norm": 914.3491821289062, "learning_rate": 4.987370464120808e-05, "loss": 124.2359, "step": 15940 }, { "epoch": 0.12887951583319193, "grad_norm": 5167.775390625, "learning_rate": 4.987299571947553e-05, "loss": 146.1354, "step": 15950 }, { "epoch": 0.12896031803747607, "grad_norm": 1595.9141845703125, "learning_rate": 4.9872284818712865e-05, "loss": 139.1633, "step": 15960 }, { "epoch": 0.1290411202417602, "grad_norm": 533.0267333984375, "learning_rate": 4.9871571938976645e-05, "loss": 167.4221, "step": 15970 }, { "epoch": 0.12912192244604434, "grad_norm": 960.4555053710938, "learning_rate": 4.98708570803236e-05, "loss": 129.6839, "step": 15980 }, { "epoch": 0.12920272465032845, "grad_norm": 603.4066772460938, "learning_rate": 4.9870140242810585e-05, "loss": 105.636, "step": 15990 }, { "epoch": 0.12928352685461258, "grad_norm": 1337.03369140625, "learning_rate": 4.986942142649465e-05, "loss": 168.6452, "step": 16000 }, { "epoch": 0.12936432905889672, "grad_norm": 2447.898193359375, "learning_rate": 4.9868700631432995e-05, "loss": 206.3615, "step": 16010 }, { "epoch": 0.12944513126318086, "grad_norm": 736.8400268554688, "learning_rate": 4.9867977857682965e-05, "loss": 145.3305, "step": 16020 }, { "epoch": 0.129525933467465, "grad_norm": 1626.4791259765625, "learning_rate": 4.986725310530206e-05, "loss": 128.715, "step": 16030 }, { "epoch": 0.12960673567174913, "grad_norm": 612.1223754882812, "learning_rate": 4.986652637434795e-05, "loss": 186.9655, "step": 16040 }, { "epoch": 0.12968753787603327, "grad_norm": 1173.4058837890625, "learning_rate": 4.9865797664878456e-05, "loss": 147.4328, "step": 16050 }, { "epoch": 0.1297683400803174, "grad_norm": 1522.8951416015625, "learning_rate": 4.986506697695157e-05, "loss": 156.7457, "step": 16060 }, { "epoch": 0.12984914228460154, "grad_norm": 520.1161499023438, "learning_rate": 4.986433431062541e-05, "loss": 114.943, "step": 16070 }, { "epoch": 0.12992994448888565, "grad_norm": 885.0980834960938, "learning_rate": 4.986359966595828e-05, "loss": 161.2493, "step": 16080 }, { "epoch": 0.13001074669316978, "grad_norm": 1161.5257568359375, "learning_rate": 4.9862863043008645e-05, "loss": 169.9536, "step": 16090 }, { "epoch": 0.13009154889745392, "grad_norm": 1123.99560546875, "learning_rate": 4.986212444183509e-05, "loss": 171.4671, "step": 16100 }, { "epoch": 0.13017235110173805, "grad_norm": 1075.032958984375, "learning_rate": 4.9861383862496405e-05, "loss": 143.3025, "step": 16110 }, { "epoch": 0.1302531533060222, "grad_norm": 2145.561279296875, "learning_rate": 4.9860641305051496e-05, "loss": 150.4522, "step": 16120 }, { "epoch": 0.13033395551030633, "grad_norm": 960.9412231445312, "learning_rate": 4.9859896769559454e-05, "loss": 125.8122, "step": 16130 }, { "epoch": 0.13041475771459046, "grad_norm": 1217.901611328125, "learning_rate": 4.985915025607952e-05, "loss": 133.196, "step": 16140 }, { "epoch": 0.1304955599188746, "grad_norm": 767.8223876953125, "learning_rate": 4.9858401764671095e-05, "loss": 150.1083, "step": 16150 }, { "epoch": 0.1305763621231587, "grad_norm": 3365.16259765625, "learning_rate": 4.9857651295393716e-05, "loss": 170.0273, "step": 16160 }, { "epoch": 0.13065716432744284, "grad_norm": 758.3485107421875, "learning_rate": 4.985689884830711e-05, "loss": 138.8482, "step": 16170 }, { "epoch": 0.13073796653172698, "grad_norm": 1844.4720458984375, "learning_rate": 4.985614442347114e-05, "loss": 192.0719, "step": 16180 }, { "epoch": 0.13081876873601112, "grad_norm": 785.1895751953125, "learning_rate": 4.985538802094583e-05, "loss": 132.7963, "step": 16190 }, { "epoch": 0.13089957094029525, "grad_norm": 1228.786376953125, "learning_rate": 4.985462964079137e-05, "loss": 150.1391, "step": 16200 }, { "epoch": 0.1309803731445794, "grad_norm": 1071.201904296875, "learning_rate": 4.9853869283068086e-05, "loss": 154.3323, "step": 16210 }, { "epoch": 0.13106117534886352, "grad_norm": 1365.28759765625, "learning_rate": 4.9853106947836504e-05, "loss": 133.5463, "step": 16220 }, { "epoch": 0.13114197755314766, "grad_norm": 1116.1046142578125, "learning_rate": 4.985234263515725e-05, "loss": 172.6865, "step": 16230 }, { "epoch": 0.1312227797574318, "grad_norm": 1164.0814208984375, "learning_rate": 4.985157634509115e-05, "loss": 140.3057, "step": 16240 }, { "epoch": 0.1313035819617159, "grad_norm": 1354.85986328125, "learning_rate": 4.985080807769918e-05, "loss": 170.2302, "step": 16250 }, { "epoch": 0.13138438416600004, "grad_norm": 1436.9825439453125, "learning_rate": 4.9850037833042463e-05, "loss": 154.777, "step": 16260 }, { "epoch": 0.13146518637028418, "grad_norm": 3008.28662109375, "learning_rate": 4.984926561118227e-05, "loss": 193.904, "step": 16270 }, { "epoch": 0.1315459885745683, "grad_norm": 2124.60693359375, "learning_rate": 4.984849141218007e-05, "loss": 181.6443, "step": 16280 }, { "epoch": 0.13162679077885245, "grad_norm": 1305.1060791015625, "learning_rate": 4.984771523609744e-05, "loss": 135.3111, "step": 16290 }, { "epoch": 0.13170759298313658, "grad_norm": 1435.72021484375, "learning_rate": 4.984693708299614e-05, "loss": 144.1642, "step": 16300 }, { "epoch": 0.13178839518742072, "grad_norm": 2482.44970703125, "learning_rate": 4.984615695293809e-05, "loss": 168.4511, "step": 16310 }, { "epoch": 0.13186919739170486, "grad_norm": 1040.33154296875, "learning_rate": 4.984537484598536e-05, "loss": 151.9577, "step": 16320 }, { "epoch": 0.13194999959598896, "grad_norm": 2147.656494140625, "learning_rate": 4.9844590762200185e-05, "loss": 156.3386, "step": 16330 }, { "epoch": 0.1320308018002731, "grad_norm": 852.7472534179688, "learning_rate": 4.9843804701644936e-05, "loss": 116.6514, "step": 16340 }, { "epoch": 0.13211160400455724, "grad_norm": 1123.03076171875, "learning_rate": 4.984301666438217e-05, "loss": 135.8654, "step": 16350 }, { "epoch": 0.13219240620884137, "grad_norm": 960.448974609375, "learning_rate": 4.9842226650474574e-05, "loss": 152.3355, "step": 16360 }, { "epoch": 0.1322732084131255, "grad_norm": 1971.15380859375, "learning_rate": 4.984143465998502e-05, "loss": 137.8897, "step": 16370 }, { "epoch": 0.13235401061740965, "grad_norm": 944.2211303710938, "learning_rate": 4.984064069297652e-05, "loss": 200.7082, "step": 16380 }, { "epoch": 0.13243481282169378, "grad_norm": 1767.5118408203125, "learning_rate": 4.9839844749512245e-05, "loss": 135.5678, "step": 16390 }, { "epoch": 0.13251561502597792, "grad_norm": 1431.9781494140625, "learning_rate": 4.983904682965551e-05, "loss": 159.7814, "step": 16400 }, { "epoch": 0.13259641723026205, "grad_norm": 722.2365112304688, "learning_rate": 4.9838246933469826e-05, "loss": 176.3924, "step": 16410 }, { "epoch": 0.13267721943454616, "grad_norm": 1308.9915771484375, "learning_rate": 4.9837445061018825e-05, "loss": 179.9182, "step": 16420 }, { "epoch": 0.1327580216388303, "grad_norm": 3178.224365234375, "learning_rate": 4.98366412123663e-05, "loss": 197.8065, "step": 16430 }, { "epoch": 0.13283882384311443, "grad_norm": 1109.3902587890625, "learning_rate": 4.9835835387576226e-05, "loss": 137.1438, "step": 16440 }, { "epoch": 0.13291962604739857, "grad_norm": 785.2881469726562, "learning_rate": 4.983502758671271e-05, "loss": 156.4888, "step": 16450 }, { "epoch": 0.1330004282516827, "grad_norm": 1051.4027099609375, "learning_rate": 4.9834217809840027e-05, "loss": 109.1863, "step": 16460 }, { "epoch": 0.13308123045596684, "grad_norm": 924.291748046875, "learning_rate": 4.98334060570226e-05, "loss": 178.5278, "step": 16470 }, { "epoch": 0.13316203266025098, "grad_norm": 842.4341430664062, "learning_rate": 4.983259232832503e-05, "loss": 166.3891, "step": 16480 }, { "epoch": 0.13324283486453511, "grad_norm": 1710.2845458984375, "learning_rate": 4.983177662381205e-05, "loss": 152.9183, "step": 16490 }, { "epoch": 0.13332363706881925, "grad_norm": 776.0635986328125, "learning_rate": 4.983095894354858e-05, "loss": 150.6734, "step": 16500 }, { "epoch": 0.13340443927310336, "grad_norm": 738.4081420898438, "learning_rate": 4.983013928759965e-05, "loss": 130.03, "step": 16510 }, { "epoch": 0.1334852414773875, "grad_norm": 1212.0933837890625, "learning_rate": 4.98293176560305e-05, "loss": 159.4638, "step": 16520 }, { "epoch": 0.13356604368167163, "grad_norm": 1721.5943603515625, "learning_rate": 4.982849404890649e-05, "loss": 146.906, "step": 16530 }, { "epoch": 0.13364684588595577, "grad_norm": 1130.90283203125, "learning_rate": 4.982766846629316e-05, "loss": 157.2213, "step": 16540 }, { "epoch": 0.1337276480902399, "grad_norm": 878.40234375, "learning_rate": 4.982684090825619e-05, "loss": 183.1443, "step": 16550 }, { "epoch": 0.13380845029452404, "grad_norm": 896.69384765625, "learning_rate": 4.9826011374861435e-05, "loss": 150.7616, "step": 16560 }, { "epoch": 0.13388925249880818, "grad_norm": 1271.7767333984375, "learning_rate": 4.982517986617489e-05, "loss": 151.5091, "step": 16570 }, { "epoch": 0.1339700547030923, "grad_norm": 786.6751098632812, "learning_rate": 4.982434638226271e-05, "loss": 140.3826, "step": 16580 }, { "epoch": 0.13405085690737642, "grad_norm": 777.1525268554688, "learning_rate": 4.982351092319122e-05, "loss": 121.1654, "step": 16590 }, { "epoch": 0.13413165911166056, "grad_norm": 1195.732666015625, "learning_rate": 4.982267348902688e-05, "loss": 129.7749, "step": 16600 }, { "epoch": 0.1342124613159447, "grad_norm": 730.3426513671875, "learning_rate": 4.982183407983635e-05, "loss": 99.6988, "step": 16610 }, { "epoch": 0.13429326352022883, "grad_norm": 1521.4287109375, "learning_rate": 4.982099269568639e-05, "loss": 134.8955, "step": 16620 }, { "epoch": 0.13437406572451296, "grad_norm": 2275.628662109375, "learning_rate": 4.982014933664395e-05, "loss": 132.2674, "step": 16630 }, { "epoch": 0.1344548679287971, "grad_norm": 1096.541259765625, "learning_rate": 4.981930400277614e-05, "loss": 169.6705, "step": 16640 }, { "epoch": 0.13453567013308124, "grad_norm": 1358.02001953125, "learning_rate": 4.981845669415022e-05, "loss": 143.6309, "step": 16650 }, { "epoch": 0.13461647233736537, "grad_norm": 655.5990600585938, "learning_rate": 4.9817607410833586e-05, "loss": 157.5329, "step": 16660 }, { "epoch": 0.1346972745416495, "grad_norm": 4219.76171875, "learning_rate": 4.9816756152893845e-05, "loss": 184.4929, "step": 16670 }, { "epoch": 0.13477807674593362, "grad_norm": 1044.2806396484375, "learning_rate": 4.98159029203987e-05, "loss": 145.4762, "step": 16680 }, { "epoch": 0.13485887895021775, "grad_norm": 786.1893920898438, "learning_rate": 4.9815047713416067e-05, "loss": 124.9497, "step": 16690 }, { "epoch": 0.1349396811545019, "grad_norm": 790.2650756835938, "learning_rate": 4.9814190532013955e-05, "loss": 136.2918, "step": 16700 }, { "epoch": 0.13502048335878603, "grad_norm": 2891.427978515625, "learning_rate": 4.9813331376260585e-05, "loss": 168.4338, "step": 16710 }, { "epoch": 0.13510128556307016, "grad_norm": 947.1802978515625, "learning_rate": 4.981247024622432e-05, "loss": 196.8272, "step": 16720 }, { "epoch": 0.1351820877673543, "grad_norm": 3245.217041015625, "learning_rate": 4.9811607141973674e-05, "loss": 161.3576, "step": 16730 }, { "epoch": 0.13526288997163843, "grad_norm": 970.99609375, "learning_rate": 4.981074206357731e-05, "loss": 118.8778, "step": 16740 }, { "epoch": 0.13534369217592257, "grad_norm": 818.5444946289062, "learning_rate": 4.980987501110408e-05, "loss": 111.06, "step": 16750 }, { "epoch": 0.1354244943802067, "grad_norm": 1678.4248046875, "learning_rate": 4.980900598462295e-05, "loss": 209.277, "step": 16760 }, { "epoch": 0.13550529658449081, "grad_norm": 782.6038208007812, "learning_rate": 4.980813498420306e-05, "loss": 148.4335, "step": 16770 }, { "epoch": 0.13558609878877495, "grad_norm": 2577.811767578125, "learning_rate": 4.980726200991374e-05, "loss": 122.7456, "step": 16780 }, { "epoch": 0.1356669009930591, "grad_norm": 1101.7432861328125, "learning_rate": 4.980638706182442e-05, "loss": 146.7527, "step": 16790 }, { "epoch": 0.13574770319734322, "grad_norm": 1047.087646484375, "learning_rate": 4.980551014000474e-05, "loss": 153.0954, "step": 16800 }, { "epoch": 0.13582850540162736, "grad_norm": 616.9617309570312, "learning_rate": 4.9804631244524445e-05, "loss": 169.0794, "step": 16810 }, { "epoch": 0.1359093076059115, "grad_norm": 3564.375, "learning_rate": 4.980375037545349e-05, "loss": 139.7387, "step": 16820 }, { "epoch": 0.13599010981019563, "grad_norm": 869.0906982421875, "learning_rate": 4.980286753286195e-05, "loss": 214.401, "step": 16830 }, { "epoch": 0.13607091201447977, "grad_norm": 1770.462646484375, "learning_rate": 4.980198271682007e-05, "loss": 164.5436, "step": 16840 }, { "epoch": 0.13615171421876388, "grad_norm": 861.6243286132812, "learning_rate": 4.980109592739825e-05, "loss": 140.7934, "step": 16850 }, { "epoch": 0.136232516423048, "grad_norm": 815.6874389648438, "learning_rate": 4.9800207164667044e-05, "loss": 148.0081, "step": 16860 }, { "epoch": 0.13631331862733215, "grad_norm": 1228.7813720703125, "learning_rate": 4.979931642869717e-05, "loss": 186.2767, "step": 16870 }, { "epoch": 0.13639412083161628, "grad_norm": 1478.2159423828125, "learning_rate": 4.979842371955952e-05, "loss": 131.0911, "step": 16880 }, { "epoch": 0.13647492303590042, "grad_norm": 2854.59228515625, "learning_rate": 4.979752903732509e-05, "loss": 165.121, "step": 16890 }, { "epoch": 0.13655572524018456, "grad_norm": 762.3851318359375, "learning_rate": 4.979663238206508e-05, "loss": 147.5221, "step": 16900 }, { "epoch": 0.1366365274444687, "grad_norm": 1374.7017822265625, "learning_rate": 4.979573375385083e-05, "loss": 170.8515, "step": 16910 }, { "epoch": 0.13671732964875283, "grad_norm": 914.9188232421875, "learning_rate": 4.979483315275385e-05, "loss": 180.3649, "step": 16920 }, { "epoch": 0.13679813185303696, "grad_norm": 787.9354248046875, "learning_rate": 4.979393057884578e-05, "loss": 137.5717, "step": 16930 }, { "epoch": 0.13687893405732107, "grad_norm": 1231.185302734375, "learning_rate": 4.9793026032198453e-05, "loss": 160.6839, "step": 16940 }, { "epoch": 0.1369597362616052, "grad_norm": 933.662841796875, "learning_rate": 4.9792119512883816e-05, "loss": 147.167, "step": 16950 }, { "epoch": 0.13704053846588934, "grad_norm": 3033.047119140625, "learning_rate": 4.979121102097402e-05, "loss": 187.1447, "step": 16960 }, { "epoch": 0.13712134067017348, "grad_norm": 4561.677734375, "learning_rate": 4.9790300556541334e-05, "loss": 158.3624, "step": 16970 }, { "epoch": 0.13720214287445762, "grad_norm": 1468.9378662109375, "learning_rate": 4.978938811965821e-05, "loss": 164.9483, "step": 16980 }, { "epoch": 0.13728294507874175, "grad_norm": 995.0773315429688, "learning_rate": 4.978847371039724e-05, "loss": 174.3868, "step": 16990 }, { "epoch": 0.1373637472830259, "grad_norm": 1231.4886474609375, "learning_rate": 4.978755732883118e-05, "loss": 183.0145, "step": 17000 }, { "epoch": 0.13744454948731002, "grad_norm": 1484.8287353515625, "learning_rate": 4.978663897503294e-05, "loss": 156.891, "step": 17010 }, { "epoch": 0.13752535169159413, "grad_norm": 1120.732421875, "learning_rate": 4.97857186490756e-05, "loss": 119.2117, "step": 17020 }, { "epoch": 0.13760615389587827, "grad_norm": 1300.58642578125, "learning_rate": 4.978479635103237e-05, "loss": 168.5496, "step": 17030 }, { "epoch": 0.1376869561001624, "grad_norm": 780.037841796875, "learning_rate": 4.978387208097665e-05, "loss": 126.0169, "step": 17040 }, { "epoch": 0.13776775830444654, "grad_norm": 1085.820068359375, "learning_rate": 4.978294583898196e-05, "loss": 143.0688, "step": 17050 }, { "epoch": 0.13784856050873068, "grad_norm": 1167.7269287109375, "learning_rate": 4.978201762512201e-05, "loss": 147.1071, "step": 17060 }, { "epoch": 0.1379293627130148, "grad_norm": 2116.8076171875, "learning_rate": 4.978108743947066e-05, "loss": 176.9654, "step": 17070 }, { "epoch": 0.13801016491729895, "grad_norm": 1101.4508056640625, "learning_rate": 4.97801552821019e-05, "loss": 147.9685, "step": 17080 }, { "epoch": 0.13809096712158309, "grad_norm": 1551.89306640625, "learning_rate": 4.977922115308992e-05, "loss": 150.4693, "step": 17090 }, { "epoch": 0.13817176932586722, "grad_norm": 1168.867431640625, "learning_rate": 4.977828505250903e-05, "loss": 161.2141, "step": 17100 }, { "epoch": 0.13825257153015133, "grad_norm": 759.61962890625, "learning_rate": 4.977734698043371e-05, "loss": 150.161, "step": 17110 }, { "epoch": 0.13833337373443547, "grad_norm": 757.785400390625, "learning_rate": 4.977640693693862e-05, "loss": 135.1811, "step": 17120 }, { "epoch": 0.1384141759387196, "grad_norm": 2548.685302734375, "learning_rate": 4.9775464922098524e-05, "loss": 188.9629, "step": 17130 }, { "epoch": 0.13849497814300374, "grad_norm": 911.2066650390625, "learning_rate": 4.977452093598839e-05, "loss": 138.8588, "step": 17140 }, { "epoch": 0.13857578034728787, "grad_norm": 1549.0982666015625, "learning_rate": 4.977357497868334e-05, "loss": 162.1509, "step": 17150 }, { "epoch": 0.138656582551572, "grad_norm": 1119.6712646484375, "learning_rate": 4.9772627050258604e-05, "loss": 181.102, "step": 17160 }, { "epoch": 0.13873738475585615, "grad_norm": 817.0682983398438, "learning_rate": 4.977167715078963e-05, "loss": 167.9592, "step": 17170 }, { "epoch": 0.13881818696014028, "grad_norm": 734.8985595703125, "learning_rate": 4.977072528035199e-05, "loss": 134.1706, "step": 17180 }, { "epoch": 0.13889898916442442, "grad_norm": 927.8806762695312, "learning_rate": 4.976977143902143e-05, "loss": 150.6093, "step": 17190 }, { "epoch": 0.13897979136870853, "grad_norm": 1223.3267822265625, "learning_rate": 4.9768815626873836e-05, "loss": 140.2831, "step": 17200 }, { "epoch": 0.13906059357299266, "grad_norm": 966.7178955078125, "learning_rate": 4.9767857843985245e-05, "loss": 137.6861, "step": 17210 }, { "epoch": 0.1391413957772768, "grad_norm": 1357.1478271484375, "learning_rate": 4.976689809043188e-05, "loss": 188.3903, "step": 17220 }, { "epoch": 0.13922219798156094, "grad_norm": 1198.27001953125, "learning_rate": 4.97659363662901e-05, "loss": 112.1936, "step": 17230 }, { "epoch": 0.13930300018584507, "grad_norm": 1542.5234375, "learning_rate": 4.976497267163642e-05, "loss": 178.1322, "step": 17240 }, { "epoch": 0.1393838023901292, "grad_norm": 574.7208862304688, "learning_rate": 4.9764007006547516e-05, "loss": 130.098, "step": 17250 }, { "epoch": 0.13946460459441334, "grad_norm": 1796.7242431640625, "learning_rate": 4.976303937110024e-05, "loss": 145.7055, "step": 17260 }, { "epoch": 0.13954540679869748, "grad_norm": 958.0619506835938, "learning_rate": 4.9762069765371556e-05, "loss": 134.8362, "step": 17270 }, { "epoch": 0.1396262090029816, "grad_norm": 940.2138671875, "learning_rate": 4.976109818943863e-05, "loss": 182.4838, "step": 17280 }, { "epoch": 0.13970701120726572, "grad_norm": 547.7529907226562, "learning_rate": 4.976012464337876e-05, "loss": 112.5585, "step": 17290 }, { "epoch": 0.13978781341154986, "grad_norm": 1002.335693359375, "learning_rate": 4.97591491272694e-05, "loss": 158.1674, "step": 17300 }, { "epoch": 0.139868615615834, "grad_norm": 1832.1202392578125, "learning_rate": 4.9758171641188174e-05, "loss": 159.6637, "step": 17310 }, { "epoch": 0.13994941782011813, "grad_norm": 2267.85546875, "learning_rate": 4.975719218521285e-05, "loss": 121.5045, "step": 17320 }, { "epoch": 0.14003022002440227, "grad_norm": 1080.1688232421875, "learning_rate": 4.975621075942137e-05, "loss": 145.8474, "step": 17330 }, { "epoch": 0.1401110222286864, "grad_norm": 5002.1943359375, "learning_rate": 4.975522736389182e-05, "loss": 162.2757, "step": 17340 }, { "epoch": 0.14019182443297054, "grad_norm": 1529.05810546875, "learning_rate": 4.975424199870244e-05, "loss": 204.3265, "step": 17350 }, { "epoch": 0.14027262663725468, "grad_norm": 827.3128662109375, "learning_rate": 4.975325466393163e-05, "loss": 171.37, "step": 17360 }, { "epoch": 0.14035342884153879, "grad_norm": 802.5576782226562, "learning_rate": 4.975226535965795e-05, "loss": 172.4737, "step": 17370 }, { "epoch": 0.14043423104582292, "grad_norm": 954.416015625, "learning_rate": 4.9751274085960097e-05, "loss": 198.0529, "step": 17380 }, { "epoch": 0.14051503325010706, "grad_norm": 978.6486206054688, "learning_rate": 4.975028084291697e-05, "loss": 137.7169, "step": 17390 }, { "epoch": 0.1405958354543912, "grad_norm": 785.1177368164062, "learning_rate": 4.9749285630607587e-05, "loss": 141.8211, "step": 17400 }, { "epoch": 0.14067663765867533, "grad_norm": 1156.6881103515625, "learning_rate": 4.9748288449111126e-05, "loss": 147.5348, "step": 17410 }, { "epoch": 0.14075743986295947, "grad_norm": 1045.3648681640625, "learning_rate": 4.974728929850694e-05, "loss": 161.647, "step": 17420 }, { "epoch": 0.1408382420672436, "grad_norm": 1343.872802734375, "learning_rate": 4.974628817887451e-05, "loss": 145.1868, "step": 17430 }, { "epoch": 0.14091904427152774, "grad_norm": 1589.7122802734375, "learning_rate": 4.97452850902935e-05, "loss": 118.9889, "step": 17440 }, { "epoch": 0.14099984647581185, "grad_norm": 706.4135131835938, "learning_rate": 4.9744280032843726e-05, "loss": 157.0399, "step": 17450 }, { "epoch": 0.14108064868009598, "grad_norm": 1923.8131103515625, "learning_rate": 4.974327300660515e-05, "loss": 152.5646, "step": 17460 }, { "epoch": 0.14116145088438012, "grad_norm": 1136.072021484375, "learning_rate": 4.974226401165789e-05, "loss": 163.3207, "step": 17470 }, { "epoch": 0.14124225308866425, "grad_norm": 1105.5438232421875, "learning_rate": 4.974125304808224e-05, "loss": 150.4518, "step": 17480 }, { "epoch": 0.1413230552929484, "grad_norm": 1104.9368896484375, "learning_rate": 4.974024011595864e-05, "loss": 163.8363, "step": 17490 }, { "epoch": 0.14140385749723253, "grad_norm": 855.8195190429688, "learning_rate": 4.973922521536766e-05, "loss": 242.4139, "step": 17500 }, { "epoch": 0.14148465970151666, "grad_norm": 1221.6298828125, "learning_rate": 4.973820834639008e-05, "loss": 148.5189, "step": 17510 }, { "epoch": 0.1415654619058008, "grad_norm": 1131.8109130859375, "learning_rate": 4.973718950910679e-05, "loss": 156.5813, "step": 17520 }, { "epoch": 0.14164626411008494, "grad_norm": 1203.230712890625, "learning_rate": 4.973616870359886e-05, "loss": 149.7097, "step": 17530 }, { "epoch": 0.14172706631436904, "grad_norm": 1183.5244140625, "learning_rate": 4.9735145929947506e-05, "loss": 159.902, "step": 17540 }, { "epoch": 0.14180786851865318, "grad_norm": 1980.1556396484375, "learning_rate": 4.973412118823412e-05, "loss": 138.3822, "step": 17550 }, { "epoch": 0.14188867072293732, "grad_norm": 2065.0751953125, "learning_rate": 4.973309447854021e-05, "loss": 187.6551, "step": 17560 }, { "epoch": 0.14196947292722145, "grad_norm": 466.6748962402344, "learning_rate": 4.973206580094749e-05, "loss": 164.5834, "step": 17570 }, { "epoch": 0.1420502751315056, "grad_norm": 1023.6004028320312, "learning_rate": 4.9731035155537805e-05, "loss": 142.399, "step": 17580 }, { "epoch": 0.14213107733578972, "grad_norm": 1570.9815673828125, "learning_rate": 4.973000254239314e-05, "loss": 150.091, "step": 17590 }, { "epoch": 0.14221187954007386, "grad_norm": 886.7949829101562, "learning_rate": 4.972896796159568e-05, "loss": 215.1895, "step": 17600 }, { "epoch": 0.142292681744358, "grad_norm": 2179.057373046875, "learning_rate": 4.972793141322773e-05, "loss": 134.2585, "step": 17610 }, { "epoch": 0.14237348394864213, "grad_norm": 1217.0042724609375, "learning_rate": 4.9726892897371754e-05, "loss": 172.0116, "step": 17620 }, { "epoch": 0.14245428615292624, "grad_norm": 1240.138671875, "learning_rate": 4.9725852414110396e-05, "loss": 150.7606, "step": 17630 }, { "epoch": 0.14253508835721038, "grad_norm": 1866.2113037109375, "learning_rate": 4.972480996352644e-05, "loss": 169.5855, "step": 17640 }, { "epoch": 0.1426158905614945, "grad_norm": 1343.4752197265625, "learning_rate": 4.972376554570282e-05, "loss": 138.5565, "step": 17650 }, { "epoch": 0.14269669276577865, "grad_norm": 3173.331787109375, "learning_rate": 4.972271916072264e-05, "loss": 161.6781, "step": 17660 }, { "epoch": 0.14277749497006278, "grad_norm": 1009.836669921875, "learning_rate": 4.972167080866917e-05, "loss": 167.5559, "step": 17670 }, { "epoch": 0.14285829717434692, "grad_norm": 1085.6519775390625, "learning_rate": 4.9720620489625804e-05, "loss": 163.0791, "step": 17680 }, { "epoch": 0.14293909937863106, "grad_norm": 533.2945556640625, "learning_rate": 4.971956820367612e-05, "loss": 136.0262, "step": 17690 }, { "epoch": 0.1430199015829152, "grad_norm": 1245.5428466796875, "learning_rate": 4.971851395090384e-05, "loss": 175.3457, "step": 17700 }, { "epoch": 0.1431007037871993, "grad_norm": 1112.46630859375, "learning_rate": 4.9717457731392854e-05, "loss": 153.0563, "step": 17710 }, { "epoch": 0.14318150599148344, "grad_norm": 1563.6947021484375, "learning_rate": 4.971639954522719e-05, "loss": 170.2951, "step": 17720 }, { "epoch": 0.14326230819576757, "grad_norm": 4072.32568359375, "learning_rate": 4.971533939249105e-05, "loss": 154.5233, "step": 17730 }, { "epoch": 0.1433431104000517, "grad_norm": 995.8030395507812, "learning_rate": 4.971427727326877e-05, "loss": 135.2525, "step": 17740 }, { "epoch": 0.14342391260433585, "grad_norm": 1153.982666015625, "learning_rate": 4.971321318764488e-05, "loss": 145.9184, "step": 17750 }, { "epoch": 0.14350471480861998, "grad_norm": 1135.168212890625, "learning_rate": 4.971214713570403e-05, "loss": 106.7287, "step": 17760 }, { "epoch": 0.14358551701290412, "grad_norm": 2647.79736328125, "learning_rate": 4.9711079117531054e-05, "loss": 140.8377, "step": 17770 }, { "epoch": 0.14366631921718825, "grad_norm": 720.4559936523438, "learning_rate": 4.9710009133210915e-05, "loss": 223.2201, "step": 17780 }, { "epoch": 0.1437471214214724, "grad_norm": 877.2671508789062, "learning_rate": 4.970893718282876e-05, "loss": 126.1438, "step": 17790 }, { "epoch": 0.1438279236257565, "grad_norm": 1463.1614990234375, "learning_rate": 4.970786326646987e-05, "loss": 158.2247, "step": 17800 }, { "epoch": 0.14390872583004063, "grad_norm": 946.1857299804688, "learning_rate": 4.970678738421969e-05, "loss": 124.8285, "step": 17810 }, { "epoch": 0.14398952803432477, "grad_norm": 1228.1309814453125, "learning_rate": 4.9705709536163824e-05, "loss": 133.9969, "step": 17820 }, { "epoch": 0.1440703302386089, "grad_norm": 1009.2578735351562, "learning_rate": 4.9704629722388035e-05, "loss": 129.6919, "step": 17830 }, { "epoch": 0.14415113244289304, "grad_norm": 1004.6964111328125, "learning_rate": 4.9703547942978244e-05, "loss": 162.5422, "step": 17840 }, { "epoch": 0.14423193464717718, "grad_norm": 1071.4462890625, "learning_rate": 4.9702464198020517e-05, "loss": 118.5496, "step": 17850 }, { "epoch": 0.14431273685146132, "grad_norm": 733.4691162109375, "learning_rate": 4.9701378487601074e-05, "loss": 183.4323, "step": 17860 }, { "epoch": 0.14439353905574545, "grad_norm": 1384.6099853515625, "learning_rate": 4.970029081180632e-05, "loss": 126.8512, "step": 17870 }, { "epoch": 0.1444743412600296, "grad_norm": 1482.7987060546875, "learning_rate": 4.969920117072277e-05, "loss": 193.9891, "step": 17880 }, { "epoch": 0.1445551434643137, "grad_norm": 1112.25390625, "learning_rate": 4.969810956443715e-05, "loss": 163.0959, "step": 17890 }, { "epoch": 0.14463594566859783, "grad_norm": 1225.2054443359375, "learning_rate": 4.96970159930363e-05, "loss": 185.9541, "step": 17900 }, { "epoch": 0.14471674787288197, "grad_norm": 542.3731689453125, "learning_rate": 4.9695920456607226e-05, "loss": 171.242, "step": 17910 }, { "epoch": 0.1447975500771661, "grad_norm": 713.3396606445312, "learning_rate": 4.96948229552371e-05, "loss": 129.3645, "step": 17920 }, { "epoch": 0.14487835228145024, "grad_norm": 1782.2279052734375, "learning_rate": 4.9693723489013253e-05, "loss": 149.2307, "step": 17930 }, { "epoch": 0.14495915448573438, "grad_norm": 791.181640625, "learning_rate": 4.969262205802315e-05, "loss": 186.5776, "step": 17940 }, { "epoch": 0.1450399566900185, "grad_norm": 1428.761962890625, "learning_rate": 4.9691518662354434e-05, "loss": 142.7595, "step": 17950 }, { "epoch": 0.14512075889430265, "grad_norm": 897.8218994140625, "learning_rate": 4.96904133020949e-05, "loss": 127.4383, "step": 17960 }, { "epoch": 0.14520156109858676, "grad_norm": 1025.088623046875, "learning_rate": 4.968930597733249e-05, "loss": 184.0482, "step": 17970 }, { "epoch": 0.1452823633028709, "grad_norm": 1242.26220703125, "learning_rate": 4.968819668815532e-05, "loss": 102.1975, "step": 17980 }, { "epoch": 0.14536316550715503, "grad_norm": 1774.1246337890625, "learning_rate": 4.9687085434651636e-05, "loss": 133.0901, "step": 17990 }, { "epoch": 0.14544396771143916, "grad_norm": 1936.0460205078125, "learning_rate": 4.968597221690986e-05, "loss": 196.2127, "step": 18000 }, { "epoch": 0.1455247699157233, "grad_norm": 1869.914306640625, "learning_rate": 4.968485703501857e-05, "loss": 112.3861, "step": 18010 }, { "epoch": 0.14560557212000744, "grad_norm": 792.3851318359375, "learning_rate": 4.9683739889066497e-05, "loss": 169.0835, "step": 18020 }, { "epoch": 0.14568637432429157, "grad_norm": 1420.177734375, "learning_rate": 4.968262077914252e-05, "loss": 164.7283, "step": 18030 }, { "epoch": 0.1457671765285757, "grad_norm": 850.1755981445312, "learning_rate": 4.9681499705335685e-05, "loss": 136.3054, "step": 18040 }, { "epoch": 0.14584797873285985, "grad_norm": 729.207763671875, "learning_rate": 4.96803766677352e-05, "loss": 172.2848, "step": 18050 }, { "epoch": 0.14592878093714395, "grad_norm": 1386.603271484375, "learning_rate": 4.96792516664304e-05, "loss": 182.5913, "step": 18060 }, { "epoch": 0.1460095831414281, "grad_norm": 900.0386962890625, "learning_rate": 4.967812470151082e-05, "loss": 165.9354, "step": 18070 }, { "epoch": 0.14609038534571223, "grad_norm": 940.3046875, "learning_rate": 4.9676995773066105e-05, "loss": 112.6717, "step": 18080 }, { "epoch": 0.14617118754999636, "grad_norm": 4443.7900390625, "learning_rate": 4.967586488118609e-05, "loss": 161.5711, "step": 18090 }, { "epoch": 0.1462519897542805, "grad_norm": 955.7782592773438, "learning_rate": 4.9674732025960755e-05, "loss": 194.389, "step": 18100 }, { "epoch": 0.14633279195856463, "grad_norm": 2043.8023681640625, "learning_rate": 4.9673597207480236e-05, "loss": 147.0529, "step": 18110 }, { "epoch": 0.14641359416284877, "grad_norm": 2997.87890625, "learning_rate": 4.967246042583482e-05, "loss": 146.364, "step": 18120 }, { "epoch": 0.1464943963671329, "grad_norm": 1237.5732421875, "learning_rate": 4.967132168111496e-05, "loss": 132.0931, "step": 18130 }, { "epoch": 0.14657519857141701, "grad_norm": 1432.5120849609375, "learning_rate": 4.967018097341126e-05, "loss": 213.1167, "step": 18140 }, { "epoch": 0.14665600077570115, "grad_norm": 1035.17724609375, "learning_rate": 4.966903830281449e-05, "loss": 133.9097, "step": 18150 }, { "epoch": 0.1467368029799853, "grad_norm": 879.0906982421875, "learning_rate": 4.9667893669415546e-05, "loss": 161.3391, "step": 18160 }, { "epoch": 0.14681760518426942, "grad_norm": 1065.3148193359375, "learning_rate": 4.966674707330551e-05, "loss": 147.773, "step": 18170 }, { "epoch": 0.14689840738855356, "grad_norm": 993.1563720703125, "learning_rate": 4.966559851457562e-05, "loss": 144.7801, "step": 18180 }, { "epoch": 0.1469792095928377, "grad_norm": 551.8864135742188, "learning_rate": 4.966444799331726e-05, "loss": 142.696, "step": 18190 }, { "epoch": 0.14706001179712183, "grad_norm": 641.4093017578125, "learning_rate": 4.966329550962196e-05, "loss": 134.7732, "step": 18200 }, { "epoch": 0.14714081400140597, "grad_norm": 853.4235229492188, "learning_rate": 4.9662141063581436e-05, "loss": 142.1104, "step": 18210 }, { "epoch": 0.1472216162056901, "grad_norm": 1158.81005859375, "learning_rate": 4.9660984655287525e-05, "loss": 161.9792, "step": 18220 }, { "epoch": 0.1473024184099742, "grad_norm": 1042.7049560546875, "learning_rate": 4.965982628483224e-05, "loss": 155.6552, "step": 18230 }, { "epoch": 0.14738322061425835, "grad_norm": 746.765380859375, "learning_rate": 4.965866595230776e-05, "loss": 121.6951, "step": 18240 }, { "epoch": 0.14746402281854248, "grad_norm": 687.2789916992188, "learning_rate": 4.9657503657806395e-05, "loss": 113.0879, "step": 18250 }, { "epoch": 0.14754482502282662, "grad_norm": 1133.392822265625, "learning_rate": 4.9656339401420624e-05, "loss": 166.3725, "step": 18260 }, { "epoch": 0.14762562722711076, "grad_norm": 733.4806518554688, "learning_rate": 4.965517318324308e-05, "loss": 139.1904, "step": 18270 }, { "epoch": 0.1477064294313949, "grad_norm": 1013.0859375, "learning_rate": 4.9654005003366566e-05, "loss": 136.2502, "step": 18280 }, { "epoch": 0.14778723163567903, "grad_norm": 708.066162109375, "learning_rate": 4.965283486188401e-05, "loss": 193.8193, "step": 18290 }, { "epoch": 0.14786803383996316, "grad_norm": 1149.5897216796875, "learning_rate": 4.965166275888854e-05, "loss": 131.3347, "step": 18300 }, { "epoch": 0.1479488360442473, "grad_norm": 728.546142578125, "learning_rate": 4.965048869447339e-05, "loss": 170.7203, "step": 18310 }, { "epoch": 0.1480296382485314, "grad_norm": 1086.8751220703125, "learning_rate": 4.964931266873198e-05, "loss": 140.5396, "step": 18320 }, { "epoch": 0.14811044045281554, "grad_norm": 2335.054931640625, "learning_rate": 4.96481346817579e-05, "loss": 190.4257, "step": 18330 }, { "epoch": 0.14819124265709968, "grad_norm": 889.5890502929688, "learning_rate": 4.9646954733644856e-05, "loss": 143.3664, "step": 18340 }, { "epoch": 0.14827204486138382, "grad_norm": 638.7859497070312, "learning_rate": 4.9645772824486734e-05, "loss": 130.9303, "step": 18350 }, { "epoch": 0.14835284706566795, "grad_norm": 429.1936340332031, "learning_rate": 4.964458895437759e-05, "loss": 194.556, "step": 18360 }, { "epoch": 0.1484336492699521, "grad_norm": 625.8038330078125, "learning_rate": 4.96434031234116e-05, "loss": 137.4827, "step": 18370 }, { "epoch": 0.14851445147423623, "grad_norm": 1951.56005859375, "learning_rate": 4.964221533168312e-05, "loss": 177.7926, "step": 18380 }, { "epoch": 0.14859525367852036, "grad_norm": 2383.371337890625, "learning_rate": 4.9641025579286656e-05, "loss": 142.2735, "step": 18390 }, { "epoch": 0.14867605588280447, "grad_norm": 1063.8594970703125, "learning_rate": 4.9639833866316874e-05, "loss": 138.8885, "step": 18400 }, { "epoch": 0.1487568580870886, "grad_norm": 1529.0601806640625, "learning_rate": 4.963864019286859e-05, "loss": 140.622, "step": 18410 }, { "epoch": 0.14883766029137274, "grad_norm": 1062.4075927734375, "learning_rate": 4.963744455903679e-05, "loss": 167.9912, "step": 18420 }, { "epoch": 0.14891846249565688, "grad_norm": 590.7759399414062, "learning_rate": 4.963624696491659e-05, "loss": 130.7837, "step": 18430 }, { "epoch": 0.14899926469994101, "grad_norm": 991.8165283203125, "learning_rate": 4.963504741060329e-05, "loss": 156.975, "step": 18440 }, { "epoch": 0.14908006690422515, "grad_norm": 1038.615966796875, "learning_rate": 4.963384589619233e-05, "loss": 135.4892, "step": 18450 }, { "epoch": 0.1491608691085093, "grad_norm": 738.695556640625, "learning_rate": 4.9632642421779295e-05, "loss": 129.8741, "step": 18460 }, { "epoch": 0.14924167131279342, "grad_norm": 1388.894287109375, "learning_rate": 4.9631436987459964e-05, "loss": 140.8633, "step": 18470 }, { "epoch": 0.14932247351707756, "grad_norm": 1890.2734375, "learning_rate": 4.9630229593330226e-05, "loss": 195.7736, "step": 18480 }, { "epoch": 0.14940327572136167, "grad_norm": 1176.2431640625, "learning_rate": 4.9629020239486155e-05, "loss": 176.4671, "step": 18490 }, { "epoch": 0.1494840779256458, "grad_norm": 1625.3804931640625, "learning_rate": 4.962780892602398e-05, "loss": 182.2511, "step": 18500 }, { "epoch": 0.14956488012992994, "grad_norm": 1016.1923828125, "learning_rate": 4.962659565304008e-05, "loss": 139.6862, "step": 18510 }, { "epoch": 0.14964568233421408, "grad_norm": 1304.57373046875, "learning_rate": 4.962538042063097e-05, "loss": 133.6651, "step": 18520 }, { "epoch": 0.1497264845384982, "grad_norm": 2883.068359375, "learning_rate": 4.962416322889337e-05, "loss": 155.5502, "step": 18530 }, { "epoch": 0.14980728674278235, "grad_norm": 518.0875244140625, "learning_rate": 4.9622944077924106e-05, "loss": 138.3091, "step": 18540 }, { "epoch": 0.14988808894706648, "grad_norm": 1520.2349853515625, "learning_rate": 4.9621722967820184e-05, "loss": 134.8868, "step": 18550 }, { "epoch": 0.14996889115135062, "grad_norm": 1127.28662109375, "learning_rate": 4.962049989867877e-05, "loss": 149.5082, "step": 18560 }, { "epoch": 0.15004969335563473, "grad_norm": 1920.3206787109375, "learning_rate": 4.961927487059716e-05, "loss": 206.6959, "step": 18570 }, { "epoch": 0.15013049555991886, "grad_norm": 985.6328735351562, "learning_rate": 4.961804788367285e-05, "loss": 147.696, "step": 18580 }, { "epoch": 0.150211297764203, "grad_norm": 1025.6527099609375, "learning_rate": 4.961681893800344e-05, "loss": 142.4606, "step": 18590 }, { "epoch": 0.15029209996848714, "grad_norm": 1423.64794921875, "learning_rate": 4.961558803368673e-05, "loss": 171.7357, "step": 18600 }, { "epoch": 0.15037290217277127, "grad_norm": 1430.884765625, "learning_rate": 4.961435517082065e-05, "loss": 176.0427, "step": 18610 }, { "epoch": 0.1504537043770554, "grad_norm": 664.1461181640625, "learning_rate": 4.9613120349503286e-05, "loss": 93.5966, "step": 18620 }, { "epoch": 0.15053450658133954, "grad_norm": 1300.7425537109375, "learning_rate": 4.961188356983291e-05, "loss": 171.2061, "step": 18630 }, { "epoch": 0.15061530878562368, "grad_norm": 1404.8843994140625, "learning_rate": 4.9610644831907896e-05, "loss": 156.9324, "step": 18640 }, { "epoch": 0.15069611098990782, "grad_norm": 787.6842651367188, "learning_rate": 4.960940413582683e-05, "loss": 151.5972, "step": 18650 }, { "epoch": 0.15077691319419192, "grad_norm": 926.42333984375, "learning_rate": 4.960816148168842e-05, "loss": 111.1213, "step": 18660 }, { "epoch": 0.15085771539847606, "grad_norm": 650.7733154296875, "learning_rate": 4.9606916869591527e-05, "loss": 143.3633, "step": 18670 }, { "epoch": 0.1509385176027602, "grad_norm": 1304.4024658203125, "learning_rate": 4.960567029963519e-05, "loss": 164.9596, "step": 18680 }, { "epoch": 0.15101931980704433, "grad_norm": 1975.5609130859375, "learning_rate": 4.9604421771918594e-05, "loss": 159.01, "step": 18690 }, { "epoch": 0.15110012201132847, "grad_norm": 786.7325439453125, "learning_rate": 4.960317128654108e-05, "loss": 131.1655, "step": 18700 }, { "epoch": 0.1511809242156126, "grad_norm": 1006.70654296875, "learning_rate": 4.9601918843602145e-05, "loss": 172.4386, "step": 18710 }, { "epoch": 0.15126172641989674, "grad_norm": 913.4758911132812, "learning_rate": 4.960066444320143e-05, "loss": 123.8537, "step": 18720 }, { "epoch": 0.15134252862418088, "grad_norm": 1134.485107421875, "learning_rate": 4.959940808543875e-05, "loss": 143.2191, "step": 18730 }, { "epoch": 0.151423330828465, "grad_norm": 1133.67431640625, "learning_rate": 4.959814977041406e-05, "loss": 111.9317, "step": 18740 }, { "epoch": 0.15150413303274912, "grad_norm": 982.4358520507812, "learning_rate": 4.9596889498227486e-05, "loss": 160.2977, "step": 18750 }, { "epoch": 0.15158493523703326, "grad_norm": 816.73681640625, "learning_rate": 4.9595627268979294e-05, "loss": 128.3574, "step": 18760 }, { "epoch": 0.1516657374413174, "grad_norm": 720.4200439453125, "learning_rate": 4.9594363082769925e-05, "loss": 132.2612, "step": 18770 }, { "epoch": 0.15174653964560153, "grad_norm": 1521.4068603515625, "learning_rate": 4.959309693969996e-05, "loss": 129.5006, "step": 18780 }, { "epoch": 0.15182734184988567, "grad_norm": 856.1842041015625, "learning_rate": 4.959182883987012e-05, "loss": 145.0418, "step": 18790 }, { "epoch": 0.1519081440541698, "grad_norm": 1037.7059326171875, "learning_rate": 4.959055878338134e-05, "loss": 156.9543, "step": 18800 }, { "epoch": 0.15198894625845394, "grad_norm": 499.3943786621094, "learning_rate": 4.9589286770334654e-05, "loss": 127.4943, "step": 18810 }, { "epoch": 0.15206974846273807, "grad_norm": 1569.54736328125, "learning_rate": 4.9588012800831264e-05, "loss": 186.4054, "step": 18820 }, { "epoch": 0.15215055066702218, "grad_norm": 1288.7344970703125, "learning_rate": 4.9586736874972535e-05, "loss": 134.4534, "step": 18830 }, { "epoch": 0.15223135287130632, "grad_norm": 981.8977661132812, "learning_rate": 4.958545899285999e-05, "loss": 147.6222, "step": 18840 }, { "epoch": 0.15231215507559046, "grad_norm": 1461.2166748046875, "learning_rate": 4.958417915459531e-05, "loss": 136.6087, "step": 18850 }, { "epoch": 0.1523929572798746, "grad_norm": 1042.28759765625, "learning_rate": 4.958289736028032e-05, "loss": 125.8117, "step": 18860 }, { "epoch": 0.15247375948415873, "grad_norm": 1394.02099609375, "learning_rate": 4.958161361001701e-05, "loss": 106.0165, "step": 18870 }, { "epoch": 0.15255456168844286, "grad_norm": 1479.97509765625, "learning_rate": 4.9580327903907514e-05, "loss": 135.8925, "step": 18880 }, { "epoch": 0.152635363892727, "grad_norm": 983.6040649414062, "learning_rate": 4.957904024205414e-05, "loss": 182.2857, "step": 18890 }, { "epoch": 0.15271616609701114, "grad_norm": 952.5586547851562, "learning_rate": 4.957775062455933e-05, "loss": 114.4358, "step": 18900 }, { "epoch": 0.15279696830129527, "grad_norm": 1429.8126220703125, "learning_rate": 4.95764590515257e-05, "loss": 155.2205, "step": 18910 }, { "epoch": 0.15287777050557938, "grad_norm": 1141.6102294921875, "learning_rate": 4.957516552305602e-05, "loss": 196.4725, "step": 18920 }, { "epoch": 0.15295857270986352, "grad_norm": 971.5953369140625, "learning_rate": 4.957387003925321e-05, "loss": 143.6367, "step": 18930 }, { "epoch": 0.15303937491414765, "grad_norm": 986.298095703125, "learning_rate": 4.9572572600220323e-05, "loss": 161.2457, "step": 18940 }, { "epoch": 0.1531201771184318, "grad_norm": 618.5545654296875, "learning_rate": 4.957127320606062e-05, "loss": 98.7228, "step": 18950 }, { "epoch": 0.15320097932271592, "grad_norm": 1332.11572265625, "learning_rate": 4.956997185687747e-05, "loss": 119.0759, "step": 18960 }, { "epoch": 0.15328178152700006, "grad_norm": 1110.71923828125, "learning_rate": 4.9568668552774424e-05, "loss": 157.8614, "step": 18970 }, { "epoch": 0.1533625837312842, "grad_norm": 567.2537841796875, "learning_rate": 4.956736329385517e-05, "loss": 127.0969, "step": 18980 }, { "epoch": 0.15344338593556833, "grad_norm": 828.8348999023438, "learning_rate": 4.9566056080223574e-05, "loss": 163.5427, "step": 18990 }, { "epoch": 0.15352418813985247, "grad_norm": 1140.49853515625, "learning_rate": 4.956474691198363e-05, "loss": 132.9715, "step": 19000 }, { "epoch": 0.15360499034413658, "grad_norm": 958.723388671875, "learning_rate": 4.956343578923952e-05, "loss": 136.1955, "step": 19010 }, { "epoch": 0.1536857925484207, "grad_norm": 990.290771484375, "learning_rate": 4.956212271209555e-05, "loss": 133.8689, "step": 19020 }, { "epoch": 0.15376659475270485, "grad_norm": 1795.55908203125, "learning_rate": 4.956080768065621e-05, "loss": 141.3235, "step": 19030 }, { "epoch": 0.15384739695698899, "grad_norm": 794.1259765625, "learning_rate": 4.9559490695026113e-05, "loss": 124.8279, "step": 19040 }, { "epoch": 0.15392819916127312, "grad_norm": 1143.3475341796875, "learning_rate": 4.955817175531005e-05, "loss": 157.4777, "step": 19050 }, { "epoch": 0.15400900136555726, "grad_norm": 725.7691650390625, "learning_rate": 4.9556850861612976e-05, "loss": 152.8616, "step": 19060 }, { "epoch": 0.1540898035698414, "grad_norm": 829.7158813476562, "learning_rate": 4.955552801403998e-05, "loss": 127.6073, "step": 19070 }, { "epoch": 0.15417060577412553, "grad_norm": 836.8492431640625, "learning_rate": 4.9554203212696304e-05, "loss": 128.8572, "step": 19080 }, { "epoch": 0.15425140797840964, "grad_norm": 939.9307250976562, "learning_rate": 4.9552876457687374e-05, "loss": 132.6982, "step": 19090 }, { "epoch": 0.15433221018269377, "grad_norm": 1570.5841064453125, "learning_rate": 4.955154774911875e-05, "loss": 101.3377, "step": 19100 }, { "epoch": 0.1544130123869779, "grad_norm": 608.4269409179688, "learning_rate": 4.955021708709614e-05, "loss": 138.4897, "step": 19110 }, { "epoch": 0.15449381459126205, "grad_norm": 1717.3123779296875, "learning_rate": 4.9548884471725434e-05, "loss": 183.6861, "step": 19120 }, { "epoch": 0.15457461679554618, "grad_norm": 721.2388305664062, "learning_rate": 4.9547549903112654e-05, "loss": 183.5123, "step": 19130 }, { "epoch": 0.15465541899983032, "grad_norm": 1664.793701171875, "learning_rate": 4.954621338136398e-05, "loss": 160.0894, "step": 19140 }, { "epoch": 0.15473622120411445, "grad_norm": 810.6116333007812, "learning_rate": 4.954487490658577e-05, "loss": 150.3457, "step": 19150 }, { "epoch": 0.1548170234083986, "grad_norm": 1055.0728759765625, "learning_rate": 4.95435344788845e-05, "loss": 168.4552, "step": 19160 }, { "epoch": 0.15489782561268273, "grad_norm": 878.2693481445312, "learning_rate": 4.954219209836684e-05, "loss": 123.0568, "step": 19170 }, { "epoch": 0.15497862781696684, "grad_norm": 971.3629760742188, "learning_rate": 4.954084776513957e-05, "loss": 112.7831, "step": 19180 }, { "epoch": 0.15505943002125097, "grad_norm": 1250.1171875, "learning_rate": 4.953950147930969e-05, "loss": 128.6498, "step": 19190 }, { "epoch": 0.1551402322255351, "grad_norm": 1516.2861328125, "learning_rate": 4.9538153240984286e-05, "loss": 135.834, "step": 19200 }, { "epoch": 0.15522103442981924, "grad_norm": 917.4509887695312, "learning_rate": 4.953680305027065e-05, "loss": 199.9036, "step": 19210 }, { "epoch": 0.15530183663410338, "grad_norm": 1195.2305908203125, "learning_rate": 4.9535450907276204e-05, "loss": 167.8621, "step": 19220 }, { "epoch": 0.15538263883838752, "grad_norm": 1185.2828369140625, "learning_rate": 4.953409681210853e-05, "loss": 150.506, "step": 19230 }, { "epoch": 0.15546344104267165, "grad_norm": 1028.5968017578125, "learning_rate": 4.9532740764875377e-05, "loss": 160.7915, "step": 19240 }, { "epoch": 0.1555442432469558, "grad_norm": 1641.87451171875, "learning_rate": 4.953138276568462e-05, "loss": 167.9964, "step": 19250 }, { "epoch": 0.1556250454512399, "grad_norm": 676.5661010742188, "learning_rate": 4.953002281464432e-05, "loss": 168.5508, "step": 19260 }, { "epoch": 0.15570584765552403, "grad_norm": 581.885009765625, "learning_rate": 4.952866091186269e-05, "loss": 138.8725, "step": 19270 }, { "epoch": 0.15578664985980817, "grad_norm": 1236.295166015625, "learning_rate": 4.952729705744808e-05, "loss": 123.4911, "step": 19280 }, { "epoch": 0.1558674520640923, "grad_norm": 888.4049682617188, "learning_rate": 4.9525931251509e-05, "loss": 138.8721, "step": 19290 }, { "epoch": 0.15594825426837644, "grad_norm": 656.3297119140625, "learning_rate": 4.9524563494154145e-05, "loss": 161.5853, "step": 19300 }, { "epoch": 0.15602905647266058, "grad_norm": 899.3927612304688, "learning_rate": 4.952319378549232e-05, "loss": 130.1569, "step": 19310 }, { "epoch": 0.1561098586769447, "grad_norm": 1051.7786865234375, "learning_rate": 4.95218221256325e-05, "loss": 155.7519, "step": 19320 }, { "epoch": 0.15619066088122885, "grad_norm": 351.5675048828125, "learning_rate": 4.952044851468385e-05, "loss": 118.1119, "step": 19330 }, { "epoch": 0.15627146308551298, "grad_norm": 648.6687622070312, "learning_rate": 4.951907295275563e-05, "loss": 186.3808, "step": 19340 }, { "epoch": 0.1563522652897971, "grad_norm": 737.3397827148438, "learning_rate": 4.951769543995731e-05, "loss": 164.0862, "step": 19350 }, { "epoch": 0.15643306749408123, "grad_norm": 1181.345458984375, "learning_rate": 4.951631597639849e-05, "loss": 95.8942, "step": 19360 }, { "epoch": 0.15651386969836537, "grad_norm": 1376.4288330078125, "learning_rate": 4.9514934562188915e-05, "loss": 155.4059, "step": 19370 }, { "epoch": 0.1565946719026495, "grad_norm": 984.8242797851562, "learning_rate": 4.951355119743851e-05, "loss": 136.2228, "step": 19380 }, { "epoch": 0.15667547410693364, "grad_norm": 1276.46142578125, "learning_rate": 4.9512165882257335e-05, "loss": 134.6281, "step": 19390 }, { "epoch": 0.15675627631121777, "grad_norm": 855.7882690429688, "learning_rate": 4.9510778616755616e-05, "loss": 127.1215, "step": 19400 }, { "epoch": 0.1568370785155019, "grad_norm": 1180.2899169921875, "learning_rate": 4.9509389401043735e-05, "loss": 157.9484, "step": 19410 }, { "epoch": 0.15691788071978605, "grad_norm": 920.9578857421875, "learning_rate": 4.950799823523222e-05, "loss": 140.1964, "step": 19420 }, { "epoch": 0.15699868292407018, "grad_norm": 1415.94091796875, "learning_rate": 4.950660511943176e-05, "loss": 135.1266, "step": 19430 }, { "epoch": 0.1570794851283543, "grad_norm": 396.073974609375, "learning_rate": 4.95052100537532e-05, "loss": 144.9773, "step": 19440 }, { "epoch": 0.15716028733263843, "grad_norm": 716.8720092773438, "learning_rate": 4.950381303830755e-05, "loss": 118.7841, "step": 19450 }, { "epoch": 0.15724108953692256, "grad_norm": 798.32666015625, "learning_rate": 4.950241407320594e-05, "loss": 136.819, "step": 19460 }, { "epoch": 0.1573218917412067, "grad_norm": 1627.865966796875, "learning_rate": 4.95010131585597e-05, "loss": 155.6983, "step": 19470 }, { "epoch": 0.15740269394549083, "grad_norm": 2705.7021484375, "learning_rate": 4.9499610294480284e-05, "loss": 151.8552, "step": 19480 }, { "epoch": 0.15748349614977497, "grad_norm": 970.4476928710938, "learning_rate": 4.9498205481079315e-05, "loss": 154.0217, "step": 19490 }, { "epoch": 0.1575642983540591, "grad_norm": 775.9421997070312, "learning_rate": 4.949679871846857e-05, "loss": 171.9853, "step": 19500 }, { "epoch": 0.15764510055834324, "grad_norm": 736.22314453125, "learning_rate": 4.949539000675998e-05, "loss": 115.1917, "step": 19510 }, { "epoch": 0.15772590276262735, "grad_norm": 773.5701293945312, "learning_rate": 4.9493979346065624e-05, "loss": 143.0012, "step": 19520 }, { "epoch": 0.1578067049669115, "grad_norm": 1225.4534912109375, "learning_rate": 4.9492566736497744e-05, "loss": 146.8361, "step": 19530 }, { "epoch": 0.15788750717119562, "grad_norm": 849.607421875, "learning_rate": 4.949115217816873e-05, "loss": 154.6573, "step": 19540 }, { "epoch": 0.15796830937547976, "grad_norm": 1171.7467041015625, "learning_rate": 4.948973567119114e-05, "loss": 143.7283, "step": 19550 }, { "epoch": 0.1580491115797639, "grad_norm": 1184.739013671875, "learning_rate": 4.9488317215677673e-05, "loss": 147.067, "step": 19560 }, { "epoch": 0.15812991378404803, "grad_norm": 671.5252075195312, "learning_rate": 4.948689681174119e-05, "loss": 134.3241, "step": 19570 }, { "epoch": 0.15821071598833217, "grad_norm": 1588.4747314453125, "learning_rate": 4.948547445949471e-05, "loss": 132.0108, "step": 19580 }, { "epoch": 0.1582915181926163, "grad_norm": 1395.05712890625, "learning_rate": 4.94840501590514e-05, "loss": 132.0019, "step": 19590 }, { "epoch": 0.15837232039690044, "grad_norm": 1854.0704345703125, "learning_rate": 4.948262391052458e-05, "loss": 113.3447, "step": 19600 }, { "epoch": 0.15845312260118455, "grad_norm": 1134.071044921875, "learning_rate": 4.948119571402775e-05, "loss": 133.9102, "step": 19610 }, { "epoch": 0.15853392480546868, "grad_norm": 1435.5302734375, "learning_rate": 4.947976556967452e-05, "loss": 142.5358, "step": 19620 }, { "epoch": 0.15861472700975282, "grad_norm": 672.7803344726562, "learning_rate": 4.947833347757869e-05, "loss": 104.7007, "step": 19630 }, { "epoch": 0.15869552921403696, "grad_norm": 597.4839477539062, "learning_rate": 4.9476899437854205e-05, "loss": 134.072, "step": 19640 }, { "epoch": 0.1587763314183211, "grad_norm": 1603.63525390625, "learning_rate": 4.9475463450615175e-05, "loss": 115.9036, "step": 19650 }, { "epoch": 0.15885713362260523, "grad_norm": 663.64306640625, "learning_rate": 4.9474025515975835e-05, "loss": 112.9661, "step": 19660 }, { "epoch": 0.15893793582688936, "grad_norm": 1277.085205078125, "learning_rate": 4.947258563405061e-05, "loss": 138.1168, "step": 19670 }, { "epoch": 0.1590187380311735, "grad_norm": 1482.751708984375, "learning_rate": 4.947114380495406e-05, "loss": 194.949, "step": 19680 }, { "epoch": 0.15909954023545764, "grad_norm": 1446.81787109375, "learning_rate": 4.94697000288009e-05, "loss": 145.5572, "step": 19690 }, { "epoch": 0.15918034243974175, "grad_norm": 932.5813598632812, "learning_rate": 4.946825430570602e-05, "loss": 145.6906, "step": 19700 }, { "epoch": 0.15926114464402588, "grad_norm": 1143.1041259765625, "learning_rate": 4.946680663578443e-05, "loss": 151.2845, "step": 19710 }, { "epoch": 0.15934194684831002, "grad_norm": 1063.902587890625, "learning_rate": 4.9465357019151325e-05, "loss": 155.4715, "step": 19720 }, { "epoch": 0.15942274905259415, "grad_norm": 945.45556640625, "learning_rate": 4.946390545592204e-05, "loss": 188.0311, "step": 19730 }, { "epoch": 0.1595035512568783, "grad_norm": 1733.0693359375, "learning_rate": 4.9462451946212085e-05, "loss": 150.3059, "step": 19740 }, { "epoch": 0.15958435346116243, "grad_norm": 934.3817749023438, "learning_rate": 4.946099649013708e-05, "loss": 115.0981, "step": 19750 }, { "epoch": 0.15966515566544656, "grad_norm": 756.0308227539062, "learning_rate": 4.945953908781286e-05, "loss": 126.3076, "step": 19760 }, { "epoch": 0.1597459578697307, "grad_norm": 646.1735229492188, "learning_rate": 4.945807973935536e-05, "loss": 141.5272, "step": 19770 }, { "epoch": 0.1598267600740148, "grad_norm": 900.1630249023438, "learning_rate": 4.94566184448807e-05, "loss": 169.0281, "step": 19780 }, { "epoch": 0.15990756227829894, "grad_norm": 1312.0203857421875, "learning_rate": 4.945515520450515e-05, "loss": 168.3111, "step": 19790 }, { "epoch": 0.15998836448258308, "grad_norm": 824.4857177734375, "learning_rate": 4.9453690018345144e-05, "loss": 160.6633, "step": 19800 }, { "epoch": 0.16006916668686721, "grad_norm": 606.3312377929688, "learning_rate": 4.945222288651724e-05, "loss": 167.2263, "step": 19810 }, { "epoch": 0.16014996889115135, "grad_norm": 1857.02685546875, "learning_rate": 4.945075380913819e-05, "loss": 190.6399, "step": 19820 }, { "epoch": 0.1602307710954355, "grad_norm": 1485.077392578125, "learning_rate": 4.944928278632487e-05, "loss": 156.7114, "step": 19830 }, { "epoch": 0.16031157329971962, "grad_norm": 824.557373046875, "learning_rate": 4.944780981819433e-05, "loss": 143.3107, "step": 19840 }, { "epoch": 0.16039237550400376, "grad_norm": 987.8336791992188, "learning_rate": 4.944633490486376e-05, "loss": 100.8076, "step": 19850 }, { "epoch": 0.1604731777082879, "grad_norm": 1033.8603515625, "learning_rate": 4.944485804645052e-05, "loss": 162.6909, "step": 19860 }, { "epoch": 0.160553979912572, "grad_norm": 1208.977783203125, "learning_rate": 4.9443379243072094e-05, "loss": 132.4686, "step": 19870 }, { "epoch": 0.16063478211685614, "grad_norm": 719.5220947265625, "learning_rate": 4.944189849484618e-05, "loss": 141.4025, "step": 19880 }, { "epoch": 0.16071558432114028, "grad_norm": 1385.524658203125, "learning_rate": 4.9440415801890566e-05, "loss": 182.8163, "step": 19890 }, { "epoch": 0.1607963865254244, "grad_norm": 1521.5345458984375, "learning_rate": 4.9438931164323236e-05, "loss": 110.8893, "step": 19900 }, { "epoch": 0.16087718872970855, "grad_norm": 931.8245849609375, "learning_rate": 4.9437444582262316e-05, "loss": 141.6139, "step": 19910 }, { "epoch": 0.16095799093399268, "grad_norm": 1338.6199951171875, "learning_rate": 4.9435956055826083e-05, "loss": 167.0462, "step": 19920 }, { "epoch": 0.16103879313827682, "grad_norm": 592.0842895507812, "learning_rate": 4.943446558513297e-05, "loss": 104.6128, "step": 19930 }, { "epoch": 0.16111959534256096, "grad_norm": 765.056640625, "learning_rate": 4.943297317030156e-05, "loss": 136.0432, "step": 19940 }, { "epoch": 0.16120039754684506, "grad_norm": 896.3486328125, "learning_rate": 4.943147881145063e-05, "loss": 152.3156, "step": 19950 }, { "epoch": 0.1612811997511292, "grad_norm": 669.0186157226562, "learning_rate": 4.942998250869904e-05, "loss": 146.4306, "step": 19960 }, { "epoch": 0.16136200195541334, "grad_norm": 738.7496948242188, "learning_rate": 4.9428484262165865e-05, "loss": 167.2415, "step": 19970 }, { "epoch": 0.16144280415969747, "grad_norm": 1352.85009765625, "learning_rate": 4.9426984071970305e-05, "loss": 168.2593, "step": 19980 }, { "epoch": 0.1615236063639816, "grad_norm": 1093.584228515625, "learning_rate": 4.942548193823173e-05, "loss": 186.0099, "step": 19990 }, { "epoch": 0.16160440856826574, "grad_norm": 1222.2086181640625, "learning_rate": 4.942397786106965e-05, "loss": 159.3606, "step": 20000 }, { "epoch": 0.16168521077254988, "grad_norm": 1016.3326416015625, "learning_rate": 4.942247184060375e-05, "loss": 121.1088, "step": 20010 }, { "epoch": 0.16176601297683402, "grad_norm": 760.5179443359375, "learning_rate": 4.942096387695385e-05, "loss": 126.0567, "step": 20020 }, { "epoch": 0.16184681518111815, "grad_norm": 2420.37841796875, "learning_rate": 4.941945397023993e-05, "loss": 142.4873, "step": 20030 }, { "epoch": 0.16192761738540226, "grad_norm": 1585.40234375, "learning_rate": 4.9417942120582114e-05, "loss": 128.2617, "step": 20040 }, { "epoch": 0.1620084195896864, "grad_norm": 1405.9205322265625, "learning_rate": 4.941642832810072e-05, "loss": 123.314, "step": 20050 }, { "epoch": 0.16208922179397053, "grad_norm": 827.2435913085938, "learning_rate": 4.9414912592916185e-05, "loss": 143.6495, "step": 20060 }, { "epoch": 0.16217002399825467, "grad_norm": 742.4790649414062, "learning_rate": 4.9413394915149094e-05, "loss": 200.291, "step": 20070 }, { "epoch": 0.1622508262025388, "grad_norm": 1240.279296875, "learning_rate": 4.9411875294920215e-05, "loss": 165.2704, "step": 20080 }, { "epoch": 0.16233162840682294, "grad_norm": 671.4786987304688, "learning_rate": 4.9410353732350455e-05, "loss": 158.4867, "step": 20090 }, { "epoch": 0.16241243061110708, "grad_norm": 609.6265869140625, "learning_rate": 4.940883022756088e-05, "loss": 164.4025, "step": 20100 }, { "epoch": 0.16249323281539121, "grad_norm": 702.2029418945312, "learning_rate": 4.94073047806727e-05, "loss": 143.9458, "step": 20110 }, { "epoch": 0.16257403501967535, "grad_norm": 658.32763671875, "learning_rate": 4.94057773918073e-05, "loss": 116.2497, "step": 20120 }, { "epoch": 0.16265483722395946, "grad_norm": 618.864990234375, "learning_rate": 4.940424806108619e-05, "loss": 113.3011, "step": 20130 }, { "epoch": 0.1627356394282436, "grad_norm": 646.5340576171875, "learning_rate": 4.9402716788631073e-05, "loss": 209.5348, "step": 20140 }, { "epoch": 0.16281644163252773, "grad_norm": 794.1364135742188, "learning_rate": 4.940118357456377e-05, "loss": 112.0908, "step": 20150 }, { "epoch": 0.16289724383681187, "grad_norm": 1476.8046875, "learning_rate": 4.939964841900627e-05, "loss": 156.0068, "step": 20160 }, { "epoch": 0.162978046041096, "grad_norm": 1192.75146484375, "learning_rate": 4.939811132208073e-05, "loss": 208.6819, "step": 20170 }, { "epoch": 0.16305884824538014, "grad_norm": 1222.39208984375, "learning_rate": 4.939657228390945e-05, "loss": 127.3492, "step": 20180 }, { "epoch": 0.16313965044966428, "grad_norm": 1381.251708984375, "learning_rate": 4.939503130461487e-05, "loss": 141.149, "step": 20190 }, { "epoch": 0.1632204526539484, "grad_norm": 948.350341796875, "learning_rate": 4.9393488384319605e-05, "loss": 146.2455, "step": 20200 }, { "epoch": 0.16330125485823252, "grad_norm": 1100.5667724609375, "learning_rate": 4.939194352314643e-05, "loss": 134.0395, "step": 20210 }, { "epoch": 0.16338205706251666, "grad_norm": 1312.8271484375, "learning_rate": 4.939039672121825e-05, "loss": 162.6504, "step": 20220 }, { "epoch": 0.1634628592668008, "grad_norm": 742.9317626953125, "learning_rate": 4.938884797865814e-05, "loss": 119.811, "step": 20230 }, { "epoch": 0.16354366147108493, "grad_norm": 1479.2552490234375, "learning_rate": 4.938729729558932e-05, "loss": 127.0003, "step": 20240 }, { "epoch": 0.16362446367536906, "grad_norm": 2271.06982421875, "learning_rate": 4.938574467213518e-05, "loss": 180.906, "step": 20250 }, { "epoch": 0.1637052658796532, "grad_norm": 848.5759887695312, "learning_rate": 4.938419010841925e-05, "loss": 108.292, "step": 20260 }, { "epoch": 0.16378606808393734, "grad_norm": 865.2387084960938, "learning_rate": 4.938263360456523e-05, "loss": 146.3953, "step": 20270 }, { "epoch": 0.16386687028822147, "grad_norm": 1448.966552734375, "learning_rate": 4.938107516069694e-05, "loss": 197.9948, "step": 20280 }, { "epoch": 0.1639476724925056, "grad_norm": 856.9296875, "learning_rate": 4.9379514776938405e-05, "loss": 120.4137, "step": 20290 }, { "epoch": 0.16402847469678972, "grad_norm": 1423.47607421875, "learning_rate": 4.9377952453413765e-05, "loss": 147.8874, "step": 20300 }, { "epoch": 0.16410927690107385, "grad_norm": 1550.7271728515625, "learning_rate": 4.9376388190247324e-05, "loss": 153.0022, "step": 20310 }, { "epoch": 0.164190079105358, "grad_norm": 1133.5203857421875, "learning_rate": 4.937482198756355e-05, "loss": 169.8442, "step": 20320 }, { "epoch": 0.16427088130964212, "grad_norm": 921.5442504882812, "learning_rate": 4.937325384548705e-05, "loss": 179.4838, "step": 20330 }, { "epoch": 0.16435168351392626, "grad_norm": 920.5408935546875, "learning_rate": 4.9371683764142615e-05, "loss": 108.6345, "step": 20340 }, { "epoch": 0.1644324857182104, "grad_norm": 1286.8828125, "learning_rate": 4.9370111743655145e-05, "loss": 148.3994, "step": 20350 }, { "epoch": 0.16451328792249453, "grad_norm": 918.86181640625, "learning_rate": 4.9368537784149724e-05, "loss": 139.1599, "step": 20360 }, { "epoch": 0.16459409012677867, "grad_norm": 1502.0557861328125, "learning_rate": 4.936696188575159e-05, "loss": 164.7047, "step": 20370 }, { "epoch": 0.16467489233106278, "grad_norm": 921.6201171875, "learning_rate": 4.9365384048586125e-05, "loss": 146.2348, "step": 20380 }, { "epoch": 0.1647556945353469, "grad_norm": 761.9798583984375, "learning_rate": 4.936380427277888e-05, "loss": 122.8679, "step": 20390 }, { "epoch": 0.16483649673963105, "grad_norm": 1140.500732421875, "learning_rate": 4.936222255845554e-05, "loss": 120.364, "step": 20400 }, { "epoch": 0.16491729894391519, "grad_norm": 1554.974609375, "learning_rate": 4.936063890574196e-05, "loss": 170.4009, "step": 20410 }, { "epoch": 0.16499810114819932, "grad_norm": 756.5431518554688, "learning_rate": 4.935905331476414e-05, "loss": 124.2604, "step": 20420 }, { "epoch": 0.16507890335248346, "grad_norm": 759.2258911132812, "learning_rate": 4.9357465785648247e-05, "loss": 122.2716, "step": 20430 }, { "epoch": 0.1651597055567676, "grad_norm": 1130.5928955078125, "learning_rate": 4.935587631852058e-05, "loss": 120.7589, "step": 20440 }, { "epoch": 0.16524050776105173, "grad_norm": 994.0823364257812, "learning_rate": 4.935428491350761e-05, "loss": 147.596, "step": 20450 }, { "epoch": 0.16532130996533587, "grad_norm": 1129.5496826171875, "learning_rate": 4.9352691570735965e-05, "loss": 175.5725, "step": 20460 }, { "epoch": 0.16540211216961997, "grad_norm": 974.0113525390625, "learning_rate": 4.935109629033242e-05, "loss": 198.5151, "step": 20470 }, { "epoch": 0.1654829143739041, "grad_norm": 1393.72607421875, "learning_rate": 4.93494990724239e-05, "loss": 142.9287, "step": 20480 }, { "epoch": 0.16556371657818825, "grad_norm": 1063.649169921875, "learning_rate": 4.934789991713748e-05, "loss": 123.0873, "step": 20490 }, { "epoch": 0.16564451878247238, "grad_norm": 1238.6739501953125, "learning_rate": 4.9346298824600405e-05, "loss": 120.6401, "step": 20500 }, { "epoch": 0.16572532098675652, "grad_norm": 843.2783203125, "learning_rate": 4.934469579494008e-05, "loss": 121.2858, "step": 20510 }, { "epoch": 0.16580612319104066, "grad_norm": 983.3422241210938, "learning_rate": 4.934309082828402e-05, "loss": 143.3731, "step": 20520 }, { "epoch": 0.1658869253953248, "grad_norm": 959.219970703125, "learning_rate": 4.934148392475996e-05, "loss": 145.4661, "step": 20530 }, { "epoch": 0.16596772759960893, "grad_norm": 733.232666015625, "learning_rate": 4.933987508449572e-05, "loss": 126.4923, "step": 20540 }, { "epoch": 0.16604852980389306, "grad_norm": 853.0502319335938, "learning_rate": 4.933826430761933e-05, "loss": 153.8978, "step": 20550 }, { "epoch": 0.16612933200817717, "grad_norm": 1198.431640625, "learning_rate": 4.933665159425895e-05, "loss": 166.0935, "step": 20560 }, { "epoch": 0.1662101342124613, "grad_norm": 514.8040771484375, "learning_rate": 4.933503694454289e-05, "loss": 85.8809, "step": 20570 }, { "epoch": 0.16629093641674544, "grad_norm": 791.435302734375, "learning_rate": 4.9333420358599624e-05, "loss": 141.1844, "step": 20580 }, { "epoch": 0.16637173862102958, "grad_norm": 1096.3251953125, "learning_rate": 4.9331801836557776e-05, "loss": 141.8443, "step": 20590 }, { "epoch": 0.16645254082531372, "grad_norm": 4899.19921875, "learning_rate": 4.9330181378546124e-05, "loss": 176.5427, "step": 20600 }, { "epoch": 0.16653334302959785, "grad_norm": 453.80072021484375, "learning_rate": 4.93285589846936e-05, "loss": 143.0679, "step": 20610 }, { "epoch": 0.166614145233882, "grad_norm": 1067.961669921875, "learning_rate": 4.9326934655129295e-05, "loss": 125.034, "step": 20620 }, { "epoch": 0.16669494743816612, "grad_norm": 674.2916259765625, "learning_rate": 4.932530838998244e-05, "loss": 121.1949, "step": 20630 }, { "epoch": 0.16677574964245023, "grad_norm": 2042.013671875, "learning_rate": 4.9323680189382434e-05, "loss": 248.2044, "step": 20640 }, { "epoch": 0.16685655184673437, "grad_norm": 972.5521240234375, "learning_rate": 4.932205005345882e-05, "loss": 152.445, "step": 20650 }, { "epoch": 0.1669373540510185, "grad_norm": 659.5952758789062, "learning_rate": 4.9320417982341313e-05, "loss": 123.7578, "step": 20660 }, { "epoch": 0.16701815625530264, "grad_norm": 1101.409912109375, "learning_rate": 4.9318783976159765e-05, "loss": 157.3332, "step": 20670 }, { "epoch": 0.16709895845958678, "grad_norm": 1593.623291015625, "learning_rate": 4.931714803504418e-05, "loss": 161.4305, "step": 20680 }, { "epoch": 0.1671797606638709, "grad_norm": 785.7627563476562, "learning_rate": 4.9315510159124734e-05, "loss": 133.448, "step": 20690 }, { "epoch": 0.16726056286815505, "grad_norm": 1091.3284912109375, "learning_rate": 4.931387034853173e-05, "loss": 167.9184, "step": 20700 }, { "epoch": 0.16734136507243919, "grad_norm": 1226.1614990234375, "learning_rate": 4.931222860339565e-05, "loss": 127.1648, "step": 20710 }, { "epoch": 0.16742216727672332, "grad_norm": 1060.7408447265625, "learning_rate": 4.931058492384712e-05, "loss": 131.7696, "step": 20720 }, { "epoch": 0.16750296948100743, "grad_norm": 821.4359130859375, "learning_rate": 4.9308939310016916e-05, "loss": 135.4715, "step": 20730 }, { "epoch": 0.16758377168529157, "grad_norm": 2822.77197265625, "learning_rate": 4.930729176203598e-05, "loss": 169.724, "step": 20740 }, { "epoch": 0.1676645738895757, "grad_norm": 814.9628295898438, "learning_rate": 4.930564228003538e-05, "loss": 111.3484, "step": 20750 }, { "epoch": 0.16774537609385984, "grad_norm": 1205.373046875, "learning_rate": 4.930399086414638e-05, "loss": 125.1682, "step": 20760 }, { "epoch": 0.16782617829814397, "grad_norm": 874.1157836914062, "learning_rate": 4.9302337514500374e-05, "loss": 145.6209, "step": 20770 }, { "epoch": 0.1679069805024281, "grad_norm": 1192.6041259765625, "learning_rate": 4.930068223122889e-05, "loss": 154.289, "step": 20780 }, { "epoch": 0.16798778270671225, "grad_norm": 1004.2535400390625, "learning_rate": 4.929902501446366e-05, "loss": 183.3328, "step": 20790 }, { "epoch": 0.16806858491099638, "grad_norm": 996.1227416992188, "learning_rate": 4.929736586433652e-05, "loss": 133.2687, "step": 20800 }, { "epoch": 0.16814938711528052, "grad_norm": 676.2308349609375, "learning_rate": 4.929570478097949e-05, "loss": 149.5055, "step": 20810 }, { "epoch": 0.16823018931956463, "grad_norm": 1042.8621826171875, "learning_rate": 4.9294041764524726e-05, "loss": 126.4842, "step": 20820 }, { "epoch": 0.16831099152384876, "grad_norm": 1661.6201171875, "learning_rate": 4.9292376815104566e-05, "loss": 133.8418, "step": 20830 }, { "epoch": 0.1683917937281329, "grad_norm": 1082.744140625, "learning_rate": 4.929070993285146e-05, "loss": 175.319, "step": 20840 }, { "epoch": 0.16847259593241704, "grad_norm": 943.1874389648438, "learning_rate": 4.928904111789805e-05, "loss": 122.9224, "step": 20850 }, { "epoch": 0.16855339813670117, "grad_norm": 582.3085327148438, "learning_rate": 4.9287370370377106e-05, "loss": 154.6284, "step": 20860 }, { "epoch": 0.1686342003409853, "grad_norm": 363.685546875, "learning_rate": 4.928569769042156e-05, "loss": 159.0539, "step": 20870 }, { "epoch": 0.16871500254526944, "grad_norm": 1049.855224609375, "learning_rate": 4.928402307816451e-05, "loss": 140.1967, "step": 20880 }, { "epoch": 0.16879580474955358, "grad_norm": 912.501708984375, "learning_rate": 4.92823465337392e-05, "loss": 124.3262, "step": 20890 }, { "epoch": 0.1688766069538377, "grad_norm": 869.2931518554688, "learning_rate": 4.9280668057279014e-05, "loss": 125.99, "step": 20900 }, { "epoch": 0.16895740915812182, "grad_norm": 834.9353637695312, "learning_rate": 4.9278987648917495e-05, "loss": 121.5135, "step": 20910 }, { "epoch": 0.16903821136240596, "grad_norm": 933.3566284179688, "learning_rate": 4.9277305308788365e-05, "loss": 135.7604, "step": 20920 }, { "epoch": 0.1691190135666901, "grad_norm": 6561.35693359375, "learning_rate": 4.927562103702547e-05, "loss": 169.7086, "step": 20930 }, { "epoch": 0.16919981577097423, "grad_norm": 1966.267822265625, "learning_rate": 4.92739348337628e-05, "loss": 135.3359, "step": 20940 }, { "epoch": 0.16928061797525837, "grad_norm": 1062.4327392578125, "learning_rate": 4.927224669913456e-05, "loss": 130.8302, "step": 20950 }, { "epoch": 0.1693614201795425, "grad_norm": 1002.7949829101562, "learning_rate": 4.927055663327503e-05, "loss": 152.034, "step": 20960 }, { "epoch": 0.16944222238382664, "grad_norm": 1231.685302734375, "learning_rate": 4.92688646363187e-05, "loss": 125.2035, "step": 20970 }, { "epoch": 0.16952302458811078, "grad_norm": 1325.4017333984375, "learning_rate": 4.9267170708400196e-05, "loss": 153.9901, "step": 20980 }, { "epoch": 0.16960382679239489, "grad_norm": 1044.0921630859375, "learning_rate": 4.9265474849654284e-05, "loss": 160.7131, "step": 20990 }, { "epoch": 0.16968462899667902, "grad_norm": 706.5172119140625, "learning_rate": 4.92637770602159e-05, "loss": 150.8885, "step": 21000 }, { "epoch": 0.16976543120096316, "grad_norm": 443.4945373535156, "learning_rate": 4.9262077340220135e-05, "loss": 137.6814, "step": 21010 }, { "epoch": 0.1698462334052473, "grad_norm": 3524.959228515625, "learning_rate": 4.926037568980223e-05, "loss": 131.1957, "step": 21020 }, { "epoch": 0.16992703560953143, "grad_norm": 1061.660888671875, "learning_rate": 4.925867210909756e-05, "loss": 109.0972, "step": 21030 }, { "epoch": 0.17000783781381557, "grad_norm": 977.8228759765625, "learning_rate": 4.925696659824169e-05, "loss": 140.1199, "step": 21040 }, { "epoch": 0.1700886400180997, "grad_norm": 886.8487548828125, "learning_rate": 4.925525915737031e-05, "loss": 123.5597, "step": 21050 }, { "epoch": 0.17016944222238384, "grad_norm": 2220.47314453125, "learning_rate": 4.925354978661928e-05, "loss": 116.6628, "step": 21060 }, { "epoch": 0.17025024442666795, "grad_norm": 750.9844970703125, "learning_rate": 4.925183848612459e-05, "loss": 150.5052, "step": 21070 }, { "epoch": 0.17033104663095208, "grad_norm": 1006.42333984375, "learning_rate": 4.9250125256022416e-05, "loss": 119.5853, "step": 21080 }, { "epoch": 0.17041184883523622, "grad_norm": 1162.7249755859375, "learning_rate": 4.9248410096449075e-05, "loss": 137.6988, "step": 21090 }, { "epoch": 0.17049265103952035, "grad_norm": 666.7645263671875, "learning_rate": 4.9246693007541024e-05, "loss": 181.7554, "step": 21100 }, { "epoch": 0.1705734532438045, "grad_norm": 1177.0545654296875, "learning_rate": 4.9244973989434886e-05, "loss": 157.994, "step": 21110 }, { "epoch": 0.17065425544808863, "grad_norm": 1034.1756591796875, "learning_rate": 4.924325304226745e-05, "loss": 135.4122, "step": 21120 }, { "epoch": 0.17073505765237276, "grad_norm": 1536.7457275390625, "learning_rate": 4.9241530166175614e-05, "loss": 153.4128, "step": 21130 }, { "epoch": 0.1708158598566569, "grad_norm": 881.2988891601562, "learning_rate": 4.9239805361296485e-05, "loss": 161.9859, "step": 21140 }, { "epoch": 0.17089666206094103, "grad_norm": 1522.52197265625, "learning_rate": 4.923807862776728e-05, "loss": 178.6479, "step": 21150 }, { "epoch": 0.17097746426522514, "grad_norm": 658.5277709960938, "learning_rate": 4.9236349965725406e-05, "loss": 130.6969, "step": 21160 }, { "epoch": 0.17105826646950928, "grad_norm": 865.1202392578125, "learning_rate": 4.9234619375308396e-05, "loss": 125.0552, "step": 21170 }, { "epoch": 0.17113906867379342, "grad_norm": 1105.9573974609375, "learning_rate": 4.9232886856653936e-05, "loss": 148.4609, "step": 21180 }, { "epoch": 0.17121987087807755, "grad_norm": 762.8492431640625, "learning_rate": 4.923115240989989e-05, "loss": 177.1173, "step": 21190 }, { "epoch": 0.1713006730823617, "grad_norm": 977.0111083984375, "learning_rate": 4.922941603518425e-05, "loss": 131.6727, "step": 21200 }, { "epoch": 0.17138147528664582, "grad_norm": 864.1790771484375, "learning_rate": 4.922767773264517e-05, "loss": 153.2473, "step": 21210 }, { "epoch": 0.17146227749092996, "grad_norm": 576.5042724609375, "learning_rate": 4.9225937502420974e-05, "loss": 142.9099, "step": 21220 }, { "epoch": 0.1715430796952141, "grad_norm": 826.1639404296875, "learning_rate": 4.9224195344650105e-05, "loss": 165.9666, "step": 21230 }, { "epoch": 0.17162388189949823, "grad_norm": 1021.5435180664062, "learning_rate": 4.922245125947119e-05, "loss": 131.2517, "step": 21240 }, { "epoch": 0.17170468410378234, "grad_norm": 750.6788330078125, "learning_rate": 4.9220705247022985e-05, "loss": 128.4156, "step": 21250 }, { "epoch": 0.17178548630806648, "grad_norm": 1202.5364990234375, "learning_rate": 4.921895730744443e-05, "loss": 165.3158, "step": 21260 }, { "epoch": 0.1718662885123506, "grad_norm": 695.836181640625, "learning_rate": 4.921720744087459e-05, "loss": 108.6388, "step": 21270 }, { "epoch": 0.17194709071663475, "grad_norm": 658.8826904296875, "learning_rate": 4.921545564745269e-05, "loss": 134.6856, "step": 21280 }, { "epoch": 0.17202789292091888, "grad_norm": 2134.118896484375, "learning_rate": 4.9213701927318134e-05, "loss": 147.9883, "step": 21290 }, { "epoch": 0.17210869512520302, "grad_norm": 981.736083984375, "learning_rate": 4.921194628061043e-05, "loss": 134.1546, "step": 21300 }, { "epoch": 0.17218949732948716, "grad_norm": 619.3984985351562, "learning_rate": 4.921018870746928e-05, "loss": 119.4398, "step": 21310 }, { "epoch": 0.1722702995337713, "grad_norm": 710.1690673828125, "learning_rate": 4.9208429208034525e-05, "loss": 145.9413, "step": 21320 }, { "epoch": 0.1723511017380554, "grad_norm": 1467.351806640625, "learning_rate": 4.920666778244616e-05, "loss": 159.9146, "step": 21330 }, { "epoch": 0.17243190394233954, "grad_norm": 862.062255859375, "learning_rate": 4.920490443084433e-05, "loss": 129.9643, "step": 21340 }, { "epoch": 0.17251270614662367, "grad_norm": 1797.9549560546875, "learning_rate": 4.920313915336934e-05, "loss": 144.6173, "step": 21350 }, { "epoch": 0.1725935083509078, "grad_norm": 1055.6019287109375, "learning_rate": 4.920137195016165e-05, "loss": 156.1974, "step": 21360 }, { "epoch": 0.17267431055519195, "grad_norm": 8324.7998046875, "learning_rate": 4.919960282136185e-05, "loss": 164.71, "step": 21370 }, { "epoch": 0.17275511275947608, "grad_norm": 971.8815307617188, "learning_rate": 4.919783176711074e-05, "loss": 153.0036, "step": 21380 }, { "epoch": 0.17283591496376022, "grad_norm": 988.0467529296875, "learning_rate": 4.9196058787549184e-05, "loss": 152.4217, "step": 21390 }, { "epoch": 0.17291671716804435, "grad_norm": 570.156494140625, "learning_rate": 4.919428388281829e-05, "loss": 127.8879, "step": 21400 }, { "epoch": 0.1729975193723285, "grad_norm": 1578.5853271484375, "learning_rate": 4.9192507053059255e-05, "loss": 148.3654, "step": 21410 }, { "epoch": 0.1730783215766126, "grad_norm": 1456.0301513671875, "learning_rate": 4.919072829841347e-05, "loss": 178.0747, "step": 21420 }, { "epoch": 0.17315912378089673, "grad_norm": 588.3489990234375, "learning_rate": 4.918894761902245e-05, "loss": 143.8035, "step": 21430 }, { "epoch": 0.17323992598518087, "grad_norm": 927.64404296875, "learning_rate": 4.918716501502789e-05, "loss": 126.7022, "step": 21440 }, { "epoch": 0.173320728189465, "grad_norm": 1143.9464111328125, "learning_rate": 4.9185380486571595e-05, "loss": 79.5578, "step": 21450 }, { "epoch": 0.17340153039374914, "grad_norm": 838.281494140625, "learning_rate": 4.918359403379559e-05, "loss": 177.1575, "step": 21460 }, { "epoch": 0.17348233259803328, "grad_norm": 1176.1307373046875, "learning_rate": 4.918180565684198e-05, "loss": 151.7111, "step": 21470 }, { "epoch": 0.17356313480231741, "grad_norm": 1099.8289794921875, "learning_rate": 4.9180015355853084e-05, "loss": 152.3932, "step": 21480 }, { "epoch": 0.17364393700660155, "grad_norm": 1128.26123046875, "learning_rate": 4.917822313097134e-05, "loss": 116.6667, "step": 21490 }, { "epoch": 0.17372473921088566, "grad_norm": 563.2874755859375, "learning_rate": 4.917642898233933e-05, "loss": 142.594, "step": 21500 }, { "epoch": 0.1738055414151698, "grad_norm": 1359.58984375, "learning_rate": 4.917463291009983e-05, "loss": 133.1546, "step": 21510 }, { "epoch": 0.17388634361945393, "grad_norm": 848.3027954101562, "learning_rate": 4.917283491439574e-05, "loss": 152.585, "step": 21520 }, { "epoch": 0.17396714582373807, "grad_norm": 857.1946411132812, "learning_rate": 4.91710349953701e-05, "loss": 178.7701, "step": 21530 }, { "epoch": 0.1740479480280222, "grad_norm": 1514.6451416015625, "learning_rate": 4.916923315316615e-05, "loss": 165.9126, "step": 21540 }, { "epoch": 0.17412875023230634, "grad_norm": 1154.829833984375, "learning_rate": 4.916742938792723e-05, "loss": 129.4643, "step": 21550 }, { "epoch": 0.17420955243659048, "grad_norm": 1806.4046630859375, "learning_rate": 4.9165623699796875e-05, "loss": 131.133, "step": 21560 }, { "epoch": 0.1742903546408746, "grad_norm": 2297.58642578125, "learning_rate": 4.916381608891874e-05, "loss": 181.9885, "step": 21570 }, { "epoch": 0.17437115684515875, "grad_norm": 889.8167724609375, "learning_rate": 4.916200655543667e-05, "loss": 124.0278, "step": 21580 }, { "epoch": 0.17445195904944286, "grad_norm": 652.2991333007812, "learning_rate": 4.916019509949461e-05, "loss": 129.1368, "step": 21590 }, { "epoch": 0.174532761253727, "grad_norm": 2002.193603515625, "learning_rate": 4.915838172123671e-05, "loss": 156.8969, "step": 21600 }, { "epoch": 0.17461356345801113, "grad_norm": 1632.37646484375, "learning_rate": 4.915656642080726e-05, "loss": 134.3347, "step": 21610 }, { "epoch": 0.17469436566229526, "grad_norm": 1093.335205078125, "learning_rate": 4.915474919835067e-05, "loss": 134.8566, "step": 21620 }, { "epoch": 0.1747751678665794, "grad_norm": 1129.8870849609375, "learning_rate": 4.915293005401155e-05, "loss": 140.7855, "step": 21630 }, { "epoch": 0.17485597007086354, "grad_norm": 1071.5859375, "learning_rate": 4.915110898793462e-05, "loss": 164.049, "step": 21640 }, { "epoch": 0.17493677227514767, "grad_norm": 729.1155395507812, "learning_rate": 4.9149286000264805e-05, "loss": 113.2319, "step": 21650 }, { "epoch": 0.1750175744794318, "grad_norm": 753.5103149414062, "learning_rate": 4.9147461091147125e-05, "loss": 102.7337, "step": 21660 }, { "epoch": 0.17509837668371594, "grad_norm": 531.7659301757812, "learning_rate": 4.914563426072678e-05, "loss": 149.2153, "step": 21670 }, { "epoch": 0.17517917888800005, "grad_norm": 1287.9884033203125, "learning_rate": 4.914380550914914e-05, "loss": 146.6886, "step": 21680 }, { "epoch": 0.1752599810922842, "grad_norm": 1483.9490966796875, "learning_rate": 4.9141974836559704e-05, "loss": 157.0422, "step": 21690 }, { "epoch": 0.17534078329656833, "grad_norm": 899.7554931640625, "learning_rate": 4.9140142243104116e-05, "loss": 149.8844, "step": 21700 }, { "epoch": 0.17542158550085246, "grad_norm": 1013.6568603515625, "learning_rate": 4.91383077289282e-05, "loss": 99.9814, "step": 21710 }, { "epoch": 0.1755023877051366, "grad_norm": 1174.3260498046875, "learning_rate": 4.9136471294177924e-05, "loss": 138.6684, "step": 21720 }, { "epoch": 0.17558318990942073, "grad_norm": 1020.420166015625, "learning_rate": 4.913463293899939e-05, "loss": 165.4781, "step": 21730 }, { "epoch": 0.17566399211370487, "grad_norm": 1076.771728515625, "learning_rate": 4.913279266353888e-05, "loss": 138.3935, "step": 21740 }, { "epoch": 0.175744794317989, "grad_norm": 1392.3963623046875, "learning_rate": 4.9130950467942814e-05, "loss": 130.0977, "step": 21750 }, { "epoch": 0.17582559652227311, "grad_norm": 593.2973022460938, "learning_rate": 4.912910635235777e-05, "loss": 116.5755, "step": 21760 }, { "epoch": 0.17590639872655725, "grad_norm": 1103.0938720703125, "learning_rate": 4.9127260316930466e-05, "loss": 138.2039, "step": 21770 }, { "epoch": 0.1759872009308414, "grad_norm": 737.562744140625, "learning_rate": 4.912541236180779e-05, "loss": 122.1219, "step": 21780 }, { "epoch": 0.17606800313512552, "grad_norm": 860.9588623046875, "learning_rate": 4.9123562487136774e-05, "loss": 153.3838, "step": 21790 }, { "epoch": 0.17614880533940966, "grad_norm": 1145.1170654296875, "learning_rate": 4.912171069306459e-05, "loss": 123.9352, "step": 21800 }, { "epoch": 0.1762296075436938, "grad_norm": 940.3277587890625, "learning_rate": 4.911985697973861e-05, "loss": 142.1018, "step": 21810 }, { "epoch": 0.17631040974797793, "grad_norm": 815.0631713867188, "learning_rate": 4.911800134730629e-05, "loss": 151.9103, "step": 21820 }, { "epoch": 0.17639121195226207, "grad_norm": 1341.9578857421875, "learning_rate": 4.9116143795915295e-05, "loss": 146.6557, "step": 21830 }, { "epoch": 0.1764720141565462, "grad_norm": 861.9169311523438, "learning_rate": 4.9114284325713416e-05, "loss": 131.5514, "step": 21840 }, { "epoch": 0.1765528163608303, "grad_norm": 2296.96484375, "learning_rate": 4.91124229368486e-05, "loss": 127.922, "step": 21850 }, { "epoch": 0.17663361856511445, "grad_norm": 3126.313232421875, "learning_rate": 4.911055962946896e-05, "loss": 178.3461, "step": 21860 }, { "epoch": 0.17671442076939858, "grad_norm": 1112.87646484375, "learning_rate": 4.910869440372274e-05, "loss": 131.921, "step": 21870 }, { "epoch": 0.17679522297368272, "grad_norm": 1277.1387939453125, "learning_rate": 4.910682725975835e-05, "loss": 172.0795, "step": 21880 }, { "epoch": 0.17687602517796686, "grad_norm": 1765.8804931640625, "learning_rate": 4.910495819772434e-05, "loss": 116.7105, "step": 21890 }, { "epoch": 0.176956827382251, "grad_norm": 1198.19384765625, "learning_rate": 4.9103087217769445e-05, "loss": 145.228, "step": 21900 }, { "epoch": 0.17703762958653513, "grad_norm": 926.2941284179688, "learning_rate": 4.910121432004252e-05, "loss": 161.9553, "step": 21910 }, { "epoch": 0.17711843179081926, "grad_norm": 759.7756958007812, "learning_rate": 4.9099339504692573e-05, "loss": 113.5963, "step": 21920 }, { "epoch": 0.1771992339951034, "grad_norm": 816.7711181640625, "learning_rate": 4.909746277186879e-05, "loss": 145.3244, "step": 21930 }, { "epoch": 0.1772800361993875, "grad_norm": 1038.8468017578125, "learning_rate": 4.909558412172047e-05, "loss": 175.112, "step": 21940 }, { "epoch": 0.17736083840367164, "grad_norm": 1387.3388671875, "learning_rate": 4.909370355439712e-05, "loss": 124.688, "step": 21950 }, { "epoch": 0.17744164060795578, "grad_norm": 666.5802612304688, "learning_rate": 4.909182107004835e-05, "loss": 85.6291, "step": 21960 }, { "epoch": 0.17752244281223992, "grad_norm": 1517.434326171875, "learning_rate": 4.908993666882395e-05, "loss": 125.7463, "step": 21970 }, { "epoch": 0.17760324501652405, "grad_norm": 746.6646118164062, "learning_rate": 4.9088050350873835e-05, "loss": 143.7797, "step": 21980 }, { "epoch": 0.1776840472208082, "grad_norm": 838.4737548828125, "learning_rate": 4.9086162116348114e-05, "loss": 120.7253, "step": 21990 }, { "epoch": 0.17776484942509233, "grad_norm": 1266.503662109375, "learning_rate": 4.9084271965397014e-05, "loss": 141.1467, "step": 22000 }, { "epoch": 0.17784565162937646, "grad_norm": 1920.794677734375, "learning_rate": 4.9082379898170914e-05, "loss": 143.4987, "step": 22010 }, { "epoch": 0.17792645383366057, "grad_norm": 1311.13720703125, "learning_rate": 4.908048591482038e-05, "loss": 152.7263, "step": 22020 }, { "epoch": 0.1780072560379447, "grad_norm": 1175.812744140625, "learning_rate": 4.9078590015496096e-05, "loss": 129.1362, "step": 22030 }, { "epoch": 0.17808805824222884, "grad_norm": 1120.2232666015625, "learning_rate": 4.907669220034891e-05, "loss": 138.7439, "step": 22040 }, { "epoch": 0.17816886044651298, "grad_norm": 1485.8729248046875, "learning_rate": 4.9074792469529815e-05, "loss": 274.2296, "step": 22050 }, { "epoch": 0.1782496626507971, "grad_norm": 969.6702270507812, "learning_rate": 4.9072890823189974e-05, "loss": 119.1664, "step": 22060 }, { "epoch": 0.17833046485508125, "grad_norm": 941.8471069335938, "learning_rate": 4.907098726148069e-05, "loss": 160.9961, "step": 22070 }, { "epoch": 0.17841126705936539, "grad_norm": 1140.767333984375, "learning_rate": 4.906908178455342e-05, "loss": 145.0228, "step": 22080 }, { "epoch": 0.17849206926364952, "grad_norm": 715.76904296875, "learning_rate": 4.9067174392559776e-05, "loss": 103.8144, "step": 22090 }, { "epoch": 0.17857287146793366, "grad_norm": 1049.420654296875, "learning_rate": 4.906526508565152e-05, "loss": 124.3415, "step": 22100 }, { "epoch": 0.17865367367221777, "grad_norm": 1215.5096435546875, "learning_rate": 4.9063353863980565e-05, "loss": 182.3524, "step": 22110 }, { "epoch": 0.1787344758765019, "grad_norm": 1063.1181640625, "learning_rate": 4.906144072769898e-05, "loss": 122.475, "step": 22120 }, { "epoch": 0.17881527808078604, "grad_norm": 1125.4080810546875, "learning_rate": 4.9059525676958986e-05, "loss": 139.1991, "step": 22130 }, { "epoch": 0.17889608028507017, "grad_norm": 1022.3780517578125, "learning_rate": 4.905760871191295e-05, "loss": 195.8249, "step": 22140 }, { "epoch": 0.1789768824893543, "grad_norm": 1560.5745849609375, "learning_rate": 4.9055689832713396e-05, "loss": 125.5297, "step": 22150 }, { "epoch": 0.17905768469363845, "grad_norm": 5475.33447265625, "learning_rate": 4.9053769039513006e-05, "loss": 148.3278, "step": 22160 }, { "epoch": 0.17913848689792258, "grad_norm": 737.4940795898438, "learning_rate": 4.90518463324646e-05, "loss": 140.7979, "step": 22170 }, { "epoch": 0.17921928910220672, "grad_norm": 627.595458984375, "learning_rate": 4.904992171172117e-05, "loss": 157.4451, "step": 22180 }, { "epoch": 0.17930009130649083, "grad_norm": 734.084228515625, "learning_rate": 4.904799517743585e-05, "loss": 139.5384, "step": 22190 }, { "epoch": 0.17938089351077496, "grad_norm": 1306.1624755859375, "learning_rate": 4.9046066729761905e-05, "loss": 118.6943, "step": 22200 }, { "epoch": 0.1794616957150591, "grad_norm": 1220.8062744140625, "learning_rate": 4.904413636885279e-05, "loss": 168.1177, "step": 22210 }, { "epoch": 0.17954249791934324, "grad_norm": 1183.083251953125, "learning_rate": 4.90422040948621e-05, "loss": 158.66, "step": 22220 }, { "epoch": 0.17962330012362737, "grad_norm": 1026.29345703125, "learning_rate": 4.904026990794356e-05, "loss": 128.3199, "step": 22230 }, { "epoch": 0.1797041023279115, "grad_norm": 771.4765625, "learning_rate": 4.9038333808251074e-05, "loss": 111.4788, "step": 22240 }, { "epoch": 0.17978490453219564, "grad_norm": 1059.020751953125, "learning_rate": 4.90363957959387e-05, "loss": 134.2409, "step": 22250 }, { "epoch": 0.17986570673647978, "grad_norm": 1103.791748046875, "learning_rate": 4.903445587116061e-05, "loss": 170.8439, "step": 22260 }, { "epoch": 0.17994650894076392, "grad_norm": 622.0547485351562, "learning_rate": 4.9032514034071175e-05, "loss": 122.9244, "step": 22270 }, { "epoch": 0.18002731114504802, "grad_norm": 958.7100830078125, "learning_rate": 4.903057028482489e-05, "loss": 99.9289, "step": 22280 }, { "epoch": 0.18010811334933216, "grad_norm": 996.9059448242188, "learning_rate": 4.902862462357641e-05, "loss": 164.1096, "step": 22290 }, { "epoch": 0.1801889155536163, "grad_norm": 481.1267395019531, "learning_rate": 4.9026677050480555e-05, "loss": 118.1814, "step": 22300 }, { "epoch": 0.18026971775790043, "grad_norm": 1068.948486328125, "learning_rate": 4.9024727565692274e-05, "loss": 144.3041, "step": 22310 }, { "epoch": 0.18035051996218457, "grad_norm": 997.0437622070312, "learning_rate": 4.9022776169366666e-05, "loss": 155.9985, "step": 22320 }, { "epoch": 0.1804313221664687, "grad_norm": 1654.6068115234375, "learning_rate": 4.902082286165902e-05, "loss": 164.7646, "step": 22330 }, { "epoch": 0.18051212437075284, "grad_norm": 3154.902099609375, "learning_rate": 4.901886764272474e-05, "loss": 98.5395, "step": 22340 }, { "epoch": 0.18059292657503698, "grad_norm": 2299.2890625, "learning_rate": 4.901691051271939e-05, "loss": 146.594, "step": 22350 }, { "epoch": 0.1806737287793211, "grad_norm": 1197.87451171875, "learning_rate": 4.90149514717987e-05, "loss": 145.7994, "step": 22360 }, { "epoch": 0.18075453098360522, "grad_norm": 1042.1507568359375, "learning_rate": 4.901299052011852e-05, "loss": 144.4632, "step": 22370 }, { "epoch": 0.18083533318788936, "grad_norm": 1308.17041015625, "learning_rate": 4.90110276578349e-05, "loss": 119.5589, "step": 22380 }, { "epoch": 0.1809161353921735, "grad_norm": 904.1051025390625, "learning_rate": 4.900906288510401e-05, "loss": 116.1322, "step": 22390 }, { "epoch": 0.18099693759645763, "grad_norm": 807.6878051757812, "learning_rate": 4.900709620208216e-05, "loss": 179.056, "step": 22400 }, { "epoch": 0.18107773980074177, "grad_norm": 644.4304809570312, "learning_rate": 4.900512760892585e-05, "loss": 110.5187, "step": 22410 }, { "epoch": 0.1811585420050259, "grad_norm": 924.395263671875, "learning_rate": 4.9003157105791706e-05, "loss": 116.3889, "step": 22420 }, { "epoch": 0.18123934420931004, "grad_norm": 1086.564208984375, "learning_rate": 4.9001184692836505e-05, "loss": 121.7575, "step": 22430 }, { "epoch": 0.18132014641359417, "grad_norm": 876.027587890625, "learning_rate": 4.8999210370217194e-05, "loss": 159.8317, "step": 22440 }, { "epoch": 0.18140094861787828, "grad_norm": 1000.0255126953125, "learning_rate": 4.899723413809085e-05, "loss": 139.3787, "step": 22450 }, { "epoch": 0.18148175082216242, "grad_norm": 2290.170654296875, "learning_rate": 4.899525599661472e-05, "loss": 97.0173, "step": 22460 }, { "epoch": 0.18156255302644655, "grad_norm": 1271.3829345703125, "learning_rate": 4.899327594594619e-05, "loss": 131.2777, "step": 22470 }, { "epoch": 0.1816433552307307, "grad_norm": 907.057861328125, "learning_rate": 4.899129398624281e-05, "loss": 101.1429, "step": 22480 }, { "epoch": 0.18172415743501483, "grad_norm": 1061.6846923828125, "learning_rate": 4.898931011766228e-05, "loss": 107.645, "step": 22490 }, { "epoch": 0.18180495963929896, "grad_norm": 1035.3082275390625, "learning_rate": 4.898732434036244e-05, "loss": 121.6947, "step": 22500 }, { "epoch": 0.1818857618435831, "grad_norm": 1202.3555908203125, "learning_rate": 4.898533665450128e-05, "loss": 162.0105, "step": 22510 }, { "epoch": 0.18196656404786724, "grad_norm": 1121.3282470703125, "learning_rate": 4.898334706023696e-05, "loss": 113.9376, "step": 22520 }, { "epoch": 0.18204736625215137, "grad_norm": 1393.017578125, "learning_rate": 4.8981355557727796e-05, "loss": 139.8284, "step": 22530 }, { "epoch": 0.18212816845643548, "grad_norm": 2622.062744140625, "learning_rate": 4.897936214713222e-05, "loss": 168.307, "step": 22540 }, { "epoch": 0.18220897066071962, "grad_norm": 804.2191772460938, "learning_rate": 4.897736682860885e-05, "loss": 146.0737, "step": 22550 }, { "epoch": 0.18228977286500375, "grad_norm": 1040.8131103515625, "learning_rate": 4.897536960231644e-05, "loss": 157.3722, "step": 22560 }, { "epoch": 0.1823705750692879, "grad_norm": 677.9693603515625, "learning_rate": 4.89733704684139e-05, "loss": 103.1337, "step": 22570 }, { "epoch": 0.18245137727357202, "grad_norm": 1318.36376953125, "learning_rate": 4.89713694270603e-05, "loss": 141.4517, "step": 22580 }, { "epoch": 0.18253217947785616, "grad_norm": 1170.07861328125, "learning_rate": 4.8969366478414854e-05, "loss": 108.5731, "step": 22590 }, { "epoch": 0.1826129816821403, "grad_norm": 1243.060791015625, "learning_rate": 4.896736162263691e-05, "loss": 173.4797, "step": 22600 }, { "epoch": 0.18269378388642443, "grad_norm": 624.64111328125, "learning_rate": 4.8965354859886006e-05, "loss": 103.4098, "step": 22610 }, { "epoch": 0.18277458609070854, "grad_norm": 982.3737182617188, "learning_rate": 4.89633461903218e-05, "loss": 122.2358, "step": 22620 }, { "epoch": 0.18285538829499268, "grad_norm": 1259.17578125, "learning_rate": 4.8961335614104115e-05, "loss": 124.4892, "step": 22630 }, { "epoch": 0.1829361904992768, "grad_norm": 1049.9534912109375, "learning_rate": 4.895932313139292e-05, "loss": 165.0955, "step": 22640 }, { "epoch": 0.18301699270356095, "grad_norm": 805.4934692382812, "learning_rate": 4.895730874234834e-05, "loss": 114.1204, "step": 22650 }, { "epoch": 0.18309779490784509, "grad_norm": 1224.6595458984375, "learning_rate": 4.895529244713066e-05, "loss": 115.779, "step": 22660 }, { "epoch": 0.18317859711212922, "grad_norm": 1112.3597412109375, "learning_rate": 4.895327424590029e-05, "loss": 130.0822, "step": 22670 }, { "epoch": 0.18325939931641336, "grad_norm": 1353.516845703125, "learning_rate": 4.895125413881783e-05, "loss": 171.6577, "step": 22680 }, { "epoch": 0.1833402015206975, "grad_norm": 1338.9501953125, "learning_rate": 4.8949232126044e-05, "loss": 109.7441, "step": 22690 }, { "epoch": 0.18342100372498163, "grad_norm": 1896.7581787109375, "learning_rate": 4.894720820773967e-05, "loss": 174.3409, "step": 22700 }, { "epoch": 0.18350180592926574, "grad_norm": 766.3515014648438, "learning_rate": 4.894518238406589e-05, "loss": 137.257, "step": 22710 }, { "epoch": 0.18358260813354987, "grad_norm": 691.5870971679688, "learning_rate": 4.8943154655183844e-05, "loss": 100.2154, "step": 22720 }, { "epoch": 0.183663410337834, "grad_norm": 5152.01025390625, "learning_rate": 4.894112502125487e-05, "loss": 173.3306, "step": 22730 }, { "epoch": 0.18374421254211815, "grad_norm": 788.1614379882812, "learning_rate": 4.8939093482440444e-05, "loss": 142.851, "step": 22740 }, { "epoch": 0.18382501474640228, "grad_norm": 1022.0597534179688, "learning_rate": 4.8937060038902224e-05, "loss": 160.1116, "step": 22750 }, { "epoch": 0.18390581695068642, "grad_norm": 757.3877563476562, "learning_rate": 4.8935024690801986e-05, "loss": 104.9073, "step": 22760 }, { "epoch": 0.18398661915497055, "grad_norm": 1713.2584228515625, "learning_rate": 4.893298743830168e-05, "loss": 141.8529, "step": 22770 }, { "epoch": 0.1840674213592547, "grad_norm": 1372.67822265625, "learning_rate": 4.89309482815634e-05, "loss": 157.4392, "step": 22780 }, { "epoch": 0.18414822356353883, "grad_norm": 900.3597412109375, "learning_rate": 4.892890722074941e-05, "loss": 161.8838, "step": 22790 }, { "epoch": 0.18422902576782293, "grad_norm": 1206.832763671875, "learning_rate": 4.8926864256022074e-05, "loss": 142.6541, "step": 22800 }, { "epoch": 0.18430982797210707, "grad_norm": 3348.473388671875, "learning_rate": 4.892481938754396e-05, "loss": 153.2709, "step": 22810 }, { "epoch": 0.1843906301763912, "grad_norm": 1615.8541259765625, "learning_rate": 4.892277261547778e-05, "loss": 176.116, "step": 22820 }, { "epoch": 0.18447143238067534, "grad_norm": 1190.1331787109375, "learning_rate": 4.892072393998636e-05, "loss": 130.4555, "step": 22830 }, { "epoch": 0.18455223458495948, "grad_norm": 1158.7845458984375, "learning_rate": 4.891867336123273e-05, "loss": 117.6687, "step": 22840 }, { "epoch": 0.18463303678924362, "grad_norm": 850.7025756835938, "learning_rate": 4.891662087938002e-05, "loss": 125.4999, "step": 22850 }, { "epoch": 0.18471383899352775, "grad_norm": 1056.5823974609375, "learning_rate": 4.891456649459155e-05, "loss": 104.6559, "step": 22860 }, { "epoch": 0.1847946411978119, "grad_norm": 821.6203002929688, "learning_rate": 4.891251020703078e-05, "loss": 104.2007, "step": 22870 }, { "epoch": 0.184875443402096, "grad_norm": 842.5684814453125, "learning_rate": 4.8910452016861316e-05, "loss": 149.866, "step": 22880 }, { "epoch": 0.18495624560638013, "grad_norm": 1477.5130615234375, "learning_rate": 4.890839192424692e-05, "loss": 118.6691, "step": 22890 }, { "epoch": 0.18503704781066427, "grad_norm": 679.1465454101562, "learning_rate": 4.89063299293515e-05, "loss": 113.9977, "step": 22900 }, { "epoch": 0.1851178500149484, "grad_norm": 1265.08349609375, "learning_rate": 4.890426603233913e-05, "loss": 187.4383, "step": 22910 }, { "epoch": 0.18519865221923254, "grad_norm": 1642.4959716796875, "learning_rate": 4.890220023337402e-05, "loss": 114.8031, "step": 22920 }, { "epoch": 0.18527945442351668, "grad_norm": 1289.17529296875, "learning_rate": 4.890013253262052e-05, "loss": 139.2596, "step": 22930 }, { "epoch": 0.1853602566278008, "grad_norm": 1148.701171875, "learning_rate": 4.889806293024317e-05, "loss": 115.5268, "step": 22940 }, { "epoch": 0.18544105883208495, "grad_norm": 854.2969970703125, "learning_rate": 4.889599142640663e-05, "loss": 104.5058, "step": 22950 }, { "epoch": 0.18552186103636908, "grad_norm": 638.8955078125, "learning_rate": 4.889391802127572e-05, "loss": 137.5948, "step": 22960 }, { "epoch": 0.1856026632406532, "grad_norm": 1359.7989501953125, "learning_rate": 4.8891842715015415e-05, "loss": 155.3802, "step": 22970 }, { "epoch": 0.18568346544493733, "grad_norm": 900.0406494140625, "learning_rate": 4.888976550779082e-05, "loss": 148.2721, "step": 22980 }, { "epoch": 0.18576426764922147, "grad_norm": 950.3909912109375, "learning_rate": 4.888768639976723e-05, "loss": 128.0501, "step": 22990 }, { "epoch": 0.1858450698535056, "grad_norm": 904.05615234375, "learning_rate": 4.888560539111007e-05, "loss": 117.1624, "step": 23000 } ], "logging_steps": 10, "max_steps": 123750, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }