{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09696264514095944, "eval_steps": 500, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.080220428413287e-05, "grad_norm": 120784.5234375, "learning_rate": 4.040404040404041e-08, "loss": 6344.0191, "step": 10 }, { "epoch": 0.00016160440856826573, "grad_norm": 246101.4375, "learning_rate": 8.080808080808082e-08, "loss": 7230.2391, "step": 20 }, { "epoch": 0.00024240661285239863, "grad_norm": 220140.828125, "learning_rate": 1.2121212121212122e-07, "loss": 7248.1844, "step": 30 }, { "epoch": 0.00032320881713653147, "grad_norm": 468224.40625, "learning_rate": 1.6161616161616163e-07, "loss": 8245.5844, "step": 40 }, { "epoch": 0.00040401102142066436, "grad_norm": 129730.1171875, "learning_rate": 2.0202020202020202e-07, "loss": 5464.3164, "step": 50 }, { "epoch": 0.00048481322570479725, "grad_norm": 37984.21484375, "learning_rate": 2.4242424242424244e-07, "loss": 7551.0562, "step": 60 }, { "epoch": 0.0005656154299889301, "grad_norm": 170187.078125, "learning_rate": 2.8282828282828283e-07, "loss": 6856.2141, "step": 70 }, { "epoch": 0.0006464176342730629, "grad_norm": 64738.05078125, "learning_rate": 3.2323232323232327e-07, "loss": 7045.2477, "step": 80 }, { "epoch": 0.0007272198385571959, "grad_norm": 114463.8515625, "learning_rate": 3.6363636363636366e-07, "loss": 5803.459, "step": 90 }, { "epoch": 0.0008080220428413287, "grad_norm": 102587.6796875, "learning_rate": 4.0404040404040405e-07, "loss": 3976.1211, "step": 100 }, { "epoch": 0.0008888242471254616, "grad_norm": 78112.53125, "learning_rate": 4.444444444444445e-07, "loss": 3535.432, "step": 110 }, { "epoch": 0.0009696264514095945, "grad_norm": 100944.5703125, "learning_rate": 4.848484848484849e-07, "loss": 4244.4645, "step": 120 }, { "epoch": 0.0010504286556937273, "grad_norm": 178465.359375, "learning_rate": 5.252525252525253e-07, "loss": 5655.2863, "step": 130 }, { "epoch": 0.0011312308599778602, "grad_norm": 70585.359375, "learning_rate": 5.656565656565657e-07, "loss": 3637.5656, "step": 140 }, { "epoch": 0.001212033064261993, "grad_norm": 51223.41015625, "learning_rate": 6.060606060606061e-07, "loss": 2629.8635, "step": 150 }, { "epoch": 0.0012928352685461259, "grad_norm": 53755.01953125, "learning_rate": 6.464646464646465e-07, "loss": 4522.2559, "step": 160 }, { "epoch": 0.0013736374728302587, "grad_norm": 72124.0625, "learning_rate": 6.868686868686869e-07, "loss": 3395.8777, "step": 170 }, { "epoch": 0.0014544396771143918, "grad_norm": 11780.193359375, "learning_rate": 7.272727272727273e-07, "loss": 1784.9486, "step": 180 }, { "epoch": 0.0015352418813985246, "grad_norm": 62803.8828125, "learning_rate": 7.676767676767678e-07, "loss": 2220.277, "step": 190 }, { "epoch": 0.0016160440856826574, "grad_norm": 118829.34375, "learning_rate": 8.080808080808081e-07, "loss": 2133.5133, "step": 200 }, { "epoch": 0.0016968462899667903, "grad_norm": 7391.849609375, "learning_rate": 8.484848484848486e-07, "loss": 1618.4662, "step": 210 }, { "epoch": 0.0017776484942509231, "grad_norm": 7889.263671875, "learning_rate": 8.88888888888889e-07, "loss": 1578.3401, "step": 220 }, { "epoch": 0.001858450698535056, "grad_norm": 10828.140625, "learning_rate": 9.292929292929294e-07, "loss": 1327.2748, "step": 230 }, { "epoch": 0.001939252902819189, "grad_norm": 15991.2119140625, "learning_rate": 9.696969696969698e-07, "loss": 1342.3516, "step": 240 }, { "epoch": 0.0020200551071033216, "grad_norm": 7872.46484375, "learning_rate": 1.0101010101010103e-06, "loss": 1077.598, "step": 250 }, { "epoch": 0.0021008573113874547, "grad_norm": 3977.63623046875, "learning_rate": 1.0505050505050506e-06, "loss": 835.4583, "step": 260 }, { "epoch": 0.0021816595156715873, "grad_norm": 3596.71923828125, "learning_rate": 1.090909090909091e-06, "loss": 766.0804, "step": 270 }, { "epoch": 0.0022624617199557204, "grad_norm": 10194.791015625, "learning_rate": 1.1313131313131313e-06, "loss": 777.5695, "step": 280 }, { "epoch": 0.0023432639242398534, "grad_norm": 7022.6103515625, "learning_rate": 1.1717171717171719e-06, "loss": 639.281, "step": 290 }, { "epoch": 0.002424066128523986, "grad_norm": 2920.81787109375, "learning_rate": 1.2121212121212122e-06, "loss": 703.318, "step": 300 }, { "epoch": 0.002504868332808119, "grad_norm": 4137.4970703125, "learning_rate": 1.2525252525252527e-06, "loss": 766.8718, "step": 310 }, { "epoch": 0.0025856705370922517, "grad_norm": 8379.064453125, "learning_rate": 1.292929292929293e-06, "loss": 625.7259, "step": 320 }, { "epoch": 0.002666472741376385, "grad_norm": 2632.939697265625, "learning_rate": 1.3333333333333334e-06, "loss": 529.0049, "step": 330 }, { "epoch": 0.0027472749456605174, "grad_norm": 3072.3486328125, "learning_rate": 1.3737373737373738e-06, "loss": 557.5195, "step": 340 }, { "epoch": 0.0028280771499446505, "grad_norm": 2430.994384765625, "learning_rate": 1.4141414141414143e-06, "loss": 548.776, "step": 350 }, { "epoch": 0.0029088793542287835, "grad_norm": 2531.90771484375, "learning_rate": 1.4545454545454546e-06, "loss": 602.2291, "step": 360 }, { "epoch": 0.002989681558512916, "grad_norm": 5374.16552734375, "learning_rate": 1.4949494949494952e-06, "loss": 447.6404, "step": 370 }, { "epoch": 0.003070483762797049, "grad_norm": 1357.3726806640625, "learning_rate": 1.5353535353535355e-06, "loss": 342.7574, "step": 380 }, { "epoch": 0.003151285967081182, "grad_norm": 1249.0936279296875, "learning_rate": 1.5757575757575759e-06, "loss": 562.0835, "step": 390 }, { "epoch": 0.003232088171365315, "grad_norm": 1270.3609619140625, "learning_rate": 1.6161616161616162e-06, "loss": 467.299, "step": 400 }, { "epoch": 0.0033128903756494475, "grad_norm": 2576.8291015625, "learning_rate": 1.6565656565656565e-06, "loss": 448.897, "step": 410 }, { "epoch": 0.0033936925799335806, "grad_norm": 1376.2908935546875, "learning_rate": 1.6969696969696973e-06, "loss": 382.242, "step": 420 }, { "epoch": 0.0034744947842177136, "grad_norm": 1212.802490234375, "learning_rate": 1.7373737373737376e-06, "loss": 486.5346, "step": 430 }, { "epoch": 0.0035552969885018462, "grad_norm": 801.7883911132812, "learning_rate": 1.777777777777778e-06, "loss": 452.1537, "step": 440 }, { "epoch": 0.0036360991927859793, "grad_norm": 1590.736572265625, "learning_rate": 1.818181818181818e-06, "loss": 449.5157, "step": 450 }, { "epoch": 0.003716901397070112, "grad_norm": 3368.974853515625, "learning_rate": 1.8585858585858588e-06, "loss": 445.9489, "step": 460 }, { "epoch": 0.003797703601354245, "grad_norm": 2164.530517578125, "learning_rate": 1.8989898989898992e-06, "loss": 449.8674, "step": 470 }, { "epoch": 0.003878505805638378, "grad_norm": 1234.2860107421875, "learning_rate": 1.9393939393939395e-06, "loss": 375.5616, "step": 480 }, { "epoch": 0.003959308009922511, "grad_norm": 1191.9912109375, "learning_rate": 1.9797979797979796e-06, "loss": 394.9626, "step": 490 }, { "epoch": 0.004040110214206643, "grad_norm": 2829.901611328125, "learning_rate": 2.0202020202020206e-06, "loss": 398.194, "step": 500 }, { "epoch": 0.004120912418490777, "grad_norm": 3573.80322265625, "learning_rate": 2.0606060606060607e-06, "loss": 440.5374, "step": 510 }, { "epoch": 0.004201714622774909, "grad_norm": 991.9559326171875, "learning_rate": 2.1010101010101013e-06, "loss": 552.4083, "step": 520 }, { "epoch": 0.004282516827059042, "grad_norm": 2654.274658203125, "learning_rate": 2.1414141414141414e-06, "loss": 414.1354, "step": 530 }, { "epoch": 0.004363319031343175, "grad_norm": 1416.258056640625, "learning_rate": 2.181818181818182e-06, "loss": 443.3496, "step": 540 }, { "epoch": 0.004444121235627308, "grad_norm": 1433.0880126953125, "learning_rate": 2.2222222222222225e-06, "loss": 476.5702, "step": 550 }, { "epoch": 0.004524923439911441, "grad_norm": 852.6603393554688, "learning_rate": 2.2626262626262626e-06, "loss": 425.1744, "step": 560 }, { "epoch": 0.004605725644195573, "grad_norm": 1662.8759765625, "learning_rate": 2.303030303030303e-06, "loss": 432.2859, "step": 570 }, { "epoch": 0.004686527848479707, "grad_norm": 1177.7154541015625, "learning_rate": 2.3434343434343437e-06, "loss": 472.836, "step": 580 }, { "epoch": 0.0047673300527638395, "grad_norm": 1203.2510986328125, "learning_rate": 2.383838383838384e-06, "loss": 400.3267, "step": 590 }, { "epoch": 0.004848132257047972, "grad_norm": 806.4482421875, "learning_rate": 2.4242424242424244e-06, "loss": 381.2281, "step": 600 }, { "epoch": 0.004928934461332105, "grad_norm": 1223.798095703125, "learning_rate": 2.4646464646464645e-06, "loss": 314.522, "step": 610 }, { "epoch": 0.005009736665616238, "grad_norm": 1220.419189453125, "learning_rate": 2.5050505050505055e-06, "loss": 391.7079, "step": 620 }, { "epoch": 0.005090538869900371, "grad_norm": 1846.539306640625, "learning_rate": 2.5454545454545456e-06, "loss": 415.6819, "step": 630 }, { "epoch": 0.0051713410741845035, "grad_norm": 1243.7620849609375, "learning_rate": 2.585858585858586e-06, "loss": 437.1002, "step": 640 }, { "epoch": 0.005252143278468637, "grad_norm": 1140.9453125, "learning_rate": 2.6262626262626263e-06, "loss": 415.1159, "step": 650 }, { "epoch": 0.00533294548275277, "grad_norm": 4821.37060546875, "learning_rate": 2.666666666666667e-06, "loss": 464.5417, "step": 660 }, { "epoch": 0.005413747687036902, "grad_norm": 1713.6087646484375, "learning_rate": 2.7070707070707074e-06, "loss": 375.5785, "step": 670 }, { "epoch": 0.005494549891321035, "grad_norm": 2620.644287109375, "learning_rate": 2.7474747474747475e-06, "loss": 299.2374, "step": 680 }, { "epoch": 0.005575352095605168, "grad_norm": 1759.0950927734375, "learning_rate": 2.787878787878788e-06, "loss": 415.4854, "step": 690 }, { "epoch": 0.005656154299889301, "grad_norm": 1536.8719482421875, "learning_rate": 2.8282828282828286e-06, "loss": 449.5622, "step": 700 }, { "epoch": 0.0057369565041734336, "grad_norm": 2095.785400390625, "learning_rate": 2.8686868686868687e-06, "loss": 380.9472, "step": 710 }, { "epoch": 0.005817758708457567, "grad_norm": 1478.4825439453125, "learning_rate": 2.9090909090909093e-06, "loss": 390.7729, "step": 720 }, { "epoch": 0.0058985609127417, "grad_norm": 1876.679443359375, "learning_rate": 2.9494949494949494e-06, "loss": 391.9601, "step": 730 }, { "epoch": 0.005979363117025832, "grad_norm": 749.0634765625, "learning_rate": 2.9898989898989904e-06, "loss": 253.9756, "step": 740 }, { "epoch": 0.006060165321309965, "grad_norm": 906.431396484375, "learning_rate": 3.0303030303030305e-06, "loss": 347.2193, "step": 750 }, { "epoch": 0.006140967525594098, "grad_norm": 1034.1180419921875, "learning_rate": 3.070707070707071e-06, "loss": 311.8948, "step": 760 }, { "epoch": 0.006221769729878231, "grad_norm": 2706.747802734375, "learning_rate": 3.111111111111111e-06, "loss": 329.9375, "step": 770 }, { "epoch": 0.006302571934162364, "grad_norm": 1195.091064453125, "learning_rate": 3.1515151515151517e-06, "loss": 303.2004, "step": 780 }, { "epoch": 0.006383374138446497, "grad_norm": 1622.099609375, "learning_rate": 3.191919191919192e-06, "loss": 359.3405, "step": 790 }, { "epoch": 0.00646417634273063, "grad_norm": 1255.0582275390625, "learning_rate": 3.2323232323232324e-06, "loss": 362.8006, "step": 800 }, { "epoch": 0.006544978547014762, "grad_norm": 1571.966552734375, "learning_rate": 3.2727272727272733e-06, "loss": 372.003, "step": 810 }, { "epoch": 0.006625780751298895, "grad_norm": 1350.624755859375, "learning_rate": 3.313131313131313e-06, "loss": 391.2471, "step": 820 }, { "epoch": 0.0067065829555830285, "grad_norm": 1288.0430908203125, "learning_rate": 3.3535353535353536e-06, "loss": 381.0373, "step": 830 }, { "epoch": 0.006787385159867161, "grad_norm": 1756.347900390625, "learning_rate": 3.3939393939393946e-06, "loss": 366.8195, "step": 840 }, { "epoch": 0.006868187364151294, "grad_norm": 1164.05224609375, "learning_rate": 3.4343434343434343e-06, "loss": 340.2402, "step": 850 }, { "epoch": 0.006948989568435427, "grad_norm": 6004.3291015625, "learning_rate": 3.4747474747474752e-06, "loss": 397.7737, "step": 860 }, { "epoch": 0.00702979177271956, "grad_norm": 1949.6370849609375, "learning_rate": 3.515151515151515e-06, "loss": 352.9498, "step": 870 }, { "epoch": 0.0071105939770036925, "grad_norm": 823.8360595703125, "learning_rate": 3.555555555555556e-06, "loss": 432.1595, "step": 880 }, { "epoch": 0.007191396181287825, "grad_norm": 3512.72607421875, "learning_rate": 3.5959595959595965e-06, "loss": 342.5901, "step": 890 }, { "epoch": 0.007272198385571959, "grad_norm": 1352.2506103515625, "learning_rate": 3.636363636363636e-06, "loss": 306.887, "step": 900 }, { "epoch": 0.007353000589856091, "grad_norm": 2983.867919921875, "learning_rate": 3.676767676767677e-06, "loss": 381.7238, "step": 910 }, { "epoch": 0.007433802794140224, "grad_norm": 2423.1806640625, "learning_rate": 3.7171717171717177e-06, "loss": 303.3808, "step": 920 }, { "epoch": 0.007514604998424357, "grad_norm": 953.1580810546875, "learning_rate": 3.757575757575758e-06, "loss": 380.6255, "step": 930 }, { "epoch": 0.00759540720270849, "grad_norm": 5818.9609375, "learning_rate": 3.7979797979797984e-06, "loss": 372.4149, "step": 940 }, { "epoch": 0.007676209406992623, "grad_norm": 1317.5467529296875, "learning_rate": 3.8383838383838385e-06, "loss": 369.7815, "step": 950 }, { "epoch": 0.007757011611276756, "grad_norm": 929.1298217773438, "learning_rate": 3.878787878787879e-06, "loss": 395.9197, "step": 960 }, { "epoch": 0.007837813815560889, "grad_norm": 1342.6861572265625, "learning_rate": 3.9191919191919196e-06, "loss": 384.7229, "step": 970 }, { "epoch": 0.007918616019845021, "grad_norm": 1253.3720703125, "learning_rate": 3.959595959595959e-06, "loss": 302.0778, "step": 980 }, { "epoch": 0.007999418224129154, "grad_norm": 966.1517333984375, "learning_rate": 4.000000000000001e-06, "loss": 407.2742, "step": 990 }, { "epoch": 0.008080220428413287, "grad_norm": 949.992919921875, "learning_rate": 4.040404040404041e-06, "loss": 333.9852, "step": 1000 }, { "epoch": 0.00816102263269742, "grad_norm": 2160.320068359375, "learning_rate": 4.080808080808081e-06, "loss": 320.1485, "step": 1010 }, { "epoch": 0.008241824836981554, "grad_norm": 1784.5238037109375, "learning_rate": 4.1212121212121215e-06, "loss": 388.0309, "step": 1020 }, { "epoch": 0.008322627041265686, "grad_norm": 11426.04296875, "learning_rate": 4.161616161616161e-06, "loss": 342.7975, "step": 1030 }, { "epoch": 0.008403429245549819, "grad_norm": 1648.6806640625, "learning_rate": 4.2020202020202026e-06, "loss": 377.397, "step": 1040 }, { "epoch": 0.008484231449833951, "grad_norm": 929.9481811523438, "learning_rate": 4.242424242424243e-06, "loss": 316.3978, "step": 1050 }, { "epoch": 0.008565033654118084, "grad_norm": 1129.7996826171875, "learning_rate": 4.282828282828283e-06, "loss": 299.9879, "step": 1060 }, { "epoch": 0.008645835858402217, "grad_norm": 1400.36376953125, "learning_rate": 4.323232323232323e-06, "loss": 350.1171, "step": 1070 }, { "epoch": 0.00872663806268635, "grad_norm": 1833.9061279296875, "learning_rate": 4.363636363636364e-06, "loss": 325.1123, "step": 1080 }, { "epoch": 0.008807440266970484, "grad_norm": 1330.033203125, "learning_rate": 4.4040404040404044e-06, "loss": 325.3823, "step": 1090 }, { "epoch": 0.008888242471254616, "grad_norm": 910.3088989257812, "learning_rate": 4.444444444444445e-06, "loss": 301.4958, "step": 1100 }, { "epoch": 0.008969044675538749, "grad_norm": 1182.816162109375, "learning_rate": 4.484848484848485e-06, "loss": 277.6404, "step": 1110 }, { "epoch": 0.009049846879822881, "grad_norm": 4707.03857421875, "learning_rate": 4.525252525252525e-06, "loss": 424.5438, "step": 1120 }, { "epoch": 0.009130649084107014, "grad_norm": 2059.0185546875, "learning_rate": 4.565656565656566e-06, "loss": 324.0304, "step": 1130 }, { "epoch": 0.009211451288391147, "grad_norm": 3679.1044921875, "learning_rate": 4.606060606060606e-06, "loss": 303.365, "step": 1140 }, { "epoch": 0.00929225349267528, "grad_norm": 2518.44970703125, "learning_rate": 4.646464646464647e-06, "loss": 257.8196, "step": 1150 }, { "epoch": 0.009373055696959414, "grad_norm": 1017.4381713867188, "learning_rate": 4.6868686868686874e-06, "loss": 308.7409, "step": 1160 }, { "epoch": 0.009453857901243546, "grad_norm": 1016.4625244140625, "learning_rate": 4.727272727272727e-06, "loss": 309.7148, "step": 1170 }, { "epoch": 0.009534660105527679, "grad_norm": 2000.1339111328125, "learning_rate": 4.767676767676768e-06, "loss": 416.5233, "step": 1180 }, { "epoch": 0.009615462309811812, "grad_norm": 1349.24755859375, "learning_rate": 4.808080808080808e-06, "loss": 312.0022, "step": 1190 }, { "epoch": 0.009696264514095944, "grad_norm": 605.5498046875, "learning_rate": 4.848484848484849e-06, "loss": 373.616, "step": 1200 }, { "epoch": 0.009777066718380077, "grad_norm": 1045.7061767578125, "learning_rate": 4.888888888888889e-06, "loss": 263.9121, "step": 1210 }, { "epoch": 0.00985786892266421, "grad_norm": 997.4849243164062, "learning_rate": 4.929292929292929e-06, "loss": 299.2633, "step": 1220 }, { "epoch": 0.009938671126948344, "grad_norm": 690.6622314453125, "learning_rate": 4.96969696969697e-06, "loss": 216.8853, "step": 1230 }, { "epoch": 0.010019473331232476, "grad_norm": 1591.2972412109375, "learning_rate": 5.010101010101011e-06, "loss": 339.3444, "step": 1240 }, { "epoch": 0.010100275535516609, "grad_norm": 749.3450317382812, "learning_rate": 5.050505050505051e-06, "loss": 315.4824, "step": 1250 }, { "epoch": 0.010181077739800742, "grad_norm": 923.2822265625, "learning_rate": 5.090909090909091e-06, "loss": 265.8641, "step": 1260 }, { "epoch": 0.010261879944084874, "grad_norm": 1161.5048828125, "learning_rate": 5.131313131313131e-06, "loss": 255.6592, "step": 1270 }, { "epoch": 0.010342682148369007, "grad_norm": 949.8252563476562, "learning_rate": 5.171717171717172e-06, "loss": 350.1398, "step": 1280 }, { "epoch": 0.01042348435265314, "grad_norm": 1018.5875854492188, "learning_rate": 5.212121212121213e-06, "loss": 313.8918, "step": 1290 }, { "epoch": 0.010504286556937274, "grad_norm": 3170.640869140625, "learning_rate": 5.2525252525252526e-06, "loss": 305.4674, "step": 1300 }, { "epoch": 0.010585088761221407, "grad_norm": 1600.813720703125, "learning_rate": 5.292929292929293e-06, "loss": 331.4024, "step": 1310 }, { "epoch": 0.01066589096550554, "grad_norm": 1063.93408203125, "learning_rate": 5.333333333333334e-06, "loss": 373.562, "step": 1320 }, { "epoch": 0.010746693169789672, "grad_norm": 665.6146240234375, "learning_rate": 5.373737373737374e-06, "loss": 235.0867, "step": 1330 }, { "epoch": 0.010827495374073804, "grad_norm": 1380.6151123046875, "learning_rate": 5.414141414141415e-06, "loss": 324.2148, "step": 1340 }, { "epoch": 0.010908297578357937, "grad_norm": 3627.843994140625, "learning_rate": 5.4545454545454545e-06, "loss": 364.2515, "step": 1350 }, { "epoch": 0.01098909978264207, "grad_norm": 850.1503295898438, "learning_rate": 5.494949494949495e-06, "loss": 408.481, "step": 1360 }, { "epoch": 0.011069901986926204, "grad_norm": 2091.482421875, "learning_rate": 5.5353535353535355e-06, "loss": 267.4172, "step": 1370 }, { "epoch": 0.011150704191210337, "grad_norm": 1496.604248046875, "learning_rate": 5.575757575757576e-06, "loss": 295.7435, "step": 1380 }, { "epoch": 0.01123150639549447, "grad_norm": 2673.033203125, "learning_rate": 5.616161616161617e-06, "loss": 251.5225, "step": 1390 }, { "epoch": 0.011312308599778602, "grad_norm": 1121.48779296875, "learning_rate": 5.656565656565657e-06, "loss": 318.3079, "step": 1400 }, { "epoch": 0.011393110804062734, "grad_norm": 866.069091796875, "learning_rate": 5.696969696969697e-06, "loss": 283.861, "step": 1410 }, { "epoch": 0.011473913008346867, "grad_norm": 1132.9764404296875, "learning_rate": 5.7373737373737374e-06, "loss": 312.9168, "step": 1420 }, { "epoch": 0.011554715212631, "grad_norm": 1029.2037353515625, "learning_rate": 5.777777777777778e-06, "loss": 307.1583, "step": 1430 }, { "epoch": 0.011635517416915134, "grad_norm": 1138.4461669921875, "learning_rate": 5.8181818181818185e-06, "loss": 266.8528, "step": 1440 }, { "epoch": 0.011716319621199267, "grad_norm": 1278.30224609375, "learning_rate": 5.858585858585859e-06, "loss": 339.6735, "step": 1450 }, { "epoch": 0.0117971218254834, "grad_norm": 908.4046630859375, "learning_rate": 5.898989898989899e-06, "loss": 331.0705, "step": 1460 }, { "epoch": 0.011877924029767532, "grad_norm": 1165.220947265625, "learning_rate": 5.93939393939394e-06, "loss": 267.2519, "step": 1470 }, { "epoch": 0.011958726234051665, "grad_norm": 3219.732177734375, "learning_rate": 5.979797979797981e-06, "loss": 485.5872, "step": 1480 }, { "epoch": 0.012039528438335797, "grad_norm": 1050.6778564453125, "learning_rate": 6.0202020202020204e-06, "loss": 325.8213, "step": 1490 }, { "epoch": 0.01212033064261993, "grad_norm": 1031.156005859375, "learning_rate": 6.060606060606061e-06, "loss": 302.1713, "step": 1500 }, { "epoch": 0.012201132846904064, "grad_norm": 1457.485107421875, "learning_rate": 6.101010101010101e-06, "loss": 267.4604, "step": 1510 }, { "epoch": 0.012281935051188197, "grad_norm": 1509.94091796875, "learning_rate": 6.141414141414142e-06, "loss": 368.4667, "step": 1520 }, { "epoch": 0.01236273725547233, "grad_norm": 1406.15673828125, "learning_rate": 6.181818181818183e-06, "loss": 349.9011, "step": 1530 }, { "epoch": 0.012443539459756462, "grad_norm": 1459.2613525390625, "learning_rate": 6.222222222222222e-06, "loss": 325.6512, "step": 1540 }, { "epoch": 0.012524341664040595, "grad_norm": 3242.78271484375, "learning_rate": 6.262626262626263e-06, "loss": 375.0032, "step": 1550 }, { "epoch": 0.012605143868324727, "grad_norm": 2443.515625, "learning_rate": 6.303030303030303e-06, "loss": 379.789, "step": 1560 }, { "epoch": 0.01268594607260886, "grad_norm": 1770.516845703125, "learning_rate": 6.343434343434344e-06, "loss": 251.652, "step": 1570 }, { "epoch": 0.012766748276892994, "grad_norm": 1482.888671875, "learning_rate": 6.383838383838384e-06, "loss": 324.2996, "step": 1580 }, { "epoch": 0.012847550481177127, "grad_norm": 1663.3935546875, "learning_rate": 6.424242424242424e-06, "loss": 301.8472, "step": 1590 }, { "epoch": 0.01292835268546126, "grad_norm": 23524.42578125, "learning_rate": 6.464646464646465e-06, "loss": 335.0892, "step": 1600 }, { "epoch": 0.013009154889745392, "grad_norm": 862.7949829101562, "learning_rate": 6.505050505050505e-06, "loss": 306.2031, "step": 1610 }, { "epoch": 0.013089957094029525, "grad_norm": 2391.976806640625, "learning_rate": 6.545454545454547e-06, "loss": 310.1344, "step": 1620 }, { "epoch": 0.013170759298313657, "grad_norm": 1260.0029296875, "learning_rate": 6.5858585858585856e-06, "loss": 372.1161, "step": 1630 }, { "epoch": 0.01325156150259779, "grad_norm": 1587.1514892578125, "learning_rate": 6.626262626262626e-06, "loss": 351.7325, "step": 1640 }, { "epoch": 0.013332363706881924, "grad_norm": 1152.1556396484375, "learning_rate": 6.666666666666667e-06, "loss": 373.3706, "step": 1650 }, { "epoch": 0.013413165911166057, "grad_norm": 1005.15771484375, "learning_rate": 6.707070707070707e-06, "loss": 317.8097, "step": 1660 }, { "epoch": 0.01349396811545019, "grad_norm": 1090.3779296875, "learning_rate": 6.747474747474749e-06, "loss": 315.5722, "step": 1670 }, { "epoch": 0.013574770319734322, "grad_norm": 1011.2723388671875, "learning_rate": 6.787878787878789e-06, "loss": 231.6832, "step": 1680 }, { "epoch": 0.013655572524018455, "grad_norm": 798.0405883789062, "learning_rate": 6.828282828282828e-06, "loss": 306.8582, "step": 1690 }, { "epoch": 0.013736374728302587, "grad_norm": 855.2308959960938, "learning_rate": 6.8686868686868685e-06, "loss": 282.2944, "step": 1700 }, { "epoch": 0.01381717693258672, "grad_norm": 1325.1092529296875, "learning_rate": 6.909090909090909e-06, "loss": 427.1749, "step": 1710 }, { "epoch": 0.013897979136870854, "grad_norm": 1027.2860107421875, "learning_rate": 6.9494949494949505e-06, "loss": 232.57, "step": 1720 }, { "epoch": 0.013978781341154987, "grad_norm": 1047.0118408203125, "learning_rate": 6.989898989898991e-06, "loss": 272.4593, "step": 1730 }, { "epoch": 0.01405958354543912, "grad_norm": 1453.4931640625, "learning_rate": 7.03030303030303e-06, "loss": 287.978, "step": 1740 }, { "epoch": 0.014140385749723252, "grad_norm": 1631.84521484375, "learning_rate": 7.0707070707070704e-06, "loss": 222.9585, "step": 1750 }, { "epoch": 0.014221187954007385, "grad_norm": 1109.9012451171875, "learning_rate": 7.111111111111112e-06, "loss": 263.4153, "step": 1760 }, { "epoch": 0.014301990158291518, "grad_norm": 1374.5731201171875, "learning_rate": 7.151515151515152e-06, "loss": 224.7149, "step": 1770 }, { "epoch": 0.01438279236257565, "grad_norm": 700.2552490234375, "learning_rate": 7.191919191919193e-06, "loss": 243.2667, "step": 1780 }, { "epoch": 0.014463594566859785, "grad_norm": 689.7608032226562, "learning_rate": 7.232323232323232e-06, "loss": 209.3831, "step": 1790 }, { "epoch": 0.014544396771143917, "grad_norm": 1072.593994140625, "learning_rate": 7.272727272727272e-06, "loss": 233.4359, "step": 1800 }, { "epoch": 0.01462519897542805, "grad_norm": 783.2555541992188, "learning_rate": 7.313131313131314e-06, "loss": 259.1145, "step": 1810 }, { "epoch": 0.014706001179712182, "grad_norm": 1364.049560546875, "learning_rate": 7.353535353535354e-06, "loss": 244.7944, "step": 1820 }, { "epoch": 0.014786803383996315, "grad_norm": 1947.3690185546875, "learning_rate": 7.393939393939395e-06, "loss": 276.0544, "step": 1830 }, { "epoch": 0.014867605588280448, "grad_norm": 1572.007568359375, "learning_rate": 7.434343434343435e-06, "loss": 310.638, "step": 1840 }, { "epoch": 0.01494840779256458, "grad_norm": 1419.4361572265625, "learning_rate": 7.474747474747475e-06, "loss": 409.634, "step": 1850 }, { "epoch": 0.015029209996848715, "grad_norm": 1355.0137939453125, "learning_rate": 7.515151515151516e-06, "loss": 253.9533, "step": 1860 }, { "epoch": 0.015110012201132847, "grad_norm": 768.96923828125, "learning_rate": 7.555555555555556e-06, "loss": 304.4757, "step": 1870 }, { "epoch": 0.01519081440541698, "grad_norm": 959.4989624023438, "learning_rate": 7.595959595959597e-06, "loss": 345.032, "step": 1880 }, { "epoch": 0.015271616609701113, "grad_norm": 1339.84228515625, "learning_rate": 7.636363636363638e-06, "loss": 296.0798, "step": 1890 }, { "epoch": 0.015352418813985245, "grad_norm": 1036.2491455078125, "learning_rate": 7.676767676767677e-06, "loss": 300.197, "step": 1900 }, { "epoch": 0.015433221018269378, "grad_norm": 707.27587890625, "learning_rate": 7.717171717171717e-06, "loss": 277.1603, "step": 1910 }, { "epoch": 0.015514023222553512, "grad_norm": 1203.453857421875, "learning_rate": 7.757575757575758e-06, "loss": 303.6785, "step": 1920 }, { "epoch": 0.015594825426837645, "grad_norm": 1172.7025146484375, "learning_rate": 7.797979797979799e-06, "loss": 246.9346, "step": 1930 }, { "epoch": 0.015675627631121777, "grad_norm": 1082.4605712890625, "learning_rate": 7.838383838383839e-06, "loss": 296.1259, "step": 1940 }, { "epoch": 0.01575642983540591, "grad_norm": 1456.42529296875, "learning_rate": 7.878787878787878e-06, "loss": 269.7506, "step": 1950 }, { "epoch": 0.015837232039690043, "grad_norm": 1794.72119140625, "learning_rate": 7.919191919191919e-06, "loss": 246.411, "step": 1960 }, { "epoch": 0.015918034243974175, "grad_norm": 3157.114990234375, "learning_rate": 7.959595959595959e-06, "loss": 256.127, "step": 1970 }, { "epoch": 0.015998836448258308, "grad_norm": 1361.6929931640625, "learning_rate": 8.000000000000001e-06, "loss": 308.1984, "step": 1980 }, { "epoch": 0.01607963865254244, "grad_norm": 1006.965087890625, "learning_rate": 8.040404040404042e-06, "loss": 341.8693, "step": 1990 }, { "epoch": 0.016160440856826573, "grad_norm": 1541.38720703125, "learning_rate": 8.080808080808082e-06, "loss": 274.4633, "step": 2000 }, { "epoch": 0.016241243061110706, "grad_norm": 1735.104248046875, "learning_rate": 8.121212121212121e-06, "loss": 258.4104, "step": 2010 }, { "epoch": 0.01632204526539484, "grad_norm": 2176.154052734375, "learning_rate": 8.161616161616162e-06, "loss": 314.5508, "step": 2020 }, { "epoch": 0.016402847469678974, "grad_norm": 1052.533447265625, "learning_rate": 8.202020202020202e-06, "loss": 312.927, "step": 2030 }, { "epoch": 0.016483649673963107, "grad_norm": 1208.69189453125, "learning_rate": 8.242424242424243e-06, "loss": 254.359, "step": 2040 }, { "epoch": 0.01656445187824724, "grad_norm": 1978.5025634765625, "learning_rate": 8.282828282828283e-06, "loss": 299.8744, "step": 2050 }, { "epoch": 0.016645254082531372, "grad_norm": 3605.6494140625, "learning_rate": 8.323232323232322e-06, "loss": 298.715, "step": 2060 }, { "epoch": 0.016726056286815505, "grad_norm": 1599.973876953125, "learning_rate": 8.363636363636365e-06, "loss": 258.9758, "step": 2070 }, { "epoch": 0.016806858491099638, "grad_norm": 1183.451904296875, "learning_rate": 8.404040404040405e-06, "loss": 352.8176, "step": 2080 }, { "epoch": 0.01688766069538377, "grad_norm": 1582.7120361328125, "learning_rate": 8.444444444444446e-06, "loss": 309.3484, "step": 2090 }, { "epoch": 0.016968462899667903, "grad_norm": 932.9716796875, "learning_rate": 8.484848484848486e-06, "loss": 255.265, "step": 2100 }, { "epoch": 0.017049265103952035, "grad_norm": 922.5059814453125, "learning_rate": 8.525252525252525e-06, "loss": 253.6583, "step": 2110 }, { "epoch": 0.017130067308236168, "grad_norm": 1196.361083984375, "learning_rate": 8.565656565656566e-06, "loss": 323.4844, "step": 2120 }, { "epoch": 0.0172108695125203, "grad_norm": 1005.9546508789062, "learning_rate": 8.606060606060606e-06, "loss": 351.4896, "step": 2130 }, { "epoch": 0.017291671716804433, "grad_norm": 1585.8636474609375, "learning_rate": 8.646464646464647e-06, "loss": 325.0701, "step": 2140 }, { "epoch": 0.017372473921088566, "grad_norm": 3758.6982421875, "learning_rate": 8.686868686868687e-06, "loss": 203.9509, "step": 2150 }, { "epoch": 0.0174532761253727, "grad_norm": 1602.480224609375, "learning_rate": 8.727272727272728e-06, "loss": 294.8089, "step": 2160 }, { "epoch": 0.017534078329656835, "grad_norm": 1571.1812744140625, "learning_rate": 8.767676767676768e-06, "loss": 220.656, "step": 2170 }, { "epoch": 0.017614880533940967, "grad_norm": 1073.2261962890625, "learning_rate": 8.808080808080809e-06, "loss": 221.6837, "step": 2180 }, { "epoch": 0.0176956827382251, "grad_norm": 1125.1983642578125, "learning_rate": 8.84848484848485e-06, "loss": 286.0061, "step": 2190 }, { "epoch": 0.017776484942509233, "grad_norm": 1008.795654296875, "learning_rate": 8.88888888888889e-06, "loss": 282.5281, "step": 2200 }, { "epoch": 0.017857287146793365, "grad_norm": 1810.7894287109375, "learning_rate": 8.92929292929293e-06, "loss": 226.4816, "step": 2210 }, { "epoch": 0.017938089351077498, "grad_norm": 894.2589721679688, "learning_rate": 8.96969696969697e-06, "loss": 251.6401, "step": 2220 }, { "epoch": 0.01801889155536163, "grad_norm": 1232.9827880859375, "learning_rate": 9.01010101010101e-06, "loss": 248.7544, "step": 2230 }, { "epoch": 0.018099693759645763, "grad_norm": 1993.9267578125, "learning_rate": 9.05050505050505e-06, "loss": 256.8296, "step": 2240 }, { "epoch": 0.018180495963929896, "grad_norm": 967.433837890625, "learning_rate": 9.090909090909091e-06, "loss": 245.6321, "step": 2250 }, { "epoch": 0.018261298168214028, "grad_norm": 2560.1728515625, "learning_rate": 9.131313131313132e-06, "loss": 234.9478, "step": 2260 }, { "epoch": 0.01834210037249816, "grad_norm": 590.1747436523438, "learning_rate": 9.171717171717172e-06, "loss": 236.6796, "step": 2270 }, { "epoch": 0.018422902576782293, "grad_norm": 1504.942626953125, "learning_rate": 9.212121212121213e-06, "loss": 367.3362, "step": 2280 }, { "epoch": 0.018503704781066426, "grad_norm": 977.7069091796875, "learning_rate": 9.252525252525253e-06, "loss": 276.6067, "step": 2290 }, { "epoch": 0.01858450698535056, "grad_norm": 1384.83203125, "learning_rate": 9.292929292929294e-06, "loss": 340.7896, "step": 2300 }, { "epoch": 0.018665309189634695, "grad_norm": 1302.6343994140625, "learning_rate": 9.333333333333334e-06, "loss": 198.1527, "step": 2310 }, { "epoch": 0.018746111393918827, "grad_norm": 836.4732666015625, "learning_rate": 9.373737373737375e-06, "loss": 209.2565, "step": 2320 }, { "epoch": 0.01882691359820296, "grad_norm": 1400.0604248046875, "learning_rate": 9.414141414141414e-06, "loss": 305.2968, "step": 2330 }, { "epoch": 0.018907715802487093, "grad_norm": 1454.78125, "learning_rate": 9.454545454545454e-06, "loss": 260.4474, "step": 2340 }, { "epoch": 0.018988518006771225, "grad_norm": 4679.6923828125, "learning_rate": 9.494949494949495e-06, "loss": 247.9314, "step": 2350 }, { "epoch": 0.019069320211055358, "grad_norm": 6640.1201171875, "learning_rate": 9.535353535353535e-06, "loss": 259.6065, "step": 2360 }, { "epoch": 0.01915012241533949, "grad_norm": 1564.781982421875, "learning_rate": 9.575757575757578e-06, "loss": 245.3016, "step": 2370 }, { "epoch": 0.019230924619623623, "grad_norm": 2078.74267578125, "learning_rate": 9.616161616161616e-06, "loss": 284.7133, "step": 2380 }, { "epoch": 0.019311726823907756, "grad_norm": 1441.0360107421875, "learning_rate": 9.656565656565657e-06, "loss": 270.4787, "step": 2390 }, { "epoch": 0.01939252902819189, "grad_norm": 744.0514526367188, "learning_rate": 9.696969696969698e-06, "loss": 200.2359, "step": 2400 }, { "epoch": 0.01947333123247602, "grad_norm": 1168.913818359375, "learning_rate": 9.737373737373738e-06, "loss": 269.4149, "step": 2410 }, { "epoch": 0.019554133436760154, "grad_norm": 3848.146484375, "learning_rate": 9.777777777777779e-06, "loss": 250.6479, "step": 2420 }, { "epoch": 0.019634935641044286, "grad_norm": 798.60595703125, "learning_rate": 9.818181818181818e-06, "loss": 245.7341, "step": 2430 }, { "epoch": 0.01971573784532842, "grad_norm": 2139.72265625, "learning_rate": 9.858585858585858e-06, "loss": 275.653, "step": 2440 }, { "epoch": 0.019796540049612555, "grad_norm": 1223.6392822265625, "learning_rate": 9.898989898989899e-06, "loss": 271.2685, "step": 2450 }, { "epoch": 0.019877342253896688, "grad_norm": 1007.25439453125, "learning_rate": 9.93939393939394e-06, "loss": 235.147, "step": 2460 }, { "epoch": 0.01995814445818082, "grad_norm": 1195.012939453125, "learning_rate": 9.979797979797981e-06, "loss": 302.4475, "step": 2470 }, { "epoch": 0.020038946662464953, "grad_norm": 1530.9473876953125, "learning_rate": 1.0020202020202022e-05, "loss": 259.1597, "step": 2480 }, { "epoch": 0.020119748866749085, "grad_norm": 1403.546142578125, "learning_rate": 1.006060606060606e-05, "loss": 348.2433, "step": 2490 }, { "epoch": 0.020200551071033218, "grad_norm": 1328.4873046875, "learning_rate": 1.0101010101010101e-05, "loss": 289.0594, "step": 2500 }, { "epoch": 0.02028135327531735, "grad_norm": 1171.5048828125, "learning_rate": 1.0141414141414142e-05, "loss": 197.3946, "step": 2510 }, { "epoch": 0.020362155479601483, "grad_norm": 1274.6544189453125, "learning_rate": 1.0181818181818182e-05, "loss": 295.2945, "step": 2520 }, { "epoch": 0.020442957683885616, "grad_norm": 2023.71337890625, "learning_rate": 1.0222222222222223e-05, "loss": 271.9707, "step": 2530 }, { "epoch": 0.02052375988816975, "grad_norm": 1765.538818359375, "learning_rate": 1.0262626262626262e-05, "loss": 246.6141, "step": 2540 }, { "epoch": 0.02060456209245388, "grad_norm": 859.3914794921875, "learning_rate": 1.0303030303030304e-05, "loss": 276.1375, "step": 2550 }, { "epoch": 0.020685364296738014, "grad_norm": 1024.8955078125, "learning_rate": 1.0343434343434345e-05, "loss": 239.6529, "step": 2560 }, { "epoch": 0.020766166501022146, "grad_norm": 1012.91455078125, "learning_rate": 1.0383838383838385e-05, "loss": 199.6656, "step": 2570 }, { "epoch": 0.02084696870530628, "grad_norm": 1371.8551025390625, "learning_rate": 1.0424242424242426e-05, "loss": 247.0821, "step": 2580 }, { "epoch": 0.020927770909590415, "grad_norm": 1338.343017578125, "learning_rate": 1.0464646464646465e-05, "loss": 260.7634, "step": 2590 }, { "epoch": 0.021008573113874548, "grad_norm": 917.8023071289062, "learning_rate": 1.0505050505050505e-05, "loss": 255.2869, "step": 2600 }, { "epoch": 0.02108937531815868, "grad_norm": 1167.3427734375, "learning_rate": 1.0545454545454546e-05, "loss": 255.108, "step": 2610 }, { "epoch": 0.021170177522442813, "grad_norm": 1662.1556396484375, "learning_rate": 1.0585858585858586e-05, "loss": 304.7596, "step": 2620 }, { "epoch": 0.021250979726726946, "grad_norm": 1393.7713623046875, "learning_rate": 1.0626262626262627e-05, "loss": 298.9948, "step": 2630 }, { "epoch": 0.02133178193101108, "grad_norm": 4169.89306640625, "learning_rate": 1.0666666666666667e-05, "loss": 256.329, "step": 2640 }, { "epoch": 0.02141258413529521, "grad_norm": 961.8526000976562, "learning_rate": 1.0707070707070708e-05, "loss": 268.2124, "step": 2650 }, { "epoch": 0.021493386339579344, "grad_norm": 2141.90869140625, "learning_rate": 1.0747474747474748e-05, "loss": 279.6773, "step": 2660 }, { "epoch": 0.021574188543863476, "grad_norm": 1454.77392578125, "learning_rate": 1.0787878787878789e-05, "loss": 250.6374, "step": 2670 }, { "epoch": 0.02165499074814761, "grad_norm": 1119.782958984375, "learning_rate": 1.082828282828283e-05, "loss": 299.24, "step": 2680 }, { "epoch": 0.02173579295243174, "grad_norm": 1507.88916015625, "learning_rate": 1.086868686868687e-05, "loss": 304.945, "step": 2690 }, { "epoch": 0.021816595156715874, "grad_norm": 1235.4326171875, "learning_rate": 1.0909090909090909e-05, "loss": 329.6972, "step": 2700 }, { "epoch": 0.021897397361000007, "grad_norm": 1516.6436767578125, "learning_rate": 1.094949494949495e-05, "loss": 276.193, "step": 2710 }, { "epoch": 0.02197819956528414, "grad_norm": 1332.3309326171875, "learning_rate": 1.098989898989899e-05, "loss": 309.7016, "step": 2720 }, { "epoch": 0.022059001769568275, "grad_norm": 1349.0360107421875, "learning_rate": 1.103030303030303e-05, "loss": 423.0254, "step": 2730 }, { "epoch": 0.022139803973852408, "grad_norm": 2262.348876953125, "learning_rate": 1.1070707070707071e-05, "loss": 260.9447, "step": 2740 }, { "epoch": 0.02222060617813654, "grad_norm": 1374.3009033203125, "learning_rate": 1.1111111111111112e-05, "loss": 283.2908, "step": 2750 }, { "epoch": 0.022301408382420673, "grad_norm": 768.2625732421875, "learning_rate": 1.1151515151515152e-05, "loss": 243.0171, "step": 2760 }, { "epoch": 0.022382210586704806, "grad_norm": 4175.06396484375, "learning_rate": 1.1191919191919193e-05, "loss": 236.5765, "step": 2770 }, { "epoch": 0.02246301279098894, "grad_norm": 1220.933837890625, "learning_rate": 1.1232323232323233e-05, "loss": 254.5181, "step": 2780 }, { "epoch": 0.02254381499527307, "grad_norm": 1589.7581787109375, "learning_rate": 1.1272727272727274e-05, "loss": 183.8809, "step": 2790 }, { "epoch": 0.022624617199557204, "grad_norm": 1031.3692626953125, "learning_rate": 1.1313131313131314e-05, "loss": 246.612, "step": 2800 }, { "epoch": 0.022705419403841336, "grad_norm": 1396.37744140625, "learning_rate": 1.1353535353535353e-05, "loss": 224.2803, "step": 2810 }, { "epoch": 0.02278622160812547, "grad_norm": 1879.5634765625, "learning_rate": 1.1393939393939394e-05, "loss": 287.104, "step": 2820 }, { "epoch": 0.0228670238124096, "grad_norm": 1376.0625, "learning_rate": 1.1434343434343434e-05, "loss": 284.0272, "step": 2830 }, { "epoch": 0.022947826016693734, "grad_norm": 682.961181640625, "learning_rate": 1.1474747474747475e-05, "loss": 210.3876, "step": 2840 }, { "epoch": 0.023028628220977867, "grad_norm": 1677.2169189453125, "learning_rate": 1.1515151515151517e-05, "loss": 326.8289, "step": 2850 }, { "epoch": 0.023109430425262, "grad_norm": 733.2987060546875, "learning_rate": 1.1555555555555556e-05, "loss": 230.43, "step": 2860 }, { "epoch": 0.023190232629546136, "grad_norm": 630.212890625, "learning_rate": 1.1595959595959597e-05, "loss": 205.1732, "step": 2870 }, { "epoch": 0.023271034833830268, "grad_norm": 1535.36572265625, "learning_rate": 1.1636363636363637e-05, "loss": 304.9371, "step": 2880 }, { "epoch": 0.0233518370381144, "grad_norm": 1065.3255615234375, "learning_rate": 1.1676767676767678e-05, "loss": 219.3441, "step": 2890 }, { "epoch": 0.023432639242398533, "grad_norm": 2319.004638671875, "learning_rate": 1.1717171717171718e-05, "loss": 274.9734, "step": 2900 }, { "epoch": 0.023513441446682666, "grad_norm": 1362.914794921875, "learning_rate": 1.1757575757575757e-05, "loss": 214.0071, "step": 2910 }, { "epoch": 0.0235942436509668, "grad_norm": 1748.42333984375, "learning_rate": 1.1797979797979798e-05, "loss": 247.8715, "step": 2920 }, { "epoch": 0.02367504585525093, "grad_norm": 1357.4864501953125, "learning_rate": 1.1838383838383838e-05, "loss": 367.5896, "step": 2930 }, { "epoch": 0.023755848059535064, "grad_norm": 1028.1129150390625, "learning_rate": 1.187878787878788e-05, "loss": 248.4116, "step": 2940 }, { "epoch": 0.023836650263819197, "grad_norm": 1497.2218017578125, "learning_rate": 1.1919191919191921e-05, "loss": 309.2206, "step": 2950 }, { "epoch": 0.02391745246810333, "grad_norm": 830.5894775390625, "learning_rate": 1.1959595959595961e-05, "loss": 203.645, "step": 2960 }, { "epoch": 0.023998254672387462, "grad_norm": 1067.825439453125, "learning_rate": 1.2e-05, "loss": 226.1753, "step": 2970 }, { "epoch": 0.024079056876671594, "grad_norm": 1088.949462890625, "learning_rate": 1.2040404040404041e-05, "loss": 235.4068, "step": 2980 }, { "epoch": 0.024159859080955727, "grad_norm": 1378.7232666015625, "learning_rate": 1.2080808080808081e-05, "loss": 258.9663, "step": 2990 }, { "epoch": 0.02424066128523986, "grad_norm": 1665.1182861328125, "learning_rate": 1.2121212121212122e-05, "loss": 178.7374, "step": 3000 }, { "epoch": 0.024321463489523996, "grad_norm": 1254.1650390625, "learning_rate": 1.2161616161616162e-05, "loss": 196.0946, "step": 3010 }, { "epoch": 0.02440226569380813, "grad_norm": 1680.0679931640625, "learning_rate": 1.2202020202020201e-05, "loss": 232.687, "step": 3020 }, { "epoch": 0.02448306789809226, "grad_norm": 1063.8585205078125, "learning_rate": 1.2242424242424242e-05, "loss": 247.0947, "step": 3030 }, { "epoch": 0.024563870102376394, "grad_norm": 3708.090087890625, "learning_rate": 1.2282828282828284e-05, "loss": 252.8591, "step": 3040 }, { "epoch": 0.024644672306660526, "grad_norm": 1465.7147216796875, "learning_rate": 1.2323232323232325e-05, "loss": 262.5838, "step": 3050 }, { "epoch": 0.02472547451094466, "grad_norm": 1327.937255859375, "learning_rate": 1.2363636363636365e-05, "loss": 363.5593, "step": 3060 }, { "epoch": 0.02480627671522879, "grad_norm": 1344.0130615234375, "learning_rate": 1.2404040404040404e-05, "loss": 257.7762, "step": 3070 }, { "epoch": 0.024887078919512924, "grad_norm": 1054.320068359375, "learning_rate": 1.2444444444444445e-05, "loss": 247.01, "step": 3080 }, { "epoch": 0.024967881123797057, "grad_norm": 1068.103515625, "learning_rate": 1.2484848484848485e-05, "loss": 248.3725, "step": 3090 }, { "epoch": 0.02504868332808119, "grad_norm": 1697.66259765625, "learning_rate": 1.2525252525252526e-05, "loss": 258.5855, "step": 3100 }, { "epoch": 0.025129485532365322, "grad_norm": 2154.507080078125, "learning_rate": 1.2565656565656566e-05, "loss": 324.4505, "step": 3110 }, { "epoch": 0.025210287736649455, "grad_norm": 1405.111083984375, "learning_rate": 1.2606060606060607e-05, "loss": 201.9885, "step": 3120 }, { "epoch": 0.025291089940933587, "grad_norm": 1943.2344970703125, "learning_rate": 1.2646464646464647e-05, "loss": 279.3126, "step": 3130 }, { "epoch": 0.02537189214521772, "grad_norm": 1883.3538818359375, "learning_rate": 1.2686868686868688e-05, "loss": 305.4725, "step": 3140 }, { "epoch": 0.025452694349501856, "grad_norm": 2943.758544921875, "learning_rate": 1.2727272727272727e-05, "loss": 282.1084, "step": 3150 }, { "epoch": 0.02553349655378599, "grad_norm": 1242.160400390625, "learning_rate": 1.2767676767676767e-05, "loss": 220.8162, "step": 3160 }, { "epoch": 0.02561429875807012, "grad_norm": 2627.211181640625, "learning_rate": 1.2808080808080808e-05, "loss": 303.4214, "step": 3170 }, { "epoch": 0.025695100962354254, "grad_norm": 1310.1988525390625, "learning_rate": 1.2848484848484848e-05, "loss": 259.6753, "step": 3180 }, { "epoch": 0.025775903166638386, "grad_norm": 1910.4666748046875, "learning_rate": 1.2888888888888889e-05, "loss": 273.7454, "step": 3190 }, { "epoch": 0.02585670537092252, "grad_norm": 740.5687255859375, "learning_rate": 1.292929292929293e-05, "loss": 292.6491, "step": 3200 }, { "epoch": 0.02593750757520665, "grad_norm": 1202.99462890625, "learning_rate": 1.296969696969697e-05, "loss": 243.2686, "step": 3210 }, { "epoch": 0.026018309779490784, "grad_norm": 2174.3525390625, "learning_rate": 1.301010101010101e-05, "loss": 276.3461, "step": 3220 }, { "epoch": 0.026099111983774917, "grad_norm": 1177.9141845703125, "learning_rate": 1.3050505050505051e-05, "loss": 246.7616, "step": 3230 }, { "epoch": 0.02617991418805905, "grad_norm": 1337.3155517578125, "learning_rate": 1.3090909090909093e-05, "loss": 279.5415, "step": 3240 }, { "epoch": 0.026260716392343182, "grad_norm": 970.5191040039062, "learning_rate": 1.3131313131313134e-05, "loss": 240.0207, "step": 3250 }, { "epoch": 0.026341518596627315, "grad_norm": 1119.1689453125, "learning_rate": 1.3171717171717171e-05, "loss": 242.7696, "step": 3260 }, { "epoch": 0.026422320800911447, "grad_norm": 4005.26318359375, "learning_rate": 1.3212121212121212e-05, "loss": 288.4158, "step": 3270 }, { "epoch": 0.02650312300519558, "grad_norm": 2148.187255859375, "learning_rate": 1.3252525252525252e-05, "loss": 224.5919, "step": 3280 }, { "epoch": 0.026583925209479716, "grad_norm": 1367.2222900390625, "learning_rate": 1.3292929292929293e-05, "loss": 212.8641, "step": 3290 }, { "epoch": 0.02666472741376385, "grad_norm": 1278.0506591796875, "learning_rate": 1.3333333333333333e-05, "loss": 222.3518, "step": 3300 }, { "epoch": 0.02674552961804798, "grad_norm": 1361.965087890625, "learning_rate": 1.3373737373737374e-05, "loss": 257.0285, "step": 3310 }, { "epoch": 0.026826331822332114, "grad_norm": 1330.8619384765625, "learning_rate": 1.3414141414141414e-05, "loss": 293.7213, "step": 3320 }, { "epoch": 0.026907134026616247, "grad_norm": 1107.8526611328125, "learning_rate": 1.3454545454545457e-05, "loss": 198.6048, "step": 3330 }, { "epoch": 0.02698793623090038, "grad_norm": 1048.5009765625, "learning_rate": 1.3494949494949497e-05, "loss": 197.4301, "step": 3340 }, { "epoch": 0.027068738435184512, "grad_norm": 2446.47119140625, "learning_rate": 1.3535353535353538e-05, "loss": 287.2842, "step": 3350 }, { "epoch": 0.027149540639468644, "grad_norm": 1396.544921875, "learning_rate": 1.3575757575757578e-05, "loss": 258.2231, "step": 3360 }, { "epoch": 0.027230342843752777, "grad_norm": 904.5388793945312, "learning_rate": 1.3616161616161615e-05, "loss": 376.3808, "step": 3370 }, { "epoch": 0.02731114504803691, "grad_norm": 1247.5994873046875, "learning_rate": 1.3656565656565656e-05, "loss": 318.9619, "step": 3380 }, { "epoch": 0.027391947252321042, "grad_norm": 1123.76220703125, "learning_rate": 1.3696969696969697e-05, "loss": 235.3794, "step": 3390 }, { "epoch": 0.027472749456605175, "grad_norm": 2435.860107421875, "learning_rate": 1.3737373737373737e-05, "loss": 290.7648, "step": 3400 }, { "epoch": 0.027553551660889308, "grad_norm": 1895.08642578125, "learning_rate": 1.3777777777777778e-05, "loss": 228.9499, "step": 3410 }, { "epoch": 0.02763435386517344, "grad_norm": 1057.9578857421875, "learning_rate": 1.3818181818181818e-05, "loss": 199.0401, "step": 3420 }, { "epoch": 0.027715156069457576, "grad_norm": 1296.09130859375, "learning_rate": 1.385858585858586e-05, "loss": 238.6514, "step": 3430 }, { "epoch": 0.02779595827374171, "grad_norm": 3411.5341796875, "learning_rate": 1.3898989898989901e-05, "loss": 299.6798, "step": 3440 }, { "epoch": 0.02787676047802584, "grad_norm": 1198.8946533203125, "learning_rate": 1.3939393939393942e-05, "loss": 258.3361, "step": 3450 }, { "epoch": 0.027957562682309974, "grad_norm": 839.223388671875, "learning_rate": 1.3979797979797982e-05, "loss": 222.8733, "step": 3460 }, { "epoch": 0.028038364886594107, "grad_norm": 1462.1505126953125, "learning_rate": 1.402020202020202e-05, "loss": 206.9738, "step": 3470 }, { "epoch": 0.02811916709087824, "grad_norm": 1066.7890625, "learning_rate": 1.406060606060606e-05, "loss": 190.5506, "step": 3480 }, { "epoch": 0.028199969295162372, "grad_norm": 2753.951171875, "learning_rate": 1.41010101010101e-05, "loss": 210.7065, "step": 3490 }, { "epoch": 0.028280771499446505, "grad_norm": 1700.187744140625, "learning_rate": 1.4141414141414141e-05, "loss": 239.0851, "step": 3500 }, { "epoch": 0.028361573703730637, "grad_norm": 1913.965576171875, "learning_rate": 1.4181818181818181e-05, "loss": 244.5272, "step": 3510 }, { "epoch": 0.02844237590801477, "grad_norm": 1347.6934814453125, "learning_rate": 1.4222222222222224e-05, "loss": 219.5849, "step": 3520 }, { "epoch": 0.028523178112298903, "grad_norm": 894.3698120117188, "learning_rate": 1.4262626262626264e-05, "loss": 261.3107, "step": 3530 }, { "epoch": 0.028603980316583035, "grad_norm": 649.42236328125, "learning_rate": 1.4303030303030305e-05, "loss": 202.5557, "step": 3540 }, { "epoch": 0.028684782520867168, "grad_norm": 824.8812255859375, "learning_rate": 1.4343434343434345e-05, "loss": 245.8003, "step": 3550 }, { "epoch": 0.0287655847251513, "grad_norm": 828.0931396484375, "learning_rate": 1.4383838383838386e-05, "loss": 260.2875, "step": 3560 }, { "epoch": 0.028846386929435437, "grad_norm": 1334.4947509765625, "learning_rate": 1.4424242424242426e-05, "loss": 232.7898, "step": 3570 }, { "epoch": 0.02892718913371957, "grad_norm": 1371.1171875, "learning_rate": 1.4464646464646464e-05, "loss": 418.4771, "step": 3580 }, { "epoch": 0.029007991338003702, "grad_norm": 18497.5234375, "learning_rate": 1.4505050505050504e-05, "loss": 303.3979, "step": 3590 }, { "epoch": 0.029088793542287834, "grad_norm": 1640.417724609375, "learning_rate": 1.4545454545454545e-05, "loss": 246.1203, "step": 3600 }, { "epoch": 0.029169595746571967, "grad_norm": 866.4635620117188, "learning_rate": 1.4585858585858587e-05, "loss": 227.0032, "step": 3610 }, { "epoch": 0.0292503979508561, "grad_norm": 1206.3389892578125, "learning_rate": 1.4626262626262627e-05, "loss": 240.7797, "step": 3620 }, { "epoch": 0.029331200155140232, "grad_norm": 1930.5679931640625, "learning_rate": 1.4666666666666668e-05, "loss": 244.9207, "step": 3630 }, { "epoch": 0.029412002359424365, "grad_norm": 1362.0755615234375, "learning_rate": 1.4707070707070709e-05, "loss": 223.5896, "step": 3640 }, { "epoch": 0.029492804563708497, "grad_norm": 1778.240478515625, "learning_rate": 1.4747474747474749e-05, "loss": 233.0804, "step": 3650 }, { "epoch": 0.02957360676799263, "grad_norm": 1185.7432861328125, "learning_rate": 1.478787878787879e-05, "loss": 269.5211, "step": 3660 }, { "epoch": 0.029654408972276763, "grad_norm": 1272.7274169921875, "learning_rate": 1.482828282828283e-05, "loss": 256.0854, "step": 3670 }, { "epoch": 0.029735211176560895, "grad_norm": 3724.482421875, "learning_rate": 1.486868686868687e-05, "loss": 220.0564, "step": 3680 }, { "epoch": 0.029816013380845028, "grad_norm": 1362.2408447265625, "learning_rate": 1.4909090909090908e-05, "loss": 196.9579, "step": 3690 }, { "epoch": 0.02989681558512916, "grad_norm": 1142.985107421875, "learning_rate": 1.494949494949495e-05, "loss": 298.3712, "step": 3700 }, { "epoch": 0.029977617789413297, "grad_norm": 1711.4461669921875, "learning_rate": 1.498989898989899e-05, "loss": 248.673, "step": 3710 }, { "epoch": 0.03005841999369743, "grad_norm": 1854.973876953125, "learning_rate": 1.5030303030303031e-05, "loss": 178.4528, "step": 3720 }, { "epoch": 0.030139222197981562, "grad_norm": 2415.3564453125, "learning_rate": 1.5070707070707072e-05, "loss": 279.0313, "step": 3730 }, { "epoch": 0.030220024402265695, "grad_norm": 1113.0447998046875, "learning_rate": 1.5111111111111112e-05, "loss": 263.2642, "step": 3740 }, { "epoch": 0.030300826606549827, "grad_norm": 1523.1632080078125, "learning_rate": 1.5151515151515153e-05, "loss": 292.6833, "step": 3750 }, { "epoch": 0.03038162881083396, "grad_norm": 1810.5382080078125, "learning_rate": 1.5191919191919193e-05, "loss": 260.2465, "step": 3760 }, { "epoch": 0.030462431015118092, "grad_norm": 2051.318115234375, "learning_rate": 1.5232323232323234e-05, "loss": 249.5686, "step": 3770 }, { "epoch": 0.030543233219402225, "grad_norm": 1145.482421875, "learning_rate": 1.5272727272727276e-05, "loss": 217.0, "step": 3780 }, { "epoch": 0.030624035423686358, "grad_norm": 1456.9969482421875, "learning_rate": 1.531313131313131e-05, "loss": 247.1355, "step": 3790 }, { "epoch": 0.03070483762797049, "grad_norm": 2063.9072265625, "learning_rate": 1.5353535353535354e-05, "loss": 317.8373, "step": 3800 }, { "epoch": 0.030785639832254623, "grad_norm": 1188.59130859375, "learning_rate": 1.5393939393939393e-05, "loss": 251.0659, "step": 3810 }, { "epoch": 0.030866442036538756, "grad_norm": 542.1653442382812, "learning_rate": 1.5434343434343435e-05, "loss": 205.6288, "step": 3820 }, { "epoch": 0.030947244240822888, "grad_norm": 858.66552734375, "learning_rate": 1.5474747474747474e-05, "loss": 261.1724, "step": 3830 }, { "epoch": 0.031028046445107024, "grad_norm": 1392.4208984375, "learning_rate": 1.5515151515151516e-05, "loss": 263.898, "step": 3840 }, { "epoch": 0.031108848649391157, "grad_norm": 1089.10888671875, "learning_rate": 1.5555555555555555e-05, "loss": 263.4895, "step": 3850 }, { "epoch": 0.03118965085367529, "grad_norm": 1323.1083984375, "learning_rate": 1.5595959595959597e-05, "loss": 224.5914, "step": 3860 }, { "epoch": 0.03127045305795942, "grad_norm": 748.7206420898438, "learning_rate": 1.563636363636364e-05, "loss": 185.8181, "step": 3870 }, { "epoch": 0.031351255262243555, "grad_norm": 1530.072021484375, "learning_rate": 1.5676767676767678e-05, "loss": 295.1081, "step": 3880 }, { "epoch": 0.031432057466527684, "grad_norm": 1390.1978759765625, "learning_rate": 1.571717171717172e-05, "loss": 221.7333, "step": 3890 }, { "epoch": 0.03151285967081182, "grad_norm": 1188.6934814453125, "learning_rate": 1.5757575757575756e-05, "loss": 231.4922, "step": 3900 }, { "epoch": 0.031593661875095956, "grad_norm": 1810.8616943359375, "learning_rate": 1.5797979797979798e-05, "loss": 226.2008, "step": 3910 }, { "epoch": 0.031674464079380085, "grad_norm": 1351.2021484375, "learning_rate": 1.5838383838383837e-05, "loss": 213.8082, "step": 3920 }, { "epoch": 0.03175526628366422, "grad_norm": 1504.8511962890625, "learning_rate": 1.587878787878788e-05, "loss": 237.731, "step": 3930 }, { "epoch": 0.03183606848794835, "grad_norm": 3990.205810546875, "learning_rate": 1.5919191919191918e-05, "loss": 308.5875, "step": 3940 }, { "epoch": 0.03191687069223249, "grad_norm": 1052.4140625, "learning_rate": 1.595959595959596e-05, "loss": 173.6135, "step": 3950 }, { "epoch": 0.031997672896516616, "grad_norm": 818.5986328125, "learning_rate": 1.6000000000000003e-05, "loss": 255.928, "step": 3960 }, { "epoch": 0.03207847510080075, "grad_norm": 3015.482666015625, "learning_rate": 1.604040404040404e-05, "loss": 246.6157, "step": 3970 }, { "epoch": 0.03215927730508488, "grad_norm": 1520.350341796875, "learning_rate": 1.6080808080808084e-05, "loss": 294.5478, "step": 3980 }, { "epoch": 0.03224007950936902, "grad_norm": 1362.8385009765625, "learning_rate": 1.6121212121212123e-05, "loss": 263.2382, "step": 3990 }, { "epoch": 0.032320881713653146, "grad_norm": 1330.2135009765625, "learning_rate": 1.6161616161616165e-05, "loss": 227.9952, "step": 4000 }, { "epoch": 0.03240168391793728, "grad_norm": 2001.479248046875, "learning_rate": 1.62020202020202e-05, "loss": 373.7298, "step": 4010 }, { "epoch": 0.03248248612222141, "grad_norm": 670.8789672851562, "learning_rate": 1.6242424242424243e-05, "loss": 252.2481, "step": 4020 }, { "epoch": 0.03256328832650555, "grad_norm": 1504.35205078125, "learning_rate": 1.628282828282828e-05, "loss": 259.7328, "step": 4030 }, { "epoch": 0.03264409053078968, "grad_norm": 1177.47509765625, "learning_rate": 1.6323232323232324e-05, "loss": 220.6592, "step": 4040 }, { "epoch": 0.03272489273507381, "grad_norm": 889.9537353515625, "learning_rate": 1.6363636363636366e-05, "loss": 210.0868, "step": 4050 }, { "epoch": 0.03280569493935795, "grad_norm": 1655.767333984375, "learning_rate": 1.6404040404040405e-05, "loss": 247.7082, "step": 4060 }, { "epoch": 0.03288649714364208, "grad_norm": 1741.26416015625, "learning_rate": 1.6444444444444447e-05, "loss": 213.1305, "step": 4070 }, { "epoch": 0.032967299347926214, "grad_norm": 1701.3470458984375, "learning_rate": 1.6484848484848486e-05, "loss": 197.4172, "step": 4080 }, { "epoch": 0.03304810155221034, "grad_norm": 1241.48876953125, "learning_rate": 1.6525252525252528e-05, "loss": 200.895, "step": 4090 }, { "epoch": 0.03312890375649448, "grad_norm": 4305.5234375, "learning_rate": 1.6565656565656567e-05, "loss": 270.1561, "step": 4100 }, { "epoch": 0.03320970596077861, "grad_norm": 1233.9559326171875, "learning_rate": 1.6606060606060606e-05, "loss": 234.697, "step": 4110 }, { "epoch": 0.033290508165062745, "grad_norm": 1864.9722900390625, "learning_rate": 1.6646464646464645e-05, "loss": 207.7519, "step": 4120 }, { "epoch": 0.033371310369346874, "grad_norm": 696.45654296875, "learning_rate": 1.6686868686868687e-05, "loss": 260.9977, "step": 4130 }, { "epoch": 0.03345211257363101, "grad_norm": 1083.8914794921875, "learning_rate": 1.672727272727273e-05, "loss": 296.5648, "step": 4140 }, { "epoch": 0.03353291477791514, "grad_norm": 787.8980102539062, "learning_rate": 1.6767676767676768e-05, "loss": 252.0068, "step": 4150 }, { "epoch": 0.033613716982199275, "grad_norm": 3963.899658203125, "learning_rate": 1.680808080808081e-05, "loss": 239.0976, "step": 4160 }, { "epoch": 0.033694519186483404, "grad_norm": 1345.8841552734375, "learning_rate": 1.684848484848485e-05, "loss": 200.5568, "step": 4170 }, { "epoch": 0.03377532139076754, "grad_norm": 1667.1441650390625, "learning_rate": 1.688888888888889e-05, "loss": 246.528, "step": 4180 }, { "epoch": 0.033856123595051676, "grad_norm": 941.3829956054688, "learning_rate": 1.692929292929293e-05, "loss": 188.0032, "step": 4190 }, { "epoch": 0.033936925799335806, "grad_norm": 2224.00048828125, "learning_rate": 1.6969696969696972e-05, "loss": 233.2688, "step": 4200 }, { "epoch": 0.03401772800361994, "grad_norm": 990.577880859375, "learning_rate": 1.701010101010101e-05, "loss": 229.0408, "step": 4210 }, { "epoch": 0.03409853020790407, "grad_norm": 1741.591064453125, "learning_rate": 1.705050505050505e-05, "loss": 210.8973, "step": 4220 }, { "epoch": 0.03417933241218821, "grad_norm": 1565.2149658203125, "learning_rate": 1.7090909090909092e-05, "loss": 172.9691, "step": 4230 }, { "epoch": 0.034260134616472336, "grad_norm": 1411.6668701171875, "learning_rate": 1.713131313131313e-05, "loss": 223.8018, "step": 4240 }, { "epoch": 0.03434093682075647, "grad_norm": 849.447998046875, "learning_rate": 1.7171717171717173e-05, "loss": 277.72, "step": 4250 }, { "epoch": 0.0344217390250406, "grad_norm": 1456.3353271484375, "learning_rate": 1.7212121212121212e-05, "loss": 269.1795, "step": 4260 }, { "epoch": 0.03450254122932474, "grad_norm": 2039.048583984375, "learning_rate": 1.7252525252525255e-05, "loss": 203.6644, "step": 4270 }, { "epoch": 0.03458334343360887, "grad_norm": 1037.1063232421875, "learning_rate": 1.7292929292929293e-05, "loss": 268.1442, "step": 4280 }, { "epoch": 0.034664145637893, "grad_norm": 1481.98095703125, "learning_rate": 1.7333333333333336e-05, "loss": 246.1609, "step": 4290 }, { "epoch": 0.03474494784217713, "grad_norm": 1042.147216796875, "learning_rate": 1.7373737373737375e-05, "loss": 360.0711, "step": 4300 }, { "epoch": 0.03482575004646127, "grad_norm": 1008.8258666992188, "learning_rate": 1.7414141414141417e-05, "loss": 254.1684, "step": 4310 }, { "epoch": 0.0349065522507454, "grad_norm": 1818.73681640625, "learning_rate": 1.7454545454545456e-05, "loss": 248.8469, "step": 4320 }, { "epoch": 0.03498735445502953, "grad_norm": 2598.832763671875, "learning_rate": 1.7494949494949494e-05, "loss": 215.4962, "step": 4330 }, { "epoch": 0.03506815665931367, "grad_norm": 5505.1572265625, "learning_rate": 1.7535353535353537e-05, "loss": 206.1825, "step": 4340 }, { "epoch": 0.0351489588635978, "grad_norm": 872.9111328125, "learning_rate": 1.7575757575757576e-05, "loss": 226.9096, "step": 4350 }, { "epoch": 0.035229761067881935, "grad_norm": 1309.483154296875, "learning_rate": 1.7616161616161618e-05, "loss": 347.4825, "step": 4360 }, { "epoch": 0.035310563272166064, "grad_norm": 1847.357666015625, "learning_rate": 1.7656565656565657e-05, "loss": 283.7126, "step": 4370 }, { "epoch": 0.0353913654764502, "grad_norm": 1132.7510986328125, "learning_rate": 1.76969696969697e-05, "loss": 238.7522, "step": 4380 }, { "epoch": 0.03547216768073433, "grad_norm": 1338.4906005859375, "learning_rate": 1.7737373737373738e-05, "loss": 206.4677, "step": 4390 }, { "epoch": 0.035552969885018465, "grad_norm": 889.9144897460938, "learning_rate": 1.777777777777778e-05, "loss": 218.258, "step": 4400 }, { "epoch": 0.035633772089302594, "grad_norm": 1081.747314453125, "learning_rate": 1.781818181818182e-05, "loss": 227.4267, "step": 4410 }, { "epoch": 0.03571457429358673, "grad_norm": 1337.2747802734375, "learning_rate": 1.785858585858586e-05, "loss": 216.5905, "step": 4420 }, { "epoch": 0.03579537649787086, "grad_norm": 1070.0733642578125, "learning_rate": 1.78989898989899e-05, "loss": 244.2413, "step": 4430 }, { "epoch": 0.035876178702154995, "grad_norm": 2713.52392578125, "learning_rate": 1.793939393939394e-05, "loss": 214.338, "step": 4440 }, { "epoch": 0.035956980906439125, "grad_norm": 1579.244873046875, "learning_rate": 1.797979797979798e-05, "loss": 236.9733, "step": 4450 }, { "epoch": 0.03603778311072326, "grad_norm": 1429.0421142578125, "learning_rate": 1.802020202020202e-05, "loss": 210.1912, "step": 4460 }, { "epoch": 0.0361185853150074, "grad_norm": 1236.484375, "learning_rate": 1.8060606060606062e-05, "loss": 209.4003, "step": 4470 }, { "epoch": 0.036199387519291526, "grad_norm": 2510.634521484375, "learning_rate": 1.81010101010101e-05, "loss": 261.3127, "step": 4480 }, { "epoch": 0.03628018972357566, "grad_norm": 697.04345703125, "learning_rate": 1.8141414141414143e-05, "loss": 202.3308, "step": 4490 }, { "epoch": 0.03636099192785979, "grad_norm": 1664.605712890625, "learning_rate": 1.8181818181818182e-05, "loss": 230.7549, "step": 4500 }, { "epoch": 0.03644179413214393, "grad_norm": 1968.6279296875, "learning_rate": 1.8222222222222224e-05, "loss": 222.0125, "step": 4510 }, { "epoch": 0.036522596336428056, "grad_norm": 1813.247314453125, "learning_rate": 1.8262626262626263e-05, "loss": 206.1146, "step": 4520 }, { "epoch": 0.03660339854071219, "grad_norm": 1681.3162841796875, "learning_rate": 1.8303030303030305e-05, "loss": 281.5203, "step": 4530 }, { "epoch": 0.03668420074499632, "grad_norm": 813.0327758789062, "learning_rate": 1.8343434343434344e-05, "loss": 241.7396, "step": 4540 }, { "epoch": 0.03676500294928046, "grad_norm": 1714.4927978515625, "learning_rate": 1.8383838383838383e-05, "loss": 229.0337, "step": 4550 }, { "epoch": 0.03684580515356459, "grad_norm": 1173.26318359375, "learning_rate": 1.8424242424242425e-05, "loss": 167.8814, "step": 4560 }, { "epoch": 0.03692660735784872, "grad_norm": 1044.22509765625, "learning_rate": 1.8464646464646464e-05, "loss": 181.4134, "step": 4570 }, { "epoch": 0.03700740956213285, "grad_norm": 1544.4964599609375, "learning_rate": 1.8505050505050506e-05, "loss": 264.1711, "step": 4580 }, { "epoch": 0.03708821176641699, "grad_norm": 3204.8271484375, "learning_rate": 1.8545454545454545e-05, "loss": 209.0515, "step": 4590 }, { "epoch": 0.03716901397070112, "grad_norm": 1948.9998779296875, "learning_rate": 1.8585858585858588e-05, "loss": 204.1481, "step": 4600 }, { "epoch": 0.037249816174985254, "grad_norm": 985.3388671875, "learning_rate": 1.8626262626262626e-05, "loss": 251.0652, "step": 4610 }, { "epoch": 0.03733061837926939, "grad_norm": 4716.29833984375, "learning_rate": 1.866666666666667e-05, "loss": 234.005, "step": 4620 }, { "epoch": 0.03741142058355352, "grad_norm": 2745.129150390625, "learning_rate": 1.8707070707070707e-05, "loss": 222.8053, "step": 4630 }, { "epoch": 0.037492222787837655, "grad_norm": 852.2494506835938, "learning_rate": 1.874747474747475e-05, "loss": 244.6, "step": 4640 }, { "epoch": 0.037573024992121784, "grad_norm": 1276.906494140625, "learning_rate": 1.878787878787879e-05, "loss": 243.4739, "step": 4650 }, { "epoch": 0.03765382719640592, "grad_norm": 2488.490478515625, "learning_rate": 1.8828282828282827e-05, "loss": 241.5105, "step": 4660 }, { "epoch": 0.03773462940069005, "grad_norm": 1208.5731201171875, "learning_rate": 1.886868686868687e-05, "loss": 266.4298, "step": 4670 }, { "epoch": 0.037815431604974185, "grad_norm": 1110.9935302734375, "learning_rate": 1.890909090909091e-05, "loss": 220.2013, "step": 4680 }, { "epoch": 0.037896233809258315, "grad_norm": 966.4763793945312, "learning_rate": 1.894949494949495e-05, "loss": 213.4089, "step": 4690 }, { "epoch": 0.03797703601354245, "grad_norm": 888.4136352539062, "learning_rate": 1.898989898989899e-05, "loss": 192.6133, "step": 4700 }, { "epoch": 0.03805783821782658, "grad_norm": 1441.930419921875, "learning_rate": 1.9030303030303032e-05, "loss": 210.6855, "step": 4710 }, { "epoch": 0.038138640422110716, "grad_norm": 1268.2919921875, "learning_rate": 1.907070707070707e-05, "loss": 196.9399, "step": 4720 }, { "epoch": 0.038219442626394845, "grad_norm": 714.101318359375, "learning_rate": 1.9111111111111113e-05, "loss": 236.1493, "step": 4730 }, { "epoch": 0.03830024483067898, "grad_norm": 1360.3662109375, "learning_rate": 1.9151515151515155e-05, "loss": 277.1614, "step": 4740 }, { "epoch": 0.03838104703496312, "grad_norm": 857.1802368164062, "learning_rate": 1.919191919191919e-05, "loss": 233.6975, "step": 4750 }, { "epoch": 0.038461849239247246, "grad_norm": 1430.3370361328125, "learning_rate": 1.9232323232323233e-05, "loss": 206.9375, "step": 4760 }, { "epoch": 0.03854265144353138, "grad_norm": 999.745849609375, "learning_rate": 1.9272727272727272e-05, "loss": 177.6682, "step": 4770 }, { "epoch": 0.03862345364781551, "grad_norm": 1979.0234375, "learning_rate": 1.9313131313131314e-05, "loss": 237.5471, "step": 4780 }, { "epoch": 0.03870425585209965, "grad_norm": 1399.9544677734375, "learning_rate": 1.9353535353535353e-05, "loss": 209.8267, "step": 4790 }, { "epoch": 0.03878505805638378, "grad_norm": 1058.5128173828125, "learning_rate": 1.9393939393939395e-05, "loss": 206.1269, "step": 4800 }, { "epoch": 0.03886586026066791, "grad_norm": 1852.674072265625, "learning_rate": 1.9434343434343434e-05, "loss": 192.6013, "step": 4810 }, { "epoch": 0.03894666246495204, "grad_norm": 1104.2967529296875, "learning_rate": 1.9474747474747476e-05, "loss": 252.5522, "step": 4820 }, { "epoch": 0.03902746466923618, "grad_norm": 1426.0396728515625, "learning_rate": 1.951515151515152e-05, "loss": 250.6448, "step": 4830 }, { "epoch": 0.03910826687352031, "grad_norm": 1632.4510498046875, "learning_rate": 1.9555555555555557e-05, "loss": 163.3638, "step": 4840 }, { "epoch": 0.03918906907780444, "grad_norm": 700.0907592773438, "learning_rate": 1.95959595959596e-05, "loss": 236.7388, "step": 4850 }, { "epoch": 0.03926987128208857, "grad_norm": 1205.572265625, "learning_rate": 1.9636363636363635e-05, "loss": 272.2705, "step": 4860 }, { "epoch": 0.03935067348637271, "grad_norm": 799.412353515625, "learning_rate": 1.9676767676767677e-05, "loss": 171.4291, "step": 4870 }, { "epoch": 0.03943147569065684, "grad_norm": 1350.2025146484375, "learning_rate": 1.9717171717171716e-05, "loss": 233.9921, "step": 4880 }, { "epoch": 0.039512277894940974, "grad_norm": 976.219970703125, "learning_rate": 1.975757575757576e-05, "loss": 189.0711, "step": 4890 }, { "epoch": 0.03959308009922511, "grad_norm": 947.8401489257812, "learning_rate": 1.9797979797979797e-05, "loss": 207.2786, "step": 4900 }, { "epoch": 0.03967388230350924, "grad_norm": 1402.2440185546875, "learning_rate": 1.983838383838384e-05, "loss": 233.6717, "step": 4910 }, { "epoch": 0.039754684507793375, "grad_norm": 2319.2314453125, "learning_rate": 1.987878787878788e-05, "loss": 268.4254, "step": 4920 }, { "epoch": 0.039835486712077504, "grad_norm": 1344.019775390625, "learning_rate": 1.991919191919192e-05, "loss": 215.5304, "step": 4930 }, { "epoch": 0.03991628891636164, "grad_norm": 1209.1622314453125, "learning_rate": 1.9959595959595963e-05, "loss": 202.8059, "step": 4940 }, { "epoch": 0.03999709112064577, "grad_norm": 1872.3892822265625, "learning_rate": 2e-05, "loss": 193.5764, "step": 4950 }, { "epoch": 0.040077893324929906, "grad_norm": 1944.2449951171875, "learning_rate": 2.0040404040404044e-05, "loss": 273.8487, "step": 4960 }, { "epoch": 0.040158695529214035, "grad_norm": 988.1495361328125, "learning_rate": 2.008080808080808e-05, "loss": 202.3245, "step": 4970 }, { "epoch": 0.04023949773349817, "grad_norm": 1082.6280517578125, "learning_rate": 2.012121212121212e-05, "loss": 190.6009, "step": 4980 }, { "epoch": 0.0403202999377823, "grad_norm": 1510.5738525390625, "learning_rate": 2.016161616161616e-05, "loss": 262.4282, "step": 4990 }, { "epoch": 0.040401102142066436, "grad_norm": 1080.0328369140625, "learning_rate": 2.0202020202020203e-05, "loss": 179.5178, "step": 5000 }, { "epoch": 0.040481904346350565, "grad_norm": 1204.5341796875, "learning_rate": 2.0242424242424245e-05, "loss": 208.4234, "step": 5010 }, { "epoch": 0.0405627065506347, "grad_norm": 788.6203002929688, "learning_rate": 2.0282828282828284e-05, "loss": 222.1854, "step": 5020 }, { "epoch": 0.04064350875491884, "grad_norm": 2447.934326171875, "learning_rate": 2.0323232323232326e-05, "loss": 183.1969, "step": 5030 }, { "epoch": 0.04072431095920297, "grad_norm": 1879.5914306640625, "learning_rate": 2.0363636363636365e-05, "loss": 235.8428, "step": 5040 }, { "epoch": 0.0408051131634871, "grad_norm": 859.5083618164062, "learning_rate": 2.0404040404040407e-05, "loss": 223.1974, "step": 5050 }, { "epoch": 0.04088591536777123, "grad_norm": 591.982421875, "learning_rate": 2.0444444444444446e-05, "loss": 195.905, "step": 5060 }, { "epoch": 0.04096671757205537, "grad_norm": 2516.256103515625, "learning_rate": 2.0484848484848485e-05, "loss": 224.4586, "step": 5070 }, { "epoch": 0.0410475197763395, "grad_norm": 1155.78271484375, "learning_rate": 2.0525252525252524e-05, "loss": 237.8034, "step": 5080 }, { "epoch": 0.04112832198062363, "grad_norm": 760.8511962890625, "learning_rate": 2.0565656565656566e-05, "loss": 213.4372, "step": 5090 }, { "epoch": 0.04120912418490776, "grad_norm": 746.3182983398438, "learning_rate": 2.0606060606060608e-05, "loss": 246.9279, "step": 5100 }, { "epoch": 0.0412899263891919, "grad_norm": 1112.6119384765625, "learning_rate": 2.0646464646464647e-05, "loss": 215.4636, "step": 5110 }, { "epoch": 0.04137072859347603, "grad_norm": 1308.880126953125, "learning_rate": 2.068686868686869e-05, "loss": 184.3576, "step": 5120 }, { "epoch": 0.041451530797760164, "grad_norm": 1182.3695068359375, "learning_rate": 2.0727272727272728e-05, "loss": 251.3663, "step": 5130 }, { "epoch": 0.04153233300204429, "grad_norm": 3545.449951171875, "learning_rate": 2.076767676767677e-05, "loss": 221.7183, "step": 5140 }, { "epoch": 0.04161313520632843, "grad_norm": 1155.616455078125, "learning_rate": 2.080808080808081e-05, "loss": 181.7703, "step": 5150 }, { "epoch": 0.04169393741061256, "grad_norm": 927.0892333984375, "learning_rate": 2.084848484848485e-05, "loss": 242.7771, "step": 5160 }, { "epoch": 0.041774739614896694, "grad_norm": 1621.09326171875, "learning_rate": 2.088888888888889e-05, "loss": 168.8398, "step": 5170 }, { "epoch": 0.04185554181918083, "grad_norm": 1823.0281982421875, "learning_rate": 2.092929292929293e-05, "loss": 226.3993, "step": 5180 }, { "epoch": 0.04193634402346496, "grad_norm": 1904.581298828125, "learning_rate": 2.096969696969697e-05, "loss": 274.6904, "step": 5190 }, { "epoch": 0.042017146227749096, "grad_norm": 1195.8973388671875, "learning_rate": 2.101010101010101e-05, "loss": 193.929, "step": 5200 }, { "epoch": 0.042097948432033225, "grad_norm": 809.5712890625, "learning_rate": 2.1050505050505052e-05, "loss": 183.259, "step": 5210 }, { "epoch": 0.04217875063631736, "grad_norm": 1392.5491943359375, "learning_rate": 2.109090909090909e-05, "loss": 220.0326, "step": 5220 }, { "epoch": 0.04225955284060149, "grad_norm": 1818.6051025390625, "learning_rate": 2.1131313131313134e-05, "loss": 209.3423, "step": 5230 }, { "epoch": 0.042340355044885626, "grad_norm": 756.583740234375, "learning_rate": 2.1171717171717172e-05, "loss": 152.79, "step": 5240 }, { "epoch": 0.042421157249169755, "grad_norm": 1358.5194091796875, "learning_rate": 2.1212121212121215e-05, "loss": 223.6846, "step": 5250 }, { "epoch": 0.04250195945345389, "grad_norm": 2302.727783203125, "learning_rate": 2.1252525252525254e-05, "loss": 206.5412, "step": 5260 }, { "epoch": 0.04258276165773802, "grad_norm": 1090.666259765625, "learning_rate": 2.1292929292929296e-05, "loss": 197.9379, "step": 5270 }, { "epoch": 0.04266356386202216, "grad_norm": 1535.5264892578125, "learning_rate": 2.1333333333333335e-05, "loss": 172.5529, "step": 5280 }, { "epoch": 0.042744366066306286, "grad_norm": 1242.1055908203125, "learning_rate": 2.1373737373737373e-05, "loss": 182.2667, "step": 5290 }, { "epoch": 0.04282516827059042, "grad_norm": 1571.221923828125, "learning_rate": 2.1414141414141416e-05, "loss": 206.9152, "step": 5300 }, { "epoch": 0.04290597047487456, "grad_norm": 1733.92578125, "learning_rate": 2.1454545454545455e-05, "loss": 253.6228, "step": 5310 }, { "epoch": 0.04298677267915869, "grad_norm": 1736.4722900390625, "learning_rate": 2.1494949494949497e-05, "loss": 209.5105, "step": 5320 }, { "epoch": 0.04306757488344282, "grad_norm": 846.6854248046875, "learning_rate": 2.1535353535353536e-05, "loss": 227.6331, "step": 5330 }, { "epoch": 0.04314837708772695, "grad_norm": 793.491943359375, "learning_rate": 2.1575757575757578e-05, "loss": 190.9206, "step": 5340 }, { "epoch": 0.04322917929201109, "grad_norm": 1314.4940185546875, "learning_rate": 2.1616161616161617e-05, "loss": 278.1586, "step": 5350 }, { "epoch": 0.04330998149629522, "grad_norm": 1807.1669921875, "learning_rate": 2.165656565656566e-05, "loss": 246.6954, "step": 5360 }, { "epoch": 0.043390783700579354, "grad_norm": 1456.6739501953125, "learning_rate": 2.1696969696969698e-05, "loss": 198.1051, "step": 5370 }, { "epoch": 0.04347158590486348, "grad_norm": 2645.863037109375, "learning_rate": 2.173737373737374e-05, "loss": 263.7012, "step": 5380 }, { "epoch": 0.04355238810914762, "grad_norm": 890.2818603515625, "learning_rate": 2.177777777777778e-05, "loss": 206.08, "step": 5390 }, { "epoch": 0.04363319031343175, "grad_norm": 1066.948974609375, "learning_rate": 2.1818181818181818e-05, "loss": 203.3024, "step": 5400 }, { "epoch": 0.043713992517715884, "grad_norm": 1678.3651123046875, "learning_rate": 2.185858585858586e-05, "loss": 287.4994, "step": 5410 }, { "epoch": 0.04379479472200001, "grad_norm": 1427.133544921875, "learning_rate": 2.18989898989899e-05, "loss": 236.3808, "step": 5420 }, { "epoch": 0.04387559692628415, "grad_norm": 993.3723754882812, "learning_rate": 2.193939393939394e-05, "loss": 221.2247, "step": 5430 }, { "epoch": 0.04395639913056828, "grad_norm": 919.2279663085938, "learning_rate": 2.197979797979798e-05, "loss": 232.8961, "step": 5440 }, { "epoch": 0.044037201334852415, "grad_norm": 1196.51904296875, "learning_rate": 2.2020202020202022e-05, "loss": 208.7773, "step": 5450 }, { "epoch": 0.04411800353913655, "grad_norm": 937.6903076171875, "learning_rate": 2.206060606060606e-05, "loss": 159.7425, "step": 5460 }, { "epoch": 0.04419880574342068, "grad_norm": 2946.419921875, "learning_rate": 2.2101010101010103e-05, "loss": 201.0844, "step": 5470 }, { "epoch": 0.044279607947704816, "grad_norm": 1663.4422607421875, "learning_rate": 2.2141414141414142e-05, "loss": 140.8333, "step": 5480 }, { "epoch": 0.044360410151988945, "grad_norm": 1202.589599609375, "learning_rate": 2.2181818181818184e-05, "loss": 210.6169, "step": 5490 }, { "epoch": 0.04444121235627308, "grad_norm": 1676.0555419921875, "learning_rate": 2.2222222222222223e-05, "loss": 300.5972, "step": 5500 }, { "epoch": 0.04452201456055721, "grad_norm": 1122.7333984375, "learning_rate": 2.2262626262626262e-05, "loss": 223.7688, "step": 5510 }, { "epoch": 0.044602816764841346, "grad_norm": 842.3754272460938, "learning_rate": 2.2303030303030304e-05, "loss": 231.1573, "step": 5520 }, { "epoch": 0.044683618969125476, "grad_norm": 912.3519897460938, "learning_rate": 2.2343434343434343e-05, "loss": 161.5479, "step": 5530 }, { "epoch": 0.04476442117340961, "grad_norm": 2117.377197265625, "learning_rate": 2.2383838383838385e-05, "loss": 178.251, "step": 5540 }, { "epoch": 0.04484522337769374, "grad_norm": 1402.2164306640625, "learning_rate": 2.2424242424242424e-05, "loss": 209.2086, "step": 5550 }, { "epoch": 0.04492602558197788, "grad_norm": 1458.323974609375, "learning_rate": 2.2464646464646467e-05, "loss": 243.9479, "step": 5560 }, { "epoch": 0.045006827786262006, "grad_norm": 2175.216796875, "learning_rate": 2.2505050505050505e-05, "loss": 189.8892, "step": 5570 }, { "epoch": 0.04508762999054614, "grad_norm": 1899.4354248046875, "learning_rate": 2.2545454545454548e-05, "loss": 335.0552, "step": 5580 }, { "epoch": 0.04516843219483028, "grad_norm": 1230.814697265625, "learning_rate": 2.2585858585858587e-05, "loss": 194.9335, "step": 5590 }, { "epoch": 0.04524923439911441, "grad_norm": 2101.527587890625, "learning_rate": 2.262626262626263e-05, "loss": 257.3806, "step": 5600 }, { "epoch": 0.045330036603398544, "grad_norm": 1695.30810546875, "learning_rate": 2.2666666666666668e-05, "loss": 219.7137, "step": 5610 }, { "epoch": 0.04541083880768267, "grad_norm": 1386.2855224609375, "learning_rate": 2.2707070707070706e-05, "loss": 236.2214, "step": 5620 }, { "epoch": 0.04549164101196681, "grad_norm": 1138.779052734375, "learning_rate": 2.274747474747475e-05, "loss": 192.9845, "step": 5630 }, { "epoch": 0.04557244321625094, "grad_norm": 2650.991943359375, "learning_rate": 2.2787878787878788e-05, "loss": 233.3904, "step": 5640 }, { "epoch": 0.045653245420535074, "grad_norm": 1309.0333251953125, "learning_rate": 2.282828282828283e-05, "loss": 225.3846, "step": 5650 }, { "epoch": 0.0457340476248192, "grad_norm": 930.385009765625, "learning_rate": 2.286868686868687e-05, "loss": 236.1336, "step": 5660 }, { "epoch": 0.04581484982910334, "grad_norm": 1646.2891845703125, "learning_rate": 2.290909090909091e-05, "loss": 227.3526, "step": 5670 }, { "epoch": 0.04589565203338747, "grad_norm": 2285.751708984375, "learning_rate": 2.294949494949495e-05, "loss": 236.6346, "step": 5680 }, { "epoch": 0.045976454237671605, "grad_norm": 3180.75537109375, "learning_rate": 2.2989898989898992e-05, "loss": 177.5457, "step": 5690 }, { "epoch": 0.046057256441955734, "grad_norm": 1423.35009765625, "learning_rate": 2.3030303030303034e-05, "loss": 194.2139, "step": 5700 }, { "epoch": 0.04613805864623987, "grad_norm": 1577.701171875, "learning_rate": 2.307070707070707e-05, "loss": 183.8717, "step": 5710 }, { "epoch": 0.046218860850524, "grad_norm": 1255.1485595703125, "learning_rate": 2.3111111111111112e-05, "loss": 213.8492, "step": 5720 }, { "epoch": 0.046299663054808135, "grad_norm": 1154.9453125, "learning_rate": 2.315151515151515e-05, "loss": 219.6154, "step": 5730 }, { "epoch": 0.04638046525909227, "grad_norm": 3208.9140625, "learning_rate": 2.3191919191919193e-05, "loss": 212.2527, "step": 5740 }, { "epoch": 0.0464612674633764, "grad_norm": 826.8831787109375, "learning_rate": 2.3232323232323232e-05, "loss": 193.0573, "step": 5750 }, { "epoch": 0.046542069667660536, "grad_norm": 953.578369140625, "learning_rate": 2.3272727272727274e-05, "loss": 200.1285, "step": 5760 }, { "epoch": 0.046622871871944666, "grad_norm": 948.6517944335938, "learning_rate": 2.3313131313131313e-05, "loss": 226.3946, "step": 5770 }, { "epoch": 0.0467036740762288, "grad_norm": 1502.9415283203125, "learning_rate": 2.3353535353535355e-05, "loss": 301.4247, "step": 5780 }, { "epoch": 0.04678447628051293, "grad_norm": 592.7190551757812, "learning_rate": 2.3393939393939397e-05, "loss": 171.6613, "step": 5790 }, { "epoch": 0.04686527848479707, "grad_norm": 774.3163452148438, "learning_rate": 2.3434343434343436e-05, "loss": 174.7567, "step": 5800 }, { "epoch": 0.046946080689081196, "grad_norm": 1000.3840942382812, "learning_rate": 2.347474747474748e-05, "loss": 140.1143, "step": 5810 }, { "epoch": 0.04702688289336533, "grad_norm": 1050.761474609375, "learning_rate": 2.3515151515151514e-05, "loss": 234.2542, "step": 5820 }, { "epoch": 0.04710768509764946, "grad_norm": 1076.979248046875, "learning_rate": 2.3555555555555556e-05, "loss": 170.2877, "step": 5830 }, { "epoch": 0.0471884873019336, "grad_norm": 1528.865478515625, "learning_rate": 2.3595959595959595e-05, "loss": 280.3715, "step": 5840 }, { "epoch": 0.047269289506217727, "grad_norm": 1554.0205078125, "learning_rate": 2.3636363636363637e-05, "loss": 258.9206, "step": 5850 }, { "epoch": 0.04735009171050186, "grad_norm": 969.7879028320312, "learning_rate": 2.3676767676767676e-05, "loss": 173.4592, "step": 5860 }, { "epoch": 0.047430893914786, "grad_norm": 1271.55322265625, "learning_rate": 2.371717171717172e-05, "loss": 187.7373, "step": 5870 }, { "epoch": 0.04751169611907013, "grad_norm": 757.3799438476562, "learning_rate": 2.375757575757576e-05, "loss": 206.0978, "step": 5880 }, { "epoch": 0.047592498323354264, "grad_norm": 1099.2119140625, "learning_rate": 2.37979797979798e-05, "loss": 191.4486, "step": 5890 }, { "epoch": 0.04767330052763839, "grad_norm": 895.0558471679688, "learning_rate": 2.3838383838383842e-05, "loss": 197.1677, "step": 5900 }, { "epoch": 0.04775410273192253, "grad_norm": 900.752685546875, "learning_rate": 2.387878787878788e-05, "loss": 209.3482, "step": 5910 }, { "epoch": 0.04783490493620666, "grad_norm": 865.3425903320312, "learning_rate": 2.3919191919191923e-05, "loss": 211.704, "step": 5920 }, { "epoch": 0.047915707140490794, "grad_norm": 1376.961181640625, "learning_rate": 2.395959595959596e-05, "loss": 197.2012, "step": 5930 }, { "epoch": 0.047996509344774924, "grad_norm": 2671.92236328125, "learning_rate": 2.4e-05, "loss": 262.8319, "step": 5940 }, { "epoch": 0.04807731154905906, "grad_norm": 4328.66552734375, "learning_rate": 2.404040404040404e-05, "loss": 263.4226, "step": 5950 }, { "epoch": 0.04815811375334319, "grad_norm": 1454.4398193359375, "learning_rate": 2.4080808080808082e-05, "loss": 173.7909, "step": 5960 }, { "epoch": 0.048238915957627325, "grad_norm": 1238.2913818359375, "learning_rate": 2.4121212121212124e-05, "loss": 190.8571, "step": 5970 }, { "epoch": 0.048319718161911454, "grad_norm": 1106.6146240234375, "learning_rate": 2.4161616161616163e-05, "loss": 252.6962, "step": 5980 }, { "epoch": 0.04840052036619559, "grad_norm": 1612.1171875, "learning_rate": 2.4202020202020205e-05, "loss": 176.1714, "step": 5990 }, { "epoch": 0.04848132257047972, "grad_norm": 684.4707641601562, "learning_rate": 2.4242424242424244e-05, "loss": 236.6299, "step": 6000 }, { "epoch": 0.048562124774763855, "grad_norm": 5278.638671875, "learning_rate": 2.4282828282828286e-05, "loss": 200.9588, "step": 6010 }, { "epoch": 0.04864292697904799, "grad_norm": 2136.859375, "learning_rate": 2.4323232323232325e-05, "loss": 249.8048, "step": 6020 }, { "epoch": 0.04872372918333212, "grad_norm": 704.8456420898438, "learning_rate": 2.4363636363636364e-05, "loss": 210.816, "step": 6030 }, { "epoch": 0.04880453138761626, "grad_norm": 2405.291259765625, "learning_rate": 2.4404040404040403e-05, "loss": 180.7068, "step": 6040 }, { "epoch": 0.048885333591900386, "grad_norm": 1121.5928955078125, "learning_rate": 2.4444444444444445e-05, "loss": 268.1764, "step": 6050 }, { "epoch": 0.04896613579618452, "grad_norm": 1185.4925537109375, "learning_rate": 2.4484848484848484e-05, "loss": 252.5901, "step": 6060 }, { "epoch": 0.04904693800046865, "grad_norm": 1037.7261962890625, "learning_rate": 2.4525252525252526e-05, "loss": 217.7089, "step": 6070 }, { "epoch": 0.04912774020475279, "grad_norm": 3574.91943359375, "learning_rate": 2.4565656565656568e-05, "loss": 248.0757, "step": 6080 }, { "epoch": 0.049208542409036916, "grad_norm": 1335.7510986328125, "learning_rate": 2.4606060606060607e-05, "loss": 243.5903, "step": 6090 }, { "epoch": 0.04928934461332105, "grad_norm": 1548.2281494140625, "learning_rate": 2.464646464646465e-05, "loss": 204.2808, "step": 6100 }, { "epoch": 0.04937014681760518, "grad_norm": 1327.641357421875, "learning_rate": 2.4686868686868688e-05, "loss": 175.3226, "step": 6110 }, { "epoch": 0.04945094902188932, "grad_norm": 1096.567626953125, "learning_rate": 2.472727272727273e-05, "loss": 251.3891, "step": 6120 }, { "epoch": 0.04953175122617345, "grad_norm": 916.0780639648438, "learning_rate": 2.476767676767677e-05, "loss": 265.5964, "step": 6130 }, { "epoch": 0.04961255343045758, "grad_norm": 3319.821533203125, "learning_rate": 2.4808080808080808e-05, "loss": 199.656, "step": 6140 }, { "epoch": 0.04969335563474172, "grad_norm": 804.5398559570312, "learning_rate": 2.4848484848484847e-05, "loss": 176.793, "step": 6150 }, { "epoch": 0.04977415783902585, "grad_norm": 1266.6590576171875, "learning_rate": 2.488888888888889e-05, "loss": 172.6065, "step": 6160 }, { "epoch": 0.049854960043309984, "grad_norm": 953.1856689453125, "learning_rate": 2.492929292929293e-05, "loss": 259.3056, "step": 6170 }, { "epoch": 0.049935762247594113, "grad_norm": 1643.679443359375, "learning_rate": 2.496969696969697e-05, "loss": 227.5671, "step": 6180 }, { "epoch": 0.05001656445187825, "grad_norm": 2092.3837890625, "learning_rate": 2.5010101010101013e-05, "loss": 231.7141, "step": 6190 }, { "epoch": 0.05009736665616238, "grad_norm": 5872.7822265625, "learning_rate": 2.505050505050505e-05, "loss": 307.3282, "step": 6200 }, { "epoch": 0.050178168860446515, "grad_norm": 1653.10888671875, "learning_rate": 2.5090909090909094e-05, "loss": 290.71, "step": 6210 }, { "epoch": 0.050258971064730644, "grad_norm": 5940.2861328125, "learning_rate": 2.5131313131313133e-05, "loss": 298.1718, "step": 6220 }, { "epoch": 0.05033977326901478, "grad_norm": 1056.6617431640625, "learning_rate": 2.5171717171717175e-05, "loss": 167.573, "step": 6230 }, { "epoch": 0.05042057547329891, "grad_norm": 1492.5479736328125, "learning_rate": 2.5212121212121214e-05, "loss": 209.9481, "step": 6240 }, { "epoch": 0.050501377677583045, "grad_norm": 764.5651245117188, "learning_rate": 2.5252525252525256e-05, "loss": 189.9945, "step": 6250 }, { "epoch": 0.050582179881867174, "grad_norm": 2933.18603515625, "learning_rate": 2.5292929292929295e-05, "loss": 228.2252, "step": 6260 }, { "epoch": 0.05066298208615131, "grad_norm": 2692.583740234375, "learning_rate": 2.5333333333333337e-05, "loss": 217.1123, "step": 6270 }, { "epoch": 0.05074378429043544, "grad_norm": 1611.5694580078125, "learning_rate": 2.5373737373737376e-05, "loss": 199.2745, "step": 6280 }, { "epoch": 0.050824586494719576, "grad_norm": 638.3251953125, "learning_rate": 2.5414141414141418e-05, "loss": 283.4336, "step": 6290 }, { "epoch": 0.05090538869900371, "grad_norm": 960.7551879882812, "learning_rate": 2.5454545454545454e-05, "loss": 199.2895, "step": 6300 }, { "epoch": 0.05098619090328784, "grad_norm": 1416.6865234375, "learning_rate": 2.5494949494949492e-05, "loss": 247.6437, "step": 6310 }, { "epoch": 0.05106699310757198, "grad_norm": 962.5587158203125, "learning_rate": 2.5535353535353535e-05, "loss": 222.514, "step": 6320 }, { "epoch": 0.051147795311856106, "grad_norm": 1019.0704956054688, "learning_rate": 2.5575757575757573e-05, "loss": 233.7968, "step": 6330 }, { "epoch": 0.05122859751614024, "grad_norm": 1380.1087646484375, "learning_rate": 2.5616161616161616e-05, "loss": 203.472, "step": 6340 }, { "epoch": 0.05130939972042437, "grad_norm": 765.1551513671875, "learning_rate": 2.5656565656565658e-05, "loss": 202.9591, "step": 6350 }, { "epoch": 0.05139020192470851, "grad_norm": 854.4512329101562, "learning_rate": 2.5696969696969697e-05, "loss": 152.8654, "step": 6360 }, { "epoch": 0.05147100412899264, "grad_norm": 1366.1529541015625, "learning_rate": 2.573737373737374e-05, "loss": 202.4912, "step": 6370 }, { "epoch": 0.05155180633327677, "grad_norm": 812.153564453125, "learning_rate": 2.5777777777777778e-05, "loss": 190.2283, "step": 6380 }, { "epoch": 0.0516326085375609, "grad_norm": 2072.30029296875, "learning_rate": 2.581818181818182e-05, "loss": 250.4601, "step": 6390 }, { "epoch": 0.05171341074184504, "grad_norm": 1064.25732421875, "learning_rate": 2.585858585858586e-05, "loss": 243.8253, "step": 6400 }, { "epoch": 0.05179421294612917, "grad_norm": 1004.585205078125, "learning_rate": 2.58989898989899e-05, "loss": 233.6981, "step": 6410 }, { "epoch": 0.0518750151504133, "grad_norm": 781.0443115234375, "learning_rate": 2.593939393939394e-05, "loss": 231.5708, "step": 6420 }, { "epoch": 0.05195581735469744, "grad_norm": 1038.6923828125, "learning_rate": 2.5979797979797982e-05, "loss": 166.9408, "step": 6430 }, { "epoch": 0.05203661955898157, "grad_norm": 1369.49560546875, "learning_rate": 2.602020202020202e-05, "loss": 212.7086, "step": 6440 }, { "epoch": 0.052117421763265705, "grad_norm": 1065.5115966796875, "learning_rate": 2.6060606060606063e-05, "loss": 197.232, "step": 6450 }, { "epoch": 0.052198223967549834, "grad_norm": 1192.5135498046875, "learning_rate": 2.6101010101010102e-05, "loss": 210.3559, "step": 6460 }, { "epoch": 0.05227902617183397, "grad_norm": 2817.4658203125, "learning_rate": 2.6141414141414145e-05, "loss": 218.5129, "step": 6470 }, { "epoch": 0.0523598283761181, "grad_norm": 1661.2547607421875, "learning_rate": 2.6181818181818187e-05, "loss": 251.3214, "step": 6480 }, { "epoch": 0.052440630580402235, "grad_norm": 1465.83251953125, "learning_rate": 2.6222222222222226e-05, "loss": 167.4487, "step": 6490 }, { "epoch": 0.052521432784686364, "grad_norm": 1172.0814208984375, "learning_rate": 2.6262626262626268e-05, "loss": 202.9199, "step": 6500 }, { "epoch": 0.0526022349889705, "grad_norm": 845.3886108398438, "learning_rate": 2.63030303030303e-05, "loss": 245.5335, "step": 6510 }, { "epoch": 0.05268303719325463, "grad_norm": 1505.1903076171875, "learning_rate": 2.6343434343434342e-05, "loss": 189.6107, "step": 6520 }, { "epoch": 0.052763839397538766, "grad_norm": 855.8611450195312, "learning_rate": 2.6383838383838384e-05, "loss": 168.8279, "step": 6530 }, { "epoch": 0.052844641601822895, "grad_norm": 1719.1915283203125, "learning_rate": 2.6424242424242423e-05, "loss": 266.6213, "step": 6540 }, { "epoch": 0.05292544380610703, "grad_norm": 1334.455322265625, "learning_rate": 2.6464646464646466e-05, "loss": 154.8023, "step": 6550 }, { "epoch": 0.05300624601039116, "grad_norm": 1549.58154296875, "learning_rate": 2.6505050505050504e-05, "loss": 188.1264, "step": 6560 }, { "epoch": 0.053087048214675296, "grad_norm": 740.02587890625, "learning_rate": 2.6545454545454547e-05, "loss": 241.7192, "step": 6570 }, { "epoch": 0.05316785041895943, "grad_norm": 1759.15869140625, "learning_rate": 2.6585858585858585e-05, "loss": 249.789, "step": 6580 }, { "epoch": 0.05324865262324356, "grad_norm": 1615.3770751953125, "learning_rate": 2.6626262626262628e-05, "loss": 256.1343, "step": 6590 }, { "epoch": 0.0533294548275277, "grad_norm": 1187.103515625, "learning_rate": 2.6666666666666667e-05, "loss": 153.0139, "step": 6600 }, { "epoch": 0.05341025703181183, "grad_norm": 835.982177734375, "learning_rate": 2.670707070707071e-05, "loss": 240.5858, "step": 6610 }, { "epoch": 0.05349105923609596, "grad_norm": 658.9365234375, "learning_rate": 2.6747474747474748e-05, "loss": 212.2474, "step": 6620 }, { "epoch": 0.05357186144038009, "grad_norm": 836.15185546875, "learning_rate": 2.678787878787879e-05, "loss": 203.4595, "step": 6630 }, { "epoch": 0.05365266364466423, "grad_norm": 1312.960205078125, "learning_rate": 2.682828282828283e-05, "loss": 178.5055, "step": 6640 }, { "epoch": 0.05373346584894836, "grad_norm": 2402.58642578125, "learning_rate": 2.686868686868687e-05, "loss": 204.5425, "step": 6650 }, { "epoch": 0.05381426805323249, "grad_norm": 743.2178344726562, "learning_rate": 2.6909090909090913e-05, "loss": 151.0765, "step": 6660 }, { "epoch": 0.05389507025751662, "grad_norm": 2009.14599609375, "learning_rate": 2.6949494949494952e-05, "loss": 255.5723, "step": 6670 }, { "epoch": 0.05397587246180076, "grad_norm": 1129.924560546875, "learning_rate": 2.6989898989898994e-05, "loss": 249.5275, "step": 6680 }, { "epoch": 0.05405667466608489, "grad_norm": 1877.5682373046875, "learning_rate": 2.7030303030303033e-05, "loss": 201.4787, "step": 6690 }, { "epoch": 0.054137476870369024, "grad_norm": 1205.5860595703125, "learning_rate": 2.7070707070707075e-05, "loss": 165.1917, "step": 6700 }, { "epoch": 0.05421827907465316, "grad_norm": 833.5079956054688, "learning_rate": 2.7111111111111114e-05, "loss": 222.6354, "step": 6710 }, { "epoch": 0.05429908127893729, "grad_norm": 1644.57470703125, "learning_rate": 2.7151515151515157e-05, "loss": 183.0618, "step": 6720 }, { "epoch": 0.054379883483221425, "grad_norm": 1261.3482666015625, "learning_rate": 2.7191919191919192e-05, "loss": 204.8876, "step": 6730 }, { "epoch": 0.054460685687505554, "grad_norm": 1064.4910888671875, "learning_rate": 2.723232323232323e-05, "loss": 228.8735, "step": 6740 }, { "epoch": 0.05454148789178969, "grad_norm": 1227.28369140625, "learning_rate": 2.7272727272727273e-05, "loss": 244.5206, "step": 6750 }, { "epoch": 0.05462229009607382, "grad_norm": 747.6671142578125, "learning_rate": 2.7313131313131312e-05, "loss": 171.5991, "step": 6760 }, { "epoch": 0.054703092300357956, "grad_norm": 1191.174560546875, "learning_rate": 2.7353535353535354e-05, "loss": 183.062, "step": 6770 }, { "epoch": 0.054783894504642085, "grad_norm": 1179.271484375, "learning_rate": 2.7393939393939393e-05, "loss": 203.4914, "step": 6780 }, { "epoch": 0.05486469670892622, "grad_norm": 1980.94287109375, "learning_rate": 2.7434343434343435e-05, "loss": 190.7682, "step": 6790 }, { "epoch": 0.05494549891321035, "grad_norm": 1313.760498046875, "learning_rate": 2.7474747474747474e-05, "loss": 179.4395, "step": 6800 }, { "epoch": 0.055026301117494486, "grad_norm": 818.7135620117188, "learning_rate": 2.7515151515151516e-05, "loss": 240.7207, "step": 6810 }, { "epoch": 0.055107103321778615, "grad_norm": 1303.9735107421875, "learning_rate": 2.7555555555555555e-05, "loss": 197.5866, "step": 6820 }, { "epoch": 0.05518790552606275, "grad_norm": 4817.638671875, "learning_rate": 2.7595959595959597e-05, "loss": 236.2139, "step": 6830 }, { "epoch": 0.05526870773034688, "grad_norm": 1369.7080078125, "learning_rate": 2.7636363636363636e-05, "loss": 155.043, "step": 6840 }, { "epoch": 0.05534950993463102, "grad_norm": 1351.29150390625, "learning_rate": 2.767676767676768e-05, "loss": 193.5722, "step": 6850 }, { "epoch": 0.05543031213891515, "grad_norm": 1340.113525390625, "learning_rate": 2.771717171717172e-05, "loss": 196.4928, "step": 6860 }, { "epoch": 0.05551111434319928, "grad_norm": 1829.1298828125, "learning_rate": 2.775757575757576e-05, "loss": 207.2559, "step": 6870 }, { "epoch": 0.05559191654748342, "grad_norm": 1614.317138671875, "learning_rate": 2.7797979797979802e-05, "loss": 191.8481, "step": 6880 }, { "epoch": 0.05567271875176755, "grad_norm": 1409.754150390625, "learning_rate": 2.783838383838384e-05, "loss": 172.1698, "step": 6890 }, { "epoch": 0.05575352095605168, "grad_norm": 1008.7220458984375, "learning_rate": 2.7878787878787883e-05, "loss": 204.197, "step": 6900 }, { "epoch": 0.05583432316033581, "grad_norm": 1213.98291015625, "learning_rate": 2.7919191919191922e-05, "loss": 176.116, "step": 6910 }, { "epoch": 0.05591512536461995, "grad_norm": 1919.146484375, "learning_rate": 2.7959595959595964e-05, "loss": 205.5731, "step": 6920 }, { "epoch": 0.05599592756890408, "grad_norm": 1582.1240234375, "learning_rate": 2.8000000000000003e-05, "loss": 204.0053, "step": 6930 }, { "epoch": 0.056076729773188214, "grad_norm": 1082.2257080078125, "learning_rate": 2.804040404040404e-05, "loss": 216.7668, "step": 6940 }, { "epoch": 0.05615753197747234, "grad_norm": 1451.9715576171875, "learning_rate": 2.808080808080808e-05, "loss": 128.3858, "step": 6950 }, { "epoch": 0.05623833418175648, "grad_norm": 2580.067138671875, "learning_rate": 2.812121212121212e-05, "loss": 187.2646, "step": 6960 }, { "epoch": 0.05631913638604061, "grad_norm": 1153.5308837890625, "learning_rate": 2.8161616161616162e-05, "loss": 170.1935, "step": 6970 }, { "epoch": 0.056399938590324744, "grad_norm": 842.653076171875, "learning_rate": 2.82020202020202e-05, "loss": 229.306, "step": 6980 }, { "epoch": 0.05648074079460888, "grad_norm": 1086.96337890625, "learning_rate": 2.8242424242424243e-05, "loss": 180.9517, "step": 6990 }, { "epoch": 0.05656154299889301, "grad_norm": 963.1438598632812, "learning_rate": 2.8282828282828282e-05, "loss": 186.2078, "step": 7000 }, { "epoch": 0.056642345203177145, "grad_norm": 1010.3299560546875, "learning_rate": 2.8323232323232324e-05, "loss": 223.6001, "step": 7010 }, { "epoch": 0.056723147407461275, "grad_norm": 1217.844482421875, "learning_rate": 2.8363636363636363e-05, "loss": 179.5198, "step": 7020 }, { "epoch": 0.05680394961174541, "grad_norm": 1364.8577880859375, "learning_rate": 2.8404040404040405e-05, "loss": 212.5286, "step": 7030 }, { "epoch": 0.05688475181602954, "grad_norm": 804.541748046875, "learning_rate": 2.8444444444444447e-05, "loss": 201.4965, "step": 7040 }, { "epoch": 0.056965554020313676, "grad_norm": 2093.808349609375, "learning_rate": 2.8484848484848486e-05, "loss": 202.5039, "step": 7050 }, { "epoch": 0.057046356224597805, "grad_norm": 1088.9471435546875, "learning_rate": 2.852525252525253e-05, "loss": 164.6322, "step": 7060 }, { "epoch": 0.05712715842888194, "grad_norm": 1510.014404296875, "learning_rate": 2.8565656565656567e-05, "loss": 246.0487, "step": 7070 }, { "epoch": 0.05720796063316607, "grad_norm": 617.3926391601562, "learning_rate": 2.860606060606061e-05, "loss": 166.5255, "step": 7080 }, { "epoch": 0.057288762837450206, "grad_norm": 1088.094482421875, "learning_rate": 2.864646464646465e-05, "loss": 180.0012, "step": 7090 }, { "epoch": 0.057369565041734336, "grad_norm": 754.35400390625, "learning_rate": 2.868686868686869e-05, "loss": 165.6718, "step": 7100 }, { "epoch": 0.05745036724601847, "grad_norm": 847.3502197265625, "learning_rate": 2.872727272727273e-05, "loss": 150.3254, "step": 7110 }, { "epoch": 0.0575311694503026, "grad_norm": 3462.79541015625, "learning_rate": 2.876767676767677e-05, "loss": 206.9913, "step": 7120 }, { "epoch": 0.05761197165458674, "grad_norm": 1302.846923828125, "learning_rate": 2.880808080808081e-05, "loss": 218.2749, "step": 7130 }, { "epoch": 0.05769277385887087, "grad_norm": 1508.3194580078125, "learning_rate": 2.8848484848484853e-05, "loss": 198.5009, "step": 7140 }, { "epoch": 0.057773576063155, "grad_norm": 1260.8990478515625, "learning_rate": 2.8888888888888888e-05, "loss": 287.7319, "step": 7150 }, { "epoch": 0.05785437826743914, "grad_norm": 2510.641357421875, "learning_rate": 2.8929292929292927e-05, "loss": 212.435, "step": 7160 }, { "epoch": 0.05793518047172327, "grad_norm": 1610.3782958984375, "learning_rate": 2.896969696969697e-05, "loss": 195.9904, "step": 7170 }, { "epoch": 0.058015982676007403, "grad_norm": 2051.1611328125, "learning_rate": 2.9010101010101008e-05, "loss": 230.746, "step": 7180 }, { "epoch": 0.05809678488029153, "grad_norm": 1708.345703125, "learning_rate": 2.905050505050505e-05, "loss": 202.1169, "step": 7190 }, { "epoch": 0.05817758708457567, "grad_norm": 991.0370483398438, "learning_rate": 2.909090909090909e-05, "loss": 182.8259, "step": 7200 }, { "epoch": 0.0582583892888598, "grad_norm": 1151.1380615234375, "learning_rate": 2.913131313131313e-05, "loss": 241.0473, "step": 7210 }, { "epoch": 0.058339191493143934, "grad_norm": 1103.3897705078125, "learning_rate": 2.9171717171717174e-05, "loss": 151.7667, "step": 7220 }, { "epoch": 0.05841999369742806, "grad_norm": 1151.0849609375, "learning_rate": 2.9212121212121213e-05, "loss": 197.9749, "step": 7230 }, { "epoch": 0.0585007959017122, "grad_norm": 983.3527221679688, "learning_rate": 2.9252525252525255e-05, "loss": 186.8989, "step": 7240 }, { "epoch": 0.05858159810599633, "grad_norm": 669.5452880859375, "learning_rate": 2.9292929292929294e-05, "loss": 179.281, "step": 7250 }, { "epoch": 0.058662400310280464, "grad_norm": 1186.9957275390625, "learning_rate": 2.9333333333333336e-05, "loss": 170.5622, "step": 7260 }, { "epoch": 0.0587432025145646, "grad_norm": 1314.4376220703125, "learning_rate": 2.9373737373737375e-05, "loss": 175.4961, "step": 7270 }, { "epoch": 0.05882400471884873, "grad_norm": 1278.834716796875, "learning_rate": 2.9414141414141417e-05, "loss": 183.9097, "step": 7280 }, { "epoch": 0.058904806923132866, "grad_norm": 1116.2734375, "learning_rate": 2.9454545454545456e-05, "loss": 143.516, "step": 7290 }, { "epoch": 0.058985609127416995, "grad_norm": 1352.628173828125, "learning_rate": 2.9494949494949498e-05, "loss": 204.3025, "step": 7300 }, { "epoch": 0.05906641133170113, "grad_norm": 1091.3201904296875, "learning_rate": 2.9535353535353537e-05, "loss": 181.4761, "step": 7310 }, { "epoch": 0.05914721353598526, "grad_norm": 1040.334716796875, "learning_rate": 2.957575757575758e-05, "loss": 170.6319, "step": 7320 }, { "epoch": 0.059228015740269396, "grad_norm": 1476.125732421875, "learning_rate": 2.9616161616161618e-05, "loss": 161.3749, "step": 7330 }, { "epoch": 0.059308817944553525, "grad_norm": 1488.0325927734375, "learning_rate": 2.965656565656566e-05, "loss": 183.3941, "step": 7340 }, { "epoch": 0.05938962014883766, "grad_norm": 481.60833740234375, "learning_rate": 2.96969696969697e-05, "loss": 199.2278, "step": 7350 }, { "epoch": 0.05947042235312179, "grad_norm": 1610.34521484375, "learning_rate": 2.973737373737374e-05, "loss": 201.723, "step": 7360 }, { "epoch": 0.05955122455740593, "grad_norm": 1576.0423583984375, "learning_rate": 2.9777777777777777e-05, "loss": 222.0852, "step": 7370 }, { "epoch": 0.059632026761690056, "grad_norm": 889.7515258789062, "learning_rate": 2.9818181818181816e-05, "loss": 193.5616, "step": 7380 }, { "epoch": 0.05971282896597419, "grad_norm": 746.6514282226562, "learning_rate": 2.9858585858585858e-05, "loss": 166.2696, "step": 7390 }, { "epoch": 0.05979363117025832, "grad_norm": 1730.69580078125, "learning_rate": 2.98989898989899e-05, "loss": 209.8799, "step": 7400 }, { "epoch": 0.05987443337454246, "grad_norm": 690.6642456054688, "learning_rate": 2.993939393939394e-05, "loss": 230.9101, "step": 7410 }, { "epoch": 0.05995523557882659, "grad_norm": 863.1697387695312, "learning_rate": 2.997979797979798e-05, "loss": 150.7177, "step": 7420 }, { "epoch": 0.06003603778311072, "grad_norm": 1267.2069091796875, "learning_rate": 3.002020202020202e-05, "loss": 210.8308, "step": 7430 }, { "epoch": 0.06011683998739486, "grad_norm": 1010.417724609375, "learning_rate": 3.0060606060606062e-05, "loss": 191.3645, "step": 7440 }, { "epoch": 0.06019764219167899, "grad_norm": 689.7382202148438, "learning_rate": 3.01010101010101e-05, "loss": 187.7134, "step": 7450 }, { "epoch": 0.060278444395963124, "grad_norm": 1864.760986328125, "learning_rate": 3.0141414141414144e-05, "loss": 214.7331, "step": 7460 }, { "epoch": 0.06035924660024725, "grad_norm": 1038.37353515625, "learning_rate": 3.0181818181818182e-05, "loss": 217.9106, "step": 7470 }, { "epoch": 0.06044004880453139, "grad_norm": 622.6604614257812, "learning_rate": 3.0222222222222225e-05, "loss": 155.4263, "step": 7480 }, { "epoch": 0.06052085100881552, "grad_norm": 878.7538452148438, "learning_rate": 3.0262626262626263e-05, "loss": 231.1667, "step": 7490 }, { "epoch": 0.060601653213099654, "grad_norm": 1581.2225341796875, "learning_rate": 3.0303030303030306e-05, "loss": 163.4888, "step": 7500 }, { "epoch": 0.060682455417383784, "grad_norm": 1152.7149658203125, "learning_rate": 3.0343434343434345e-05, "loss": 182.3645, "step": 7510 }, { "epoch": 0.06076325762166792, "grad_norm": 1109.6708984375, "learning_rate": 3.0383838383838387e-05, "loss": 175.0838, "step": 7520 }, { "epoch": 0.06084405982595205, "grad_norm": 1053.8270263671875, "learning_rate": 3.0424242424242426e-05, "loss": 181.691, "step": 7530 }, { "epoch": 0.060924862030236185, "grad_norm": 2113.046875, "learning_rate": 3.0464646464646468e-05, "loss": 224.7368, "step": 7540 }, { "epoch": 0.06100566423452032, "grad_norm": 1166.90478515625, "learning_rate": 3.050505050505051e-05, "loss": 206.4759, "step": 7550 }, { "epoch": 0.06108646643880445, "grad_norm": 1273.3836669921875, "learning_rate": 3.054545454545455e-05, "loss": 171.2801, "step": 7560 }, { "epoch": 0.061167268643088586, "grad_norm": 2534.885498046875, "learning_rate": 3.058585858585859e-05, "loss": 159.7586, "step": 7570 }, { "epoch": 0.061248070847372715, "grad_norm": 3763.103515625, "learning_rate": 3.062626262626262e-05, "loss": 323.4677, "step": 7580 }, { "epoch": 0.06132887305165685, "grad_norm": 1977.9522705078125, "learning_rate": 3.066666666666667e-05, "loss": 227.4736, "step": 7590 }, { "epoch": 0.06140967525594098, "grad_norm": 1690.8280029296875, "learning_rate": 3.070707070707071e-05, "loss": 192.4362, "step": 7600 }, { "epoch": 0.06149047746022512, "grad_norm": 1523.7828369140625, "learning_rate": 3.074747474747475e-05, "loss": 234.8535, "step": 7610 }, { "epoch": 0.061571279664509246, "grad_norm": 1146.36865234375, "learning_rate": 3.0787878787878786e-05, "loss": 142.9756, "step": 7620 }, { "epoch": 0.06165208186879338, "grad_norm": 895.3403930664062, "learning_rate": 3.082828282828283e-05, "loss": 201.5379, "step": 7630 }, { "epoch": 0.06173288407307751, "grad_norm": 1039.900634765625, "learning_rate": 3.086868686868687e-05, "loss": 230.2974, "step": 7640 }, { "epoch": 0.06181368627736165, "grad_norm": 1130.9986572265625, "learning_rate": 3.090909090909091e-05, "loss": 189.1531, "step": 7650 }, { "epoch": 0.061894488481645776, "grad_norm": 1224.142822265625, "learning_rate": 3.094949494949495e-05, "loss": 204.0206, "step": 7660 }, { "epoch": 0.06197529068592991, "grad_norm": 2115.472412109375, "learning_rate": 3.098989898989899e-05, "loss": 180.536, "step": 7670 }, { "epoch": 0.06205609289021405, "grad_norm": 779.9313354492188, "learning_rate": 3.103030303030303e-05, "loss": 158.623, "step": 7680 }, { "epoch": 0.06213689509449818, "grad_norm": 1337.7568359375, "learning_rate": 3.107070707070707e-05, "loss": 159.0383, "step": 7690 }, { "epoch": 0.062217697298782314, "grad_norm": 1851.648193359375, "learning_rate": 3.111111111111111e-05, "loss": 179.8161, "step": 7700 }, { "epoch": 0.06229849950306644, "grad_norm": 1469.6453857421875, "learning_rate": 3.1151515151515156e-05, "loss": 187.596, "step": 7710 }, { "epoch": 0.06237930170735058, "grad_norm": 1624.6527099609375, "learning_rate": 3.1191919191919194e-05, "loss": 214.8479, "step": 7720 }, { "epoch": 0.06246010391163471, "grad_norm": 1006.6346435546875, "learning_rate": 3.123232323232323e-05, "loss": 154.5748, "step": 7730 }, { "epoch": 0.06254090611591884, "grad_norm": 1002.5286254882812, "learning_rate": 3.127272727272728e-05, "loss": 184.4432, "step": 7740 }, { "epoch": 0.06262170832020297, "grad_norm": 1352.4193115234375, "learning_rate": 3.131313131313132e-05, "loss": 237.0036, "step": 7750 }, { "epoch": 0.06270251052448711, "grad_norm": 1084.147216796875, "learning_rate": 3.1353535353535357e-05, "loss": 164.8318, "step": 7760 }, { "epoch": 0.06278331272877125, "grad_norm": 1302.1048583984375, "learning_rate": 3.1393939393939395e-05, "loss": 164.3788, "step": 7770 }, { "epoch": 0.06286411493305537, "grad_norm": 1383.396484375, "learning_rate": 3.143434343434344e-05, "loss": 175.5805, "step": 7780 }, { "epoch": 0.0629449171373395, "grad_norm": 1246.53857421875, "learning_rate": 3.147474747474747e-05, "loss": 210.3966, "step": 7790 }, { "epoch": 0.06302571934162364, "grad_norm": 1285.145263671875, "learning_rate": 3.151515151515151e-05, "loss": 246.1903, "step": 7800 }, { "epoch": 0.06310652154590778, "grad_norm": 1620.3326416015625, "learning_rate": 3.155555555555556e-05, "loss": 196.0127, "step": 7810 }, { "epoch": 0.06318732375019191, "grad_norm": 1016.9979858398438, "learning_rate": 3.1595959595959596e-05, "loss": 210.5301, "step": 7820 }, { "epoch": 0.06326812595447603, "grad_norm": 1945.8780517578125, "learning_rate": 3.1636363636363635e-05, "loss": 239.3266, "step": 7830 }, { "epoch": 0.06334892815876017, "grad_norm": 1864.5794677734375, "learning_rate": 3.1676767676767674e-05, "loss": 193.7567, "step": 7840 }, { "epoch": 0.0634297303630443, "grad_norm": 1095.450927734375, "learning_rate": 3.171717171717172e-05, "loss": 191.1735, "step": 7850 }, { "epoch": 0.06351053256732844, "grad_norm": 1031.504150390625, "learning_rate": 3.175757575757576e-05, "loss": 185.8655, "step": 7860 }, { "epoch": 0.06359133477161256, "grad_norm": 1385.5076904296875, "learning_rate": 3.17979797979798e-05, "loss": 177.6908, "step": 7870 }, { "epoch": 0.0636721369758967, "grad_norm": 1074.5181884765625, "learning_rate": 3.1838383838383836e-05, "loss": 204.0031, "step": 7880 }, { "epoch": 0.06375293918018084, "grad_norm": 953.3314208984375, "learning_rate": 3.187878787878788e-05, "loss": 180.9185, "step": 7890 }, { "epoch": 0.06383374138446497, "grad_norm": 868.8043823242188, "learning_rate": 3.191919191919192e-05, "loss": 220.1422, "step": 7900 }, { "epoch": 0.0639145435887491, "grad_norm": 5921.494140625, "learning_rate": 3.195959595959596e-05, "loss": 167.039, "step": 7910 }, { "epoch": 0.06399534579303323, "grad_norm": 1500.1710205078125, "learning_rate": 3.2000000000000005e-05, "loss": 138.7559, "step": 7920 }, { "epoch": 0.06407614799731737, "grad_norm": 1143.7266845703125, "learning_rate": 3.2040404040404044e-05, "loss": 195.0978, "step": 7930 }, { "epoch": 0.0641569502016015, "grad_norm": 523.0445556640625, "learning_rate": 3.208080808080808e-05, "loss": 151.1692, "step": 7940 }, { "epoch": 0.06423775240588563, "grad_norm": 2158.39013671875, "learning_rate": 3.212121212121212e-05, "loss": 236.7984, "step": 7950 }, { "epoch": 0.06431855461016976, "grad_norm": 659.3209228515625, "learning_rate": 3.216161616161617e-05, "loss": 181.1136, "step": 7960 }, { "epoch": 0.0643993568144539, "grad_norm": 608.638671875, "learning_rate": 3.2202020202020206e-05, "loss": 194.2183, "step": 7970 }, { "epoch": 0.06448015901873803, "grad_norm": 1122.7078857421875, "learning_rate": 3.2242424242424245e-05, "loss": 160.3627, "step": 7980 }, { "epoch": 0.06456096122302217, "grad_norm": 1686.80810546875, "learning_rate": 3.2282828282828284e-05, "loss": 223.456, "step": 7990 }, { "epoch": 0.06464176342730629, "grad_norm": 1573.1317138671875, "learning_rate": 3.232323232323233e-05, "loss": 224.0322, "step": 8000 }, { "epoch": 0.06472256563159043, "grad_norm": 1321.1458740234375, "learning_rate": 3.236363636363636e-05, "loss": 252.9104, "step": 8010 }, { "epoch": 0.06480336783587456, "grad_norm": 1179.701171875, "learning_rate": 3.24040404040404e-05, "loss": 223.4346, "step": 8020 }, { "epoch": 0.0648841700401587, "grad_norm": 977.9105224609375, "learning_rate": 3.2444444444444446e-05, "loss": 152.0468, "step": 8030 }, { "epoch": 0.06496497224444282, "grad_norm": 2066.90380859375, "learning_rate": 3.2484848484848485e-05, "loss": 190.8667, "step": 8040 }, { "epoch": 0.06504577444872696, "grad_norm": 3095.08935546875, "learning_rate": 3.2525252525252524e-05, "loss": 192.0303, "step": 8050 }, { "epoch": 0.0651265766530111, "grad_norm": 2343.95947265625, "learning_rate": 3.256565656565656e-05, "loss": 157.5384, "step": 8060 }, { "epoch": 0.06520737885729523, "grad_norm": 1510.8023681640625, "learning_rate": 3.260606060606061e-05, "loss": 239.6892, "step": 8070 }, { "epoch": 0.06528818106157935, "grad_norm": 1445.597900390625, "learning_rate": 3.264646464646465e-05, "loss": 177.4803, "step": 8080 }, { "epoch": 0.06536898326586349, "grad_norm": 1667.5521240234375, "learning_rate": 3.2686868686868686e-05, "loss": 190.0459, "step": 8090 }, { "epoch": 0.06544978547014763, "grad_norm": 925.2418212890625, "learning_rate": 3.272727272727273e-05, "loss": 190.8257, "step": 8100 }, { "epoch": 0.06553058767443176, "grad_norm": 1247.4376220703125, "learning_rate": 3.276767676767677e-05, "loss": 203.9773, "step": 8110 }, { "epoch": 0.0656113898787159, "grad_norm": 1212.892822265625, "learning_rate": 3.280808080808081e-05, "loss": 203.7381, "step": 8120 }, { "epoch": 0.06569219208300002, "grad_norm": 1091.890380859375, "learning_rate": 3.284848484848485e-05, "loss": 194.8187, "step": 8130 }, { "epoch": 0.06577299428728416, "grad_norm": 2029.2864990234375, "learning_rate": 3.2888888888888894e-05, "loss": 246.8937, "step": 8140 }, { "epoch": 0.06585379649156829, "grad_norm": 920.1378784179688, "learning_rate": 3.292929292929293e-05, "loss": 215.9934, "step": 8150 }, { "epoch": 0.06593459869585243, "grad_norm": 1521.0574951171875, "learning_rate": 3.296969696969697e-05, "loss": 167.3099, "step": 8160 }, { "epoch": 0.06601540090013655, "grad_norm": 1420.7525634765625, "learning_rate": 3.301010101010101e-05, "loss": 206.7512, "step": 8170 }, { "epoch": 0.06609620310442069, "grad_norm": 840.5839233398438, "learning_rate": 3.3050505050505056e-05, "loss": 202.7185, "step": 8180 }, { "epoch": 0.06617700530870482, "grad_norm": 1193.502197265625, "learning_rate": 3.3090909090909095e-05, "loss": 160.3612, "step": 8190 }, { "epoch": 0.06625780751298896, "grad_norm": 2222.778564453125, "learning_rate": 3.3131313131313134e-05, "loss": 150.92, "step": 8200 }, { "epoch": 0.06633860971727308, "grad_norm": 776.4454956054688, "learning_rate": 3.317171717171717e-05, "loss": 159.6749, "step": 8210 }, { "epoch": 0.06641941192155722, "grad_norm": 1179.86279296875, "learning_rate": 3.321212121212121e-05, "loss": 147.1537, "step": 8220 }, { "epoch": 0.06650021412584135, "grad_norm": 1168.2757568359375, "learning_rate": 3.325252525252525e-05, "loss": 163.5715, "step": 8230 }, { "epoch": 0.06658101633012549, "grad_norm": 996.3876953125, "learning_rate": 3.329292929292929e-05, "loss": 156.5557, "step": 8240 }, { "epoch": 0.06666181853440963, "grad_norm": 1006.9996337890625, "learning_rate": 3.3333333333333335e-05, "loss": 176.6802, "step": 8250 }, { "epoch": 0.06674262073869375, "grad_norm": 877.4000854492188, "learning_rate": 3.3373737373737374e-05, "loss": 182.8363, "step": 8260 }, { "epoch": 0.06682342294297788, "grad_norm": 2153.091552734375, "learning_rate": 3.341414141414141e-05, "loss": 184.7595, "step": 8270 }, { "epoch": 0.06690422514726202, "grad_norm": 1884.7989501953125, "learning_rate": 3.345454545454546e-05, "loss": 197.672, "step": 8280 }, { "epoch": 0.06698502735154616, "grad_norm": 1494.185791015625, "learning_rate": 3.34949494949495e-05, "loss": 178.7865, "step": 8290 }, { "epoch": 0.06706582955583028, "grad_norm": 2600.398193359375, "learning_rate": 3.3535353535353536e-05, "loss": 222.7885, "step": 8300 }, { "epoch": 0.06714663176011441, "grad_norm": 1300.013671875, "learning_rate": 3.3575757575757575e-05, "loss": 182.5449, "step": 8310 }, { "epoch": 0.06722743396439855, "grad_norm": 2145.218505859375, "learning_rate": 3.361616161616162e-05, "loss": 281.7514, "step": 8320 }, { "epoch": 0.06730823616868269, "grad_norm": 1519.411865234375, "learning_rate": 3.365656565656566e-05, "loss": 167.7319, "step": 8330 }, { "epoch": 0.06738903837296681, "grad_norm": 750.0274047851562, "learning_rate": 3.36969696969697e-05, "loss": 173.5281, "step": 8340 }, { "epoch": 0.06746984057725094, "grad_norm": 1222.1435546875, "learning_rate": 3.373737373737374e-05, "loss": 162.324, "step": 8350 }, { "epoch": 0.06755064278153508, "grad_norm": 955.3302001953125, "learning_rate": 3.377777777777778e-05, "loss": 184.845, "step": 8360 }, { "epoch": 0.06763144498581922, "grad_norm": 1112.2943115234375, "learning_rate": 3.381818181818182e-05, "loss": 160.8671, "step": 8370 }, { "epoch": 0.06771224719010335, "grad_norm": 1163.462646484375, "learning_rate": 3.385858585858586e-05, "loss": 147.4451, "step": 8380 }, { "epoch": 0.06779304939438748, "grad_norm": 925.3172607421875, "learning_rate": 3.38989898989899e-05, "loss": 175.2849, "step": 8390 }, { "epoch": 0.06787385159867161, "grad_norm": 1990.568359375, "learning_rate": 3.3939393939393945e-05, "loss": 199.808, "step": 8400 }, { "epoch": 0.06795465380295575, "grad_norm": 2128.471923828125, "learning_rate": 3.3979797979797984e-05, "loss": 205.3155, "step": 8410 }, { "epoch": 0.06803545600723988, "grad_norm": 2691.37353515625, "learning_rate": 3.402020202020202e-05, "loss": 216.1895, "step": 8420 }, { "epoch": 0.068116258211524, "grad_norm": 3363.869140625, "learning_rate": 3.406060606060606e-05, "loss": 201.8374, "step": 8430 }, { "epoch": 0.06819706041580814, "grad_norm": 1438.0633544921875, "learning_rate": 3.41010101010101e-05, "loss": 222.5396, "step": 8440 }, { "epoch": 0.06827786262009228, "grad_norm": 1703.8653564453125, "learning_rate": 3.414141414141414e-05, "loss": 213.8756, "step": 8450 }, { "epoch": 0.06835866482437641, "grad_norm": 1938.7177734375, "learning_rate": 3.4181818181818185e-05, "loss": 215.6468, "step": 8460 }, { "epoch": 0.06843946702866054, "grad_norm": 851.2493896484375, "learning_rate": 3.4222222222222224e-05, "loss": 182.1204, "step": 8470 }, { "epoch": 0.06852026923294467, "grad_norm": 1202.3365478515625, "learning_rate": 3.426262626262626e-05, "loss": 183.2559, "step": 8480 }, { "epoch": 0.06860107143722881, "grad_norm": 1543.7257080078125, "learning_rate": 3.43030303030303e-05, "loss": 239.1145, "step": 8490 }, { "epoch": 0.06868187364151294, "grad_norm": 748.701171875, "learning_rate": 3.434343434343435e-05, "loss": 285.6954, "step": 8500 }, { "epoch": 0.06876267584579707, "grad_norm": 5747.30224609375, "learning_rate": 3.4383838383838386e-05, "loss": 194.7742, "step": 8510 }, { "epoch": 0.0688434780500812, "grad_norm": 1587.478271484375, "learning_rate": 3.4424242424242425e-05, "loss": 213.0056, "step": 8520 }, { "epoch": 0.06892428025436534, "grad_norm": 907.9869995117188, "learning_rate": 3.4464646464646463e-05, "loss": 183.2854, "step": 8530 }, { "epoch": 0.06900508245864947, "grad_norm": 1065.7462158203125, "learning_rate": 3.450505050505051e-05, "loss": 147.3901, "step": 8540 }, { "epoch": 0.06908588466293361, "grad_norm": 1654.4375, "learning_rate": 3.454545454545455e-05, "loss": 152.4811, "step": 8550 }, { "epoch": 0.06916668686721773, "grad_norm": 1075.144775390625, "learning_rate": 3.458585858585859e-05, "loss": 217.3591, "step": 8560 }, { "epoch": 0.06924748907150187, "grad_norm": 1216.4287109375, "learning_rate": 3.4626262626262626e-05, "loss": 184.7878, "step": 8570 }, { "epoch": 0.069328291275786, "grad_norm": 1143.3253173828125, "learning_rate": 3.466666666666667e-05, "loss": 185.8244, "step": 8580 }, { "epoch": 0.06940909348007014, "grad_norm": 2943.891357421875, "learning_rate": 3.470707070707071e-05, "loss": 203.6546, "step": 8590 }, { "epoch": 0.06948989568435426, "grad_norm": 562.7566528320312, "learning_rate": 3.474747474747475e-05, "loss": 224.7354, "step": 8600 }, { "epoch": 0.0695706978886384, "grad_norm": 1995.734130859375, "learning_rate": 3.4787878787878795e-05, "loss": 178.8969, "step": 8610 }, { "epoch": 0.06965150009292254, "grad_norm": 1593.8944091796875, "learning_rate": 3.4828282828282834e-05, "loss": 164.9677, "step": 8620 }, { "epoch": 0.06973230229720667, "grad_norm": 715.6300048828125, "learning_rate": 3.486868686868687e-05, "loss": 147.6377, "step": 8630 }, { "epoch": 0.0698131045014908, "grad_norm": 3263.21044921875, "learning_rate": 3.490909090909091e-05, "loss": 235.9829, "step": 8640 }, { "epoch": 0.06989390670577493, "grad_norm": 3928.5576171875, "learning_rate": 3.494949494949495e-05, "loss": 220.889, "step": 8650 }, { "epoch": 0.06997470891005907, "grad_norm": 1176.6265869140625, "learning_rate": 3.498989898989899e-05, "loss": 173.9785, "step": 8660 }, { "epoch": 0.0700555111143432, "grad_norm": 1472.33349609375, "learning_rate": 3.503030303030303e-05, "loss": 166.185, "step": 8670 }, { "epoch": 0.07013631331862734, "grad_norm": 943.4843139648438, "learning_rate": 3.5070707070707073e-05, "loss": 196.9754, "step": 8680 }, { "epoch": 0.07021711552291146, "grad_norm": 1376.169189453125, "learning_rate": 3.511111111111111e-05, "loss": 249.9044, "step": 8690 }, { "epoch": 0.0702979177271956, "grad_norm": 862.705078125, "learning_rate": 3.515151515151515e-05, "loss": 134.6473, "step": 8700 }, { "epoch": 0.07037871993147973, "grad_norm": 1661.3258056640625, "learning_rate": 3.519191919191919e-05, "loss": 260.4335, "step": 8710 }, { "epoch": 0.07045952213576387, "grad_norm": 858.2864379882812, "learning_rate": 3.5232323232323236e-05, "loss": 156.1466, "step": 8720 }, { "epoch": 0.07054032434004799, "grad_norm": 1033.8033447265625, "learning_rate": 3.5272727272727274e-05, "loss": 158.8132, "step": 8730 }, { "epoch": 0.07062112654433213, "grad_norm": 2244.4833984375, "learning_rate": 3.531313131313131e-05, "loss": 185.2664, "step": 8740 }, { "epoch": 0.07070192874861626, "grad_norm": 828.0194091796875, "learning_rate": 3.535353535353535e-05, "loss": 189.655, "step": 8750 }, { "epoch": 0.0707827309529004, "grad_norm": 764.8339233398438, "learning_rate": 3.53939393939394e-05, "loss": 169.7833, "step": 8760 }, { "epoch": 0.07086353315718452, "grad_norm": 1434.6533203125, "learning_rate": 3.543434343434344e-05, "loss": 169.9032, "step": 8770 }, { "epoch": 0.07094433536146866, "grad_norm": 1811.5740966796875, "learning_rate": 3.5474747474747475e-05, "loss": 256.6501, "step": 8780 }, { "epoch": 0.0710251375657528, "grad_norm": 923.5958251953125, "learning_rate": 3.551515151515152e-05, "loss": 157.0084, "step": 8790 }, { "epoch": 0.07110593977003693, "grad_norm": 1671.8385009765625, "learning_rate": 3.555555555555556e-05, "loss": 206.8505, "step": 8800 }, { "epoch": 0.07118674197432107, "grad_norm": 2508.17626953125, "learning_rate": 3.55959595959596e-05, "loss": 204.866, "step": 8810 }, { "epoch": 0.07126754417860519, "grad_norm": 852.1519775390625, "learning_rate": 3.563636363636364e-05, "loss": 147.0554, "step": 8820 }, { "epoch": 0.07134834638288932, "grad_norm": 925.072021484375, "learning_rate": 3.567676767676768e-05, "loss": 205.7363, "step": 8830 }, { "epoch": 0.07142914858717346, "grad_norm": 1310.513916015625, "learning_rate": 3.571717171717172e-05, "loss": 252.9427, "step": 8840 }, { "epoch": 0.0715099507914576, "grad_norm": 1795.476806640625, "learning_rate": 3.575757575757576e-05, "loss": 184.422, "step": 8850 }, { "epoch": 0.07159075299574172, "grad_norm": 1071.3101806640625, "learning_rate": 3.57979797979798e-05, "loss": 165.8845, "step": 8860 }, { "epoch": 0.07167155520002585, "grad_norm": 724.8527221679688, "learning_rate": 3.583838383838384e-05, "loss": 180.9637, "step": 8870 }, { "epoch": 0.07175235740430999, "grad_norm": 999.9872436523438, "learning_rate": 3.587878787878788e-05, "loss": 186.5519, "step": 8880 }, { "epoch": 0.07183315960859413, "grad_norm": 1380.1075439453125, "learning_rate": 3.5919191919191916e-05, "loss": 224.0065, "step": 8890 }, { "epoch": 0.07191396181287825, "grad_norm": 1093.9498291015625, "learning_rate": 3.595959595959596e-05, "loss": 165.8288, "step": 8900 }, { "epoch": 0.07199476401716239, "grad_norm": 2711.7353515625, "learning_rate": 3.6e-05, "loss": 163.5886, "step": 8910 }, { "epoch": 0.07207556622144652, "grad_norm": 1537.253662109375, "learning_rate": 3.604040404040404e-05, "loss": 165.1541, "step": 8920 }, { "epoch": 0.07215636842573066, "grad_norm": 1100.548095703125, "learning_rate": 3.608080808080808e-05, "loss": 169.8857, "step": 8930 }, { "epoch": 0.0722371706300148, "grad_norm": 1088.0587158203125, "learning_rate": 3.6121212121212124e-05, "loss": 186.5488, "step": 8940 }, { "epoch": 0.07231797283429892, "grad_norm": 1160.9769287109375, "learning_rate": 3.616161616161616e-05, "loss": 190.4372, "step": 8950 }, { "epoch": 0.07239877503858305, "grad_norm": 1117.179443359375, "learning_rate": 3.62020202020202e-05, "loss": 134.2678, "step": 8960 }, { "epoch": 0.07247957724286719, "grad_norm": 1956.5089111328125, "learning_rate": 3.624242424242425e-05, "loss": 172.5972, "step": 8970 }, { "epoch": 0.07256037944715132, "grad_norm": 2694.300537109375, "learning_rate": 3.6282828282828286e-05, "loss": 200.7171, "step": 8980 }, { "epoch": 0.07264118165143545, "grad_norm": 1112.63720703125, "learning_rate": 3.6323232323232325e-05, "loss": 151.1802, "step": 8990 }, { "epoch": 0.07272198385571958, "grad_norm": 2176.663330078125, "learning_rate": 3.6363636363636364e-05, "loss": 162.1838, "step": 9000 }, { "epoch": 0.07280278606000372, "grad_norm": 1126.674072265625, "learning_rate": 3.640404040404041e-05, "loss": 180.7603, "step": 9010 }, { "epoch": 0.07288358826428785, "grad_norm": 1244.1241455078125, "learning_rate": 3.644444444444445e-05, "loss": 114.9891, "step": 9020 }, { "epoch": 0.07296439046857198, "grad_norm": 2782.807373046875, "learning_rate": 3.648484848484849e-05, "loss": 199.3326, "step": 9030 }, { "epoch": 0.07304519267285611, "grad_norm": 1218.721435546875, "learning_rate": 3.6525252525252526e-05, "loss": 143.516, "step": 9040 }, { "epoch": 0.07312599487714025, "grad_norm": 799.7451782226562, "learning_rate": 3.656565656565657e-05, "loss": 131.4932, "step": 9050 }, { "epoch": 0.07320679708142439, "grad_norm": 547.2342529296875, "learning_rate": 3.660606060606061e-05, "loss": 151.0718, "step": 9060 }, { "epoch": 0.07328759928570851, "grad_norm": 1480.14501953125, "learning_rate": 3.664646464646464e-05, "loss": 201.2714, "step": 9070 }, { "epoch": 0.07336840148999264, "grad_norm": 516.29931640625, "learning_rate": 3.668686868686869e-05, "loss": 174.2223, "step": 9080 }, { "epoch": 0.07344920369427678, "grad_norm": 912.0347900390625, "learning_rate": 3.672727272727273e-05, "loss": 180.7495, "step": 9090 }, { "epoch": 0.07353000589856092, "grad_norm": 1243.236083984375, "learning_rate": 3.6767676767676766e-05, "loss": 189.5693, "step": 9100 }, { "epoch": 0.07361080810284505, "grad_norm": 742.5632934570312, "learning_rate": 3.6808080808080805e-05, "loss": 194.6184, "step": 9110 }, { "epoch": 0.07369161030712917, "grad_norm": 1145.2069091796875, "learning_rate": 3.684848484848485e-05, "loss": 185.7565, "step": 9120 }, { "epoch": 0.07377241251141331, "grad_norm": 1086.181640625, "learning_rate": 3.688888888888889e-05, "loss": 167.5214, "step": 9130 }, { "epoch": 0.07385321471569745, "grad_norm": 1321.85400390625, "learning_rate": 3.692929292929293e-05, "loss": 221.075, "step": 9140 }, { "epoch": 0.07393401691998158, "grad_norm": 2168.907958984375, "learning_rate": 3.6969696969696974e-05, "loss": 219.8519, "step": 9150 }, { "epoch": 0.0740148191242657, "grad_norm": 1007.1217041015625, "learning_rate": 3.701010101010101e-05, "loss": 194.1429, "step": 9160 }, { "epoch": 0.07409562132854984, "grad_norm": 1099.997802734375, "learning_rate": 3.705050505050505e-05, "loss": 220.093, "step": 9170 }, { "epoch": 0.07417642353283398, "grad_norm": 745.4526977539062, "learning_rate": 3.709090909090909e-05, "loss": 143.6606, "step": 9180 }, { "epoch": 0.07425722573711811, "grad_norm": 2050.18212890625, "learning_rate": 3.7131313131313136e-05, "loss": 222.1462, "step": 9190 }, { "epoch": 0.07433802794140223, "grad_norm": 2366.43896484375, "learning_rate": 3.7171717171717175e-05, "loss": 191.5783, "step": 9200 }, { "epoch": 0.07441883014568637, "grad_norm": 905.9826049804688, "learning_rate": 3.7212121212121214e-05, "loss": 168.3594, "step": 9210 }, { "epoch": 0.07449963234997051, "grad_norm": 1320.197509765625, "learning_rate": 3.725252525252525e-05, "loss": 140.7596, "step": 9220 }, { "epoch": 0.07458043455425464, "grad_norm": 3290.6748046875, "learning_rate": 3.72929292929293e-05, "loss": 161.625, "step": 9230 }, { "epoch": 0.07466123675853878, "grad_norm": 1262.2984619140625, "learning_rate": 3.733333333333334e-05, "loss": 163.4713, "step": 9240 }, { "epoch": 0.0747420389628229, "grad_norm": 1434.2923583984375, "learning_rate": 3.7373737373737376e-05, "loss": 168.6003, "step": 9250 }, { "epoch": 0.07482284116710704, "grad_norm": 716.86083984375, "learning_rate": 3.7414141414141415e-05, "loss": 165.9515, "step": 9260 }, { "epoch": 0.07490364337139117, "grad_norm": 4190.2119140625, "learning_rate": 3.745454545454546e-05, "loss": 201.2678, "step": 9270 }, { "epoch": 0.07498444557567531, "grad_norm": 1093.3636474609375, "learning_rate": 3.74949494949495e-05, "loss": 145.2475, "step": 9280 }, { "epoch": 0.07506524777995943, "grad_norm": 1217.2969970703125, "learning_rate": 3.753535353535353e-05, "loss": 219.3316, "step": 9290 }, { "epoch": 0.07514604998424357, "grad_norm": 1122.0589599609375, "learning_rate": 3.757575757575758e-05, "loss": 201.7304, "step": 9300 }, { "epoch": 0.0752268521885277, "grad_norm": 779.33447265625, "learning_rate": 3.7616161616161616e-05, "loss": 199.0558, "step": 9310 }, { "epoch": 0.07530765439281184, "grad_norm": 1110.2554931640625, "learning_rate": 3.7656565656565655e-05, "loss": 190.4773, "step": 9320 }, { "epoch": 0.07538845659709596, "grad_norm": 1762.0330810546875, "learning_rate": 3.76969696969697e-05, "loss": 151.1484, "step": 9330 }, { "epoch": 0.0754692588013801, "grad_norm": 1020.6593627929688, "learning_rate": 3.773737373737374e-05, "loss": 169.2854, "step": 9340 }, { "epoch": 0.07555006100566423, "grad_norm": 1202.7464599609375, "learning_rate": 3.777777777777778e-05, "loss": 164.9156, "step": 9350 }, { "epoch": 0.07563086320994837, "grad_norm": 1249.380126953125, "learning_rate": 3.781818181818182e-05, "loss": 227.8868, "step": 9360 }, { "epoch": 0.0757116654142325, "grad_norm": 1847.7193603515625, "learning_rate": 3.785858585858586e-05, "loss": 176.2305, "step": 9370 }, { "epoch": 0.07579246761851663, "grad_norm": 1446.559814453125, "learning_rate": 3.78989898989899e-05, "loss": 147.3983, "step": 9380 }, { "epoch": 0.07587326982280077, "grad_norm": 1216.9862060546875, "learning_rate": 3.793939393939394e-05, "loss": 165.7989, "step": 9390 }, { "epoch": 0.0759540720270849, "grad_norm": 2848.1337890625, "learning_rate": 3.797979797979798e-05, "loss": 157.2326, "step": 9400 }, { "epoch": 0.07603487423136904, "grad_norm": 940.07177734375, "learning_rate": 3.8020202020202025e-05, "loss": 221.2202, "step": 9410 }, { "epoch": 0.07611567643565316, "grad_norm": 1160.6881103515625, "learning_rate": 3.8060606060606064e-05, "loss": 158.6488, "step": 9420 }, { "epoch": 0.0761964786399373, "grad_norm": 2879.822998046875, "learning_rate": 3.81010101010101e-05, "loss": 223.0843, "step": 9430 }, { "epoch": 0.07627728084422143, "grad_norm": 1498.8753662109375, "learning_rate": 3.814141414141414e-05, "loss": 161.5878, "step": 9440 }, { "epoch": 0.07635808304850557, "grad_norm": 1229.455078125, "learning_rate": 3.818181818181819e-05, "loss": 198.7026, "step": 9450 }, { "epoch": 0.07643888525278969, "grad_norm": 929.4089965820312, "learning_rate": 3.8222222222222226e-05, "loss": 251.8946, "step": 9460 }, { "epoch": 0.07651968745707383, "grad_norm": 934.8760375976562, "learning_rate": 3.8262626262626265e-05, "loss": 162.8998, "step": 9470 }, { "epoch": 0.07660048966135796, "grad_norm": 915.323974609375, "learning_rate": 3.830303030303031e-05, "loss": 171.6096, "step": 9480 }, { "epoch": 0.0766812918656421, "grad_norm": 1115.416748046875, "learning_rate": 3.834343434343435e-05, "loss": 163.5847, "step": 9490 }, { "epoch": 0.07676209406992623, "grad_norm": 2073.390869140625, "learning_rate": 3.838383838383838e-05, "loss": 261.5975, "step": 9500 }, { "epoch": 0.07684289627421036, "grad_norm": 1260.9718017578125, "learning_rate": 3.842424242424243e-05, "loss": 131.5614, "step": 9510 }, { "epoch": 0.07692369847849449, "grad_norm": 1239.3568115234375, "learning_rate": 3.8464646464646466e-05, "loss": 164.9494, "step": 9520 }, { "epoch": 0.07700450068277863, "grad_norm": 1221.8023681640625, "learning_rate": 3.8505050505050505e-05, "loss": 245.5353, "step": 9530 }, { "epoch": 0.07708530288706276, "grad_norm": 533.4956665039062, "learning_rate": 3.8545454545454544e-05, "loss": 149.1819, "step": 9540 }, { "epoch": 0.07716610509134689, "grad_norm": 1410.545166015625, "learning_rate": 3.858585858585859e-05, "loss": 172.351, "step": 9550 }, { "epoch": 0.07724690729563102, "grad_norm": 927.8252563476562, "learning_rate": 3.862626262626263e-05, "loss": 152.7654, "step": 9560 }, { "epoch": 0.07732770949991516, "grad_norm": 1244.0257568359375, "learning_rate": 3.866666666666667e-05, "loss": 171.8292, "step": 9570 }, { "epoch": 0.0774085117041993, "grad_norm": 664.2005615234375, "learning_rate": 3.8707070707070706e-05, "loss": 109.5876, "step": 9580 }, { "epoch": 0.07748931390848342, "grad_norm": 1334.42626953125, "learning_rate": 3.874747474747475e-05, "loss": 129.1926, "step": 9590 }, { "epoch": 0.07757011611276755, "grad_norm": 1236.3963623046875, "learning_rate": 3.878787878787879e-05, "loss": 195.8564, "step": 9600 }, { "epoch": 0.07765091831705169, "grad_norm": 517.2808227539062, "learning_rate": 3.882828282828283e-05, "loss": 213.7913, "step": 9610 }, { "epoch": 0.07773172052133583, "grad_norm": 1618.37890625, "learning_rate": 3.886868686868687e-05, "loss": 187.1561, "step": 9620 }, { "epoch": 0.07781252272561995, "grad_norm": 1312.72705078125, "learning_rate": 3.8909090909090914e-05, "loss": 192.0711, "step": 9630 }, { "epoch": 0.07789332492990408, "grad_norm": 970.0208129882812, "learning_rate": 3.894949494949495e-05, "loss": 150.2422, "step": 9640 }, { "epoch": 0.07797412713418822, "grad_norm": 1302.1982421875, "learning_rate": 3.898989898989899e-05, "loss": 149.0684, "step": 9650 }, { "epoch": 0.07805492933847236, "grad_norm": 3663.638427734375, "learning_rate": 3.903030303030304e-05, "loss": 197.8055, "step": 9660 }, { "epoch": 0.07813573154275649, "grad_norm": 1510.8233642578125, "learning_rate": 3.9070707070707076e-05, "loss": 217.5386, "step": 9670 }, { "epoch": 0.07821653374704061, "grad_norm": 4103.01904296875, "learning_rate": 3.9111111111111115e-05, "loss": 198.1855, "step": 9680 }, { "epoch": 0.07829733595132475, "grad_norm": 1483.061279296875, "learning_rate": 3.9151515151515153e-05, "loss": 200.777, "step": 9690 }, { "epoch": 0.07837813815560889, "grad_norm": 1666.9429931640625, "learning_rate": 3.91919191919192e-05, "loss": 233.7256, "step": 9700 }, { "epoch": 0.07845894035989302, "grad_norm": 1422.605224609375, "learning_rate": 3.923232323232323e-05, "loss": 198.8265, "step": 9710 }, { "epoch": 0.07853974256417715, "grad_norm": 955.90869140625, "learning_rate": 3.927272727272727e-05, "loss": 178.4951, "step": 9720 }, { "epoch": 0.07862054476846128, "grad_norm": 768.09228515625, "learning_rate": 3.9313131313131316e-05, "loss": 169.9865, "step": 9730 }, { "epoch": 0.07870134697274542, "grad_norm": 2241.572998046875, "learning_rate": 3.9353535353535355e-05, "loss": 171.3657, "step": 9740 }, { "epoch": 0.07878214917702955, "grad_norm": 828.8177490234375, "learning_rate": 3.939393939393939e-05, "loss": 212.0979, "step": 9750 }, { "epoch": 0.07886295138131368, "grad_norm": 1248.2691650390625, "learning_rate": 3.943434343434343e-05, "loss": 144.412, "step": 9760 }, { "epoch": 0.07894375358559781, "grad_norm": 1106.5013427734375, "learning_rate": 3.947474747474748e-05, "loss": 170.0778, "step": 9770 }, { "epoch": 0.07902455578988195, "grad_norm": 1183.6558837890625, "learning_rate": 3.951515151515152e-05, "loss": 183.0114, "step": 9780 }, { "epoch": 0.07910535799416608, "grad_norm": 790.3275146484375, "learning_rate": 3.9555555555555556e-05, "loss": 148.8441, "step": 9790 }, { "epoch": 0.07918616019845022, "grad_norm": 1040.2529296875, "learning_rate": 3.9595959595959594e-05, "loss": 181.9307, "step": 9800 }, { "epoch": 0.07926696240273434, "grad_norm": 1023.8417358398438, "learning_rate": 3.963636363636364e-05, "loss": 160.768, "step": 9810 }, { "epoch": 0.07934776460701848, "grad_norm": 1530.9327392578125, "learning_rate": 3.967676767676768e-05, "loss": 189.5177, "step": 9820 }, { "epoch": 0.07942856681130261, "grad_norm": 1020.0157470703125, "learning_rate": 3.971717171717172e-05, "loss": 175.2649, "step": 9830 }, { "epoch": 0.07950936901558675, "grad_norm": 1115.394287109375, "learning_rate": 3.975757575757576e-05, "loss": 180.9754, "step": 9840 }, { "epoch": 0.07959017121987087, "grad_norm": 596.3128051757812, "learning_rate": 3.97979797979798e-05, "loss": 190.0906, "step": 9850 }, { "epoch": 0.07967097342415501, "grad_norm": 1229.2056884765625, "learning_rate": 3.983838383838384e-05, "loss": 153.1082, "step": 9860 }, { "epoch": 0.07975177562843914, "grad_norm": 1460.8936767578125, "learning_rate": 3.987878787878788e-05, "loss": 176.2776, "step": 9870 }, { "epoch": 0.07983257783272328, "grad_norm": 4583.7373046875, "learning_rate": 3.9919191919191926e-05, "loss": 163.6398, "step": 9880 }, { "epoch": 0.0799133800370074, "grad_norm": 1115.4329833984375, "learning_rate": 3.9959595959595964e-05, "loss": 174.1616, "step": 9890 }, { "epoch": 0.07999418224129154, "grad_norm": 2389.8232421875, "learning_rate": 4e-05, "loss": 163.833, "step": 9900 }, { "epoch": 0.08007498444557568, "grad_norm": 2180.552978515625, "learning_rate": 4.004040404040404e-05, "loss": 189.1826, "step": 9910 }, { "epoch": 0.08015578664985981, "grad_norm": 2478.71533203125, "learning_rate": 4.008080808080809e-05, "loss": 262.7207, "step": 9920 }, { "epoch": 0.08023658885414395, "grad_norm": 952.7739868164062, "learning_rate": 4.012121212121212e-05, "loss": 152.5153, "step": 9930 }, { "epoch": 0.08031739105842807, "grad_norm": 1591.5555419921875, "learning_rate": 4.016161616161616e-05, "loss": 161.1086, "step": 9940 }, { "epoch": 0.0803981932627122, "grad_norm": 2502.85400390625, "learning_rate": 4.0202020202020204e-05, "loss": 180.6969, "step": 9950 }, { "epoch": 0.08047899546699634, "grad_norm": 1107.453857421875, "learning_rate": 4.024242424242424e-05, "loss": 245.7, "step": 9960 }, { "epoch": 0.08055979767128048, "grad_norm": 732.3422241210938, "learning_rate": 4.028282828282828e-05, "loss": 128.3301, "step": 9970 }, { "epoch": 0.0806405998755646, "grad_norm": 852.4077758789062, "learning_rate": 4.032323232323232e-05, "loss": 171.862, "step": 9980 }, { "epoch": 0.08072140207984874, "grad_norm": 974.1900634765625, "learning_rate": 4.0363636363636367e-05, "loss": 245.3339, "step": 9990 }, { "epoch": 0.08080220428413287, "grad_norm": 917.6868286132812, "learning_rate": 4.0404040404040405e-05, "loss": 151.893, "step": 10000 }, { "epoch": 0.08088300648841701, "grad_norm": 1009.7120971679688, "learning_rate": 4.0444444444444444e-05, "loss": 196.4933, "step": 10010 }, { "epoch": 0.08096380869270113, "grad_norm": 2075.980224609375, "learning_rate": 4.048484848484849e-05, "loss": 190.6988, "step": 10020 }, { "epoch": 0.08104461089698527, "grad_norm": 2236.189697265625, "learning_rate": 4.052525252525253e-05, "loss": 173.7965, "step": 10030 }, { "epoch": 0.0811254131012694, "grad_norm": 1387.155517578125, "learning_rate": 4.056565656565657e-05, "loss": 287.3005, "step": 10040 }, { "epoch": 0.08120621530555354, "grad_norm": 1775.0162353515625, "learning_rate": 4.0606060606060606e-05, "loss": 161.304, "step": 10050 }, { "epoch": 0.08128701750983768, "grad_norm": 2991.034423828125, "learning_rate": 4.064646464646465e-05, "loss": 183.5822, "step": 10060 }, { "epoch": 0.0813678197141218, "grad_norm": 2393.831298828125, "learning_rate": 4.068686868686869e-05, "loss": 216.6813, "step": 10070 }, { "epoch": 0.08144862191840593, "grad_norm": 753.2874755859375, "learning_rate": 4.072727272727273e-05, "loss": 176.5726, "step": 10080 }, { "epoch": 0.08152942412269007, "grad_norm": 917.6944580078125, "learning_rate": 4.076767676767677e-05, "loss": 182.8443, "step": 10090 }, { "epoch": 0.0816102263269742, "grad_norm": 1497.6976318359375, "learning_rate": 4.0808080808080814e-05, "loss": 179.7216, "step": 10100 }, { "epoch": 0.08169102853125833, "grad_norm": 896.7678833007812, "learning_rate": 4.084848484848485e-05, "loss": 171.1982, "step": 10110 }, { "epoch": 0.08177183073554246, "grad_norm": 2560.443115234375, "learning_rate": 4.088888888888889e-05, "loss": 242.9725, "step": 10120 }, { "epoch": 0.0818526329398266, "grad_norm": 914.347900390625, "learning_rate": 4.092929292929293e-05, "loss": 166.4831, "step": 10130 }, { "epoch": 0.08193343514411074, "grad_norm": 1105.142822265625, "learning_rate": 4.096969696969697e-05, "loss": 182.9737, "step": 10140 }, { "epoch": 0.08201423734839486, "grad_norm": 1321.1329345703125, "learning_rate": 4.101010101010101e-05, "loss": 152.9314, "step": 10150 }, { "epoch": 0.082095039552679, "grad_norm": 924.438720703125, "learning_rate": 4.105050505050505e-05, "loss": 187.9318, "step": 10160 }, { "epoch": 0.08217584175696313, "grad_norm": 834.27685546875, "learning_rate": 4.109090909090909e-05, "loss": 177.3875, "step": 10170 }, { "epoch": 0.08225664396124727, "grad_norm": 996.1378173828125, "learning_rate": 4.113131313131313e-05, "loss": 163.1871, "step": 10180 }, { "epoch": 0.08233744616553139, "grad_norm": 1657.6314697265625, "learning_rate": 4.117171717171717e-05, "loss": 195.5306, "step": 10190 }, { "epoch": 0.08241824836981552, "grad_norm": 1040.5526123046875, "learning_rate": 4.1212121212121216e-05, "loss": 179.7722, "step": 10200 }, { "epoch": 0.08249905057409966, "grad_norm": 1405.0408935546875, "learning_rate": 4.1252525252525255e-05, "loss": 177.9881, "step": 10210 }, { "epoch": 0.0825798527783838, "grad_norm": 1484.392333984375, "learning_rate": 4.1292929292929294e-05, "loss": 170.4384, "step": 10220 }, { "epoch": 0.08266065498266793, "grad_norm": 533.9537963867188, "learning_rate": 4.133333333333333e-05, "loss": 166.7928, "step": 10230 }, { "epoch": 0.08274145718695206, "grad_norm": 1133.5531005859375, "learning_rate": 4.137373737373738e-05, "loss": 185.8696, "step": 10240 }, { "epoch": 0.08282225939123619, "grad_norm": 1964.5546875, "learning_rate": 4.141414141414142e-05, "loss": 247.6705, "step": 10250 }, { "epoch": 0.08290306159552033, "grad_norm": 1816.8203125, "learning_rate": 4.1454545454545456e-05, "loss": 188.6205, "step": 10260 }, { "epoch": 0.08298386379980446, "grad_norm": 809.494873046875, "learning_rate": 4.1494949494949495e-05, "loss": 186.0191, "step": 10270 }, { "epoch": 0.08306466600408859, "grad_norm": 1444.1771240234375, "learning_rate": 4.153535353535354e-05, "loss": 171.1266, "step": 10280 }, { "epoch": 0.08314546820837272, "grad_norm": 1594.9212646484375, "learning_rate": 4.157575757575758e-05, "loss": 152.8558, "step": 10290 }, { "epoch": 0.08322627041265686, "grad_norm": 1367.26318359375, "learning_rate": 4.161616161616162e-05, "loss": 215.3903, "step": 10300 }, { "epoch": 0.083307072616941, "grad_norm": 1395.453857421875, "learning_rate": 4.165656565656566e-05, "loss": 219.8608, "step": 10310 }, { "epoch": 0.08338787482122512, "grad_norm": 823.7109375, "learning_rate": 4.16969696969697e-05, "loss": 198.1472, "step": 10320 }, { "epoch": 0.08346867702550925, "grad_norm": 4030.11083984375, "learning_rate": 4.173737373737374e-05, "loss": 191.6869, "step": 10330 }, { "epoch": 0.08354947922979339, "grad_norm": 1047.6395263671875, "learning_rate": 4.177777777777778e-05, "loss": 198.535, "step": 10340 }, { "epoch": 0.08363028143407752, "grad_norm": 1213.222412109375, "learning_rate": 4.181818181818182e-05, "loss": 209.4711, "step": 10350 }, { "epoch": 0.08371108363836166, "grad_norm": 862.3009643554688, "learning_rate": 4.185858585858586e-05, "loss": 197.0447, "step": 10360 }, { "epoch": 0.08379188584264578, "grad_norm": 7555.03271484375, "learning_rate": 4.18989898989899e-05, "loss": 257.4577, "step": 10370 }, { "epoch": 0.08387268804692992, "grad_norm": 1448.51806640625, "learning_rate": 4.193939393939394e-05, "loss": 145.9192, "step": 10380 }, { "epoch": 0.08395349025121406, "grad_norm": 563.4491577148438, "learning_rate": 4.197979797979798e-05, "loss": 185.379, "step": 10390 }, { "epoch": 0.08403429245549819, "grad_norm": 2999.943603515625, "learning_rate": 4.202020202020202e-05, "loss": 207.6808, "step": 10400 }, { "epoch": 0.08411509465978231, "grad_norm": 1272.3822021484375, "learning_rate": 4.206060606060606e-05, "loss": 203.2208, "step": 10410 }, { "epoch": 0.08419589686406645, "grad_norm": 1877.1287841796875, "learning_rate": 4.2101010101010105e-05, "loss": 208.5024, "step": 10420 }, { "epoch": 0.08427669906835059, "grad_norm": 880.4778442382812, "learning_rate": 4.2141414141414144e-05, "loss": 114.9268, "step": 10430 }, { "epoch": 0.08435750127263472, "grad_norm": 2282.030517578125, "learning_rate": 4.218181818181818e-05, "loss": 185.7249, "step": 10440 }, { "epoch": 0.08443830347691884, "grad_norm": 1241.139892578125, "learning_rate": 4.222222222222222e-05, "loss": 171.4185, "step": 10450 }, { "epoch": 0.08451910568120298, "grad_norm": 1106.116943359375, "learning_rate": 4.226262626262627e-05, "loss": 233.517, "step": 10460 }, { "epoch": 0.08459990788548712, "grad_norm": 1352.7723388671875, "learning_rate": 4.2303030303030306e-05, "loss": 164.0137, "step": 10470 }, { "epoch": 0.08468071008977125, "grad_norm": 824.0071411132812, "learning_rate": 4.2343434343434345e-05, "loss": 149.2571, "step": 10480 }, { "epoch": 0.08476151229405539, "grad_norm": 1494.287841796875, "learning_rate": 4.2383838383838384e-05, "loss": 158.6048, "step": 10490 }, { "epoch": 0.08484231449833951, "grad_norm": 928.8883056640625, "learning_rate": 4.242424242424243e-05, "loss": 160.3929, "step": 10500 }, { "epoch": 0.08492311670262365, "grad_norm": 3102.35791015625, "learning_rate": 4.246464646464647e-05, "loss": 262.7529, "step": 10510 }, { "epoch": 0.08500391890690778, "grad_norm": 753.5589599609375, "learning_rate": 4.250505050505051e-05, "loss": 147.7691, "step": 10520 }, { "epoch": 0.08508472111119192, "grad_norm": 1001.4116821289062, "learning_rate": 4.254545454545455e-05, "loss": 137.3231, "step": 10530 }, { "epoch": 0.08516552331547604, "grad_norm": 813.2144775390625, "learning_rate": 4.258585858585859e-05, "loss": 201.1259, "step": 10540 }, { "epoch": 0.08524632551976018, "grad_norm": 1098.597900390625, "learning_rate": 4.262626262626263e-05, "loss": 165.8937, "step": 10550 }, { "epoch": 0.08532712772404431, "grad_norm": 1298.8853759765625, "learning_rate": 4.266666666666667e-05, "loss": 198.1985, "step": 10560 }, { "epoch": 0.08540792992832845, "grad_norm": 1016.0570678710938, "learning_rate": 4.270707070707071e-05, "loss": 206.1271, "step": 10570 }, { "epoch": 0.08548873213261257, "grad_norm": 2057.573974609375, "learning_rate": 4.274747474747475e-05, "loss": 172.7861, "step": 10580 }, { "epoch": 0.08556953433689671, "grad_norm": 1261.774169921875, "learning_rate": 4.2787878787878786e-05, "loss": 149.1268, "step": 10590 }, { "epoch": 0.08565033654118084, "grad_norm": 1344.2037353515625, "learning_rate": 4.282828282828283e-05, "loss": 195.2029, "step": 10600 }, { "epoch": 0.08573113874546498, "grad_norm": 3034.935546875, "learning_rate": 4.286868686868687e-05, "loss": 222.7399, "step": 10610 }, { "epoch": 0.08581194094974912, "grad_norm": 970.0159912109375, "learning_rate": 4.290909090909091e-05, "loss": 168.5838, "step": 10620 }, { "epoch": 0.08589274315403324, "grad_norm": 953.5883178710938, "learning_rate": 4.294949494949495e-05, "loss": 162.8468, "step": 10630 }, { "epoch": 0.08597354535831737, "grad_norm": 2465.924072265625, "learning_rate": 4.2989898989898994e-05, "loss": 204.2057, "step": 10640 }, { "epoch": 0.08605434756260151, "grad_norm": 1194.1285400390625, "learning_rate": 4.303030303030303e-05, "loss": 197.6671, "step": 10650 }, { "epoch": 0.08613514976688565, "grad_norm": 881.1695556640625, "learning_rate": 4.307070707070707e-05, "loss": 163.8398, "step": 10660 }, { "epoch": 0.08621595197116977, "grad_norm": 3846.036376953125, "learning_rate": 4.311111111111111e-05, "loss": 178.4458, "step": 10670 }, { "epoch": 0.0862967541754539, "grad_norm": 1206.0906982421875, "learning_rate": 4.3151515151515156e-05, "loss": 153.2348, "step": 10680 }, { "epoch": 0.08637755637973804, "grad_norm": 898.4664306640625, "learning_rate": 4.3191919191919195e-05, "loss": 143.3541, "step": 10690 }, { "epoch": 0.08645835858402218, "grad_norm": 2398.2255859375, "learning_rate": 4.3232323232323234e-05, "loss": 212.5595, "step": 10700 }, { "epoch": 0.0865391607883063, "grad_norm": 1220.5733642578125, "learning_rate": 4.327272727272728e-05, "loss": 179.1572, "step": 10710 }, { "epoch": 0.08661996299259044, "grad_norm": 1974.5457763671875, "learning_rate": 4.331313131313132e-05, "loss": 190.0654, "step": 10720 }, { "epoch": 0.08670076519687457, "grad_norm": 2125.98583984375, "learning_rate": 4.335353535353536e-05, "loss": 168.3283, "step": 10730 }, { "epoch": 0.08678156740115871, "grad_norm": 1389.6546630859375, "learning_rate": 4.3393939393939396e-05, "loss": 159.5211, "step": 10740 }, { "epoch": 0.08686236960544283, "grad_norm": 807.5780029296875, "learning_rate": 4.343434343434344e-05, "loss": 192.4806, "step": 10750 }, { "epoch": 0.08694317180972697, "grad_norm": 1139.113037109375, "learning_rate": 4.347474747474748e-05, "loss": 155.3915, "step": 10760 }, { "epoch": 0.0870239740140111, "grad_norm": 1003.0131225585938, "learning_rate": 4.351515151515152e-05, "loss": 150.0795, "step": 10770 }, { "epoch": 0.08710477621829524, "grad_norm": 962.0811767578125, "learning_rate": 4.355555555555556e-05, "loss": 194.0887, "step": 10780 }, { "epoch": 0.08718557842257937, "grad_norm": 626.3510131835938, "learning_rate": 4.35959595959596e-05, "loss": 148.0051, "step": 10790 }, { "epoch": 0.0872663806268635, "grad_norm": 2664.0517578125, "learning_rate": 4.3636363636363636e-05, "loss": 199.106, "step": 10800 }, { "epoch": 0.08734718283114763, "grad_norm": 895.5698852539062, "learning_rate": 4.3676767676767674e-05, "loss": 172.3583, "step": 10810 }, { "epoch": 0.08742798503543177, "grad_norm": 1103.621826171875, "learning_rate": 4.371717171717172e-05, "loss": 203.0746, "step": 10820 }, { "epoch": 0.0875087872397159, "grad_norm": 1323.6517333984375, "learning_rate": 4.375757575757576e-05, "loss": 158.2676, "step": 10830 }, { "epoch": 0.08758958944400003, "grad_norm": 1170.48779296875, "learning_rate": 4.37979797979798e-05, "loss": 151.4179, "step": 10840 }, { "epoch": 0.08767039164828416, "grad_norm": 1171.2379150390625, "learning_rate": 4.383838383838384e-05, "loss": 181.4644, "step": 10850 }, { "epoch": 0.0877511938525683, "grad_norm": 1495.01025390625, "learning_rate": 4.387878787878788e-05, "loss": 167.6518, "step": 10860 }, { "epoch": 0.08783199605685243, "grad_norm": 1283.5498046875, "learning_rate": 4.391919191919192e-05, "loss": 138.4191, "step": 10870 }, { "epoch": 0.08791279826113656, "grad_norm": 1028.198974609375, "learning_rate": 4.395959595959596e-05, "loss": 167.4624, "step": 10880 }, { "epoch": 0.0879936004654207, "grad_norm": 958.3167114257812, "learning_rate": 4.4000000000000006e-05, "loss": 187.0603, "step": 10890 }, { "epoch": 0.08807440266970483, "grad_norm": 1248.95556640625, "learning_rate": 4.4040404040404044e-05, "loss": 194.5838, "step": 10900 }, { "epoch": 0.08815520487398897, "grad_norm": 1088.8775634765625, "learning_rate": 4.408080808080808e-05, "loss": 137.5682, "step": 10910 }, { "epoch": 0.0882360070782731, "grad_norm": 1130.275146484375, "learning_rate": 4.412121212121212e-05, "loss": 180.1215, "step": 10920 }, { "epoch": 0.08831680928255722, "grad_norm": 1201.9453125, "learning_rate": 4.416161616161617e-05, "loss": 157.9624, "step": 10930 }, { "epoch": 0.08839761148684136, "grad_norm": 1291.0989990234375, "learning_rate": 4.420202020202021e-05, "loss": 159.1208, "step": 10940 }, { "epoch": 0.0884784136911255, "grad_norm": 946.185546875, "learning_rate": 4.4242424242424246e-05, "loss": 200.6266, "step": 10950 }, { "epoch": 0.08855921589540963, "grad_norm": 2330.45361328125, "learning_rate": 4.4282828282828284e-05, "loss": 163.2997, "step": 10960 }, { "epoch": 0.08864001809969375, "grad_norm": 959.3818359375, "learning_rate": 4.432323232323233e-05, "loss": 200.18, "step": 10970 }, { "epoch": 0.08872082030397789, "grad_norm": 1215.0078125, "learning_rate": 4.436363636363637e-05, "loss": 194.7184, "step": 10980 }, { "epoch": 0.08880162250826203, "grad_norm": 731.89501953125, "learning_rate": 4.44040404040404e-05, "loss": 183.9504, "step": 10990 }, { "epoch": 0.08888242471254616, "grad_norm": 751.2623291015625, "learning_rate": 4.4444444444444447e-05, "loss": 164.3441, "step": 11000 }, { "epoch": 0.08896322691683028, "grad_norm": 1424.0635986328125, "learning_rate": 4.4484848484848485e-05, "loss": 202.2657, "step": 11010 }, { "epoch": 0.08904402912111442, "grad_norm": 1039.608642578125, "learning_rate": 4.4525252525252524e-05, "loss": 177.8795, "step": 11020 }, { "epoch": 0.08912483132539856, "grad_norm": 1338.993408203125, "learning_rate": 4.456565656565656e-05, "loss": 167.3348, "step": 11030 }, { "epoch": 0.08920563352968269, "grad_norm": 2591.8984375, "learning_rate": 4.460606060606061e-05, "loss": 165.2155, "step": 11040 }, { "epoch": 0.08928643573396683, "grad_norm": 931.5535888671875, "learning_rate": 4.464646464646465e-05, "loss": 198.5493, "step": 11050 }, { "epoch": 0.08936723793825095, "grad_norm": 1028.25927734375, "learning_rate": 4.4686868686868686e-05, "loss": 169.1925, "step": 11060 }, { "epoch": 0.08944804014253509, "grad_norm": 1156.0810546875, "learning_rate": 4.472727272727273e-05, "loss": 172.742, "step": 11070 }, { "epoch": 0.08952884234681922, "grad_norm": 1409.868408203125, "learning_rate": 4.476767676767677e-05, "loss": 152.6085, "step": 11080 }, { "epoch": 0.08960964455110336, "grad_norm": 1091.4266357421875, "learning_rate": 4.480808080808081e-05, "loss": 149.4833, "step": 11090 }, { "epoch": 0.08969044675538748, "grad_norm": 1274.9849853515625, "learning_rate": 4.484848484848485e-05, "loss": 239.724, "step": 11100 }, { "epoch": 0.08977124895967162, "grad_norm": 2974.341552734375, "learning_rate": 4.4888888888888894e-05, "loss": 184.0551, "step": 11110 }, { "epoch": 0.08985205116395575, "grad_norm": 893.275390625, "learning_rate": 4.492929292929293e-05, "loss": 163.8494, "step": 11120 }, { "epoch": 0.08993285336823989, "grad_norm": 1236.2047119140625, "learning_rate": 4.496969696969697e-05, "loss": 223.2573, "step": 11130 }, { "epoch": 0.09001365557252401, "grad_norm": 560.5221557617188, "learning_rate": 4.501010101010101e-05, "loss": 143.0442, "step": 11140 }, { "epoch": 0.09009445777680815, "grad_norm": 1671.4537353515625, "learning_rate": 4.5050505050505056e-05, "loss": 230.4551, "step": 11150 }, { "epoch": 0.09017525998109228, "grad_norm": 2295.0419921875, "learning_rate": 4.5090909090909095e-05, "loss": 191.5067, "step": 11160 }, { "epoch": 0.09025606218537642, "grad_norm": 1291.7230224609375, "learning_rate": 4.5131313131313134e-05, "loss": 165.677, "step": 11170 }, { "epoch": 0.09033686438966056, "grad_norm": 817.165771484375, "learning_rate": 4.517171717171717e-05, "loss": 145.0678, "step": 11180 }, { "epoch": 0.09041766659394468, "grad_norm": 938.4746704101562, "learning_rate": 4.521212121212122e-05, "loss": 168.7315, "step": 11190 }, { "epoch": 0.09049846879822881, "grad_norm": 820.7261352539062, "learning_rate": 4.525252525252526e-05, "loss": 174.4509, "step": 11200 }, { "epoch": 0.09057927100251295, "grad_norm": 899.671875, "learning_rate": 4.529292929292929e-05, "loss": 153.883, "step": 11210 }, { "epoch": 0.09066007320679709, "grad_norm": 2744.694091796875, "learning_rate": 4.5333333333333335e-05, "loss": 152.8033, "step": 11220 }, { "epoch": 0.09074087541108121, "grad_norm": 1821.427734375, "learning_rate": 4.5373737373737374e-05, "loss": 203.8027, "step": 11230 }, { "epoch": 0.09082167761536535, "grad_norm": 937.9207763671875, "learning_rate": 4.541414141414141e-05, "loss": 183.0782, "step": 11240 }, { "epoch": 0.09090247981964948, "grad_norm": 1872.185546875, "learning_rate": 4.545454545454546e-05, "loss": 153.9125, "step": 11250 }, { "epoch": 0.09098328202393362, "grad_norm": 772.832275390625, "learning_rate": 4.54949494949495e-05, "loss": 143.9607, "step": 11260 }, { "epoch": 0.09106408422821774, "grad_norm": 2842.652587890625, "learning_rate": 4.5535353535353536e-05, "loss": 189.6786, "step": 11270 }, { "epoch": 0.09114488643250188, "grad_norm": 1738.2589111328125, "learning_rate": 4.5575757575757575e-05, "loss": 163.1939, "step": 11280 }, { "epoch": 0.09122568863678601, "grad_norm": 1521.9814453125, "learning_rate": 4.561616161616162e-05, "loss": 208.5482, "step": 11290 }, { "epoch": 0.09130649084107015, "grad_norm": 1132.692138671875, "learning_rate": 4.565656565656566e-05, "loss": 161.8789, "step": 11300 }, { "epoch": 0.09138729304535427, "grad_norm": 1395.38671875, "learning_rate": 4.56969696969697e-05, "loss": 164.5156, "step": 11310 }, { "epoch": 0.0914680952496384, "grad_norm": 661.7669067382812, "learning_rate": 4.573737373737374e-05, "loss": 163.1058, "step": 11320 }, { "epoch": 0.09154889745392254, "grad_norm": 1353.42578125, "learning_rate": 4.577777777777778e-05, "loss": 178.0265, "step": 11330 }, { "epoch": 0.09162969965820668, "grad_norm": 758.9215087890625, "learning_rate": 4.581818181818182e-05, "loss": 186.8539, "step": 11340 }, { "epoch": 0.09171050186249081, "grad_norm": 926.5440673828125, "learning_rate": 4.585858585858586e-05, "loss": 174.9289, "step": 11350 }, { "epoch": 0.09179130406677494, "grad_norm": 1032.7493896484375, "learning_rate": 4.58989898989899e-05, "loss": 157.8875, "step": 11360 }, { "epoch": 0.09187210627105907, "grad_norm": 1104.685302734375, "learning_rate": 4.5939393939393945e-05, "loss": 180.8059, "step": 11370 }, { "epoch": 0.09195290847534321, "grad_norm": 807.3258056640625, "learning_rate": 4.5979797979797984e-05, "loss": 133.7691, "step": 11380 }, { "epoch": 0.09203371067962735, "grad_norm": 1296.3505859375, "learning_rate": 4.602020202020202e-05, "loss": 168.8188, "step": 11390 }, { "epoch": 0.09211451288391147, "grad_norm": 717.891357421875, "learning_rate": 4.606060606060607e-05, "loss": 143.9607, "step": 11400 }, { "epoch": 0.0921953150881956, "grad_norm": 841.2793579101562, "learning_rate": 4.610101010101011e-05, "loss": 185.6871, "step": 11410 }, { "epoch": 0.09227611729247974, "grad_norm": 1319.2064208984375, "learning_rate": 4.614141414141414e-05, "loss": 163.8516, "step": 11420 }, { "epoch": 0.09235691949676388, "grad_norm": 1566.4168701171875, "learning_rate": 4.618181818181818e-05, "loss": 165.3434, "step": 11430 }, { "epoch": 0.092437721701048, "grad_norm": 919.2448120117188, "learning_rate": 4.6222222222222224e-05, "loss": 133.6576, "step": 11440 }, { "epoch": 0.09251852390533213, "grad_norm": 976.5399780273438, "learning_rate": 4.626262626262626e-05, "loss": 172.4401, "step": 11450 }, { "epoch": 0.09259932610961627, "grad_norm": 1343.291015625, "learning_rate": 4.63030303030303e-05, "loss": 203.6782, "step": 11460 }, { "epoch": 0.0926801283139004, "grad_norm": 839.6242065429688, "learning_rate": 4.634343434343435e-05, "loss": 175.6062, "step": 11470 }, { "epoch": 0.09276093051818454, "grad_norm": 909.182861328125, "learning_rate": 4.6383838383838386e-05, "loss": 159.7895, "step": 11480 }, { "epoch": 0.09284173272246866, "grad_norm": 2260.60107421875, "learning_rate": 4.6424242424242425e-05, "loss": 168.3035, "step": 11490 }, { "epoch": 0.0929225349267528, "grad_norm": 1016.5995483398438, "learning_rate": 4.6464646464646464e-05, "loss": 149.6134, "step": 11500 }, { "epoch": 0.09300333713103694, "grad_norm": 1180.2608642578125, "learning_rate": 4.650505050505051e-05, "loss": 123.0571, "step": 11510 }, { "epoch": 0.09308413933532107, "grad_norm": 1054.5244140625, "learning_rate": 4.654545454545455e-05, "loss": 180.0858, "step": 11520 }, { "epoch": 0.0931649415396052, "grad_norm": 1064.1981201171875, "learning_rate": 4.658585858585859e-05, "loss": 305.0233, "step": 11530 }, { "epoch": 0.09324574374388933, "grad_norm": 1455.9857177734375, "learning_rate": 4.6626262626262626e-05, "loss": 174.1713, "step": 11540 }, { "epoch": 0.09332654594817347, "grad_norm": 801.254150390625, "learning_rate": 4.666666666666667e-05, "loss": 175.7632, "step": 11550 }, { "epoch": 0.0934073481524576, "grad_norm": 811.2909545898438, "learning_rate": 4.670707070707071e-05, "loss": 152.5526, "step": 11560 }, { "epoch": 0.09348815035674173, "grad_norm": 860.8350830078125, "learning_rate": 4.674747474747475e-05, "loss": 159.9338, "step": 11570 }, { "epoch": 0.09356895256102586, "grad_norm": 1820.26318359375, "learning_rate": 4.6787878787878795e-05, "loss": 205.222, "step": 11580 }, { "epoch": 0.09364975476531, "grad_norm": 1027.9521484375, "learning_rate": 4.6828282828282834e-05, "loss": 167.9507, "step": 11590 }, { "epoch": 0.09373055696959413, "grad_norm": 3551.64599609375, "learning_rate": 4.686868686868687e-05, "loss": 194.2529, "step": 11600 }, { "epoch": 0.09381135917387827, "grad_norm": 957.3357543945312, "learning_rate": 4.690909090909091e-05, "loss": 185.5651, "step": 11610 }, { "epoch": 0.09389216137816239, "grad_norm": 587.98828125, "learning_rate": 4.694949494949496e-05, "loss": 208.2654, "step": 11620 }, { "epoch": 0.09397296358244653, "grad_norm": 824.1953735351562, "learning_rate": 4.698989898989899e-05, "loss": 118.9412, "step": 11630 }, { "epoch": 0.09405376578673066, "grad_norm": 997.8128051757812, "learning_rate": 4.703030303030303e-05, "loss": 144.5133, "step": 11640 }, { "epoch": 0.0941345679910148, "grad_norm": 825.6588745117188, "learning_rate": 4.7070707070707074e-05, "loss": 130.3822, "step": 11650 }, { "epoch": 0.09421537019529892, "grad_norm": 1590.2271728515625, "learning_rate": 4.711111111111111e-05, "loss": 147.783, "step": 11660 }, { "epoch": 0.09429617239958306, "grad_norm": 831.8695068359375, "learning_rate": 4.715151515151515e-05, "loss": 150.0897, "step": 11670 }, { "epoch": 0.0943769746038672, "grad_norm": 879.4678955078125, "learning_rate": 4.719191919191919e-05, "loss": 134.1156, "step": 11680 }, { "epoch": 0.09445777680815133, "grad_norm": 1955.2484130859375, "learning_rate": 4.7232323232323236e-05, "loss": 155.8984, "step": 11690 }, { "epoch": 0.09453857901243545, "grad_norm": 1074.5509033203125, "learning_rate": 4.7272727272727275e-05, "loss": 186.7514, "step": 11700 }, { "epoch": 0.09461938121671959, "grad_norm": 992.748046875, "learning_rate": 4.7313131313131314e-05, "loss": 155.1787, "step": 11710 }, { "epoch": 0.09470018342100373, "grad_norm": 990.5753784179688, "learning_rate": 4.735353535353535e-05, "loss": 140.0309, "step": 11720 }, { "epoch": 0.09478098562528786, "grad_norm": 792.1434936523438, "learning_rate": 4.73939393939394e-05, "loss": 138.4607, "step": 11730 }, { "epoch": 0.094861787829572, "grad_norm": 1121.582763671875, "learning_rate": 4.743434343434344e-05, "loss": 121.2844, "step": 11740 }, { "epoch": 0.09494259003385612, "grad_norm": 1351.6878662109375, "learning_rate": 4.7474747474747476e-05, "loss": 198.7702, "step": 11750 }, { "epoch": 0.09502339223814026, "grad_norm": 2031.31494140625, "learning_rate": 4.751515151515152e-05, "loss": 164.1413, "step": 11760 }, { "epoch": 0.09510419444242439, "grad_norm": 761.006103515625, "learning_rate": 4.755555555555556e-05, "loss": 143.7779, "step": 11770 }, { "epoch": 0.09518499664670853, "grad_norm": 1228.6676025390625, "learning_rate": 4.75959595959596e-05, "loss": 177.4899, "step": 11780 }, { "epoch": 0.09526579885099265, "grad_norm": 817.8377075195312, "learning_rate": 4.763636363636364e-05, "loss": 158.3827, "step": 11790 }, { "epoch": 0.09534660105527679, "grad_norm": 1481.2501220703125, "learning_rate": 4.7676767676767684e-05, "loss": 144.1627, "step": 11800 }, { "epoch": 0.09542740325956092, "grad_norm": 899.5111083984375, "learning_rate": 4.771717171717172e-05, "loss": 188.1043, "step": 11810 }, { "epoch": 0.09550820546384506, "grad_norm": 766.15869140625, "learning_rate": 4.775757575757576e-05, "loss": 190.8986, "step": 11820 }, { "epoch": 0.09558900766812918, "grad_norm": 818.2703857421875, "learning_rate": 4.77979797979798e-05, "loss": 226.6272, "step": 11830 }, { "epoch": 0.09566980987241332, "grad_norm": 1306.8607177734375, "learning_rate": 4.7838383838383846e-05, "loss": 179.7575, "step": 11840 }, { "epoch": 0.09575061207669745, "grad_norm": 1371.1048583984375, "learning_rate": 4.787878787878788e-05, "loss": 184.5432, "step": 11850 }, { "epoch": 0.09583141428098159, "grad_norm": 1219.8428955078125, "learning_rate": 4.791919191919192e-05, "loss": 256.0484, "step": 11860 }, { "epoch": 0.09591221648526572, "grad_norm": 1769.408935546875, "learning_rate": 4.795959595959596e-05, "loss": 216.8512, "step": 11870 }, { "epoch": 0.09599301868954985, "grad_norm": 1021.0985107421875, "learning_rate": 4.8e-05, "loss": 150.8418, "step": 11880 }, { "epoch": 0.09607382089383398, "grad_norm": 789.3172607421875, "learning_rate": 4.804040404040404e-05, "loss": 168.713, "step": 11890 }, { "epoch": 0.09615462309811812, "grad_norm": 1145.4168701171875, "learning_rate": 4.808080808080808e-05, "loss": 248.8971, "step": 11900 }, { "epoch": 0.09623542530240226, "grad_norm": 1330.3175048828125, "learning_rate": 4.8121212121212125e-05, "loss": 143.9111, "step": 11910 }, { "epoch": 0.09631622750668638, "grad_norm": 1145.8402099609375, "learning_rate": 4.8161616161616163e-05, "loss": 196.2298, "step": 11920 }, { "epoch": 0.09639702971097051, "grad_norm": 3719.409423828125, "learning_rate": 4.82020202020202e-05, "loss": 141.981, "step": 11930 }, { "epoch": 0.09647783191525465, "grad_norm": 783.9396362304688, "learning_rate": 4.824242424242425e-05, "loss": 139.5111, "step": 11940 }, { "epoch": 0.09655863411953879, "grad_norm": 817.3587036132812, "learning_rate": 4.828282828282829e-05, "loss": 193.1281, "step": 11950 }, { "epoch": 0.09663943632382291, "grad_norm": 2150.12939453125, "learning_rate": 4.8323232323232326e-05, "loss": 173.1304, "step": 11960 }, { "epoch": 0.09672023852810704, "grad_norm": 1299.8162841796875, "learning_rate": 4.8363636363636364e-05, "loss": 181.7705, "step": 11970 }, { "epoch": 0.09680104073239118, "grad_norm": 936.7069091796875, "learning_rate": 4.840404040404041e-05, "loss": 173.4242, "step": 11980 }, { "epoch": 0.09688184293667532, "grad_norm": 613.501708984375, "learning_rate": 4.844444444444445e-05, "loss": 153.117, "step": 11990 }, { "epoch": 0.09696264514095944, "grad_norm": 1406.6314697265625, "learning_rate": 4.848484848484849e-05, "loss": 185.4404, "step": 12000 } ], "logging_steps": 10, "max_steps": 123750, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }