{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.09696264514095944,
"eval_steps": 500,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.080220428413287e-05,
"grad_norm": 120784.5234375,
"learning_rate": 4.040404040404041e-08,
"loss": 6344.0191,
"step": 10
},
{
"epoch": 0.00016160440856826573,
"grad_norm": 246101.4375,
"learning_rate": 8.080808080808082e-08,
"loss": 7230.2391,
"step": 20
},
{
"epoch": 0.00024240661285239863,
"grad_norm": 220140.828125,
"learning_rate": 1.2121212121212122e-07,
"loss": 7248.1844,
"step": 30
},
{
"epoch": 0.00032320881713653147,
"grad_norm": 468224.40625,
"learning_rate": 1.6161616161616163e-07,
"loss": 8245.5844,
"step": 40
},
{
"epoch": 0.00040401102142066436,
"grad_norm": 129730.1171875,
"learning_rate": 2.0202020202020202e-07,
"loss": 5464.3164,
"step": 50
},
{
"epoch": 0.00048481322570479725,
"grad_norm": 37984.21484375,
"learning_rate": 2.4242424242424244e-07,
"loss": 7551.0562,
"step": 60
},
{
"epoch": 0.0005656154299889301,
"grad_norm": 170187.078125,
"learning_rate": 2.8282828282828283e-07,
"loss": 6856.2141,
"step": 70
},
{
"epoch": 0.0006464176342730629,
"grad_norm": 64738.05078125,
"learning_rate": 3.2323232323232327e-07,
"loss": 7045.2477,
"step": 80
},
{
"epoch": 0.0007272198385571959,
"grad_norm": 114463.8515625,
"learning_rate": 3.6363636363636366e-07,
"loss": 5803.459,
"step": 90
},
{
"epoch": 0.0008080220428413287,
"grad_norm": 102587.6796875,
"learning_rate": 4.0404040404040405e-07,
"loss": 3976.1211,
"step": 100
},
{
"epoch": 0.0008888242471254616,
"grad_norm": 78112.53125,
"learning_rate": 4.444444444444445e-07,
"loss": 3535.432,
"step": 110
},
{
"epoch": 0.0009696264514095945,
"grad_norm": 100944.5703125,
"learning_rate": 4.848484848484849e-07,
"loss": 4244.4645,
"step": 120
},
{
"epoch": 0.0010504286556937273,
"grad_norm": 178465.359375,
"learning_rate": 5.252525252525253e-07,
"loss": 5655.2863,
"step": 130
},
{
"epoch": 0.0011312308599778602,
"grad_norm": 70585.359375,
"learning_rate": 5.656565656565657e-07,
"loss": 3637.5656,
"step": 140
},
{
"epoch": 0.001212033064261993,
"grad_norm": 51223.41015625,
"learning_rate": 6.060606060606061e-07,
"loss": 2629.8635,
"step": 150
},
{
"epoch": 0.0012928352685461259,
"grad_norm": 53755.01953125,
"learning_rate": 6.464646464646465e-07,
"loss": 4522.2559,
"step": 160
},
{
"epoch": 0.0013736374728302587,
"grad_norm": 72124.0625,
"learning_rate": 6.868686868686869e-07,
"loss": 3395.8777,
"step": 170
},
{
"epoch": 0.0014544396771143918,
"grad_norm": 11780.193359375,
"learning_rate": 7.272727272727273e-07,
"loss": 1784.9486,
"step": 180
},
{
"epoch": 0.0015352418813985246,
"grad_norm": 62803.8828125,
"learning_rate": 7.676767676767678e-07,
"loss": 2220.277,
"step": 190
},
{
"epoch": 0.0016160440856826574,
"grad_norm": 118829.34375,
"learning_rate": 8.080808080808081e-07,
"loss": 2133.5133,
"step": 200
},
{
"epoch": 0.0016968462899667903,
"grad_norm": 7391.849609375,
"learning_rate": 8.484848484848486e-07,
"loss": 1618.4662,
"step": 210
},
{
"epoch": 0.0017776484942509231,
"grad_norm": 7889.263671875,
"learning_rate": 8.88888888888889e-07,
"loss": 1578.3401,
"step": 220
},
{
"epoch": 0.001858450698535056,
"grad_norm": 10828.140625,
"learning_rate": 9.292929292929294e-07,
"loss": 1327.2748,
"step": 230
},
{
"epoch": 0.001939252902819189,
"grad_norm": 15991.2119140625,
"learning_rate": 9.696969696969698e-07,
"loss": 1342.3516,
"step": 240
},
{
"epoch": 0.0020200551071033216,
"grad_norm": 7872.46484375,
"learning_rate": 1.0101010101010103e-06,
"loss": 1077.598,
"step": 250
},
{
"epoch": 0.0021008573113874547,
"grad_norm": 3977.63623046875,
"learning_rate": 1.0505050505050506e-06,
"loss": 835.4583,
"step": 260
},
{
"epoch": 0.0021816595156715873,
"grad_norm": 3596.71923828125,
"learning_rate": 1.090909090909091e-06,
"loss": 766.0804,
"step": 270
},
{
"epoch": 0.0022624617199557204,
"grad_norm": 10194.791015625,
"learning_rate": 1.1313131313131313e-06,
"loss": 777.5695,
"step": 280
},
{
"epoch": 0.0023432639242398534,
"grad_norm": 7022.6103515625,
"learning_rate": 1.1717171717171719e-06,
"loss": 639.281,
"step": 290
},
{
"epoch": 0.002424066128523986,
"grad_norm": 2920.81787109375,
"learning_rate": 1.2121212121212122e-06,
"loss": 703.318,
"step": 300
},
{
"epoch": 0.002504868332808119,
"grad_norm": 4137.4970703125,
"learning_rate": 1.2525252525252527e-06,
"loss": 766.8718,
"step": 310
},
{
"epoch": 0.0025856705370922517,
"grad_norm": 8379.064453125,
"learning_rate": 1.292929292929293e-06,
"loss": 625.7259,
"step": 320
},
{
"epoch": 0.002666472741376385,
"grad_norm": 2632.939697265625,
"learning_rate": 1.3333333333333334e-06,
"loss": 529.0049,
"step": 330
},
{
"epoch": 0.0027472749456605174,
"grad_norm": 3072.3486328125,
"learning_rate": 1.3737373737373738e-06,
"loss": 557.5195,
"step": 340
},
{
"epoch": 0.0028280771499446505,
"grad_norm": 2430.994384765625,
"learning_rate": 1.4141414141414143e-06,
"loss": 548.776,
"step": 350
},
{
"epoch": 0.0029088793542287835,
"grad_norm": 2531.90771484375,
"learning_rate": 1.4545454545454546e-06,
"loss": 602.2291,
"step": 360
},
{
"epoch": 0.002989681558512916,
"grad_norm": 5374.16552734375,
"learning_rate": 1.4949494949494952e-06,
"loss": 447.6404,
"step": 370
},
{
"epoch": 0.003070483762797049,
"grad_norm": 1357.3726806640625,
"learning_rate": 1.5353535353535355e-06,
"loss": 342.7574,
"step": 380
},
{
"epoch": 0.003151285967081182,
"grad_norm": 1249.0936279296875,
"learning_rate": 1.5757575757575759e-06,
"loss": 562.0835,
"step": 390
},
{
"epoch": 0.003232088171365315,
"grad_norm": 1270.3609619140625,
"learning_rate": 1.6161616161616162e-06,
"loss": 467.299,
"step": 400
},
{
"epoch": 0.0033128903756494475,
"grad_norm": 2576.8291015625,
"learning_rate": 1.6565656565656565e-06,
"loss": 448.897,
"step": 410
},
{
"epoch": 0.0033936925799335806,
"grad_norm": 1376.2908935546875,
"learning_rate": 1.6969696969696973e-06,
"loss": 382.242,
"step": 420
},
{
"epoch": 0.0034744947842177136,
"grad_norm": 1212.802490234375,
"learning_rate": 1.7373737373737376e-06,
"loss": 486.5346,
"step": 430
},
{
"epoch": 0.0035552969885018462,
"grad_norm": 801.7883911132812,
"learning_rate": 1.777777777777778e-06,
"loss": 452.1537,
"step": 440
},
{
"epoch": 0.0036360991927859793,
"grad_norm": 1590.736572265625,
"learning_rate": 1.818181818181818e-06,
"loss": 449.5157,
"step": 450
},
{
"epoch": 0.003716901397070112,
"grad_norm": 3368.974853515625,
"learning_rate": 1.8585858585858588e-06,
"loss": 445.9489,
"step": 460
},
{
"epoch": 0.003797703601354245,
"grad_norm": 2164.530517578125,
"learning_rate": 1.8989898989898992e-06,
"loss": 449.8674,
"step": 470
},
{
"epoch": 0.003878505805638378,
"grad_norm": 1234.2860107421875,
"learning_rate": 1.9393939393939395e-06,
"loss": 375.5616,
"step": 480
},
{
"epoch": 0.003959308009922511,
"grad_norm": 1191.9912109375,
"learning_rate": 1.9797979797979796e-06,
"loss": 394.9626,
"step": 490
},
{
"epoch": 0.004040110214206643,
"grad_norm": 2829.901611328125,
"learning_rate": 2.0202020202020206e-06,
"loss": 398.194,
"step": 500
},
{
"epoch": 0.004120912418490777,
"grad_norm": 3573.80322265625,
"learning_rate": 2.0606060606060607e-06,
"loss": 440.5374,
"step": 510
},
{
"epoch": 0.004201714622774909,
"grad_norm": 991.9559326171875,
"learning_rate": 2.1010101010101013e-06,
"loss": 552.4083,
"step": 520
},
{
"epoch": 0.004282516827059042,
"grad_norm": 2654.274658203125,
"learning_rate": 2.1414141414141414e-06,
"loss": 414.1354,
"step": 530
},
{
"epoch": 0.004363319031343175,
"grad_norm": 1416.258056640625,
"learning_rate": 2.181818181818182e-06,
"loss": 443.3496,
"step": 540
},
{
"epoch": 0.004444121235627308,
"grad_norm": 1433.0880126953125,
"learning_rate": 2.2222222222222225e-06,
"loss": 476.5702,
"step": 550
},
{
"epoch": 0.004524923439911441,
"grad_norm": 852.6603393554688,
"learning_rate": 2.2626262626262626e-06,
"loss": 425.1744,
"step": 560
},
{
"epoch": 0.004605725644195573,
"grad_norm": 1662.8759765625,
"learning_rate": 2.303030303030303e-06,
"loss": 432.2859,
"step": 570
},
{
"epoch": 0.004686527848479707,
"grad_norm": 1177.7154541015625,
"learning_rate": 2.3434343434343437e-06,
"loss": 472.836,
"step": 580
},
{
"epoch": 0.0047673300527638395,
"grad_norm": 1203.2510986328125,
"learning_rate": 2.383838383838384e-06,
"loss": 400.3267,
"step": 590
},
{
"epoch": 0.004848132257047972,
"grad_norm": 806.4482421875,
"learning_rate": 2.4242424242424244e-06,
"loss": 381.2281,
"step": 600
},
{
"epoch": 0.004928934461332105,
"grad_norm": 1223.798095703125,
"learning_rate": 2.4646464646464645e-06,
"loss": 314.522,
"step": 610
},
{
"epoch": 0.005009736665616238,
"grad_norm": 1220.419189453125,
"learning_rate": 2.5050505050505055e-06,
"loss": 391.7079,
"step": 620
},
{
"epoch": 0.005090538869900371,
"grad_norm": 1846.539306640625,
"learning_rate": 2.5454545454545456e-06,
"loss": 415.6819,
"step": 630
},
{
"epoch": 0.0051713410741845035,
"grad_norm": 1243.7620849609375,
"learning_rate": 2.585858585858586e-06,
"loss": 437.1002,
"step": 640
},
{
"epoch": 0.005252143278468637,
"grad_norm": 1140.9453125,
"learning_rate": 2.6262626262626263e-06,
"loss": 415.1159,
"step": 650
},
{
"epoch": 0.00533294548275277,
"grad_norm": 4821.37060546875,
"learning_rate": 2.666666666666667e-06,
"loss": 464.5417,
"step": 660
},
{
"epoch": 0.005413747687036902,
"grad_norm": 1713.6087646484375,
"learning_rate": 2.7070707070707074e-06,
"loss": 375.5785,
"step": 670
},
{
"epoch": 0.005494549891321035,
"grad_norm": 2620.644287109375,
"learning_rate": 2.7474747474747475e-06,
"loss": 299.2374,
"step": 680
},
{
"epoch": 0.005575352095605168,
"grad_norm": 1759.0950927734375,
"learning_rate": 2.787878787878788e-06,
"loss": 415.4854,
"step": 690
},
{
"epoch": 0.005656154299889301,
"grad_norm": 1536.8719482421875,
"learning_rate": 2.8282828282828286e-06,
"loss": 449.5622,
"step": 700
},
{
"epoch": 0.0057369565041734336,
"grad_norm": 2095.785400390625,
"learning_rate": 2.8686868686868687e-06,
"loss": 380.9472,
"step": 710
},
{
"epoch": 0.005817758708457567,
"grad_norm": 1478.4825439453125,
"learning_rate": 2.9090909090909093e-06,
"loss": 390.7729,
"step": 720
},
{
"epoch": 0.0058985609127417,
"grad_norm": 1876.679443359375,
"learning_rate": 2.9494949494949494e-06,
"loss": 391.9601,
"step": 730
},
{
"epoch": 0.005979363117025832,
"grad_norm": 749.0634765625,
"learning_rate": 2.9898989898989904e-06,
"loss": 253.9756,
"step": 740
},
{
"epoch": 0.006060165321309965,
"grad_norm": 906.431396484375,
"learning_rate": 3.0303030303030305e-06,
"loss": 347.2193,
"step": 750
},
{
"epoch": 0.006140967525594098,
"grad_norm": 1034.1180419921875,
"learning_rate": 3.070707070707071e-06,
"loss": 311.8948,
"step": 760
},
{
"epoch": 0.006221769729878231,
"grad_norm": 2706.747802734375,
"learning_rate": 3.111111111111111e-06,
"loss": 329.9375,
"step": 770
},
{
"epoch": 0.006302571934162364,
"grad_norm": 1195.091064453125,
"learning_rate": 3.1515151515151517e-06,
"loss": 303.2004,
"step": 780
},
{
"epoch": 0.006383374138446497,
"grad_norm": 1622.099609375,
"learning_rate": 3.191919191919192e-06,
"loss": 359.3405,
"step": 790
},
{
"epoch": 0.00646417634273063,
"grad_norm": 1255.0582275390625,
"learning_rate": 3.2323232323232324e-06,
"loss": 362.8006,
"step": 800
},
{
"epoch": 0.006544978547014762,
"grad_norm": 1571.966552734375,
"learning_rate": 3.2727272727272733e-06,
"loss": 372.003,
"step": 810
},
{
"epoch": 0.006625780751298895,
"grad_norm": 1350.624755859375,
"learning_rate": 3.313131313131313e-06,
"loss": 391.2471,
"step": 820
},
{
"epoch": 0.0067065829555830285,
"grad_norm": 1288.0430908203125,
"learning_rate": 3.3535353535353536e-06,
"loss": 381.0373,
"step": 830
},
{
"epoch": 0.006787385159867161,
"grad_norm": 1756.347900390625,
"learning_rate": 3.3939393939393946e-06,
"loss": 366.8195,
"step": 840
},
{
"epoch": 0.006868187364151294,
"grad_norm": 1164.05224609375,
"learning_rate": 3.4343434343434343e-06,
"loss": 340.2402,
"step": 850
},
{
"epoch": 0.006948989568435427,
"grad_norm": 6004.3291015625,
"learning_rate": 3.4747474747474752e-06,
"loss": 397.7737,
"step": 860
},
{
"epoch": 0.00702979177271956,
"grad_norm": 1949.6370849609375,
"learning_rate": 3.515151515151515e-06,
"loss": 352.9498,
"step": 870
},
{
"epoch": 0.0071105939770036925,
"grad_norm": 823.8360595703125,
"learning_rate": 3.555555555555556e-06,
"loss": 432.1595,
"step": 880
},
{
"epoch": 0.007191396181287825,
"grad_norm": 3512.72607421875,
"learning_rate": 3.5959595959595965e-06,
"loss": 342.5901,
"step": 890
},
{
"epoch": 0.007272198385571959,
"grad_norm": 1352.2506103515625,
"learning_rate": 3.636363636363636e-06,
"loss": 306.887,
"step": 900
},
{
"epoch": 0.007353000589856091,
"grad_norm": 2983.867919921875,
"learning_rate": 3.676767676767677e-06,
"loss": 381.7238,
"step": 910
},
{
"epoch": 0.007433802794140224,
"grad_norm": 2423.1806640625,
"learning_rate": 3.7171717171717177e-06,
"loss": 303.3808,
"step": 920
},
{
"epoch": 0.007514604998424357,
"grad_norm": 953.1580810546875,
"learning_rate": 3.757575757575758e-06,
"loss": 380.6255,
"step": 930
},
{
"epoch": 0.00759540720270849,
"grad_norm": 5818.9609375,
"learning_rate": 3.7979797979797984e-06,
"loss": 372.4149,
"step": 940
},
{
"epoch": 0.007676209406992623,
"grad_norm": 1317.5467529296875,
"learning_rate": 3.8383838383838385e-06,
"loss": 369.7815,
"step": 950
},
{
"epoch": 0.007757011611276756,
"grad_norm": 929.1298217773438,
"learning_rate": 3.878787878787879e-06,
"loss": 395.9197,
"step": 960
},
{
"epoch": 0.007837813815560889,
"grad_norm": 1342.6861572265625,
"learning_rate": 3.9191919191919196e-06,
"loss": 384.7229,
"step": 970
},
{
"epoch": 0.007918616019845021,
"grad_norm": 1253.3720703125,
"learning_rate": 3.959595959595959e-06,
"loss": 302.0778,
"step": 980
},
{
"epoch": 0.007999418224129154,
"grad_norm": 966.1517333984375,
"learning_rate": 4.000000000000001e-06,
"loss": 407.2742,
"step": 990
},
{
"epoch": 0.008080220428413287,
"grad_norm": 949.992919921875,
"learning_rate": 4.040404040404041e-06,
"loss": 333.9852,
"step": 1000
},
{
"epoch": 0.00816102263269742,
"grad_norm": 2160.320068359375,
"learning_rate": 4.080808080808081e-06,
"loss": 320.1485,
"step": 1010
},
{
"epoch": 0.008241824836981554,
"grad_norm": 1784.5238037109375,
"learning_rate": 4.1212121212121215e-06,
"loss": 388.0309,
"step": 1020
},
{
"epoch": 0.008322627041265686,
"grad_norm": 11426.04296875,
"learning_rate": 4.161616161616161e-06,
"loss": 342.7975,
"step": 1030
},
{
"epoch": 0.008403429245549819,
"grad_norm": 1648.6806640625,
"learning_rate": 4.2020202020202026e-06,
"loss": 377.397,
"step": 1040
},
{
"epoch": 0.008484231449833951,
"grad_norm": 929.9481811523438,
"learning_rate": 4.242424242424243e-06,
"loss": 316.3978,
"step": 1050
},
{
"epoch": 0.008565033654118084,
"grad_norm": 1129.7996826171875,
"learning_rate": 4.282828282828283e-06,
"loss": 299.9879,
"step": 1060
},
{
"epoch": 0.008645835858402217,
"grad_norm": 1400.36376953125,
"learning_rate": 4.323232323232323e-06,
"loss": 350.1171,
"step": 1070
},
{
"epoch": 0.00872663806268635,
"grad_norm": 1833.9061279296875,
"learning_rate": 4.363636363636364e-06,
"loss": 325.1123,
"step": 1080
},
{
"epoch": 0.008807440266970484,
"grad_norm": 1330.033203125,
"learning_rate": 4.4040404040404044e-06,
"loss": 325.3823,
"step": 1090
},
{
"epoch": 0.008888242471254616,
"grad_norm": 910.3088989257812,
"learning_rate": 4.444444444444445e-06,
"loss": 301.4958,
"step": 1100
},
{
"epoch": 0.008969044675538749,
"grad_norm": 1182.816162109375,
"learning_rate": 4.484848484848485e-06,
"loss": 277.6404,
"step": 1110
},
{
"epoch": 0.009049846879822881,
"grad_norm": 4707.03857421875,
"learning_rate": 4.525252525252525e-06,
"loss": 424.5438,
"step": 1120
},
{
"epoch": 0.009130649084107014,
"grad_norm": 2059.0185546875,
"learning_rate": 4.565656565656566e-06,
"loss": 324.0304,
"step": 1130
},
{
"epoch": 0.009211451288391147,
"grad_norm": 3679.1044921875,
"learning_rate": 4.606060606060606e-06,
"loss": 303.365,
"step": 1140
},
{
"epoch": 0.00929225349267528,
"grad_norm": 2518.44970703125,
"learning_rate": 4.646464646464647e-06,
"loss": 257.8196,
"step": 1150
},
{
"epoch": 0.009373055696959414,
"grad_norm": 1017.4381713867188,
"learning_rate": 4.6868686868686874e-06,
"loss": 308.7409,
"step": 1160
},
{
"epoch": 0.009453857901243546,
"grad_norm": 1016.4625244140625,
"learning_rate": 4.727272727272727e-06,
"loss": 309.7148,
"step": 1170
},
{
"epoch": 0.009534660105527679,
"grad_norm": 2000.1339111328125,
"learning_rate": 4.767676767676768e-06,
"loss": 416.5233,
"step": 1180
},
{
"epoch": 0.009615462309811812,
"grad_norm": 1349.24755859375,
"learning_rate": 4.808080808080808e-06,
"loss": 312.0022,
"step": 1190
},
{
"epoch": 0.009696264514095944,
"grad_norm": 605.5498046875,
"learning_rate": 4.848484848484849e-06,
"loss": 373.616,
"step": 1200
},
{
"epoch": 0.009777066718380077,
"grad_norm": 1045.7061767578125,
"learning_rate": 4.888888888888889e-06,
"loss": 263.9121,
"step": 1210
},
{
"epoch": 0.00985786892266421,
"grad_norm": 997.4849243164062,
"learning_rate": 4.929292929292929e-06,
"loss": 299.2633,
"step": 1220
},
{
"epoch": 0.009938671126948344,
"grad_norm": 690.6622314453125,
"learning_rate": 4.96969696969697e-06,
"loss": 216.8853,
"step": 1230
},
{
"epoch": 0.010019473331232476,
"grad_norm": 1591.2972412109375,
"learning_rate": 5.010101010101011e-06,
"loss": 339.3444,
"step": 1240
},
{
"epoch": 0.010100275535516609,
"grad_norm": 749.3450317382812,
"learning_rate": 5.050505050505051e-06,
"loss": 315.4824,
"step": 1250
},
{
"epoch": 0.010181077739800742,
"grad_norm": 923.2822265625,
"learning_rate": 5.090909090909091e-06,
"loss": 265.8641,
"step": 1260
},
{
"epoch": 0.010261879944084874,
"grad_norm": 1161.5048828125,
"learning_rate": 5.131313131313131e-06,
"loss": 255.6592,
"step": 1270
},
{
"epoch": 0.010342682148369007,
"grad_norm": 949.8252563476562,
"learning_rate": 5.171717171717172e-06,
"loss": 350.1398,
"step": 1280
},
{
"epoch": 0.01042348435265314,
"grad_norm": 1018.5875854492188,
"learning_rate": 5.212121212121213e-06,
"loss": 313.8918,
"step": 1290
},
{
"epoch": 0.010504286556937274,
"grad_norm": 3170.640869140625,
"learning_rate": 5.2525252525252526e-06,
"loss": 305.4674,
"step": 1300
},
{
"epoch": 0.010585088761221407,
"grad_norm": 1600.813720703125,
"learning_rate": 5.292929292929293e-06,
"loss": 331.4024,
"step": 1310
},
{
"epoch": 0.01066589096550554,
"grad_norm": 1063.93408203125,
"learning_rate": 5.333333333333334e-06,
"loss": 373.562,
"step": 1320
},
{
"epoch": 0.010746693169789672,
"grad_norm": 665.6146240234375,
"learning_rate": 5.373737373737374e-06,
"loss": 235.0867,
"step": 1330
},
{
"epoch": 0.010827495374073804,
"grad_norm": 1380.6151123046875,
"learning_rate": 5.414141414141415e-06,
"loss": 324.2148,
"step": 1340
},
{
"epoch": 0.010908297578357937,
"grad_norm": 3627.843994140625,
"learning_rate": 5.4545454545454545e-06,
"loss": 364.2515,
"step": 1350
},
{
"epoch": 0.01098909978264207,
"grad_norm": 850.1503295898438,
"learning_rate": 5.494949494949495e-06,
"loss": 408.481,
"step": 1360
},
{
"epoch": 0.011069901986926204,
"grad_norm": 2091.482421875,
"learning_rate": 5.5353535353535355e-06,
"loss": 267.4172,
"step": 1370
},
{
"epoch": 0.011150704191210337,
"grad_norm": 1496.604248046875,
"learning_rate": 5.575757575757576e-06,
"loss": 295.7435,
"step": 1380
},
{
"epoch": 0.01123150639549447,
"grad_norm": 2673.033203125,
"learning_rate": 5.616161616161617e-06,
"loss": 251.5225,
"step": 1390
},
{
"epoch": 0.011312308599778602,
"grad_norm": 1121.48779296875,
"learning_rate": 5.656565656565657e-06,
"loss": 318.3079,
"step": 1400
},
{
"epoch": 0.011393110804062734,
"grad_norm": 866.069091796875,
"learning_rate": 5.696969696969697e-06,
"loss": 283.861,
"step": 1410
},
{
"epoch": 0.011473913008346867,
"grad_norm": 1132.9764404296875,
"learning_rate": 5.7373737373737374e-06,
"loss": 312.9168,
"step": 1420
},
{
"epoch": 0.011554715212631,
"grad_norm": 1029.2037353515625,
"learning_rate": 5.777777777777778e-06,
"loss": 307.1583,
"step": 1430
},
{
"epoch": 0.011635517416915134,
"grad_norm": 1138.4461669921875,
"learning_rate": 5.8181818181818185e-06,
"loss": 266.8528,
"step": 1440
},
{
"epoch": 0.011716319621199267,
"grad_norm": 1278.30224609375,
"learning_rate": 5.858585858585859e-06,
"loss": 339.6735,
"step": 1450
},
{
"epoch": 0.0117971218254834,
"grad_norm": 908.4046630859375,
"learning_rate": 5.898989898989899e-06,
"loss": 331.0705,
"step": 1460
},
{
"epoch": 0.011877924029767532,
"grad_norm": 1165.220947265625,
"learning_rate": 5.93939393939394e-06,
"loss": 267.2519,
"step": 1470
},
{
"epoch": 0.011958726234051665,
"grad_norm": 3219.732177734375,
"learning_rate": 5.979797979797981e-06,
"loss": 485.5872,
"step": 1480
},
{
"epoch": 0.012039528438335797,
"grad_norm": 1050.6778564453125,
"learning_rate": 6.0202020202020204e-06,
"loss": 325.8213,
"step": 1490
},
{
"epoch": 0.01212033064261993,
"grad_norm": 1031.156005859375,
"learning_rate": 6.060606060606061e-06,
"loss": 302.1713,
"step": 1500
},
{
"epoch": 0.012201132846904064,
"grad_norm": 1457.485107421875,
"learning_rate": 6.101010101010101e-06,
"loss": 267.4604,
"step": 1510
},
{
"epoch": 0.012281935051188197,
"grad_norm": 1509.94091796875,
"learning_rate": 6.141414141414142e-06,
"loss": 368.4667,
"step": 1520
},
{
"epoch": 0.01236273725547233,
"grad_norm": 1406.15673828125,
"learning_rate": 6.181818181818183e-06,
"loss": 349.9011,
"step": 1530
},
{
"epoch": 0.012443539459756462,
"grad_norm": 1459.2613525390625,
"learning_rate": 6.222222222222222e-06,
"loss": 325.6512,
"step": 1540
},
{
"epoch": 0.012524341664040595,
"grad_norm": 3242.78271484375,
"learning_rate": 6.262626262626263e-06,
"loss": 375.0032,
"step": 1550
},
{
"epoch": 0.012605143868324727,
"grad_norm": 2443.515625,
"learning_rate": 6.303030303030303e-06,
"loss": 379.789,
"step": 1560
},
{
"epoch": 0.01268594607260886,
"grad_norm": 1770.516845703125,
"learning_rate": 6.343434343434344e-06,
"loss": 251.652,
"step": 1570
},
{
"epoch": 0.012766748276892994,
"grad_norm": 1482.888671875,
"learning_rate": 6.383838383838384e-06,
"loss": 324.2996,
"step": 1580
},
{
"epoch": 0.012847550481177127,
"grad_norm": 1663.3935546875,
"learning_rate": 6.424242424242424e-06,
"loss": 301.8472,
"step": 1590
},
{
"epoch": 0.01292835268546126,
"grad_norm": 23524.42578125,
"learning_rate": 6.464646464646465e-06,
"loss": 335.0892,
"step": 1600
},
{
"epoch": 0.013009154889745392,
"grad_norm": 862.7949829101562,
"learning_rate": 6.505050505050505e-06,
"loss": 306.2031,
"step": 1610
},
{
"epoch": 0.013089957094029525,
"grad_norm": 2391.976806640625,
"learning_rate": 6.545454545454547e-06,
"loss": 310.1344,
"step": 1620
},
{
"epoch": 0.013170759298313657,
"grad_norm": 1260.0029296875,
"learning_rate": 6.5858585858585856e-06,
"loss": 372.1161,
"step": 1630
},
{
"epoch": 0.01325156150259779,
"grad_norm": 1587.1514892578125,
"learning_rate": 6.626262626262626e-06,
"loss": 351.7325,
"step": 1640
},
{
"epoch": 0.013332363706881924,
"grad_norm": 1152.1556396484375,
"learning_rate": 6.666666666666667e-06,
"loss": 373.3706,
"step": 1650
},
{
"epoch": 0.013413165911166057,
"grad_norm": 1005.15771484375,
"learning_rate": 6.707070707070707e-06,
"loss": 317.8097,
"step": 1660
},
{
"epoch": 0.01349396811545019,
"grad_norm": 1090.3779296875,
"learning_rate": 6.747474747474749e-06,
"loss": 315.5722,
"step": 1670
},
{
"epoch": 0.013574770319734322,
"grad_norm": 1011.2723388671875,
"learning_rate": 6.787878787878789e-06,
"loss": 231.6832,
"step": 1680
},
{
"epoch": 0.013655572524018455,
"grad_norm": 798.0405883789062,
"learning_rate": 6.828282828282828e-06,
"loss": 306.8582,
"step": 1690
},
{
"epoch": 0.013736374728302587,
"grad_norm": 855.2308959960938,
"learning_rate": 6.8686868686868685e-06,
"loss": 282.2944,
"step": 1700
},
{
"epoch": 0.01381717693258672,
"grad_norm": 1325.1092529296875,
"learning_rate": 6.909090909090909e-06,
"loss": 427.1749,
"step": 1710
},
{
"epoch": 0.013897979136870854,
"grad_norm": 1027.2860107421875,
"learning_rate": 6.9494949494949505e-06,
"loss": 232.57,
"step": 1720
},
{
"epoch": 0.013978781341154987,
"grad_norm": 1047.0118408203125,
"learning_rate": 6.989898989898991e-06,
"loss": 272.4593,
"step": 1730
},
{
"epoch": 0.01405958354543912,
"grad_norm": 1453.4931640625,
"learning_rate": 7.03030303030303e-06,
"loss": 287.978,
"step": 1740
},
{
"epoch": 0.014140385749723252,
"grad_norm": 1631.84521484375,
"learning_rate": 7.0707070707070704e-06,
"loss": 222.9585,
"step": 1750
},
{
"epoch": 0.014221187954007385,
"grad_norm": 1109.9012451171875,
"learning_rate": 7.111111111111112e-06,
"loss": 263.4153,
"step": 1760
},
{
"epoch": 0.014301990158291518,
"grad_norm": 1374.5731201171875,
"learning_rate": 7.151515151515152e-06,
"loss": 224.7149,
"step": 1770
},
{
"epoch": 0.01438279236257565,
"grad_norm": 700.2552490234375,
"learning_rate": 7.191919191919193e-06,
"loss": 243.2667,
"step": 1780
},
{
"epoch": 0.014463594566859785,
"grad_norm": 689.7608032226562,
"learning_rate": 7.232323232323232e-06,
"loss": 209.3831,
"step": 1790
},
{
"epoch": 0.014544396771143917,
"grad_norm": 1072.593994140625,
"learning_rate": 7.272727272727272e-06,
"loss": 233.4359,
"step": 1800
},
{
"epoch": 0.01462519897542805,
"grad_norm": 783.2555541992188,
"learning_rate": 7.313131313131314e-06,
"loss": 259.1145,
"step": 1810
},
{
"epoch": 0.014706001179712182,
"grad_norm": 1364.049560546875,
"learning_rate": 7.353535353535354e-06,
"loss": 244.7944,
"step": 1820
},
{
"epoch": 0.014786803383996315,
"grad_norm": 1947.3690185546875,
"learning_rate": 7.393939393939395e-06,
"loss": 276.0544,
"step": 1830
},
{
"epoch": 0.014867605588280448,
"grad_norm": 1572.007568359375,
"learning_rate": 7.434343434343435e-06,
"loss": 310.638,
"step": 1840
},
{
"epoch": 0.01494840779256458,
"grad_norm": 1419.4361572265625,
"learning_rate": 7.474747474747475e-06,
"loss": 409.634,
"step": 1850
},
{
"epoch": 0.015029209996848715,
"grad_norm": 1355.0137939453125,
"learning_rate": 7.515151515151516e-06,
"loss": 253.9533,
"step": 1860
},
{
"epoch": 0.015110012201132847,
"grad_norm": 768.96923828125,
"learning_rate": 7.555555555555556e-06,
"loss": 304.4757,
"step": 1870
},
{
"epoch": 0.01519081440541698,
"grad_norm": 959.4989624023438,
"learning_rate": 7.595959595959597e-06,
"loss": 345.032,
"step": 1880
},
{
"epoch": 0.015271616609701113,
"grad_norm": 1339.84228515625,
"learning_rate": 7.636363636363638e-06,
"loss": 296.0798,
"step": 1890
},
{
"epoch": 0.015352418813985245,
"grad_norm": 1036.2491455078125,
"learning_rate": 7.676767676767677e-06,
"loss": 300.197,
"step": 1900
},
{
"epoch": 0.015433221018269378,
"grad_norm": 707.27587890625,
"learning_rate": 7.717171717171717e-06,
"loss": 277.1603,
"step": 1910
},
{
"epoch": 0.015514023222553512,
"grad_norm": 1203.453857421875,
"learning_rate": 7.757575757575758e-06,
"loss": 303.6785,
"step": 1920
},
{
"epoch": 0.015594825426837645,
"grad_norm": 1172.7025146484375,
"learning_rate": 7.797979797979799e-06,
"loss": 246.9346,
"step": 1930
},
{
"epoch": 0.015675627631121777,
"grad_norm": 1082.4605712890625,
"learning_rate": 7.838383838383839e-06,
"loss": 296.1259,
"step": 1940
},
{
"epoch": 0.01575642983540591,
"grad_norm": 1456.42529296875,
"learning_rate": 7.878787878787878e-06,
"loss": 269.7506,
"step": 1950
},
{
"epoch": 0.015837232039690043,
"grad_norm": 1794.72119140625,
"learning_rate": 7.919191919191919e-06,
"loss": 246.411,
"step": 1960
},
{
"epoch": 0.015918034243974175,
"grad_norm": 3157.114990234375,
"learning_rate": 7.959595959595959e-06,
"loss": 256.127,
"step": 1970
},
{
"epoch": 0.015998836448258308,
"grad_norm": 1361.6929931640625,
"learning_rate": 8.000000000000001e-06,
"loss": 308.1984,
"step": 1980
},
{
"epoch": 0.01607963865254244,
"grad_norm": 1006.965087890625,
"learning_rate": 8.040404040404042e-06,
"loss": 341.8693,
"step": 1990
},
{
"epoch": 0.016160440856826573,
"grad_norm": 1541.38720703125,
"learning_rate": 8.080808080808082e-06,
"loss": 274.4633,
"step": 2000
},
{
"epoch": 0.016241243061110706,
"grad_norm": 1735.104248046875,
"learning_rate": 8.121212121212121e-06,
"loss": 258.4104,
"step": 2010
},
{
"epoch": 0.01632204526539484,
"grad_norm": 2176.154052734375,
"learning_rate": 8.161616161616162e-06,
"loss": 314.5508,
"step": 2020
},
{
"epoch": 0.016402847469678974,
"grad_norm": 1052.533447265625,
"learning_rate": 8.202020202020202e-06,
"loss": 312.927,
"step": 2030
},
{
"epoch": 0.016483649673963107,
"grad_norm": 1208.69189453125,
"learning_rate": 8.242424242424243e-06,
"loss": 254.359,
"step": 2040
},
{
"epoch": 0.01656445187824724,
"grad_norm": 1978.5025634765625,
"learning_rate": 8.282828282828283e-06,
"loss": 299.8744,
"step": 2050
},
{
"epoch": 0.016645254082531372,
"grad_norm": 3605.6494140625,
"learning_rate": 8.323232323232322e-06,
"loss": 298.715,
"step": 2060
},
{
"epoch": 0.016726056286815505,
"grad_norm": 1599.973876953125,
"learning_rate": 8.363636363636365e-06,
"loss": 258.9758,
"step": 2070
},
{
"epoch": 0.016806858491099638,
"grad_norm": 1183.451904296875,
"learning_rate": 8.404040404040405e-06,
"loss": 352.8176,
"step": 2080
},
{
"epoch": 0.01688766069538377,
"grad_norm": 1582.7120361328125,
"learning_rate": 8.444444444444446e-06,
"loss": 309.3484,
"step": 2090
},
{
"epoch": 0.016968462899667903,
"grad_norm": 932.9716796875,
"learning_rate": 8.484848484848486e-06,
"loss": 255.265,
"step": 2100
},
{
"epoch": 0.017049265103952035,
"grad_norm": 922.5059814453125,
"learning_rate": 8.525252525252525e-06,
"loss": 253.6583,
"step": 2110
},
{
"epoch": 0.017130067308236168,
"grad_norm": 1196.361083984375,
"learning_rate": 8.565656565656566e-06,
"loss": 323.4844,
"step": 2120
},
{
"epoch": 0.0172108695125203,
"grad_norm": 1005.9546508789062,
"learning_rate": 8.606060606060606e-06,
"loss": 351.4896,
"step": 2130
},
{
"epoch": 0.017291671716804433,
"grad_norm": 1585.8636474609375,
"learning_rate": 8.646464646464647e-06,
"loss": 325.0701,
"step": 2140
},
{
"epoch": 0.017372473921088566,
"grad_norm": 3758.6982421875,
"learning_rate": 8.686868686868687e-06,
"loss": 203.9509,
"step": 2150
},
{
"epoch": 0.0174532761253727,
"grad_norm": 1602.480224609375,
"learning_rate": 8.727272727272728e-06,
"loss": 294.8089,
"step": 2160
},
{
"epoch": 0.017534078329656835,
"grad_norm": 1571.1812744140625,
"learning_rate": 8.767676767676768e-06,
"loss": 220.656,
"step": 2170
},
{
"epoch": 0.017614880533940967,
"grad_norm": 1073.2261962890625,
"learning_rate": 8.808080808080809e-06,
"loss": 221.6837,
"step": 2180
},
{
"epoch": 0.0176956827382251,
"grad_norm": 1125.1983642578125,
"learning_rate": 8.84848484848485e-06,
"loss": 286.0061,
"step": 2190
},
{
"epoch": 0.017776484942509233,
"grad_norm": 1008.795654296875,
"learning_rate": 8.88888888888889e-06,
"loss": 282.5281,
"step": 2200
},
{
"epoch": 0.017857287146793365,
"grad_norm": 1810.7894287109375,
"learning_rate": 8.92929292929293e-06,
"loss": 226.4816,
"step": 2210
},
{
"epoch": 0.017938089351077498,
"grad_norm": 894.2589721679688,
"learning_rate": 8.96969696969697e-06,
"loss": 251.6401,
"step": 2220
},
{
"epoch": 0.01801889155536163,
"grad_norm": 1232.9827880859375,
"learning_rate": 9.01010101010101e-06,
"loss": 248.7544,
"step": 2230
},
{
"epoch": 0.018099693759645763,
"grad_norm": 1993.9267578125,
"learning_rate": 9.05050505050505e-06,
"loss": 256.8296,
"step": 2240
},
{
"epoch": 0.018180495963929896,
"grad_norm": 967.433837890625,
"learning_rate": 9.090909090909091e-06,
"loss": 245.6321,
"step": 2250
},
{
"epoch": 0.018261298168214028,
"grad_norm": 2560.1728515625,
"learning_rate": 9.131313131313132e-06,
"loss": 234.9478,
"step": 2260
},
{
"epoch": 0.01834210037249816,
"grad_norm": 590.1747436523438,
"learning_rate": 9.171717171717172e-06,
"loss": 236.6796,
"step": 2270
},
{
"epoch": 0.018422902576782293,
"grad_norm": 1504.942626953125,
"learning_rate": 9.212121212121213e-06,
"loss": 367.3362,
"step": 2280
},
{
"epoch": 0.018503704781066426,
"grad_norm": 977.7069091796875,
"learning_rate": 9.252525252525253e-06,
"loss": 276.6067,
"step": 2290
},
{
"epoch": 0.01858450698535056,
"grad_norm": 1384.83203125,
"learning_rate": 9.292929292929294e-06,
"loss": 340.7896,
"step": 2300
},
{
"epoch": 0.018665309189634695,
"grad_norm": 1302.6343994140625,
"learning_rate": 9.333333333333334e-06,
"loss": 198.1527,
"step": 2310
},
{
"epoch": 0.018746111393918827,
"grad_norm": 836.4732666015625,
"learning_rate": 9.373737373737375e-06,
"loss": 209.2565,
"step": 2320
},
{
"epoch": 0.01882691359820296,
"grad_norm": 1400.0604248046875,
"learning_rate": 9.414141414141414e-06,
"loss": 305.2968,
"step": 2330
},
{
"epoch": 0.018907715802487093,
"grad_norm": 1454.78125,
"learning_rate": 9.454545454545454e-06,
"loss": 260.4474,
"step": 2340
},
{
"epoch": 0.018988518006771225,
"grad_norm": 4679.6923828125,
"learning_rate": 9.494949494949495e-06,
"loss": 247.9314,
"step": 2350
},
{
"epoch": 0.019069320211055358,
"grad_norm": 6640.1201171875,
"learning_rate": 9.535353535353535e-06,
"loss": 259.6065,
"step": 2360
},
{
"epoch": 0.01915012241533949,
"grad_norm": 1564.781982421875,
"learning_rate": 9.575757575757578e-06,
"loss": 245.3016,
"step": 2370
},
{
"epoch": 0.019230924619623623,
"grad_norm": 2078.74267578125,
"learning_rate": 9.616161616161616e-06,
"loss": 284.7133,
"step": 2380
},
{
"epoch": 0.019311726823907756,
"grad_norm": 1441.0360107421875,
"learning_rate": 9.656565656565657e-06,
"loss": 270.4787,
"step": 2390
},
{
"epoch": 0.01939252902819189,
"grad_norm": 744.0514526367188,
"learning_rate": 9.696969696969698e-06,
"loss": 200.2359,
"step": 2400
},
{
"epoch": 0.01947333123247602,
"grad_norm": 1168.913818359375,
"learning_rate": 9.737373737373738e-06,
"loss": 269.4149,
"step": 2410
},
{
"epoch": 0.019554133436760154,
"grad_norm": 3848.146484375,
"learning_rate": 9.777777777777779e-06,
"loss": 250.6479,
"step": 2420
},
{
"epoch": 0.019634935641044286,
"grad_norm": 798.60595703125,
"learning_rate": 9.818181818181818e-06,
"loss": 245.7341,
"step": 2430
},
{
"epoch": 0.01971573784532842,
"grad_norm": 2139.72265625,
"learning_rate": 9.858585858585858e-06,
"loss": 275.653,
"step": 2440
},
{
"epoch": 0.019796540049612555,
"grad_norm": 1223.6392822265625,
"learning_rate": 9.898989898989899e-06,
"loss": 271.2685,
"step": 2450
},
{
"epoch": 0.019877342253896688,
"grad_norm": 1007.25439453125,
"learning_rate": 9.93939393939394e-06,
"loss": 235.147,
"step": 2460
},
{
"epoch": 0.01995814445818082,
"grad_norm": 1195.012939453125,
"learning_rate": 9.979797979797981e-06,
"loss": 302.4475,
"step": 2470
},
{
"epoch": 0.020038946662464953,
"grad_norm": 1530.9473876953125,
"learning_rate": 1.0020202020202022e-05,
"loss": 259.1597,
"step": 2480
},
{
"epoch": 0.020119748866749085,
"grad_norm": 1403.546142578125,
"learning_rate": 1.006060606060606e-05,
"loss": 348.2433,
"step": 2490
},
{
"epoch": 0.020200551071033218,
"grad_norm": 1328.4873046875,
"learning_rate": 1.0101010101010101e-05,
"loss": 289.0594,
"step": 2500
},
{
"epoch": 0.02028135327531735,
"grad_norm": 1171.5048828125,
"learning_rate": 1.0141414141414142e-05,
"loss": 197.3946,
"step": 2510
},
{
"epoch": 0.020362155479601483,
"grad_norm": 1274.6544189453125,
"learning_rate": 1.0181818181818182e-05,
"loss": 295.2945,
"step": 2520
},
{
"epoch": 0.020442957683885616,
"grad_norm": 2023.71337890625,
"learning_rate": 1.0222222222222223e-05,
"loss": 271.9707,
"step": 2530
},
{
"epoch": 0.02052375988816975,
"grad_norm": 1765.538818359375,
"learning_rate": 1.0262626262626262e-05,
"loss": 246.6141,
"step": 2540
},
{
"epoch": 0.02060456209245388,
"grad_norm": 859.3914794921875,
"learning_rate": 1.0303030303030304e-05,
"loss": 276.1375,
"step": 2550
},
{
"epoch": 0.020685364296738014,
"grad_norm": 1024.8955078125,
"learning_rate": 1.0343434343434345e-05,
"loss": 239.6529,
"step": 2560
},
{
"epoch": 0.020766166501022146,
"grad_norm": 1012.91455078125,
"learning_rate": 1.0383838383838385e-05,
"loss": 199.6656,
"step": 2570
},
{
"epoch": 0.02084696870530628,
"grad_norm": 1371.8551025390625,
"learning_rate": 1.0424242424242426e-05,
"loss": 247.0821,
"step": 2580
},
{
"epoch": 0.020927770909590415,
"grad_norm": 1338.343017578125,
"learning_rate": 1.0464646464646465e-05,
"loss": 260.7634,
"step": 2590
},
{
"epoch": 0.021008573113874548,
"grad_norm": 917.8023071289062,
"learning_rate": 1.0505050505050505e-05,
"loss": 255.2869,
"step": 2600
},
{
"epoch": 0.02108937531815868,
"grad_norm": 1167.3427734375,
"learning_rate": 1.0545454545454546e-05,
"loss": 255.108,
"step": 2610
},
{
"epoch": 0.021170177522442813,
"grad_norm": 1662.1556396484375,
"learning_rate": 1.0585858585858586e-05,
"loss": 304.7596,
"step": 2620
},
{
"epoch": 0.021250979726726946,
"grad_norm": 1393.7713623046875,
"learning_rate": 1.0626262626262627e-05,
"loss": 298.9948,
"step": 2630
},
{
"epoch": 0.02133178193101108,
"grad_norm": 4169.89306640625,
"learning_rate": 1.0666666666666667e-05,
"loss": 256.329,
"step": 2640
},
{
"epoch": 0.02141258413529521,
"grad_norm": 961.8526000976562,
"learning_rate": 1.0707070707070708e-05,
"loss": 268.2124,
"step": 2650
},
{
"epoch": 0.021493386339579344,
"grad_norm": 2141.90869140625,
"learning_rate": 1.0747474747474748e-05,
"loss": 279.6773,
"step": 2660
},
{
"epoch": 0.021574188543863476,
"grad_norm": 1454.77392578125,
"learning_rate": 1.0787878787878789e-05,
"loss": 250.6374,
"step": 2670
},
{
"epoch": 0.02165499074814761,
"grad_norm": 1119.782958984375,
"learning_rate": 1.082828282828283e-05,
"loss": 299.24,
"step": 2680
},
{
"epoch": 0.02173579295243174,
"grad_norm": 1507.88916015625,
"learning_rate": 1.086868686868687e-05,
"loss": 304.945,
"step": 2690
},
{
"epoch": 0.021816595156715874,
"grad_norm": 1235.4326171875,
"learning_rate": 1.0909090909090909e-05,
"loss": 329.6972,
"step": 2700
},
{
"epoch": 0.021897397361000007,
"grad_norm": 1516.6436767578125,
"learning_rate": 1.094949494949495e-05,
"loss": 276.193,
"step": 2710
},
{
"epoch": 0.02197819956528414,
"grad_norm": 1332.3309326171875,
"learning_rate": 1.098989898989899e-05,
"loss": 309.7016,
"step": 2720
},
{
"epoch": 0.022059001769568275,
"grad_norm": 1349.0360107421875,
"learning_rate": 1.103030303030303e-05,
"loss": 423.0254,
"step": 2730
},
{
"epoch": 0.022139803973852408,
"grad_norm": 2262.348876953125,
"learning_rate": 1.1070707070707071e-05,
"loss": 260.9447,
"step": 2740
},
{
"epoch": 0.02222060617813654,
"grad_norm": 1374.3009033203125,
"learning_rate": 1.1111111111111112e-05,
"loss": 283.2908,
"step": 2750
},
{
"epoch": 0.022301408382420673,
"grad_norm": 768.2625732421875,
"learning_rate": 1.1151515151515152e-05,
"loss": 243.0171,
"step": 2760
},
{
"epoch": 0.022382210586704806,
"grad_norm": 4175.06396484375,
"learning_rate": 1.1191919191919193e-05,
"loss": 236.5765,
"step": 2770
},
{
"epoch": 0.02246301279098894,
"grad_norm": 1220.933837890625,
"learning_rate": 1.1232323232323233e-05,
"loss": 254.5181,
"step": 2780
},
{
"epoch": 0.02254381499527307,
"grad_norm": 1589.7581787109375,
"learning_rate": 1.1272727272727274e-05,
"loss": 183.8809,
"step": 2790
},
{
"epoch": 0.022624617199557204,
"grad_norm": 1031.3692626953125,
"learning_rate": 1.1313131313131314e-05,
"loss": 246.612,
"step": 2800
},
{
"epoch": 0.022705419403841336,
"grad_norm": 1396.37744140625,
"learning_rate": 1.1353535353535353e-05,
"loss": 224.2803,
"step": 2810
},
{
"epoch": 0.02278622160812547,
"grad_norm": 1879.5634765625,
"learning_rate": 1.1393939393939394e-05,
"loss": 287.104,
"step": 2820
},
{
"epoch": 0.0228670238124096,
"grad_norm": 1376.0625,
"learning_rate": 1.1434343434343434e-05,
"loss": 284.0272,
"step": 2830
},
{
"epoch": 0.022947826016693734,
"grad_norm": 682.961181640625,
"learning_rate": 1.1474747474747475e-05,
"loss": 210.3876,
"step": 2840
},
{
"epoch": 0.023028628220977867,
"grad_norm": 1677.2169189453125,
"learning_rate": 1.1515151515151517e-05,
"loss": 326.8289,
"step": 2850
},
{
"epoch": 0.023109430425262,
"grad_norm": 733.2987060546875,
"learning_rate": 1.1555555555555556e-05,
"loss": 230.43,
"step": 2860
},
{
"epoch": 0.023190232629546136,
"grad_norm": 630.212890625,
"learning_rate": 1.1595959595959597e-05,
"loss": 205.1732,
"step": 2870
},
{
"epoch": 0.023271034833830268,
"grad_norm": 1535.36572265625,
"learning_rate": 1.1636363636363637e-05,
"loss": 304.9371,
"step": 2880
},
{
"epoch": 0.0233518370381144,
"grad_norm": 1065.3255615234375,
"learning_rate": 1.1676767676767678e-05,
"loss": 219.3441,
"step": 2890
},
{
"epoch": 0.023432639242398533,
"grad_norm": 2319.004638671875,
"learning_rate": 1.1717171717171718e-05,
"loss": 274.9734,
"step": 2900
},
{
"epoch": 0.023513441446682666,
"grad_norm": 1362.914794921875,
"learning_rate": 1.1757575757575757e-05,
"loss": 214.0071,
"step": 2910
},
{
"epoch": 0.0235942436509668,
"grad_norm": 1748.42333984375,
"learning_rate": 1.1797979797979798e-05,
"loss": 247.8715,
"step": 2920
},
{
"epoch": 0.02367504585525093,
"grad_norm": 1357.4864501953125,
"learning_rate": 1.1838383838383838e-05,
"loss": 367.5896,
"step": 2930
},
{
"epoch": 0.023755848059535064,
"grad_norm": 1028.1129150390625,
"learning_rate": 1.187878787878788e-05,
"loss": 248.4116,
"step": 2940
},
{
"epoch": 0.023836650263819197,
"grad_norm": 1497.2218017578125,
"learning_rate": 1.1919191919191921e-05,
"loss": 309.2206,
"step": 2950
},
{
"epoch": 0.02391745246810333,
"grad_norm": 830.5894775390625,
"learning_rate": 1.1959595959595961e-05,
"loss": 203.645,
"step": 2960
},
{
"epoch": 0.023998254672387462,
"grad_norm": 1067.825439453125,
"learning_rate": 1.2e-05,
"loss": 226.1753,
"step": 2970
},
{
"epoch": 0.024079056876671594,
"grad_norm": 1088.949462890625,
"learning_rate": 1.2040404040404041e-05,
"loss": 235.4068,
"step": 2980
},
{
"epoch": 0.024159859080955727,
"grad_norm": 1378.7232666015625,
"learning_rate": 1.2080808080808081e-05,
"loss": 258.9663,
"step": 2990
},
{
"epoch": 0.02424066128523986,
"grad_norm": 1665.1182861328125,
"learning_rate": 1.2121212121212122e-05,
"loss": 178.7374,
"step": 3000
},
{
"epoch": 0.024321463489523996,
"grad_norm": 1254.1650390625,
"learning_rate": 1.2161616161616162e-05,
"loss": 196.0946,
"step": 3010
},
{
"epoch": 0.02440226569380813,
"grad_norm": 1680.0679931640625,
"learning_rate": 1.2202020202020201e-05,
"loss": 232.687,
"step": 3020
},
{
"epoch": 0.02448306789809226,
"grad_norm": 1063.8585205078125,
"learning_rate": 1.2242424242424242e-05,
"loss": 247.0947,
"step": 3030
},
{
"epoch": 0.024563870102376394,
"grad_norm": 3708.090087890625,
"learning_rate": 1.2282828282828284e-05,
"loss": 252.8591,
"step": 3040
},
{
"epoch": 0.024644672306660526,
"grad_norm": 1465.7147216796875,
"learning_rate": 1.2323232323232325e-05,
"loss": 262.5838,
"step": 3050
},
{
"epoch": 0.02472547451094466,
"grad_norm": 1327.937255859375,
"learning_rate": 1.2363636363636365e-05,
"loss": 363.5593,
"step": 3060
},
{
"epoch": 0.02480627671522879,
"grad_norm": 1344.0130615234375,
"learning_rate": 1.2404040404040404e-05,
"loss": 257.7762,
"step": 3070
},
{
"epoch": 0.024887078919512924,
"grad_norm": 1054.320068359375,
"learning_rate": 1.2444444444444445e-05,
"loss": 247.01,
"step": 3080
},
{
"epoch": 0.024967881123797057,
"grad_norm": 1068.103515625,
"learning_rate": 1.2484848484848485e-05,
"loss": 248.3725,
"step": 3090
},
{
"epoch": 0.02504868332808119,
"grad_norm": 1697.66259765625,
"learning_rate": 1.2525252525252526e-05,
"loss": 258.5855,
"step": 3100
},
{
"epoch": 0.025129485532365322,
"grad_norm": 2154.507080078125,
"learning_rate": 1.2565656565656566e-05,
"loss": 324.4505,
"step": 3110
},
{
"epoch": 0.025210287736649455,
"grad_norm": 1405.111083984375,
"learning_rate": 1.2606060606060607e-05,
"loss": 201.9885,
"step": 3120
},
{
"epoch": 0.025291089940933587,
"grad_norm": 1943.2344970703125,
"learning_rate": 1.2646464646464647e-05,
"loss": 279.3126,
"step": 3130
},
{
"epoch": 0.02537189214521772,
"grad_norm": 1883.3538818359375,
"learning_rate": 1.2686868686868688e-05,
"loss": 305.4725,
"step": 3140
},
{
"epoch": 0.025452694349501856,
"grad_norm": 2943.758544921875,
"learning_rate": 1.2727272727272727e-05,
"loss": 282.1084,
"step": 3150
},
{
"epoch": 0.02553349655378599,
"grad_norm": 1242.160400390625,
"learning_rate": 1.2767676767676767e-05,
"loss": 220.8162,
"step": 3160
},
{
"epoch": 0.02561429875807012,
"grad_norm": 2627.211181640625,
"learning_rate": 1.2808080808080808e-05,
"loss": 303.4214,
"step": 3170
},
{
"epoch": 0.025695100962354254,
"grad_norm": 1310.1988525390625,
"learning_rate": 1.2848484848484848e-05,
"loss": 259.6753,
"step": 3180
},
{
"epoch": 0.025775903166638386,
"grad_norm": 1910.4666748046875,
"learning_rate": 1.2888888888888889e-05,
"loss": 273.7454,
"step": 3190
},
{
"epoch": 0.02585670537092252,
"grad_norm": 740.5687255859375,
"learning_rate": 1.292929292929293e-05,
"loss": 292.6491,
"step": 3200
},
{
"epoch": 0.02593750757520665,
"grad_norm": 1202.99462890625,
"learning_rate": 1.296969696969697e-05,
"loss": 243.2686,
"step": 3210
},
{
"epoch": 0.026018309779490784,
"grad_norm": 2174.3525390625,
"learning_rate": 1.301010101010101e-05,
"loss": 276.3461,
"step": 3220
},
{
"epoch": 0.026099111983774917,
"grad_norm": 1177.9141845703125,
"learning_rate": 1.3050505050505051e-05,
"loss": 246.7616,
"step": 3230
},
{
"epoch": 0.02617991418805905,
"grad_norm": 1337.3155517578125,
"learning_rate": 1.3090909090909093e-05,
"loss": 279.5415,
"step": 3240
},
{
"epoch": 0.026260716392343182,
"grad_norm": 970.5191040039062,
"learning_rate": 1.3131313131313134e-05,
"loss": 240.0207,
"step": 3250
},
{
"epoch": 0.026341518596627315,
"grad_norm": 1119.1689453125,
"learning_rate": 1.3171717171717171e-05,
"loss": 242.7696,
"step": 3260
},
{
"epoch": 0.026422320800911447,
"grad_norm": 4005.26318359375,
"learning_rate": 1.3212121212121212e-05,
"loss": 288.4158,
"step": 3270
},
{
"epoch": 0.02650312300519558,
"grad_norm": 2148.187255859375,
"learning_rate": 1.3252525252525252e-05,
"loss": 224.5919,
"step": 3280
},
{
"epoch": 0.026583925209479716,
"grad_norm": 1367.2222900390625,
"learning_rate": 1.3292929292929293e-05,
"loss": 212.8641,
"step": 3290
},
{
"epoch": 0.02666472741376385,
"grad_norm": 1278.0506591796875,
"learning_rate": 1.3333333333333333e-05,
"loss": 222.3518,
"step": 3300
},
{
"epoch": 0.02674552961804798,
"grad_norm": 1361.965087890625,
"learning_rate": 1.3373737373737374e-05,
"loss": 257.0285,
"step": 3310
},
{
"epoch": 0.026826331822332114,
"grad_norm": 1330.8619384765625,
"learning_rate": 1.3414141414141414e-05,
"loss": 293.7213,
"step": 3320
},
{
"epoch": 0.026907134026616247,
"grad_norm": 1107.8526611328125,
"learning_rate": 1.3454545454545457e-05,
"loss": 198.6048,
"step": 3330
},
{
"epoch": 0.02698793623090038,
"grad_norm": 1048.5009765625,
"learning_rate": 1.3494949494949497e-05,
"loss": 197.4301,
"step": 3340
},
{
"epoch": 0.027068738435184512,
"grad_norm": 2446.47119140625,
"learning_rate": 1.3535353535353538e-05,
"loss": 287.2842,
"step": 3350
},
{
"epoch": 0.027149540639468644,
"grad_norm": 1396.544921875,
"learning_rate": 1.3575757575757578e-05,
"loss": 258.2231,
"step": 3360
},
{
"epoch": 0.027230342843752777,
"grad_norm": 904.5388793945312,
"learning_rate": 1.3616161616161615e-05,
"loss": 376.3808,
"step": 3370
},
{
"epoch": 0.02731114504803691,
"grad_norm": 1247.5994873046875,
"learning_rate": 1.3656565656565656e-05,
"loss": 318.9619,
"step": 3380
},
{
"epoch": 0.027391947252321042,
"grad_norm": 1123.76220703125,
"learning_rate": 1.3696969696969697e-05,
"loss": 235.3794,
"step": 3390
},
{
"epoch": 0.027472749456605175,
"grad_norm": 2435.860107421875,
"learning_rate": 1.3737373737373737e-05,
"loss": 290.7648,
"step": 3400
},
{
"epoch": 0.027553551660889308,
"grad_norm": 1895.08642578125,
"learning_rate": 1.3777777777777778e-05,
"loss": 228.9499,
"step": 3410
},
{
"epoch": 0.02763435386517344,
"grad_norm": 1057.9578857421875,
"learning_rate": 1.3818181818181818e-05,
"loss": 199.0401,
"step": 3420
},
{
"epoch": 0.027715156069457576,
"grad_norm": 1296.09130859375,
"learning_rate": 1.385858585858586e-05,
"loss": 238.6514,
"step": 3430
},
{
"epoch": 0.02779595827374171,
"grad_norm": 3411.5341796875,
"learning_rate": 1.3898989898989901e-05,
"loss": 299.6798,
"step": 3440
},
{
"epoch": 0.02787676047802584,
"grad_norm": 1198.8946533203125,
"learning_rate": 1.3939393939393942e-05,
"loss": 258.3361,
"step": 3450
},
{
"epoch": 0.027957562682309974,
"grad_norm": 839.223388671875,
"learning_rate": 1.3979797979797982e-05,
"loss": 222.8733,
"step": 3460
},
{
"epoch": 0.028038364886594107,
"grad_norm": 1462.1505126953125,
"learning_rate": 1.402020202020202e-05,
"loss": 206.9738,
"step": 3470
},
{
"epoch": 0.02811916709087824,
"grad_norm": 1066.7890625,
"learning_rate": 1.406060606060606e-05,
"loss": 190.5506,
"step": 3480
},
{
"epoch": 0.028199969295162372,
"grad_norm": 2753.951171875,
"learning_rate": 1.41010101010101e-05,
"loss": 210.7065,
"step": 3490
},
{
"epoch": 0.028280771499446505,
"grad_norm": 1700.187744140625,
"learning_rate": 1.4141414141414141e-05,
"loss": 239.0851,
"step": 3500
},
{
"epoch": 0.028361573703730637,
"grad_norm": 1913.965576171875,
"learning_rate": 1.4181818181818181e-05,
"loss": 244.5272,
"step": 3510
},
{
"epoch": 0.02844237590801477,
"grad_norm": 1347.6934814453125,
"learning_rate": 1.4222222222222224e-05,
"loss": 219.5849,
"step": 3520
},
{
"epoch": 0.028523178112298903,
"grad_norm": 894.3698120117188,
"learning_rate": 1.4262626262626264e-05,
"loss": 261.3107,
"step": 3530
},
{
"epoch": 0.028603980316583035,
"grad_norm": 649.42236328125,
"learning_rate": 1.4303030303030305e-05,
"loss": 202.5557,
"step": 3540
},
{
"epoch": 0.028684782520867168,
"grad_norm": 824.8812255859375,
"learning_rate": 1.4343434343434345e-05,
"loss": 245.8003,
"step": 3550
},
{
"epoch": 0.0287655847251513,
"grad_norm": 828.0931396484375,
"learning_rate": 1.4383838383838386e-05,
"loss": 260.2875,
"step": 3560
},
{
"epoch": 0.028846386929435437,
"grad_norm": 1334.4947509765625,
"learning_rate": 1.4424242424242426e-05,
"loss": 232.7898,
"step": 3570
},
{
"epoch": 0.02892718913371957,
"grad_norm": 1371.1171875,
"learning_rate": 1.4464646464646464e-05,
"loss": 418.4771,
"step": 3580
},
{
"epoch": 0.029007991338003702,
"grad_norm": 18497.5234375,
"learning_rate": 1.4505050505050504e-05,
"loss": 303.3979,
"step": 3590
},
{
"epoch": 0.029088793542287834,
"grad_norm": 1640.417724609375,
"learning_rate": 1.4545454545454545e-05,
"loss": 246.1203,
"step": 3600
},
{
"epoch": 0.029169595746571967,
"grad_norm": 866.4635620117188,
"learning_rate": 1.4585858585858587e-05,
"loss": 227.0032,
"step": 3610
},
{
"epoch": 0.0292503979508561,
"grad_norm": 1206.3389892578125,
"learning_rate": 1.4626262626262627e-05,
"loss": 240.7797,
"step": 3620
},
{
"epoch": 0.029331200155140232,
"grad_norm": 1930.5679931640625,
"learning_rate": 1.4666666666666668e-05,
"loss": 244.9207,
"step": 3630
},
{
"epoch": 0.029412002359424365,
"grad_norm": 1362.0755615234375,
"learning_rate": 1.4707070707070709e-05,
"loss": 223.5896,
"step": 3640
},
{
"epoch": 0.029492804563708497,
"grad_norm": 1778.240478515625,
"learning_rate": 1.4747474747474749e-05,
"loss": 233.0804,
"step": 3650
},
{
"epoch": 0.02957360676799263,
"grad_norm": 1185.7432861328125,
"learning_rate": 1.478787878787879e-05,
"loss": 269.5211,
"step": 3660
},
{
"epoch": 0.029654408972276763,
"grad_norm": 1272.7274169921875,
"learning_rate": 1.482828282828283e-05,
"loss": 256.0854,
"step": 3670
},
{
"epoch": 0.029735211176560895,
"grad_norm": 3724.482421875,
"learning_rate": 1.486868686868687e-05,
"loss": 220.0564,
"step": 3680
},
{
"epoch": 0.029816013380845028,
"grad_norm": 1362.2408447265625,
"learning_rate": 1.4909090909090908e-05,
"loss": 196.9579,
"step": 3690
},
{
"epoch": 0.02989681558512916,
"grad_norm": 1142.985107421875,
"learning_rate": 1.494949494949495e-05,
"loss": 298.3712,
"step": 3700
},
{
"epoch": 0.029977617789413297,
"grad_norm": 1711.4461669921875,
"learning_rate": 1.498989898989899e-05,
"loss": 248.673,
"step": 3710
},
{
"epoch": 0.03005841999369743,
"grad_norm": 1854.973876953125,
"learning_rate": 1.5030303030303031e-05,
"loss": 178.4528,
"step": 3720
},
{
"epoch": 0.030139222197981562,
"grad_norm": 2415.3564453125,
"learning_rate": 1.5070707070707072e-05,
"loss": 279.0313,
"step": 3730
},
{
"epoch": 0.030220024402265695,
"grad_norm": 1113.0447998046875,
"learning_rate": 1.5111111111111112e-05,
"loss": 263.2642,
"step": 3740
},
{
"epoch": 0.030300826606549827,
"grad_norm": 1523.1632080078125,
"learning_rate": 1.5151515151515153e-05,
"loss": 292.6833,
"step": 3750
},
{
"epoch": 0.03038162881083396,
"grad_norm": 1810.5382080078125,
"learning_rate": 1.5191919191919193e-05,
"loss": 260.2465,
"step": 3760
},
{
"epoch": 0.030462431015118092,
"grad_norm": 2051.318115234375,
"learning_rate": 1.5232323232323234e-05,
"loss": 249.5686,
"step": 3770
},
{
"epoch": 0.030543233219402225,
"grad_norm": 1145.482421875,
"learning_rate": 1.5272727272727276e-05,
"loss": 217.0,
"step": 3780
},
{
"epoch": 0.030624035423686358,
"grad_norm": 1456.9969482421875,
"learning_rate": 1.531313131313131e-05,
"loss": 247.1355,
"step": 3790
},
{
"epoch": 0.03070483762797049,
"grad_norm": 2063.9072265625,
"learning_rate": 1.5353535353535354e-05,
"loss": 317.8373,
"step": 3800
},
{
"epoch": 0.030785639832254623,
"grad_norm": 1188.59130859375,
"learning_rate": 1.5393939393939393e-05,
"loss": 251.0659,
"step": 3810
},
{
"epoch": 0.030866442036538756,
"grad_norm": 542.1653442382812,
"learning_rate": 1.5434343434343435e-05,
"loss": 205.6288,
"step": 3820
},
{
"epoch": 0.030947244240822888,
"grad_norm": 858.66552734375,
"learning_rate": 1.5474747474747474e-05,
"loss": 261.1724,
"step": 3830
},
{
"epoch": 0.031028046445107024,
"grad_norm": 1392.4208984375,
"learning_rate": 1.5515151515151516e-05,
"loss": 263.898,
"step": 3840
},
{
"epoch": 0.031108848649391157,
"grad_norm": 1089.10888671875,
"learning_rate": 1.5555555555555555e-05,
"loss": 263.4895,
"step": 3850
},
{
"epoch": 0.03118965085367529,
"grad_norm": 1323.1083984375,
"learning_rate": 1.5595959595959597e-05,
"loss": 224.5914,
"step": 3860
},
{
"epoch": 0.03127045305795942,
"grad_norm": 748.7206420898438,
"learning_rate": 1.563636363636364e-05,
"loss": 185.8181,
"step": 3870
},
{
"epoch": 0.031351255262243555,
"grad_norm": 1530.072021484375,
"learning_rate": 1.5676767676767678e-05,
"loss": 295.1081,
"step": 3880
},
{
"epoch": 0.031432057466527684,
"grad_norm": 1390.1978759765625,
"learning_rate": 1.571717171717172e-05,
"loss": 221.7333,
"step": 3890
},
{
"epoch": 0.03151285967081182,
"grad_norm": 1188.6934814453125,
"learning_rate": 1.5757575757575756e-05,
"loss": 231.4922,
"step": 3900
},
{
"epoch": 0.031593661875095956,
"grad_norm": 1810.8616943359375,
"learning_rate": 1.5797979797979798e-05,
"loss": 226.2008,
"step": 3910
},
{
"epoch": 0.031674464079380085,
"grad_norm": 1351.2021484375,
"learning_rate": 1.5838383838383837e-05,
"loss": 213.8082,
"step": 3920
},
{
"epoch": 0.03175526628366422,
"grad_norm": 1504.8511962890625,
"learning_rate": 1.587878787878788e-05,
"loss": 237.731,
"step": 3930
},
{
"epoch": 0.03183606848794835,
"grad_norm": 3990.205810546875,
"learning_rate": 1.5919191919191918e-05,
"loss": 308.5875,
"step": 3940
},
{
"epoch": 0.03191687069223249,
"grad_norm": 1052.4140625,
"learning_rate": 1.595959595959596e-05,
"loss": 173.6135,
"step": 3950
},
{
"epoch": 0.031997672896516616,
"grad_norm": 818.5986328125,
"learning_rate": 1.6000000000000003e-05,
"loss": 255.928,
"step": 3960
},
{
"epoch": 0.03207847510080075,
"grad_norm": 3015.482666015625,
"learning_rate": 1.604040404040404e-05,
"loss": 246.6157,
"step": 3970
},
{
"epoch": 0.03215927730508488,
"grad_norm": 1520.350341796875,
"learning_rate": 1.6080808080808084e-05,
"loss": 294.5478,
"step": 3980
},
{
"epoch": 0.03224007950936902,
"grad_norm": 1362.8385009765625,
"learning_rate": 1.6121212121212123e-05,
"loss": 263.2382,
"step": 3990
},
{
"epoch": 0.032320881713653146,
"grad_norm": 1330.2135009765625,
"learning_rate": 1.6161616161616165e-05,
"loss": 227.9952,
"step": 4000
},
{
"epoch": 0.03240168391793728,
"grad_norm": 2001.479248046875,
"learning_rate": 1.62020202020202e-05,
"loss": 373.7298,
"step": 4010
},
{
"epoch": 0.03248248612222141,
"grad_norm": 670.8789672851562,
"learning_rate": 1.6242424242424243e-05,
"loss": 252.2481,
"step": 4020
},
{
"epoch": 0.03256328832650555,
"grad_norm": 1504.35205078125,
"learning_rate": 1.628282828282828e-05,
"loss": 259.7328,
"step": 4030
},
{
"epoch": 0.03264409053078968,
"grad_norm": 1177.47509765625,
"learning_rate": 1.6323232323232324e-05,
"loss": 220.6592,
"step": 4040
},
{
"epoch": 0.03272489273507381,
"grad_norm": 889.9537353515625,
"learning_rate": 1.6363636363636366e-05,
"loss": 210.0868,
"step": 4050
},
{
"epoch": 0.03280569493935795,
"grad_norm": 1655.767333984375,
"learning_rate": 1.6404040404040405e-05,
"loss": 247.7082,
"step": 4060
},
{
"epoch": 0.03288649714364208,
"grad_norm": 1741.26416015625,
"learning_rate": 1.6444444444444447e-05,
"loss": 213.1305,
"step": 4070
},
{
"epoch": 0.032967299347926214,
"grad_norm": 1701.3470458984375,
"learning_rate": 1.6484848484848486e-05,
"loss": 197.4172,
"step": 4080
},
{
"epoch": 0.03304810155221034,
"grad_norm": 1241.48876953125,
"learning_rate": 1.6525252525252528e-05,
"loss": 200.895,
"step": 4090
},
{
"epoch": 0.03312890375649448,
"grad_norm": 4305.5234375,
"learning_rate": 1.6565656565656567e-05,
"loss": 270.1561,
"step": 4100
},
{
"epoch": 0.03320970596077861,
"grad_norm": 1233.9559326171875,
"learning_rate": 1.6606060606060606e-05,
"loss": 234.697,
"step": 4110
},
{
"epoch": 0.033290508165062745,
"grad_norm": 1864.9722900390625,
"learning_rate": 1.6646464646464645e-05,
"loss": 207.7519,
"step": 4120
},
{
"epoch": 0.033371310369346874,
"grad_norm": 696.45654296875,
"learning_rate": 1.6686868686868687e-05,
"loss": 260.9977,
"step": 4130
},
{
"epoch": 0.03345211257363101,
"grad_norm": 1083.8914794921875,
"learning_rate": 1.672727272727273e-05,
"loss": 296.5648,
"step": 4140
},
{
"epoch": 0.03353291477791514,
"grad_norm": 787.8980102539062,
"learning_rate": 1.6767676767676768e-05,
"loss": 252.0068,
"step": 4150
},
{
"epoch": 0.033613716982199275,
"grad_norm": 3963.899658203125,
"learning_rate": 1.680808080808081e-05,
"loss": 239.0976,
"step": 4160
},
{
"epoch": 0.033694519186483404,
"grad_norm": 1345.8841552734375,
"learning_rate": 1.684848484848485e-05,
"loss": 200.5568,
"step": 4170
},
{
"epoch": 0.03377532139076754,
"grad_norm": 1667.1441650390625,
"learning_rate": 1.688888888888889e-05,
"loss": 246.528,
"step": 4180
},
{
"epoch": 0.033856123595051676,
"grad_norm": 941.3829956054688,
"learning_rate": 1.692929292929293e-05,
"loss": 188.0032,
"step": 4190
},
{
"epoch": 0.033936925799335806,
"grad_norm": 2224.00048828125,
"learning_rate": 1.6969696969696972e-05,
"loss": 233.2688,
"step": 4200
},
{
"epoch": 0.03401772800361994,
"grad_norm": 990.577880859375,
"learning_rate": 1.701010101010101e-05,
"loss": 229.0408,
"step": 4210
},
{
"epoch": 0.03409853020790407,
"grad_norm": 1741.591064453125,
"learning_rate": 1.705050505050505e-05,
"loss": 210.8973,
"step": 4220
},
{
"epoch": 0.03417933241218821,
"grad_norm": 1565.2149658203125,
"learning_rate": 1.7090909090909092e-05,
"loss": 172.9691,
"step": 4230
},
{
"epoch": 0.034260134616472336,
"grad_norm": 1411.6668701171875,
"learning_rate": 1.713131313131313e-05,
"loss": 223.8018,
"step": 4240
},
{
"epoch": 0.03434093682075647,
"grad_norm": 849.447998046875,
"learning_rate": 1.7171717171717173e-05,
"loss": 277.72,
"step": 4250
},
{
"epoch": 0.0344217390250406,
"grad_norm": 1456.3353271484375,
"learning_rate": 1.7212121212121212e-05,
"loss": 269.1795,
"step": 4260
},
{
"epoch": 0.03450254122932474,
"grad_norm": 2039.048583984375,
"learning_rate": 1.7252525252525255e-05,
"loss": 203.6644,
"step": 4270
},
{
"epoch": 0.03458334343360887,
"grad_norm": 1037.1063232421875,
"learning_rate": 1.7292929292929293e-05,
"loss": 268.1442,
"step": 4280
},
{
"epoch": 0.034664145637893,
"grad_norm": 1481.98095703125,
"learning_rate": 1.7333333333333336e-05,
"loss": 246.1609,
"step": 4290
},
{
"epoch": 0.03474494784217713,
"grad_norm": 1042.147216796875,
"learning_rate": 1.7373737373737375e-05,
"loss": 360.0711,
"step": 4300
},
{
"epoch": 0.03482575004646127,
"grad_norm": 1008.8258666992188,
"learning_rate": 1.7414141414141417e-05,
"loss": 254.1684,
"step": 4310
},
{
"epoch": 0.0349065522507454,
"grad_norm": 1818.73681640625,
"learning_rate": 1.7454545454545456e-05,
"loss": 248.8469,
"step": 4320
},
{
"epoch": 0.03498735445502953,
"grad_norm": 2598.832763671875,
"learning_rate": 1.7494949494949494e-05,
"loss": 215.4962,
"step": 4330
},
{
"epoch": 0.03506815665931367,
"grad_norm": 5505.1572265625,
"learning_rate": 1.7535353535353537e-05,
"loss": 206.1825,
"step": 4340
},
{
"epoch": 0.0351489588635978,
"grad_norm": 872.9111328125,
"learning_rate": 1.7575757575757576e-05,
"loss": 226.9096,
"step": 4350
},
{
"epoch": 0.035229761067881935,
"grad_norm": 1309.483154296875,
"learning_rate": 1.7616161616161618e-05,
"loss": 347.4825,
"step": 4360
},
{
"epoch": 0.035310563272166064,
"grad_norm": 1847.357666015625,
"learning_rate": 1.7656565656565657e-05,
"loss": 283.7126,
"step": 4370
},
{
"epoch": 0.0353913654764502,
"grad_norm": 1132.7510986328125,
"learning_rate": 1.76969696969697e-05,
"loss": 238.7522,
"step": 4380
},
{
"epoch": 0.03547216768073433,
"grad_norm": 1338.4906005859375,
"learning_rate": 1.7737373737373738e-05,
"loss": 206.4677,
"step": 4390
},
{
"epoch": 0.035552969885018465,
"grad_norm": 889.9144897460938,
"learning_rate": 1.777777777777778e-05,
"loss": 218.258,
"step": 4400
},
{
"epoch": 0.035633772089302594,
"grad_norm": 1081.747314453125,
"learning_rate": 1.781818181818182e-05,
"loss": 227.4267,
"step": 4410
},
{
"epoch": 0.03571457429358673,
"grad_norm": 1337.2747802734375,
"learning_rate": 1.785858585858586e-05,
"loss": 216.5905,
"step": 4420
},
{
"epoch": 0.03579537649787086,
"grad_norm": 1070.0733642578125,
"learning_rate": 1.78989898989899e-05,
"loss": 244.2413,
"step": 4430
},
{
"epoch": 0.035876178702154995,
"grad_norm": 2713.52392578125,
"learning_rate": 1.793939393939394e-05,
"loss": 214.338,
"step": 4440
},
{
"epoch": 0.035956980906439125,
"grad_norm": 1579.244873046875,
"learning_rate": 1.797979797979798e-05,
"loss": 236.9733,
"step": 4450
},
{
"epoch": 0.03603778311072326,
"grad_norm": 1429.0421142578125,
"learning_rate": 1.802020202020202e-05,
"loss": 210.1912,
"step": 4460
},
{
"epoch": 0.0361185853150074,
"grad_norm": 1236.484375,
"learning_rate": 1.8060606060606062e-05,
"loss": 209.4003,
"step": 4470
},
{
"epoch": 0.036199387519291526,
"grad_norm": 2510.634521484375,
"learning_rate": 1.81010101010101e-05,
"loss": 261.3127,
"step": 4480
},
{
"epoch": 0.03628018972357566,
"grad_norm": 697.04345703125,
"learning_rate": 1.8141414141414143e-05,
"loss": 202.3308,
"step": 4490
},
{
"epoch": 0.03636099192785979,
"grad_norm": 1664.605712890625,
"learning_rate": 1.8181818181818182e-05,
"loss": 230.7549,
"step": 4500
},
{
"epoch": 0.03644179413214393,
"grad_norm": 1968.6279296875,
"learning_rate": 1.8222222222222224e-05,
"loss": 222.0125,
"step": 4510
},
{
"epoch": 0.036522596336428056,
"grad_norm": 1813.247314453125,
"learning_rate": 1.8262626262626263e-05,
"loss": 206.1146,
"step": 4520
},
{
"epoch": 0.03660339854071219,
"grad_norm": 1681.3162841796875,
"learning_rate": 1.8303030303030305e-05,
"loss": 281.5203,
"step": 4530
},
{
"epoch": 0.03668420074499632,
"grad_norm": 813.0327758789062,
"learning_rate": 1.8343434343434344e-05,
"loss": 241.7396,
"step": 4540
},
{
"epoch": 0.03676500294928046,
"grad_norm": 1714.4927978515625,
"learning_rate": 1.8383838383838383e-05,
"loss": 229.0337,
"step": 4550
},
{
"epoch": 0.03684580515356459,
"grad_norm": 1173.26318359375,
"learning_rate": 1.8424242424242425e-05,
"loss": 167.8814,
"step": 4560
},
{
"epoch": 0.03692660735784872,
"grad_norm": 1044.22509765625,
"learning_rate": 1.8464646464646464e-05,
"loss": 181.4134,
"step": 4570
},
{
"epoch": 0.03700740956213285,
"grad_norm": 1544.4964599609375,
"learning_rate": 1.8505050505050506e-05,
"loss": 264.1711,
"step": 4580
},
{
"epoch": 0.03708821176641699,
"grad_norm": 3204.8271484375,
"learning_rate": 1.8545454545454545e-05,
"loss": 209.0515,
"step": 4590
},
{
"epoch": 0.03716901397070112,
"grad_norm": 1948.9998779296875,
"learning_rate": 1.8585858585858588e-05,
"loss": 204.1481,
"step": 4600
},
{
"epoch": 0.037249816174985254,
"grad_norm": 985.3388671875,
"learning_rate": 1.8626262626262626e-05,
"loss": 251.0652,
"step": 4610
},
{
"epoch": 0.03733061837926939,
"grad_norm": 4716.29833984375,
"learning_rate": 1.866666666666667e-05,
"loss": 234.005,
"step": 4620
},
{
"epoch": 0.03741142058355352,
"grad_norm": 2745.129150390625,
"learning_rate": 1.8707070707070707e-05,
"loss": 222.8053,
"step": 4630
},
{
"epoch": 0.037492222787837655,
"grad_norm": 852.2494506835938,
"learning_rate": 1.874747474747475e-05,
"loss": 244.6,
"step": 4640
},
{
"epoch": 0.037573024992121784,
"grad_norm": 1276.906494140625,
"learning_rate": 1.878787878787879e-05,
"loss": 243.4739,
"step": 4650
},
{
"epoch": 0.03765382719640592,
"grad_norm": 2488.490478515625,
"learning_rate": 1.8828282828282827e-05,
"loss": 241.5105,
"step": 4660
},
{
"epoch": 0.03773462940069005,
"grad_norm": 1208.5731201171875,
"learning_rate": 1.886868686868687e-05,
"loss": 266.4298,
"step": 4670
},
{
"epoch": 0.037815431604974185,
"grad_norm": 1110.9935302734375,
"learning_rate": 1.890909090909091e-05,
"loss": 220.2013,
"step": 4680
},
{
"epoch": 0.037896233809258315,
"grad_norm": 966.4763793945312,
"learning_rate": 1.894949494949495e-05,
"loss": 213.4089,
"step": 4690
},
{
"epoch": 0.03797703601354245,
"grad_norm": 888.4136352539062,
"learning_rate": 1.898989898989899e-05,
"loss": 192.6133,
"step": 4700
},
{
"epoch": 0.03805783821782658,
"grad_norm": 1441.930419921875,
"learning_rate": 1.9030303030303032e-05,
"loss": 210.6855,
"step": 4710
},
{
"epoch": 0.038138640422110716,
"grad_norm": 1268.2919921875,
"learning_rate": 1.907070707070707e-05,
"loss": 196.9399,
"step": 4720
},
{
"epoch": 0.038219442626394845,
"grad_norm": 714.101318359375,
"learning_rate": 1.9111111111111113e-05,
"loss": 236.1493,
"step": 4730
},
{
"epoch": 0.03830024483067898,
"grad_norm": 1360.3662109375,
"learning_rate": 1.9151515151515155e-05,
"loss": 277.1614,
"step": 4740
},
{
"epoch": 0.03838104703496312,
"grad_norm": 857.1802368164062,
"learning_rate": 1.919191919191919e-05,
"loss": 233.6975,
"step": 4750
},
{
"epoch": 0.038461849239247246,
"grad_norm": 1430.3370361328125,
"learning_rate": 1.9232323232323233e-05,
"loss": 206.9375,
"step": 4760
},
{
"epoch": 0.03854265144353138,
"grad_norm": 999.745849609375,
"learning_rate": 1.9272727272727272e-05,
"loss": 177.6682,
"step": 4770
},
{
"epoch": 0.03862345364781551,
"grad_norm": 1979.0234375,
"learning_rate": 1.9313131313131314e-05,
"loss": 237.5471,
"step": 4780
},
{
"epoch": 0.03870425585209965,
"grad_norm": 1399.9544677734375,
"learning_rate": 1.9353535353535353e-05,
"loss": 209.8267,
"step": 4790
},
{
"epoch": 0.03878505805638378,
"grad_norm": 1058.5128173828125,
"learning_rate": 1.9393939393939395e-05,
"loss": 206.1269,
"step": 4800
},
{
"epoch": 0.03886586026066791,
"grad_norm": 1852.674072265625,
"learning_rate": 1.9434343434343434e-05,
"loss": 192.6013,
"step": 4810
},
{
"epoch": 0.03894666246495204,
"grad_norm": 1104.2967529296875,
"learning_rate": 1.9474747474747476e-05,
"loss": 252.5522,
"step": 4820
},
{
"epoch": 0.03902746466923618,
"grad_norm": 1426.0396728515625,
"learning_rate": 1.951515151515152e-05,
"loss": 250.6448,
"step": 4830
},
{
"epoch": 0.03910826687352031,
"grad_norm": 1632.4510498046875,
"learning_rate": 1.9555555555555557e-05,
"loss": 163.3638,
"step": 4840
},
{
"epoch": 0.03918906907780444,
"grad_norm": 700.0907592773438,
"learning_rate": 1.95959595959596e-05,
"loss": 236.7388,
"step": 4850
},
{
"epoch": 0.03926987128208857,
"grad_norm": 1205.572265625,
"learning_rate": 1.9636363636363635e-05,
"loss": 272.2705,
"step": 4860
},
{
"epoch": 0.03935067348637271,
"grad_norm": 799.412353515625,
"learning_rate": 1.9676767676767677e-05,
"loss": 171.4291,
"step": 4870
},
{
"epoch": 0.03943147569065684,
"grad_norm": 1350.2025146484375,
"learning_rate": 1.9717171717171716e-05,
"loss": 233.9921,
"step": 4880
},
{
"epoch": 0.039512277894940974,
"grad_norm": 976.219970703125,
"learning_rate": 1.975757575757576e-05,
"loss": 189.0711,
"step": 4890
},
{
"epoch": 0.03959308009922511,
"grad_norm": 947.8401489257812,
"learning_rate": 1.9797979797979797e-05,
"loss": 207.2786,
"step": 4900
},
{
"epoch": 0.03967388230350924,
"grad_norm": 1402.2440185546875,
"learning_rate": 1.983838383838384e-05,
"loss": 233.6717,
"step": 4910
},
{
"epoch": 0.039754684507793375,
"grad_norm": 2319.2314453125,
"learning_rate": 1.987878787878788e-05,
"loss": 268.4254,
"step": 4920
},
{
"epoch": 0.039835486712077504,
"grad_norm": 1344.019775390625,
"learning_rate": 1.991919191919192e-05,
"loss": 215.5304,
"step": 4930
},
{
"epoch": 0.03991628891636164,
"grad_norm": 1209.1622314453125,
"learning_rate": 1.9959595959595963e-05,
"loss": 202.8059,
"step": 4940
},
{
"epoch": 0.03999709112064577,
"grad_norm": 1872.3892822265625,
"learning_rate": 2e-05,
"loss": 193.5764,
"step": 4950
},
{
"epoch": 0.040077893324929906,
"grad_norm": 1944.2449951171875,
"learning_rate": 2.0040404040404044e-05,
"loss": 273.8487,
"step": 4960
},
{
"epoch": 0.040158695529214035,
"grad_norm": 988.1495361328125,
"learning_rate": 2.008080808080808e-05,
"loss": 202.3245,
"step": 4970
},
{
"epoch": 0.04023949773349817,
"grad_norm": 1082.6280517578125,
"learning_rate": 2.012121212121212e-05,
"loss": 190.6009,
"step": 4980
},
{
"epoch": 0.0403202999377823,
"grad_norm": 1510.5738525390625,
"learning_rate": 2.016161616161616e-05,
"loss": 262.4282,
"step": 4990
},
{
"epoch": 0.040401102142066436,
"grad_norm": 1080.0328369140625,
"learning_rate": 2.0202020202020203e-05,
"loss": 179.5178,
"step": 5000
},
{
"epoch": 0.040481904346350565,
"grad_norm": 1204.5341796875,
"learning_rate": 2.0242424242424245e-05,
"loss": 208.4234,
"step": 5010
},
{
"epoch": 0.0405627065506347,
"grad_norm": 788.6203002929688,
"learning_rate": 2.0282828282828284e-05,
"loss": 222.1854,
"step": 5020
},
{
"epoch": 0.04064350875491884,
"grad_norm": 2447.934326171875,
"learning_rate": 2.0323232323232326e-05,
"loss": 183.1969,
"step": 5030
},
{
"epoch": 0.04072431095920297,
"grad_norm": 1879.5914306640625,
"learning_rate": 2.0363636363636365e-05,
"loss": 235.8428,
"step": 5040
},
{
"epoch": 0.0408051131634871,
"grad_norm": 859.5083618164062,
"learning_rate": 2.0404040404040407e-05,
"loss": 223.1974,
"step": 5050
},
{
"epoch": 0.04088591536777123,
"grad_norm": 591.982421875,
"learning_rate": 2.0444444444444446e-05,
"loss": 195.905,
"step": 5060
},
{
"epoch": 0.04096671757205537,
"grad_norm": 2516.256103515625,
"learning_rate": 2.0484848484848485e-05,
"loss": 224.4586,
"step": 5070
},
{
"epoch": 0.0410475197763395,
"grad_norm": 1155.78271484375,
"learning_rate": 2.0525252525252524e-05,
"loss": 237.8034,
"step": 5080
},
{
"epoch": 0.04112832198062363,
"grad_norm": 760.8511962890625,
"learning_rate": 2.0565656565656566e-05,
"loss": 213.4372,
"step": 5090
},
{
"epoch": 0.04120912418490776,
"grad_norm": 746.3182983398438,
"learning_rate": 2.0606060606060608e-05,
"loss": 246.9279,
"step": 5100
},
{
"epoch": 0.0412899263891919,
"grad_norm": 1112.6119384765625,
"learning_rate": 2.0646464646464647e-05,
"loss": 215.4636,
"step": 5110
},
{
"epoch": 0.04137072859347603,
"grad_norm": 1308.880126953125,
"learning_rate": 2.068686868686869e-05,
"loss": 184.3576,
"step": 5120
},
{
"epoch": 0.041451530797760164,
"grad_norm": 1182.3695068359375,
"learning_rate": 2.0727272727272728e-05,
"loss": 251.3663,
"step": 5130
},
{
"epoch": 0.04153233300204429,
"grad_norm": 3545.449951171875,
"learning_rate": 2.076767676767677e-05,
"loss": 221.7183,
"step": 5140
},
{
"epoch": 0.04161313520632843,
"grad_norm": 1155.616455078125,
"learning_rate": 2.080808080808081e-05,
"loss": 181.7703,
"step": 5150
},
{
"epoch": 0.04169393741061256,
"grad_norm": 927.0892333984375,
"learning_rate": 2.084848484848485e-05,
"loss": 242.7771,
"step": 5160
},
{
"epoch": 0.041774739614896694,
"grad_norm": 1621.09326171875,
"learning_rate": 2.088888888888889e-05,
"loss": 168.8398,
"step": 5170
},
{
"epoch": 0.04185554181918083,
"grad_norm": 1823.0281982421875,
"learning_rate": 2.092929292929293e-05,
"loss": 226.3993,
"step": 5180
},
{
"epoch": 0.04193634402346496,
"grad_norm": 1904.581298828125,
"learning_rate": 2.096969696969697e-05,
"loss": 274.6904,
"step": 5190
},
{
"epoch": 0.042017146227749096,
"grad_norm": 1195.8973388671875,
"learning_rate": 2.101010101010101e-05,
"loss": 193.929,
"step": 5200
},
{
"epoch": 0.042097948432033225,
"grad_norm": 809.5712890625,
"learning_rate": 2.1050505050505052e-05,
"loss": 183.259,
"step": 5210
},
{
"epoch": 0.04217875063631736,
"grad_norm": 1392.5491943359375,
"learning_rate": 2.109090909090909e-05,
"loss": 220.0326,
"step": 5220
},
{
"epoch": 0.04225955284060149,
"grad_norm": 1818.6051025390625,
"learning_rate": 2.1131313131313134e-05,
"loss": 209.3423,
"step": 5230
},
{
"epoch": 0.042340355044885626,
"grad_norm": 756.583740234375,
"learning_rate": 2.1171717171717172e-05,
"loss": 152.79,
"step": 5240
},
{
"epoch": 0.042421157249169755,
"grad_norm": 1358.5194091796875,
"learning_rate": 2.1212121212121215e-05,
"loss": 223.6846,
"step": 5250
},
{
"epoch": 0.04250195945345389,
"grad_norm": 2302.727783203125,
"learning_rate": 2.1252525252525254e-05,
"loss": 206.5412,
"step": 5260
},
{
"epoch": 0.04258276165773802,
"grad_norm": 1090.666259765625,
"learning_rate": 2.1292929292929296e-05,
"loss": 197.9379,
"step": 5270
},
{
"epoch": 0.04266356386202216,
"grad_norm": 1535.5264892578125,
"learning_rate": 2.1333333333333335e-05,
"loss": 172.5529,
"step": 5280
},
{
"epoch": 0.042744366066306286,
"grad_norm": 1242.1055908203125,
"learning_rate": 2.1373737373737373e-05,
"loss": 182.2667,
"step": 5290
},
{
"epoch": 0.04282516827059042,
"grad_norm": 1571.221923828125,
"learning_rate": 2.1414141414141416e-05,
"loss": 206.9152,
"step": 5300
},
{
"epoch": 0.04290597047487456,
"grad_norm": 1733.92578125,
"learning_rate": 2.1454545454545455e-05,
"loss": 253.6228,
"step": 5310
},
{
"epoch": 0.04298677267915869,
"grad_norm": 1736.4722900390625,
"learning_rate": 2.1494949494949497e-05,
"loss": 209.5105,
"step": 5320
},
{
"epoch": 0.04306757488344282,
"grad_norm": 846.6854248046875,
"learning_rate": 2.1535353535353536e-05,
"loss": 227.6331,
"step": 5330
},
{
"epoch": 0.04314837708772695,
"grad_norm": 793.491943359375,
"learning_rate": 2.1575757575757578e-05,
"loss": 190.9206,
"step": 5340
},
{
"epoch": 0.04322917929201109,
"grad_norm": 1314.4940185546875,
"learning_rate": 2.1616161616161617e-05,
"loss": 278.1586,
"step": 5350
},
{
"epoch": 0.04330998149629522,
"grad_norm": 1807.1669921875,
"learning_rate": 2.165656565656566e-05,
"loss": 246.6954,
"step": 5360
},
{
"epoch": 0.043390783700579354,
"grad_norm": 1456.6739501953125,
"learning_rate": 2.1696969696969698e-05,
"loss": 198.1051,
"step": 5370
},
{
"epoch": 0.04347158590486348,
"grad_norm": 2645.863037109375,
"learning_rate": 2.173737373737374e-05,
"loss": 263.7012,
"step": 5380
},
{
"epoch": 0.04355238810914762,
"grad_norm": 890.2818603515625,
"learning_rate": 2.177777777777778e-05,
"loss": 206.08,
"step": 5390
},
{
"epoch": 0.04363319031343175,
"grad_norm": 1066.948974609375,
"learning_rate": 2.1818181818181818e-05,
"loss": 203.3024,
"step": 5400
},
{
"epoch": 0.043713992517715884,
"grad_norm": 1678.3651123046875,
"learning_rate": 2.185858585858586e-05,
"loss": 287.4994,
"step": 5410
},
{
"epoch": 0.04379479472200001,
"grad_norm": 1427.133544921875,
"learning_rate": 2.18989898989899e-05,
"loss": 236.3808,
"step": 5420
},
{
"epoch": 0.04387559692628415,
"grad_norm": 993.3723754882812,
"learning_rate": 2.193939393939394e-05,
"loss": 221.2247,
"step": 5430
},
{
"epoch": 0.04395639913056828,
"grad_norm": 919.2279663085938,
"learning_rate": 2.197979797979798e-05,
"loss": 232.8961,
"step": 5440
},
{
"epoch": 0.044037201334852415,
"grad_norm": 1196.51904296875,
"learning_rate": 2.2020202020202022e-05,
"loss": 208.7773,
"step": 5450
},
{
"epoch": 0.04411800353913655,
"grad_norm": 937.6903076171875,
"learning_rate": 2.206060606060606e-05,
"loss": 159.7425,
"step": 5460
},
{
"epoch": 0.04419880574342068,
"grad_norm": 2946.419921875,
"learning_rate": 2.2101010101010103e-05,
"loss": 201.0844,
"step": 5470
},
{
"epoch": 0.044279607947704816,
"grad_norm": 1663.4422607421875,
"learning_rate": 2.2141414141414142e-05,
"loss": 140.8333,
"step": 5480
},
{
"epoch": 0.044360410151988945,
"grad_norm": 1202.589599609375,
"learning_rate": 2.2181818181818184e-05,
"loss": 210.6169,
"step": 5490
},
{
"epoch": 0.04444121235627308,
"grad_norm": 1676.0555419921875,
"learning_rate": 2.2222222222222223e-05,
"loss": 300.5972,
"step": 5500
},
{
"epoch": 0.04452201456055721,
"grad_norm": 1122.7333984375,
"learning_rate": 2.2262626262626262e-05,
"loss": 223.7688,
"step": 5510
},
{
"epoch": 0.044602816764841346,
"grad_norm": 842.3754272460938,
"learning_rate": 2.2303030303030304e-05,
"loss": 231.1573,
"step": 5520
},
{
"epoch": 0.044683618969125476,
"grad_norm": 912.3519897460938,
"learning_rate": 2.2343434343434343e-05,
"loss": 161.5479,
"step": 5530
},
{
"epoch": 0.04476442117340961,
"grad_norm": 2117.377197265625,
"learning_rate": 2.2383838383838385e-05,
"loss": 178.251,
"step": 5540
},
{
"epoch": 0.04484522337769374,
"grad_norm": 1402.2164306640625,
"learning_rate": 2.2424242424242424e-05,
"loss": 209.2086,
"step": 5550
},
{
"epoch": 0.04492602558197788,
"grad_norm": 1458.323974609375,
"learning_rate": 2.2464646464646467e-05,
"loss": 243.9479,
"step": 5560
},
{
"epoch": 0.045006827786262006,
"grad_norm": 2175.216796875,
"learning_rate": 2.2505050505050505e-05,
"loss": 189.8892,
"step": 5570
},
{
"epoch": 0.04508762999054614,
"grad_norm": 1899.4354248046875,
"learning_rate": 2.2545454545454548e-05,
"loss": 335.0552,
"step": 5580
},
{
"epoch": 0.04516843219483028,
"grad_norm": 1230.814697265625,
"learning_rate": 2.2585858585858587e-05,
"loss": 194.9335,
"step": 5590
},
{
"epoch": 0.04524923439911441,
"grad_norm": 2101.527587890625,
"learning_rate": 2.262626262626263e-05,
"loss": 257.3806,
"step": 5600
},
{
"epoch": 0.045330036603398544,
"grad_norm": 1695.30810546875,
"learning_rate": 2.2666666666666668e-05,
"loss": 219.7137,
"step": 5610
},
{
"epoch": 0.04541083880768267,
"grad_norm": 1386.2855224609375,
"learning_rate": 2.2707070707070706e-05,
"loss": 236.2214,
"step": 5620
},
{
"epoch": 0.04549164101196681,
"grad_norm": 1138.779052734375,
"learning_rate": 2.274747474747475e-05,
"loss": 192.9845,
"step": 5630
},
{
"epoch": 0.04557244321625094,
"grad_norm": 2650.991943359375,
"learning_rate": 2.2787878787878788e-05,
"loss": 233.3904,
"step": 5640
},
{
"epoch": 0.045653245420535074,
"grad_norm": 1309.0333251953125,
"learning_rate": 2.282828282828283e-05,
"loss": 225.3846,
"step": 5650
},
{
"epoch": 0.0457340476248192,
"grad_norm": 930.385009765625,
"learning_rate": 2.286868686868687e-05,
"loss": 236.1336,
"step": 5660
},
{
"epoch": 0.04581484982910334,
"grad_norm": 1646.2891845703125,
"learning_rate": 2.290909090909091e-05,
"loss": 227.3526,
"step": 5670
},
{
"epoch": 0.04589565203338747,
"grad_norm": 2285.751708984375,
"learning_rate": 2.294949494949495e-05,
"loss": 236.6346,
"step": 5680
},
{
"epoch": 0.045976454237671605,
"grad_norm": 3180.75537109375,
"learning_rate": 2.2989898989898992e-05,
"loss": 177.5457,
"step": 5690
},
{
"epoch": 0.046057256441955734,
"grad_norm": 1423.35009765625,
"learning_rate": 2.3030303030303034e-05,
"loss": 194.2139,
"step": 5700
},
{
"epoch": 0.04613805864623987,
"grad_norm": 1577.701171875,
"learning_rate": 2.307070707070707e-05,
"loss": 183.8717,
"step": 5710
},
{
"epoch": 0.046218860850524,
"grad_norm": 1255.1485595703125,
"learning_rate": 2.3111111111111112e-05,
"loss": 213.8492,
"step": 5720
},
{
"epoch": 0.046299663054808135,
"grad_norm": 1154.9453125,
"learning_rate": 2.315151515151515e-05,
"loss": 219.6154,
"step": 5730
},
{
"epoch": 0.04638046525909227,
"grad_norm": 3208.9140625,
"learning_rate": 2.3191919191919193e-05,
"loss": 212.2527,
"step": 5740
},
{
"epoch": 0.0464612674633764,
"grad_norm": 826.8831787109375,
"learning_rate": 2.3232323232323232e-05,
"loss": 193.0573,
"step": 5750
},
{
"epoch": 0.046542069667660536,
"grad_norm": 953.578369140625,
"learning_rate": 2.3272727272727274e-05,
"loss": 200.1285,
"step": 5760
},
{
"epoch": 0.046622871871944666,
"grad_norm": 948.6517944335938,
"learning_rate": 2.3313131313131313e-05,
"loss": 226.3946,
"step": 5770
},
{
"epoch": 0.0467036740762288,
"grad_norm": 1502.9415283203125,
"learning_rate": 2.3353535353535355e-05,
"loss": 301.4247,
"step": 5780
},
{
"epoch": 0.04678447628051293,
"grad_norm": 592.7190551757812,
"learning_rate": 2.3393939393939397e-05,
"loss": 171.6613,
"step": 5790
},
{
"epoch": 0.04686527848479707,
"grad_norm": 774.3163452148438,
"learning_rate": 2.3434343434343436e-05,
"loss": 174.7567,
"step": 5800
},
{
"epoch": 0.046946080689081196,
"grad_norm": 1000.3840942382812,
"learning_rate": 2.347474747474748e-05,
"loss": 140.1143,
"step": 5810
},
{
"epoch": 0.04702688289336533,
"grad_norm": 1050.761474609375,
"learning_rate": 2.3515151515151514e-05,
"loss": 234.2542,
"step": 5820
},
{
"epoch": 0.04710768509764946,
"grad_norm": 1076.979248046875,
"learning_rate": 2.3555555555555556e-05,
"loss": 170.2877,
"step": 5830
},
{
"epoch": 0.0471884873019336,
"grad_norm": 1528.865478515625,
"learning_rate": 2.3595959595959595e-05,
"loss": 280.3715,
"step": 5840
},
{
"epoch": 0.047269289506217727,
"grad_norm": 1554.0205078125,
"learning_rate": 2.3636363636363637e-05,
"loss": 258.9206,
"step": 5850
},
{
"epoch": 0.04735009171050186,
"grad_norm": 969.7879028320312,
"learning_rate": 2.3676767676767676e-05,
"loss": 173.4592,
"step": 5860
},
{
"epoch": 0.047430893914786,
"grad_norm": 1271.55322265625,
"learning_rate": 2.371717171717172e-05,
"loss": 187.7373,
"step": 5870
},
{
"epoch": 0.04751169611907013,
"grad_norm": 757.3799438476562,
"learning_rate": 2.375757575757576e-05,
"loss": 206.0978,
"step": 5880
},
{
"epoch": 0.047592498323354264,
"grad_norm": 1099.2119140625,
"learning_rate": 2.37979797979798e-05,
"loss": 191.4486,
"step": 5890
},
{
"epoch": 0.04767330052763839,
"grad_norm": 895.0558471679688,
"learning_rate": 2.3838383838383842e-05,
"loss": 197.1677,
"step": 5900
},
{
"epoch": 0.04775410273192253,
"grad_norm": 900.752685546875,
"learning_rate": 2.387878787878788e-05,
"loss": 209.3482,
"step": 5910
},
{
"epoch": 0.04783490493620666,
"grad_norm": 865.3425903320312,
"learning_rate": 2.3919191919191923e-05,
"loss": 211.704,
"step": 5920
},
{
"epoch": 0.047915707140490794,
"grad_norm": 1376.961181640625,
"learning_rate": 2.395959595959596e-05,
"loss": 197.2012,
"step": 5930
},
{
"epoch": 0.047996509344774924,
"grad_norm": 2671.92236328125,
"learning_rate": 2.4e-05,
"loss": 262.8319,
"step": 5940
},
{
"epoch": 0.04807731154905906,
"grad_norm": 4328.66552734375,
"learning_rate": 2.404040404040404e-05,
"loss": 263.4226,
"step": 5950
},
{
"epoch": 0.04815811375334319,
"grad_norm": 1454.4398193359375,
"learning_rate": 2.4080808080808082e-05,
"loss": 173.7909,
"step": 5960
},
{
"epoch": 0.048238915957627325,
"grad_norm": 1238.2913818359375,
"learning_rate": 2.4121212121212124e-05,
"loss": 190.8571,
"step": 5970
},
{
"epoch": 0.048319718161911454,
"grad_norm": 1106.6146240234375,
"learning_rate": 2.4161616161616163e-05,
"loss": 252.6962,
"step": 5980
},
{
"epoch": 0.04840052036619559,
"grad_norm": 1612.1171875,
"learning_rate": 2.4202020202020205e-05,
"loss": 176.1714,
"step": 5990
},
{
"epoch": 0.04848132257047972,
"grad_norm": 684.4707641601562,
"learning_rate": 2.4242424242424244e-05,
"loss": 236.6299,
"step": 6000
},
{
"epoch": 0.048562124774763855,
"grad_norm": 5278.638671875,
"learning_rate": 2.4282828282828286e-05,
"loss": 200.9588,
"step": 6010
},
{
"epoch": 0.04864292697904799,
"grad_norm": 2136.859375,
"learning_rate": 2.4323232323232325e-05,
"loss": 249.8048,
"step": 6020
},
{
"epoch": 0.04872372918333212,
"grad_norm": 704.8456420898438,
"learning_rate": 2.4363636363636364e-05,
"loss": 210.816,
"step": 6030
},
{
"epoch": 0.04880453138761626,
"grad_norm": 2405.291259765625,
"learning_rate": 2.4404040404040403e-05,
"loss": 180.7068,
"step": 6040
},
{
"epoch": 0.048885333591900386,
"grad_norm": 1121.5928955078125,
"learning_rate": 2.4444444444444445e-05,
"loss": 268.1764,
"step": 6050
},
{
"epoch": 0.04896613579618452,
"grad_norm": 1185.4925537109375,
"learning_rate": 2.4484848484848484e-05,
"loss": 252.5901,
"step": 6060
},
{
"epoch": 0.04904693800046865,
"grad_norm": 1037.7261962890625,
"learning_rate": 2.4525252525252526e-05,
"loss": 217.7089,
"step": 6070
},
{
"epoch": 0.04912774020475279,
"grad_norm": 3574.91943359375,
"learning_rate": 2.4565656565656568e-05,
"loss": 248.0757,
"step": 6080
},
{
"epoch": 0.049208542409036916,
"grad_norm": 1335.7510986328125,
"learning_rate": 2.4606060606060607e-05,
"loss": 243.5903,
"step": 6090
},
{
"epoch": 0.04928934461332105,
"grad_norm": 1548.2281494140625,
"learning_rate": 2.464646464646465e-05,
"loss": 204.2808,
"step": 6100
},
{
"epoch": 0.04937014681760518,
"grad_norm": 1327.641357421875,
"learning_rate": 2.4686868686868688e-05,
"loss": 175.3226,
"step": 6110
},
{
"epoch": 0.04945094902188932,
"grad_norm": 1096.567626953125,
"learning_rate": 2.472727272727273e-05,
"loss": 251.3891,
"step": 6120
},
{
"epoch": 0.04953175122617345,
"grad_norm": 916.0780639648438,
"learning_rate": 2.476767676767677e-05,
"loss": 265.5964,
"step": 6130
},
{
"epoch": 0.04961255343045758,
"grad_norm": 3319.821533203125,
"learning_rate": 2.4808080808080808e-05,
"loss": 199.656,
"step": 6140
},
{
"epoch": 0.04969335563474172,
"grad_norm": 804.5398559570312,
"learning_rate": 2.4848484848484847e-05,
"loss": 176.793,
"step": 6150
},
{
"epoch": 0.04977415783902585,
"grad_norm": 1266.6590576171875,
"learning_rate": 2.488888888888889e-05,
"loss": 172.6065,
"step": 6160
},
{
"epoch": 0.049854960043309984,
"grad_norm": 953.1856689453125,
"learning_rate": 2.492929292929293e-05,
"loss": 259.3056,
"step": 6170
},
{
"epoch": 0.049935762247594113,
"grad_norm": 1643.679443359375,
"learning_rate": 2.496969696969697e-05,
"loss": 227.5671,
"step": 6180
},
{
"epoch": 0.05001656445187825,
"grad_norm": 2092.3837890625,
"learning_rate": 2.5010101010101013e-05,
"loss": 231.7141,
"step": 6190
},
{
"epoch": 0.05009736665616238,
"grad_norm": 5872.7822265625,
"learning_rate": 2.505050505050505e-05,
"loss": 307.3282,
"step": 6200
},
{
"epoch": 0.050178168860446515,
"grad_norm": 1653.10888671875,
"learning_rate": 2.5090909090909094e-05,
"loss": 290.71,
"step": 6210
},
{
"epoch": 0.050258971064730644,
"grad_norm": 5940.2861328125,
"learning_rate": 2.5131313131313133e-05,
"loss": 298.1718,
"step": 6220
},
{
"epoch": 0.05033977326901478,
"grad_norm": 1056.6617431640625,
"learning_rate": 2.5171717171717175e-05,
"loss": 167.573,
"step": 6230
},
{
"epoch": 0.05042057547329891,
"grad_norm": 1492.5479736328125,
"learning_rate": 2.5212121212121214e-05,
"loss": 209.9481,
"step": 6240
},
{
"epoch": 0.050501377677583045,
"grad_norm": 764.5651245117188,
"learning_rate": 2.5252525252525256e-05,
"loss": 189.9945,
"step": 6250
},
{
"epoch": 0.050582179881867174,
"grad_norm": 2933.18603515625,
"learning_rate": 2.5292929292929295e-05,
"loss": 228.2252,
"step": 6260
},
{
"epoch": 0.05066298208615131,
"grad_norm": 2692.583740234375,
"learning_rate": 2.5333333333333337e-05,
"loss": 217.1123,
"step": 6270
},
{
"epoch": 0.05074378429043544,
"grad_norm": 1611.5694580078125,
"learning_rate": 2.5373737373737376e-05,
"loss": 199.2745,
"step": 6280
},
{
"epoch": 0.050824586494719576,
"grad_norm": 638.3251953125,
"learning_rate": 2.5414141414141418e-05,
"loss": 283.4336,
"step": 6290
},
{
"epoch": 0.05090538869900371,
"grad_norm": 960.7551879882812,
"learning_rate": 2.5454545454545454e-05,
"loss": 199.2895,
"step": 6300
},
{
"epoch": 0.05098619090328784,
"grad_norm": 1416.6865234375,
"learning_rate": 2.5494949494949492e-05,
"loss": 247.6437,
"step": 6310
},
{
"epoch": 0.05106699310757198,
"grad_norm": 962.5587158203125,
"learning_rate": 2.5535353535353535e-05,
"loss": 222.514,
"step": 6320
},
{
"epoch": 0.051147795311856106,
"grad_norm": 1019.0704956054688,
"learning_rate": 2.5575757575757573e-05,
"loss": 233.7968,
"step": 6330
},
{
"epoch": 0.05122859751614024,
"grad_norm": 1380.1087646484375,
"learning_rate": 2.5616161616161616e-05,
"loss": 203.472,
"step": 6340
},
{
"epoch": 0.05130939972042437,
"grad_norm": 765.1551513671875,
"learning_rate": 2.5656565656565658e-05,
"loss": 202.9591,
"step": 6350
},
{
"epoch": 0.05139020192470851,
"grad_norm": 854.4512329101562,
"learning_rate": 2.5696969696969697e-05,
"loss": 152.8654,
"step": 6360
},
{
"epoch": 0.05147100412899264,
"grad_norm": 1366.1529541015625,
"learning_rate": 2.573737373737374e-05,
"loss": 202.4912,
"step": 6370
},
{
"epoch": 0.05155180633327677,
"grad_norm": 812.153564453125,
"learning_rate": 2.5777777777777778e-05,
"loss": 190.2283,
"step": 6380
},
{
"epoch": 0.0516326085375609,
"grad_norm": 2072.30029296875,
"learning_rate": 2.581818181818182e-05,
"loss": 250.4601,
"step": 6390
},
{
"epoch": 0.05171341074184504,
"grad_norm": 1064.25732421875,
"learning_rate": 2.585858585858586e-05,
"loss": 243.8253,
"step": 6400
},
{
"epoch": 0.05179421294612917,
"grad_norm": 1004.585205078125,
"learning_rate": 2.58989898989899e-05,
"loss": 233.6981,
"step": 6410
},
{
"epoch": 0.0518750151504133,
"grad_norm": 781.0443115234375,
"learning_rate": 2.593939393939394e-05,
"loss": 231.5708,
"step": 6420
},
{
"epoch": 0.05195581735469744,
"grad_norm": 1038.6923828125,
"learning_rate": 2.5979797979797982e-05,
"loss": 166.9408,
"step": 6430
},
{
"epoch": 0.05203661955898157,
"grad_norm": 1369.49560546875,
"learning_rate": 2.602020202020202e-05,
"loss": 212.7086,
"step": 6440
},
{
"epoch": 0.052117421763265705,
"grad_norm": 1065.5115966796875,
"learning_rate": 2.6060606060606063e-05,
"loss": 197.232,
"step": 6450
},
{
"epoch": 0.052198223967549834,
"grad_norm": 1192.5135498046875,
"learning_rate": 2.6101010101010102e-05,
"loss": 210.3559,
"step": 6460
},
{
"epoch": 0.05227902617183397,
"grad_norm": 2817.4658203125,
"learning_rate": 2.6141414141414145e-05,
"loss": 218.5129,
"step": 6470
},
{
"epoch": 0.0523598283761181,
"grad_norm": 1661.2547607421875,
"learning_rate": 2.6181818181818187e-05,
"loss": 251.3214,
"step": 6480
},
{
"epoch": 0.052440630580402235,
"grad_norm": 1465.83251953125,
"learning_rate": 2.6222222222222226e-05,
"loss": 167.4487,
"step": 6490
},
{
"epoch": 0.052521432784686364,
"grad_norm": 1172.0814208984375,
"learning_rate": 2.6262626262626268e-05,
"loss": 202.9199,
"step": 6500
},
{
"epoch": 0.0526022349889705,
"grad_norm": 845.3886108398438,
"learning_rate": 2.63030303030303e-05,
"loss": 245.5335,
"step": 6510
},
{
"epoch": 0.05268303719325463,
"grad_norm": 1505.1903076171875,
"learning_rate": 2.6343434343434342e-05,
"loss": 189.6107,
"step": 6520
},
{
"epoch": 0.052763839397538766,
"grad_norm": 855.8611450195312,
"learning_rate": 2.6383838383838384e-05,
"loss": 168.8279,
"step": 6530
},
{
"epoch": 0.052844641601822895,
"grad_norm": 1719.1915283203125,
"learning_rate": 2.6424242424242423e-05,
"loss": 266.6213,
"step": 6540
},
{
"epoch": 0.05292544380610703,
"grad_norm": 1334.455322265625,
"learning_rate": 2.6464646464646466e-05,
"loss": 154.8023,
"step": 6550
},
{
"epoch": 0.05300624601039116,
"grad_norm": 1549.58154296875,
"learning_rate": 2.6505050505050504e-05,
"loss": 188.1264,
"step": 6560
},
{
"epoch": 0.053087048214675296,
"grad_norm": 740.02587890625,
"learning_rate": 2.6545454545454547e-05,
"loss": 241.7192,
"step": 6570
},
{
"epoch": 0.05316785041895943,
"grad_norm": 1759.15869140625,
"learning_rate": 2.6585858585858585e-05,
"loss": 249.789,
"step": 6580
},
{
"epoch": 0.05324865262324356,
"grad_norm": 1615.3770751953125,
"learning_rate": 2.6626262626262628e-05,
"loss": 256.1343,
"step": 6590
},
{
"epoch": 0.0533294548275277,
"grad_norm": 1187.103515625,
"learning_rate": 2.6666666666666667e-05,
"loss": 153.0139,
"step": 6600
},
{
"epoch": 0.05341025703181183,
"grad_norm": 835.982177734375,
"learning_rate": 2.670707070707071e-05,
"loss": 240.5858,
"step": 6610
},
{
"epoch": 0.05349105923609596,
"grad_norm": 658.9365234375,
"learning_rate": 2.6747474747474748e-05,
"loss": 212.2474,
"step": 6620
},
{
"epoch": 0.05357186144038009,
"grad_norm": 836.15185546875,
"learning_rate": 2.678787878787879e-05,
"loss": 203.4595,
"step": 6630
},
{
"epoch": 0.05365266364466423,
"grad_norm": 1312.960205078125,
"learning_rate": 2.682828282828283e-05,
"loss": 178.5055,
"step": 6640
},
{
"epoch": 0.05373346584894836,
"grad_norm": 2402.58642578125,
"learning_rate": 2.686868686868687e-05,
"loss": 204.5425,
"step": 6650
},
{
"epoch": 0.05381426805323249,
"grad_norm": 743.2178344726562,
"learning_rate": 2.6909090909090913e-05,
"loss": 151.0765,
"step": 6660
},
{
"epoch": 0.05389507025751662,
"grad_norm": 2009.14599609375,
"learning_rate": 2.6949494949494952e-05,
"loss": 255.5723,
"step": 6670
},
{
"epoch": 0.05397587246180076,
"grad_norm": 1129.924560546875,
"learning_rate": 2.6989898989898994e-05,
"loss": 249.5275,
"step": 6680
},
{
"epoch": 0.05405667466608489,
"grad_norm": 1877.5682373046875,
"learning_rate": 2.7030303030303033e-05,
"loss": 201.4787,
"step": 6690
},
{
"epoch": 0.054137476870369024,
"grad_norm": 1205.5860595703125,
"learning_rate": 2.7070707070707075e-05,
"loss": 165.1917,
"step": 6700
},
{
"epoch": 0.05421827907465316,
"grad_norm": 833.5079956054688,
"learning_rate": 2.7111111111111114e-05,
"loss": 222.6354,
"step": 6710
},
{
"epoch": 0.05429908127893729,
"grad_norm": 1644.57470703125,
"learning_rate": 2.7151515151515157e-05,
"loss": 183.0618,
"step": 6720
},
{
"epoch": 0.054379883483221425,
"grad_norm": 1261.3482666015625,
"learning_rate": 2.7191919191919192e-05,
"loss": 204.8876,
"step": 6730
},
{
"epoch": 0.054460685687505554,
"grad_norm": 1064.4910888671875,
"learning_rate": 2.723232323232323e-05,
"loss": 228.8735,
"step": 6740
},
{
"epoch": 0.05454148789178969,
"grad_norm": 1227.28369140625,
"learning_rate": 2.7272727272727273e-05,
"loss": 244.5206,
"step": 6750
},
{
"epoch": 0.05462229009607382,
"grad_norm": 747.6671142578125,
"learning_rate": 2.7313131313131312e-05,
"loss": 171.5991,
"step": 6760
},
{
"epoch": 0.054703092300357956,
"grad_norm": 1191.174560546875,
"learning_rate": 2.7353535353535354e-05,
"loss": 183.062,
"step": 6770
},
{
"epoch": 0.054783894504642085,
"grad_norm": 1179.271484375,
"learning_rate": 2.7393939393939393e-05,
"loss": 203.4914,
"step": 6780
},
{
"epoch": 0.05486469670892622,
"grad_norm": 1980.94287109375,
"learning_rate": 2.7434343434343435e-05,
"loss": 190.7682,
"step": 6790
},
{
"epoch": 0.05494549891321035,
"grad_norm": 1313.760498046875,
"learning_rate": 2.7474747474747474e-05,
"loss": 179.4395,
"step": 6800
},
{
"epoch": 0.055026301117494486,
"grad_norm": 818.7135620117188,
"learning_rate": 2.7515151515151516e-05,
"loss": 240.7207,
"step": 6810
},
{
"epoch": 0.055107103321778615,
"grad_norm": 1303.9735107421875,
"learning_rate": 2.7555555555555555e-05,
"loss": 197.5866,
"step": 6820
},
{
"epoch": 0.05518790552606275,
"grad_norm": 4817.638671875,
"learning_rate": 2.7595959595959597e-05,
"loss": 236.2139,
"step": 6830
},
{
"epoch": 0.05526870773034688,
"grad_norm": 1369.7080078125,
"learning_rate": 2.7636363636363636e-05,
"loss": 155.043,
"step": 6840
},
{
"epoch": 0.05534950993463102,
"grad_norm": 1351.29150390625,
"learning_rate": 2.767676767676768e-05,
"loss": 193.5722,
"step": 6850
},
{
"epoch": 0.05543031213891515,
"grad_norm": 1340.113525390625,
"learning_rate": 2.771717171717172e-05,
"loss": 196.4928,
"step": 6860
},
{
"epoch": 0.05551111434319928,
"grad_norm": 1829.1298828125,
"learning_rate": 2.775757575757576e-05,
"loss": 207.2559,
"step": 6870
},
{
"epoch": 0.05559191654748342,
"grad_norm": 1614.317138671875,
"learning_rate": 2.7797979797979802e-05,
"loss": 191.8481,
"step": 6880
},
{
"epoch": 0.05567271875176755,
"grad_norm": 1409.754150390625,
"learning_rate": 2.783838383838384e-05,
"loss": 172.1698,
"step": 6890
},
{
"epoch": 0.05575352095605168,
"grad_norm": 1008.7220458984375,
"learning_rate": 2.7878787878787883e-05,
"loss": 204.197,
"step": 6900
},
{
"epoch": 0.05583432316033581,
"grad_norm": 1213.98291015625,
"learning_rate": 2.7919191919191922e-05,
"loss": 176.116,
"step": 6910
},
{
"epoch": 0.05591512536461995,
"grad_norm": 1919.146484375,
"learning_rate": 2.7959595959595964e-05,
"loss": 205.5731,
"step": 6920
},
{
"epoch": 0.05599592756890408,
"grad_norm": 1582.1240234375,
"learning_rate": 2.8000000000000003e-05,
"loss": 204.0053,
"step": 6930
},
{
"epoch": 0.056076729773188214,
"grad_norm": 1082.2257080078125,
"learning_rate": 2.804040404040404e-05,
"loss": 216.7668,
"step": 6940
},
{
"epoch": 0.05615753197747234,
"grad_norm": 1451.9715576171875,
"learning_rate": 2.808080808080808e-05,
"loss": 128.3858,
"step": 6950
},
{
"epoch": 0.05623833418175648,
"grad_norm": 2580.067138671875,
"learning_rate": 2.812121212121212e-05,
"loss": 187.2646,
"step": 6960
},
{
"epoch": 0.05631913638604061,
"grad_norm": 1153.5308837890625,
"learning_rate": 2.8161616161616162e-05,
"loss": 170.1935,
"step": 6970
},
{
"epoch": 0.056399938590324744,
"grad_norm": 842.653076171875,
"learning_rate": 2.82020202020202e-05,
"loss": 229.306,
"step": 6980
},
{
"epoch": 0.05648074079460888,
"grad_norm": 1086.96337890625,
"learning_rate": 2.8242424242424243e-05,
"loss": 180.9517,
"step": 6990
},
{
"epoch": 0.05656154299889301,
"grad_norm": 963.1438598632812,
"learning_rate": 2.8282828282828282e-05,
"loss": 186.2078,
"step": 7000
},
{
"epoch": 0.056642345203177145,
"grad_norm": 1010.3299560546875,
"learning_rate": 2.8323232323232324e-05,
"loss": 223.6001,
"step": 7010
},
{
"epoch": 0.056723147407461275,
"grad_norm": 1217.844482421875,
"learning_rate": 2.8363636363636363e-05,
"loss": 179.5198,
"step": 7020
},
{
"epoch": 0.05680394961174541,
"grad_norm": 1364.8577880859375,
"learning_rate": 2.8404040404040405e-05,
"loss": 212.5286,
"step": 7030
},
{
"epoch": 0.05688475181602954,
"grad_norm": 804.541748046875,
"learning_rate": 2.8444444444444447e-05,
"loss": 201.4965,
"step": 7040
},
{
"epoch": 0.056965554020313676,
"grad_norm": 2093.808349609375,
"learning_rate": 2.8484848484848486e-05,
"loss": 202.5039,
"step": 7050
},
{
"epoch": 0.057046356224597805,
"grad_norm": 1088.9471435546875,
"learning_rate": 2.852525252525253e-05,
"loss": 164.6322,
"step": 7060
},
{
"epoch": 0.05712715842888194,
"grad_norm": 1510.014404296875,
"learning_rate": 2.8565656565656567e-05,
"loss": 246.0487,
"step": 7070
},
{
"epoch": 0.05720796063316607,
"grad_norm": 617.3926391601562,
"learning_rate": 2.860606060606061e-05,
"loss": 166.5255,
"step": 7080
},
{
"epoch": 0.057288762837450206,
"grad_norm": 1088.094482421875,
"learning_rate": 2.864646464646465e-05,
"loss": 180.0012,
"step": 7090
},
{
"epoch": 0.057369565041734336,
"grad_norm": 754.35400390625,
"learning_rate": 2.868686868686869e-05,
"loss": 165.6718,
"step": 7100
},
{
"epoch": 0.05745036724601847,
"grad_norm": 847.3502197265625,
"learning_rate": 2.872727272727273e-05,
"loss": 150.3254,
"step": 7110
},
{
"epoch": 0.0575311694503026,
"grad_norm": 3462.79541015625,
"learning_rate": 2.876767676767677e-05,
"loss": 206.9913,
"step": 7120
},
{
"epoch": 0.05761197165458674,
"grad_norm": 1302.846923828125,
"learning_rate": 2.880808080808081e-05,
"loss": 218.2749,
"step": 7130
},
{
"epoch": 0.05769277385887087,
"grad_norm": 1508.3194580078125,
"learning_rate": 2.8848484848484853e-05,
"loss": 198.5009,
"step": 7140
},
{
"epoch": 0.057773576063155,
"grad_norm": 1260.8990478515625,
"learning_rate": 2.8888888888888888e-05,
"loss": 287.7319,
"step": 7150
},
{
"epoch": 0.05785437826743914,
"grad_norm": 2510.641357421875,
"learning_rate": 2.8929292929292927e-05,
"loss": 212.435,
"step": 7160
},
{
"epoch": 0.05793518047172327,
"grad_norm": 1610.3782958984375,
"learning_rate": 2.896969696969697e-05,
"loss": 195.9904,
"step": 7170
},
{
"epoch": 0.058015982676007403,
"grad_norm": 2051.1611328125,
"learning_rate": 2.9010101010101008e-05,
"loss": 230.746,
"step": 7180
},
{
"epoch": 0.05809678488029153,
"grad_norm": 1708.345703125,
"learning_rate": 2.905050505050505e-05,
"loss": 202.1169,
"step": 7190
},
{
"epoch": 0.05817758708457567,
"grad_norm": 991.0370483398438,
"learning_rate": 2.909090909090909e-05,
"loss": 182.8259,
"step": 7200
},
{
"epoch": 0.0582583892888598,
"grad_norm": 1151.1380615234375,
"learning_rate": 2.913131313131313e-05,
"loss": 241.0473,
"step": 7210
},
{
"epoch": 0.058339191493143934,
"grad_norm": 1103.3897705078125,
"learning_rate": 2.9171717171717174e-05,
"loss": 151.7667,
"step": 7220
},
{
"epoch": 0.05841999369742806,
"grad_norm": 1151.0849609375,
"learning_rate": 2.9212121212121213e-05,
"loss": 197.9749,
"step": 7230
},
{
"epoch": 0.0585007959017122,
"grad_norm": 983.3527221679688,
"learning_rate": 2.9252525252525255e-05,
"loss": 186.8989,
"step": 7240
},
{
"epoch": 0.05858159810599633,
"grad_norm": 669.5452880859375,
"learning_rate": 2.9292929292929294e-05,
"loss": 179.281,
"step": 7250
},
{
"epoch": 0.058662400310280464,
"grad_norm": 1186.9957275390625,
"learning_rate": 2.9333333333333336e-05,
"loss": 170.5622,
"step": 7260
},
{
"epoch": 0.0587432025145646,
"grad_norm": 1314.4376220703125,
"learning_rate": 2.9373737373737375e-05,
"loss": 175.4961,
"step": 7270
},
{
"epoch": 0.05882400471884873,
"grad_norm": 1278.834716796875,
"learning_rate": 2.9414141414141417e-05,
"loss": 183.9097,
"step": 7280
},
{
"epoch": 0.058904806923132866,
"grad_norm": 1116.2734375,
"learning_rate": 2.9454545454545456e-05,
"loss": 143.516,
"step": 7290
},
{
"epoch": 0.058985609127416995,
"grad_norm": 1352.628173828125,
"learning_rate": 2.9494949494949498e-05,
"loss": 204.3025,
"step": 7300
},
{
"epoch": 0.05906641133170113,
"grad_norm": 1091.3201904296875,
"learning_rate": 2.9535353535353537e-05,
"loss": 181.4761,
"step": 7310
},
{
"epoch": 0.05914721353598526,
"grad_norm": 1040.334716796875,
"learning_rate": 2.957575757575758e-05,
"loss": 170.6319,
"step": 7320
},
{
"epoch": 0.059228015740269396,
"grad_norm": 1476.125732421875,
"learning_rate": 2.9616161616161618e-05,
"loss": 161.3749,
"step": 7330
},
{
"epoch": 0.059308817944553525,
"grad_norm": 1488.0325927734375,
"learning_rate": 2.965656565656566e-05,
"loss": 183.3941,
"step": 7340
},
{
"epoch": 0.05938962014883766,
"grad_norm": 481.60833740234375,
"learning_rate": 2.96969696969697e-05,
"loss": 199.2278,
"step": 7350
},
{
"epoch": 0.05947042235312179,
"grad_norm": 1610.34521484375,
"learning_rate": 2.973737373737374e-05,
"loss": 201.723,
"step": 7360
},
{
"epoch": 0.05955122455740593,
"grad_norm": 1576.0423583984375,
"learning_rate": 2.9777777777777777e-05,
"loss": 222.0852,
"step": 7370
},
{
"epoch": 0.059632026761690056,
"grad_norm": 889.7515258789062,
"learning_rate": 2.9818181818181816e-05,
"loss": 193.5616,
"step": 7380
},
{
"epoch": 0.05971282896597419,
"grad_norm": 746.6514282226562,
"learning_rate": 2.9858585858585858e-05,
"loss": 166.2696,
"step": 7390
},
{
"epoch": 0.05979363117025832,
"grad_norm": 1730.69580078125,
"learning_rate": 2.98989898989899e-05,
"loss": 209.8799,
"step": 7400
},
{
"epoch": 0.05987443337454246,
"grad_norm": 690.6642456054688,
"learning_rate": 2.993939393939394e-05,
"loss": 230.9101,
"step": 7410
},
{
"epoch": 0.05995523557882659,
"grad_norm": 863.1697387695312,
"learning_rate": 2.997979797979798e-05,
"loss": 150.7177,
"step": 7420
},
{
"epoch": 0.06003603778311072,
"grad_norm": 1267.2069091796875,
"learning_rate": 3.002020202020202e-05,
"loss": 210.8308,
"step": 7430
},
{
"epoch": 0.06011683998739486,
"grad_norm": 1010.417724609375,
"learning_rate": 3.0060606060606062e-05,
"loss": 191.3645,
"step": 7440
},
{
"epoch": 0.06019764219167899,
"grad_norm": 689.7382202148438,
"learning_rate": 3.01010101010101e-05,
"loss": 187.7134,
"step": 7450
},
{
"epoch": 0.060278444395963124,
"grad_norm": 1864.760986328125,
"learning_rate": 3.0141414141414144e-05,
"loss": 214.7331,
"step": 7460
},
{
"epoch": 0.06035924660024725,
"grad_norm": 1038.37353515625,
"learning_rate": 3.0181818181818182e-05,
"loss": 217.9106,
"step": 7470
},
{
"epoch": 0.06044004880453139,
"grad_norm": 622.6604614257812,
"learning_rate": 3.0222222222222225e-05,
"loss": 155.4263,
"step": 7480
},
{
"epoch": 0.06052085100881552,
"grad_norm": 878.7538452148438,
"learning_rate": 3.0262626262626263e-05,
"loss": 231.1667,
"step": 7490
},
{
"epoch": 0.060601653213099654,
"grad_norm": 1581.2225341796875,
"learning_rate": 3.0303030303030306e-05,
"loss": 163.4888,
"step": 7500
},
{
"epoch": 0.060682455417383784,
"grad_norm": 1152.7149658203125,
"learning_rate": 3.0343434343434345e-05,
"loss": 182.3645,
"step": 7510
},
{
"epoch": 0.06076325762166792,
"grad_norm": 1109.6708984375,
"learning_rate": 3.0383838383838387e-05,
"loss": 175.0838,
"step": 7520
},
{
"epoch": 0.06084405982595205,
"grad_norm": 1053.8270263671875,
"learning_rate": 3.0424242424242426e-05,
"loss": 181.691,
"step": 7530
},
{
"epoch": 0.060924862030236185,
"grad_norm": 2113.046875,
"learning_rate": 3.0464646464646468e-05,
"loss": 224.7368,
"step": 7540
},
{
"epoch": 0.06100566423452032,
"grad_norm": 1166.90478515625,
"learning_rate": 3.050505050505051e-05,
"loss": 206.4759,
"step": 7550
},
{
"epoch": 0.06108646643880445,
"grad_norm": 1273.3836669921875,
"learning_rate": 3.054545454545455e-05,
"loss": 171.2801,
"step": 7560
},
{
"epoch": 0.061167268643088586,
"grad_norm": 2534.885498046875,
"learning_rate": 3.058585858585859e-05,
"loss": 159.7586,
"step": 7570
},
{
"epoch": 0.061248070847372715,
"grad_norm": 3763.103515625,
"learning_rate": 3.062626262626262e-05,
"loss": 323.4677,
"step": 7580
},
{
"epoch": 0.06132887305165685,
"grad_norm": 1977.9522705078125,
"learning_rate": 3.066666666666667e-05,
"loss": 227.4736,
"step": 7590
},
{
"epoch": 0.06140967525594098,
"grad_norm": 1690.8280029296875,
"learning_rate": 3.070707070707071e-05,
"loss": 192.4362,
"step": 7600
},
{
"epoch": 0.06149047746022512,
"grad_norm": 1523.7828369140625,
"learning_rate": 3.074747474747475e-05,
"loss": 234.8535,
"step": 7610
},
{
"epoch": 0.061571279664509246,
"grad_norm": 1146.36865234375,
"learning_rate": 3.0787878787878786e-05,
"loss": 142.9756,
"step": 7620
},
{
"epoch": 0.06165208186879338,
"grad_norm": 895.3403930664062,
"learning_rate": 3.082828282828283e-05,
"loss": 201.5379,
"step": 7630
},
{
"epoch": 0.06173288407307751,
"grad_norm": 1039.900634765625,
"learning_rate": 3.086868686868687e-05,
"loss": 230.2974,
"step": 7640
},
{
"epoch": 0.06181368627736165,
"grad_norm": 1130.9986572265625,
"learning_rate": 3.090909090909091e-05,
"loss": 189.1531,
"step": 7650
},
{
"epoch": 0.061894488481645776,
"grad_norm": 1224.142822265625,
"learning_rate": 3.094949494949495e-05,
"loss": 204.0206,
"step": 7660
},
{
"epoch": 0.06197529068592991,
"grad_norm": 2115.472412109375,
"learning_rate": 3.098989898989899e-05,
"loss": 180.536,
"step": 7670
},
{
"epoch": 0.06205609289021405,
"grad_norm": 779.9313354492188,
"learning_rate": 3.103030303030303e-05,
"loss": 158.623,
"step": 7680
},
{
"epoch": 0.06213689509449818,
"grad_norm": 1337.7568359375,
"learning_rate": 3.107070707070707e-05,
"loss": 159.0383,
"step": 7690
},
{
"epoch": 0.062217697298782314,
"grad_norm": 1851.648193359375,
"learning_rate": 3.111111111111111e-05,
"loss": 179.8161,
"step": 7700
},
{
"epoch": 0.06229849950306644,
"grad_norm": 1469.6453857421875,
"learning_rate": 3.1151515151515156e-05,
"loss": 187.596,
"step": 7710
},
{
"epoch": 0.06237930170735058,
"grad_norm": 1624.6527099609375,
"learning_rate": 3.1191919191919194e-05,
"loss": 214.8479,
"step": 7720
},
{
"epoch": 0.06246010391163471,
"grad_norm": 1006.6346435546875,
"learning_rate": 3.123232323232323e-05,
"loss": 154.5748,
"step": 7730
},
{
"epoch": 0.06254090611591884,
"grad_norm": 1002.5286254882812,
"learning_rate": 3.127272727272728e-05,
"loss": 184.4432,
"step": 7740
},
{
"epoch": 0.06262170832020297,
"grad_norm": 1352.4193115234375,
"learning_rate": 3.131313131313132e-05,
"loss": 237.0036,
"step": 7750
},
{
"epoch": 0.06270251052448711,
"grad_norm": 1084.147216796875,
"learning_rate": 3.1353535353535357e-05,
"loss": 164.8318,
"step": 7760
},
{
"epoch": 0.06278331272877125,
"grad_norm": 1302.1048583984375,
"learning_rate": 3.1393939393939395e-05,
"loss": 164.3788,
"step": 7770
},
{
"epoch": 0.06286411493305537,
"grad_norm": 1383.396484375,
"learning_rate": 3.143434343434344e-05,
"loss": 175.5805,
"step": 7780
},
{
"epoch": 0.0629449171373395,
"grad_norm": 1246.53857421875,
"learning_rate": 3.147474747474747e-05,
"loss": 210.3966,
"step": 7790
},
{
"epoch": 0.06302571934162364,
"grad_norm": 1285.145263671875,
"learning_rate": 3.151515151515151e-05,
"loss": 246.1903,
"step": 7800
},
{
"epoch": 0.06310652154590778,
"grad_norm": 1620.3326416015625,
"learning_rate": 3.155555555555556e-05,
"loss": 196.0127,
"step": 7810
},
{
"epoch": 0.06318732375019191,
"grad_norm": 1016.9979858398438,
"learning_rate": 3.1595959595959596e-05,
"loss": 210.5301,
"step": 7820
},
{
"epoch": 0.06326812595447603,
"grad_norm": 1945.8780517578125,
"learning_rate": 3.1636363636363635e-05,
"loss": 239.3266,
"step": 7830
},
{
"epoch": 0.06334892815876017,
"grad_norm": 1864.5794677734375,
"learning_rate": 3.1676767676767674e-05,
"loss": 193.7567,
"step": 7840
},
{
"epoch": 0.0634297303630443,
"grad_norm": 1095.450927734375,
"learning_rate": 3.171717171717172e-05,
"loss": 191.1735,
"step": 7850
},
{
"epoch": 0.06351053256732844,
"grad_norm": 1031.504150390625,
"learning_rate": 3.175757575757576e-05,
"loss": 185.8655,
"step": 7860
},
{
"epoch": 0.06359133477161256,
"grad_norm": 1385.5076904296875,
"learning_rate": 3.17979797979798e-05,
"loss": 177.6908,
"step": 7870
},
{
"epoch": 0.0636721369758967,
"grad_norm": 1074.5181884765625,
"learning_rate": 3.1838383838383836e-05,
"loss": 204.0031,
"step": 7880
},
{
"epoch": 0.06375293918018084,
"grad_norm": 953.3314208984375,
"learning_rate": 3.187878787878788e-05,
"loss": 180.9185,
"step": 7890
},
{
"epoch": 0.06383374138446497,
"grad_norm": 868.8043823242188,
"learning_rate": 3.191919191919192e-05,
"loss": 220.1422,
"step": 7900
},
{
"epoch": 0.0639145435887491,
"grad_norm": 5921.494140625,
"learning_rate": 3.195959595959596e-05,
"loss": 167.039,
"step": 7910
},
{
"epoch": 0.06399534579303323,
"grad_norm": 1500.1710205078125,
"learning_rate": 3.2000000000000005e-05,
"loss": 138.7559,
"step": 7920
},
{
"epoch": 0.06407614799731737,
"grad_norm": 1143.7266845703125,
"learning_rate": 3.2040404040404044e-05,
"loss": 195.0978,
"step": 7930
},
{
"epoch": 0.0641569502016015,
"grad_norm": 523.0445556640625,
"learning_rate": 3.208080808080808e-05,
"loss": 151.1692,
"step": 7940
},
{
"epoch": 0.06423775240588563,
"grad_norm": 2158.39013671875,
"learning_rate": 3.212121212121212e-05,
"loss": 236.7984,
"step": 7950
},
{
"epoch": 0.06431855461016976,
"grad_norm": 659.3209228515625,
"learning_rate": 3.216161616161617e-05,
"loss": 181.1136,
"step": 7960
},
{
"epoch": 0.0643993568144539,
"grad_norm": 608.638671875,
"learning_rate": 3.2202020202020206e-05,
"loss": 194.2183,
"step": 7970
},
{
"epoch": 0.06448015901873803,
"grad_norm": 1122.7078857421875,
"learning_rate": 3.2242424242424245e-05,
"loss": 160.3627,
"step": 7980
},
{
"epoch": 0.06456096122302217,
"grad_norm": 1686.80810546875,
"learning_rate": 3.2282828282828284e-05,
"loss": 223.456,
"step": 7990
},
{
"epoch": 0.06464176342730629,
"grad_norm": 1573.1317138671875,
"learning_rate": 3.232323232323233e-05,
"loss": 224.0322,
"step": 8000
},
{
"epoch": 0.06472256563159043,
"grad_norm": 1321.1458740234375,
"learning_rate": 3.236363636363636e-05,
"loss": 252.9104,
"step": 8010
},
{
"epoch": 0.06480336783587456,
"grad_norm": 1179.701171875,
"learning_rate": 3.24040404040404e-05,
"loss": 223.4346,
"step": 8020
},
{
"epoch": 0.0648841700401587,
"grad_norm": 977.9105224609375,
"learning_rate": 3.2444444444444446e-05,
"loss": 152.0468,
"step": 8030
},
{
"epoch": 0.06496497224444282,
"grad_norm": 2066.90380859375,
"learning_rate": 3.2484848484848485e-05,
"loss": 190.8667,
"step": 8040
},
{
"epoch": 0.06504577444872696,
"grad_norm": 3095.08935546875,
"learning_rate": 3.2525252525252524e-05,
"loss": 192.0303,
"step": 8050
},
{
"epoch": 0.0651265766530111,
"grad_norm": 2343.95947265625,
"learning_rate": 3.256565656565656e-05,
"loss": 157.5384,
"step": 8060
},
{
"epoch": 0.06520737885729523,
"grad_norm": 1510.8023681640625,
"learning_rate": 3.260606060606061e-05,
"loss": 239.6892,
"step": 8070
},
{
"epoch": 0.06528818106157935,
"grad_norm": 1445.597900390625,
"learning_rate": 3.264646464646465e-05,
"loss": 177.4803,
"step": 8080
},
{
"epoch": 0.06536898326586349,
"grad_norm": 1667.5521240234375,
"learning_rate": 3.2686868686868686e-05,
"loss": 190.0459,
"step": 8090
},
{
"epoch": 0.06544978547014763,
"grad_norm": 925.2418212890625,
"learning_rate": 3.272727272727273e-05,
"loss": 190.8257,
"step": 8100
},
{
"epoch": 0.06553058767443176,
"grad_norm": 1247.4376220703125,
"learning_rate": 3.276767676767677e-05,
"loss": 203.9773,
"step": 8110
},
{
"epoch": 0.0656113898787159,
"grad_norm": 1212.892822265625,
"learning_rate": 3.280808080808081e-05,
"loss": 203.7381,
"step": 8120
},
{
"epoch": 0.06569219208300002,
"grad_norm": 1091.890380859375,
"learning_rate": 3.284848484848485e-05,
"loss": 194.8187,
"step": 8130
},
{
"epoch": 0.06577299428728416,
"grad_norm": 2029.2864990234375,
"learning_rate": 3.2888888888888894e-05,
"loss": 246.8937,
"step": 8140
},
{
"epoch": 0.06585379649156829,
"grad_norm": 920.1378784179688,
"learning_rate": 3.292929292929293e-05,
"loss": 215.9934,
"step": 8150
},
{
"epoch": 0.06593459869585243,
"grad_norm": 1521.0574951171875,
"learning_rate": 3.296969696969697e-05,
"loss": 167.3099,
"step": 8160
},
{
"epoch": 0.06601540090013655,
"grad_norm": 1420.7525634765625,
"learning_rate": 3.301010101010101e-05,
"loss": 206.7512,
"step": 8170
},
{
"epoch": 0.06609620310442069,
"grad_norm": 840.5839233398438,
"learning_rate": 3.3050505050505056e-05,
"loss": 202.7185,
"step": 8180
},
{
"epoch": 0.06617700530870482,
"grad_norm": 1193.502197265625,
"learning_rate": 3.3090909090909095e-05,
"loss": 160.3612,
"step": 8190
},
{
"epoch": 0.06625780751298896,
"grad_norm": 2222.778564453125,
"learning_rate": 3.3131313131313134e-05,
"loss": 150.92,
"step": 8200
},
{
"epoch": 0.06633860971727308,
"grad_norm": 776.4454956054688,
"learning_rate": 3.317171717171717e-05,
"loss": 159.6749,
"step": 8210
},
{
"epoch": 0.06641941192155722,
"grad_norm": 1179.86279296875,
"learning_rate": 3.321212121212121e-05,
"loss": 147.1537,
"step": 8220
},
{
"epoch": 0.06650021412584135,
"grad_norm": 1168.2757568359375,
"learning_rate": 3.325252525252525e-05,
"loss": 163.5715,
"step": 8230
},
{
"epoch": 0.06658101633012549,
"grad_norm": 996.3876953125,
"learning_rate": 3.329292929292929e-05,
"loss": 156.5557,
"step": 8240
},
{
"epoch": 0.06666181853440963,
"grad_norm": 1006.9996337890625,
"learning_rate": 3.3333333333333335e-05,
"loss": 176.6802,
"step": 8250
},
{
"epoch": 0.06674262073869375,
"grad_norm": 877.4000854492188,
"learning_rate": 3.3373737373737374e-05,
"loss": 182.8363,
"step": 8260
},
{
"epoch": 0.06682342294297788,
"grad_norm": 2153.091552734375,
"learning_rate": 3.341414141414141e-05,
"loss": 184.7595,
"step": 8270
},
{
"epoch": 0.06690422514726202,
"grad_norm": 1884.7989501953125,
"learning_rate": 3.345454545454546e-05,
"loss": 197.672,
"step": 8280
},
{
"epoch": 0.06698502735154616,
"grad_norm": 1494.185791015625,
"learning_rate": 3.34949494949495e-05,
"loss": 178.7865,
"step": 8290
},
{
"epoch": 0.06706582955583028,
"grad_norm": 2600.398193359375,
"learning_rate": 3.3535353535353536e-05,
"loss": 222.7885,
"step": 8300
},
{
"epoch": 0.06714663176011441,
"grad_norm": 1300.013671875,
"learning_rate": 3.3575757575757575e-05,
"loss": 182.5449,
"step": 8310
},
{
"epoch": 0.06722743396439855,
"grad_norm": 2145.218505859375,
"learning_rate": 3.361616161616162e-05,
"loss": 281.7514,
"step": 8320
},
{
"epoch": 0.06730823616868269,
"grad_norm": 1519.411865234375,
"learning_rate": 3.365656565656566e-05,
"loss": 167.7319,
"step": 8330
},
{
"epoch": 0.06738903837296681,
"grad_norm": 750.0274047851562,
"learning_rate": 3.36969696969697e-05,
"loss": 173.5281,
"step": 8340
},
{
"epoch": 0.06746984057725094,
"grad_norm": 1222.1435546875,
"learning_rate": 3.373737373737374e-05,
"loss": 162.324,
"step": 8350
},
{
"epoch": 0.06755064278153508,
"grad_norm": 955.3302001953125,
"learning_rate": 3.377777777777778e-05,
"loss": 184.845,
"step": 8360
},
{
"epoch": 0.06763144498581922,
"grad_norm": 1112.2943115234375,
"learning_rate": 3.381818181818182e-05,
"loss": 160.8671,
"step": 8370
},
{
"epoch": 0.06771224719010335,
"grad_norm": 1163.462646484375,
"learning_rate": 3.385858585858586e-05,
"loss": 147.4451,
"step": 8380
},
{
"epoch": 0.06779304939438748,
"grad_norm": 925.3172607421875,
"learning_rate": 3.38989898989899e-05,
"loss": 175.2849,
"step": 8390
},
{
"epoch": 0.06787385159867161,
"grad_norm": 1990.568359375,
"learning_rate": 3.3939393939393945e-05,
"loss": 199.808,
"step": 8400
},
{
"epoch": 0.06795465380295575,
"grad_norm": 2128.471923828125,
"learning_rate": 3.3979797979797984e-05,
"loss": 205.3155,
"step": 8410
},
{
"epoch": 0.06803545600723988,
"grad_norm": 2691.37353515625,
"learning_rate": 3.402020202020202e-05,
"loss": 216.1895,
"step": 8420
},
{
"epoch": 0.068116258211524,
"grad_norm": 3363.869140625,
"learning_rate": 3.406060606060606e-05,
"loss": 201.8374,
"step": 8430
},
{
"epoch": 0.06819706041580814,
"grad_norm": 1438.0633544921875,
"learning_rate": 3.41010101010101e-05,
"loss": 222.5396,
"step": 8440
},
{
"epoch": 0.06827786262009228,
"grad_norm": 1703.8653564453125,
"learning_rate": 3.414141414141414e-05,
"loss": 213.8756,
"step": 8450
},
{
"epoch": 0.06835866482437641,
"grad_norm": 1938.7177734375,
"learning_rate": 3.4181818181818185e-05,
"loss": 215.6468,
"step": 8460
},
{
"epoch": 0.06843946702866054,
"grad_norm": 851.2493896484375,
"learning_rate": 3.4222222222222224e-05,
"loss": 182.1204,
"step": 8470
},
{
"epoch": 0.06852026923294467,
"grad_norm": 1202.3365478515625,
"learning_rate": 3.426262626262626e-05,
"loss": 183.2559,
"step": 8480
},
{
"epoch": 0.06860107143722881,
"grad_norm": 1543.7257080078125,
"learning_rate": 3.43030303030303e-05,
"loss": 239.1145,
"step": 8490
},
{
"epoch": 0.06868187364151294,
"grad_norm": 748.701171875,
"learning_rate": 3.434343434343435e-05,
"loss": 285.6954,
"step": 8500
},
{
"epoch": 0.06876267584579707,
"grad_norm": 5747.30224609375,
"learning_rate": 3.4383838383838386e-05,
"loss": 194.7742,
"step": 8510
},
{
"epoch": 0.0688434780500812,
"grad_norm": 1587.478271484375,
"learning_rate": 3.4424242424242425e-05,
"loss": 213.0056,
"step": 8520
},
{
"epoch": 0.06892428025436534,
"grad_norm": 907.9869995117188,
"learning_rate": 3.4464646464646463e-05,
"loss": 183.2854,
"step": 8530
},
{
"epoch": 0.06900508245864947,
"grad_norm": 1065.7462158203125,
"learning_rate": 3.450505050505051e-05,
"loss": 147.3901,
"step": 8540
},
{
"epoch": 0.06908588466293361,
"grad_norm": 1654.4375,
"learning_rate": 3.454545454545455e-05,
"loss": 152.4811,
"step": 8550
},
{
"epoch": 0.06916668686721773,
"grad_norm": 1075.144775390625,
"learning_rate": 3.458585858585859e-05,
"loss": 217.3591,
"step": 8560
},
{
"epoch": 0.06924748907150187,
"grad_norm": 1216.4287109375,
"learning_rate": 3.4626262626262626e-05,
"loss": 184.7878,
"step": 8570
},
{
"epoch": 0.069328291275786,
"grad_norm": 1143.3253173828125,
"learning_rate": 3.466666666666667e-05,
"loss": 185.8244,
"step": 8580
},
{
"epoch": 0.06940909348007014,
"grad_norm": 2943.891357421875,
"learning_rate": 3.470707070707071e-05,
"loss": 203.6546,
"step": 8590
},
{
"epoch": 0.06948989568435426,
"grad_norm": 562.7566528320312,
"learning_rate": 3.474747474747475e-05,
"loss": 224.7354,
"step": 8600
},
{
"epoch": 0.0695706978886384,
"grad_norm": 1995.734130859375,
"learning_rate": 3.4787878787878795e-05,
"loss": 178.8969,
"step": 8610
},
{
"epoch": 0.06965150009292254,
"grad_norm": 1593.8944091796875,
"learning_rate": 3.4828282828282834e-05,
"loss": 164.9677,
"step": 8620
},
{
"epoch": 0.06973230229720667,
"grad_norm": 715.6300048828125,
"learning_rate": 3.486868686868687e-05,
"loss": 147.6377,
"step": 8630
},
{
"epoch": 0.0698131045014908,
"grad_norm": 3263.21044921875,
"learning_rate": 3.490909090909091e-05,
"loss": 235.9829,
"step": 8640
},
{
"epoch": 0.06989390670577493,
"grad_norm": 3928.5576171875,
"learning_rate": 3.494949494949495e-05,
"loss": 220.889,
"step": 8650
},
{
"epoch": 0.06997470891005907,
"grad_norm": 1176.6265869140625,
"learning_rate": 3.498989898989899e-05,
"loss": 173.9785,
"step": 8660
},
{
"epoch": 0.0700555111143432,
"grad_norm": 1472.33349609375,
"learning_rate": 3.503030303030303e-05,
"loss": 166.185,
"step": 8670
},
{
"epoch": 0.07013631331862734,
"grad_norm": 943.4843139648438,
"learning_rate": 3.5070707070707073e-05,
"loss": 196.9754,
"step": 8680
},
{
"epoch": 0.07021711552291146,
"grad_norm": 1376.169189453125,
"learning_rate": 3.511111111111111e-05,
"loss": 249.9044,
"step": 8690
},
{
"epoch": 0.0702979177271956,
"grad_norm": 862.705078125,
"learning_rate": 3.515151515151515e-05,
"loss": 134.6473,
"step": 8700
},
{
"epoch": 0.07037871993147973,
"grad_norm": 1661.3258056640625,
"learning_rate": 3.519191919191919e-05,
"loss": 260.4335,
"step": 8710
},
{
"epoch": 0.07045952213576387,
"grad_norm": 858.2864379882812,
"learning_rate": 3.5232323232323236e-05,
"loss": 156.1466,
"step": 8720
},
{
"epoch": 0.07054032434004799,
"grad_norm": 1033.8033447265625,
"learning_rate": 3.5272727272727274e-05,
"loss": 158.8132,
"step": 8730
},
{
"epoch": 0.07062112654433213,
"grad_norm": 2244.4833984375,
"learning_rate": 3.531313131313131e-05,
"loss": 185.2664,
"step": 8740
},
{
"epoch": 0.07070192874861626,
"grad_norm": 828.0194091796875,
"learning_rate": 3.535353535353535e-05,
"loss": 189.655,
"step": 8750
},
{
"epoch": 0.0707827309529004,
"grad_norm": 764.8339233398438,
"learning_rate": 3.53939393939394e-05,
"loss": 169.7833,
"step": 8760
},
{
"epoch": 0.07086353315718452,
"grad_norm": 1434.6533203125,
"learning_rate": 3.543434343434344e-05,
"loss": 169.9032,
"step": 8770
},
{
"epoch": 0.07094433536146866,
"grad_norm": 1811.5740966796875,
"learning_rate": 3.5474747474747475e-05,
"loss": 256.6501,
"step": 8780
},
{
"epoch": 0.0710251375657528,
"grad_norm": 923.5958251953125,
"learning_rate": 3.551515151515152e-05,
"loss": 157.0084,
"step": 8790
},
{
"epoch": 0.07110593977003693,
"grad_norm": 1671.8385009765625,
"learning_rate": 3.555555555555556e-05,
"loss": 206.8505,
"step": 8800
},
{
"epoch": 0.07118674197432107,
"grad_norm": 2508.17626953125,
"learning_rate": 3.55959595959596e-05,
"loss": 204.866,
"step": 8810
},
{
"epoch": 0.07126754417860519,
"grad_norm": 852.1519775390625,
"learning_rate": 3.563636363636364e-05,
"loss": 147.0554,
"step": 8820
},
{
"epoch": 0.07134834638288932,
"grad_norm": 925.072021484375,
"learning_rate": 3.567676767676768e-05,
"loss": 205.7363,
"step": 8830
},
{
"epoch": 0.07142914858717346,
"grad_norm": 1310.513916015625,
"learning_rate": 3.571717171717172e-05,
"loss": 252.9427,
"step": 8840
},
{
"epoch": 0.0715099507914576,
"grad_norm": 1795.476806640625,
"learning_rate": 3.575757575757576e-05,
"loss": 184.422,
"step": 8850
},
{
"epoch": 0.07159075299574172,
"grad_norm": 1071.3101806640625,
"learning_rate": 3.57979797979798e-05,
"loss": 165.8845,
"step": 8860
},
{
"epoch": 0.07167155520002585,
"grad_norm": 724.8527221679688,
"learning_rate": 3.583838383838384e-05,
"loss": 180.9637,
"step": 8870
},
{
"epoch": 0.07175235740430999,
"grad_norm": 999.9872436523438,
"learning_rate": 3.587878787878788e-05,
"loss": 186.5519,
"step": 8880
},
{
"epoch": 0.07183315960859413,
"grad_norm": 1380.1075439453125,
"learning_rate": 3.5919191919191916e-05,
"loss": 224.0065,
"step": 8890
},
{
"epoch": 0.07191396181287825,
"grad_norm": 1093.9498291015625,
"learning_rate": 3.595959595959596e-05,
"loss": 165.8288,
"step": 8900
},
{
"epoch": 0.07199476401716239,
"grad_norm": 2711.7353515625,
"learning_rate": 3.6e-05,
"loss": 163.5886,
"step": 8910
},
{
"epoch": 0.07207556622144652,
"grad_norm": 1537.253662109375,
"learning_rate": 3.604040404040404e-05,
"loss": 165.1541,
"step": 8920
},
{
"epoch": 0.07215636842573066,
"grad_norm": 1100.548095703125,
"learning_rate": 3.608080808080808e-05,
"loss": 169.8857,
"step": 8930
},
{
"epoch": 0.0722371706300148,
"grad_norm": 1088.0587158203125,
"learning_rate": 3.6121212121212124e-05,
"loss": 186.5488,
"step": 8940
},
{
"epoch": 0.07231797283429892,
"grad_norm": 1160.9769287109375,
"learning_rate": 3.616161616161616e-05,
"loss": 190.4372,
"step": 8950
},
{
"epoch": 0.07239877503858305,
"grad_norm": 1117.179443359375,
"learning_rate": 3.62020202020202e-05,
"loss": 134.2678,
"step": 8960
},
{
"epoch": 0.07247957724286719,
"grad_norm": 1956.5089111328125,
"learning_rate": 3.624242424242425e-05,
"loss": 172.5972,
"step": 8970
},
{
"epoch": 0.07256037944715132,
"grad_norm": 2694.300537109375,
"learning_rate": 3.6282828282828286e-05,
"loss": 200.7171,
"step": 8980
},
{
"epoch": 0.07264118165143545,
"grad_norm": 1112.63720703125,
"learning_rate": 3.6323232323232325e-05,
"loss": 151.1802,
"step": 8990
},
{
"epoch": 0.07272198385571958,
"grad_norm": 2176.663330078125,
"learning_rate": 3.6363636363636364e-05,
"loss": 162.1838,
"step": 9000
},
{
"epoch": 0.07280278606000372,
"grad_norm": 1126.674072265625,
"learning_rate": 3.640404040404041e-05,
"loss": 180.7603,
"step": 9010
},
{
"epoch": 0.07288358826428785,
"grad_norm": 1244.1241455078125,
"learning_rate": 3.644444444444445e-05,
"loss": 114.9891,
"step": 9020
},
{
"epoch": 0.07296439046857198,
"grad_norm": 2782.807373046875,
"learning_rate": 3.648484848484849e-05,
"loss": 199.3326,
"step": 9030
},
{
"epoch": 0.07304519267285611,
"grad_norm": 1218.721435546875,
"learning_rate": 3.6525252525252526e-05,
"loss": 143.516,
"step": 9040
},
{
"epoch": 0.07312599487714025,
"grad_norm": 799.7451782226562,
"learning_rate": 3.656565656565657e-05,
"loss": 131.4932,
"step": 9050
},
{
"epoch": 0.07320679708142439,
"grad_norm": 547.2342529296875,
"learning_rate": 3.660606060606061e-05,
"loss": 151.0718,
"step": 9060
},
{
"epoch": 0.07328759928570851,
"grad_norm": 1480.14501953125,
"learning_rate": 3.664646464646464e-05,
"loss": 201.2714,
"step": 9070
},
{
"epoch": 0.07336840148999264,
"grad_norm": 516.29931640625,
"learning_rate": 3.668686868686869e-05,
"loss": 174.2223,
"step": 9080
},
{
"epoch": 0.07344920369427678,
"grad_norm": 912.0347900390625,
"learning_rate": 3.672727272727273e-05,
"loss": 180.7495,
"step": 9090
},
{
"epoch": 0.07353000589856092,
"grad_norm": 1243.236083984375,
"learning_rate": 3.6767676767676766e-05,
"loss": 189.5693,
"step": 9100
},
{
"epoch": 0.07361080810284505,
"grad_norm": 742.5632934570312,
"learning_rate": 3.6808080808080805e-05,
"loss": 194.6184,
"step": 9110
},
{
"epoch": 0.07369161030712917,
"grad_norm": 1145.2069091796875,
"learning_rate": 3.684848484848485e-05,
"loss": 185.7565,
"step": 9120
},
{
"epoch": 0.07377241251141331,
"grad_norm": 1086.181640625,
"learning_rate": 3.688888888888889e-05,
"loss": 167.5214,
"step": 9130
},
{
"epoch": 0.07385321471569745,
"grad_norm": 1321.85400390625,
"learning_rate": 3.692929292929293e-05,
"loss": 221.075,
"step": 9140
},
{
"epoch": 0.07393401691998158,
"grad_norm": 2168.907958984375,
"learning_rate": 3.6969696969696974e-05,
"loss": 219.8519,
"step": 9150
},
{
"epoch": 0.0740148191242657,
"grad_norm": 1007.1217041015625,
"learning_rate": 3.701010101010101e-05,
"loss": 194.1429,
"step": 9160
},
{
"epoch": 0.07409562132854984,
"grad_norm": 1099.997802734375,
"learning_rate": 3.705050505050505e-05,
"loss": 220.093,
"step": 9170
},
{
"epoch": 0.07417642353283398,
"grad_norm": 745.4526977539062,
"learning_rate": 3.709090909090909e-05,
"loss": 143.6606,
"step": 9180
},
{
"epoch": 0.07425722573711811,
"grad_norm": 2050.18212890625,
"learning_rate": 3.7131313131313136e-05,
"loss": 222.1462,
"step": 9190
},
{
"epoch": 0.07433802794140223,
"grad_norm": 2366.43896484375,
"learning_rate": 3.7171717171717175e-05,
"loss": 191.5783,
"step": 9200
},
{
"epoch": 0.07441883014568637,
"grad_norm": 905.9826049804688,
"learning_rate": 3.7212121212121214e-05,
"loss": 168.3594,
"step": 9210
},
{
"epoch": 0.07449963234997051,
"grad_norm": 1320.197509765625,
"learning_rate": 3.725252525252525e-05,
"loss": 140.7596,
"step": 9220
},
{
"epoch": 0.07458043455425464,
"grad_norm": 3290.6748046875,
"learning_rate": 3.72929292929293e-05,
"loss": 161.625,
"step": 9230
},
{
"epoch": 0.07466123675853878,
"grad_norm": 1262.2984619140625,
"learning_rate": 3.733333333333334e-05,
"loss": 163.4713,
"step": 9240
},
{
"epoch": 0.0747420389628229,
"grad_norm": 1434.2923583984375,
"learning_rate": 3.7373737373737376e-05,
"loss": 168.6003,
"step": 9250
},
{
"epoch": 0.07482284116710704,
"grad_norm": 716.86083984375,
"learning_rate": 3.7414141414141415e-05,
"loss": 165.9515,
"step": 9260
},
{
"epoch": 0.07490364337139117,
"grad_norm": 4190.2119140625,
"learning_rate": 3.745454545454546e-05,
"loss": 201.2678,
"step": 9270
},
{
"epoch": 0.07498444557567531,
"grad_norm": 1093.3636474609375,
"learning_rate": 3.74949494949495e-05,
"loss": 145.2475,
"step": 9280
},
{
"epoch": 0.07506524777995943,
"grad_norm": 1217.2969970703125,
"learning_rate": 3.753535353535353e-05,
"loss": 219.3316,
"step": 9290
},
{
"epoch": 0.07514604998424357,
"grad_norm": 1122.0589599609375,
"learning_rate": 3.757575757575758e-05,
"loss": 201.7304,
"step": 9300
},
{
"epoch": 0.0752268521885277,
"grad_norm": 779.33447265625,
"learning_rate": 3.7616161616161616e-05,
"loss": 199.0558,
"step": 9310
},
{
"epoch": 0.07530765439281184,
"grad_norm": 1110.2554931640625,
"learning_rate": 3.7656565656565655e-05,
"loss": 190.4773,
"step": 9320
},
{
"epoch": 0.07538845659709596,
"grad_norm": 1762.0330810546875,
"learning_rate": 3.76969696969697e-05,
"loss": 151.1484,
"step": 9330
},
{
"epoch": 0.0754692588013801,
"grad_norm": 1020.6593627929688,
"learning_rate": 3.773737373737374e-05,
"loss": 169.2854,
"step": 9340
},
{
"epoch": 0.07555006100566423,
"grad_norm": 1202.7464599609375,
"learning_rate": 3.777777777777778e-05,
"loss": 164.9156,
"step": 9350
},
{
"epoch": 0.07563086320994837,
"grad_norm": 1249.380126953125,
"learning_rate": 3.781818181818182e-05,
"loss": 227.8868,
"step": 9360
},
{
"epoch": 0.0757116654142325,
"grad_norm": 1847.7193603515625,
"learning_rate": 3.785858585858586e-05,
"loss": 176.2305,
"step": 9370
},
{
"epoch": 0.07579246761851663,
"grad_norm": 1446.559814453125,
"learning_rate": 3.78989898989899e-05,
"loss": 147.3983,
"step": 9380
},
{
"epoch": 0.07587326982280077,
"grad_norm": 1216.9862060546875,
"learning_rate": 3.793939393939394e-05,
"loss": 165.7989,
"step": 9390
},
{
"epoch": 0.0759540720270849,
"grad_norm": 2848.1337890625,
"learning_rate": 3.797979797979798e-05,
"loss": 157.2326,
"step": 9400
},
{
"epoch": 0.07603487423136904,
"grad_norm": 940.07177734375,
"learning_rate": 3.8020202020202025e-05,
"loss": 221.2202,
"step": 9410
},
{
"epoch": 0.07611567643565316,
"grad_norm": 1160.6881103515625,
"learning_rate": 3.8060606060606064e-05,
"loss": 158.6488,
"step": 9420
},
{
"epoch": 0.0761964786399373,
"grad_norm": 2879.822998046875,
"learning_rate": 3.81010101010101e-05,
"loss": 223.0843,
"step": 9430
},
{
"epoch": 0.07627728084422143,
"grad_norm": 1498.8753662109375,
"learning_rate": 3.814141414141414e-05,
"loss": 161.5878,
"step": 9440
},
{
"epoch": 0.07635808304850557,
"grad_norm": 1229.455078125,
"learning_rate": 3.818181818181819e-05,
"loss": 198.7026,
"step": 9450
},
{
"epoch": 0.07643888525278969,
"grad_norm": 929.4089965820312,
"learning_rate": 3.8222222222222226e-05,
"loss": 251.8946,
"step": 9460
},
{
"epoch": 0.07651968745707383,
"grad_norm": 934.8760375976562,
"learning_rate": 3.8262626262626265e-05,
"loss": 162.8998,
"step": 9470
},
{
"epoch": 0.07660048966135796,
"grad_norm": 915.323974609375,
"learning_rate": 3.830303030303031e-05,
"loss": 171.6096,
"step": 9480
},
{
"epoch": 0.0766812918656421,
"grad_norm": 1115.416748046875,
"learning_rate": 3.834343434343435e-05,
"loss": 163.5847,
"step": 9490
},
{
"epoch": 0.07676209406992623,
"grad_norm": 2073.390869140625,
"learning_rate": 3.838383838383838e-05,
"loss": 261.5975,
"step": 9500
},
{
"epoch": 0.07684289627421036,
"grad_norm": 1260.9718017578125,
"learning_rate": 3.842424242424243e-05,
"loss": 131.5614,
"step": 9510
},
{
"epoch": 0.07692369847849449,
"grad_norm": 1239.3568115234375,
"learning_rate": 3.8464646464646466e-05,
"loss": 164.9494,
"step": 9520
},
{
"epoch": 0.07700450068277863,
"grad_norm": 1221.8023681640625,
"learning_rate": 3.8505050505050505e-05,
"loss": 245.5353,
"step": 9530
},
{
"epoch": 0.07708530288706276,
"grad_norm": 533.4956665039062,
"learning_rate": 3.8545454545454544e-05,
"loss": 149.1819,
"step": 9540
},
{
"epoch": 0.07716610509134689,
"grad_norm": 1410.545166015625,
"learning_rate": 3.858585858585859e-05,
"loss": 172.351,
"step": 9550
},
{
"epoch": 0.07724690729563102,
"grad_norm": 927.8252563476562,
"learning_rate": 3.862626262626263e-05,
"loss": 152.7654,
"step": 9560
},
{
"epoch": 0.07732770949991516,
"grad_norm": 1244.0257568359375,
"learning_rate": 3.866666666666667e-05,
"loss": 171.8292,
"step": 9570
},
{
"epoch": 0.0774085117041993,
"grad_norm": 664.2005615234375,
"learning_rate": 3.8707070707070706e-05,
"loss": 109.5876,
"step": 9580
},
{
"epoch": 0.07748931390848342,
"grad_norm": 1334.42626953125,
"learning_rate": 3.874747474747475e-05,
"loss": 129.1926,
"step": 9590
},
{
"epoch": 0.07757011611276755,
"grad_norm": 1236.3963623046875,
"learning_rate": 3.878787878787879e-05,
"loss": 195.8564,
"step": 9600
},
{
"epoch": 0.07765091831705169,
"grad_norm": 517.2808227539062,
"learning_rate": 3.882828282828283e-05,
"loss": 213.7913,
"step": 9610
},
{
"epoch": 0.07773172052133583,
"grad_norm": 1618.37890625,
"learning_rate": 3.886868686868687e-05,
"loss": 187.1561,
"step": 9620
},
{
"epoch": 0.07781252272561995,
"grad_norm": 1312.72705078125,
"learning_rate": 3.8909090909090914e-05,
"loss": 192.0711,
"step": 9630
},
{
"epoch": 0.07789332492990408,
"grad_norm": 970.0208129882812,
"learning_rate": 3.894949494949495e-05,
"loss": 150.2422,
"step": 9640
},
{
"epoch": 0.07797412713418822,
"grad_norm": 1302.1982421875,
"learning_rate": 3.898989898989899e-05,
"loss": 149.0684,
"step": 9650
},
{
"epoch": 0.07805492933847236,
"grad_norm": 3663.638427734375,
"learning_rate": 3.903030303030304e-05,
"loss": 197.8055,
"step": 9660
},
{
"epoch": 0.07813573154275649,
"grad_norm": 1510.8233642578125,
"learning_rate": 3.9070707070707076e-05,
"loss": 217.5386,
"step": 9670
},
{
"epoch": 0.07821653374704061,
"grad_norm": 4103.01904296875,
"learning_rate": 3.9111111111111115e-05,
"loss": 198.1855,
"step": 9680
},
{
"epoch": 0.07829733595132475,
"grad_norm": 1483.061279296875,
"learning_rate": 3.9151515151515153e-05,
"loss": 200.777,
"step": 9690
},
{
"epoch": 0.07837813815560889,
"grad_norm": 1666.9429931640625,
"learning_rate": 3.91919191919192e-05,
"loss": 233.7256,
"step": 9700
},
{
"epoch": 0.07845894035989302,
"grad_norm": 1422.605224609375,
"learning_rate": 3.923232323232323e-05,
"loss": 198.8265,
"step": 9710
},
{
"epoch": 0.07853974256417715,
"grad_norm": 955.90869140625,
"learning_rate": 3.927272727272727e-05,
"loss": 178.4951,
"step": 9720
},
{
"epoch": 0.07862054476846128,
"grad_norm": 768.09228515625,
"learning_rate": 3.9313131313131316e-05,
"loss": 169.9865,
"step": 9730
},
{
"epoch": 0.07870134697274542,
"grad_norm": 2241.572998046875,
"learning_rate": 3.9353535353535355e-05,
"loss": 171.3657,
"step": 9740
},
{
"epoch": 0.07878214917702955,
"grad_norm": 828.8177490234375,
"learning_rate": 3.939393939393939e-05,
"loss": 212.0979,
"step": 9750
},
{
"epoch": 0.07886295138131368,
"grad_norm": 1248.2691650390625,
"learning_rate": 3.943434343434343e-05,
"loss": 144.412,
"step": 9760
},
{
"epoch": 0.07894375358559781,
"grad_norm": 1106.5013427734375,
"learning_rate": 3.947474747474748e-05,
"loss": 170.0778,
"step": 9770
},
{
"epoch": 0.07902455578988195,
"grad_norm": 1183.6558837890625,
"learning_rate": 3.951515151515152e-05,
"loss": 183.0114,
"step": 9780
},
{
"epoch": 0.07910535799416608,
"grad_norm": 790.3275146484375,
"learning_rate": 3.9555555555555556e-05,
"loss": 148.8441,
"step": 9790
},
{
"epoch": 0.07918616019845022,
"grad_norm": 1040.2529296875,
"learning_rate": 3.9595959595959594e-05,
"loss": 181.9307,
"step": 9800
},
{
"epoch": 0.07926696240273434,
"grad_norm": 1023.8417358398438,
"learning_rate": 3.963636363636364e-05,
"loss": 160.768,
"step": 9810
},
{
"epoch": 0.07934776460701848,
"grad_norm": 1530.9327392578125,
"learning_rate": 3.967676767676768e-05,
"loss": 189.5177,
"step": 9820
},
{
"epoch": 0.07942856681130261,
"grad_norm": 1020.0157470703125,
"learning_rate": 3.971717171717172e-05,
"loss": 175.2649,
"step": 9830
},
{
"epoch": 0.07950936901558675,
"grad_norm": 1115.394287109375,
"learning_rate": 3.975757575757576e-05,
"loss": 180.9754,
"step": 9840
},
{
"epoch": 0.07959017121987087,
"grad_norm": 596.3128051757812,
"learning_rate": 3.97979797979798e-05,
"loss": 190.0906,
"step": 9850
},
{
"epoch": 0.07967097342415501,
"grad_norm": 1229.2056884765625,
"learning_rate": 3.983838383838384e-05,
"loss": 153.1082,
"step": 9860
},
{
"epoch": 0.07975177562843914,
"grad_norm": 1460.8936767578125,
"learning_rate": 3.987878787878788e-05,
"loss": 176.2776,
"step": 9870
},
{
"epoch": 0.07983257783272328,
"grad_norm": 4583.7373046875,
"learning_rate": 3.9919191919191926e-05,
"loss": 163.6398,
"step": 9880
},
{
"epoch": 0.0799133800370074,
"grad_norm": 1115.4329833984375,
"learning_rate": 3.9959595959595964e-05,
"loss": 174.1616,
"step": 9890
},
{
"epoch": 0.07999418224129154,
"grad_norm": 2389.8232421875,
"learning_rate": 4e-05,
"loss": 163.833,
"step": 9900
},
{
"epoch": 0.08007498444557568,
"grad_norm": 2180.552978515625,
"learning_rate": 4.004040404040404e-05,
"loss": 189.1826,
"step": 9910
},
{
"epoch": 0.08015578664985981,
"grad_norm": 2478.71533203125,
"learning_rate": 4.008080808080809e-05,
"loss": 262.7207,
"step": 9920
},
{
"epoch": 0.08023658885414395,
"grad_norm": 952.7739868164062,
"learning_rate": 4.012121212121212e-05,
"loss": 152.5153,
"step": 9930
},
{
"epoch": 0.08031739105842807,
"grad_norm": 1591.5555419921875,
"learning_rate": 4.016161616161616e-05,
"loss": 161.1086,
"step": 9940
},
{
"epoch": 0.0803981932627122,
"grad_norm": 2502.85400390625,
"learning_rate": 4.0202020202020204e-05,
"loss": 180.6969,
"step": 9950
},
{
"epoch": 0.08047899546699634,
"grad_norm": 1107.453857421875,
"learning_rate": 4.024242424242424e-05,
"loss": 245.7,
"step": 9960
},
{
"epoch": 0.08055979767128048,
"grad_norm": 732.3422241210938,
"learning_rate": 4.028282828282828e-05,
"loss": 128.3301,
"step": 9970
},
{
"epoch": 0.0806405998755646,
"grad_norm": 852.4077758789062,
"learning_rate": 4.032323232323232e-05,
"loss": 171.862,
"step": 9980
},
{
"epoch": 0.08072140207984874,
"grad_norm": 974.1900634765625,
"learning_rate": 4.0363636363636367e-05,
"loss": 245.3339,
"step": 9990
},
{
"epoch": 0.08080220428413287,
"grad_norm": 917.6868286132812,
"learning_rate": 4.0404040404040405e-05,
"loss": 151.893,
"step": 10000
},
{
"epoch": 0.08088300648841701,
"grad_norm": 1009.7120971679688,
"learning_rate": 4.0444444444444444e-05,
"loss": 196.4933,
"step": 10010
},
{
"epoch": 0.08096380869270113,
"grad_norm": 2075.980224609375,
"learning_rate": 4.048484848484849e-05,
"loss": 190.6988,
"step": 10020
},
{
"epoch": 0.08104461089698527,
"grad_norm": 2236.189697265625,
"learning_rate": 4.052525252525253e-05,
"loss": 173.7965,
"step": 10030
},
{
"epoch": 0.0811254131012694,
"grad_norm": 1387.155517578125,
"learning_rate": 4.056565656565657e-05,
"loss": 287.3005,
"step": 10040
},
{
"epoch": 0.08120621530555354,
"grad_norm": 1775.0162353515625,
"learning_rate": 4.0606060606060606e-05,
"loss": 161.304,
"step": 10050
},
{
"epoch": 0.08128701750983768,
"grad_norm": 2991.034423828125,
"learning_rate": 4.064646464646465e-05,
"loss": 183.5822,
"step": 10060
},
{
"epoch": 0.0813678197141218,
"grad_norm": 2393.831298828125,
"learning_rate": 4.068686868686869e-05,
"loss": 216.6813,
"step": 10070
},
{
"epoch": 0.08144862191840593,
"grad_norm": 753.2874755859375,
"learning_rate": 4.072727272727273e-05,
"loss": 176.5726,
"step": 10080
},
{
"epoch": 0.08152942412269007,
"grad_norm": 917.6944580078125,
"learning_rate": 4.076767676767677e-05,
"loss": 182.8443,
"step": 10090
},
{
"epoch": 0.0816102263269742,
"grad_norm": 1497.6976318359375,
"learning_rate": 4.0808080808080814e-05,
"loss": 179.7216,
"step": 10100
},
{
"epoch": 0.08169102853125833,
"grad_norm": 896.7678833007812,
"learning_rate": 4.084848484848485e-05,
"loss": 171.1982,
"step": 10110
},
{
"epoch": 0.08177183073554246,
"grad_norm": 2560.443115234375,
"learning_rate": 4.088888888888889e-05,
"loss": 242.9725,
"step": 10120
},
{
"epoch": 0.0818526329398266,
"grad_norm": 914.347900390625,
"learning_rate": 4.092929292929293e-05,
"loss": 166.4831,
"step": 10130
},
{
"epoch": 0.08193343514411074,
"grad_norm": 1105.142822265625,
"learning_rate": 4.096969696969697e-05,
"loss": 182.9737,
"step": 10140
},
{
"epoch": 0.08201423734839486,
"grad_norm": 1321.1329345703125,
"learning_rate": 4.101010101010101e-05,
"loss": 152.9314,
"step": 10150
},
{
"epoch": 0.082095039552679,
"grad_norm": 924.438720703125,
"learning_rate": 4.105050505050505e-05,
"loss": 187.9318,
"step": 10160
},
{
"epoch": 0.08217584175696313,
"grad_norm": 834.27685546875,
"learning_rate": 4.109090909090909e-05,
"loss": 177.3875,
"step": 10170
},
{
"epoch": 0.08225664396124727,
"grad_norm": 996.1378173828125,
"learning_rate": 4.113131313131313e-05,
"loss": 163.1871,
"step": 10180
},
{
"epoch": 0.08233744616553139,
"grad_norm": 1657.6314697265625,
"learning_rate": 4.117171717171717e-05,
"loss": 195.5306,
"step": 10190
},
{
"epoch": 0.08241824836981552,
"grad_norm": 1040.5526123046875,
"learning_rate": 4.1212121212121216e-05,
"loss": 179.7722,
"step": 10200
},
{
"epoch": 0.08249905057409966,
"grad_norm": 1405.0408935546875,
"learning_rate": 4.1252525252525255e-05,
"loss": 177.9881,
"step": 10210
},
{
"epoch": 0.0825798527783838,
"grad_norm": 1484.392333984375,
"learning_rate": 4.1292929292929294e-05,
"loss": 170.4384,
"step": 10220
},
{
"epoch": 0.08266065498266793,
"grad_norm": 533.9537963867188,
"learning_rate": 4.133333333333333e-05,
"loss": 166.7928,
"step": 10230
},
{
"epoch": 0.08274145718695206,
"grad_norm": 1133.5531005859375,
"learning_rate": 4.137373737373738e-05,
"loss": 185.8696,
"step": 10240
},
{
"epoch": 0.08282225939123619,
"grad_norm": 1964.5546875,
"learning_rate": 4.141414141414142e-05,
"loss": 247.6705,
"step": 10250
},
{
"epoch": 0.08290306159552033,
"grad_norm": 1816.8203125,
"learning_rate": 4.1454545454545456e-05,
"loss": 188.6205,
"step": 10260
},
{
"epoch": 0.08298386379980446,
"grad_norm": 809.494873046875,
"learning_rate": 4.1494949494949495e-05,
"loss": 186.0191,
"step": 10270
},
{
"epoch": 0.08306466600408859,
"grad_norm": 1444.1771240234375,
"learning_rate": 4.153535353535354e-05,
"loss": 171.1266,
"step": 10280
},
{
"epoch": 0.08314546820837272,
"grad_norm": 1594.9212646484375,
"learning_rate": 4.157575757575758e-05,
"loss": 152.8558,
"step": 10290
},
{
"epoch": 0.08322627041265686,
"grad_norm": 1367.26318359375,
"learning_rate": 4.161616161616162e-05,
"loss": 215.3903,
"step": 10300
},
{
"epoch": 0.083307072616941,
"grad_norm": 1395.453857421875,
"learning_rate": 4.165656565656566e-05,
"loss": 219.8608,
"step": 10310
},
{
"epoch": 0.08338787482122512,
"grad_norm": 823.7109375,
"learning_rate": 4.16969696969697e-05,
"loss": 198.1472,
"step": 10320
},
{
"epoch": 0.08346867702550925,
"grad_norm": 4030.11083984375,
"learning_rate": 4.173737373737374e-05,
"loss": 191.6869,
"step": 10330
},
{
"epoch": 0.08354947922979339,
"grad_norm": 1047.6395263671875,
"learning_rate": 4.177777777777778e-05,
"loss": 198.535,
"step": 10340
},
{
"epoch": 0.08363028143407752,
"grad_norm": 1213.222412109375,
"learning_rate": 4.181818181818182e-05,
"loss": 209.4711,
"step": 10350
},
{
"epoch": 0.08371108363836166,
"grad_norm": 862.3009643554688,
"learning_rate": 4.185858585858586e-05,
"loss": 197.0447,
"step": 10360
},
{
"epoch": 0.08379188584264578,
"grad_norm": 7555.03271484375,
"learning_rate": 4.18989898989899e-05,
"loss": 257.4577,
"step": 10370
},
{
"epoch": 0.08387268804692992,
"grad_norm": 1448.51806640625,
"learning_rate": 4.193939393939394e-05,
"loss": 145.9192,
"step": 10380
},
{
"epoch": 0.08395349025121406,
"grad_norm": 563.4491577148438,
"learning_rate": 4.197979797979798e-05,
"loss": 185.379,
"step": 10390
},
{
"epoch": 0.08403429245549819,
"grad_norm": 2999.943603515625,
"learning_rate": 4.202020202020202e-05,
"loss": 207.6808,
"step": 10400
},
{
"epoch": 0.08411509465978231,
"grad_norm": 1272.3822021484375,
"learning_rate": 4.206060606060606e-05,
"loss": 203.2208,
"step": 10410
},
{
"epoch": 0.08419589686406645,
"grad_norm": 1877.1287841796875,
"learning_rate": 4.2101010101010105e-05,
"loss": 208.5024,
"step": 10420
},
{
"epoch": 0.08427669906835059,
"grad_norm": 880.4778442382812,
"learning_rate": 4.2141414141414144e-05,
"loss": 114.9268,
"step": 10430
},
{
"epoch": 0.08435750127263472,
"grad_norm": 2282.030517578125,
"learning_rate": 4.218181818181818e-05,
"loss": 185.7249,
"step": 10440
},
{
"epoch": 0.08443830347691884,
"grad_norm": 1241.139892578125,
"learning_rate": 4.222222222222222e-05,
"loss": 171.4185,
"step": 10450
},
{
"epoch": 0.08451910568120298,
"grad_norm": 1106.116943359375,
"learning_rate": 4.226262626262627e-05,
"loss": 233.517,
"step": 10460
},
{
"epoch": 0.08459990788548712,
"grad_norm": 1352.7723388671875,
"learning_rate": 4.2303030303030306e-05,
"loss": 164.0137,
"step": 10470
},
{
"epoch": 0.08468071008977125,
"grad_norm": 824.0071411132812,
"learning_rate": 4.2343434343434345e-05,
"loss": 149.2571,
"step": 10480
},
{
"epoch": 0.08476151229405539,
"grad_norm": 1494.287841796875,
"learning_rate": 4.2383838383838384e-05,
"loss": 158.6048,
"step": 10490
},
{
"epoch": 0.08484231449833951,
"grad_norm": 928.8883056640625,
"learning_rate": 4.242424242424243e-05,
"loss": 160.3929,
"step": 10500
},
{
"epoch": 0.08492311670262365,
"grad_norm": 3102.35791015625,
"learning_rate": 4.246464646464647e-05,
"loss": 262.7529,
"step": 10510
},
{
"epoch": 0.08500391890690778,
"grad_norm": 753.5589599609375,
"learning_rate": 4.250505050505051e-05,
"loss": 147.7691,
"step": 10520
},
{
"epoch": 0.08508472111119192,
"grad_norm": 1001.4116821289062,
"learning_rate": 4.254545454545455e-05,
"loss": 137.3231,
"step": 10530
},
{
"epoch": 0.08516552331547604,
"grad_norm": 813.2144775390625,
"learning_rate": 4.258585858585859e-05,
"loss": 201.1259,
"step": 10540
},
{
"epoch": 0.08524632551976018,
"grad_norm": 1098.597900390625,
"learning_rate": 4.262626262626263e-05,
"loss": 165.8937,
"step": 10550
},
{
"epoch": 0.08532712772404431,
"grad_norm": 1298.8853759765625,
"learning_rate": 4.266666666666667e-05,
"loss": 198.1985,
"step": 10560
},
{
"epoch": 0.08540792992832845,
"grad_norm": 1016.0570678710938,
"learning_rate": 4.270707070707071e-05,
"loss": 206.1271,
"step": 10570
},
{
"epoch": 0.08548873213261257,
"grad_norm": 2057.573974609375,
"learning_rate": 4.274747474747475e-05,
"loss": 172.7861,
"step": 10580
},
{
"epoch": 0.08556953433689671,
"grad_norm": 1261.774169921875,
"learning_rate": 4.2787878787878786e-05,
"loss": 149.1268,
"step": 10590
},
{
"epoch": 0.08565033654118084,
"grad_norm": 1344.2037353515625,
"learning_rate": 4.282828282828283e-05,
"loss": 195.2029,
"step": 10600
},
{
"epoch": 0.08573113874546498,
"grad_norm": 3034.935546875,
"learning_rate": 4.286868686868687e-05,
"loss": 222.7399,
"step": 10610
},
{
"epoch": 0.08581194094974912,
"grad_norm": 970.0159912109375,
"learning_rate": 4.290909090909091e-05,
"loss": 168.5838,
"step": 10620
},
{
"epoch": 0.08589274315403324,
"grad_norm": 953.5883178710938,
"learning_rate": 4.294949494949495e-05,
"loss": 162.8468,
"step": 10630
},
{
"epoch": 0.08597354535831737,
"grad_norm": 2465.924072265625,
"learning_rate": 4.2989898989898994e-05,
"loss": 204.2057,
"step": 10640
},
{
"epoch": 0.08605434756260151,
"grad_norm": 1194.1285400390625,
"learning_rate": 4.303030303030303e-05,
"loss": 197.6671,
"step": 10650
},
{
"epoch": 0.08613514976688565,
"grad_norm": 881.1695556640625,
"learning_rate": 4.307070707070707e-05,
"loss": 163.8398,
"step": 10660
},
{
"epoch": 0.08621595197116977,
"grad_norm": 3846.036376953125,
"learning_rate": 4.311111111111111e-05,
"loss": 178.4458,
"step": 10670
},
{
"epoch": 0.0862967541754539,
"grad_norm": 1206.0906982421875,
"learning_rate": 4.3151515151515156e-05,
"loss": 153.2348,
"step": 10680
},
{
"epoch": 0.08637755637973804,
"grad_norm": 898.4664306640625,
"learning_rate": 4.3191919191919195e-05,
"loss": 143.3541,
"step": 10690
},
{
"epoch": 0.08645835858402218,
"grad_norm": 2398.2255859375,
"learning_rate": 4.3232323232323234e-05,
"loss": 212.5595,
"step": 10700
},
{
"epoch": 0.0865391607883063,
"grad_norm": 1220.5733642578125,
"learning_rate": 4.327272727272728e-05,
"loss": 179.1572,
"step": 10710
},
{
"epoch": 0.08661996299259044,
"grad_norm": 1974.5457763671875,
"learning_rate": 4.331313131313132e-05,
"loss": 190.0654,
"step": 10720
},
{
"epoch": 0.08670076519687457,
"grad_norm": 2125.98583984375,
"learning_rate": 4.335353535353536e-05,
"loss": 168.3283,
"step": 10730
},
{
"epoch": 0.08678156740115871,
"grad_norm": 1389.6546630859375,
"learning_rate": 4.3393939393939396e-05,
"loss": 159.5211,
"step": 10740
},
{
"epoch": 0.08686236960544283,
"grad_norm": 807.5780029296875,
"learning_rate": 4.343434343434344e-05,
"loss": 192.4806,
"step": 10750
},
{
"epoch": 0.08694317180972697,
"grad_norm": 1139.113037109375,
"learning_rate": 4.347474747474748e-05,
"loss": 155.3915,
"step": 10760
},
{
"epoch": 0.0870239740140111,
"grad_norm": 1003.0131225585938,
"learning_rate": 4.351515151515152e-05,
"loss": 150.0795,
"step": 10770
},
{
"epoch": 0.08710477621829524,
"grad_norm": 962.0811767578125,
"learning_rate": 4.355555555555556e-05,
"loss": 194.0887,
"step": 10780
},
{
"epoch": 0.08718557842257937,
"grad_norm": 626.3510131835938,
"learning_rate": 4.35959595959596e-05,
"loss": 148.0051,
"step": 10790
},
{
"epoch": 0.0872663806268635,
"grad_norm": 2664.0517578125,
"learning_rate": 4.3636363636363636e-05,
"loss": 199.106,
"step": 10800
},
{
"epoch": 0.08734718283114763,
"grad_norm": 895.5698852539062,
"learning_rate": 4.3676767676767674e-05,
"loss": 172.3583,
"step": 10810
},
{
"epoch": 0.08742798503543177,
"grad_norm": 1103.621826171875,
"learning_rate": 4.371717171717172e-05,
"loss": 203.0746,
"step": 10820
},
{
"epoch": 0.0875087872397159,
"grad_norm": 1323.6517333984375,
"learning_rate": 4.375757575757576e-05,
"loss": 158.2676,
"step": 10830
},
{
"epoch": 0.08758958944400003,
"grad_norm": 1170.48779296875,
"learning_rate": 4.37979797979798e-05,
"loss": 151.4179,
"step": 10840
},
{
"epoch": 0.08767039164828416,
"grad_norm": 1171.2379150390625,
"learning_rate": 4.383838383838384e-05,
"loss": 181.4644,
"step": 10850
},
{
"epoch": 0.0877511938525683,
"grad_norm": 1495.01025390625,
"learning_rate": 4.387878787878788e-05,
"loss": 167.6518,
"step": 10860
},
{
"epoch": 0.08783199605685243,
"grad_norm": 1283.5498046875,
"learning_rate": 4.391919191919192e-05,
"loss": 138.4191,
"step": 10870
},
{
"epoch": 0.08791279826113656,
"grad_norm": 1028.198974609375,
"learning_rate": 4.395959595959596e-05,
"loss": 167.4624,
"step": 10880
},
{
"epoch": 0.0879936004654207,
"grad_norm": 958.3167114257812,
"learning_rate": 4.4000000000000006e-05,
"loss": 187.0603,
"step": 10890
},
{
"epoch": 0.08807440266970483,
"grad_norm": 1248.95556640625,
"learning_rate": 4.4040404040404044e-05,
"loss": 194.5838,
"step": 10900
},
{
"epoch": 0.08815520487398897,
"grad_norm": 1088.8775634765625,
"learning_rate": 4.408080808080808e-05,
"loss": 137.5682,
"step": 10910
},
{
"epoch": 0.0882360070782731,
"grad_norm": 1130.275146484375,
"learning_rate": 4.412121212121212e-05,
"loss": 180.1215,
"step": 10920
},
{
"epoch": 0.08831680928255722,
"grad_norm": 1201.9453125,
"learning_rate": 4.416161616161617e-05,
"loss": 157.9624,
"step": 10930
},
{
"epoch": 0.08839761148684136,
"grad_norm": 1291.0989990234375,
"learning_rate": 4.420202020202021e-05,
"loss": 159.1208,
"step": 10940
},
{
"epoch": 0.0884784136911255,
"grad_norm": 946.185546875,
"learning_rate": 4.4242424242424246e-05,
"loss": 200.6266,
"step": 10950
},
{
"epoch": 0.08855921589540963,
"grad_norm": 2330.45361328125,
"learning_rate": 4.4282828282828284e-05,
"loss": 163.2997,
"step": 10960
},
{
"epoch": 0.08864001809969375,
"grad_norm": 959.3818359375,
"learning_rate": 4.432323232323233e-05,
"loss": 200.18,
"step": 10970
},
{
"epoch": 0.08872082030397789,
"grad_norm": 1215.0078125,
"learning_rate": 4.436363636363637e-05,
"loss": 194.7184,
"step": 10980
},
{
"epoch": 0.08880162250826203,
"grad_norm": 731.89501953125,
"learning_rate": 4.44040404040404e-05,
"loss": 183.9504,
"step": 10990
},
{
"epoch": 0.08888242471254616,
"grad_norm": 751.2623291015625,
"learning_rate": 4.4444444444444447e-05,
"loss": 164.3441,
"step": 11000
},
{
"epoch": 0.08896322691683028,
"grad_norm": 1424.0635986328125,
"learning_rate": 4.4484848484848485e-05,
"loss": 202.2657,
"step": 11010
},
{
"epoch": 0.08904402912111442,
"grad_norm": 1039.608642578125,
"learning_rate": 4.4525252525252524e-05,
"loss": 177.8795,
"step": 11020
},
{
"epoch": 0.08912483132539856,
"grad_norm": 1338.993408203125,
"learning_rate": 4.456565656565656e-05,
"loss": 167.3348,
"step": 11030
},
{
"epoch": 0.08920563352968269,
"grad_norm": 2591.8984375,
"learning_rate": 4.460606060606061e-05,
"loss": 165.2155,
"step": 11040
},
{
"epoch": 0.08928643573396683,
"grad_norm": 931.5535888671875,
"learning_rate": 4.464646464646465e-05,
"loss": 198.5493,
"step": 11050
},
{
"epoch": 0.08936723793825095,
"grad_norm": 1028.25927734375,
"learning_rate": 4.4686868686868686e-05,
"loss": 169.1925,
"step": 11060
},
{
"epoch": 0.08944804014253509,
"grad_norm": 1156.0810546875,
"learning_rate": 4.472727272727273e-05,
"loss": 172.742,
"step": 11070
},
{
"epoch": 0.08952884234681922,
"grad_norm": 1409.868408203125,
"learning_rate": 4.476767676767677e-05,
"loss": 152.6085,
"step": 11080
},
{
"epoch": 0.08960964455110336,
"grad_norm": 1091.4266357421875,
"learning_rate": 4.480808080808081e-05,
"loss": 149.4833,
"step": 11090
},
{
"epoch": 0.08969044675538748,
"grad_norm": 1274.9849853515625,
"learning_rate": 4.484848484848485e-05,
"loss": 239.724,
"step": 11100
},
{
"epoch": 0.08977124895967162,
"grad_norm": 2974.341552734375,
"learning_rate": 4.4888888888888894e-05,
"loss": 184.0551,
"step": 11110
},
{
"epoch": 0.08985205116395575,
"grad_norm": 893.275390625,
"learning_rate": 4.492929292929293e-05,
"loss": 163.8494,
"step": 11120
},
{
"epoch": 0.08993285336823989,
"grad_norm": 1236.2047119140625,
"learning_rate": 4.496969696969697e-05,
"loss": 223.2573,
"step": 11130
},
{
"epoch": 0.09001365557252401,
"grad_norm": 560.5221557617188,
"learning_rate": 4.501010101010101e-05,
"loss": 143.0442,
"step": 11140
},
{
"epoch": 0.09009445777680815,
"grad_norm": 1671.4537353515625,
"learning_rate": 4.5050505050505056e-05,
"loss": 230.4551,
"step": 11150
},
{
"epoch": 0.09017525998109228,
"grad_norm": 2295.0419921875,
"learning_rate": 4.5090909090909095e-05,
"loss": 191.5067,
"step": 11160
},
{
"epoch": 0.09025606218537642,
"grad_norm": 1291.7230224609375,
"learning_rate": 4.5131313131313134e-05,
"loss": 165.677,
"step": 11170
},
{
"epoch": 0.09033686438966056,
"grad_norm": 817.165771484375,
"learning_rate": 4.517171717171717e-05,
"loss": 145.0678,
"step": 11180
},
{
"epoch": 0.09041766659394468,
"grad_norm": 938.4746704101562,
"learning_rate": 4.521212121212122e-05,
"loss": 168.7315,
"step": 11190
},
{
"epoch": 0.09049846879822881,
"grad_norm": 820.7261352539062,
"learning_rate": 4.525252525252526e-05,
"loss": 174.4509,
"step": 11200
},
{
"epoch": 0.09057927100251295,
"grad_norm": 899.671875,
"learning_rate": 4.529292929292929e-05,
"loss": 153.883,
"step": 11210
},
{
"epoch": 0.09066007320679709,
"grad_norm": 2744.694091796875,
"learning_rate": 4.5333333333333335e-05,
"loss": 152.8033,
"step": 11220
},
{
"epoch": 0.09074087541108121,
"grad_norm": 1821.427734375,
"learning_rate": 4.5373737373737374e-05,
"loss": 203.8027,
"step": 11230
},
{
"epoch": 0.09082167761536535,
"grad_norm": 937.9207763671875,
"learning_rate": 4.541414141414141e-05,
"loss": 183.0782,
"step": 11240
},
{
"epoch": 0.09090247981964948,
"grad_norm": 1872.185546875,
"learning_rate": 4.545454545454546e-05,
"loss": 153.9125,
"step": 11250
},
{
"epoch": 0.09098328202393362,
"grad_norm": 772.832275390625,
"learning_rate": 4.54949494949495e-05,
"loss": 143.9607,
"step": 11260
},
{
"epoch": 0.09106408422821774,
"grad_norm": 2842.652587890625,
"learning_rate": 4.5535353535353536e-05,
"loss": 189.6786,
"step": 11270
},
{
"epoch": 0.09114488643250188,
"grad_norm": 1738.2589111328125,
"learning_rate": 4.5575757575757575e-05,
"loss": 163.1939,
"step": 11280
},
{
"epoch": 0.09122568863678601,
"grad_norm": 1521.9814453125,
"learning_rate": 4.561616161616162e-05,
"loss": 208.5482,
"step": 11290
},
{
"epoch": 0.09130649084107015,
"grad_norm": 1132.692138671875,
"learning_rate": 4.565656565656566e-05,
"loss": 161.8789,
"step": 11300
},
{
"epoch": 0.09138729304535427,
"grad_norm": 1395.38671875,
"learning_rate": 4.56969696969697e-05,
"loss": 164.5156,
"step": 11310
},
{
"epoch": 0.0914680952496384,
"grad_norm": 661.7669067382812,
"learning_rate": 4.573737373737374e-05,
"loss": 163.1058,
"step": 11320
},
{
"epoch": 0.09154889745392254,
"grad_norm": 1353.42578125,
"learning_rate": 4.577777777777778e-05,
"loss": 178.0265,
"step": 11330
},
{
"epoch": 0.09162969965820668,
"grad_norm": 758.9215087890625,
"learning_rate": 4.581818181818182e-05,
"loss": 186.8539,
"step": 11340
},
{
"epoch": 0.09171050186249081,
"grad_norm": 926.5440673828125,
"learning_rate": 4.585858585858586e-05,
"loss": 174.9289,
"step": 11350
},
{
"epoch": 0.09179130406677494,
"grad_norm": 1032.7493896484375,
"learning_rate": 4.58989898989899e-05,
"loss": 157.8875,
"step": 11360
},
{
"epoch": 0.09187210627105907,
"grad_norm": 1104.685302734375,
"learning_rate": 4.5939393939393945e-05,
"loss": 180.8059,
"step": 11370
},
{
"epoch": 0.09195290847534321,
"grad_norm": 807.3258056640625,
"learning_rate": 4.5979797979797984e-05,
"loss": 133.7691,
"step": 11380
},
{
"epoch": 0.09203371067962735,
"grad_norm": 1296.3505859375,
"learning_rate": 4.602020202020202e-05,
"loss": 168.8188,
"step": 11390
},
{
"epoch": 0.09211451288391147,
"grad_norm": 717.891357421875,
"learning_rate": 4.606060606060607e-05,
"loss": 143.9607,
"step": 11400
},
{
"epoch": 0.0921953150881956,
"grad_norm": 841.2793579101562,
"learning_rate": 4.610101010101011e-05,
"loss": 185.6871,
"step": 11410
},
{
"epoch": 0.09227611729247974,
"grad_norm": 1319.2064208984375,
"learning_rate": 4.614141414141414e-05,
"loss": 163.8516,
"step": 11420
},
{
"epoch": 0.09235691949676388,
"grad_norm": 1566.4168701171875,
"learning_rate": 4.618181818181818e-05,
"loss": 165.3434,
"step": 11430
},
{
"epoch": 0.092437721701048,
"grad_norm": 919.2448120117188,
"learning_rate": 4.6222222222222224e-05,
"loss": 133.6576,
"step": 11440
},
{
"epoch": 0.09251852390533213,
"grad_norm": 976.5399780273438,
"learning_rate": 4.626262626262626e-05,
"loss": 172.4401,
"step": 11450
},
{
"epoch": 0.09259932610961627,
"grad_norm": 1343.291015625,
"learning_rate": 4.63030303030303e-05,
"loss": 203.6782,
"step": 11460
},
{
"epoch": 0.0926801283139004,
"grad_norm": 839.6242065429688,
"learning_rate": 4.634343434343435e-05,
"loss": 175.6062,
"step": 11470
},
{
"epoch": 0.09276093051818454,
"grad_norm": 909.182861328125,
"learning_rate": 4.6383838383838386e-05,
"loss": 159.7895,
"step": 11480
},
{
"epoch": 0.09284173272246866,
"grad_norm": 2260.60107421875,
"learning_rate": 4.6424242424242425e-05,
"loss": 168.3035,
"step": 11490
},
{
"epoch": 0.0929225349267528,
"grad_norm": 1016.5995483398438,
"learning_rate": 4.6464646464646464e-05,
"loss": 149.6134,
"step": 11500
},
{
"epoch": 0.09300333713103694,
"grad_norm": 1180.2608642578125,
"learning_rate": 4.650505050505051e-05,
"loss": 123.0571,
"step": 11510
},
{
"epoch": 0.09308413933532107,
"grad_norm": 1054.5244140625,
"learning_rate": 4.654545454545455e-05,
"loss": 180.0858,
"step": 11520
},
{
"epoch": 0.0931649415396052,
"grad_norm": 1064.1981201171875,
"learning_rate": 4.658585858585859e-05,
"loss": 305.0233,
"step": 11530
},
{
"epoch": 0.09324574374388933,
"grad_norm": 1455.9857177734375,
"learning_rate": 4.6626262626262626e-05,
"loss": 174.1713,
"step": 11540
},
{
"epoch": 0.09332654594817347,
"grad_norm": 801.254150390625,
"learning_rate": 4.666666666666667e-05,
"loss": 175.7632,
"step": 11550
},
{
"epoch": 0.0934073481524576,
"grad_norm": 811.2909545898438,
"learning_rate": 4.670707070707071e-05,
"loss": 152.5526,
"step": 11560
},
{
"epoch": 0.09348815035674173,
"grad_norm": 860.8350830078125,
"learning_rate": 4.674747474747475e-05,
"loss": 159.9338,
"step": 11570
},
{
"epoch": 0.09356895256102586,
"grad_norm": 1820.26318359375,
"learning_rate": 4.6787878787878795e-05,
"loss": 205.222,
"step": 11580
},
{
"epoch": 0.09364975476531,
"grad_norm": 1027.9521484375,
"learning_rate": 4.6828282828282834e-05,
"loss": 167.9507,
"step": 11590
},
{
"epoch": 0.09373055696959413,
"grad_norm": 3551.64599609375,
"learning_rate": 4.686868686868687e-05,
"loss": 194.2529,
"step": 11600
},
{
"epoch": 0.09381135917387827,
"grad_norm": 957.3357543945312,
"learning_rate": 4.690909090909091e-05,
"loss": 185.5651,
"step": 11610
},
{
"epoch": 0.09389216137816239,
"grad_norm": 587.98828125,
"learning_rate": 4.694949494949496e-05,
"loss": 208.2654,
"step": 11620
},
{
"epoch": 0.09397296358244653,
"grad_norm": 824.1953735351562,
"learning_rate": 4.698989898989899e-05,
"loss": 118.9412,
"step": 11630
},
{
"epoch": 0.09405376578673066,
"grad_norm": 997.8128051757812,
"learning_rate": 4.703030303030303e-05,
"loss": 144.5133,
"step": 11640
},
{
"epoch": 0.0941345679910148,
"grad_norm": 825.6588745117188,
"learning_rate": 4.7070707070707074e-05,
"loss": 130.3822,
"step": 11650
},
{
"epoch": 0.09421537019529892,
"grad_norm": 1590.2271728515625,
"learning_rate": 4.711111111111111e-05,
"loss": 147.783,
"step": 11660
},
{
"epoch": 0.09429617239958306,
"grad_norm": 831.8695068359375,
"learning_rate": 4.715151515151515e-05,
"loss": 150.0897,
"step": 11670
},
{
"epoch": 0.0943769746038672,
"grad_norm": 879.4678955078125,
"learning_rate": 4.719191919191919e-05,
"loss": 134.1156,
"step": 11680
},
{
"epoch": 0.09445777680815133,
"grad_norm": 1955.2484130859375,
"learning_rate": 4.7232323232323236e-05,
"loss": 155.8984,
"step": 11690
},
{
"epoch": 0.09453857901243545,
"grad_norm": 1074.5509033203125,
"learning_rate": 4.7272727272727275e-05,
"loss": 186.7514,
"step": 11700
},
{
"epoch": 0.09461938121671959,
"grad_norm": 992.748046875,
"learning_rate": 4.7313131313131314e-05,
"loss": 155.1787,
"step": 11710
},
{
"epoch": 0.09470018342100373,
"grad_norm": 990.5753784179688,
"learning_rate": 4.735353535353535e-05,
"loss": 140.0309,
"step": 11720
},
{
"epoch": 0.09478098562528786,
"grad_norm": 792.1434936523438,
"learning_rate": 4.73939393939394e-05,
"loss": 138.4607,
"step": 11730
},
{
"epoch": 0.094861787829572,
"grad_norm": 1121.582763671875,
"learning_rate": 4.743434343434344e-05,
"loss": 121.2844,
"step": 11740
},
{
"epoch": 0.09494259003385612,
"grad_norm": 1351.6878662109375,
"learning_rate": 4.7474747474747476e-05,
"loss": 198.7702,
"step": 11750
},
{
"epoch": 0.09502339223814026,
"grad_norm": 2031.31494140625,
"learning_rate": 4.751515151515152e-05,
"loss": 164.1413,
"step": 11760
},
{
"epoch": 0.09510419444242439,
"grad_norm": 761.006103515625,
"learning_rate": 4.755555555555556e-05,
"loss": 143.7779,
"step": 11770
},
{
"epoch": 0.09518499664670853,
"grad_norm": 1228.6676025390625,
"learning_rate": 4.75959595959596e-05,
"loss": 177.4899,
"step": 11780
},
{
"epoch": 0.09526579885099265,
"grad_norm": 817.8377075195312,
"learning_rate": 4.763636363636364e-05,
"loss": 158.3827,
"step": 11790
},
{
"epoch": 0.09534660105527679,
"grad_norm": 1481.2501220703125,
"learning_rate": 4.7676767676767684e-05,
"loss": 144.1627,
"step": 11800
},
{
"epoch": 0.09542740325956092,
"grad_norm": 899.5111083984375,
"learning_rate": 4.771717171717172e-05,
"loss": 188.1043,
"step": 11810
},
{
"epoch": 0.09550820546384506,
"grad_norm": 766.15869140625,
"learning_rate": 4.775757575757576e-05,
"loss": 190.8986,
"step": 11820
},
{
"epoch": 0.09558900766812918,
"grad_norm": 818.2703857421875,
"learning_rate": 4.77979797979798e-05,
"loss": 226.6272,
"step": 11830
},
{
"epoch": 0.09566980987241332,
"grad_norm": 1306.8607177734375,
"learning_rate": 4.7838383838383846e-05,
"loss": 179.7575,
"step": 11840
},
{
"epoch": 0.09575061207669745,
"grad_norm": 1371.1048583984375,
"learning_rate": 4.787878787878788e-05,
"loss": 184.5432,
"step": 11850
},
{
"epoch": 0.09583141428098159,
"grad_norm": 1219.8428955078125,
"learning_rate": 4.791919191919192e-05,
"loss": 256.0484,
"step": 11860
},
{
"epoch": 0.09591221648526572,
"grad_norm": 1769.408935546875,
"learning_rate": 4.795959595959596e-05,
"loss": 216.8512,
"step": 11870
},
{
"epoch": 0.09599301868954985,
"grad_norm": 1021.0985107421875,
"learning_rate": 4.8e-05,
"loss": 150.8418,
"step": 11880
},
{
"epoch": 0.09607382089383398,
"grad_norm": 789.3172607421875,
"learning_rate": 4.804040404040404e-05,
"loss": 168.713,
"step": 11890
},
{
"epoch": 0.09615462309811812,
"grad_norm": 1145.4168701171875,
"learning_rate": 4.808080808080808e-05,
"loss": 248.8971,
"step": 11900
},
{
"epoch": 0.09623542530240226,
"grad_norm": 1330.3175048828125,
"learning_rate": 4.8121212121212125e-05,
"loss": 143.9111,
"step": 11910
},
{
"epoch": 0.09631622750668638,
"grad_norm": 1145.8402099609375,
"learning_rate": 4.8161616161616163e-05,
"loss": 196.2298,
"step": 11920
},
{
"epoch": 0.09639702971097051,
"grad_norm": 3719.409423828125,
"learning_rate": 4.82020202020202e-05,
"loss": 141.981,
"step": 11930
},
{
"epoch": 0.09647783191525465,
"grad_norm": 783.9396362304688,
"learning_rate": 4.824242424242425e-05,
"loss": 139.5111,
"step": 11940
},
{
"epoch": 0.09655863411953879,
"grad_norm": 817.3587036132812,
"learning_rate": 4.828282828282829e-05,
"loss": 193.1281,
"step": 11950
},
{
"epoch": 0.09663943632382291,
"grad_norm": 2150.12939453125,
"learning_rate": 4.8323232323232326e-05,
"loss": 173.1304,
"step": 11960
},
{
"epoch": 0.09672023852810704,
"grad_norm": 1299.8162841796875,
"learning_rate": 4.8363636363636364e-05,
"loss": 181.7705,
"step": 11970
},
{
"epoch": 0.09680104073239118,
"grad_norm": 936.7069091796875,
"learning_rate": 4.840404040404041e-05,
"loss": 173.4242,
"step": 11980
},
{
"epoch": 0.09688184293667532,
"grad_norm": 613.501708984375,
"learning_rate": 4.844444444444445e-05,
"loss": 153.117,
"step": 11990
},
{
"epoch": 0.09696264514095944,
"grad_norm": 1406.6314697265625,
"learning_rate": 4.848484848484849e-05,
"loss": 185.4404,
"step": 12000
}
],
"logging_steps": 10,
"max_steps": 123750,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}