{ "best_metric": 0.5118223208478402, "best_model_checkpoint": "Mrohit01/1_M_cards-swinv2-base-patch4-window12to16-192to256-22kto1k-ft-finetuned-v3/checkpoint-16330", "epoch": 9.99540933435348, "eval_steps": 500, "global_step": 16330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00612088752869166, "grad_norm": 12.164946556091309, "learning_rate": 3.061849357011635e-07, "loss": 2.3398, "step": 10 }, { "epoch": 0.01224177505738332, "grad_norm": 14.423832893371582, "learning_rate": 6.12369871402327e-07, "loss": 2.3002, "step": 20 }, { "epoch": 0.018362662586074982, "grad_norm": 10.38547134399414, "learning_rate": 9.185548071034905e-07, "loss": 2.2128, "step": 30 }, { "epoch": 0.02448355011476664, "grad_norm": 10.298561096191406, "learning_rate": 1.224739742804654e-06, "loss": 2.0935, "step": 40 }, { "epoch": 0.030604437643458302, "grad_norm": 8.211506843566895, "learning_rate": 1.5309246785058176e-06, "loss": 1.9766, "step": 50 }, { "epoch": 0.036725325172149964, "grad_norm": 50.141380310058594, "learning_rate": 1.837109614206981e-06, "loss": 1.861, "step": 60 }, { "epoch": 0.04284621270084162, "grad_norm": 7.632486343383789, "learning_rate": 2.1432945499081446e-06, "loss": 1.8087, "step": 70 }, { "epoch": 0.04896710022953328, "grad_norm": 79.66400909423828, "learning_rate": 2.449479485609308e-06, "loss": 1.7657, "step": 80 }, { "epoch": 0.05508798775822494, "grad_norm": 7.020776271820068, "learning_rate": 2.7556644213104716e-06, "loss": 1.7149, "step": 90 }, { "epoch": 0.061208875286916604, "grad_norm": 5.450502395629883, "learning_rate": 3.061849357011635e-06, "loss": 1.6573, "step": 100 }, { "epoch": 0.06732976281560826, "grad_norm": 5.259347438812256, "learning_rate": 3.368034292712799e-06, "loss": 1.6454, "step": 110 }, { "epoch": 0.07345065034429993, "grad_norm": 13.419878005981445, "learning_rate": 3.674219228413962e-06, "loss": 1.6188, "step": 120 }, { "epoch": 0.07957153787299158, "grad_norm": 12.368359565734863, "learning_rate": 3.980404164115126e-06, "loss": 1.6115, "step": 130 }, { "epoch": 0.08569242540168324, "grad_norm": 8.168076515197754, "learning_rate": 4.286589099816289e-06, "loss": 1.5702, "step": 140 }, { "epoch": 0.09181331293037491, "grad_norm": 9.173910140991211, "learning_rate": 4.592774035517453e-06, "loss": 1.5632, "step": 150 }, { "epoch": 0.09793420045906656, "grad_norm": 95.74823760986328, "learning_rate": 4.898958971218616e-06, "loss": 1.5163, "step": 160 }, { "epoch": 0.10405508798775823, "grad_norm": 7.6970744132995605, "learning_rate": 5.20514390691978e-06, "loss": 1.5255, "step": 170 }, { "epoch": 0.11017597551644988, "grad_norm": 8.821610450744629, "learning_rate": 5.511328842620943e-06, "loss": 1.5225, "step": 180 }, { "epoch": 0.11629686304514154, "grad_norm": 8.845519065856934, "learning_rate": 5.817513778322107e-06, "loss": 1.5166, "step": 190 }, { "epoch": 0.12241775057383321, "grad_norm": 5.295988082885742, "learning_rate": 6.12369871402327e-06, "loss": 1.5011, "step": 200 }, { "epoch": 0.12853863810252486, "grad_norm": 21.03306770324707, "learning_rate": 6.429883649724435e-06, "loss": 1.4884, "step": 210 }, { "epoch": 0.13465952563121653, "grad_norm": 7.419331073760986, "learning_rate": 6.736068585425598e-06, "loss": 1.4816, "step": 220 }, { "epoch": 0.1407804131599082, "grad_norm": 6.6504411697387695, "learning_rate": 7.042253521126762e-06, "loss": 1.4639, "step": 230 }, { "epoch": 0.14690130068859986, "grad_norm": 19.93020248413086, "learning_rate": 7.348438456827924e-06, "loss": 1.4748, "step": 240 }, { "epoch": 0.1530221882172915, "grad_norm": 212.81414794921875, "learning_rate": 7.654623392529089e-06, "loss": 1.4663, "step": 250 }, { "epoch": 0.15914307574598316, "grad_norm": 6.855514049530029, "learning_rate": 7.960808328230251e-06, "loss": 1.4681, "step": 260 }, { "epoch": 0.16526396327467482, "grad_norm": 6.292046546936035, "learning_rate": 8.266993263931416e-06, "loss": 1.4505, "step": 270 }, { "epoch": 0.1713848508033665, "grad_norm": 95.416015625, "learning_rate": 8.573178199632578e-06, "loss": 1.4735, "step": 280 }, { "epoch": 0.17750573833205815, "grad_norm": 9.463255882263184, "learning_rate": 8.879363135333741e-06, "loss": 1.4572, "step": 290 }, { "epoch": 0.18362662586074982, "grad_norm": 5.731226921081543, "learning_rate": 9.185548071034905e-06, "loss": 1.4478, "step": 300 }, { "epoch": 0.18974751338944146, "grad_norm": 7.399662494659424, "learning_rate": 9.491733006736068e-06, "loss": 1.4655, "step": 310 }, { "epoch": 0.19586840091813312, "grad_norm": 10.838027954101562, "learning_rate": 9.797917942437232e-06, "loss": 1.4375, "step": 320 }, { "epoch": 0.2019892884468248, "grad_norm": 7.435635566711426, "learning_rate": 1.0104102878138397e-05, "loss": 1.431, "step": 330 }, { "epoch": 0.20811017597551645, "grad_norm": 12.052409172058105, "learning_rate": 1.041028781383956e-05, "loss": 1.4273, "step": 340 }, { "epoch": 0.21423106350420812, "grad_norm": 10.6692476272583, "learning_rate": 1.0716472749540724e-05, "loss": 1.4286, "step": 350 }, { "epoch": 0.22035195103289976, "grad_norm": 6.245419025421143, "learning_rate": 1.1022657685241886e-05, "loss": 1.4376, "step": 360 }, { "epoch": 0.22647283856159142, "grad_norm": 6.799665451049805, "learning_rate": 1.132884262094305e-05, "loss": 1.4288, "step": 370 }, { "epoch": 0.2325937260902831, "grad_norm": 10.520297050476074, "learning_rate": 1.1635027556644213e-05, "loss": 1.4242, "step": 380 }, { "epoch": 0.23871461361897475, "grad_norm": 6.719942092895508, "learning_rate": 1.1941212492345378e-05, "loss": 1.4336, "step": 390 }, { "epoch": 0.24483550114766642, "grad_norm": 7.194707870483398, "learning_rate": 1.224739742804654e-05, "loss": 1.4121, "step": 400 }, { "epoch": 0.25095638867635806, "grad_norm": 18.913164138793945, "learning_rate": 1.2553582363747705e-05, "loss": 1.4399, "step": 410 }, { "epoch": 0.2570772762050497, "grad_norm": 3.8806185722351074, "learning_rate": 1.285976729944887e-05, "loss": 1.428, "step": 420 }, { "epoch": 0.2631981637337414, "grad_norm": 9.427650451660156, "learning_rate": 1.316595223515003e-05, "loss": 1.4204, "step": 430 }, { "epoch": 0.26931905126243305, "grad_norm": 4.577441692352295, "learning_rate": 1.3472137170851196e-05, "loss": 1.4168, "step": 440 }, { "epoch": 0.2754399387911247, "grad_norm": 6.408951759338379, "learning_rate": 1.3778322106552357e-05, "loss": 1.4402, "step": 450 }, { "epoch": 0.2815608263198164, "grad_norm": 7.625643253326416, "learning_rate": 1.4084507042253523e-05, "loss": 1.389, "step": 460 }, { "epoch": 0.28768171384850805, "grad_norm": 11.841383934020996, "learning_rate": 1.4390691977954684e-05, "loss": 1.4048, "step": 470 }, { "epoch": 0.2938026013771997, "grad_norm": 18.034799575805664, "learning_rate": 1.4696876913655849e-05, "loss": 1.4152, "step": 480 }, { "epoch": 0.2999234889058914, "grad_norm": 8.04079532623291, "learning_rate": 1.5003061849357011e-05, "loss": 1.413, "step": 490 }, { "epoch": 0.306044376434583, "grad_norm": 10.082817077636719, "learning_rate": 1.5309246785058177e-05, "loss": 1.3984, "step": 500 }, { "epoch": 0.31216526396327465, "grad_norm": 3.9042136669158936, "learning_rate": 1.5615431720759337e-05, "loss": 1.4192, "step": 510 }, { "epoch": 0.3182861514919663, "grad_norm": 5.863616466522217, "learning_rate": 1.5921616656460503e-05, "loss": 1.4082, "step": 520 }, { "epoch": 0.324407039020658, "grad_norm": 4.442514419555664, "learning_rate": 1.622780159216167e-05, "loss": 1.4038, "step": 530 }, { "epoch": 0.33052792654934965, "grad_norm": 3.5633575916290283, "learning_rate": 1.653398652786283e-05, "loss": 1.4019, "step": 540 }, { "epoch": 0.3366488140780413, "grad_norm": 6.105249404907227, "learning_rate": 1.6840171463563994e-05, "loss": 1.408, "step": 550 }, { "epoch": 0.342769701606733, "grad_norm": 30.103477478027344, "learning_rate": 1.7146356399265157e-05, "loss": 1.3889, "step": 560 }, { "epoch": 0.34889058913542464, "grad_norm": 6.706213474273682, "learning_rate": 1.7452541334966323e-05, "loss": 1.3868, "step": 570 }, { "epoch": 0.3550114766641163, "grad_norm": 6.28405237197876, "learning_rate": 1.7758726270667482e-05, "loss": 1.4019, "step": 580 }, { "epoch": 0.361132364192808, "grad_norm": 8.72684097290039, "learning_rate": 1.8064911206368648e-05, "loss": 1.3877, "step": 590 }, { "epoch": 0.36725325172149964, "grad_norm": 4.952666759490967, "learning_rate": 1.837109614206981e-05, "loss": 1.3992, "step": 600 }, { "epoch": 0.37337413925019125, "grad_norm": 31.551925659179688, "learning_rate": 1.8677281077770973e-05, "loss": 1.3858, "step": 610 }, { "epoch": 0.3794950267788829, "grad_norm": 4.013786792755127, "learning_rate": 1.8983466013472136e-05, "loss": 1.3955, "step": 620 }, { "epoch": 0.3856159143075746, "grad_norm": 6.236915111541748, "learning_rate": 1.9289650949173302e-05, "loss": 1.4099, "step": 630 }, { "epoch": 0.39173680183626625, "grad_norm": 18.471105575561523, "learning_rate": 1.9595835884874465e-05, "loss": 1.3936, "step": 640 }, { "epoch": 0.3978576893649579, "grad_norm": 8.124337196350098, "learning_rate": 1.9902020820575627e-05, "loss": 1.412, "step": 650 }, { "epoch": 0.4039785768936496, "grad_norm": 10.235346794128418, "learning_rate": 2.0208205756276793e-05, "loss": 1.3985, "step": 660 }, { "epoch": 0.41009946442234124, "grad_norm": 15.285164833068848, "learning_rate": 2.0514390691977956e-05, "loss": 1.3585, "step": 670 }, { "epoch": 0.4162203519510329, "grad_norm": 7.845749855041504, "learning_rate": 2.082057562767912e-05, "loss": 1.4062, "step": 680 }, { "epoch": 0.4223412394797246, "grad_norm": 9.471783638000488, "learning_rate": 2.112676056338028e-05, "loss": 1.3831, "step": 690 }, { "epoch": 0.42846212700841624, "grad_norm": 3.549377679824829, "learning_rate": 2.1432945499081448e-05, "loss": 1.3984, "step": 700 }, { "epoch": 0.4345830145371079, "grad_norm": 4.166463375091553, "learning_rate": 2.173913043478261e-05, "loss": 1.3931, "step": 710 }, { "epoch": 0.4407039020657995, "grad_norm": 21.529024124145508, "learning_rate": 2.2045315370483773e-05, "loss": 1.389, "step": 720 }, { "epoch": 0.4468247895944912, "grad_norm": 3.551419258117676, "learning_rate": 2.2351500306184936e-05, "loss": 1.3993, "step": 730 }, { "epoch": 0.45294567712318284, "grad_norm": 39.104515075683594, "learning_rate": 2.26576852418861e-05, "loss": 1.373, "step": 740 }, { "epoch": 0.4590665646518745, "grad_norm": 6.086831092834473, "learning_rate": 2.2963870177587264e-05, "loss": 1.4084, "step": 750 }, { "epoch": 0.4651874521805662, "grad_norm": 16.3785457611084, "learning_rate": 2.3270055113288427e-05, "loss": 1.387, "step": 760 }, { "epoch": 0.47130833970925784, "grad_norm": 10.629354476928711, "learning_rate": 2.357624004898959e-05, "loss": 1.3784, "step": 770 }, { "epoch": 0.4774292272379495, "grad_norm": 21.637479782104492, "learning_rate": 2.3882424984690756e-05, "loss": 1.3841, "step": 780 }, { "epoch": 0.48355011476664117, "grad_norm": 9.542509078979492, "learning_rate": 2.4188609920391918e-05, "loss": 1.3896, "step": 790 }, { "epoch": 0.48967100229533284, "grad_norm": 4.976247787475586, "learning_rate": 2.449479485609308e-05, "loss": 1.3781, "step": 800 }, { "epoch": 0.4957918898240245, "grad_norm": 4.468277454376221, "learning_rate": 2.4800979791794247e-05, "loss": 1.3793, "step": 810 }, { "epoch": 0.5019127773527161, "grad_norm": 4.785976409912109, "learning_rate": 2.510716472749541e-05, "loss": 1.3981, "step": 820 }, { "epoch": 0.5080336648814078, "grad_norm": 30.538387298583984, "learning_rate": 2.5413349663196572e-05, "loss": 1.384, "step": 830 }, { "epoch": 0.5141545524100994, "grad_norm": 3.1451311111450195, "learning_rate": 2.571953459889774e-05, "loss": 1.3803, "step": 840 }, { "epoch": 0.5202754399387911, "grad_norm": 3.084407091140747, "learning_rate": 2.6025719534598898e-05, "loss": 1.3821, "step": 850 }, { "epoch": 0.5263963274674828, "grad_norm": 2.694706439971924, "learning_rate": 2.633190447030006e-05, "loss": 1.3875, "step": 860 }, { "epoch": 0.5325172149961744, "grad_norm": 9.250505447387695, "learning_rate": 2.6638089406001226e-05, "loss": 1.3763, "step": 870 }, { "epoch": 0.5386381025248661, "grad_norm": 2.440683364868164, "learning_rate": 2.6944274341702392e-05, "loss": 1.3596, "step": 880 }, { "epoch": 0.5447589900535578, "grad_norm": 4.1361165046691895, "learning_rate": 2.7250459277403552e-05, "loss": 1.3616, "step": 890 }, { "epoch": 0.5508798775822494, "grad_norm": 3.9648489952087402, "learning_rate": 2.7556644213104714e-05, "loss": 1.361, "step": 900 }, { "epoch": 0.5570007651109411, "grad_norm": 3.9357821941375732, "learning_rate": 2.786282914880588e-05, "loss": 1.3763, "step": 910 }, { "epoch": 0.5631216526396328, "grad_norm": 4.0848493576049805, "learning_rate": 2.8169014084507046e-05, "loss": 1.3744, "step": 920 }, { "epoch": 0.5692425401683244, "grad_norm": 172.5287628173828, "learning_rate": 2.847519902020821e-05, "loss": 1.3632, "step": 930 }, { "epoch": 0.5753634276970161, "grad_norm": 3.403939723968506, "learning_rate": 2.878138395590937e-05, "loss": 1.3731, "step": 940 }, { "epoch": 0.5814843152257078, "grad_norm": 3.526984930038452, "learning_rate": 2.9087568891610534e-05, "loss": 1.3827, "step": 950 }, { "epoch": 0.5876052027543994, "grad_norm": 23.79243278503418, "learning_rate": 2.9393753827311697e-05, "loss": 1.3808, "step": 960 }, { "epoch": 0.5937260902830911, "grad_norm": 4.624916076660156, "learning_rate": 2.9699938763012863e-05, "loss": 1.3669, "step": 970 }, { "epoch": 0.5998469778117828, "grad_norm": 3.8356945514678955, "learning_rate": 3.0006123698714022e-05, "loss": 1.3378, "step": 980 }, { "epoch": 0.6059678653404744, "grad_norm": 4.029366493225098, "learning_rate": 3.031230863441519e-05, "loss": 1.3691, "step": 990 }, { "epoch": 0.612088752869166, "grad_norm": 4.599860191345215, "learning_rate": 3.0618493570116355e-05, "loss": 1.3704, "step": 1000 }, { "epoch": 0.6182096403978576, "grad_norm": 9.668410301208496, "learning_rate": 3.0924678505817514e-05, "loss": 1.3767, "step": 1010 }, { "epoch": 0.6243305279265493, "grad_norm": 7.55600643157959, "learning_rate": 3.123086344151867e-05, "loss": 1.3661, "step": 1020 }, { "epoch": 0.630451415455241, "grad_norm": 4.51859712600708, "learning_rate": 3.153704837721984e-05, "loss": 1.3559, "step": 1030 }, { "epoch": 0.6365723029839326, "grad_norm": 3.436450242996216, "learning_rate": 3.1843233312921005e-05, "loss": 1.345, "step": 1040 }, { "epoch": 0.6426931905126243, "grad_norm": 3.319326877593994, "learning_rate": 3.214941824862217e-05, "loss": 1.3485, "step": 1050 }, { "epoch": 0.648814078041316, "grad_norm": 5.369412422180176, "learning_rate": 3.245560318432334e-05, "loss": 1.3516, "step": 1060 }, { "epoch": 0.6549349655700076, "grad_norm": 2.8182034492492676, "learning_rate": 3.2761788120024497e-05, "loss": 1.3703, "step": 1070 }, { "epoch": 0.6610558530986993, "grad_norm": 4.989190101623535, "learning_rate": 3.306797305572566e-05, "loss": 1.3815, "step": 1080 }, { "epoch": 0.667176740627391, "grad_norm": 6.330949783325195, "learning_rate": 3.337415799142682e-05, "loss": 1.3733, "step": 1090 }, { "epoch": 0.6732976281560826, "grad_norm": 2.702294111251831, "learning_rate": 3.368034292712799e-05, "loss": 1.3526, "step": 1100 }, { "epoch": 0.6794185156847743, "grad_norm": 4.195952892303467, "learning_rate": 3.398652786282915e-05, "loss": 1.339, "step": 1110 }, { "epoch": 0.685539403213466, "grad_norm": 6.287045478820801, "learning_rate": 3.429271279853031e-05, "loss": 1.3479, "step": 1120 }, { "epoch": 0.6916602907421576, "grad_norm": 2.7063398361206055, "learning_rate": 3.459889773423148e-05, "loss": 1.3689, "step": 1130 }, { "epoch": 0.6977811782708493, "grad_norm": 2.84989595413208, "learning_rate": 3.4905082669932645e-05, "loss": 1.3589, "step": 1140 }, { "epoch": 0.703902065799541, "grad_norm": 4.503838062286377, "learning_rate": 3.5211267605633805e-05, "loss": 1.3399, "step": 1150 }, { "epoch": 0.7100229533282326, "grad_norm": 2.6337504386901855, "learning_rate": 3.5517452541334964e-05, "loss": 1.3604, "step": 1160 }, { "epoch": 0.7161438408569243, "grad_norm": 3.980837821960449, "learning_rate": 3.582363747703613e-05, "loss": 1.3529, "step": 1170 }, { "epoch": 0.722264728385616, "grad_norm": 3.845048189163208, "learning_rate": 3.6129822412737296e-05, "loss": 1.3456, "step": 1180 }, { "epoch": 0.7283856159143076, "grad_norm": 2.598834753036499, "learning_rate": 3.643600734843846e-05, "loss": 1.3482, "step": 1190 }, { "epoch": 0.7345065034429993, "grad_norm": 2.160224676132202, "learning_rate": 3.674219228413962e-05, "loss": 1.3303, "step": 1200 }, { "epoch": 0.740627390971691, "grad_norm": 5.0525360107421875, "learning_rate": 3.704837721984079e-05, "loss": 1.3836, "step": 1210 }, { "epoch": 0.7467482785003825, "grad_norm": 2.637212038040161, "learning_rate": 3.735456215554195e-05, "loss": 1.3535, "step": 1220 }, { "epoch": 0.7528691660290742, "grad_norm": 2.752631425857544, "learning_rate": 3.766074709124311e-05, "loss": 1.351, "step": 1230 }, { "epoch": 0.7589900535577658, "grad_norm": 3.4024102687835693, "learning_rate": 3.796693202694427e-05, "loss": 1.3518, "step": 1240 }, { "epoch": 0.7651109410864575, "grad_norm": 3.2056336402893066, "learning_rate": 3.827311696264544e-05, "loss": 1.3415, "step": 1250 }, { "epoch": 0.7712318286151492, "grad_norm": 3.5055158138275146, "learning_rate": 3.8579301898346604e-05, "loss": 1.3502, "step": 1260 }, { "epoch": 0.7773527161438408, "grad_norm": 2.2886886596679688, "learning_rate": 3.888548683404777e-05, "loss": 1.3664, "step": 1270 }, { "epoch": 0.7834736036725325, "grad_norm": 3.2076241970062256, "learning_rate": 3.919167176974893e-05, "loss": 1.3293, "step": 1280 }, { "epoch": 0.7895944912012242, "grad_norm": 3.1704225540161133, "learning_rate": 3.9497856705450096e-05, "loss": 1.351, "step": 1290 }, { "epoch": 0.7957153787299158, "grad_norm": 2.2499842643737793, "learning_rate": 3.9804041641151255e-05, "loss": 1.3509, "step": 1300 }, { "epoch": 0.8018362662586075, "grad_norm": 2.698070526123047, "learning_rate": 4.011022657685242e-05, "loss": 1.3463, "step": 1310 }, { "epoch": 0.8079571537872992, "grad_norm": 1.8415290117263794, "learning_rate": 4.041641151255359e-05, "loss": 1.3214, "step": 1320 }, { "epoch": 0.8140780413159908, "grad_norm": 2.9317309856414795, "learning_rate": 4.0722596448254746e-05, "loss": 1.3581, "step": 1330 }, { "epoch": 0.8201989288446825, "grad_norm": 4.546383857727051, "learning_rate": 4.102878138395591e-05, "loss": 1.3413, "step": 1340 }, { "epoch": 0.8263198163733741, "grad_norm": 2.5486834049224854, "learning_rate": 4.133496631965708e-05, "loss": 1.3686, "step": 1350 }, { "epoch": 0.8324407039020658, "grad_norm": 2.265129804611206, "learning_rate": 4.164115125535824e-05, "loss": 1.3125, "step": 1360 }, { "epoch": 0.8385615914307575, "grad_norm": 2.1609714031219482, "learning_rate": 4.19473361910594e-05, "loss": 1.351, "step": 1370 }, { "epoch": 0.8446824789594491, "grad_norm": 3.51118540763855, "learning_rate": 4.225352112676056e-05, "loss": 1.3366, "step": 1380 }, { "epoch": 0.8508033664881408, "grad_norm": 3.0679023265838623, "learning_rate": 4.255970606246173e-05, "loss": 1.3472, "step": 1390 }, { "epoch": 0.8569242540168325, "grad_norm": 2.597992181777954, "learning_rate": 4.2865890998162895e-05, "loss": 1.3632, "step": 1400 }, { "epoch": 0.8630451415455241, "grad_norm": 2.0379843711853027, "learning_rate": 4.3172075933864054e-05, "loss": 1.3615, "step": 1410 }, { "epoch": 0.8691660290742158, "grad_norm": 2.303506374359131, "learning_rate": 4.347826086956522e-05, "loss": 1.3273, "step": 1420 }, { "epoch": 0.8752869166029075, "grad_norm": 2.795226812362671, "learning_rate": 4.3784445805266386e-05, "loss": 1.3475, "step": 1430 }, { "epoch": 0.881407804131599, "grad_norm": 2.218827486038208, "learning_rate": 4.4090630740967546e-05, "loss": 1.3346, "step": 1440 }, { "epoch": 0.8875286916602907, "grad_norm": 1.8111498355865479, "learning_rate": 4.439681567666871e-05, "loss": 1.3135, "step": 1450 }, { "epoch": 0.8936495791889824, "grad_norm": 2.2712912559509277, "learning_rate": 4.470300061236987e-05, "loss": 1.342, "step": 1460 }, { "epoch": 0.899770466717674, "grad_norm": 2.1051836013793945, "learning_rate": 4.500918554807104e-05, "loss": 1.3249, "step": 1470 }, { "epoch": 0.9058913542463657, "grad_norm": 3.4877002239227295, "learning_rate": 4.53153704837722e-05, "loss": 1.3476, "step": 1480 }, { "epoch": 0.9120122417750574, "grad_norm": 1.686214566230774, "learning_rate": 4.562155541947337e-05, "loss": 1.3266, "step": 1490 }, { "epoch": 0.918133129303749, "grad_norm": 2.8292617797851562, "learning_rate": 4.592774035517453e-05, "loss": 1.3308, "step": 1500 }, { "epoch": 0.9242540168324407, "grad_norm": 1.6535699367523193, "learning_rate": 4.623392529087569e-05, "loss": 1.3368, "step": 1510 }, { "epoch": 0.9303749043611323, "grad_norm": 2.361920118331909, "learning_rate": 4.6540110226576854e-05, "loss": 1.3164, "step": 1520 }, { "epoch": 0.936495791889824, "grad_norm": 1.7258111238479614, "learning_rate": 4.684629516227802e-05, "loss": 1.3396, "step": 1530 }, { "epoch": 0.9426166794185157, "grad_norm": 1.9547419548034668, "learning_rate": 4.715248009797918e-05, "loss": 1.3432, "step": 1540 }, { "epoch": 0.9487375669472073, "grad_norm": 3.920701742172241, "learning_rate": 4.7458665033680345e-05, "loss": 1.3296, "step": 1550 }, { "epoch": 0.954858454475899, "grad_norm": 2.1728620529174805, "learning_rate": 4.776484996938151e-05, "loss": 1.3278, "step": 1560 }, { "epoch": 0.9609793420045907, "grad_norm": 4.015989780426025, "learning_rate": 4.807103490508267e-05, "loss": 1.3238, "step": 1570 }, { "epoch": 0.9671002295332823, "grad_norm": 3.3949761390686035, "learning_rate": 4.8377219840783837e-05, "loss": 1.3326, "step": 1580 }, { "epoch": 0.973221117061974, "grad_norm": 2.3282153606414795, "learning_rate": 4.8683404776484996e-05, "loss": 1.3495, "step": 1590 }, { "epoch": 0.9793420045906657, "grad_norm": 2.321676731109619, "learning_rate": 4.898958971218616e-05, "loss": 1.3381, "step": 1600 }, { "epoch": 0.9854628921193573, "grad_norm": 2.4791972637176514, "learning_rate": 4.929577464788733e-05, "loss": 1.3193, "step": 1610 }, { "epoch": 0.991583779648049, "grad_norm": 3.545558214187622, "learning_rate": 4.9601959583588494e-05, "loss": 1.3263, "step": 1620 }, { "epoch": 0.9977046671767407, "grad_norm": 2.3265316486358643, "learning_rate": 4.990814451928965e-05, "loss": 1.3156, "step": 1630 }, { "epoch": 0.9995409334353481, "eval_accuracy": 0.44772257718942493, "eval_loss": 1.2975871562957764, "eval_runtime": 2616.4379, "eval_samples_per_second": 79.916, "eval_steps_per_second": 0.625, "step": 1633 }, { "epoch": 1.0038255547054322, "grad_norm": 2.3401131629943848, "learning_rate": 4.997618561611214e-05, "loss": 1.3378, "step": 1640 }, { "epoch": 1.009946442234124, "grad_norm": 2.454505443572998, "learning_rate": 4.994216506770089e-05, "loss": 1.3076, "step": 1650 }, { "epoch": 1.0160673297628156, "grad_norm": 2.544475793838501, "learning_rate": 4.990814451928965e-05, "loss": 1.3284, "step": 1660 }, { "epoch": 1.0221882172915073, "grad_norm": 3.767465114593506, "learning_rate": 4.9874123970878414e-05, "loss": 1.3108, "step": 1670 }, { "epoch": 1.0283091048201989, "grad_norm": 3.772216796875, "learning_rate": 4.984010342246717e-05, "loss": 1.309, "step": 1680 }, { "epoch": 1.0344299923488907, "grad_norm": 1.745509147644043, "learning_rate": 4.980608287405593e-05, "loss": 1.337, "step": 1690 }, { "epoch": 1.0405508798775822, "grad_norm": 2.45389723777771, "learning_rate": 4.977206232564469e-05, "loss": 1.3282, "step": 1700 }, { "epoch": 1.046671767406274, "grad_norm": 2.846754789352417, "learning_rate": 4.973804177723345e-05, "loss": 1.3403, "step": 1710 }, { "epoch": 1.0527926549349655, "grad_norm": 2.496070146560669, "learning_rate": 4.970402122882221e-05, "loss": 1.3135, "step": 1720 }, { "epoch": 1.0589135424636573, "grad_norm": 3.4331891536712646, "learning_rate": 4.967000068041097e-05, "loss": 1.3359, "step": 1730 }, { "epoch": 1.0650344299923489, "grad_norm": 2.686443328857422, "learning_rate": 4.963598013199973e-05, "loss": 1.3241, "step": 1740 }, { "epoch": 1.0711553175210407, "grad_norm": 2.017503261566162, "learning_rate": 4.9601959583588494e-05, "loss": 1.3314, "step": 1750 }, { "epoch": 1.0772762050497322, "grad_norm": 3.1370229721069336, "learning_rate": 4.9567939035177255e-05, "loss": 1.3204, "step": 1760 }, { "epoch": 1.0833970925784238, "grad_norm": 2.5512239933013916, "learning_rate": 4.953391848676601e-05, "loss": 1.3321, "step": 1770 }, { "epoch": 1.0895179801071155, "grad_norm": 2.360032558441162, "learning_rate": 4.949989793835477e-05, "loss": 1.3228, "step": 1780 }, { "epoch": 1.095638867635807, "grad_norm": 2.947443962097168, "learning_rate": 4.9465877389943524e-05, "loss": 1.3247, "step": 1790 }, { "epoch": 1.1017597551644989, "grad_norm": 2.6693966388702393, "learning_rate": 4.9431856841532285e-05, "loss": 1.3259, "step": 1800 }, { "epoch": 1.1078806426931904, "grad_norm": 3.112257957458496, "learning_rate": 4.9397836293121045e-05, "loss": 1.3237, "step": 1810 }, { "epoch": 1.1140015302218822, "grad_norm": 2.609961986541748, "learning_rate": 4.9363815744709806e-05, "loss": 1.3246, "step": 1820 }, { "epoch": 1.1201224177505738, "grad_norm": 3.4597411155700684, "learning_rate": 4.932979519629857e-05, "loss": 1.299, "step": 1830 }, { "epoch": 1.1262433052792655, "grad_norm": 2.242790937423706, "learning_rate": 4.929577464788733e-05, "loss": 1.3098, "step": 1840 }, { "epoch": 1.132364192807957, "grad_norm": 2.5282511711120605, "learning_rate": 4.926175409947609e-05, "loss": 1.3345, "step": 1850 }, { "epoch": 1.1384850803366489, "grad_norm": 2.2986130714416504, "learning_rate": 4.922773355106485e-05, "loss": 1.3339, "step": 1860 }, { "epoch": 1.1446059678653404, "grad_norm": 5.559305191040039, "learning_rate": 4.919371300265361e-05, "loss": 1.3338, "step": 1870 }, { "epoch": 1.1507268553940322, "grad_norm": 2.9165046215057373, "learning_rate": 4.9159692454242364e-05, "loss": 1.322, "step": 1880 }, { "epoch": 1.1568477429227237, "grad_norm": 2.179335594177246, "learning_rate": 4.9125671905831125e-05, "loss": 1.3289, "step": 1890 }, { "epoch": 1.1629686304514155, "grad_norm": 3.7216742038726807, "learning_rate": 4.909165135741988e-05, "loss": 1.3033, "step": 1900 }, { "epoch": 1.169089517980107, "grad_norm": 2.7910213470458984, "learning_rate": 4.905763080900864e-05, "loss": 1.3086, "step": 1910 }, { "epoch": 1.1752104055087988, "grad_norm": 3.217575788497925, "learning_rate": 4.90236102605974e-05, "loss": 1.3194, "step": 1920 }, { "epoch": 1.1813312930374904, "grad_norm": 6.850343704223633, "learning_rate": 4.898958971218616e-05, "loss": 1.3253, "step": 1930 }, { "epoch": 1.1874521805661822, "grad_norm": 2.523886203765869, "learning_rate": 4.895556916377492e-05, "loss": 1.3291, "step": 1940 }, { "epoch": 1.1935730680948737, "grad_norm": 2.4011008739471436, "learning_rate": 4.8921548615363684e-05, "loss": 1.3201, "step": 1950 }, { "epoch": 1.1996939556235655, "grad_norm": 2.7910492420196533, "learning_rate": 4.8887528066952444e-05, "loss": 1.3376, "step": 1960 }, { "epoch": 1.205814843152257, "grad_norm": 2.3480148315429688, "learning_rate": 4.8853507518541205e-05, "loss": 1.315, "step": 1970 }, { "epoch": 1.2119357306809486, "grad_norm": 3.9389758110046387, "learning_rate": 4.8819486970129966e-05, "loss": 1.3054, "step": 1980 }, { "epoch": 1.2180566182096404, "grad_norm": 2.182918071746826, "learning_rate": 4.878546642171872e-05, "loss": 1.3312, "step": 1990 }, { "epoch": 1.2241775057383322, "grad_norm": 5.687019348144531, "learning_rate": 4.8751445873307474e-05, "loss": 1.3041, "step": 2000 }, { "epoch": 1.2302983932670237, "grad_norm": 1.9981379508972168, "learning_rate": 4.8717425324896235e-05, "loss": 1.3046, "step": 2010 }, { "epoch": 1.2364192807957153, "grad_norm": 2.221470832824707, "learning_rate": 4.8683404776484996e-05, "loss": 1.2983, "step": 2020 }, { "epoch": 1.242540168324407, "grad_norm": 3.1240041255950928, "learning_rate": 4.864938422807376e-05, "loss": 1.2943, "step": 2030 }, { "epoch": 1.2486610558530986, "grad_norm": 2.941997528076172, "learning_rate": 4.861536367966252e-05, "loss": 1.301, "step": 2040 }, { "epoch": 1.2547819433817904, "grad_norm": 2.6538472175598145, "learning_rate": 4.858134313125128e-05, "loss": 1.3127, "step": 2050 }, { "epoch": 1.260902830910482, "grad_norm": 4.265965938568115, "learning_rate": 4.854732258284004e-05, "loss": 1.3172, "step": 2060 }, { "epoch": 1.2670237184391737, "grad_norm": 3.689883232116699, "learning_rate": 4.85133020344288e-05, "loss": 1.3123, "step": 2070 }, { "epoch": 1.2731446059678653, "grad_norm": 2.039571523666382, "learning_rate": 4.847928148601756e-05, "loss": 1.3197, "step": 2080 }, { "epoch": 1.279265493496557, "grad_norm": 3.8511271476745605, "learning_rate": 4.8445260937606315e-05, "loss": 1.3123, "step": 2090 }, { "epoch": 1.2853863810252486, "grad_norm": 1.8921153545379639, "learning_rate": 4.8411240389195076e-05, "loss": 1.3335, "step": 2100 }, { "epoch": 1.2915072685539404, "grad_norm": 2.2640275955200195, "learning_rate": 4.8377219840783837e-05, "loss": 1.3187, "step": 2110 }, { "epoch": 1.297628156082632, "grad_norm": 1.9218261241912842, "learning_rate": 4.834319929237259e-05, "loss": 1.3124, "step": 2120 }, { "epoch": 1.3037490436113237, "grad_norm": 8.252876281738281, "learning_rate": 4.830917874396135e-05, "loss": 1.316, "step": 2130 }, { "epoch": 1.3098699311400153, "grad_norm": 4.976735591888428, "learning_rate": 4.827515819555011e-05, "loss": 1.3164, "step": 2140 }, { "epoch": 1.315990818668707, "grad_norm": 4.07997989654541, "learning_rate": 4.824113764713887e-05, "loss": 1.3076, "step": 2150 }, { "epoch": 1.3221117061973986, "grad_norm": 4.033842086791992, "learning_rate": 4.8207117098727634e-05, "loss": 1.3206, "step": 2160 }, { "epoch": 1.3282325937260904, "grad_norm": 2.3117096424102783, "learning_rate": 4.8173096550316395e-05, "loss": 1.3358, "step": 2170 }, { "epoch": 1.334353481254782, "grad_norm": 2.9266395568847656, "learning_rate": 4.8139076001905156e-05, "loss": 1.3044, "step": 2180 }, { "epoch": 1.3404743687834735, "grad_norm": 2.5126564502716064, "learning_rate": 4.8105055453493916e-05, "loss": 1.3028, "step": 2190 }, { "epoch": 1.3465952563121653, "grad_norm": 3.317023277282715, "learning_rate": 4.807103490508267e-05, "loss": 1.3223, "step": 2200 }, { "epoch": 1.352716143840857, "grad_norm": 2.9920036792755127, "learning_rate": 4.803701435667143e-05, "loss": 1.3312, "step": 2210 }, { "epoch": 1.3588370313695486, "grad_norm": 2.6706953048706055, "learning_rate": 4.800299380826019e-05, "loss": 1.2947, "step": 2220 }, { "epoch": 1.3649579188982401, "grad_norm": 1.970794439315796, "learning_rate": 4.7968973259848946e-05, "loss": 1.3094, "step": 2230 }, { "epoch": 1.371078806426932, "grad_norm": 3.3308029174804688, "learning_rate": 4.793495271143771e-05, "loss": 1.3207, "step": 2240 }, { "epoch": 1.3771996939556237, "grad_norm": 2.6538379192352295, "learning_rate": 4.790093216302647e-05, "loss": 1.2917, "step": 2250 }, { "epoch": 1.3833205814843152, "grad_norm": 1.9290739297866821, "learning_rate": 4.786691161461523e-05, "loss": 1.3194, "step": 2260 }, { "epoch": 1.3894414690130068, "grad_norm": 2.5794217586517334, "learning_rate": 4.783289106620399e-05, "loss": 1.3156, "step": 2270 }, { "epoch": 1.3955623565416986, "grad_norm": 2.9146995544433594, "learning_rate": 4.779887051779275e-05, "loss": 1.3009, "step": 2280 }, { "epoch": 1.4016832440703901, "grad_norm": 3.2218544483184814, "learning_rate": 4.776484996938151e-05, "loss": 1.3091, "step": 2290 }, { "epoch": 1.407804131599082, "grad_norm": 3.2916765213012695, "learning_rate": 4.773082942097027e-05, "loss": 1.2991, "step": 2300 }, { "epoch": 1.4139250191277735, "grad_norm": 2.5758864879608154, "learning_rate": 4.7696808872559026e-05, "loss": 1.3166, "step": 2310 }, { "epoch": 1.4200459066564652, "grad_norm": 2.873842239379883, "learning_rate": 4.766278832414779e-05, "loss": 1.3322, "step": 2320 }, { "epoch": 1.4261667941851568, "grad_norm": 2.3912835121154785, "learning_rate": 4.762876777573655e-05, "loss": 1.3195, "step": 2330 }, { "epoch": 1.4322876817138486, "grad_norm": 1.9144972562789917, "learning_rate": 4.759474722732531e-05, "loss": 1.3046, "step": 2340 }, { "epoch": 1.4384085692425401, "grad_norm": 3.2760283946990967, "learning_rate": 4.756072667891406e-05, "loss": 1.2808, "step": 2350 }, { "epoch": 1.444529456771232, "grad_norm": 2.616299629211426, "learning_rate": 4.7526706130502823e-05, "loss": 1.2997, "step": 2360 }, { "epoch": 1.4506503442999235, "grad_norm": 2.84455943107605, "learning_rate": 4.7492685582091584e-05, "loss": 1.3005, "step": 2370 }, { "epoch": 1.4567712318286152, "grad_norm": 3.6488397121429443, "learning_rate": 4.7458665033680345e-05, "loss": 1.284, "step": 2380 }, { "epoch": 1.4628921193573068, "grad_norm": 3.8589773178100586, "learning_rate": 4.7424644485269106e-05, "loss": 1.329, "step": 2390 }, { "epoch": 1.4690130068859983, "grad_norm": 2.23037052154541, "learning_rate": 4.739062393685787e-05, "loss": 1.3123, "step": 2400 }, { "epoch": 1.4751338944146901, "grad_norm": 1.8849884271621704, "learning_rate": 4.735660338844662e-05, "loss": 1.3062, "step": 2410 }, { "epoch": 1.481254781943382, "grad_norm": 1.9689738750457764, "learning_rate": 4.732258284003538e-05, "loss": 1.3102, "step": 2420 }, { "epoch": 1.4873756694720734, "grad_norm": 2.845853328704834, "learning_rate": 4.728856229162414e-05, "loss": 1.3123, "step": 2430 }, { "epoch": 1.493496557000765, "grad_norm": 2.8484694957733154, "learning_rate": 4.72545417432129e-05, "loss": 1.3163, "step": 2440 }, { "epoch": 1.4996174445294568, "grad_norm": 1.8601232767105103, "learning_rate": 4.7220521194801664e-05, "loss": 1.3182, "step": 2450 }, { "epoch": 1.5057383320581486, "grad_norm": 2.9465630054473877, "learning_rate": 4.7186500646390425e-05, "loss": 1.2982, "step": 2460 }, { "epoch": 1.51185921958684, "grad_norm": 2.2005507946014404, "learning_rate": 4.715248009797918e-05, "loss": 1.3187, "step": 2470 }, { "epoch": 1.5179801071155317, "grad_norm": 2.624022960662842, "learning_rate": 4.711845954956794e-05, "loss": 1.3099, "step": 2480 }, { "epoch": 1.5241009946442234, "grad_norm": 2.030693531036377, "learning_rate": 4.70844390011567e-05, "loss": 1.3149, "step": 2490 }, { "epoch": 1.5302218821729152, "grad_norm": 2.416512966156006, "learning_rate": 4.705041845274546e-05, "loss": 1.2993, "step": 2500 }, { "epoch": 1.5363427697016068, "grad_norm": 3.0876128673553467, "learning_rate": 4.701639790433422e-05, "loss": 1.3011, "step": 2510 }, { "epoch": 1.5424636572302983, "grad_norm": 2.3045902252197266, "learning_rate": 4.6982377355922977e-05, "loss": 1.3061, "step": 2520 }, { "epoch": 1.54858454475899, "grad_norm": 2.438272476196289, "learning_rate": 4.694835680751174e-05, "loss": 1.3118, "step": 2530 }, { "epoch": 1.5547054322876819, "grad_norm": 2.2852370738983154, "learning_rate": 4.69143362591005e-05, "loss": 1.3074, "step": 2540 }, { "epoch": 1.5608263198163734, "grad_norm": 3.027848958969116, "learning_rate": 4.688031571068926e-05, "loss": 1.2948, "step": 2550 }, { "epoch": 1.566947207345065, "grad_norm": 2.1234147548675537, "learning_rate": 4.684629516227802e-05, "loss": 1.299, "step": 2560 }, { "epoch": 1.5730680948737565, "grad_norm": 2.619126558303833, "learning_rate": 4.681227461386678e-05, "loss": 1.298, "step": 2570 }, { "epoch": 1.5791889824024483, "grad_norm": 2.670396327972412, "learning_rate": 4.677825406545554e-05, "loss": 1.2962, "step": 2580 }, { "epoch": 1.58530986993114, "grad_norm": 2.637221097946167, "learning_rate": 4.6744233517044296e-05, "loss": 1.2873, "step": 2590 }, { "epoch": 1.5914307574598316, "grad_norm": 6.313690185546875, "learning_rate": 4.6710212968633056e-05, "loss": 1.3156, "step": 2600 }, { "epoch": 1.5975516449885232, "grad_norm": 2.477229356765747, "learning_rate": 4.667619242022182e-05, "loss": 1.3134, "step": 2610 }, { "epoch": 1.603672532517215, "grad_norm": 2.9845306873321533, "learning_rate": 4.664217187181058e-05, "loss": 1.333, "step": 2620 }, { "epoch": 1.6097934200459068, "grad_norm": 2.5738213062286377, "learning_rate": 4.660815132339933e-05, "loss": 1.3083, "step": 2630 }, { "epoch": 1.6159143075745983, "grad_norm": 2.4613943099975586, "learning_rate": 4.657413077498809e-05, "loss": 1.3116, "step": 2640 }, { "epoch": 1.6220351951032899, "grad_norm": 2.152653932571411, "learning_rate": 4.6540110226576854e-05, "loss": 1.3051, "step": 2650 }, { "epoch": 1.6281560826319816, "grad_norm": 2.0958948135375977, "learning_rate": 4.6506089678165615e-05, "loss": 1.3, "step": 2660 }, { "epoch": 1.6342769701606734, "grad_norm": 2.582873582839966, "learning_rate": 4.6472069129754375e-05, "loss": 1.3144, "step": 2670 }, { "epoch": 1.640397857689365, "grad_norm": 2.145533561706543, "learning_rate": 4.6438048581343136e-05, "loss": 1.3013, "step": 2680 }, { "epoch": 1.6465187452180565, "grad_norm": 1.912956714630127, "learning_rate": 4.64040280329319e-05, "loss": 1.3054, "step": 2690 }, { "epoch": 1.6526396327467483, "grad_norm": 1.9682807922363281, "learning_rate": 4.637000748452065e-05, "loss": 1.3341, "step": 2700 }, { "epoch": 1.65876052027544, "grad_norm": 1.8502757549285889, "learning_rate": 4.633598693610941e-05, "loss": 1.3353, "step": 2710 }, { "epoch": 1.6648814078041316, "grad_norm": 2.60172176361084, "learning_rate": 4.630196638769817e-05, "loss": 1.2979, "step": 2720 }, { "epoch": 1.6710022953328232, "grad_norm": 1.7690106630325317, "learning_rate": 4.626794583928693e-05, "loss": 1.2969, "step": 2730 }, { "epoch": 1.677123182861515, "grad_norm": 1.817347526550293, "learning_rate": 4.623392529087569e-05, "loss": 1.306, "step": 2740 }, { "epoch": 1.6832440703902067, "grad_norm": 1.666766881942749, "learning_rate": 4.619990474246445e-05, "loss": 1.3031, "step": 2750 }, { "epoch": 1.6893649579188983, "grad_norm": 2.351947784423828, "learning_rate": 4.616588419405321e-05, "loss": 1.2978, "step": 2760 }, { "epoch": 1.6954858454475898, "grad_norm": 2.820448875427246, "learning_rate": 4.613186364564197e-05, "loss": 1.3231, "step": 2770 }, { "epoch": 1.7016067329762814, "grad_norm": 1.8438854217529297, "learning_rate": 4.609784309723073e-05, "loss": 1.3081, "step": 2780 }, { "epoch": 1.7077276205049732, "grad_norm": 4.818421840667725, "learning_rate": 4.606382254881949e-05, "loss": 1.318, "step": 2790 }, { "epoch": 1.713848508033665, "grad_norm": 1.912014365196228, "learning_rate": 4.602980200040825e-05, "loss": 1.2886, "step": 2800 }, { "epoch": 1.7199693955623565, "grad_norm": 3.0061912536621094, "learning_rate": 4.5995781451997014e-05, "loss": 1.2966, "step": 2810 }, { "epoch": 1.726090283091048, "grad_norm": 1.7573401927947998, "learning_rate": 4.596176090358577e-05, "loss": 1.3127, "step": 2820 }, { "epoch": 1.7322111706197398, "grad_norm": 1.9873417615890503, "learning_rate": 4.592774035517453e-05, "loss": 1.3052, "step": 2830 }, { "epoch": 1.7383320581484316, "grad_norm": 2.0930285453796387, "learning_rate": 4.589371980676328e-05, "loss": 1.2923, "step": 2840 }, { "epoch": 1.7444529456771232, "grad_norm": 1.8706746101379395, "learning_rate": 4.585969925835204e-05, "loss": 1.2831, "step": 2850 }, { "epoch": 1.7505738332058147, "grad_norm": 1.7447609901428223, "learning_rate": 4.5825678709940804e-05, "loss": 1.3013, "step": 2860 }, { "epoch": 1.7566947207345065, "grad_norm": 1.7126678228378296, "learning_rate": 4.5791658161529565e-05, "loss": 1.2795, "step": 2870 }, { "epoch": 1.7628156082631983, "grad_norm": 3.166419267654419, "learning_rate": 4.5757637613118326e-05, "loss": 1.298, "step": 2880 }, { "epoch": 1.7689364957918898, "grad_norm": 1.741654396057129, "learning_rate": 4.572361706470709e-05, "loss": 1.3289, "step": 2890 }, { "epoch": 1.7750573833205814, "grad_norm": 3.99870228767395, "learning_rate": 4.568959651629585e-05, "loss": 1.3163, "step": 2900 }, { "epoch": 1.7811782708492732, "grad_norm": 2.293135404586792, "learning_rate": 4.565557596788461e-05, "loss": 1.2951, "step": 2910 }, { "epoch": 1.787299158377965, "grad_norm": 4.952125549316406, "learning_rate": 4.562155541947337e-05, "loss": 1.2998, "step": 2920 }, { "epoch": 1.7934200459066565, "grad_norm": 1.812955379486084, "learning_rate": 4.558753487106212e-05, "loss": 1.2733, "step": 2930 }, { "epoch": 1.799540933435348, "grad_norm": 1.9405971765518188, "learning_rate": 4.5553514322650884e-05, "loss": 1.3216, "step": 2940 }, { "epoch": 1.8056618209640398, "grad_norm": 2.513551712036133, "learning_rate": 4.551949377423964e-05, "loss": 1.2977, "step": 2950 }, { "epoch": 1.8117827084927316, "grad_norm": 2.0316007137298584, "learning_rate": 4.54854732258284e-05, "loss": 1.2958, "step": 2960 }, { "epoch": 1.8179035960214232, "grad_norm": 2.4681851863861084, "learning_rate": 4.545145267741716e-05, "loss": 1.2789, "step": 2970 }, { "epoch": 1.8240244835501147, "grad_norm": 2.1180663108825684, "learning_rate": 4.541743212900592e-05, "loss": 1.2687, "step": 2980 }, { "epoch": 1.8301453710788065, "grad_norm": 1.6326196193695068, "learning_rate": 4.538341158059468e-05, "loss": 1.314, "step": 2990 }, { "epoch": 1.836266258607498, "grad_norm": 1.7477805614471436, "learning_rate": 4.534939103218344e-05, "loss": 1.3038, "step": 3000 }, { "epoch": 1.8423871461361898, "grad_norm": 1.446861743927002, "learning_rate": 4.53153704837722e-05, "loss": 1.3192, "step": 3010 }, { "epoch": 1.8485080336648814, "grad_norm": 2.2364675998687744, "learning_rate": 4.5281349935360964e-05, "loss": 1.2723, "step": 3020 }, { "epoch": 1.854628921193573, "grad_norm": 1.59337317943573, "learning_rate": 4.5247329386949725e-05, "loss": 1.2792, "step": 3030 }, { "epoch": 1.8607498087222647, "grad_norm": 2.401291847229004, "learning_rate": 4.521330883853848e-05, "loss": 1.2842, "step": 3040 }, { "epoch": 1.8668706962509565, "grad_norm": 2.394993782043457, "learning_rate": 4.517928829012724e-05, "loss": 1.3205, "step": 3050 }, { "epoch": 1.872991583779648, "grad_norm": 2.0789761543273926, "learning_rate": 4.5145267741715994e-05, "loss": 1.2946, "step": 3060 }, { "epoch": 1.8791124713083396, "grad_norm": 4.516623020172119, "learning_rate": 4.5111247193304755e-05, "loss": 1.2772, "step": 3070 }, { "epoch": 1.8852333588370314, "grad_norm": 2.66314959526062, "learning_rate": 4.5077226644893515e-05, "loss": 1.2999, "step": 3080 }, { "epoch": 1.8913542463657231, "grad_norm": 2.047537088394165, "learning_rate": 4.5043206096482276e-05, "loss": 1.2849, "step": 3090 }, { "epoch": 1.8974751338944147, "grad_norm": 2.709646701812744, "learning_rate": 4.500918554807104e-05, "loss": 1.2807, "step": 3100 }, { "epoch": 1.9035960214231062, "grad_norm": 3.5094571113586426, "learning_rate": 4.49751649996598e-05, "loss": 1.2962, "step": 3110 }, { "epoch": 1.909716908951798, "grad_norm": 2.9054393768310547, "learning_rate": 4.494114445124856e-05, "loss": 1.2791, "step": 3120 }, { "epoch": 1.9158377964804898, "grad_norm": 2.601428747177124, "learning_rate": 4.490712390283732e-05, "loss": 1.3076, "step": 3130 }, { "epoch": 1.9219586840091814, "grad_norm": 2.667839527130127, "learning_rate": 4.487310335442608e-05, "loss": 1.2874, "step": 3140 }, { "epoch": 1.928079571537873, "grad_norm": 1.9442615509033203, "learning_rate": 4.4839082806014834e-05, "loss": 1.3045, "step": 3150 }, { "epoch": 1.9342004590665647, "grad_norm": 1.9131063222885132, "learning_rate": 4.4805062257603595e-05, "loss": 1.2915, "step": 3160 }, { "epoch": 1.9403213465952565, "grad_norm": 2.0807864665985107, "learning_rate": 4.477104170919235e-05, "loss": 1.2856, "step": 3170 }, { "epoch": 1.946442234123948, "grad_norm": 2.696155071258545, "learning_rate": 4.473702116078111e-05, "loss": 1.2945, "step": 3180 }, { "epoch": 1.9525631216526396, "grad_norm": 2.16047739982605, "learning_rate": 4.470300061236987e-05, "loss": 1.2889, "step": 3190 }, { "epoch": 1.9586840091813313, "grad_norm": 1.872473120689392, "learning_rate": 4.466898006395863e-05, "loss": 1.2869, "step": 3200 }, { "epoch": 1.9648048967100231, "grad_norm": 1.9989575147628784, "learning_rate": 4.463495951554739e-05, "loss": 1.2999, "step": 3210 }, { "epoch": 1.9709257842387147, "grad_norm": 1.451534390449524, "learning_rate": 4.4600938967136154e-05, "loss": 1.2875, "step": 3220 }, { "epoch": 1.9770466717674062, "grad_norm": 1.6723949909210205, "learning_rate": 4.4566918418724914e-05, "loss": 1.2951, "step": 3230 }, { "epoch": 1.9831675592960978, "grad_norm": 1.913266658782959, "learning_rate": 4.4532897870313675e-05, "loss": 1.2909, "step": 3240 }, { "epoch": 1.9892884468247896, "grad_norm": 2.985137462615967, "learning_rate": 4.449887732190243e-05, "loss": 1.2973, "step": 3250 }, { "epoch": 1.9954093343534813, "grad_norm": 2.2191321849823, "learning_rate": 4.446485677349119e-05, "loss": 1.2943, "step": 3260 }, { "epoch": 1.9996939556235653, "eval_accuracy": 0.4667951562918468, "eval_loss": 1.244300365447998, "eval_runtime": 2619.0656, "eval_samples_per_second": 79.836, "eval_steps_per_second": 0.624, "step": 3267 }, { "epoch": 2.001530221882173, "grad_norm": 1.887938380241394, "learning_rate": 4.443083622507995e-05, "loss": 1.2935, "step": 3270 }, { "epoch": 2.0076511094108644, "grad_norm": 2.1216981410980225, "learning_rate": 4.439681567666871e-05, "loss": 1.2836, "step": 3280 }, { "epoch": 2.0137719969395564, "grad_norm": 2.093498706817627, "learning_rate": 4.4362795128257466e-05, "loss": 1.2785, "step": 3290 }, { "epoch": 2.019892884468248, "grad_norm": 1.6856573820114136, "learning_rate": 4.432877457984623e-05, "loss": 1.3046, "step": 3300 }, { "epoch": 2.0260137719969395, "grad_norm": 2.1252543926239014, "learning_rate": 4.429475403143499e-05, "loss": 1.2838, "step": 3310 }, { "epoch": 2.032134659525631, "grad_norm": 2.125027894973755, "learning_rate": 4.426073348302375e-05, "loss": 1.2772, "step": 3320 }, { "epoch": 2.0382555470543227, "grad_norm": 2.1360585689544678, "learning_rate": 4.422671293461251e-05, "loss": 1.2974, "step": 3330 }, { "epoch": 2.0443764345830147, "grad_norm": 1.6038954257965088, "learning_rate": 4.419269238620127e-05, "loss": 1.2707, "step": 3340 }, { "epoch": 2.050497322111706, "grad_norm": 2.2439558506011963, "learning_rate": 4.415867183779003e-05, "loss": 1.2755, "step": 3350 }, { "epoch": 2.0566182096403978, "grad_norm": 1.75458824634552, "learning_rate": 4.4124651289378785e-05, "loss": 1.2673, "step": 3360 }, { "epoch": 2.0627390971690893, "grad_norm": 2.192709445953369, "learning_rate": 4.4090630740967546e-05, "loss": 1.2809, "step": 3370 }, { "epoch": 2.0688599846977813, "grad_norm": 2.699859142303467, "learning_rate": 4.4056610192556307e-05, "loss": 1.3, "step": 3380 }, { "epoch": 2.074980872226473, "grad_norm": 2.0490965843200684, "learning_rate": 4.402258964414507e-05, "loss": 1.2842, "step": 3390 }, { "epoch": 2.0811017597551644, "grad_norm": 2.2830417156219482, "learning_rate": 4.398856909573382e-05, "loss": 1.2604, "step": 3400 }, { "epoch": 2.087222647283856, "grad_norm": 1.8715858459472656, "learning_rate": 4.395454854732258e-05, "loss": 1.2829, "step": 3410 }, { "epoch": 2.093343534812548, "grad_norm": 3.053288221359253, "learning_rate": 4.392052799891134e-05, "loss": 1.2969, "step": 3420 }, { "epoch": 2.0994644223412395, "grad_norm": 2.098418712615967, "learning_rate": 4.3886507450500104e-05, "loss": 1.2879, "step": 3430 }, { "epoch": 2.105585309869931, "grad_norm": 2.1844096183776855, "learning_rate": 4.3852486902088865e-05, "loss": 1.2907, "step": 3440 }, { "epoch": 2.1117061973986226, "grad_norm": 2.036256790161133, "learning_rate": 4.3818466353677626e-05, "loss": 1.2921, "step": 3450 }, { "epoch": 2.1178270849273146, "grad_norm": 2.0535950660705566, "learning_rate": 4.3784445805266386e-05, "loss": 1.2739, "step": 3460 }, { "epoch": 2.123947972456006, "grad_norm": 2.0060815811157227, "learning_rate": 4.375042525685514e-05, "loss": 1.2891, "step": 3470 }, { "epoch": 2.1300688599846977, "grad_norm": 2.4420292377471924, "learning_rate": 4.37164047084439e-05, "loss": 1.2854, "step": 3480 }, { "epoch": 2.1361897475133893, "grad_norm": 1.8335957527160645, "learning_rate": 4.368238416003266e-05, "loss": 1.3028, "step": 3490 }, { "epoch": 2.1423106350420813, "grad_norm": 1.709681749343872, "learning_rate": 4.364836361162142e-05, "loss": 1.288, "step": 3500 }, { "epoch": 2.148431522570773, "grad_norm": 1.943090558052063, "learning_rate": 4.3614343063210184e-05, "loss": 1.2677, "step": 3510 }, { "epoch": 2.1545524100994644, "grad_norm": 2.027233600616455, "learning_rate": 4.358032251479894e-05, "loss": 1.283, "step": 3520 }, { "epoch": 2.160673297628156, "grad_norm": 1.6251754760742188, "learning_rate": 4.35463019663877e-05, "loss": 1.2895, "step": 3530 }, { "epoch": 2.1667941851568475, "grad_norm": 1.8065128326416016, "learning_rate": 4.351228141797646e-05, "loss": 1.2942, "step": 3540 }, { "epoch": 2.1729150726855395, "grad_norm": 2.0807602405548096, "learning_rate": 4.347826086956522e-05, "loss": 1.274, "step": 3550 }, { "epoch": 2.179035960214231, "grad_norm": 1.7792989015579224, "learning_rate": 4.344424032115398e-05, "loss": 1.2992, "step": 3560 }, { "epoch": 2.1851568477429226, "grad_norm": 2.005281448364258, "learning_rate": 4.3410219772742735e-05, "loss": 1.2703, "step": 3570 }, { "epoch": 2.191277735271614, "grad_norm": 2.485933780670166, "learning_rate": 4.3376199224331496e-05, "loss": 1.2949, "step": 3580 }, { "epoch": 2.197398622800306, "grad_norm": 1.4569703340530396, "learning_rate": 4.334217867592026e-05, "loss": 1.2788, "step": 3590 }, { "epoch": 2.2035195103289977, "grad_norm": 1.8452506065368652, "learning_rate": 4.330815812750902e-05, "loss": 1.2861, "step": 3600 }, { "epoch": 2.2096403978576893, "grad_norm": 1.921706199645996, "learning_rate": 4.327413757909778e-05, "loss": 1.2833, "step": 3610 }, { "epoch": 2.215761285386381, "grad_norm": 1.9338184595108032, "learning_rate": 4.324011703068654e-05, "loss": 1.2811, "step": 3620 }, { "epoch": 2.221882172915073, "grad_norm": 1.666922926902771, "learning_rate": 4.32060964822753e-05, "loss": 1.2631, "step": 3630 }, { "epoch": 2.2280030604437644, "grad_norm": 2.1670820713043213, "learning_rate": 4.3172075933864054e-05, "loss": 1.2949, "step": 3640 }, { "epoch": 2.234123947972456, "grad_norm": 1.980981707572937, "learning_rate": 4.3138055385452815e-05, "loss": 1.2569, "step": 3650 }, { "epoch": 2.2402448355011475, "grad_norm": 1.6385728120803833, "learning_rate": 4.3104034837041576e-05, "loss": 1.2807, "step": 3660 }, { "epoch": 2.2463657230298395, "grad_norm": 3.8424932956695557, "learning_rate": 4.307001428863034e-05, "loss": 1.2776, "step": 3670 }, { "epoch": 2.252486610558531, "grad_norm": 1.75303316116333, "learning_rate": 4.303599374021909e-05, "loss": 1.26, "step": 3680 }, { "epoch": 2.2586074980872226, "grad_norm": 2.4019761085510254, "learning_rate": 4.300197319180785e-05, "loss": 1.2846, "step": 3690 }, { "epoch": 2.264728385615914, "grad_norm": 1.8176641464233398, "learning_rate": 4.296795264339661e-05, "loss": 1.2877, "step": 3700 }, { "epoch": 2.270849273144606, "grad_norm": 3.1902916431427, "learning_rate": 4.293393209498537e-05, "loss": 1.2552, "step": 3710 }, { "epoch": 2.2769701606732977, "grad_norm": 1.9043002128601074, "learning_rate": 4.2899911546574134e-05, "loss": 1.2898, "step": 3720 }, { "epoch": 2.2830910482019893, "grad_norm": 1.6254369020462036, "learning_rate": 4.2865890998162895e-05, "loss": 1.2753, "step": 3730 }, { "epoch": 2.289211935730681, "grad_norm": 1.8241372108459473, "learning_rate": 4.2831870449751656e-05, "loss": 1.2722, "step": 3740 }, { "epoch": 2.295332823259373, "grad_norm": 1.5528113842010498, "learning_rate": 4.279784990134042e-05, "loss": 1.2576, "step": 3750 }, { "epoch": 2.3014537107880644, "grad_norm": 2.23053240776062, "learning_rate": 4.276382935292917e-05, "loss": 1.2991, "step": 3760 }, { "epoch": 2.307574598316756, "grad_norm": 2.2314915657043457, "learning_rate": 4.272980880451793e-05, "loss": 1.2812, "step": 3770 }, { "epoch": 2.3136954858454475, "grad_norm": 1.9923975467681885, "learning_rate": 4.269578825610669e-05, "loss": 1.3072, "step": 3780 }, { "epoch": 2.319816373374139, "grad_norm": 2.52502179145813, "learning_rate": 4.2661767707695446e-05, "loss": 1.2686, "step": 3790 }, { "epoch": 2.325937260902831, "grad_norm": 1.7113980054855347, "learning_rate": 4.262774715928421e-05, "loss": 1.2644, "step": 3800 }, { "epoch": 2.3320581484315226, "grad_norm": 1.640830636024475, "learning_rate": 4.259372661087297e-05, "loss": 1.2841, "step": 3810 }, { "epoch": 2.338179035960214, "grad_norm": 2.1533923149108887, "learning_rate": 4.255970606246173e-05, "loss": 1.2569, "step": 3820 }, { "epoch": 2.3442999234889057, "grad_norm": 1.8957128524780273, "learning_rate": 4.252568551405049e-05, "loss": 1.2557, "step": 3830 }, { "epoch": 2.3504208110175977, "grad_norm": 2.399122476577759, "learning_rate": 4.249166496563925e-05, "loss": 1.2755, "step": 3840 }, { "epoch": 2.3565416985462893, "grad_norm": 1.7818849086761475, "learning_rate": 4.245764441722801e-05, "loss": 1.2749, "step": 3850 }, { "epoch": 2.362662586074981, "grad_norm": 1.9884240627288818, "learning_rate": 4.242362386881677e-05, "loss": 1.2851, "step": 3860 }, { "epoch": 2.3687834736036724, "grad_norm": 1.8245644569396973, "learning_rate": 4.2389603320405526e-05, "loss": 1.2736, "step": 3870 }, { "epoch": 2.3749043611323644, "grad_norm": 2.0622308254241943, "learning_rate": 4.235558277199429e-05, "loss": 1.2661, "step": 3880 }, { "epoch": 2.381025248661056, "grad_norm": 1.7899513244628906, "learning_rate": 4.232156222358304e-05, "loss": 1.2735, "step": 3890 }, { "epoch": 2.3871461361897475, "grad_norm": 2.266698122024536, "learning_rate": 4.22875416751718e-05, "loss": 1.2875, "step": 3900 }, { "epoch": 2.393267023718439, "grad_norm": 1.6049247980117798, "learning_rate": 4.225352112676056e-05, "loss": 1.2709, "step": 3910 }, { "epoch": 2.399387911247131, "grad_norm": 2.069531202316284, "learning_rate": 4.2219500578349324e-05, "loss": 1.2725, "step": 3920 }, { "epoch": 2.4055087987758226, "grad_norm": 2.038214683532715, "learning_rate": 4.2185480029938085e-05, "loss": 1.284, "step": 3930 }, { "epoch": 2.411629686304514, "grad_norm": 1.7478989362716675, "learning_rate": 4.2151459481526845e-05, "loss": 1.2845, "step": 3940 }, { "epoch": 2.4177505738332057, "grad_norm": 1.536227822303772, "learning_rate": 4.2117438933115606e-05, "loss": 1.2724, "step": 3950 }, { "epoch": 2.4238714613618972, "grad_norm": 2.473824977874756, "learning_rate": 4.208341838470437e-05, "loss": 1.2793, "step": 3960 }, { "epoch": 2.4299923488905892, "grad_norm": 2.2828357219696045, "learning_rate": 4.204939783629313e-05, "loss": 1.2595, "step": 3970 }, { "epoch": 2.436113236419281, "grad_norm": 2.004650831222534, "learning_rate": 4.201537728788188e-05, "loss": 1.2778, "step": 3980 }, { "epoch": 2.4422341239479723, "grad_norm": 2.862926959991455, "learning_rate": 4.198135673947064e-05, "loss": 1.2625, "step": 3990 }, { "epoch": 2.4483550114766643, "grad_norm": 3.5888588428497314, "learning_rate": 4.19473361910594e-05, "loss": 1.2608, "step": 4000 }, { "epoch": 2.454475899005356, "grad_norm": 1.9902139902114868, "learning_rate": 4.191331564264816e-05, "loss": 1.2607, "step": 4010 }, { "epoch": 2.4605967865340475, "grad_norm": 2.3070342540740967, "learning_rate": 4.187929509423692e-05, "loss": 1.2555, "step": 4020 }, { "epoch": 2.466717674062739, "grad_norm": 1.9157483577728271, "learning_rate": 4.184527454582568e-05, "loss": 1.2881, "step": 4030 }, { "epoch": 2.4728385615914306, "grad_norm": 2.6677842140197754, "learning_rate": 4.181125399741444e-05, "loss": 1.2861, "step": 4040 }, { "epoch": 2.4789594491201226, "grad_norm": 1.9452474117279053, "learning_rate": 4.17772334490032e-05, "loss": 1.2648, "step": 4050 }, { "epoch": 2.485080336648814, "grad_norm": 2.034928798675537, "learning_rate": 4.174321290059196e-05, "loss": 1.2693, "step": 4060 }, { "epoch": 2.4912012241775057, "grad_norm": 2.1892433166503906, "learning_rate": 4.170919235218072e-05, "loss": 1.2807, "step": 4070 }, { "epoch": 2.4973221117061972, "grad_norm": 2.4424924850463867, "learning_rate": 4.1675171803769484e-05, "loss": 1.2849, "step": 4080 }, { "epoch": 2.503442999234889, "grad_norm": 2.0010828971862793, "learning_rate": 4.164115125535824e-05, "loss": 1.2695, "step": 4090 }, { "epoch": 2.5095638867635808, "grad_norm": 2.1089091300964355, "learning_rate": 4.1607130706947e-05, "loss": 1.2908, "step": 4100 }, { "epoch": 2.5156847742922723, "grad_norm": 2.363715887069702, "learning_rate": 4.157311015853575e-05, "loss": 1.2743, "step": 4110 }, { "epoch": 2.521805661820964, "grad_norm": 1.6981701850891113, "learning_rate": 4.153908961012451e-05, "loss": 1.2672, "step": 4120 }, { "epoch": 2.5279265493496554, "grad_norm": 1.7963359355926514, "learning_rate": 4.1505069061713274e-05, "loss": 1.2625, "step": 4130 }, { "epoch": 2.5340474368783474, "grad_norm": 2.267949104309082, "learning_rate": 4.1471048513302035e-05, "loss": 1.2701, "step": 4140 }, { "epoch": 2.540168324407039, "grad_norm": 1.7686315774917603, "learning_rate": 4.1437027964890796e-05, "loss": 1.2587, "step": 4150 }, { "epoch": 2.5462892119357305, "grad_norm": 1.8550716638565063, "learning_rate": 4.140300741647956e-05, "loss": 1.2511, "step": 4160 }, { "epoch": 2.5524100994644225, "grad_norm": 3.2846624851226807, "learning_rate": 4.136898686806832e-05, "loss": 1.2627, "step": 4170 }, { "epoch": 2.558530986993114, "grad_norm": 2.268218994140625, "learning_rate": 4.133496631965708e-05, "loss": 1.2658, "step": 4180 }, { "epoch": 2.5646518745218057, "grad_norm": 1.6908515691757202, "learning_rate": 4.130094577124584e-05, "loss": 1.2674, "step": 4190 }, { "epoch": 2.570772762050497, "grad_norm": 1.8642390966415405, "learning_rate": 4.126692522283459e-05, "loss": 1.2874, "step": 4200 }, { "epoch": 2.5768936495791888, "grad_norm": 2.945605516433716, "learning_rate": 4.1232904674423354e-05, "loss": 1.2729, "step": 4210 }, { "epoch": 2.5830145371078808, "grad_norm": 2.022144079208374, "learning_rate": 4.119888412601211e-05, "loss": 1.2687, "step": 4220 }, { "epoch": 2.5891354246365723, "grad_norm": 1.9110801219940186, "learning_rate": 4.116486357760087e-05, "loss": 1.2739, "step": 4230 }, { "epoch": 2.595256312165264, "grad_norm": 2.433802366256714, "learning_rate": 4.113084302918963e-05, "loss": 1.2572, "step": 4240 }, { "epoch": 2.601377199693956, "grad_norm": 1.9494119882583618, "learning_rate": 4.109682248077839e-05, "loss": 1.2653, "step": 4250 }, { "epoch": 2.6074980872226474, "grad_norm": 1.64384126663208, "learning_rate": 4.106280193236715e-05, "loss": 1.2791, "step": 4260 }, { "epoch": 2.613618974751339, "grad_norm": 2.202897787094116, "learning_rate": 4.102878138395591e-05, "loss": 1.2564, "step": 4270 }, { "epoch": 2.6197398622800305, "grad_norm": 2.0833334922790527, "learning_rate": 4.099476083554467e-05, "loss": 1.2915, "step": 4280 }, { "epoch": 2.625860749808722, "grad_norm": 2.360783815383911, "learning_rate": 4.0960740287133434e-05, "loss": 1.2879, "step": 4290 }, { "epoch": 2.631981637337414, "grad_norm": 1.80355966091156, "learning_rate": 4.092671973872219e-05, "loss": 1.2798, "step": 4300 }, { "epoch": 2.6381025248661056, "grad_norm": 2.43011212348938, "learning_rate": 4.089269919031095e-05, "loss": 1.2694, "step": 4310 }, { "epoch": 2.644223412394797, "grad_norm": 1.8871304988861084, "learning_rate": 4.085867864189971e-05, "loss": 1.2641, "step": 4320 }, { "epoch": 2.6503442999234887, "grad_norm": 2.0215072631835938, "learning_rate": 4.082465809348847e-05, "loss": 1.2608, "step": 4330 }, { "epoch": 2.6564651874521807, "grad_norm": 1.9137364625930786, "learning_rate": 4.0790637545077225e-05, "loss": 1.27, "step": 4340 }, { "epoch": 2.6625860749808723, "grad_norm": 1.7641111612319946, "learning_rate": 4.0756616996665985e-05, "loss": 1.3004, "step": 4350 }, { "epoch": 2.668706962509564, "grad_norm": 1.8382567167282104, "learning_rate": 4.0722596448254746e-05, "loss": 1.2596, "step": 4360 }, { "epoch": 2.6748278500382554, "grad_norm": 2.095576047897339, "learning_rate": 4.068857589984351e-05, "loss": 1.2685, "step": 4370 }, { "epoch": 2.680948737566947, "grad_norm": 1.7252328395843506, "learning_rate": 4.065455535143227e-05, "loss": 1.3021, "step": 4380 }, { "epoch": 2.687069625095639, "grad_norm": 3.130185604095459, "learning_rate": 4.062053480302103e-05, "loss": 1.2793, "step": 4390 }, { "epoch": 2.6931905126243305, "grad_norm": 1.8197978734970093, "learning_rate": 4.058651425460979e-05, "loss": 1.2679, "step": 4400 }, { "epoch": 2.699311400153022, "grad_norm": 2.7329905033111572, "learning_rate": 4.0552493706198544e-05, "loss": 1.2627, "step": 4410 }, { "epoch": 2.705432287681714, "grad_norm": 2.3365578651428223, "learning_rate": 4.0518473157787304e-05, "loss": 1.2552, "step": 4420 }, { "epoch": 2.7115531752104056, "grad_norm": 2.006134510040283, "learning_rate": 4.0484452609376065e-05, "loss": 1.2712, "step": 4430 }, { "epoch": 2.717674062739097, "grad_norm": 1.6508187055587769, "learning_rate": 4.0450432060964826e-05, "loss": 1.2739, "step": 4440 }, { "epoch": 2.7237949502677887, "grad_norm": 2.231863021850586, "learning_rate": 4.041641151255359e-05, "loss": 1.2694, "step": 4450 }, { "epoch": 2.7299158377964803, "grad_norm": 1.7678385972976685, "learning_rate": 4.038239096414234e-05, "loss": 1.2696, "step": 4460 }, { "epoch": 2.7360367253251723, "grad_norm": 1.7382951974868774, "learning_rate": 4.03483704157311e-05, "loss": 1.2863, "step": 4470 }, { "epoch": 2.742157612853864, "grad_norm": 1.6372942924499512, "learning_rate": 4.031434986731986e-05, "loss": 1.2688, "step": 4480 }, { "epoch": 2.7482785003825554, "grad_norm": 1.9079464673995972, "learning_rate": 4.0280329318908623e-05, "loss": 1.2702, "step": 4490 }, { "epoch": 2.7543993879112474, "grad_norm": 1.8411625623703003, "learning_rate": 4.0246308770497384e-05, "loss": 1.2515, "step": 4500 }, { "epoch": 2.760520275439939, "grad_norm": 1.8902119398117065, "learning_rate": 4.0212288222086145e-05, "loss": 1.2837, "step": 4510 }, { "epoch": 2.7666411629686305, "grad_norm": 1.4213835000991821, "learning_rate": 4.01782676736749e-05, "loss": 1.261, "step": 4520 }, { "epoch": 2.772762050497322, "grad_norm": 2.356781244277954, "learning_rate": 4.014424712526366e-05, "loss": 1.2697, "step": 4530 }, { "epoch": 2.7788829380260136, "grad_norm": 2.5330777168273926, "learning_rate": 4.011022657685242e-05, "loss": 1.2639, "step": 4540 }, { "epoch": 2.785003825554705, "grad_norm": 2.177523374557495, "learning_rate": 4.007620602844118e-05, "loss": 1.2547, "step": 4550 }, { "epoch": 2.791124713083397, "grad_norm": 2.4313042163848877, "learning_rate": 4.004218548002994e-05, "loss": 1.2772, "step": 4560 }, { "epoch": 2.7972456006120887, "grad_norm": 2.4263336658477783, "learning_rate": 4.00081649316187e-05, "loss": 1.2484, "step": 4570 }, { "epoch": 2.8033664881407803, "grad_norm": 1.936612606048584, "learning_rate": 3.997414438320746e-05, "loss": 1.267, "step": 4580 }, { "epoch": 2.8094873756694723, "grad_norm": 2.0780820846557617, "learning_rate": 3.994012383479622e-05, "loss": 1.2625, "step": 4590 }, { "epoch": 2.815608263198164, "grad_norm": 1.8207616806030273, "learning_rate": 3.990610328638498e-05, "loss": 1.2544, "step": 4600 }, { "epoch": 2.8217291507268554, "grad_norm": 2.180629014968872, "learning_rate": 3.987208273797374e-05, "loss": 1.2701, "step": 4610 }, { "epoch": 2.827850038255547, "grad_norm": 2.3692867755889893, "learning_rate": 3.9838062189562494e-05, "loss": 1.2784, "step": 4620 }, { "epoch": 2.8339709257842385, "grad_norm": 1.676451563835144, "learning_rate": 3.9804041641151255e-05, "loss": 1.28, "step": 4630 }, { "epoch": 2.8400918133129305, "grad_norm": 2.219745397567749, "learning_rate": 3.9770021092740016e-05, "loss": 1.256, "step": 4640 }, { "epoch": 2.846212700841622, "grad_norm": 2.0797510147094727, "learning_rate": 3.9736000544328776e-05, "loss": 1.2526, "step": 4650 }, { "epoch": 2.8523335883703136, "grad_norm": 2.238093614578247, "learning_rate": 3.970197999591754e-05, "loss": 1.2936, "step": 4660 }, { "epoch": 2.8584544758990056, "grad_norm": 2.133490562438965, "learning_rate": 3.96679594475063e-05, "loss": 1.2578, "step": 4670 }, { "epoch": 2.864575363427697, "grad_norm": 2.0491223335266113, "learning_rate": 3.963393889909506e-05, "loss": 1.2564, "step": 4680 }, { "epoch": 2.8706962509563887, "grad_norm": 1.5369951725006104, "learning_rate": 3.959991835068381e-05, "loss": 1.2806, "step": 4690 }, { "epoch": 2.8768171384850802, "grad_norm": 1.8396649360656738, "learning_rate": 3.9565897802272574e-05, "loss": 1.2763, "step": 4700 }, { "epoch": 2.882938026013772, "grad_norm": 1.60837721824646, "learning_rate": 3.9531877253861335e-05, "loss": 1.2606, "step": 4710 }, { "epoch": 2.889058913542464, "grad_norm": 2.0090882778167725, "learning_rate": 3.9497856705450096e-05, "loss": 1.2633, "step": 4720 }, { "epoch": 2.8951798010711554, "grad_norm": 3.1234018802642822, "learning_rate": 3.946383615703885e-05, "loss": 1.2743, "step": 4730 }, { "epoch": 2.901300688599847, "grad_norm": 1.919801115989685, "learning_rate": 3.942981560862761e-05, "loss": 1.2815, "step": 4740 }, { "epoch": 2.907421576128539, "grad_norm": 1.6320799589157104, "learning_rate": 3.939579506021637e-05, "loss": 1.2651, "step": 4750 }, { "epoch": 2.9135424636572305, "grad_norm": 1.6664077043533325, "learning_rate": 3.936177451180513e-05, "loss": 1.267, "step": 4760 }, { "epoch": 2.919663351185922, "grad_norm": 1.7831507921218872, "learning_rate": 3.932775396339389e-05, "loss": 1.2613, "step": 4770 }, { "epoch": 2.9257842387146136, "grad_norm": 1.8862683773040771, "learning_rate": 3.9293733414982654e-05, "loss": 1.2512, "step": 4780 }, { "epoch": 2.931905126243305, "grad_norm": 1.8569462299346924, "learning_rate": 3.9259712866571415e-05, "loss": 1.2508, "step": 4790 }, { "epoch": 2.9380260137719967, "grad_norm": 2.7612645626068115, "learning_rate": 3.9225692318160175e-05, "loss": 1.2685, "step": 4800 }, { "epoch": 2.9441469013006887, "grad_norm": 3.243058204650879, "learning_rate": 3.919167176974893e-05, "loss": 1.2725, "step": 4810 }, { "epoch": 2.9502677888293802, "grad_norm": 1.8038846254348755, "learning_rate": 3.915765122133769e-05, "loss": 1.2791, "step": 4820 }, { "epoch": 2.956388676358072, "grad_norm": 2.171898603439331, "learning_rate": 3.912363067292645e-05, "loss": 1.2533, "step": 4830 }, { "epoch": 2.962509563886764, "grad_norm": 2.8486342430114746, "learning_rate": 3.9089610124515205e-05, "loss": 1.2483, "step": 4840 }, { "epoch": 2.9686304514154553, "grad_norm": 1.8373701572418213, "learning_rate": 3.9055589576103966e-05, "loss": 1.2397, "step": 4850 }, { "epoch": 2.974751338944147, "grad_norm": 1.858534336090088, "learning_rate": 3.902156902769273e-05, "loss": 1.2703, "step": 4860 }, { "epoch": 2.9808722264728384, "grad_norm": 2.107698678970337, "learning_rate": 3.898754847928149e-05, "loss": 1.266, "step": 4870 }, { "epoch": 2.98699311400153, "grad_norm": 1.9265609979629517, "learning_rate": 3.895352793087025e-05, "loss": 1.2691, "step": 4880 }, { "epoch": 2.993114001530222, "grad_norm": 2.060009241104126, "learning_rate": 3.891950738245901e-05, "loss": 1.2656, "step": 4890 }, { "epoch": 2.9992348890589136, "grad_norm": 1.5703758001327515, "learning_rate": 3.888548683404777e-05, "loss": 1.2411, "step": 4900 }, { "epoch": 2.9998469778117824, "eval_accuracy": 0.47869399701572485, "eval_loss": 1.2228519916534424, "eval_runtime": 2617.8718, "eval_samples_per_second": 79.873, "eval_steps_per_second": 0.624, "step": 4901 }, { "epoch": 3.005355776587605, "grad_norm": 1.9858216047286987, "learning_rate": 3.885146628563653e-05, "loss": 1.273, "step": 4910 }, { "epoch": 3.0114766641162967, "grad_norm": 2.257009744644165, "learning_rate": 3.881744573722529e-05, "loss": 1.2604, "step": 4920 }, { "epoch": 3.0175975516449887, "grad_norm": 2.2428131103515625, "learning_rate": 3.8783425188814046e-05, "loss": 1.2407, "step": 4930 }, { "epoch": 3.02371843917368, "grad_norm": 1.81501042842865, "learning_rate": 3.87494046404028e-05, "loss": 1.249, "step": 4940 }, { "epoch": 3.0298393267023718, "grad_norm": 2.2534372806549072, "learning_rate": 3.871538409199156e-05, "loss": 1.2608, "step": 4950 }, { "epoch": 3.0359602142310633, "grad_norm": 1.6115994453430176, "learning_rate": 3.868136354358032e-05, "loss": 1.2425, "step": 4960 }, { "epoch": 3.0420811017597553, "grad_norm": 2.0571775436401367, "learning_rate": 3.864734299516908e-05, "loss": 1.2551, "step": 4970 }, { "epoch": 3.048201989288447, "grad_norm": 2.081515073776245, "learning_rate": 3.861332244675784e-05, "loss": 1.2524, "step": 4980 }, { "epoch": 3.0543228768171384, "grad_norm": 2.2645437717437744, "learning_rate": 3.8579301898346604e-05, "loss": 1.2413, "step": 4990 }, { "epoch": 3.06044376434583, "grad_norm": 2.3221487998962402, "learning_rate": 3.8545281349935365e-05, "loss": 1.2519, "step": 5000 }, { "epoch": 3.066564651874522, "grad_norm": 3.2614245414733887, "learning_rate": 3.8511260801524126e-05, "loss": 1.2543, "step": 5010 }, { "epoch": 3.0726855394032135, "grad_norm": 1.9711055755615234, "learning_rate": 3.847724025311289e-05, "loss": 1.2583, "step": 5020 }, { "epoch": 3.078806426931905, "grad_norm": 2.410093069076538, "learning_rate": 3.844321970470164e-05, "loss": 1.2661, "step": 5030 }, { "epoch": 3.0849273144605966, "grad_norm": 2.4068052768707275, "learning_rate": 3.84091991562904e-05, "loss": 1.2298, "step": 5040 }, { "epoch": 3.0910482019892886, "grad_norm": 1.658258080482483, "learning_rate": 3.8375178607879156e-05, "loss": 1.2351, "step": 5050 }, { "epoch": 3.09716908951798, "grad_norm": 1.8933178186416626, "learning_rate": 3.8341158059467916e-05, "loss": 1.2665, "step": 5060 }, { "epoch": 3.1032899770466718, "grad_norm": 2.4667043685913086, "learning_rate": 3.830713751105668e-05, "loss": 1.2583, "step": 5070 }, { "epoch": 3.1094108645753633, "grad_norm": 2.2437615394592285, "learning_rate": 3.827311696264544e-05, "loss": 1.2442, "step": 5080 }, { "epoch": 3.1155317521040553, "grad_norm": 2.0303754806518555, "learning_rate": 3.82390964142342e-05, "loss": 1.2395, "step": 5090 }, { "epoch": 3.121652639632747, "grad_norm": 2.256101608276367, "learning_rate": 3.820507586582296e-05, "loss": 1.2621, "step": 5100 }, { "epoch": 3.1277735271614384, "grad_norm": 4.909579753875732, "learning_rate": 3.817105531741172e-05, "loss": 1.2733, "step": 5110 }, { "epoch": 3.13389441469013, "grad_norm": 1.6804922819137573, "learning_rate": 3.813703476900048e-05, "loss": 1.2408, "step": 5120 }, { "epoch": 3.1400153022188215, "grad_norm": 1.8485102653503418, "learning_rate": 3.810301422058924e-05, "loss": 1.2511, "step": 5130 }, { "epoch": 3.1461361897475135, "grad_norm": 2.0070157051086426, "learning_rate": 3.8068993672177996e-05, "loss": 1.2407, "step": 5140 }, { "epoch": 3.152257077276205, "grad_norm": 2.0214972496032715, "learning_rate": 3.803497312376676e-05, "loss": 1.2527, "step": 5150 }, { "epoch": 3.1583779648048966, "grad_norm": 1.8533238172531128, "learning_rate": 3.800095257535551e-05, "loss": 1.2293, "step": 5160 }, { "epoch": 3.164498852333588, "grad_norm": 1.751216173171997, "learning_rate": 3.796693202694427e-05, "loss": 1.2299, "step": 5170 }, { "epoch": 3.17061973986228, "grad_norm": 2.6898555755615234, "learning_rate": 3.793291147853303e-05, "loss": 1.2641, "step": 5180 }, { "epoch": 3.1767406273909717, "grad_norm": 1.6866509914398193, "learning_rate": 3.7898890930121794e-05, "loss": 1.2722, "step": 5190 }, { "epoch": 3.1828615149196633, "grad_norm": 3.089430093765259, "learning_rate": 3.7864870381710555e-05, "loss": 1.2548, "step": 5200 }, { "epoch": 3.188982402448355, "grad_norm": 2.683788776397705, "learning_rate": 3.7830849833299315e-05, "loss": 1.2477, "step": 5210 }, { "epoch": 3.195103289977047, "grad_norm": 2.5055129528045654, "learning_rate": 3.7796829284888076e-05, "loss": 1.2566, "step": 5220 }, { "epoch": 3.2012241775057384, "grad_norm": 2.4237446784973145, "learning_rate": 3.776280873647684e-05, "loss": 1.2662, "step": 5230 }, { "epoch": 3.20734506503443, "grad_norm": 1.907581090927124, "learning_rate": 3.77287881880656e-05, "loss": 1.2348, "step": 5240 }, { "epoch": 3.2134659525631215, "grad_norm": 2.126953363418579, "learning_rate": 3.769476763965435e-05, "loss": 1.2582, "step": 5250 }, { "epoch": 3.2195868400918135, "grad_norm": 2.169069766998291, "learning_rate": 3.766074709124311e-05, "loss": 1.2571, "step": 5260 }, { "epoch": 3.225707727620505, "grad_norm": 2.7466557025909424, "learning_rate": 3.7626726542831874e-05, "loss": 1.2542, "step": 5270 }, { "epoch": 3.2318286151491966, "grad_norm": 2.4945733547210693, "learning_rate": 3.759270599442063e-05, "loss": 1.2634, "step": 5280 }, { "epoch": 3.237949502677888, "grad_norm": 2.4939398765563965, "learning_rate": 3.755868544600939e-05, "loss": 1.2249, "step": 5290 }, { "epoch": 3.2440703902065797, "grad_norm": 2.0732944011688232, "learning_rate": 3.752466489759815e-05, "loss": 1.2717, "step": 5300 }, { "epoch": 3.2501912777352717, "grad_norm": 2.3293912410736084, "learning_rate": 3.749064434918691e-05, "loss": 1.2467, "step": 5310 }, { "epoch": 3.2563121652639633, "grad_norm": 2.104963779449463, "learning_rate": 3.745662380077567e-05, "loss": 1.2682, "step": 5320 }, { "epoch": 3.262433052792655, "grad_norm": 2.453064203262329, "learning_rate": 3.742260325236443e-05, "loss": 1.2593, "step": 5330 }, { "epoch": 3.268553940321347, "grad_norm": 2.3136184215545654, "learning_rate": 3.738858270395319e-05, "loss": 1.2457, "step": 5340 }, { "epoch": 3.2746748278500384, "grad_norm": 2.3127260208129883, "learning_rate": 3.735456215554195e-05, "loss": 1.2452, "step": 5350 }, { "epoch": 3.28079571537873, "grad_norm": 2.367079496383667, "learning_rate": 3.732054160713071e-05, "loss": 1.2439, "step": 5360 }, { "epoch": 3.2869166029074215, "grad_norm": 2.3883674144744873, "learning_rate": 3.728652105871947e-05, "loss": 1.2428, "step": 5370 }, { "epoch": 3.293037490436113, "grad_norm": 2.274970531463623, "learning_rate": 3.725250051030823e-05, "loss": 1.2567, "step": 5380 }, { "epoch": 3.299158377964805, "grad_norm": 2.298823356628418, "learning_rate": 3.721847996189698e-05, "loss": 1.2633, "step": 5390 }, { "epoch": 3.3052792654934966, "grad_norm": 2.3997528553009033, "learning_rate": 3.7184459413485744e-05, "loss": 1.2277, "step": 5400 }, { "epoch": 3.311400153022188, "grad_norm": 2.273841619491577, "learning_rate": 3.7150438865074505e-05, "loss": 1.2629, "step": 5410 }, { "epoch": 3.3175210405508797, "grad_norm": 1.885188102722168, "learning_rate": 3.7116418316663266e-05, "loss": 1.2464, "step": 5420 }, { "epoch": 3.3236419280795717, "grad_norm": 2.231029987335205, "learning_rate": 3.708239776825203e-05, "loss": 1.2562, "step": 5430 }, { "epoch": 3.3297628156082633, "grad_norm": 2.6225924491882324, "learning_rate": 3.704837721984079e-05, "loss": 1.2352, "step": 5440 }, { "epoch": 3.335883703136955, "grad_norm": 3.8559517860412598, "learning_rate": 3.701435667142955e-05, "loss": 1.2598, "step": 5450 }, { "epoch": 3.3420045906656464, "grad_norm": 1.7077224254608154, "learning_rate": 3.69803361230183e-05, "loss": 1.2587, "step": 5460 }, { "epoch": 3.3481254781943384, "grad_norm": 1.9027427434921265, "learning_rate": 3.694631557460706e-05, "loss": 1.2417, "step": 5470 }, { "epoch": 3.35424636572303, "grad_norm": 2.166884183883667, "learning_rate": 3.6912295026195824e-05, "loss": 1.2559, "step": 5480 }, { "epoch": 3.3603672532517215, "grad_norm": 2.6949479579925537, "learning_rate": 3.6878274477784585e-05, "loss": 1.2529, "step": 5490 }, { "epoch": 3.366488140780413, "grad_norm": 2.176804304122925, "learning_rate": 3.6844253929373346e-05, "loss": 1.2583, "step": 5500 }, { "epoch": 3.372609028309105, "grad_norm": 2.2006351947784424, "learning_rate": 3.68102333809621e-05, "loss": 1.2481, "step": 5510 }, { "epoch": 3.3787299158377966, "grad_norm": 1.8229804039001465, "learning_rate": 3.677621283255086e-05, "loss": 1.2492, "step": 5520 }, { "epoch": 3.384850803366488, "grad_norm": 2.4698712825775146, "learning_rate": 3.674219228413962e-05, "loss": 1.246, "step": 5530 }, { "epoch": 3.3909716908951797, "grad_norm": 2.333577871322632, "learning_rate": 3.670817173572838e-05, "loss": 1.2416, "step": 5540 }, { "epoch": 3.3970925784238712, "grad_norm": 1.945576548576355, "learning_rate": 3.667415118731714e-05, "loss": 1.2501, "step": 5550 }, { "epoch": 3.4032134659525632, "grad_norm": 2.6930272579193115, "learning_rate": 3.6640130638905904e-05, "loss": 1.237, "step": 5560 }, { "epoch": 3.409334353481255, "grad_norm": 1.8951822519302368, "learning_rate": 3.660611009049466e-05, "loss": 1.247, "step": 5570 }, { "epoch": 3.4154552410099464, "grad_norm": 2.351973533630371, "learning_rate": 3.657208954208342e-05, "loss": 1.2406, "step": 5580 }, { "epoch": 3.4215761285386384, "grad_norm": 2.6603872776031494, "learning_rate": 3.653806899367218e-05, "loss": 1.2575, "step": 5590 }, { "epoch": 3.42769701606733, "grad_norm": 2.3119587898254395, "learning_rate": 3.650404844526094e-05, "loss": 1.2376, "step": 5600 }, { "epoch": 3.4338179035960215, "grad_norm": 2.6269590854644775, "learning_rate": 3.64700278968497e-05, "loss": 1.2413, "step": 5610 }, { "epoch": 3.439938791124713, "grad_norm": 2.115171194076538, "learning_rate": 3.643600734843846e-05, "loss": 1.2224, "step": 5620 }, { "epoch": 3.4460596786534046, "grad_norm": 1.861360788345337, "learning_rate": 3.6401986800027216e-05, "loss": 1.2664, "step": 5630 }, { "epoch": 3.4521805661820966, "grad_norm": 2.1996967792510986, "learning_rate": 3.636796625161598e-05, "loss": 1.2546, "step": 5640 }, { "epoch": 3.458301453710788, "grad_norm": 2.7611207962036133, "learning_rate": 3.633394570320474e-05, "loss": 1.2629, "step": 5650 }, { "epoch": 3.4644223412394797, "grad_norm": 2.022286891937256, "learning_rate": 3.62999251547935e-05, "loss": 1.2454, "step": 5660 }, { "epoch": 3.4705432287681712, "grad_norm": 1.778696894645691, "learning_rate": 3.626590460638225e-05, "loss": 1.2577, "step": 5670 }, { "epoch": 3.4766641162968632, "grad_norm": 2.090994358062744, "learning_rate": 3.6231884057971014e-05, "loss": 1.2511, "step": 5680 }, { "epoch": 3.482785003825555, "grad_norm": 2.744858741760254, "learning_rate": 3.6197863509559774e-05, "loss": 1.2579, "step": 5690 }, { "epoch": 3.4889058913542463, "grad_norm": 2.450564384460449, "learning_rate": 3.6163842961148535e-05, "loss": 1.2393, "step": 5700 }, { "epoch": 3.495026778882938, "grad_norm": 1.8227015733718872, "learning_rate": 3.6129822412737296e-05, "loss": 1.2361, "step": 5710 }, { "epoch": 3.5011476664116294, "grad_norm": 1.8969275951385498, "learning_rate": 3.609580186432606e-05, "loss": 1.2275, "step": 5720 }, { "epoch": 3.5072685539403214, "grad_norm": 1.7384281158447266, "learning_rate": 3.606178131591482e-05, "loss": 1.2522, "step": 5730 }, { "epoch": 3.513389441469013, "grad_norm": 2.120198965072632, "learning_rate": 3.602776076750358e-05, "loss": 1.2322, "step": 5740 }, { "epoch": 3.5195103289977046, "grad_norm": 1.9926379919052124, "learning_rate": 3.599374021909233e-05, "loss": 1.2302, "step": 5750 }, { "epoch": 3.5256312165263965, "grad_norm": 2.081388235092163, "learning_rate": 3.5959719670681093e-05, "loss": 1.2464, "step": 5760 }, { "epoch": 3.531752104055088, "grad_norm": 2.0220718383789062, "learning_rate": 3.5925699122269854e-05, "loss": 1.2502, "step": 5770 }, { "epoch": 3.5378729915837797, "grad_norm": 1.9582314491271973, "learning_rate": 3.589167857385861e-05, "loss": 1.2281, "step": 5780 }, { "epoch": 3.543993879112471, "grad_norm": 1.8635259866714478, "learning_rate": 3.585765802544737e-05, "loss": 1.2109, "step": 5790 }, { "epoch": 3.5501147666411628, "grad_norm": 1.8320813179016113, "learning_rate": 3.582363747703613e-05, "loss": 1.2441, "step": 5800 }, { "epoch": 3.5562356541698548, "grad_norm": 2.1509382724761963, "learning_rate": 3.578961692862489e-05, "loss": 1.2561, "step": 5810 }, { "epoch": 3.5623565416985463, "grad_norm": 1.8632539510726929, "learning_rate": 3.575559638021365e-05, "loss": 1.2481, "step": 5820 }, { "epoch": 3.568477429227238, "grad_norm": 1.9509693384170532, "learning_rate": 3.572157583180241e-05, "loss": 1.2618, "step": 5830 }, { "epoch": 3.57459831675593, "grad_norm": 2.189829111099243, "learning_rate": 3.568755528339117e-05, "loss": 1.2455, "step": 5840 }, { "epoch": 3.5807192042846214, "grad_norm": 2.2501230239868164, "learning_rate": 3.5653534734979934e-05, "loss": 1.2442, "step": 5850 }, { "epoch": 3.586840091813313, "grad_norm": 2.8913943767547607, "learning_rate": 3.561951418656869e-05, "loss": 1.2615, "step": 5860 }, { "epoch": 3.5929609793420045, "grad_norm": 1.5915802717208862, "learning_rate": 3.558549363815745e-05, "loss": 1.2502, "step": 5870 }, { "epoch": 3.599081866870696, "grad_norm": 2.0232715606689453, "learning_rate": 3.555147308974621e-05, "loss": 1.2592, "step": 5880 }, { "epoch": 3.6052027543993876, "grad_norm": 2.3988993167877197, "learning_rate": 3.5517452541334964e-05, "loss": 1.2568, "step": 5890 }, { "epoch": 3.6113236419280796, "grad_norm": 1.941627025604248, "learning_rate": 3.5483431992923725e-05, "loss": 1.2435, "step": 5900 }, { "epoch": 3.617444529456771, "grad_norm": 1.9982330799102783, "learning_rate": 3.5449411444512486e-05, "loss": 1.2441, "step": 5910 }, { "epoch": 3.6235654169854628, "grad_norm": 1.9015318155288696, "learning_rate": 3.5415390896101246e-05, "loss": 1.247, "step": 5920 }, { "epoch": 3.6296863045141547, "grad_norm": 1.683561086654663, "learning_rate": 3.538137034769001e-05, "loss": 1.2401, "step": 5930 }, { "epoch": 3.6358071920428463, "grad_norm": 2.5195016860961914, "learning_rate": 3.534734979927877e-05, "loss": 1.2342, "step": 5940 }, { "epoch": 3.641928079571538, "grad_norm": 2.2849786281585693, "learning_rate": 3.531332925086753e-05, "loss": 1.232, "step": 5950 }, { "epoch": 3.6480489671002294, "grad_norm": 2.70056414604187, "learning_rate": 3.527930870245629e-05, "loss": 1.2573, "step": 5960 }, { "epoch": 3.654169854628921, "grad_norm": 5.275028228759766, "learning_rate": 3.524528815404505e-05, "loss": 1.2237, "step": 5970 }, { "epoch": 3.660290742157613, "grad_norm": 1.7736239433288574, "learning_rate": 3.5211267605633805e-05, "loss": 1.2314, "step": 5980 }, { "epoch": 3.6664116296863045, "grad_norm": 1.8925633430480957, "learning_rate": 3.517724705722256e-05, "loss": 1.2476, "step": 5990 }, { "epoch": 3.672532517214996, "grad_norm": 1.8658711910247803, "learning_rate": 3.514322650881132e-05, "loss": 1.2448, "step": 6000 }, { "epoch": 3.678653404743688, "grad_norm": 2.2303245067596436, "learning_rate": 3.510920596040008e-05, "loss": 1.2566, "step": 6010 }, { "epoch": 3.6847742922723796, "grad_norm": 1.967257022857666, "learning_rate": 3.507518541198884e-05, "loss": 1.2791, "step": 6020 }, { "epoch": 3.690895179801071, "grad_norm": 2.001021146774292, "learning_rate": 3.50411648635776e-05, "loss": 1.2275, "step": 6030 }, { "epoch": 3.6970160673297627, "grad_norm": 2.0317656993865967, "learning_rate": 3.500714431516636e-05, "loss": 1.2223, "step": 6040 }, { "epoch": 3.7031369548584543, "grad_norm": 2.7534890174865723, "learning_rate": 3.4973123766755124e-05, "loss": 1.2521, "step": 6050 }, { "epoch": 3.7092578423871463, "grad_norm": 1.9250917434692383, "learning_rate": 3.4939103218343885e-05, "loss": 1.2426, "step": 6060 }, { "epoch": 3.715378729915838, "grad_norm": 2.119258165359497, "learning_rate": 3.4905082669932645e-05, "loss": 1.2613, "step": 6070 }, { "epoch": 3.7214996174445294, "grad_norm": 2.6487741470336914, "learning_rate": 3.48710621215214e-05, "loss": 1.2531, "step": 6080 }, { "epoch": 3.7276205049732214, "grad_norm": 2.616588830947876, "learning_rate": 3.483704157311016e-05, "loss": 1.2603, "step": 6090 }, { "epoch": 3.733741392501913, "grad_norm": 1.95454740524292, "learning_rate": 3.4803021024698914e-05, "loss": 1.2393, "step": 6100 }, { "epoch": 3.7398622800306045, "grad_norm": 2.1213455200195312, "learning_rate": 3.4769000476287675e-05, "loss": 1.2269, "step": 6110 }, { "epoch": 3.745983167559296, "grad_norm": 2.3341705799102783, "learning_rate": 3.4734979927876436e-05, "loss": 1.236, "step": 6120 }, { "epoch": 3.7521040550879876, "grad_norm": 2.3794469833374023, "learning_rate": 3.47009593794652e-05, "loss": 1.2483, "step": 6130 }, { "epoch": 3.758224942616679, "grad_norm": 1.9347407817840576, "learning_rate": 3.466693883105396e-05, "loss": 1.255, "step": 6140 }, { "epoch": 3.764345830145371, "grad_norm": 3.7383499145507812, "learning_rate": 3.463291828264272e-05, "loss": 1.2169, "step": 6150 }, { "epoch": 3.7704667176740627, "grad_norm": 1.8134688138961792, "learning_rate": 3.459889773423148e-05, "loss": 1.2588, "step": 6160 }, { "epoch": 3.7765876052027543, "grad_norm": 1.9380251169204712, "learning_rate": 3.456487718582024e-05, "loss": 1.248, "step": 6170 }, { "epoch": 3.7827084927314463, "grad_norm": 2.238302230834961, "learning_rate": 3.4530856637409e-05, "loss": 1.2323, "step": 6180 }, { "epoch": 3.788829380260138, "grad_norm": 1.9604918956756592, "learning_rate": 3.4496836088997755e-05, "loss": 1.2261, "step": 6190 }, { "epoch": 3.7949502677888294, "grad_norm": 1.7650002241134644, "learning_rate": 3.4462815540586516e-05, "loss": 1.2346, "step": 6200 }, { "epoch": 3.801071155317521, "grad_norm": 2.498751401901245, "learning_rate": 3.442879499217527e-05, "loss": 1.2297, "step": 6210 }, { "epoch": 3.8071920428462125, "grad_norm": 3.40084171295166, "learning_rate": 3.439477444376403e-05, "loss": 1.2549, "step": 6220 }, { "epoch": 3.8133129303749045, "grad_norm": 1.700204610824585, "learning_rate": 3.436075389535279e-05, "loss": 1.2558, "step": 6230 }, { "epoch": 3.819433817903596, "grad_norm": 2.0135040283203125, "learning_rate": 3.432673334694155e-05, "loss": 1.2576, "step": 6240 }, { "epoch": 3.8255547054322876, "grad_norm": 2.295555353164673, "learning_rate": 3.429271279853031e-05, "loss": 1.2233, "step": 6250 }, { "epoch": 3.8316755929609796, "grad_norm": 1.7916216850280762, "learning_rate": 3.4258692250119074e-05, "loss": 1.2267, "step": 6260 }, { "epoch": 3.837796480489671, "grad_norm": 1.6943750381469727, "learning_rate": 3.4224671701707835e-05, "loss": 1.2482, "step": 6270 }, { "epoch": 3.8439173680183627, "grad_norm": 1.9428541660308838, "learning_rate": 3.4190651153296596e-05, "loss": 1.2324, "step": 6280 }, { "epoch": 3.8500382555470543, "grad_norm": 2.07246994972229, "learning_rate": 3.415663060488536e-05, "loss": 1.2335, "step": 6290 }, { "epoch": 3.856159143075746, "grad_norm": 1.881109356880188, "learning_rate": 3.412261005647411e-05, "loss": 1.2594, "step": 6300 }, { "epoch": 3.862280030604438, "grad_norm": 2.3518142700195312, "learning_rate": 3.408858950806287e-05, "loss": 1.2397, "step": 6310 }, { "epoch": 3.8684009181331294, "grad_norm": 2.0112175941467285, "learning_rate": 3.405456895965163e-05, "loss": 1.2563, "step": 6320 }, { "epoch": 3.874521805661821, "grad_norm": 1.5001269578933716, "learning_rate": 3.4020548411240386e-05, "loss": 1.2371, "step": 6330 }, { "epoch": 3.8806426931905125, "grad_norm": 1.9393521547317505, "learning_rate": 3.398652786282915e-05, "loss": 1.2375, "step": 6340 }, { "epoch": 3.8867635807192045, "grad_norm": 2.159621238708496, "learning_rate": 3.395250731441791e-05, "loss": 1.2535, "step": 6350 }, { "epoch": 3.892884468247896, "grad_norm": 2.0031697750091553, "learning_rate": 3.391848676600667e-05, "loss": 1.2502, "step": 6360 }, { "epoch": 3.8990053557765876, "grad_norm": 1.7993634939193726, "learning_rate": 3.388446621759543e-05, "loss": 1.2323, "step": 6370 }, { "epoch": 3.905126243305279, "grad_norm": 1.918763279914856, "learning_rate": 3.385044566918419e-05, "loss": 1.2363, "step": 6380 }, { "epoch": 3.9112471308339707, "grad_norm": 1.641964316368103, "learning_rate": 3.381642512077295e-05, "loss": 1.2507, "step": 6390 }, { "epoch": 3.9173680183626627, "grad_norm": 2.025327444076538, "learning_rate": 3.3782404572361705e-05, "loss": 1.2392, "step": 6400 }, { "epoch": 3.9234889058913542, "grad_norm": 2.53024959564209, "learning_rate": 3.3748384023950466e-05, "loss": 1.2323, "step": 6410 }, { "epoch": 3.929609793420046, "grad_norm": 2.083070755004883, "learning_rate": 3.371436347553923e-05, "loss": 1.2503, "step": 6420 }, { "epoch": 3.935730680948738, "grad_norm": 2.3890929222106934, "learning_rate": 3.368034292712799e-05, "loss": 1.2482, "step": 6430 }, { "epoch": 3.9418515684774293, "grad_norm": 2.910482168197632, "learning_rate": 3.364632237871675e-05, "loss": 1.2236, "step": 6440 }, { "epoch": 3.947972456006121, "grad_norm": 1.9107235670089722, "learning_rate": 3.36123018303055e-05, "loss": 1.2466, "step": 6450 }, { "epoch": 3.9540933435348125, "grad_norm": 1.7786065340042114, "learning_rate": 3.3578281281894264e-05, "loss": 1.247, "step": 6460 }, { "epoch": 3.960214231063504, "grad_norm": 1.7764980792999268, "learning_rate": 3.3544260733483025e-05, "loss": 1.2401, "step": 6470 }, { "epoch": 3.966335118592196, "grad_norm": 2.0788002014160156, "learning_rate": 3.3510240185071785e-05, "loss": 1.2405, "step": 6480 }, { "epoch": 3.9724560061208876, "grad_norm": 2.2174031734466553, "learning_rate": 3.3476219636660546e-05, "loss": 1.2499, "step": 6490 }, { "epoch": 3.978576893649579, "grad_norm": 1.71248459815979, "learning_rate": 3.344219908824931e-05, "loss": 1.2697, "step": 6500 }, { "epoch": 3.984697781178271, "grad_norm": 2.434459686279297, "learning_rate": 3.340817853983806e-05, "loss": 1.2351, "step": 6510 }, { "epoch": 3.9908186687069627, "grad_norm": 3.131270408630371, "learning_rate": 3.337415799142682e-05, "loss": 1.2279, "step": 6520 }, { "epoch": 3.9969395562356542, "grad_norm": 1.993688702583313, "learning_rate": 3.334013744301558e-05, "loss": 1.2368, "step": 6530 }, { "epoch": 4.0, "eval_accuracy": 0.49007154608409537, "eval_loss": 1.1966572999954224, "eval_runtime": 2703.3706, "eval_samples_per_second": 77.346, "eval_steps_per_second": 0.604, "step": 6535 }, { "epoch": 4.003060443764346, "grad_norm": 3.0211377143859863, "learning_rate": 3.3306116894604344e-05, "loss": 1.2443, "step": 6540 }, { "epoch": 4.009181331293037, "grad_norm": 2.0056445598602295, "learning_rate": 3.3272096346193104e-05, "loss": 1.2186, "step": 6550 }, { "epoch": 4.015302218821729, "grad_norm": 2.565847396850586, "learning_rate": 3.323807579778186e-05, "loss": 1.2187, "step": 6560 }, { "epoch": 4.02142310635042, "grad_norm": 2.171513319015503, "learning_rate": 3.320405524937062e-05, "loss": 1.2271, "step": 6570 }, { "epoch": 4.027543993879113, "grad_norm": 2.016981840133667, "learning_rate": 3.317003470095938e-05, "loss": 1.2409, "step": 6580 }, { "epoch": 4.033664881407804, "grad_norm": 2.211911916732788, "learning_rate": 3.313601415254814e-05, "loss": 1.2206, "step": 6590 }, { "epoch": 4.039785768936496, "grad_norm": 2.5300450325012207, "learning_rate": 3.31019936041369e-05, "loss": 1.2272, "step": 6600 }, { "epoch": 4.0459066564651875, "grad_norm": 1.8287904262542725, "learning_rate": 3.306797305572566e-05, "loss": 1.2423, "step": 6610 }, { "epoch": 4.052027543993879, "grad_norm": 2.4838716983795166, "learning_rate": 3.303395250731442e-05, "loss": 1.2215, "step": 6620 }, { "epoch": 4.058148431522571, "grad_norm": 2.263672113418579, "learning_rate": 3.299993195890318e-05, "loss": 1.2403, "step": 6630 }, { "epoch": 4.064269319051262, "grad_norm": 1.7068171501159668, "learning_rate": 3.296591141049194e-05, "loss": 1.2132, "step": 6640 }, { "epoch": 4.070390206579954, "grad_norm": 2.154141664505005, "learning_rate": 3.29318908620807e-05, "loss": 1.209, "step": 6650 }, { "epoch": 4.076511094108645, "grad_norm": 2.417459011077881, "learning_rate": 3.289787031366946e-05, "loss": 1.2118, "step": 6660 }, { "epoch": 4.082631981637338, "grad_norm": 3.0920281410217285, "learning_rate": 3.286384976525822e-05, "loss": 1.2296, "step": 6670 }, { "epoch": 4.088752869166029, "grad_norm": 2.09197998046875, "learning_rate": 3.2829829216846975e-05, "loss": 1.2097, "step": 6680 }, { "epoch": 4.094873756694721, "grad_norm": 2.23848819732666, "learning_rate": 3.2795808668435736e-05, "loss": 1.2354, "step": 6690 }, { "epoch": 4.100994644223412, "grad_norm": 2.4771714210510254, "learning_rate": 3.2761788120024497e-05, "loss": 1.2477, "step": 6700 }, { "epoch": 4.107115531752104, "grad_norm": 2.599956750869751, "learning_rate": 3.272776757161326e-05, "loss": 1.242, "step": 6710 }, { "epoch": 4.1132364192807955, "grad_norm": 2.1151208877563477, "learning_rate": 3.269374702320202e-05, "loss": 1.2182, "step": 6720 }, { "epoch": 4.119357306809487, "grad_norm": 2.415768623352051, "learning_rate": 3.265972647479077e-05, "loss": 1.214, "step": 6730 }, { "epoch": 4.125478194338179, "grad_norm": 2.391017436981201, "learning_rate": 3.262570592637953e-05, "loss": 1.2449, "step": 6740 }, { "epoch": 4.131599081866871, "grad_norm": 2.170161724090576, "learning_rate": 3.2591685377968294e-05, "loss": 1.2286, "step": 6750 }, { "epoch": 4.137719969395563, "grad_norm": 2.0515055656433105, "learning_rate": 3.2557664829557055e-05, "loss": 1.2426, "step": 6760 }, { "epoch": 4.143840856924254, "grad_norm": 2.469785213470459, "learning_rate": 3.2523644281145816e-05, "loss": 1.2284, "step": 6770 }, { "epoch": 4.149961744452946, "grad_norm": 2.2527389526367188, "learning_rate": 3.2489623732734576e-05, "loss": 1.2186, "step": 6780 }, { "epoch": 4.156082631981637, "grad_norm": 1.9241255521774292, "learning_rate": 3.245560318432334e-05, "loss": 1.2133, "step": 6790 }, { "epoch": 4.162203519510329, "grad_norm": 2.233799457550049, "learning_rate": 3.242158263591209e-05, "loss": 1.2279, "step": 6800 }, { "epoch": 4.16832440703902, "grad_norm": 2.2666685581207275, "learning_rate": 3.238756208750085e-05, "loss": 1.2359, "step": 6810 }, { "epoch": 4.174445294567712, "grad_norm": 2.9180006980895996, "learning_rate": 3.235354153908961e-05, "loss": 1.2105, "step": 6820 }, { "epoch": 4.180566182096404, "grad_norm": 3.030034303665161, "learning_rate": 3.231952099067837e-05, "loss": 1.2184, "step": 6830 }, { "epoch": 4.186687069625096, "grad_norm": 1.8429760932922363, "learning_rate": 3.228550044226713e-05, "loss": 1.2197, "step": 6840 }, { "epoch": 4.1928079571537875, "grad_norm": 2.205509662628174, "learning_rate": 3.225147989385589e-05, "loss": 1.2274, "step": 6850 }, { "epoch": 4.198928844682479, "grad_norm": 2.7778589725494385, "learning_rate": 3.221745934544465e-05, "loss": 1.2541, "step": 6860 }, { "epoch": 4.205049732211171, "grad_norm": 2.7423245906829834, "learning_rate": 3.218343879703341e-05, "loss": 1.2135, "step": 6870 }, { "epoch": 4.211170619739862, "grad_norm": 2.1378300189971924, "learning_rate": 3.214941824862217e-05, "loss": 1.2166, "step": 6880 }, { "epoch": 4.217291507268554, "grad_norm": 2.2552590370178223, "learning_rate": 3.211539770021093e-05, "loss": 1.2267, "step": 6890 }, { "epoch": 4.223412394797245, "grad_norm": 2.3378567695617676, "learning_rate": 3.208137715179969e-05, "loss": 1.2274, "step": 6900 }, { "epoch": 4.229533282325937, "grad_norm": 2.442488193511963, "learning_rate": 3.2047356603388454e-05, "loss": 1.2178, "step": 6910 }, { "epoch": 4.235654169854629, "grad_norm": 2.7005996704101562, "learning_rate": 3.201333605497721e-05, "loss": 1.2279, "step": 6920 }, { "epoch": 4.241775057383321, "grad_norm": 1.9639949798583984, "learning_rate": 3.197931550656597e-05, "loss": 1.2372, "step": 6930 }, { "epoch": 4.247895944912012, "grad_norm": 2.424386739730835, "learning_rate": 3.194529495815472e-05, "loss": 1.2217, "step": 6940 }, { "epoch": 4.254016832440704, "grad_norm": 2.461184501647949, "learning_rate": 3.1911274409743484e-05, "loss": 1.2157, "step": 6950 }, { "epoch": 4.2601377199693955, "grad_norm": 2.4525201320648193, "learning_rate": 3.1877253861332244e-05, "loss": 1.2272, "step": 6960 }, { "epoch": 4.266258607498087, "grad_norm": 2.1750197410583496, "learning_rate": 3.1843233312921005e-05, "loss": 1.2195, "step": 6970 }, { "epoch": 4.272379495026779, "grad_norm": 2.52929949760437, "learning_rate": 3.1809212764509766e-05, "loss": 1.2455, "step": 6980 }, { "epoch": 4.27850038255547, "grad_norm": 1.8435324430465698, "learning_rate": 3.177519221609853e-05, "loss": 1.1996, "step": 6990 }, { "epoch": 4.284621270084163, "grad_norm": 2.5383806228637695, "learning_rate": 3.174117166768729e-05, "loss": 1.2393, "step": 7000 }, { "epoch": 4.290742157612854, "grad_norm": 2.3538296222686768, "learning_rate": 3.170715111927605e-05, "loss": 1.2237, "step": 7010 }, { "epoch": 4.296863045141546, "grad_norm": 1.8341562747955322, "learning_rate": 3.167313057086481e-05, "loss": 1.2217, "step": 7020 }, { "epoch": 4.302983932670237, "grad_norm": 1.9721349477767944, "learning_rate": 3.1639110022453563e-05, "loss": 1.2232, "step": 7030 }, { "epoch": 4.309104820198929, "grad_norm": 2.715494155883789, "learning_rate": 3.1605089474042324e-05, "loss": 1.2342, "step": 7040 }, { "epoch": 4.31522570772762, "grad_norm": 2.1433260440826416, "learning_rate": 3.157106892563108e-05, "loss": 1.2411, "step": 7050 }, { "epoch": 4.321346595256312, "grad_norm": 2.1917879581451416, "learning_rate": 3.153704837721984e-05, "loss": 1.2175, "step": 7060 }, { "epoch": 4.3274674827850035, "grad_norm": 1.9488756656646729, "learning_rate": 3.15030278288086e-05, "loss": 1.2296, "step": 7070 }, { "epoch": 4.333588370313695, "grad_norm": 2.587290048599243, "learning_rate": 3.146900728039736e-05, "loss": 1.2451, "step": 7080 }, { "epoch": 4.3397092578423875, "grad_norm": 2.3056092262268066, "learning_rate": 3.143498673198612e-05, "loss": 1.2075, "step": 7090 }, { "epoch": 4.345830145371079, "grad_norm": 2.641049385070801, "learning_rate": 3.140096618357488e-05, "loss": 1.2404, "step": 7100 }, { "epoch": 4.351951032899771, "grad_norm": 2.5124692916870117, "learning_rate": 3.136694563516364e-05, "loss": 1.2183, "step": 7110 }, { "epoch": 4.358071920428462, "grad_norm": 2.4015398025512695, "learning_rate": 3.1332925086752404e-05, "loss": 1.2212, "step": 7120 }, { "epoch": 4.364192807957154, "grad_norm": 2.2219901084899902, "learning_rate": 3.1298904538341165e-05, "loss": 1.2227, "step": 7130 }, { "epoch": 4.370313695485845, "grad_norm": 1.9921913146972656, "learning_rate": 3.126488398992992e-05, "loss": 1.2294, "step": 7140 }, { "epoch": 4.376434583014537, "grad_norm": 2.351640224456787, "learning_rate": 3.123086344151867e-05, "loss": 1.2379, "step": 7150 }, { "epoch": 4.382555470543228, "grad_norm": 2.6132214069366455, "learning_rate": 3.1196842893107434e-05, "loss": 1.2235, "step": 7160 }, { "epoch": 4.388676358071921, "grad_norm": 2.056541681289673, "learning_rate": 3.1162822344696195e-05, "loss": 1.2133, "step": 7170 }, { "epoch": 4.394797245600612, "grad_norm": 2.450434684753418, "learning_rate": 3.1128801796284956e-05, "loss": 1.2085, "step": 7180 }, { "epoch": 4.400918133129304, "grad_norm": 1.9336761236190796, "learning_rate": 3.1094781247873716e-05, "loss": 1.2293, "step": 7190 }, { "epoch": 4.4070390206579955, "grad_norm": 2.7180116176605225, "learning_rate": 3.106076069946248e-05, "loss": 1.2234, "step": 7200 }, { "epoch": 4.413159908186687, "grad_norm": 2.412187337875366, "learning_rate": 3.102674015105124e-05, "loss": 1.2163, "step": 7210 }, { "epoch": 4.419280795715379, "grad_norm": 2.206754684448242, "learning_rate": 3.099271960264e-05, "loss": 1.2197, "step": 7220 }, { "epoch": 4.42540168324407, "grad_norm": 1.9868717193603516, "learning_rate": 3.095869905422876e-05, "loss": 1.2371, "step": 7230 }, { "epoch": 4.431522570772762, "grad_norm": 2.1179041862487793, "learning_rate": 3.0924678505817514e-05, "loss": 1.207, "step": 7240 }, { "epoch": 4.437643458301454, "grad_norm": 2.341188907623291, "learning_rate": 3.0890657957406275e-05, "loss": 1.2212, "step": 7250 }, { "epoch": 4.443764345830146, "grad_norm": 2.979339361190796, "learning_rate": 3.0856637408995036e-05, "loss": 1.2373, "step": 7260 }, { "epoch": 4.449885233358837, "grad_norm": 2.224384307861328, "learning_rate": 3.082261686058379e-05, "loss": 1.2274, "step": 7270 }, { "epoch": 4.456006120887529, "grad_norm": 1.9843226671218872, "learning_rate": 3.078859631217255e-05, "loss": 1.2236, "step": 7280 }, { "epoch": 4.46212700841622, "grad_norm": 2.021461248397827, "learning_rate": 3.075457576376131e-05, "loss": 1.2296, "step": 7290 }, { "epoch": 4.468247895944912, "grad_norm": 2.517347812652588, "learning_rate": 3.072055521535007e-05, "loss": 1.1913, "step": 7300 }, { "epoch": 4.4743687834736035, "grad_norm": 2.3766560554504395, "learning_rate": 3.068653466693883e-05, "loss": 1.2233, "step": 7310 }, { "epoch": 4.480489671002295, "grad_norm": 2.126357316970825, "learning_rate": 3.0652514118527594e-05, "loss": 1.2336, "step": 7320 }, { "epoch": 4.4866105585309874, "grad_norm": 2.1979329586029053, "learning_rate": 3.0618493570116355e-05, "loss": 1.2091, "step": 7330 }, { "epoch": 4.492731446059679, "grad_norm": 2.0365238189697266, "learning_rate": 3.0584473021705115e-05, "loss": 1.2131, "step": 7340 }, { "epoch": 4.4988523335883706, "grad_norm": 2.6603610515594482, "learning_rate": 3.055045247329387e-05, "loss": 1.1893, "step": 7350 }, { "epoch": 4.504973221117062, "grad_norm": 2.6453335285186768, "learning_rate": 3.0516431924882634e-05, "loss": 1.2219, "step": 7360 }, { "epoch": 4.511094108645754, "grad_norm": 2.1196227073669434, "learning_rate": 3.048241137647139e-05, "loss": 1.2368, "step": 7370 }, { "epoch": 4.517214996174445, "grad_norm": 1.869175672531128, "learning_rate": 3.044839082806015e-05, "loss": 1.2139, "step": 7380 }, { "epoch": 4.523335883703137, "grad_norm": 2.3623552322387695, "learning_rate": 3.0414370279648906e-05, "loss": 1.2032, "step": 7390 }, { "epoch": 4.529456771231828, "grad_norm": 2.8302743434906006, "learning_rate": 3.0380349731237667e-05, "loss": 1.2228, "step": 7400 }, { "epoch": 4.53557765876052, "grad_norm": 2.22745418548584, "learning_rate": 3.0346329182826428e-05, "loss": 1.21, "step": 7410 }, { "epoch": 4.541698546289212, "grad_norm": 2.6589863300323486, "learning_rate": 3.031230863441519e-05, "loss": 1.2401, "step": 7420 }, { "epoch": 4.547819433817904, "grad_norm": 2.0868048667907715, "learning_rate": 3.0278288086003946e-05, "loss": 1.2353, "step": 7430 }, { "epoch": 4.553940321346595, "grad_norm": 2.1262969970703125, "learning_rate": 3.0244267537592707e-05, "loss": 1.2056, "step": 7440 }, { "epoch": 4.560061208875287, "grad_norm": 2.061774730682373, "learning_rate": 3.0210246989181468e-05, "loss": 1.2139, "step": 7450 }, { "epoch": 4.5661820964039785, "grad_norm": 2.2398433685302734, "learning_rate": 3.017622644077023e-05, "loss": 1.2223, "step": 7460 }, { "epoch": 4.57230298393267, "grad_norm": 2.264333963394165, "learning_rate": 3.014220589235899e-05, "loss": 1.2289, "step": 7470 }, { "epoch": 4.578423871461362, "grad_norm": 2.1285994052886963, "learning_rate": 3.0108185343947747e-05, "loss": 1.22, "step": 7480 }, { "epoch": 4.584544758990053, "grad_norm": 2.1647374629974365, "learning_rate": 3.0074164795536508e-05, "loss": 1.2366, "step": 7490 }, { "epoch": 4.590665646518746, "grad_norm": 2.071086883544922, "learning_rate": 3.004014424712526e-05, "loss": 1.1946, "step": 7500 }, { "epoch": 4.596786534047437, "grad_norm": 2.2825191020965576, "learning_rate": 3.0006123698714022e-05, "loss": 1.2258, "step": 7510 }, { "epoch": 4.602907421576129, "grad_norm": 2.1768798828125, "learning_rate": 2.9972103150302783e-05, "loss": 1.2357, "step": 7520 }, { "epoch": 4.60902830910482, "grad_norm": 1.8341929912567139, "learning_rate": 2.9938082601891544e-05, "loss": 1.2337, "step": 7530 }, { "epoch": 4.615149196633512, "grad_norm": 2.1512627601623535, "learning_rate": 2.99040620534803e-05, "loss": 1.2289, "step": 7540 }, { "epoch": 4.621270084162203, "grad_norm": 3.288439989089966, "learning_rate": 2.9870041505069062e-05, "loss": 1.2379, "step": 7550 }, { "epoch": 4.627390971690895, "grad_norm": 2.537337064743042, "learning_rate": 2.9836020956657823e-05, "loss": 1.2334, "step": 7560 }, { "epoch": 4.6335118592195865, "grad_norm": 2.0168588161468506, "learning_rate": 2.9802000408246584e-05, "loss": 1.2124, "step": 7570 }, { "epoch": 4.639632746748278, "grad_norm": 2.4690663814544678, "learning_rate": 2.9767979859835345e-05, "loss": 1.2069, "step": 7580 }, { "epoch": 4.6457536342769705, "grad_norm": 2.4409384727478027, "learning_rate": 2.9733959311424102e-05, "loss": 1.2304, "step": 7590 }, { "epoch": 4.651874521805662, "grad_norm": 1.9961847066879272, "learning_rate": 2.9699938763012863e-05, "loss": 1.2159, "step": 7600 }, { "epoch": 4.657995409334354, "grad_norm": 2.5575664043426514, "learning_rate": 2.9665918214601624e-05, "loss": 1.2252, "step": 7610 }, { "epoch": 4.664116296863045, "grad_norm": 2.107255220413208, "learning_rate": 2.9631897666190378e-05, "loss": 1.1979, "step": 7620 }, { "epoch": 4.670237184391737, "grad_norm": 2.145972490310669, "learning_rate": 2.959787711777914e-05, "loss": 1.2467, "step": 7630 }, { "epoch": 4.676358071920428, "grad_norm": 1.9515827894210815, "learning_rate": 2.95638565693679e-05, "loss": 1.2142, "step": 7640 }, { "epoch": 4.68247895944912, "grad_norm": 2.8625612258911133, "learning_rate": 2.9529836020956657e-05, "loss": 1.238, "step": 7650 }, { "epoch": 4.688599846977811, "grad_norm": 2.255072832107544, "learning_rate": 2.9495815472545418e-05, "loss": 1.232, "step": 7660 }, { "epoch": 4.694720734506504, "grad_norm": 2.5473906993865967, "learning_rate": 2.946179492413418e-05, "loss": 1.2325, "step": 7670 }, { "epoch": 4.700841622035195, "grad_norm": 2.642747640609741, "learning_rate": 2.942777437572294e-05, "loss": 1.2431, "step": 7680 }, { "epoch": 4.706962509563887, "grad_norm": 2.264507532119751, "learning_rate": 2.9393753827311697e-05, "loss": 1.221, "step": 7690 }, { "epoch": 4.7130833970925785, "grad_norm": 1.8713449239730835, "learning_rate": 2.9359733278900458e-05, "loss": 1.217, "step": 7700 }, { "epoch": 4.71920428462127, "grad_norm": 3.092440128326416, "learning_rate": 2.932571273048922e-05, "loss": 1.2544, "step": 7710 }, { "epoch": 4.725325172149962, "grad_norm": 2.9618237018585205, "learning_rate": 2.929169218207798e-05, "loss": 1.2355, "step": 7720 }, { "epoch": 4.731446059678653, "grad_norm": 1.8672477006912231, "learning_rate": 2.9257671633666734e-05, "loss": 1.1962, "step": 7730 }, { "epoch": 4.737566947207345, "grad_norm": 2.159982442855835, "learning_rate": 2.9223651085255495e-05, "loss": 1.2319, "step": 7740 }, { "epoch": 4.743687834736036, "grad_norm": 2.338762044906616, "learning_rate": 2.9189630536844252e-05, "loss": 1.2143, "step": 7750 }, { "epoch": 4.749808722264729, "grad_norm": 2.3189380168914795, "learning_rate": 2.9155609988433013e-05, "loss": 1.239, "step": 7760 }, { "epoch": 4.75592960979342, "grad_norm": 1.9343451261520386, "learning_rate": 2.9121589440021774e-05, "loss": 1.2121, "step": 7770 }, { "epoch": 4.762050497322112, "grad_norm": 1.9574135541915894, "learning_rate": 2.9087568891610534e-05, "loss": 1.228, "step": 7780 }, { "epoch": 4.768171384850803, "grad_norm": 2.6530020236968994, "learning_rate": 2.9053548343199295e-05, "loss": 1.2365, "step": 7790 }, { "epoch": 4.774292272379495, "grad_norm": 1.8905853033065796, "learning_rate": 2.9019527794788053e-05, "loss": 1.2071, "step": 7800 }, { "epoch": 4.7804131599081865, "grad_norm": 2.568509101867676, "learning_rate": 2.8985507246376814e-05, "loss": 1.2149, "step": 7810 }, { "epoch": 4.786534047436878, "grad_norm": 2.2760283946990967, "learning_rate": 2.8951486697965574e-05, "loss": 1.2287, "step": 7820 }, { "epoch": 4.7926549349655705, "grad_norm": 2.0909883975982666, "learning_rate": 2.8917466149554335e-05, "loss": 1.2221, "step": 7830 }, { "epoch": 4.798775822494262, "grad_norm": 2.4575860500335693, "learning_rate": 2.8883445601143093e-05, "loss": 1.2271, "step": 7840 }, { "epoch": 4.804896710022954, "grad_norm": 1.9734315872192383, "learning_rate": 2.884942505273185e-05, "loss": 1.2501, "step": 7850 }, { "epoch": 4.811017597551645, "grad_norm": 3.550708532333374, "learning_rate": 2.8815404504320608e-05, "loss": 1.2128, "step": 7860 }, { "epoch": 4.817138485080337, "grad_norm": 2.462048053741455, "learning_rate": 2.878138395590937e-05, "loss": 1.2109, "step": 7870 }, { "epoch": 4.823259372609028, "grad_norm": 1.9959014654159546, "learning_rate": 2.874736340749813e-05, "loss": 1.2379, "step": 7880 }, { "epoch": 4.82938026013772, "grad_norm": 2.04964542388916, "learning_rate": 2.871334285908689e-05, "loss": 1.2224, "step": 7890 }, { "epoch": 4.835501147666411, "grad_norm": 2.9247825145721436, "learning_rate": 2.867932231067565e-05, "loss": 1.2303, "step": 7900 }, { "epoch": 4.841622035195103, "grad_norm": 2.2298436164855957, "learning_rate": 2.864530176226441e-05, "loss": 1.243, "step": 7910 }, { "epoch": 4.8477429227237945, "grad_norm": 1.946879267692566, "learning_rate": 2.861128121385317e-05, "loss": 1.2206, "step": 7920 }, { "epoch": 4.853863810252487, "grad_norm": 2.5111823081970215, "learning_rate": 2.857726066544193e-05, "loss": 1.2002, "step": 7930 }, { "epoch": 4.8599846977811785, "grad_norm": 2.873385190963745, "learning_rate": 2.854324011703069e-05, "loss": 1.2167, "step": 7940 }, { "epoch": 4.86610558530987, "grad_norm": 2.1272952556610107, "learning_rate": 2.8509219568619448e-05, "loss": 1.2131, "step": 7950 }, { "epoch": 4.872226472838562, "grad_norm": 2.3787174224853516, "learning_rate": 2.847519902020821e-05, "loss": 1.2121, "step": 7960 }, { "epoch": 4.878347360367253, "grad_norm": 2.747269868850708, "learning_rate": 2.8441178471796963e-05, "loss": 1.227, "step": 7970 }, { "epoch": 4.884468247895945, "grad_norm": 3.0929527282714844, "learning_rate": 2.8407157923385724e-05, "loss": 1.2276, "step": 7980 }, { "epoch": 4.890589135424636, "grad_norm": 2.1043317317962646, "learning_rate": 2.8373137374974485e-05, "loss": 1.2188, "step": 7990 }, { "epoch": 4.896710022953329, "grad_norm": 2.1406781673431396, "learning_rate": 2.8339116826563246e-05, "loss": 1.2234, "step": 8000 }, { "epoch": 4.90283091048202, "grad_norm": 2.324378728866577, "learning_rate": 2.8305096278152003e-05, "loss": 1.2438, "step": 8010 }, { "epoch": 4.908951798010712, "grad_norm": 1.8091750144958496, "learning_rate": 2.8271075729740764e-05, "loss": 1.2426, "step": 8020 }, { "epoch": 4.915072685539403, "grad_norm": 2.3250181674957275, "learning_rate": 2.8237055181329525e-05, "loss": 1.2115, "step": 8030 }, { "epoch": 4.921193573068095, "grad_norm": 2.8805532455444336, "learning_rate": 2.8203034632918286e-05, "loss": 1.2214, "step": 8040 }, { "epoch": 4.9273144605967865, "grad_norm": 1.797640323638916, "learning_rate": 2.8169014084507046e-05, "loss": 1.2072, "step": 8050 }, { "epoch": 4.933435348125478, "grad_norm": 2.2686753273010254, "learning_rate": 2.8134993536095804e-05, "loss": 1.1998, "step": 8060 }, { "epoch": 4.93955623565417, "grad_norm": 2.1427371501922607, "learning_rate": 2.8100972987684565e-05, "loss": 1.2291, "step": 8070 }, { "epoch": 4.945677123182861, "grad_norm": 1.856122374534607, "learning_rate": 2.8066952439273326e-05, "loss": 1.2189, "step": 8080 }, { "epoch": 4.951798010711553, "grad_norm": 1.8572533130645752, "learning_rate": 2.803293189086208e-05, "loss": 1.2252, "step": 8090 }, { "epoch": 4.957918898240245, "grad_norm": 2.145905017852783, "learning_rate": 2.799891134245084e-05, "loss": 1.2065, "step": 8100 }, { "epoch": 4.964039785768937, "grad_norm": 2.219866991043091, "learning_rate": 2.79648907940396e-05, "loss": 1.2106, "step": 8110 }, { "epoch": 4.970160673297628, "grad_norm": 1.9409259557724, "learning_rate": 2.793087024562836e-05, "loss": 1.2306, "step": 8120 }, { "epoch": 4.97628156082632, "grad_norm": 1.977242350578308, "learning_rate": 2.789684969721712e-05, "loss": 1.216, "step": 8130 }, { "epoch": 4.982402448355011, "grad_norm": 2.118283271789551, "learning_rate": 2.786282914880588e-05, "loss": 1.2048, "step": 8140 }, { "epoch": 4.988523335883703, "grad_norm": 2.632073402404785, "learning_rate": 2.782880860039464e-05, "loss": 1.2132, "step": 8150 }, { "epoch": 4.9946442234123944, "grad_norm": 2.513634204864502, "learning_rate": 2.77947880519834e-05, "loss": 1.1973, "step": 8160 }, { "epoch": 4.999540933435348, "eval_accuracy": 0.49269235183839, "eval_loss": 1.191017508506775, "eval_runtime": 2626.2477, "eval_samples_per_second": 79.618, "eval_steps_per_second": 0.622, "step": 8168 }, { "epoch": 5.000765110941087, "grad_norm": 2.578747034072876, "learning_rate": 2.776076750357216e-05, "loss": 1.2102, "step": 8170 }, { "epoch": 5.006885998469778, "grad_norm": 2.2281763553619385, "learning_rate": 2.772674695516092e-05, "loss": 1.2144, "step": 8180 }, { "epoch": 5.01300688599847, "grad_norm": 2.208728551864624, "learning_rate": 2.769272640674968e-05, "loss": 1.1976, "step": 8190 }, { "epoch": 5.0191277735271616, "grad_norm": 2.4716689586639404, "learning_rate": 2.7658705858338435e-05, "loss": 1.1719, "step": 8200 }, { "epoch": 5.025248661055853, "grad_norm": 2.048522472381592, "learning_rate": 2.7624685309927196e-05, "loss": 1.2016, "step": 8210 }, { "epoch": 5.031369548584545, "grad_norm": 2.6114094257354736, "learning_rate": 2.7590664761515957e-05, "loss": 1.1989, "step": 8220 }, { "epoch": 5.037490436113236, "grad_norm": 2.179579734802246, "learning_rate": 2.7556644213104714e-05, "loss": 1.2051, "step": 8230 }, { "epoch": 5.043611323641928, "grad_norm": 2.6294515132904053, "learning_rate": 2.7522623664693475e-05, "loss": 1.2084, "step": 8240 }, { "epoch": 5.049732211170619, "grad_norm": 2.3146214485168457, "learning_rate": 2.7488603116282236e-05, "loss": 1.1932, "step": 8250 }, { "epoch": 5.055853098699312, "grad_norm": 3.6110377311706543, "learning_rate": 2.7454582567870997e-05, "loss": 1.2052, "step": 8260 }, { "epoch": 5.061973986228003, "grad_norm": 3.290837049484253, "learning_rate": 2.7420562019459754e-05, "loss": 1.2179, "step": 8270 }, { "epoch": 5.068094873756695, "grad_norm": 1.990761637687683, "learning_rate": 2.7386541471048515e-05, "loss": 1.2253, "step": 8280 }, { "epoch": 5.074215761285386, "grad_norm": 2.85910964012146, "learning_rate": 2.7352520922637276e-05, "loss": 1.1992, "step": 8290 }, { "epoch": 5.080336648814078, "grad_norm": 6.976127624511719, "learning_rate": 2.7318500374226037e-05, "loss": 1.2065, "step": 8300 }, { "epoch": 5.0864575363427695, "grad_norm": 2.3096303939819336, "learning_rate": 2.7284479825814798e-05, "loss": 1.1973, "step": 8310 }, { "epoch": 5.092578423871461, "grad_norm": 6.251516342163086, "learning_rate": 2.7250459277403552e-05, "loss": 1.187, "step": 8320 }, { "epoch": 5.098699311400153, "grad_norm": 2.466515064239502, "learning_rate": 2.721643872899231e-05, "loss": 1.1905, "step": 8330 }, { "epoch": 5.104820198928845, "grad_norm": 2.5167441368103027, "learning_rate": 2.718241818058107e-05, "loss": 1.2128, "step": 8340 }, { "epoch": 5.110941086457537, "grad_norm": 2.935549736022949, "learning_rate": 2.714839763216983e-05, "loss": 1.2005, "step": 8350 }, { "epoch": 5.117061973986228, "grad_norm": 3.1587438583374023, "learning_rate": 2.711437708375859e-05, "loss": 1.194, "step": 8360 }, { "epoch": 5.12318286151492, "grad_norm": 2.3066651821136475, "learning_rate": 2.7080356535347352e-05, "loss": 1.1774, "step": 8370 }, { "epoch": 5.129303749043611, "grad_norm": 2.729510545730591, "learning_rate": 2.704633598693611e-05, "loss": 1.2187, "step": 8380 }, { "epoch": 5.135424636572303, "grad_norm": 2.3249425888061523, "learning_rate": 2.701231543852487e-05, "loss": 1.1844, "step": 8390 }, { "epoch": 5.141545524100994, "grad_norm": 2.5726587772369385, "learning_rate": 2.697829489011363e-05, "loss": 1.1862, "step": 8400 }, { "epoch": 5.147666411629686, "grad_norm": 2.2208099365234375, "learning_rate": 2.6944274341702392e-05, "loss": 1.2141, "step": 8410 }, { "epoch": 5.1537872991583775, "grad_norm": 2.545058012008667, "learning_rate": 2.691025379329115e-05, "loss": 1.2017, "step": 8420 }, { "epoch": 5.15990818668707, "grad_norm": 2.34517502784729, "learning_rate": 2.687623324487991e-05, "loss": 1.1668, "step": 8430 }, { "epoch": 5.1660290742157615, "grad_norm": 3.2366106510162354, "learning_rate": 2.6842212696468665e-05, "loss": 1.2014, "step": 8440 }, { "epoch": 5.172149961744453, "grad_norm": 2.4992125034332275, "learning_rate": 2.6808192148057426e-05, "loss": 1.2127, "step": 8450 }, { "epoch": 5.178270849273145, "grad_norm": 2.8871748447418213, "learning_rate": 2.6774171599646186e-05, "loss": 1.202, "step": 8460 }, { "epoch": 5.184391736801836, "grad_norm": 2.683912515640259, "learning_rate": 2.6740151051234947e-05, "loss": 1.2157, "step": 8470 }, { "epoch": 5.190512624330528, "grad_norm": 2.367983102798462, "learning_rate": 2.6706130502823705e-05, "loss": 1.2233, "step": 8480 }, { "epoch": 5.196633511859219, "grad_norm": 2.0448033809661865, "learning_rate": 2.6672109954412466e-05, "loss": 1.2066, "step": 8490 }, { "epoch": 5.202754399387911, "grad_norm": 2.684910774230957, "learning_rate": 2.6638089406001226e-05, "loss": 1.1947, "step": 8500 }, { "epoch": 5.208875286916603, "grad_norm": 1.977565884590149, "learning_rate": 2.6604068857589987e-05, "loss": 1.1967, "step": 8510 }, { "epoch": 5.214996174445295, "grad_norm": 2.5993175506591797, "learning_rate": 2.6570048309178748e-05, "loss": 1.1995, "step": 8520 }, { "epoch": 5.221117061973986, "grad_norm": 2.7591464519500732, "learning_rate": 2.6536027760767505e-05, "loss": 1.1821, "step": 8530 }, { "epoch": 5.227237949502678, "grad_norm": 3.1257448196411133, "learning_rate": 2.6502007212356266e-05, "loss": 1.2133, "step": 8540 }, { "epoch": 5.2333588370313695, "grad_norm": 2.6627302169799805, "learning_rate": 2.646798666394502e-05, "loss": 1.2123, "step": 8550 }, { "epoch": 5.239479724560061, "grad_norm": 2.7436108589172363, "learning_rate": 2.643396611553378e-05, "loss": 1.1839, "step": 8560 }, { "epoch": 5.245600612088753, "grad_norm": 2.5480778217315674, "learning_rate": 2.6399945567122542e-05, "loss": 1.2051, "step": 8570 }, { "epoch": 5.251721499617444, "grad_norm": 2.8406577110290527, "learning_rate": 2.6365925018711303e-05, "loss": 1.2, "step": 8580 }, { "epoch": 5.257842387146136, "grad_norm": 2.1096248626708984, "learning_rate": 2.633190447030006e-05, "loss": 1.2257, "step": 8590 }, { "epoch": 5.263963274674828, "grad_norm": 2.118366003036499, "learning_rate": 2.629788392188882e-05, "loss": 1.1917, "step": 8600 }, { "epoch": 5.27008416220352, "grad_norm": 2.6410601139068604, "learning_rate": 2.6263863373477582e-05, "loss": 1.199, "step": 8610 }, { "epoch": 5.276205049732211, "grad_norm": 2.6622838973999023, "learning_rate": 2.6229842825066343e-05, "loss": 1.2393, "step": 8620 }, { "epoch": 5.282325937260903, "grad_norm": 2.324550151824951, "learning_rate": 2.6195822276655104e-05, "loss": 1.1996, "step": 8630 }, { "epoch": 5.288446824789594, "grad_norm": 2.170179843902588, "learning_rate": 2.616180172824386e-05, "loss": 1.2034, "step": 8640 }, { "epoch": 5.294567712318286, "grad_norm": 2.926877737045288, "learning_rate": 2.6127781179832622e-05, "loss": 1.2017, "step": 8650 }, { "epoch": 5.3006885998469775, "grad_norm": 2.6772279739379883, "learning_rate": 2.6093760631421383e-05, "loss": 1.2162, "step": 8660 }, { "epoch": 5.30680948737567, "grad_norm": 2.1664867401123047, "learning_rate": 2.6059740083010137e-05, "loss": 1.2046, "step": 8670 }, { "epoch": 5.3129303749043615, "grad_norm": 3.135590076446533, "learning_rate": 2.6025719534598898e-05, "loss": 1.2098, "step": 8680 }, { "epoch": 5.319051262433053, "grad_norm": 2.273746967315674, "learning_rate": 2.599169898618766e-05, "loss": 1.1882, "step": 8690 }, { "epoch": 5.325172149961745, "grad_norm": 2.1665115356445312, "learning_rate": 2.5957678437776416e-05, "loss": 1.2069, "step": 8700 }, { "epoch": 5.331293037490436, "grad_norm": 2.255600929260254, "learning_rate": 2.5923657889365177e-05, "loss": 1.1852, "step": 8710 }, { "epoch": 5.337413925019128, "grad_norm": 2.3505523204803467, "learning_rate": 2.5889637340953938e-05, "loss": 1.198, "step": 8720 }, { "epoch": 5.343534812547819, "grad_norm": 2.702526330947876, "learning_rate": 2.58556167925427e-05, "loss": 1.1917, "step": 8730 }, { "epoch": 5.349655700076511, "grad_norm": 2.1754770278930664, "learning_rate": 2.5821596244131456e-05, "loss": 1.2049, "step": 8740 }, { "epoch": 5.355776587605202, "grad_norm": 2.5231616497039795, "learning_rate": 2.5787575695720217e-05, "loss": 1.1991, "step": 8750 }, { "epoch": 5.361897475133895, "grad_norm": 2.6360552310943604, "learning_rate": 2.5753555147308978e-05, "loss": 1.2092, "step": 8760 }, { "epoch": 5.368018362662586, "grad_norm": 2.3326311111450195, "learning_rate": 2.571953459889774e-05, "loss": 1.198, "step": 8770 }, { "epoch": 5.374139250191278, "grad_norm": 2.2699687480926514, "learning_rate": 2.56855140504865e-05, "loss": 1.2005, "step": 8780 }, { "epoch": 5.3802601377199695, "grad_norm": 2.333057165145874, "learning_rate": 2.5651493502075253e-05, "loss": 1.2159, "step": 8790 }, { "epoch": 5.386381025248661, "grad_norm": 2.36719012260437, "learning_rate": 2.561747295366401e-05, "loss": 1.184, "step": 8800 }, { "epoch": 5.392501912777353, "grad_norm": 2.606476306915283, "learning_rate": 2.558345240525277e-05, "loss": 1.2024, "step": 8810 }, { "epoch": 5.398622800306044, "grad_norm": 2.2720136642456055, "learning_rate": 2.5549431856841532e-05, "loss": 1.2019, "step": 8820 }, { "epoch": 5.404743687834736, "grad_norm": 2.720341205596924, "learning_rate": 2.5515411308430293e-05, "loss": 1.1857, "step": 8830 }, { "epoch": 5.410864575363428, "grad_norm": 2.564955234527588, "learning_rate": 2.5481390760019054e-05, "loss": 1.1948, "step": 8840 }, { "epoch": 5.41698546289212, "grad_norm": 2.1959404945373535, "learning_rate": 2.544737021160781e-05, "loss": 1.209, "step": 8850 }, { "epoch": 5.423106350420811, "grad_norm": 2.346018075942993, "learning_rate": 2.5413349663196572e-05, "loss": 1.2147, "step": 8860 }, { "epoch": 5.429227237949503, "grad_norm": 3.420417547225952, "learning_rate": 2.5379329114785333e-05, "loss": 1.2052, "step": 8870 }, { "epoch": 5.435348125478194, "grad_norm": 2.6672658920288086, "learning_rate": 2.5345308566374094e-05, "loss": 1.1976, "step": 8880 }, { "epoch": 5.441469013006886, "grad_norm": 2.4706966876983643, "learning_rate": 2.531128801796285e-05, "loss": 1.1956, "step": 8890 }, { "epoch": 5.4475899005355775, "grad_norm": 2.0787277221679688, "learning_rate": 2.527726746955161e-05, "loss": 1.2117, "step": 8900 }, { "epoch": 5.453710788064269, "grad_norm": 2.4532337188720703, "learning_rate": 2.5243246921140366e-05, "loss": 1.2004, "step": 8910 }, { "epoch": 5.459831675592961, "grad_norm": 2.5366806983947754, "learning_rate": 2.5209226372729127e-05, "loss": 1.2241, "step": 8920 }, { "epoch": 5.465952563121653, "grad_norm": 2.249305486679077, "learning_rate": 2.5175205824317888e-05, "loss": 1.1777, "step": 8930 }, { "epoch": 5.472073450650345, "grad_norm": 2.310941457748413, "learning_rate": 2.514118527590665e-05, "loss": 1.2017, "step": 8940 }, { "epoch": 5.478194338179036, "grad_norm": 2.3156533241271973, "learning_rate": 2.510716472749541e-05, "loss": 1.2053, "step": 8950 }, { "epoch": 5.484315225707728, "grad_norm": 2.2076308727264404, "learning_rate": 2.5073144179084167e-05, "loss": 1.197, "step": 8960 }, { "epoch": 5.490436113236419, "grad_norm": 2.4675791263580322, "learning_rate": 2.5039123630672928e-05, "loss": 1.2096, "step": 8970 }, { "epoch": 5.496557000765111, "grad_norm": 2.278066396713257, "learning_rate": 2.500510308226169e-05, "loss": 1.2003, "step": 8980 }, { "epoch": 5.502677888293802, "grad_norm": 2.8391642570495605, "learning_rate": 2.4971082533850446e-05, "loss": 1.1821, "step": 8990 }, { "epoch": 5.508798775822494, "grad_norm": 2.1658828258514404, "learning_rate": 2.4937061985439207e-05, "loss": 1.1976, "step": 9000 }, { "epoch": 5.514919663351186, "grad_norm": 2.391244888305664, "learning_rate": 2.4903041437027964e-05, "loss": 1.1906, "step": 9010 }, { "epoch": 5.521040550879878, "grad_norm": 3.277883529663086, "learning_rate": 2.4869020888616725e-05, "loss": 1.2039, "step": 9020 }, { "epoch": 5.527161438408569, "grad_norm": 2.536834716796875, "learning_rate": 2.4835000340205486e-05, "loss": 1.1884, "step": 9030 }, { "epoch": 5.533282325937261, "grad_norm": 2.8371753692626953, "learning_rate": 2.4800979791794247e-05, "loss": 1.1964, "step": 9040 }, { "epoch": 5.5394032134659525, "grad_norm": 2.2327940464019775, "learning_rate": 2.4766959243383004e-05, "loss": 1.2042, "step": 9050 }, { "epoch": 5.545524100994644, "grad_norm": 2.28041934967041, "learning_rate": 2.4732938694971762e-05, "loss": 1.2076, "step": 9060 }, { "epoch": 5.551644988523336, "grad_norm": 2.6812901496887207, "learning_rate": 2.4698918146560523e-05, "loss": 1.197, "step": 9070 }, { "epoch": 5.557765876052027, "grad_norm": 2.6180360317230225, "learning_rate": 2.4664897598149284e-05, "loss": 1.2023, "step": 9080 }, { "epoch": 5.563886763580719, "grad_norm": 2.2223103046417236, "learning_rate": 2.4630877049738044e-05, "loss": 1.2079, "step": 9090 }, { "epoch": 5.570007651109411, "grad_norm": 2.5299456119537354, "learning_rate": 2.4596856501326805e-05, "loss": 1.1983, "step": 9100 }, { "epoch": 5.576128538638103, "grad_norm": 2.541381359100342, "learning_rate": 2.4562835952915563e-05, "loss": 1.211, "step": 9110 }, { "epoch": 5.582249426166794, "grad_norm": 2.687126636505127, "learning_rate": 2.452881540450432e-05, "loss": 1.2138, "step": 9120 }, { "epoch": 5.588370313695486, "grad_norm": 2.71907377243042, "learning_rate": 2.449479485609308e-05, "loss": 1.213, "step": 9130 }, { "epoch": 5.594491201224177, "grad_norm": 2.3322901725769043, "learning_rate": 2.4460774307681842e-05, "loss": 1.1948, "step": 9140 }, { "epoch": 5.600612088752869, "grad_norm": 2.353484630584717, "learning_rate": 2.4426753759270603e-05, "loss": 1.1968, "step": 9150 }, { "epoch": 5.6067329762815605, "grad_norm": 2.285670042037964, "learning_rate": 2.439273321085936e-05, "loss": 1.216, "step": 9160 }, { "epoch": 5.612853863810253, "grad_norm": 2.336120128631592, "learning_rate": 2.4358712662448117e-05, "loss": 1.1931, "step": 9170 }, { "epoch": 5.6189747513389445, "grad_norm": 2.471015453338623, "learning_rate": 2.432469211403688e-05, "loss": 1.1919, "step": 9180 }, { "epoch": 5.625095638867636, "grad_norm": 2.1988890171051025, "learning_rate": 2.429067156562564e-05, "loss": 1.2223, "step": 9190 }, { "epoch": 5.631216526396328, "grad_norm": 1.9588499069213867, "learning_rate": 2.42566510172144e-05, "loss": 1.2036, "step": 9200 }, { "epoch": 5.637337413925019, "grad_norm": 2.280674934387207, "learning_rate": 2.4222630468803157e-05, "loss": 1.1847, "step": 9210 }, { "epoch": 5.643458301453711, "grad_norm": 2.6746459007263184, "learning_rate": 2.4188609920391918e-05, "loss": 1.2244, "step": 9220 }, { "epoch": 5.649579188982402, "grad_norm": 2.370379686355591, "learning_rate": 2.4154589371980676e-05, "loss": 1.204, "step": 9230 }, { "epoch": 5.655700076511094, "grad_norm": 2.2550511360168457, "learning_rate": 2.4120568823569437e-05, "loss": 1.1869, "step": 9240 }, { "epoch": 5.661820964039785, "grad_norm": 2.5164883136749268, "learning_rate": 2.4086548275158197e-05, "loss": 1.2165, "step": 9250 }, { "epoch": 5.667941851568477, "grad_norm": 2.711061954498291, "learning_rate": 2.4052527726746958e-05, "loss": 1.2242, "step": 9260 }, { "epoch": 5.674062739097169, "grad_norm": 2.4196157455444336, "learning_rate": 2.4018507178335716e-05, "loss": 1.2015, "step": 9270 }, { "epoch": 5.680183626625861, "grad_norm": 3.0094361305236816, "learning_rate": 2.3984486629924473e-05, "loss": 1.2095, "step": 9280 }, { "epoch": 5.6863045141545525, "grad_norm": 2.66207218170166, "learning_rate": 2.3950466081513234e-05, "loss": 1.1981, "step": 9290 }, { "epoch": 5.692425401683244, "grad_norm": 2.4135472774505615, "learning_rate": 2.3916445533101995e-05, "loss": 1.2121, "step": 9300 }, { "epoch": 5.698546289211936, "grad_norm": 2.1922266483306885, "learning_rate": 2.3882424984690756e-05, "loss": 1.2119, "step": 9310 }, { "epoch": 5.704667176740627, "grad_norm": 2.5734925270080566, "learning_rate": 2.3848404436279513e-05, "loss": 1.1831, "step": 9320 }, { "epoch": 5.710788064269319, "grad_norm": 2.7392802238464355, "learning_rate": 2.3814383887868274e-05, "loss": 1.2075, "step": 9330 }, { "epoch": 5.716908951798011, "grad_norm": 2.732255458831787, "learning_rate": 2.378036333945703e-05, "loss": 1.2072, "step": 9340 }, { "epoch": 5.723029839326703, "grad_norm": 2.7654104232788086, "learning_rate": 2.3746342791045792e-05, "loss": 1.2062, "step": 9350 }, { "epoch": 5.729150726855394, "grad_norm": 2.4640185832977295, "learning_rate": 2.3712322242634553e-05, "loss": 1.2067, "step": 9360 }, { "epoch": 5.735271614384086, "grad_norm": 2.2052786350250244, "learning_rate": 2.367830169422331e-05, "loss": 1.1818, "step": 9370 }, { "epoch": 5.741392501912777, "grad_norm": 2.3217477798461914, "learning_rate": 2.364428114581207e-05, "loss": 1.2122, "step": 9380 }, { "epoch": 5.747513389441469, "grad_norm": 2.3705313205718994, "learning_rate": 2.3610260597400832e-05, "loss": 1.1926, "step": 9390 }, { "epoch": 5.7536342769701605, "grad_norm": 2.948225259780884, "learning_rate": 2.357624004898959e-05, "loss": 1.2, "step": 9400 }, { "epoch": 5.759755164498852, "grad_norm": 2.4769287109375, "learning_rate": 2.354221950057835e-05, "loss": 1.1832, "step": 9410 }, { "epoch": 5.765876052027544, "grad_norm": 2.542545795440674, "learning_rate": 2.350819895216711e-05, "loss": 1.2188, "step": 9420 }, { "epoch": 5.771996939556235, "grad_norm": 2.491767406463623, "learning_rate": 2.347417840375587e-05, "loss": 1.1808, "step": 9430 }, { "epoch": 5.778117827084928, "grad_norm": 2.5319976806640625, "learning_rate": 2.344015785534463e-05, "loss": 1.2085, "step": 9440 }, { "epoch": 5.784238714613619, "grad_norm": 2.477506399154663, "learning_rate": 2.340613730693339e-05, "loss": 1.1946, "step": 9450 }, { "epoch": 5.790359602142311, "grad_norm": 2.290252685546875, "learning_rate": 2.3372116758522148e-05, "loss": 1.1919, "step": 9460 }, { "epoch": 5.796480489671002, "grad_norm": 2.647599697113037, "learning_rate": 2.333809621011091e-05, "loss": 1.2116, "step": 9470 }, { "epoch": 5.802601377199694, "grad_norm": 2.6891698837280273, "learning_rate": 2.3304075661699666e-05, "loss": 1.1808, "step": 9480 }, { "epoch": 5.808722264728385, "grad_norm": 2.669074773788452, "learning_rate": 2.3270055113288427e-05, "loss": 1.1937, "step": 9490 }, { "epoch": 5.814843152257077, "grad_norm": 2.0883407592773438, "learning_rate": 2.3236034564877188e-05, "loss": 1.1748, "step": 9500 }, { "epoch": 5.820964039785769, "grad_norm": 2.735302448272705, "learning_rate": 2.320201401646595e-05, "loss": 1.2035, "step": 9510 }, { "epoch": 5.827084927314461, "grad_norm": 2.263305187225342, "learning_rate": 2.3167993468054706e-05, "loss": 1.204, "step": 9520 }, { "epoch": 5.8332058148431525, "grad_norm": 2.3452515602111816, "learning_rate": 2.3133972919643463e-05, "loss": 1.2145, "step": 9530 }, { "epoch": 5.839326702371844, "grad_norm": 2.134207010269165, "learning_rate": 2.3099952371232224e-05, "loss": 1.1987, "step": 9540 }, { "epoch": 5.845447589900536, "grad_norm": 2.2933249473571777, "learning_rate": 2.3065931822820985e-05, "loss": 1.2018, "step": 9550 }, { "epoch": 5.851568477429227, "grad_norm": 3.017359972000122, "learning_rate": 2.3031911274409746e-05, "loss": 1.1982, "step": 9560 }, { "epoch": 5.857689364957919, "grad_norm": 2.179676055908203, "learning_rate": 2.2997890725998507e-05, "loss": 1.1911, "step": 9570 }, { "epoch": 5.86381025248661, "grad_norm": 3.0648555755615234, "learning_rate": 2.2963870177587264e-05, "loss": 1.1962, "step": 9580 }, { "epoch": 5.869931140015302, "grad_norm": 2.395503044128418, "learning_rate": 2.292984962917602e-05, "loss": 1.2117, "step": 9590 }, { "epoch": 5.876052027543994, "grad_norm": 2.281404972076416, "learning_rate": 2.2895829080764783e-05, "loss": 1.2103, "step": 9600 }, { "epoch": 5.882172915072686, "grad_norm": 2.796386957168579, "learning_rate": 2.2861808532353543e-05, "loss": 1.2088, "step": 9610 }, { "epoch": 5.888293802601377, "grad_norm": 3.063288927078247, "learning_rate": 2.2827787983942304e-05, "loss": 1.2001, "step": 9620 }, { "epoch": 5.894414690130069, "grad_norm": 2.6174395084381104, "learning_rate": 2.279376743553106e-05, "loss": 1.2125, "step": 9630 }, { "epoch": 5.9005355776587605, "grad_norm": 2.5155296325683594, "learning_rate": 2.275974688711982e-05, "loss": 1.1971, "step": 9640 }, { "epoch": 5.906656465187452, "grad_norm": 2.342050790786743, "learning_rate": 2.272572633870858e-05, "loss": 1.2212, "step": 9650 }, { "epoch": 5.912777352716144, "grad_norm": 2.788961172103882, "learning_rate": 2.269170579029734e-05, "loss": 1.1955, "step": 9660 }, { "epoch": 5.918898240244835, "grad_norm": 2.815035581588745, "learning_rate": 2.26576852418861e-05, "loss": 1.2054, "step": 9670 }, { "epoch": 5.925019127773528, "grad_norm": 2.6851422786712646, "learning_rate": 2.2623664693474862e-05, "loss": 1.2034, "step": 9680 }, { "epoch": 5.931140015302219, "grad_norm": 2.8981006145477295, "learning_rate": 2.258964414506362e-05, "loss": 1.2022, "step": 9690 }, { "epoch": 5.937260902830911, "grad_norm": 2.1799545288085938, "learning_rate": 2.2555623596652377e-05, "loss": 1.1863, "step": 9700 }, { "epoch": 5.943381790359602, "grad_norm": 2.2509403228759766, "learning_rate": 2.2521603048241138e-05, "loss": 1.1952, "step": 9710 }, { "epoch": 5.949502677888294, "grad_norm": 2.3222877979278564, "learning_rate": 2.24875824998299e-05, "loss": 1.2116, "step": 9720 }, { "epoch": 5.955623565416985, "grad_norm": 2.506871223449707, "learning_rate": 2.245356195141866e-05, "loss": 1.1901, "step": 9730 }, { "epoch": 5.961744452945677, "grad_norm": 2.736382484436035, "learning_rate": 2.2419541403007417e-05, "loss": 1.2051, "step": 9740 }, { "epoch": 5.9678653404743685, "grad_norm": 2.47483491897583, "learning_rate": 2.2385520854596175e-05, "loss": 1.2044, "step": 9750 }, { "epoch": 5.97398622800306, "grad_norm": 2.4884045124053955, "learning_rate": 2.2351500306184936e-05, "loss": 1.1871, "step": 9760 }, { "epoch": 5.9801071155317524, "grad_norm": 3.008117437362671, "learning_rate": 2.2317479757773696e-05, "loss": 1.2276, "step": 9770 }, { "epoch": 5.986228003060444, "grad_norm": 2.610323429107666, "learning_rate": 2.2283459209362457e-05, "loss": 1.1854, "step": 9780 }, { "epoch": 5.992348890589136, "grad_norm": 2.8609611988067627, "learning_rate": 2.2249438660951215e-05, "loss": 1.1797, "step": 9790 }, { "epoch": 5.998469778117827, "grad_norm": 2.5393433570861816, "learning_rate": 2.2215418112539975e-05, "loss": 1.2124, "step": 9800 }, { "epoch": 5.999693955623566, "eval_accuracy": 0.49892393924321843, "eval_loss": 1.181107997894287, "eval_runtime": 2638.7084, "eval_samples_per_second": 79.242, "eval_steps_per_second": 0.619, "step": 9802 }, { "epoch": 6.004590665646519, "grad_norm": 2.1775569915771484, "learning_rate": 2.2181397564128733e-05, "loss": 1.1978, "step": 9810 }, { "epoch": 6.01071155317521, "grad_norm": 2.870755434036255, "learning_rate": 2.2147377015717494e-05, "loss": 1.1981, "step": 9820 }, { "epoch": 6.016832440703902, "grad_norm": 2.644536018371582, "learning_rate": 2.2113356467306255e-05, "loss": 1.1809, "step": 9830 }, { "epoch": 6.022953328232593, "grad_norm": 4.089147090911865, "learning_rate": 2.2079335918895015e-05, "loss": 1.1818, "step": 9840 }, { "epoch": 6.029074215761286, "grad_norm": 3.8916430473327637, "learning_rate": 2.2045315370483773e-05, "loss": 1.1734, "step": 9850 }, { "epoch": 6.035195103289977, "grad_norm": 2.7396483421325684, "learning_rate": 2.2011294822072534e-05, "loss": 1.1856, "step": 9860 }, { "epoch": 6.041315990818669, "grad_norm": 3.0049068927764893, "learning_rate": 2.197727427366129e-05, "loss": 1.1725, "step": 9870 }, { "epoch": 6.04743687834736, "grad_norm": 5.130383014678955, "learning_rate": 2.1943253725250052e-05, "loss": 1.1805, "step": 9880 }, { "epoch": 6.053557765876052, "grad_norm": 2.8813374042510986, "learning_rate": 2.1909233176838813e-05, "loss": 1.1973, "step": 9890 }, { "epoch": 6.0596786534047435, "grad_norm": 2.5018324851989746, "learning_rate": 2.187521262842757e-05, "loss": 1.1796, "step": 9900 }, { "epoch": 6.065799540933435, "grad_norm": 3.9136996269226074, "learning_rate": 2.184119208001633e-05, "loss": 1.1864, "step": 9910 }, { "epoch": 6.071920428462127, "grad_norm": 2.804731845855713, "learning_rate": 2.1807171531605092e-05, "loss": 1.1714, "step": 9920 }, { "epoch": 6.078041315990819, "grad_norm": 2.595283269882202, "learning_rate": 2.177315098319385e-05, "loss": 1.2029, "step": 9930 }, { "epoch": 6.084162203519511, "grad_norm": 3.3707568645477295, "learning_rate": 2.173913043478261e-05, "loss": 1.1851, "step": 9940 }, { "epoch": 6.090283091048202, "grad_norm": 2.980250597000122, "learning_rate": 2.1705109886371368e-05, "loss": 1.1693, "step": 9950 }, { "epoch": 6.096403978576894, "grad_norm": 2.76073956489563, "learning_rate": 2.167108933796013e-05, "loss": 1.1947, "step": 9960 }, { "epoch": 6.102524866105585, "grad_norm": 2.2360520362854004, "learning_rate": 2.163706878954889e-05, "loss": 1.185, "step": 9970 }, { "epoch": 6.108645753634277, "grad_norm": 11.473621368408203, "learning_rate": 2.160304824113765e-05, "loss": 1.1858, "step": 9980 }, { "epoch": 6.114766641162968, "grad_norm": 2.612234115600586, "learning_rate": 2.1569027692726408e-05, "loss": 1.1713, "step": 9990 }, { "epoch": 6.12088752869166, "grad_norm": 2.6785049438476562, "learning_rate": 2.153500714431517e-05, "loss": 1.1796, "step": 10000 }, { "epoch": 6.1270084162203515, "grad_norm": 2.6284472942352295, "learning_rate": 2.1500986595903926e-05, "loss": 1.1747, "step": 10010 }, { "epoch": 6.133129303749044, "grad_norm": 3.1232402324676514, "learning_rate": 2.1466966047492687e-05, "loss": 1.1757, "step": 10020 }, { "epoch": 6.1392501912777355, "grad_norm": 2.471266508102417, "learning_rate": 2.1432945499081448e-05, "loss": 1.1757, "step": 10030 }, { "epoch": 6.145371078806427, "grad_norm": 3.1765995025634766, "learning_rate": 2.139892495067021e-05, "loss": 1.1865, "step": 10040 }, { "epoch": 6.151491966335119, "grad_norm": 2.3057191371917725, "learning_rate": 2.1364904402258966e-05, "loss": 1.1802, "step": 10050 }, { "epoch": 6.15761285386381, "grad_norm": 3.5264956951141357, "learning_rate": 2.1330883853847723e-05, "loss": 1.1786, "step": 10060 }, { "epoch": 6.163733741392502, "grad_norm": 2.909853219985962, "learning_rate": 2.1296863305436484e-05, "loss": 1.1902, "step": 10070 }, { "epoch": 6.169854628921193, "grad_norm": 2.270792007446289, "learning_rate": 2.1262842757025245e-05, "loss": 1.1738, "step": 10080 }, { "epoch": 6.175975516449885, "grad_norm": 2.9542202949523926, "learning_rate": 2.1228822208614006e-05, "loss": 1.1752, "step": 10090 }, { "epoch": 6.182096403978577, "grad_norm": 3.13840913772583, "learning_rate": 2.1194801660202763e-05, "loss": 1.1809, "step": 10100 }, { "epoch": 6.188217291507269, "grad_norm": 2.623544216156006, "learning_rate": 2.116078111179152e-05, "loss": 1.1945, "step": 10110 }, { "epoch": 6.19433817903596, "grad_norm": 2.469536304473877, "learning_rate": 2.112676056338028e-05, "loss": 1.1781, "step": 10120 }, { "epoch": 6.200459066564652, "grad_norm": 3.681007146835327, "learning_rate": 2.1092740014969042e-05, "loss": 1.1644, "step": 10130 }, { "epoch": 6.2065799540933435, "grad_norm": 2.528697967529297, "learning_rate": 2.1058719466557803e-05, "loss": 1.1842, "step": 10140 }, { "epoch": 6.212700841622035, "grad_norm": 2.5676088333129883, "learning_rate": 2.1024698918146564e-05, "loss": 1.1688, "step": 10150 }, { "epoch": 6.218821729150727, "grad_norm": 2.5315933227539062, "learning_rate": 2.099067836973532e-05, "loss": 1.1532, "step": 10160 }, { "epoch": 6.224942616679418, "grad_norm": 2.9567384719848633, "learning_rate": 2.095665782132408e-05, "loss": 1.1692, "step": 10170 }, { "epoch": 6.231063504208111, "grad_norm": 2.8193509578704834, "learning_rate": 2.092263727291284e-05, "loss": 1.1883, "step": 10180 }, { "epoch": 6.237184391736802, "grad_norm": 2.6887598037719727, "learning_rate": 2.08886167245016e-05, "loss": 1.1845, "step": 10190 }, { "epoch": 6.243305279265494, "grad_norm": 2.5700228214263916, "learning_rate": 2.085459617609036e-05, "loss": 1.1845, "step": 10200 }, { "epoch": 6.249426166794185, "grad_norm": 2.4592013359069824, "learning_rate": 2.082057562767912e-05, "loss": 1.1918, "step": 10210 }, { "epoch": 6.255547054322877, "grad_norm": 2.9663164615631104, "learning_rate": 2.0786555079267876e-05, "loss": 1.1804, "step": 10220 }, { "epoch": 6.261667941851568, "grad_norm": 3.259889841079712, "learning_rate": 2.0752534530856637e-05, "loss": 1.2066, "step": 10230 }, { "epoch": 6.26778882938026, "grad_norm": 2.5240590572357178, "learning_rate": 2.0718513982445398e-05, "loss": 1.1703, "step": 10240 }, { "epoch": 6.2739097169089515, "grad_norm": 2.6424803733825684, "learning_rate": 2.068449343403416e-05, "loss": 1.1812, "step": 10250 }, { "epoch": 6.280030604437643, "grad_norm": 2.5683681964874268, "learning_rate": 2.065047288562292e-05, "loss": 1.207, "step": 10260 }, { "epoch": 6.2861514919663355, "grad_norm": 3.324641227722168, "learning_rate": 2.0616452337211677e-05, "loss": 1.1763, "step": 10270 }, { "epoch": 6.292272379495027, "grad_norm": 2.700704574584961, "learning_rate": 2.0582431788800434e-05, "loss": 1.1748, "step": 10280 }, { "epoch": 6.298393267023719, "grad_norm": 2.4409584999084473, "learning_rate": 2.0548411240389195e-05, "loss": 1.1982, "step": 10290 }, { "epoch": 6.30451415455241, "grad_norm": 2.4599087238311768, "learning_rate": 2.0514390691977956e-05, "loss": 1.1868, "step": 10300 }, { "epoch": 6.310635042081102, "grad_norm": 2.6646196842193604, "learning_rate": 2.0480370143566717e-05, "loss": 1.1888, "step": 10310 }, { "epoch": 6.316755929609793, "grad_norm": 2.526289463043213, "learning_rate": 2.0446349595155474e-05, "loss": 1.1866, "step": 10320 }, { "epoch": 6.322876817138485, "grad_norm": 2.692807912826538, "learning_rate": 2.0412329046744235e-05, "loss": 1.1929, "step": 10330 }, { "epoch": 6.328997704667176, "grad_norm": 4.2312846183776855, "learning_rate": 2.0378308498332993e-05, "loss": 1.1613, "step": 10340 }, { "epoch": 6.335118592195869, "grad_norm": 2.276723861694336, "learning_rate": 2.0344287949921754e-05, "loss": 1.1885, "step": 10350 }, { "epoch": 6.34123947972456, "grad_norm": 3.342312812805176, "learning_rate": 2.0310267401510514e-05, "loss": 1.1896, "step": 10360 }, { "epoch": 6.347360367253252, "grad_norm": 2.6276004314422607, "learning_rate": 2.0276246853099272e-05, "loss": 1.1788, "step": 10370 }, { "epoch": 6.3534812547819435, "grad_norm": 2.7103192806243896, "learning_rate": 2.0242226304688033e-05, "loss": 1.1777, "step": 10380 }, { "epoch": 6.359602142310635, "grad_norm": 3.6030147075653076, "learning_rate": 2.0208205756276793e-05, "loss": 1.1764, "step": 10390 }, { "epoch": 6.365723029839327, "grad_norm": 2.7777633666992188, "learning_rate": 2.017418520786555e-05, "loss": 1.1853, "step": 10400 }, { "epoch": 6.371843917368018, "grad_norm": 3.4900357723236084, "learning_rate": 2.0140164659454312e-05, "loss": 1.1888, "step": 10410 }, { "epoch": 6.37796480489671, "grad_norm": 3.1087987422943115, "learning_rate": 2.0106144111043073e-05, "loss": 1.1711, "step": 10420 }, { "epoch": 6.384085692425401, "grad_norm": 3.3824617862701416, "learning_rate": 2.007212356263183e-05, "loss": 1.1865, "step": 10430 }, { "epoch": 6.390206579954094, "grad_norm": 4.719346046447754, "learning_rate": 2.003810301422059e-05, "loss": 1.195, "step": 10440 }, { "epoch": 6.396327467482785, "grad_norm": 2.5060925483703613, "learning_rate": 2.000408246580935e-05, "loss": 1.171, "step": 10450 }, { "epoch": 6.402448355011477, "grad_norm": 2.3634145259857178, "learning_rate": 1.997006191739811e-05, "loss": 1.177, "step": 10460 }, { "epoch": 6.408569242540168, "grad_norm": 2.408721685409546, "learning_rate": 1.993604136898687e-05, "loss": 1.1695, "step": 10470 }, { "epoch": 6.41469013006886, "grad_norm": 2.7625339031219482, "learning_rate": 1.9902020820575627e-05, "loss": 1.1819, "step": 10480 }, { "epoch": 6.4208110175975515, "grad_norm": 2.217163562774658, "learning_rate": 1.9868000272164388e-05, "loss": 1.1802, "step": 10490 }, { "epoch": 6.426931905126243, "grad_norm": 2.648359775543213, "learning_rate": 1.983397972375315e-05, "loss": 1.1587, "step": 10500 }, { "epoch": 6.433052792654935, "grad_norm": 2.5863988399505615, "learning_rate": 1.9799959175341907e-05, "loss": 1.1944, "step": 10510 }, { "epoch": 6.439173680183627, "grad_norm": 2.959826707839966, "learning_rate": 1.9765938626930667e-05, "loss": 1.1812, "step": 10520 }, { "epoch": 6.445294567712319, "grad_norm": 2.727541208267212, "learning_rate": 1.9731918078519425e-05, "loss": 1.1546, "step": 10530 }, { "epoch": 6.45141545524101, "grad_norm": 2.1876068115234375, "learning_rate": 1.9697897530108186e-05, "loss": 1.1914, "step": 10540 }, { "epoch": 6.457536342769702, "grad_norm": 3.7542362213134766, "learning_rate": 1.9663876981696946e-05, "loss": 1.1692, "step": 10550 }, { "epoch": 6.463657230298393, "grad_norm": 2.905738353729248, "learning_rate": 1.9629856433285707e-05, "loss": 1.1724, "step": 10560 }, { "epoch": 6.469778117827085, "grad_norm": 3.0283126831054688, "learning_rate": 1.9595835884874465e-05, "loss": 1.1866, "step": 10570 }, { "epoch": 6.475899005355776, "grad_norm": 2.3837485313415527, "learning_rate": 1.9561815336463226e-05, "loss": 1.2031, "step": 10580 }, { "epoch": 6.482019892884468, "grad_norm": 2.557549476623535, "learning_rate": 1.9527794788051983e-05, "loss": 1.184, "step": 10590 }, { "epoch": 6.4881407804131594, "grad_norm": 2.638843059539795, "learning_rate": 1.9493774239640744e-05, "loss": 1.165, "step": 10600 }, { "epoch": 6.494261667941852, "grad_norm": 2.693385601043701, "learning_rate": 1.9459753691229505e-05, "loss": 1.1862, "step": 10610 }, { "epoch": 6.5003825554705434, "grad_norm": 3.040398597717285, "learning_rate": 1.9425733142818266e-05, "loss": 1.1936, "step": 10620 }, { "epoch": 6.506503442999235, "grad_norm": 2.483701229095459, "learning_rate": 1.9391712594407023e-05, "loss": 1.1766, "step": 10630 }, { "epoch": 6.5126243305279266, "grad_norm": 3.3055386543273926, "learning_rate": 1.935769204599578e-05, "loss": 1.1818, "step": 10640 }, { "epoch": 6.518745218056618, "grad_norm": 2.60319447517395, "learning_rate": 1.932367149758454e-05, "loss": 1.1672, "step": 10650 }, { "epoch": 6.52486610558531, "grad_norm": 2.968649387359619, "learning_rate": 1.9289650949173302e-05, "loss": 1.1689, "step": 10660 }, { "epoch": 6.530986993114001, "grad_norm": 2.3525383472442627, "learning_rate": 1.9255630400762063e-05, "loss": 1.174, "step": 10670 }, { "epoch": 6.537107880642694, "grad_norm": 2.5817339420318604, "learning_rate": 1.922160985235082e-05, "loss": 1.1747, "step": 10680 }, { "epoch": 6.543228768171385, "grad_norm": 2.791980743408203, "learning_rate": 1.9187589303939578e-05, "loss": 1.168, "step": 10690 }, { "epoch": 6.549349655700077, "grad_norm": 2.451387405395508, "learning_rate": 1.915356875552834e-05, "loss": 1.1625, "step": 10700 }, { "epoch": 6.555470543228768, "grad_norm": 3.5441620349884033, "learning_rate": 1.91195482071171e-05, "loss": 1.1739, "step": 10710 }, { "epoch": 6.56159143075746, "grad_norm": 2.8768322467803955, "learning_rate": 1.908552765870586e-05, "loss": 1.1802, "step": 10720 }, { "epoch": 6.567712318286151, "grad_norm": 3.1151416301727295, "learning_rate": 1.905150711029462e-05, "loss": 1.2072, "step": 10730 }, { "epoch": 6.573833205814843, "grad_norm": 2.7969307899475098, "learning_rate": 1.901748656188338e-05, "loss": 1.1929, "step": 10740 }, { "epoch": 6.5799540933435345, "grad_norm": 2.526097059249878, "learning_rate": 1.8983466013472136e-05, "loss": 1.1804, "step": 10750 }, { "epoch": 6.586074980872226, "grad_norm": 3.3882269859313965, "learning_rate": 1.8949445465060897e-05, "loss": 1.1857, "step": 10760 }, { "epoch": 6.592195868400918, "grad_norm": 2.8102879524230957, "learning_rate": 1.8915424916649658e-05, "loss": 1.1749, "step": 10770 }, { "epoch": 6.59831675592961, "grad_norm": 2.504945755004883, "learning_rate": 1.888140436823842e-05, "loss": 1.1949, "step": 10780 }, { "epoch": 6.604437643458302, "grad_norm": 2.4723777770996094, "learning_rate": 1.8847383819827176e-05, "loss": 1.1984, "step": 10790 }, { "epoch": 6.610558530986993, "grad_norm": 2.8720338344573975, "learning_rate": 1.8813363271415937e-05, "loss": 1.1748, "step": 10800 }, { "epoch": 6.616679418515685, "grad_norm": 2.2344985008239746, "learning_rate": 1.8779342723004694e-05, "loss": 1.1727, "step": 10810 }, { "epoch": 6.622800306044376, "grad_norm": 2.382798433303833, "learning_rate": 1.8745322174593455e-05, "loss": 1.1895, "step": 10820 }, { "epoch": 6.628921193573068, "grad_norm": 3.054856777191162, "learning_rate": 1.8711301626182216e-05, "loss": 1.1806, "step": 10830 }, { "epoch": 6.635042081101759, "grad_norm": 2.8418335914611816, "learning_rate": 1.8677281077770973e-05, "loss": 1.1927, "step": 10840 }, { "epoch": 6.641162968630452, "grad_norm": 3.4169845581054688, "learning_rate": 1.8643260529359734e-05, "loss": 1.1948, "step": 10850 }, { "epoch": 6.647283856159143, "grad_norm": 2.521684169769287, "learning_rate": 1.860923998094849e-05, "loss": 1.1669, "step": 10860 }, { "epoch": 6.653404743687835, "grad_norm": 2.434762954711914, "learning_rate": 1.8575219432537252e-05, "loss": 1.1578, "step": 10870 }, { "epoch": 6.6595256312165265, "grad_norm": 2.8318793773651123, "learning_rate": 1.8541198884126013e-05, "loss": 1.1674, "step": 10880 }, { "epoch": 6.665646518745218, "grad_norm": 2.788933038711548, "learning_rate": 1.8507178335714774e-05, "loss": 1.169, "step": 10890 }, { "epoch": 6.67176740627391, "grad_norm": 2.764143943786621, "learning_rate": 1.847315778730353e-05, "loss": 1.1838, "step": 10900 }, { "epoch": 6.677888293802601, "grad_norm": 2.8891654014587402, "learning_rate": 1.8439137238892292e-05, "loss": 1.175, "step": 10910 }, { "epoch": 6.684009181331293, "grad_norm": 2.8268089294433594, "learning_rate": 1.840511669048105e-05, "loss": 1.1808, "step": 10920 }, { "epoch": 6.690130068859984, "grad_norm": 2.4628899097442627, "learning_rate": 1.837109614206981e-05, "loss": 1.187, "step": 10930 }, { "epoch": 6.696250956388677, "grad_norm": 2.6242809295654297, "learning_rate": 1.833707559365857e-05, "loss": 1.1686, "step": 10940 }, { "epoch": 6.702371843917368, "grad_norm": 2.548687696456909, "learning_rate": 1.830305504524733e-05, "loss": 1.1736, "step": 10950 }, { "epoch": 6.70849273144606, "grad_norm": 3.2288942337036133, "learning_rate": 1.826903449683609e-05, "loss": 1.174, "step": 10960 }, { "epoch": 6.714613618974751, "grad_norm": 3.222327947616577, "learning_rate": 1.823501394842485e-05, "loss": 1.1773, "step": 10970 }, { "epoch": 6.720734506503443, "grad_norm": 3.697409152984619, "learning_rate": 1.8200993400013608e-05, "loss": 1.1763, "step": 10980 }, { "epoch": 6.7268553940321345, "grad_norm": 2.843804359436035, "learning_rate": 1.816697285160237e-05, "loss": 1.1649, "step": 10990 }, { "epoch": 6.732976281560826, "grad_norm": 2.6715619564056396, "learning_rate": 1.8132952303191126e-05, "loss": 1.1723, "step": 11000 }, { "epoch": 6.739097169089518, "grad_norm": 6.662703514099121, "learning_rate": 1.8098931754779887e-05, "loss": 1.1733, "step": 11010 }, { "epoch": 6.74521805661821, "grad_norm": 2.99680757522583, "learning_rate": 1.8064911206368648e-05, "loss": 1.1768, "step": 11020 }, { "epoch": 6.751338944146902, "grad_norm": 2.3245575428009033, "learning_rate": 1.803089065795741e-05, "loss": 1.1777, "step": 11030 }, { "epoch": 6.757459831675593, "grad_norm": 2.462326765060425, "learning_rate": 1.7996870109546166e-05, "loss": 1.1786, "step": 11040 }, { "epoch": 6.763580719204285, "grad_norm": 2.6256535053253174, "learning_rate": 1.7962849561134927e-05, "loss": 1.1995, "step": 11050 }, { "epoch": 6.769701606732976, "grad_norm": 3.1199820041656494, "learning_rate": 1.7928829012723685e-05, "loss": 1.1864, "step": 11060 }, { "epoch": 6.775822494261668, "grad_norm": 2.832915782928467, "learning_rate": 1.7894808464312445e-05, "loss": 1.1835, "step": 11070 }, { "epoch": 6.781943381790359, "grad_norm": 3.149301052093506, "learning_rate": 1.7860787915901206e-05, "loss": 1.1758, "step": 11080 }, { "epoch": 6.788064269319051, "grad_norm": 2.3810904026031494, "learning_rate": 1.7826767367489967e-05, "loss": 1.1814, "step": 11090 }, { "epoch": 6.7941851568477425, "grad_norm": 3.023635149002075, "learning_rate": 1.7792746819078725e-05, "loss": 1.2116, "step": 11100 }, { "epoch": 6.800306044376435, "grad_norm": 2.3528966903686523, "learning_rate": 1.7758726270667482e-05, "loss": 1.1843, "step": 11110 }, { "epoch": 6.8064269319051265, "grad_norm": 2.8839402198791504, "learning_rate": 1.7724705722256243e-05, "loss": 1.1801, "step": 11120 }, { "epoch": 6.812547819433818, "grad_norm": 2.401827573776245, "learning_rate": 1.7690685173845004e-05, "loss": 1.1914, "step": 11130 }, { "epoch": 6.81866870696251, "grad_norm": 2.917004346847534, "learning_rate": 1.7656664625433764e-05, "loss": 1.1822, "step": 11140 }, { "epoch": 6.824789594491201, "grad_norm": 2.9332504272460938, "learning_rate": 1.7622644077022525e-05, "loss": 1.1879, "step": 11150 }, { "epoch": 6.830910482019893, "grad_norm": 2.532586097717285, "learning_rate": 1.758862352861128e-05, "loss": 1.1712, "step": 11160 }, { "epoch": 6.837031369548584, "grad_norm": 2.5997772216796875, "learning_rate": 1.755460298020004e-05, "loss": 1.1806, "step": 11170 }, { "epoch": 6.843152257077277, "grad_norm": 2.8061439990997314, "learning_rate": 1.75205824317888e-05, "loss": 1.1824, "step": 11180 }, { "epoch": 6.849273144605968, "grad_norm": 2.2987356185913086, "learning_rate": 1.7486561883377562e-05, "loss": 1.1849, "step": 11190 }, { "epoch": 6.85539403213466, "grad_norm": 2.5757126808166504, "learning_rate": 1.7452541334966323e-05, "loss": 1.1794, "step": 11200 }, { "epoch": 6.861514919663351, "grad_norm": 3.3306543827056885, "learning_rate": 1.741852078655508e-05, "loss": 1.1942, "step": 11210 }, { "epoch": 6.867635807192043, "grad_norm": 2.8044543266296387, "learning_rate": 1.7384500238143838e-05, "loss": 1.1695, "step": 11220 }, { "epoch": 6.8737566947207345, "grad_norm": 2.9289653301239014, "learning_rate": 1.73504796897326e-05, "loss": 1.165, "step": 11230 }, { "epoch": 6.879877582249426, "grad_norm": 2.7337334156036377, "learning_rate": 1.731645914132136e-05, "loss": 1.1732, "step": 11240 }, { "epoch": 6.885998469778118, "grad_norm": 2.9211790561676025, "learning_rate": 1.728243859291012e-05, "loss": 1.1905, "step": 11250 }, { "epoch": 6.892119357306809, "grad_norm": 2.608776092529297, "learning_rate": 1.7248418044498878e-05, "loss": 1.1794, "step": 11260 }, { "epoch": 6.898240244835501, "grad_norm": 2.365649938583374, "learning_rate": 1.7214397496087635e-05, "loss": 1.1882, "step": 11270 }, { "epoch": 6.904361132364193, "grad_norm": 2.889016628265381, "learning_rate": 1.7180376947676396e-05, "loss": 1.1644, "step": 11280 }, { "epoch": 6.910482019892885, "grad_norm": 2.8026652336120605, "learning_rate": 1.7146356399265157e-05, "loss": 1.1785, "step": 11290 }, { "epoch": 6.916602907421576, "grad_norm": 2.359157085418701, "learning_rate": 1.7112335850853917e-05, "loss": 1.1581, "step": 11300 }, { "epoch": 6.922723794950268, "grad_norm": 2.2410833835601807, "learning_rate": 1.707831530244268e-05, "loss": 1.1835, "step": 11310 }, { "epoch": 6.928844682478959, "grad_norm": 3.33797287940979, "learning_rate": 1.7044294754031436e-05, "loss": 1.2033, "step": 11320 }, { "epoch": 6.934965570007651, "grad_norm": 2.6676926612854004, "learning_rate": 1.7010274205620193e-05, "loss": 1.1759, "step": 11330 }, { "epoch": 6.9410864575363425, "grad_norm": 2.5464632511138916, "learning_rate": 1.6976253657208954e-05, "loss": 1.178, "step": 11340 }, { "epoch": 6.947207345065035, "grad_norm": 2.017395496368408, "learning_rate": 1.6942233108797715e-05, "loss": 1.1843, "step": 11350 }, { "epoch": 6.9533282325937265, "grad_norm": 3.047525405883789, "learning_rate": 1.6908212560386476e-05, "loss": 1.1589, "step": 11360 }, { "epoch": 6.959449120122418, "grad_norm": 3.0717241764068604, "learning_rate": 1.6874192011975233e-05, "loss": 1.1871, "step": 11370 }, { "epoch": 6.96557000765111, "grad_norm": 3.1424989700317383, "learning_rate": 1.6840171463563994e-05, "loss": 1.179, "step": 11380 }, { "epoch": 6.971690895179801, "grad_norm": 2.526454210281372, "learning_rate": 1.680615091515275e-05, "loss": 1.1832, "step": 11390 }, { "epoch": 6.977811782708493, "grad_norm": 2.4816434383392334, "learning_rate": 1.6772130366741512e-05, "loss": 1.1773, "step": 11400 }, { "epoch": 6.983932670237184, "grad_norm": 2.4473142623901367, "learning_rate": 1.6738109818330273e-05, "loss": 1.1609, "step": 11410 }, { "epoch": 6.990053557765876, "grad_norm": 4.933506488800049, "learning_rate": 1.670408926991903e-05, "loss": 1.1895, "step": 11420 }, { "epoch": 6.996174445294567, "grad_norm": 2.4855034351348877, "learning_rate": 1.667006872150779e-05, "loss": 1.1753, "step": 11430 }, { "epoch": 6.9998469778117824, "eval_accuracy": 0.506212457435819, "eval_loss": 1.1684563159942627, "eval_runtime": 2638.0369, "eval_samples_per_second": 79.262, "eval_steps_per_second": 0.619, "step": 11436 }, { "epoch": 7.00229533282326, "grad_norm": 2.4456984996795654, "learning_rate": 1.6636048173096552e-05, "loss": 1.1607, "step": 11440 }, { "epoch": 7.008416220351951, "grad_norm": 2.768742322921753, "learning_rate": 1.660202762468531e-05, "loss": 1.1586, "step": 11450 }, { "epoch": 7.014537107880643, "grad_norm": 6.783010482788086, "learning_rate": 1.656800707627407e-05, "loss": 1.1458, "step": 11460 }, { "epoch": 7.020657995409334, "grad_norm": 3.2379367351531982, "learning_rate": 1.653398652786283e-05, "loss": 1.156, "step": 11470 }, { "epoch": 7.026778882938026, "grad_norm": 2.605741262435913, "learning_rate": 1.649996597945159e-05, "loss": 1.1538, "step": 11480 }, { "epoch": 7.0328997704667175, "grad_norm": 2.8403258323669434, "learning_rate": 1.646594543104035e-05, "loss": 1.1369, "step": 11490 }, { "epoch": 7.039020657995409, "grad_norm": 2.7052204608917236, "learning_rate": 1.643192488262911e-05, "loss": 1.1392, "step": 11500 }, { "epoch": 7.045141545524101, "grad_norm": 2.6772308349609375, "learning_rate": 1.6397904334217868e-05, "loss": 1.159, "step": 11510 }, { "epoch": 7.051262433052793, "grad_norm": 2.957548141479492, "learning_rate": 1.636388378580663e-05, "loss": 1.147, "step": 11520 }, { "epoch": 7.057383320581485, "grad_norm": 3.451127767562866, "learning_rate": 1.6329863237395386e-05, "loss": 1.1674, "step": 11530 }, { "epoch": 7.063504208110176, "grad_norm": 2.7239480018615723, "learning_rate": 1.6295842688984147e-05, "loss": 1.1764, "step": 11540 }, { "epoch": 7.069625095638868, "grad_norm": 2.3374781608581543, "learning_rate": 1.6261822140572908e-05, "loss": 1.1757, "step": 11550 }, { "epoch": 7.075745983167559, "grad_norm": 3.032133102416992, "learning_rate": 1.622780159216167e-05, "loss": 1.1412, "step": 11560 }, { "epoch": 7.081866870696251, "grad_norm": 2.7997188568115234, "learning_rate": 1.6193781043750426e-05, "loss": 1.1475, "step": 11570 }, { "epoch": 7.087987758224942, "grad_norm": 2.812303304672241, "learning_rate": 1.6159760495339184e-05, "loss": 1.1512, "step": 11580 }, { "epoch": 7.094108645753634, "grad_norm": 3.3279213905334473, "learning_rate": 1.6125739946927944e-05, "loss": 1.1652, "step": 11590 }, { "epoch": 7.1002295332823255, "grad_norm": 3.0579566955566406, "learning_rate": 1.6091719398516705e-05, "loss": 1.1648, "step": 11600 }, { "epoch": 7.106350420811018, "grad_norm": 3.063218355178833, "learning_rate": 1.6057698850105466e-05, "loss": 1.1553, "step": 11610 }, { "epoch": 7.1124713083397095, "grad_norm": 4.288387775421143, "learning_rate": 1.6023678301694227e-05, "loss": 1.1675, "step": 11620 }, { "epoch": 7.118592195868401, "grad_norm": 2.884408473968506, "learning_rate": 1.5989657753282984e-05, "loss": 1.1605, "step": 11630 }, { "epoch": 7.124713083397093, "grad_norm": 3.101125478744507, "learning_rate": 1.5955637204871742e-05, "loss": 1.1643, "step": 11640 }, { "epoch": 7.130833970925784, "grad_norm": 2.6853866577148438, "learning_rate": 1.5921616656460503e-05, "loss": 1.1542, "step": 11650 }, { "epoch": 7.136954858454476, "grad_norm": 3.852677822113037, "learning_rate": 1.5887596108049263e-05, "loss": 1.1571, "step": 11660 }, { "epoch": 7.143075745983167, "grad_norm": 3.694631814956665, "learning_rate": 1.5853575559638024e-05, "loss": 1.1841, "step": 11670 }, { "epoch": 7.149196633511859, "grad_norm": 3.0455989837646484, "learning_rate": 1.5819555011226782e-05, "loss": 1.1337, "step": 11680 }, { "epoch": 7.155317521040551, "grad_norm": 2.740424394607544, "learning_rate": 1.578553446281554e-05, "loss": 1.1515, "step": 11690 }, { "epoch": 7.161438408569243, "grad_norm": 2.8961853981018066, "learning_rate": 1.57515139144043e-05, "loss": 1.1851, "step": 11700 }, { "epoch": 7.167559296097934, "grad_norm": 2.4369847774505615, "learning_rate": 1.571749336599306e-05, "loss": 1.1595, "step": 11710 }, { "epoch": 7.173680183626626, "grad_norm": 2.7445836067199707, "learning_rate": 1.568347281758182e-05, "loss": 1.15, "step": 11720 }, { "epoch": 7.1798010711553175, "grad_norm": 2.5091135501861572, "learning_rate": 1.5649452269170582e-05, "loss": 1.173, "step": 11730 }, { "epoch": 7.185921958684009, "grad_norm": 2.7890584468841553, "learning_rate": 1.5615431720759337e-05, "loss": 1.1538, "step": 11740 }, { "epoch": 7.192042846212701, "grad_norm": 2.6841843128204346, "learning_rate": 1.5581411172348097e-05, "loss": 1.1627, "step": 11750 }, { "epoch": 7.198163733741392, "grad_norm": 3.1628477573394775, "learning_rate": 1.5547390623936858e-05, "loss": 1.1598, "step": 11760 }, { "epoch": 7.204284621270084, "grad_norm": 3.3740761280059814, "learning_rate": 1.551337007552562e-05, "loss": 1.1385, "step": 11770 }, { "epoch": 7.210405508798776, "grad_norm": 3.2839245796203613, "learning_rate": 1.547934952711438e-05, "loss": 1.1818, "step": 11780 }, { "epoch": 7.216526396327468, "grad_norm": 2.728667736053467, "learning_rate": 1.5445328978703137e-05, "loss": 1.1555, "step": 11790 }, { "epoch": 7.222647283856159, "grad_norm": 2.8254268169403076, "learning_rate": 1.5411308430291895e-05, "loss": 1.1651, "step": 11800 }, { "epoch": 7.228768171384851, "grad_norm": 2.361497640609741, "learning_rate": 1.5377287881880656e-05, "loss": 1.1478, "step": 11810 }, { "epoch": 7.234889058913542, "grad_norm": 2.7829697132110596, "learning_rate": 1.5343267333469416e-05, "loss": 1.1575, "step": 11820 }, { "epoch": 7.241009946442234, "grad_norm": 3.025606870651245, "learning_rate": 1.5309246785058177e-05, "loss": 1.1693, "step": 11830 }, { "epoch": 7.2471308339709255, "grad_norm": 2.929812431335449, "learning_rate": 1.5275226236646935e-05, "loss": 1.1697, "step": 11840 }, { "epoch": 7.253251721499617, "grad_norm": 2.4072935581207275, "learning_rate": 1.5241205688235696e-05, "loss": 1.1605, "step": 11850 }, { "epoch": 7.2593726090283095, "grad_norm": 3.0431549549102783, "learning_rate": 1.5207185139824453e-05, "loss": 1.1653, "step": 11860 }, { "epoch": 7.265493496557001, "grad_norm": 2.6219193935394287, "learning_rate": 1.5173164591413214e-05, "loss": 1.1629, "step": 11870 }, { "epoch": 7.271614384085693, "grad_norm": 4.558614253997803, "learning_rate": 1.5139144043001973e-05, "loss": 1.1462, "step": 11880 }, { "epoch": 7.277735271614384, "grad_norm": 3.1963789463043213, "learning_rate": 1.5105123494590734e-05, "loss": 1.1594, "step": 11890 }, { "epoch": 7.283856159143076, "grad_norm": 2.740114212036133, "learning_rate": 1.5071102946179495e-05, "loss": 1.1599, "step": 11900 }, { "epoch": 7.289977046671767, "grad_norm": 2.688758134841919, "learning_rate": 1.5037082397768254e-05, "loss": 1.1705, "step": 11910 }, { "epoch": 7.296097934200459, "grad_norm": 3.089334011077881, "learning_rate": 1.5003061849357011e-05, "loss": 1.1837, "step": 11920 }, { "epoch": 7.30221882172915, "grad_norm": 2.151132822036743, "learning_rate": 1.4969041300945772e-05, "loss": 1.1793, "step": 11930 }, { "epoch": 7.308339709257842, "grad_norm": 2.6404690742492676, "learning_rate": 1.4935020752534531e-05, "loss": 1.1483, "step": 11940 }, { "epoch": 7.314460596786534, "grad_norm": 3.6337945461273193, "learning_rate": 1.4901000204123292e-05, "loss": 1.178, "step": 11950 }, { "epoch": 7.320581484315226, "grad_norm": 3.0208799839019775, "learning_rate": 1.4866979655712051e-05, "loss": 1.1774, "step": 11960 }, { "epoch": 7.3267023718439175, "grad_norm": 2.9319422245025635, "learning_rate": 1.4832959107300812e-05, "loss": 1.1627, "step": 11970 }, { "epoch": 7.332823259372609, "grad_norm": 2.71775484085083, "learning_rate": 1.479893855888957e-05, "loss": 1.1509, "step": 11980 }, { "epoch": 7.338944146901301, "grad_norm": 2.7247722148895264, "learning_rate": 1.4764918010478329e-05, "loss": 1.151, "step": 11990 }, { "epoch": 7.345065034429992, "grad_norm": 2.7740538120269775, "learning_rate": 1.473089746206709e-05, "loss": 1.1558, "step": 12000 }, { "epoch": 7.351185921958684, "grad_norm": 2.2844157218933105, "learning_rate": 1.4696876913655849e-05, "loss": 1.1709, "step": 12010 }, { "epoch": 7.357306809487375, "grad_norm": 2.582582950592041, "learning_rate": 1.466285636524461e-05, "loss": 1.1666, "step": 12020 }, { "epoch": 7.363427697016068, "grad_norm": 2.9742329120635986, "learning_rate": 1.4628835816833367e-05, "loss": 1.1523, "step": 12030 }, { "epoch": 7.369548584544759, "grad_norm": 3.2235240936279297, "learning_rate": 1.4594815268422126e-05, "loss": 1.1655, "step": 12040 }, { "epoch": 7.375669472073451, "grad_norm": 2.9418814182281494, "learning_rate": 1.4560794720010887e-05, "loss": 1.1478, "step": 12050 }, { "epoch": 7.381790359602142, "grad_norm": 2.813707113265991, "learning_rate": 1.4526774171599648e-05, "loss": 1.1548, "step": 12060 }, { "epoch": 7.387911247130834, "grad_norm": 3.019864082336426, "learning_rate": 1.4492753623188407e-05, "loss": 1.1583, "step": 12070 }, { "epoch": 7.3940321346595255, "grad_norm": 2.753734588623047, "learning_rate": 1.4458733074777168e-05, "loss": 1.1695, "step": 12080 }, { "epoch": 7.400153022188217, "grad_norm": 2.878682851791382, "learning_rate": 1.4424712526365925e-05, "loss": 1.1662, "step": 12090 }, { "epoch": 7.406273909716909, "grad_norm": 2.875643730163574, "learning_rate": 1.4390691977954684e-05, "loss": 1.1695, "step": 12100 }, { "epoch": 7.412394797245601, "grad_norm": 2.6274614334106445, "learning_rate": 1.4356671429543445e-05, "loss": 1.1682, "step": 12110 }, { "epoch": 7.418515684774293, "grad_norm": 3.541102647781372, "learning_rate": 1.4322650881132204e-05, "loss": 1.1555, "step": 12120 }, { "epoch": 7.424636572302984, "grad_norm": 2.542127847671509, "learning_rate": 1.4288630332720965e-05, "loss": 1.1637, "step": 12130 }, { "epoch": 7.430757459831676, "grad_norm": 3.612274169921875, "learning_rate": 1.4254609784309724e-05, "loss": 1.1533, "step": 12140 }, { "epoch": 7.436878347360367, "grad_norm": 2.997279405593872, "learning_rate": 1.4220589235898482e-05, "loss": 1.1724, "step": 12150 }, { "epoch": 7.442999234889059, "grad_norm": 2.5861494541168213, "learning_rate": 1.4186568687487242e-05, "loss": 1.1835, "step": 12160 }, { "epoch": 7.44912012241775, "grad_norm": 2.768721342086792, "learning_rate": 1.4152548139076002e-05, "loss": 1.1371, "step": 12170 }, { "epoch": 7.455241009946442, "grad_norm": 3.0678112506866455, "learning_rate": 1.4118527590664762e-05, "loss": 1.1829, "step": 12180 }, { "epoch": 7.461361897475134, "grad_norm": 2.529768228530884, "learning_rate": 1.4084507042253523e-05, "loss": 1.1469, "step": 12190 }, { "epoch": 7.467482785003826, "grad_norm": 2.7529709339141846, "learning_rate": 1.4050486493842282e-05, "loss": 1.1494, "step": 12200 }, { "epoch": 7.4736036725325175, "grad_norm": 3.012122631072998, "learning_rate": 1.401646594543104e-05, "loss": 1.1716, "step": 12210 }, { "epoch": 7.479724560061209, "grad_norm": 2.9628913402557373, "learning_rate": 1.39824453970198e-05, "loss": 1.1623, "step": 12220 }, { "epoch": 7.485845447589901, "grad_norm": 2.6262001991271973, "learning_rate": 1.394842484860856e-05, "loss": 1.1684, "step": 12230 }, { "epoch": 7.491966335118592, "grad_norm": 3.49383544921875, "learning_rate": 1.391440430019732e-05, "loss": 1.1686, "step": 12240 }, { "epoch": 7.498087222647284, "grad_norm": 2.6639795303344727, "learning_rate": 1.388038375178608e-05, "loss": 1.1402, "step": 12250 }, { "epoch": 7.504208110175975, "grad_norm": 3.079808235168457, "learning_rate": 1.384636320337484e-05, "loss": 1.168, "step": 12260 }, { "epoch": 7.510328997704667, "grad_norm": 2.770954132080078, "learning_rate": 1.3812342654963598e-05, "loss": 1.1693, "step": 12270 }, { "epoch": 7.516449885233359, "grad_norm": 3.519986629486084, "learning_rate": 1.3778322106552357e-05, "loss": 1.147, "step": 12280 }, { "epoch": 7.522570772762051, "grad_norm": 2.8663604259490967, "learning_rate": 1.3744301558141118e-05, "loss": 1.1609, "step": 12290 }, { "epoch": 7.528691660290742, "grad_norm": 3.6388792991638184, "learning_rate": 1.3710281009729877e-05, "loss": 1.1953, "step": 12300 }, { "epoch": 7.534812547819434, "grad_norm": 2.5936012268066406, "learning_rate": 1.3676260461318638e-05, "loss": 1.1634, "step": 12310 }, { "epoch": 7.540933435348125, "grad_norm": 2.83449387550354, "learning_rate": 1.3642239912907399e-05, "loss": 1.1716, "step": 12320 }, { "epoch": 7.547054322876817, "grad_norm": 3.5120432376861572, "learning_rate": 1.3608219364496155e-05, "loss": 1.179, "step": 12330 }, { "epoch": 7.5531752104055085, "grad_norm": 2.879821300506592, "learning_rate": 1.3574198816084915e-05, "loss": 1.1607, "step": 12340 }, { "epoch": 7.5592960979342, "grad_norm": 3.0477402210235596, "learning_rate": 1.3540178267673676e-05, "loss": 1.1605, "step": 12350 }, { "epoch": 7.5654169854628925, "grad_norm": 2.9386744499206543, "learning_rate": 1.3506157719262435e-05, "loss": 1.1582, "step": 12360 }, { "epoch": 7.571537872991584, "grad_norm": 2.8157691955566406, "learning_rate": 1.3472137170851196e-05, "loss": 1.1514, "step": 12370 }, { "epoch": 7.577658760520276, "grad_norm": 3.2182819843292236, "learning_rate": 1.3438116622439955e-05, "loss": 1.1571, "step": 12380 }, { "epoch": 7.583779648048967, "grad_norm": 3.252556562423706, "learning_rate": 1.3404096074028713e-05, "loss": 1.1616, "step": 12390 }, { "epoch": 7.589900535577659, "grad_norm": 2.7979633808135986, "learning_rate": 1.3370075525617474e-05, "loss": 1.1643, "step": 12400 }, { "epoch": 7.59602142310635, "grad_norm": 2.8204915523529053, "learning_rate": 1.3336054977206233e-05, "loss": 1.1598, "step": 12410 }, { "epoch": 7.602142310635042, "grad_norm": 2.7400918006896973, "learning_rate": 1.3302034428794994e-05, "loss": 1.1792, "step": 12420 }, { "epoch": 7.608263198163733, "grad_norm": 2.467127561569214, "learning_rate": 1.3268013880383753e-05, "loss": 1.1724, "step": 12430 }, { "epoch": 7.614384085692425, "grad_norm": 2.994223117828369, "learning_rate": 1.323399333197251e-05, "loss": 1.1749, "step": 12440 }, { "epoch": 7.620504973221117, "grad_norm": 2.5678718090057373, "learning_rate": 1.3199972783561271e-05, "loss": 1.1516, "step": 12450 }, { "epoch": 7.626625860749809, "grad_norm": 2.963792085647583, "learning_rate": 1.316595223515003e-05, "loss": 1.1915, "step": 12460 }, { "epoch": 7.6327467482785005, "grad_norm": 3.1967992782592773, "learning_rate": 1.3131931686738791e-05, "loss": 1.1713, "step": 12470 }, { "epoch": 7.638867635807192, "grad_norm": 2.8124046325683594, "learning_rate": 1.3097911138327552e-05, "loss": 1.1524, "step": 12480 }, { "epoch": 7.644988523335884, "grad_norm": 2.9094791412353516, "learning_rate": 1.3063890589916311e-05, "loss": 1.1482, "step": 12490 }, { "epoch": 7.651109410864575, "grad_norm": 3.05389142036438, "learning_rate": 1.3029870041505068e-05, "loss": 1.1529, "step": 12500 }, { "epoch": 7.657230298393267, "grad_norm": 2.79539155960083, "learning_rate": 1.299584949309383e-05, "loss": 1.1594, "step": 12510 }, { "epoch": 7.663351185921958, "grad_norm": 2.77883243560791, "learning_rate": 1.2961828944682588e-05, "loss": 1.1402, "step": 12520 }, { "epoch": 7.669472073450651, "grad_norm": 3.6547787189483643, "learning_rate": 1.292780839627135e-05, "loss": 1.1764, "step": 12530 }, { "epoch": 7.675592960979342, "grad_norm": 3.068427801132202, "learning_rate": 1.2893787847860108e-05, "loss": 1.1619, "step": 12540 }, { "epoch": 7.681713848508034, "grad_norm": 3.1389434337615967, "learning_rate": 1.285976729944887e-05, "loss": 1.1603, "step": 12550 }, { "epoch": 7.687834736036725, "grad_norm": 2.6703555583953857, "learning_rate": 1.2825746751037627e-05, "loss": 1.1638, "step": 12560 }, { "epoch": 7.693955623565417, "grad_norm": 3.204761028289795, "learning_rate": 1.2791726202626386e-05, "loss": 1.1702, "step": 12570 }, { "epoch": 7.7000765110941085, "grad_norm": 2.758955717086792, "learning_rate": 1.2757705654215147e-05, "loss": 1.1335, "step": 12580 }, { "epoch": 7.7061973986228, "grad_norm": 2.848846197128296, "learning_rate": 1.2723685105803906e-05, "loss": 1.1589, "step": 12590 }, { "epoch": 7.712318286151492, "grad_norm": 4.4077630043029785, "learning_rate": 1.2689664557392667e-05, "loss": 1.128, "step": 12600 }, { "epoch": 7.718439173680183, "grad_norm": 2.898097038269043, "learning_rate": 1.2655644008981426e-05, "loss": 1.1513, "step": 12610 }, { "epoch": 7.724560061208876, "grad_norm": 3.664088249206543, "learning_rate": 1.2621623460570183e-05, "loss": 1.1615, "step": 12620 }, { "epoch": 7.730680948737567, "grad_norm": 2.5493152141571045, "learning_rate": 1.2587602912158944e-05, "loss": 1.1498, "step": 12630 }, { "epoch": 7.736801836266259, "grad_norm": 3.5458126068115234, "learning_rate": 1.2553582363747705e-05, "loss": 1.1375, "step": 12640 }, { "epoch": 7.74292272379495, "grad_norm": 2.8673930168151855, "learning_rate": 1.2519561815336464e-05, "loss": 1.1696, "step": 12650 }, { "epoch": 7.749043611323642, "grad_norm": 2.6049365997314453, "learning_rate": 1.2485541266925223e-05, "loss": 1.1786, "step": 12660 }, { "epoch": 7.755164498852333, "grad_norm": 3.1647322177886963, "learning_rate": 1.2451520718513982e-05, "loss": 1.1593, "step": 12670 }, { "epoch": 7.761285386381025, "grad_norm": 4.2167649269104, "learning_rate": 1.2417500170102743e-05, "loss": 1.1604, "step": 12680 }, { "epoch": 7.767406273909717, "grad_norm": 3.039738655090332, "learning_rate": 1.2383479621691502e-05, "loss": 1.162, "step": 12690 }, { "epoch": 7.773527161438409, "grad_norm": 4.248793125152588, "learning_rate": 1.2349459073280261e-05, "loss": 1.1537, "step": 12700 }, { "epoch": 7.7796480489671005, "grad_norm": 2.4704842567443848, "learning_rate": 1.2315438524869022e-05, "loss": 1.1563, "step": 12710 }, { "epoch": 7.785768936495792, "grad_norm": 3.5454394817352295, "learning_rate": 1.2281417976457781e-05, "loss": 1.1546, "step": 12720 }, { "epoch": 7.791889824024484, "grad_norm": 2.821631669998169, "learning_rate": 1.224739742804654e-05, "loss": 1.1563, "step": 12730 }, { "epoch": 7.798010711553175, "grad_norm": 2.692189931869507, "learning_rate": 1.2213376879635301e-05, "loss": 1.1658, "step": 12740 }, { "epoch": 7.804131599081867, "grad_norm": 2.9162681102752686, "learning_rate": 1.2179356331224059e-05, "loss": 1.176, "step": 12750 }, { "epoch": 7.810252486610558, "grad_norm": 3.638213872909546, "learning_rate": 1.214533578281282e-05, "loss": 1.165, "step": 12760 }, { "epoch": 7.81637337413925, "grad_norm": 3.419288158416748, "learning_rate": 1.2111315234401579e-05, "loss": 1.1561, "step": 12770 }, { "epoch": 7.822494261667941, "grad_norm": 2.933861255645752, "learning_rate": 1.2077294685990338e-05, "loss": 1.1663, "step": 12780 }, { "epoch": 7.828615149196634, "grad_norm": 2.9365577697753906, "learning_rate": 1.2043274137579099e-05, "loss": 1.1514, "step": 12790 }, { "epoch": 7.834736036725325, "grad_norm": 2.736703634262085, "learning_rate": 1.2009253589167858e-05, "loss": 1.1579, "step": 12800 }, { "epoch": 7.840856924254017, "grad_norm": 2.8209228515625, "learning_rate": 1.1975233040756617e-05, "loss": 1.164, "step": 12810 }, { "epoch": 7.8469778117827085, "grad_norm": 4.310190200805664, "learning_rate": 1.1941212492345378e-05, "loss": 1.1767, "step": 12820 }, { "epoch": 7.8530986993114, "grad_norm": 2.7291927337646484, "learning_rate": 1.1907191943934137e-05, "loss": 1.1503, "step": 12830 }, { "epoch": 7.859219586840092, "grad_norm": 2.8844997882843018, "learning_rate": 1.1873171395522896e-05, "loss": 1.1516, "step": 12840 }, { "epoch": 7.865340474368783, "grad_norm": 3.1168551445007324, "learning_rate": 1.1839150847111655e-05, "loss": 1.1593, "step": 12850 }, { "epoch": 7.871461361897476, "grad_norm": 2.552144765853882, "learning_rate": 1.1805130298700416e-05, "loss": 1.133, "step": 12860 }, { "epoch": 7.877582249426167, "grad_norm": 3.00880765914917, "learning_rate": 1.1771109750289175e-05, "loss": 1.1585, "step": 12870 }, { "epoch": 7.883703136954859, "grad_norm": 3.129321813583374, "learning_rate": 1.1737089201877934e-05, "loss": 1.1643, "step": 12880 }, { "epoch": 7.88982402448355, "grad_norm": 2.6133196353912354, "learning_rate": 1.1703068653466695e-05, "loss": 1.1933, "step": 12890 }, { "epoch": 7.895944912012242, "grad_norm": 3.4052469730377197, "learning_rate": 1.1669048105055454e-05, "loss": 1.1478, "step": 12900 }, { "epoch": 7.902065799540933, "grad_norm": 3.1864941120147705, "learning_rate": 1.1635027556644213e-05, "loss": 1.1558, "step": 12910 }, { "epoch": 7.908186687069625, "grad_norm": 2.8696846961975098, "learning_rate": 1.1601007008232974e-05, "loss": 1.1498, "step": 12920 }, { "epoch": 7.9143075745983165, "grad_norm": 3.1243202686309814, "learning_rate": 1.1566986459821732e-05, "loss": 1.1532, "step": 12930 }, { "epoch": 7.920428462127008, "grad_norm": 3.030133008956909, "learning_rate": 1.1532965911410493e-05, "loss": 1.168, "step": 12940 }, { "epoch": 7.9265493496557005, "grad_norm": 5.5496602058410645, "learning_rate": 1.1498945362999253e-05, "loss": 1.1728, "step": 12950 }, { "epoch": 7.932670237184392, "grad_norm": 2.774750232696533, "learning_rate": 1.146492481458801e-05, "loss": 1.1698, "step": 12960 }, { "epoch": 7.938791124713084, "grad_norm": 2.4071033000946045, "learning_rate": 1.1430904266176772e-05, "loss": 1.1621, "step": 12970 }, { "epoch": 7.944912012241775, "grad_norm": 3.9501190185546875, "learning_rate": 1.139688371776553e-05, "loss": 1.1595, "step": 12980 }, { "epoch": 7.951032899770467, "grad_norm": 2.919703483581543, "learning_rate": 1.136286316935429e-05, "loss": 1.1575, "step": 12990 }, { "epoch": 7.957153787299158, "grad_norm": 2.7440402507781982, "learning_rate": 1.132884262094305e-05, "loss": 1.1659, "step": 13000 }, { "epoch": 7.96327467482785, "grad_norm": 2.6349709033966064, "learning_rate": 1.129482207253181e-05, "loss": 1.148, "step": 13010 }, { "epoch": 7.969395562356541, "grad_norm": 3.7944247722625732, "learning_rate": 1.1260801524120569e-05, "loss": 1.1524, "step": 13020 }, { "epoch": 7.975516449885234, "grad_norm": 3.455249071121216, "learning_rate": 1.122678097570933e-05, "loss": 1.1796, "step": 13030 }, { "epoch": 7.981637337413925, "grad_norm": 3.802947759628296, "learning_rate": 1.1192760427298087e-05, "loss": 1.163, "step": 13040 }, { "epoch": 7.987758224942617, "grad_norm": 3.643726348876953, "learning_rate": 1.1158739878886848e-05, "loss": 1.1361, "step": 13050 }, { "epoch": 7.9938791124713084, "grad_norm": 3.0998475551605225, "learning_rate": 1.1124719330475607e-05, "loss": 1.1557, "step": 13060 }, { "epoch": 8.0, "grad_norm": 4.292564392089844, "learning_rate": 1.1090698782064366e-05, "loss": 1.1554, "step": 13070 }, { "epoch": 8.0, "eval_accuracy": 0.5080250219994643, "eval_loss": 1.168102741241455, "eval_runtime": 2685.348, "eval_samples_per_second": 77.866, "eval_steps_per_second": 0.608, "step": 13070 }, { "epoch": 8.006120887528692, "grad_norm": 2.589630603790283, "learning_rate": 1.1056678233653127e-05, "loss": 1.124, "step": 13080 }, { "epoch": 8.012241775057383, "grad_norm": 2.709681749343872, "learning_rate": 1.1022657685241886e-05, "loss": 1.1433, "step": 13090 }, { "epoch": 8.018362662586075, "grad_norm": 3.8132050037384033, "learning_rate": 1.0988637136830646e-05, "loss": 1.1654, "step": 13100 }, { "epoch": 8.024483550114766, "grad_norm": 3.2627689838409424, "learning_rate": 1.0954616588419406e-05, "loss": 1.1345, "step": 13110 }, { "epoch": 8.030604437643458, "grad_norm": 3.628002405166626, "learning_rate": 1.0920596040008166e-05, "loss": 1.1702, "step": 13120 }, { "epoch": 8.03672532517215, "grad_norm": 2.8849120140075684, "learning_rate": 1.0886575491596925e-05, "loss": 1.1406, "step": 13130 }, { "epoch": 8.04284621270084, "grad_norm": 3.8340485095977783, "learning_rate": 1.0852554943185684e-05, "loss": 1.1339, "step": 13140 }, { "epoch": 8.048967100229532, "grad_norm": 3.1332828998565674, "learning_rate": 1.0818534394774445e-05, "loss": 1.1345, "step": 13150 }, { "epoch": 8.055087987758226, "grad_norm": 3.195749044418335, "learning_rate": 1.0784513846363204e-05, "loss": 1.1504, "step": 13160 }, { "epoch": 8.061208875286917, "grad_norm": 3.9875683784484863, "learning_rate": 1.0750493297951963e-05, "loss": 1.1356, "step": 13170 }, { "epoch": 8.067329762815609, "grad_norm": 4.1312384605407715, "learning_rate": 1.0716472749540724e-05, "loss": 1.1529, "step": 13180 }, { "epoch": 8.0734506503443, "grad_norm": 4.20302152633667, "learning_rate": 1.0682452201129483e-05, "loss": 1.1317, "step": 13190 }, { "epoch": 8.079571537872992, "grad_norm": 3.356748580932617, "learning_rate": 1.0648431652718242e-05, "loss": 1.1307, "step": 13200 }, { "epoch": 8.085692425401684, "grad_norm": 2.7438583374023438, "learning_rate": 1.0614411104307003e-05, "loss": 1.1565, "step": 13210 }, { "epoch": 8.091813312930375, "grad_norm": 2.8637239933013916, "learning_rate": 1.058039055589576e-05, "loss": 1.1327, "step": 13220 }, { "epoch": 8.097934200459067, "grad_norm": 2.9302523136138916, "learning_rate": 1.0546370007484521e-05, "loss": 1.1527, "step": 13230 }, { "epoch": 8.104055087987758, "grad_norm": 3.599940538406372, "learning_rate": 1.0512349459073282e-05, "loss": 1.1311, "step": 13240 }, { "epoch": 8.11017597551645, "grad_norm": 3.4082233905792236, "learning_rate": 1.047832891066204e-05, "loss": 1.1282, "step": 13250 }, { "epoch": 8.116296863045141, "grad_norm": 3.8687469959259033, "learning_rate": 1.04443083622508e-05, "loss": 1.141, "step": 13260 }, { "epoch": 8.122417750573833, "grad_norm": 3.1596381664276123, "learning_rate": 1.041028781383956e-05, "loss": 1.1433, "step": 13270 }, { "epoch": 8.128538638102524, "grad_norm": 3.489830255508423, "learning_rate": 1.0376267265428319e-05, "loss": 1.1456, "step": 13280 }, { "epoch": 8.134659525631216, "grad_norm": 3.366809844970703, "learning_rate": 1.034224671701708e-05, "loss": 1.1499, "step": 13290 }, { "epoch": 8.140780413159908, "grad_norm": 3.154359817504883, "learning_rate": 1.0308226168605839e-05, "loss": 1.1721, "step": 13300 }, { "epoch": 8.146901300688599, "grad_norm": 3.2491090297698975, "learning_rate": 1.0274205620194598e-05, "loss": 1.1306, "step": 13310 }, { "epoch": 8.15302218821729, "grad_norm": 3.3683319091796875, "learning_rate": 1.0240185071783358e-05, "loss": 1.1425, "step": 13320 }, { "epoch": 8.159143075745984, "grad_norm": 3.144820213317871, "learning_rate": 1.0206164523372118e-05, "loss": 1.1222, "step": 13330 }, { "epoch": 8.165263963274676, "grad_norm": 3.5433125495910645, "learning_rate": 1.0172143974960877e-05, "loss": 1.1324, "step": 13340 }, { "epoch": 8.171384850803367, "grad_norm": 3.311105728149414, "learning_rate": 1.0138123426549636e-05, "loss": 1.1394, "step": 13350 }, { "epoch": 8.177505738332059, "grad_norm": 2.810060739517212, "learning_rate": 1.0104102878138397e-05, "loss": 1.1454, "step": 13360 }, { "epoch": 8.18362662586075, "grad_norm": 3.0078938007354736, "learning_rate": 1.0070082329727156e-05, "loss": 1.1376, "step": 13370 }, { "epoch": 8.189747513389442, "grad_norm": 2.672898769378662, "learning_rate": 1.0036061781315915e-05, "loss": 1.1611, "step": 13380 }, { "epoch": 8.195868400918133, "grad_norm": 2.5500643253326416, "learning_rate": 1.0002041232904676e-05, "loss": 1.1525, "step": 13390 }, { "epoch": 8.201989288446825, "grad_norm": 3.041093587875366, "learning_rate": 9.968020684493435e-06, "loss": 1.1366, "step": 13400 }, { "epoch": 8.208110175975516, "grad_norm": 3.1694231033325195, "learning_rate": 9.934000136082194e-06, "loss": 1.1638, "step": 13410 }, { "epoch": 8.214231063504208, "grad_norm": 3.110459089279175, "learning_rate": 9.899979587670953e-06, "loss": 1.1397, "step": 13420 }, { "epoch": 8.2203519510329, "grad_norm": 3.1803088188171387, "learning_rate": 9.865959039259712e-06, "loss": 1.1641, "step": 13430 }, { "epoch": 8.226472838561591, "grad_norm": 4.122441291809082, "learning_rate": 9.831938490848473e-06, "loss": 1.1562, "step": 13440 }, { "epoch": 8.232593726090283, "grad_norm": 2.7772445678710938, "learning_rate": 9.797917942437232e-06, "loss": 1.154, "step": 13450 }, { "epoch": 8.238714613618974, "grad_norm": 2.789196014404297, "learning_rate": 9.763897394025992e-06, "loss": 1.1483, "step": 13460 }, { "epoch": 8.244835501147666, "grad_norm": 3.0457327365875244, "learning_rate": 9.729876845614752e-06, "loss": 1.1212, "step": 13470 }, { "epoch": 8.250956388676357, "grad_norm": 3.0723581314086914, "learning_rate": 9.695856297203511e-06, "loss": 1.1467, "step": 13480 }, { "epoch": 8.257077276205049, "grad_norm": 3.5150368213653564, "learning_rate": 9.66183574879227e-06, "loss": 1.1353, "step": 13490 }, { "epoch": 8.263198163733742, "grad_norm": 3.7156941890716553, "learning_rate": 9.627815200381031e-06, "loss": 1.1536, "step": 13500 }, { "epoch": 8.269319051262434, "grad_norm": 3.0745482444763184, "learning_rate": 9.593794651969789e-06, "loss": 1.1182, "step": 13510 }, { "epoch": 8.275439938791125, "grad_norm": 3.3888468742370605, "learning_rate": 9.55977410355855e-06, "loss": 1.1684, "step": 13520 }, { "epoch": 8.281560826319817, "grad_norm": 3.1190714836120605, "learning_rate": 9.52575355514731e-06, "loss": 1.1388, "step": 13530 }, { "epoch": 8.287681713848508, "grad_norm": 3.361152172088623, "learning_rate": 9.491733006736068e-06, "loss": 1.166, "step": 13540 }, { "epoch": 8.2938026013772, "grad_norm": 3.003202199935913, "learning_rate": 9.457712458324829e-06, "loss": 1.1659, "step": 13550 }, { "epoch": 8.299923488905891, "grad_norm": 3.121140718460083, "learning_rate": 9.423691909913588e-06, "loss": 1.131, "step": 13560 }, { "epoch": 8.306044376434583, "grad_norm": 4.083986282348633, "learning_rate": 9.389671361502347e-06, "loss": 1.1433, "step": 13570 }, { "epoch": 8.312165263963275, "grad_norm": 2.6656112670898438, "learning_rate": 9.355650813091108e-06, "loss": 1.1422, "step": 13580 }, { "epoch": 8.318286151491966, "grad_norm": 3.0381946563720703, "learning_rate": 9.321630264679867e-06, "loss": 1.1543, "step": 13590 }, { "epoch": 8.324407039020658, "grad_norm": 2.631458282470703, "learning_rate": 9.287609716268626e-06, "loss": 1.1442, "step": 13600 }, { "epoch": 8.33052792654935, "grad_norm": 3.234130620956421, "learning_rate": 9.253589167857387e-06, "loss": 1.1463, "step": 13610 }, { "epoch": 8.33664881407804, "grad_norm": 2.735459566116333, "learning_rate": 9.219568619446146e-06, "loss": 1.1337, "step": 13620 }, { "epoch": 8.342769701606732, "grad_norm": 3.386232852935791, "learning_rate": 9.185548071034905e-06, "loss": 1.1434, "step": 13630 }, { "epoch": 8.348890589135424, "grad_norm": 2.588688611984253, "learning_rate": 9.151527522623664e-06, "loss": 1.1449, "step": 13640 }, { "epoch": 8.355011476664115, "grad_norm": 2.9801838397979736, "learning_rate": 9.117506974212425e-06, "loss": 1.1649, "step": 13650 }, { "epoch": 8.361132364192809, "grad_norm": 3.2618236541748047, "learning_rate": 9.083486425801184e-06, "loss": 1.1522, "step": 13660 }, { "epoch": 8.3672532517215, "grad_norm": 3.2830750942230225, "learning_rate": 9.049465877389944e-06, "loss": 1.1472, "step": 13670 }, { "epoch": 8.373374139250192, "grad_norm": 3.4445407390594482, "learning_rate": 9.015445328978704e-06, "loss": 1.1538, "step": 13680 }, { "epoch": 8.379495026778883, "grad_norm": 3.6787800788879395, "learning_rate": 8.981424780567464e-06, "loss": 1.1503, "step": 13690 }, { "epoch": 8.385615914307575, "grad_norm": 3.1863129138946533, "learning_rate": 8.947404232156223e-06, "loss": 1.1275, "step": 13700 }, { "epoch": 8.391736801836267, "grad_norm": 2.996192216873169, "learning_rate": 8.913383683744984e-06, "loss": 1.1438, "step": 13710 }, { "epoch": 8.397857689364958, "grad_norm": 2.6561343669891357, "learning_rate": 8.879363135333741e-06, "loss": 1.1432, "step": 13720 }, { "epoch": 8.40397857689365, "grad_norm": 2.806730031967163, "learning_rate": 8.845342586922502e-06, "loss": 1.1325, "step": 13730 }, { "epoch": 8.410099464422341, "grad_norm": 2.868938446044922, "learning_rate": 8.811322038511263e-06, "loss": 1.1437, "step": 13740 }, { "epoch": 8.416220351951033, "grad_norm": 2.915384292602539, "learning_rate": 8.77730149010002e-06, "loss": 1.1502, "step": 13750 }, { "epoch": 8.422341239479724, "grad_norm": 3.5552773475646973, "learning_rate": 8.743280941688781e-06, "loss": 1.1418, "step": 13760 }, { "epoch": 8.428462127008416, "grad_norm": 3.456284999847412, "learning_rate": 8.70926039327754e-06, "loss": 1.1539, "step": 13770 }, { "epoch": 8.434583014537107, "grad_norm": 3.1513900756835938, "learning_rate": 8.6752398448663e-06, "loss": 1.1522, "step": 13780 }, { "epoch": 8.440703902065799, "grad_norm": 3.183990240097046, "learning_rate": 8.64121929645506e-06, "loss": 1.1419, "step": 13790 }, { "epoch": 8.44682478959449, "grad_norm": 3.3508360385894775, "learning_rate": 8.607198748043817e-06, "loss": 1.151, "step": 13800 }, { "epoch": 8.452945677123182, "grad_norm": 3.314988374710083, "learning_rate": 8.573178199632578e-06, "loss": 1.1358, "step": 13810 }, { "epoch": 8.459066564651874, "grad_norm": 3.3967995643615723, "learning_rate": 8.53915765122134e-06, "loss": 1.147, "step": 13820 }, { "epoch": 8.465187452180567, "grad_norm": 3.0596559047698975, "learning_rate": 8.505137102810097e-06, "loss": 1.1446, "step": 13830 }, { "epoch": 8.471308339709259, "grad_norm": 2.67799973487854, "learning_rate": 8.471116554398857e-06, "loss": 1.1332, "step": 13840 }, { "epoch": 8.47742922723795, "grad_norm": 3.3761391639709473, "learning_rate": 8.437096005987617e-06, "loss": 1.1318, "step": 13850 }, { "epoch": 8.483550114766642, "grad_norm": 7.103214263916016, "learning_rate": 8.403075457576376e-06, "loss": 1.1476, "step": 13860 }, { "epoch": 8.489671002295333, "grad_norm": 3.1061320304870605, "learning_rate": 8.369054909165137e-06, "loss": 1.1586, "step": 13870 }, { "epoch": 8.495791889824025, "grad_norm": 3.3592042922973633, "learning_rate": 8.335034360753896e-06, "loss": 1.1442, "step": 13880 }, { "epoch": 8.501912777352716, "grad_norm": 5.015500545501709, "learning_rate": 8.301013812342655e-06, "loss": 1.1537, "step": 13890 }, { "epoch": 8.508033664881408, "grad_norm": 3.5375547409057617, "learning_rate": 8.266993263931416e-06, "loss": 1.1601, "step": 13900 }, { "epoch": 8.5141545524101, "grad_norm": 3.1028242111206055, "learning_rate": 8.232972715520175e-06, "loss": 1.1347, "step": 13910 }, { "epoch": 8.520275439938791, "grad_norm": 2.954200506210327, "learning_rate": 8.198952167108934e-06, "loss": 1.1233, "step": 13920 }, { "epoch": 8.526396327467483, "grad_norm": 4.131378650665283, "learning_rate": 8.164931618697693e-06, "loss": 1.1252, "step": 13930 }, { "epoch": 8.532517214996174, "grad_norm": 3.189302921295166, "learning_rate": 8.130911070286454e-06, "loss": 1.143, "step": 13940 }, { "epoch": 8.538638102524866, "grad_norm": 4.135822772979736, "learning_rate": 8.096890521875213e-06, "loss": 1.1489, "step": 13950 }, { "epoch": 8.544758990053557, "grad_norm": 3.0186097621917725, "learning_rate": 8.062869973463972e-06, "loss": 1.1458, "step": 13960 }, { "epoch": 8.550879877582249, "grad_norm": 3.5816197395324707, "learning_rate": 8.028849425052733e-06, "loss": 1.1348, "step": 13970 }, { "epoch": 8.55700076511094, "grad_norm": 3.028970956802368, "learning_rate": 7.994828876641492e-06, "loss": 1.1498, "step": 13980 }, { "epoch": 8.563121652639634, "grad_norm": 2.8785955905914307, "learning_rate": 7.960808328230251e-06, "loss": 1.1504, "step": 13990 }, { "epoch": 8.569242540168325, "grad_norm": 3.270470380783081, "learning_rate": 7.926787779819012e-06, "loss": 1.1361, "step": 14000 }, { "epoch": 8.575363427697017, "grad_norm": 2.8553550243377686, "learning_rate": 7.89276723140777e-06, "loss": 1.1572, "step": 14010 }, { "epoch": 8.581484315225708, "grad_norm": 3.0945401191711426, "learning_rate": 7.85874668299653e-06, "loss": 1.1139, "step": 14020 }, { "epoch": 8.5876052027544, "grad_norm": 3.0976555347442627, "learning_rate": 7.824726134585291e-06, "loss": 1.1414, "step": 14030 }, { "epoch": 8.593726090283091, "grad_norm": 3.190762758255005, "learning_rate": 7.790705586174049e-06, "loss": 1.1243, "step": 14040 }, { "epoch": 8.599846977811783, "grad_norm": 3.8209619522094727, "learning_rate": 7.75668503776281e-06, "loss": 1.1383, "step": 14050 }, { "epoch": 8.605967865340475, "grad_norm": 3.3825807571411133, "learning_rate": 7.722664489351569e-06, "loss": 1.1393, "step": 14060 }, { "epoch": 8.612088752869166, "grad_norm": 2.7561495304107666, "learning_rate": 7.688643940940328e-06, "loss": 1.1503, "step": 14070 }, { "epoch": 8.618209640397858, "grad_norm": 2.9515817165374756, "learning_rate": 7.654623392529089e-06, "loss": 1.1396, "step": 14080 }, { "epoch": 8.62433052792655, "grad_norm": 2.807973861694336, "learning_rate": 7.620602844117848e-06, "loss": 1.1299, "step": 14090 }, { "epoch": 8.63045141545524, "grad_norm": 2.8332042694091797, "learning_rate": 7.586582295706607e-06, "loss": 1.1255, "step": 14100 }, { "epoch": 8.636572302983932, "grad_norm": 3.248446226119995, "learning_rate": 7.552561747295367e-06, "loss": 1.1442, "step": 14110 }, { "epoch": 8.642693190512624, "grad_norm": 3.1265931129455566, "learning_rate": 7.518541198884127e-06, "loss": 1.1516, "step": 14120 }, { "epoch": 8.648814078041315, "grad_norm": 2.808518886566162, "learning_rate": 7.484520650472886e-06, "loss": 1.1333, "step": 14130 }, { "epoch": 8.654934965570007, "grad_norm": 4.110435962677002, "learning_rate": 7.450500102061646e-06, "loss": 1.1439, "step": 14140 }, { "epoch": 8.661055853098699, "grad_norm": 3.197977304458618, "learning_rate": 7.416479553650406e-06, "loss": 1.1325, "step": 14150 }, { "epoch": 8.66717674062739, "grad_norm": 3.7765300273895264, "learning_rate": 7.382459005239164e-06, "loss": 1.158, "step": 14160 }, { "epoch": 8.673297628156083, "grad_norm": 3.2383854389190674, "learning_rate": 7.348438456827924e-06, "loss": 1.1646, "step": 14170 }, { "epoch": 8.679418515684775, "grad_norm": 2.7088334560394287, "learning_rate": 7.314417908416683e-06, "loss": 1.1301, "step": 14180 }, { "epoch": 8.685539403213467, "grad_norm": 4.367984771728516, "learning_rate": 7.280397360005443e-06, "loss": 1.1286, "step": 14190 }, { "epoch": 8.691660290742158, "grad_norm": 3.3485443592071533, "learning_rate": 7.246376811594203e-06, "loss": 1.153, "step": 14200 }, { "epoch": 8.69778117827085, "grad_norm": 4.202768802642822, "learning_rate": 7.2123562631829625e-06, "loss": 1.1468, "step": 14210 }, { "epoch": 8.703902065799541, "grad_norm": 3.7260403633117676, "learning_rate": 7.1783357147717225e-06, "loss": 1.1485, "step": 14220 }, { "epoch": 8.710022953328233, "grad_norm": 3.604518413543701, "learning_rate": 7.1443151663604825e-06, "loss": 1.1551, "step": 14230 }, { "epoch": 8.716143840856924, "grad_norm": 2.95175838470459, "learning_rate": 7.110294617949241e-06, "loss": 1.1454, "step": 14240 }, { "epoch": 8.722264728385616, "grad_norm": 3.4083735942840576, "learning_rate": 7.076274069538001e-06, "loss": 1.1589, "step": 14250 }, { "epoch": 8.728385615914307, "grad_norm": 3.287588357925415, "learning_rate": 7.042253521126762e-06, "loss": 1.1326, "step": 14260 }, { "epoch": 8.734506503442999, "grad_norm": 2.924208164215088, "learning_rate": 7.00823297271552e-06, "loss": 1.1171, "step": 14270 }, { "epoch": 8.74062739097169, "grad_norm": 2.9708151817321777, "learning_rate": 6.97421242430428e-06, "loss": 1.1443, "step": 14280 }, { "epoch": 8.746748278500382, "grad_norm": 3.3981306552886963, "learning_rate": 6.94019187589304e-06, "loss": 1.1377, "step": 14290 }, { "epoch": 8.752869166029074, "grad_norm": 4.204988956451416, "learning_rate": 6.906171327481799e-06, "loss": 1.1445, "step": 14300 }, { "epoch": 8.758990053557765, "grad_norm": 3.8043899536132812, "learning_rate": 6.872150779070559e-06, "loss": 1.1468, "step": 14310 }, { "epoch": 8.765110941086457, "grad_norm": 3.0697388648986816, "learning_rate": 6.838130230659319e-06, "loss": 1.1339, "step": 14320 }, { "epoch": 8.77123182861515, "grad_norm": 3.4975595474243164, "learning_rate": 6.804109682248077e-06, "loss": 1.1465, "step": 14330 }, { "epoch": 8.777352716143842, "grad_norm": 3.3167073726654053, "learning_rate": 6.770089133836838e-06, "loss": 1.1393, "step": 14340 }, { "epoch": 8.783473603672533, "grad_norm": 3.330259323120117, "learning_rate": 6.736068585425598e-06, "loss": 1.1289, "step": 14350 }, { "epoch": 8.789594491201225, "grad_norm": 3.079725503921509, "learning_rate": 6.702048037014356e-06, "loss": 1.1521, "step": 14360 }, { "epoch": 8.795715378729916, "grad_norm": 3.1370887756347656, "learning_rate": 6.668027488603116e-06, "loss": 1.1732, "step": 14370 }, { "epoch": 8.801836266258608, "grad_norm": 2.693749189376831, "learning_rate": 6.634006940191876e-06, "loss": 1.147, "step": 14380 }, { "epoch": 8.8079571537873, "grad_norm": 2.9202158451080322, "learning_rate": 6.5999863917806355e-06, "loss": 1.1401, "step": 14390 }, { "epoch": 8.814078041315991, "grad_norm": 2.882169246673584, "learning_rate": 6.5659658433693955e-06, "loss": 1.1602, "step": 14400 }, { "epoch": 8.820198928844682, "grad_norm": 3.2943568229675293, "learning_rate": 6.5319452949581555e-06, "loss": 1.1259, "step": 14410 }, { "epoch": 8.826319816373374, "grad_norm": 3.4829976558685303, "learning_rate": 6.497924746546915e-06, "loss": 1.1503, "step": 14420 }, { "epoch": 8.832440703902066, "grad_norm": 3.278031826019287, "learning_rate": 6.463904198135675e-06, "loss": 1.1534, "step": 14430 }, { "epoch": 8.838561591430757, "grad_norm": 3.270439386367798, "learning_rate": 6.429883649724435e-06, "loss": 1.1414, "step": 14440 }, { "epoch": 8.844682478959449, "grad_norm": 3.0096373558044434, "learning_rate": 6.395863101313193e-06, "loss": 1.1311, "step": 14450 }, { "epoch": 8.85080336648814, "grad_norm": 2.464264392852783, "learning_rate": 6.361842552901953e-06, "loss": 1.1442, "step": 14460 }, { "epoch": 8.856924254016832, "grad_norm": 3.1836256980895996, "learning_rate": 6.327822004490713e-06, "loss": 1.1667, "step": 14470 }, { "epoch": 8.863045141545523, "grad_norm": 2.9464852809906006, "learning_rate": 6.293801456079472e-06, "loss": 1.1368, "step": 14480 }, { "epoch": 8.869166029074215, "grad_norm": 3.198124408721924, "learning_rate": 6.259780907668232e-06, "loss": 1.1569, "step": 14490 }, { "epoch": 8.875286916602908, "grad_norm": 3.599224328994751, "learning_rate": 6.225760359256991e-06, "loss": 1.1241, "step": 14500 }, { "epoch": 8.8814078041316, "grad_norm": 2.898756742477417, "learning_rate": 6.191739810845751e-06, "loss": 1.144, "step": 14510 }, { "epoch": 8.887528691660291, "grad_norm": 2.7918193340301514, "learning_rate": 6.157719262434511e-06, "loss": 1.1261, "step": 14520 }, { "epoch": 8.893649579188983, "grad_norm": 3.0213284492492676, "learning_rate": 6.12369871402327e-06, "loss": 1.1325, "step": 14530 }, { "epoch": 8.899770466717674, "grad_norm": 3.097947359085083, "learning_rate": 6.089678165612029e-06, "loss": 1.1257, "step": 14540 }, { "epoch": 8.905891354246366, "grad_norm": 3.194293975830078, "learning_rate": 6.055657617200789e-06, "loss": 1.1345, "step": 14550 }, { "epoch": 8.912012241775058, "grad_norm": 3.5505549907684326, "learning_rate": 6.021637068789549e-06, "loss": 1.1222, "step": 14560 }, { "epoch": 8.91813312930375, "grad_norm": 3.582949638366699, "learning_rate": 5.9876165203783085e-06, "loss": 1.1516, "step": 14570 }, { "epoch": 8.92425401683244, "grad_norm": 3.011648178100586, "learning_rate": 5.9535959719670685e-06, "loss": 1.1439, "step": 14580 }, { "epoch": 8.930374904361132, "grad_norm": 3.2848992347717285, "learning_rate": 5.919575423555828e-06, "loss": 1.1335, "step": 14590 }, { "epoch": 8.936495791889824, "grad_norm": 3.178999423980713, "learning_rate": 5.885554875144588e-06, "loss": 1.1529, "step": 14600 }, { "epoch": 8.942616679418515, "grad_norm": 7.24118709564209, "learning_rate": 5.851534326733348e-06, "loss": 1.1363, "step": 14610 }, { "epoch": 8.948737566947207, "grad_norm": 2.974168062210083, "learning_rate": 5.817513778322107e-06, "loss": 1.1155, "step": 14620 }, { "epoch": 8.954858454475898, "grad_norm": 3.646303176879883, "learning_rate": 5.783493229910866e-06, "loss": 1.1383, "step": 14630 }, { "epoch": 8.96097934200459, "grad_norm": 3.389556407928467, "learning_rate": 5.749472681499627e-06, "loss": 1.1449, "step": 14640 }, { "epoch": 8.967100229533282, "grad_norm": 3.2630605697631836, "learning_rate": 5.715452133088386e-06, "loss": 1.1312, "step": 14650 }, { "epoch": 8.973221117061975, "grad_norm": 3.2353131771087646, "learning_rate": 5.681431584677145e-06, "loss": 1.1315, "step": 14660 }, { "epoch": 8.979342004590666, "grad_norm": 3.8882558345794678, "learning_rate": 5.647411036265905e-06, "loss": 1.1425, "step": 14670 }, { "epoch": 8.985462892119358, "grad_norm": 2.721869945526123, "learning_rate": 5.613390487854665e-06, "loss": 1.1308, "step": 14680 }, { "epoch": 8.99158377964805, "grad_norm": 3.2869043350219727, "learning_rate": 5.579369939443424e-06, "loss": 1.1277, "step": 14690 }, { "epoch": 8.997704667176741, "grad_norm": 4.439366817474365, "learning_rate": 5.545349391032183e-06, "loss": 1.1279, "step": 14700 }, { "epoch": 8.999540933435348, "eval_accuracy": 0.5099954088074378, "eval_loss": 1.168525218963623, "eval_runtime": 2629.8, "eval_samples_per_second": 79.51, "eval_steps_per_second": 0.621, "step": 14703 }, { "epoch": 9.003825554705433, "grad_norm": 3.387463092803955, "learning_rate": 5.511328842620943e-06, "loss": 1.1331, "step": 14710 }, { "epoch": 9.009946442234124, "grad_norm": 3.4445242881774902, "learning_rate": 5.477308294209703e-06, "loss": 1.1233, "step": 14720 }, { "epoch": 9.016067329762816, "grad_norm": 3.013448476791382, "learning_rate": 5.443287745798462e-06, "loss": 1.1391, "step": 14730 }, { "epoch": 9.022188217291507, "grad_norm": 2.931241512298584, "learning_rate": 5.409267197387222e-06, "loss": 1.1187, "step": 14740 }, { "epoch": 9.028309104820199, "grad_norm": 3.7724857330322266, "learning_rate": 5.3752466489759815e-06, "loss": 1.1375, "step": 14750 }, { "epoch": 9.03442999234889, "grad_norm": 3.2617881298065186, "learning_rate": 5.3412261005647414e-06, "loss": 1.1239, "step": 14760 }, { "epoch": 9.040550879877582, "grad_norm": 3.163550853729248, "learning_rate": 5.3072055521535014e-06, "loss": 1.1333, "step": 14770 }, { "epoch": 9.046671767406274, "grad_norm": 3.271584987640381, "learning_rate": 5.2731850037422606e-06, "loss": 1.1241, "step": 14780 }, { "epoch": 9.052792654934965, "grad_norm": 2.9983322620391846, "learning_rate": 5.23916445533102e-06, "loss": 1.1245, "step": 14790 }, { "epoch": 9.058913542463657, "grad_norm": 3.0628502368927, "learning_rate": 5.20514390691978e-06, "loss": 1.1411, "step": 14800 }, { "epoch": 9.065034429992348, "grad_norm": 2.9316301345825195, "learning_rate": 5.17112335850854e-06, "loss": 1.1326, "step": 14810 }, { "epoch": 9.07115531752104, "grad_norm": 2.7524290084838867, "learning_rate": 5.137102810097299e-06, "loss": 1.1024, "step": 14820 }, { "epoch": 9.077276205049731, "grad_norm": 3.432013511657715, "learning_rate": 5.103082261686059e-06, "loss": 1.1071, "step": 14830 }, { "epoch": 9.083397092578425, "grad_norm": 3.5625956058502197, "learning_rate": 5.069061713274818e-06, "loss": 1.098, "step": 14840 }, { "epoch": 9.089517980107116, "grad_norm": 3.2693939208984375, "learning_rate": 5.035041164863578e-06, "loss": 1.1313, "step": 14850 }, { "epoch": 9.095638867635808, "grad_norm": 4.684074878692627, "learning_rate": 5.001020616452338e-06, "loss": 1.1344, "step": 14860 }, { "epoch": 9.1017597551645, "grad_norm": 3.109464168548584, "learning_rate": 4.967000068041097e-06, "loss": 1.1211, "step": 14870 }, { "epoch": 9.10788064269319, "grad_norm": 3.3348963260650635, "learning_rate": 4.932979519629856e-06, "loss": 1.1295, "step": 14880 }, { "epoch": 9.114001530221882, "grad_norm": 2.6500282287597656, "learning_rate": 4.898958971218616e-06, "loss": 1.1262, "step": 14890 }, { "epoch": 9.120122417750574, "grad_norm": 4.016347408294678, "learning_rate": 4.864938422807376e-06, "loss": 1.1288, "step": 14900 }, { "epoch": 9.126243305279266, "grad_norm": 3.280271291732788, "learning_rate": 4.830917874396135e-06, "loss": 1.1441, "step": 14910 }, { "epoch": 9.132364192807957, "grad_norm": 2.6855580806732178, "learning_rate": 4.7968973259848945e-06, "loss": 1.1212, "step": 14920 }, { "epoch": 9.138485080336649, "grad_norm": 3.044363021850586, "learning_rate": 4.762876777573655e-06, "loss": 1.1398, "step": 14930 }, { "epoch": 9.14460596786534, "grad_norm": 3.8019986152648926, "learning_rate": 4.7288562291624144e-06, "loss": 1.1222, "step": 14940 }, { "epoch": 9.150726855394032, "grad_norm": 3.0457823276519775, "learning_rate": 4.6948356807511736e-06, "loss": 1.1531, "step": 14950 }, { "epoch": 9.156847742922723, "grad_norm": 3.0033369064331055, "learning_rate": 4.6608151323399336e-06, "loss": 1.1374, "step": 14960 }, { "epoch": 9.162968630451415, "grad_norm": 3.2390763759613037, "learning_rate": 4.6267945839286935e-06, "loss": 1.124, "step": 14970 }, { "epoch": 9.169089517980106, "grad_norm": 3.1349003314971924, "learning_rate": 4.592774035517453e-06, "loss": 1.1241, "step": 14980 }, { "epoch": 9.175210405508798, "grad_norm": 3.171571969985962, "learning_rate": 4.558753487106213e-06, "loss": 1.1218, "step": 14990 }, { "epoch": 9.181331293037491, "grad_norm": 3.28657603263855, "learning_rate": 4.524732938694972e-06, "loss": 1.1209, "step": 15000 }, { "epoch": 9.187452180566183, "grad_norm": 3.4533939361572266, "learning_rate": 4.490712390283732e-06, "loss": 1.129, "step": 15010 }, { "epoch": 9.193573068094874, "grad_norm": 3.1935927867889404, "learning_rate": 4.456691841872492e-06, "loss": 1.1171, "step": 15020 }, { "epoch": 9.199693955623566, "grad_norm": 3.5906758308410645, "learning_rate": 4.422671293461251e-06, "loss": 1.1337, "step": 15030 }, { "epoch": 9.205814843152258, "grad_norm": 3.6400201320648193, "learning_rate": 4.38865074505001e-06, "loss": 1.1338, "step": 15040 }, { "epoch": 9.211935730680949, "grad_norm": 3.086146116256714, "learning_rate": 4.35463019663877e-06, "loss": 1.133, "step": 15050 }, { "epoch": 9.21805661820964, "grad_norm": 3.2579269409179688, "learning_rate": 4.32060964822753e-06, "loss": 1.1388, "step": 15060 }, { "epoch": 9.224177505738332, "grad_norm": 2.8636515140533447, "learning_rate": 4.286589099816289e-06, "loss": 1.1233, "step": 15070 }, { "epoch": 9.230298393267024, "grad_norm": 3.022768497467041, "learning_rate": 4.252568551405048e-06, "loss": 1.1253, "step": 15080 }, { "epoch": 9.236419280795715, "grad_norm": 3.074159860610962, "learning_rate": 4.218548002993808e-06, "loss": 1.1527, "step": 15090 }, { "epoch": 9.242540168324407, "grad_norm": 5.568500518798828, "learning_rate": 4.184527454582568e-06, "loss": 1.1397, "step": 15100 }, { "epoch": 9.248661055853098, "grad_norm": 3.478727102279663, "learning_rate": 4.150506906171327e-06, "loss": 1.1371, "step": 15110 }, { "epoch": 9.25478194338179, "grad_norm": 3.0093882083892822, "learning_rate": 4.116486357760087e-06, "loss": 1.1308, "step": 15120 }, { "epoch": 9.260902830910481, "grad_norm": 2.999668836593628, "learning_rate": 4.0824658093488465e-06, "loss": 1.1466, "step": 15130 }, { "epoch": 9.267023718439173, "grad_norm": 3.1364176273345947, "learning_rate": 4.0484452609376065e-06, "loss": 1.1379, "step": 15140 }, { "epoch": 9.273144605967865, "grad_norm": 3.1165690422058105, "learning_rate": 4.0144247125263665e-06, "loss": 1.118, "step": 15150 }, { "epoch": 9.279265493496556, "grad_norm": 3.0751230716705322, "learning_rate": 3.980404164115126e-06, "loss": 1.1519, "step": 15160 }, { "epoch": 9.28538638102525, "grad_norm": 3.8941943645477295, "learning_rate": 3.946383615703885e-06, "loss": 1.1118, "step": 15170 }, { "epoch": 9.291507268553941, "grad_norm": 2.9752681255340576, "learning_rate": 3.912363067292646e-06, "loss": 1.1358, "step": 15180 }, { "epoch": 9.297628156082633, "grad_norm": 3.164053201675415, "learning_rate": 3.878342518881405e-06, "loss": 1.1298, "step": 15190 }, { "epoch": 9.303749043611324, "grad_norm": 3.064384698867798, "learning_rate": 3.844321970470164e-06, "loss": 1.1337, "step": 15200 }, { "epoch": 9.309869931140016, "grad_norm": 3.1741909980773926, "learning_rate": 3.810301422058924e-06, "loss": 1.1217, "step": 15210 }, { "epoch": 9.315990818668707, "grad_norm": 3.1322076320648193, "learning_rate": 3.7762808736476835e-06, "loss": 1.1268, "step": 15220 }, { "epoch": 9.322111706197399, "grad_norm": 3.045135736465454, "learning_rate": 3.742260325236443e-06, "loss": 1.1352, "step": 15230 }, { "epoch": 9.32823259372609, "grad_norm": 2.932527542114258, "learning_rate": 3.708239776825203e-06, "loss": 1.1294, "step": 15240 }, { "epoch": 9.334353481254782, "grad_norm": 3.5815134048461914, "learning_rate": 3.674219228413962e-06, "loss": 1.1304, "step": 15250 }, { "epoch": 9.340474368783473, "grad_norm": 3.414130449295044, "learning_rate": 3.6401986800027217e-06, "loss": 1.1232, "step": 15260 }, { "epoch": 9.346595256312165, "grad_norm": 5.269806861877441, "learning_rate": 3.6061781315914813e-06, "loss": 1.121, "step": 15270 }, { "epoch": 9.352716143840857, "grad_norm": 3.498656749725342, "learning_rate": 3.5721575831802413e-06, "loss": 1.1319, "step": 15280 }, { "epoch": 9.358837031369548, "grad_norm": 3.1993322372436523, "learning_rate": 3.5381370347690004e-06, "loss": 1.1244, "step": 15290 }, { "epoch": 9.36495791889824, "grad_norm": 3.3193633556365967, "learning_rate": 3.50411648635776e-06, "loss": 1.1454, "step": 15300 }, { "epoch": 9.371078806426931, "grad_norm": 3.172529697418213, "learning_rate": 3.47009593794652e-06, "loss": 1.1373, "step": 15310 }, { "epoch": 9.377199693955623, "grad_norm": 3.223278045654297, "learning_rate": 3.4360753895352795e-06, "loss": 1.1099, "step": 15320 }, { "epoch": 9.383320581484316, "grad_norm": 3.193432569503784, "learning_rate": 3.4020548411240386e-06, "loss": 1.121, "step": 15330 }, { "epoch": 9.389441469013008, "grad_norm": 3.438258647918701, "learning_rate": 3.368034292712799e-06, "loss": 1.1343, "step": 15340 }, { "epoch": 9.3955623565417, "grad_norm": 2.990633964538574, "learning_rate": 3.334013744301558e-06, "loss": 1.1368, "step": 15350 }, { "epoch": 9.40168324407039, "grad_norm": 3.5829241275787354, "learning_rate": 3.2999931958903178e-06, "loss": 1.1209, "step": 15360 }, { "epoch": 9.407804131599082, "grad_norm": 3.2910852432250977, "learning_rate": 3.2659726474790777e-06, "loss": 1.1385, "step": 15370 }, { "epoch": 9.413925019127774, "grad_norm": 2.937903642654419, "learning_rate": 3.2319520990678373e-06, "loss": 1.124, "step": 15380 }, { "epoch": 9.420045906656465, "grad_norm": 3.887308359146118, "learning_rate": 3.1979315506565964e-06, "loss": 1.1046, "step": 15390 }, { "epoch": 9.426166794185157, "grad_norm": 3.3708431720733643, "learning_rate": 3.1639110022453564e-06, "loss": 1.1138, "step": 15400 }, { "epoch": 9.432287681713849, "grad_norm": 3.5277280807495117, "learning_rate": 3.129890453834116e-06, "loss": 1.1523, "step": 15410 }, { "epoch": 9.43840856924254, "grad_norm": 2.8049731254577637, "learning_rate": 3.0958699054228756e-06, "loss": 1.1184, "step": 15420 }, { "epoch": 9.444529456771232, "grad_norm": 3.5323147773742676, "learning_rate": 3.061849357011635e-06, "loss": 1.1315, "step": 15430 }, { "epoch": 9.450650344299923, "grad_norm": 3.5388619899749756, "learning_rate": 3.0278288086003947e-06, "loss": 1.1142, "step": 15440 }, { "epoch": 9.456771231828615, "grad_norm": 3.382925033569336, "learning_rate": 2.9938082601891542e-06, "loss": 1.1183, "step": 15450 }, { "epoch": 9.462892119357306, "grad_norm": 3.5912578105926514, "learning_rate": 2.959787711777914e-06, "loss": 1.1363, "step": 15460 }, { "epoch": 9.469013006885998, "grad_norm": 3.481766700744629, "learning_rate": 2.925767163366674e-06, "loss": 1.1075, "step": 15470 }, { "epoch": 9.47513389441469, "grad_norm": 3.3989980220794678, "learning_rate": 2.891746614955433e-06, "loss": 1.1367, "step": 15480 }, { "epoch": 9.481254781943381, "grad_norm": 3.504020929336548, "learning_rate": 2.857726066544193e-06, "loss": 1.1343, "step": 15490 }, { "epoch": 9.487375669472073, "grad_norm": 3.38145112991333, "learning_rate": 2.8237055181329525e-06, "loss": 1.1064, "step": 15500 }, { "epoch": 9.493496557000766, "grad_norm": 3.2473201751708984, "learning_rate": 2.789684969721712e-06, "loss": 1.135, "step": 15510 }, { "epoch": 9.499617444529457, "grad_norm": 3.069551706314087, "learning_rate": 2.7556644213104716e-06, "loss": 1.1204, "step": 15520 }, { "epoch": 9.505738332058149, "grad_norm": 3.7915170192718506, "learning_rate": 2.721643872899231e-06, "loss": 1.1449, "step": 15530 }, { "epoch": 9.51185921958684, "grad_norm": 3.12088680267334, "learning_rate": 2.6876233244879907e-06, "loss": 1.1241, "step": 15540 }, { "epoch": 9.517980107115532, "grad_norm": 5.423216342926025, "learning_rate": 2.6536027760767507e-06, "loss": 1.1169, "step": 15550 }, { "epoch": 9.524100994644224, "grad_norm": 3.776402473449707, "learning_rate": 2.61958222766551e-06, "loss": 1.1351, "step": 15560 }, { "epoch": 9.530221882172915, "grad_norm": 3.071627616882324, "learning_rate": 2.58556167925427e-06, "loss": 1.1043, "step": 15570 }, { "epoch": 9.536342769701607, "grad_norm": 4.0253825187683105, "learning_rate": 2.5515411308430294e-06, "loss": 1.1118, "step": 15580 }, { "epoch": 9.542463657230298, "grad_norm": 3.3617985248565674, "learning_rate": 2.517520582431789e-06, "loss": 1.1208, "step": 15590 }, { "epoch": 9.54858454475899, "grad_norm": 2.9002251625061035, "learning_rate": 2.4835000340205485e-06, "loss": 1.1143, "step": 15600 }, { "epoch": 9.554705432287681, "grad_norm": 3.4758708477020264, "learning_rate": 2.449479485609308e-06, "loss": 1.1395, "step": 15610 }, { "epoch": 9.560826319816373, "grad_norm": 2.933244228363037, "learning_rate": 2.4154589371980677e-06, "loss": 1.1125, "step": 15620 }, { "epoch": 9.566947207345065, "grad_norm": 3.2921762466430664, "learning_rate": 2.3814383887868276e-06, "loss": 1.1387, "step": 15630 }, { "epoch": 9.573068094873756, "grad_norm": 3.7236812114715576, "learning_rate": 2.3474178403755868e-06, "loss": 1.1284, "step": 15640 }, { "epoch": 9.579188982402448, "grad_norm": 3.9367823600769043, "learning_rate": 2.3133972919643468e-06, "loss": 1.1343, "step": 15650 }, { "epoch": 9.58530986993114, "grad_norm": 3.410693645477295, "learning_rate": 2.2793767435531063e-06, "loss": 1.1268, "step": 15660 }, { "epoch": 9.591430757459833, "grad_norm": 3.306009292602539, "learning_rate": 2.245356195141866e-06, "loss": 1.1068, "step": 15670 }, { "epoch": 9.597551644988524, "grad_norm": 3.4138855934143066, "learning_rate": 2.2113356467306255e-06, "loss": 1.1455, "step": 15680 }, { "epoch": 9.603672532517216, "grad_norm": 3.4779481887817383, "learning_rate": 2.177315098319385e-06, "loss": 1.1274, "step": 15690 }, { "epoch": 9.609793420045907, "grad_norm": 3.2200963497161865, "learning_rate": 2.1432945499081446e-06, "loss": 1.1384, "step": 15700 }, { "epoch": 9.615914307574599, "grad_norm": 3.1388471126556396, "learning_rate": 2.109274001496904e-06, "loss": 1.1296, "step": 15710 }, { "epoch": 9.62203519510329, "grad_norm": 3.209235668182373, "learning_rate": 2.0752534530856637e-06, "loss": 1.1473, "step": 15720 }, { "epoch": 9.628156082631982, "grad_norm": 3.1397557258605957, "learning_rate": 2.0412329046744233e-06, "loss": 1.1258, "step": 15730 }, { "epoch": 9.634276970160673, "grad_norm": 3.0715200901031494, "learning_rate": 2.0072123562631833e-06, "loss": 1.1275, "step": 15740 }, { "epoch": 9.640397857689365, "grad_norm": 2.979800224304199, "learning_rate": 1.9731918078519424e-06, "loss": 1.1087, "step": 15750 }, { "epoch": 9.646518745218057, "grad_norm": 3.0174732208251953, "learning_rate": 1.9391712594407024e-06, "loss": 1.125, "step": 15760 }, { "epoch": 9.652639632746748, "grad_norm": 3.9182639122009277, "learning_rate": 1.905150711029462e-06, "loss": 1.1404, "step": 15770 }, { "epoch": 9.65876052027544, "grad_norm": 3.180032730102539, "learning_rate": 1.8711301626182215e-06, "loss": 1.1258, "step": 15780 }, { "epoch": 9.664881407804131, "grad_norm": 3.5461888313293457, "learning_rate": 1.837109614206981e-06, "loss": 1.1234, "step": 15790 }, { "epoch": 9.671002295332823, "grad_norm": 3.3645544052124023, "learning_rate": 1.8030890657957406e-06, "loss": 1.1248, "step": 15800 }, { "epoch": 9.677123182861514, "grad_norm": 3.192190170288086, "learning_rate": 1.7690685173845002e-06, "loss": 1.1435, "step": 15810 }, { "epoch": 9.683244070390206, "grad_norm": 3.084299325942993, "learning_rate": 1.73504796897326e-06, "loss": 1.113, "step": 15820 }, { "epoch": 9.689364957918897, "grad_norm": 3.075233221054077, "learning_rate": 1.7010274205620193e-06, "loss": 1.1314, "step": 15830 }, { "epoch": 9.695485845447589, "grad_norm": 2.9223477840423584, "learning_rate": 1.667006872150779e-06, "loss": 1.1218, "step": 15840 }, { "epoch": 9.701606732976282, "grad_norm": 2.988662004470825, "learning_rate": 1.6329863237395389e-06, "loss": 1.1205, "step": 15850 }, { "epoch": 9.707727620504974, "grad_norm": 3.0018911361694336, "learning_rate": 1.5989657753282982e-06, "loss": 1.1387, "step": 15860 }, { "epoch": 9.713848508033665, "grad_norm": 3.0322518348693848, "learning_rate": 1.564945226917058e-06, "loss": 1.1287, "step": 15870 }, { "epoch": 9.719969395562357, "grad_norm": 3.4595978260040283, "learning_rate": 1.5309246785058176e-06, "loss": 1.1386, "step": 15880 }, { "epoch": 9.726090283091049, "grad_norm": 3.31239914894104, "learning_rate": 1.4969041300945771e-06, "loss": 1.1349, "step": 15890 }, { "epoch": 9.73221117061974, "grad_norm": 2.9286789894104004, "learning_rate": 1.462883581683337e-06, "loss": 1.122, "step": 15900 }, { "epoch": 9.738332058148432, "grad_norm": 3.4690709114074707, "learning_rate": 1.4288630332720965e-06, "loss": 1.1368, "step": 15910 }, { "epoch": 9.744452945677123, "grad_norm": 3.5552656650543213, "learning_rate": 1.394842484860856e-06, "loss": 1.1394, "step": 15920 }, { "epoch": 9.750573833205815, "grad_norm": 3.3070826530456543, "learning_rate": 1.3608219364496156e-06, "loss": 1.1426, "step": 15930 }, { "epoch": 9.756694720734506, "grad_norm": 3.5823755264282227, "learning_rate": 1.3268013880383754e-06, "loss": 1.1264, "step": 15940 }, { "epoch": 9.762815608263198, "grad_norm": 2.63779354095459, "learning_rate": 1.292780839627135e-06, "loss": 1.1305, "step": 15950 }, { "epoch": 9.76893649579189, "grad_norm": 2.931502103805542, "learning_rate": 1.2587602912158945e-06, "loss": 1.118, "step": 15960 }, { "epoch": 9.775057383320581, "grad_norm": 2.708097457885742, "learning_rate": 1.224739742804654e-06, "loss": 1.1333, "step": 15970 }, { "epoch": 9.781178270849272, "grad_norm": 3.4733378887176514, "learning_rate": 1.1907191943934138e-06, "loss": 1.1304, "step": 15980 }, { "epoch": 9.787299158377964, "grad_norm": 3.208436965942383, "learning_rate": 1.1566986459821734e-06, "loss": 1.116, "step": 15990 }, { "epoch": 9.793420045906657, "grad_norm": 3.083686351776123, "learning_rate": 1.122678097570933e-06, "loss": 1.1345, "step": 16000 }, { "epoch": 9.799540933435349, "grad_norm": 5.919510841369629, "learning_rate": 1.0886575491596925e-06, "loss": 1.1295, "step": 16010 }, { "epoch": 9.80566182096404, "grad_norm": 3.0980031490325928, "learning_rate": 1.054637000748452e-06, "loss": 1.1264, "step": 16020 }, { "epoch": 9.811782708492732, "grad_norm": 2.991114854812622, "learning_rate": 1.0206164523372116e-06, "loss": 1.1158, "step": 16030 }, { "epoch": 9.817903596021424, "grad_norm": 2.762467861175537, "learning_rate": 9.865959039259712e-07, "loss": 1.1482, "step": 16040 }, { "epoch": 9.824024483550115, "grad_norm": 2.762880325317383, "learning_rate": 9.52575355514731e-07, "loss": 1.1252, "step": 16050 }, { "epoch": 9.830145371078807, "grad_norm": 3.7553000450134277, "learning_rate": 9.185548071034905e-07, "loss": 1.1354, "step": 16060 }, { "epoch": 9.836266258607498, "grad_norm": 3.1074488162994385, "learning_rate": 8.845342586922501e-07, "loss": 1.1433, "step": 16070 }, { "epoch": 9.84238714613619, "grad_norm": 3.9977195262908936, "learning_rate": 8.505137102810097e-07, "loss": 1.1506, "step": 16080 }, { "epoch": 9.848508033664881, "grad_norm": 2.973998785018921, "learning_rate": 8.164931618697694e-07, "loss": 1.138, "step": 16090 }, { "epoch": 9.854628921193573, "grad_norm": 2.8410356044769287, "learning_rate": 7.82472613458529e-07, "loss": 1.1325, "step": 16100 }, { "epoch": 9.860749808722264, "grad_norm": 3.599996328353882, "learning_rate": 7.484520650472886e-07, "loss": 1.1124, "step": 16110 }, { "epoch": 9.866870696250956, "grad_norm": 3.0998120307922363, "learning_rate": 7.144315166360482e-07, "loss": 1.1207, "step": 16120 }, { "epoch": 9.872991583779648, "grad_norm": 4.06972599029541, "learning_rate": 6.804109682248078e-07, "loss": 1.1025, "step": 16130 }, { "epoch": 9.87911247130834, "grad_norm": 4.1819329261779785, "learning_rate": 6.463904198135675e-07, "loss": 1.115, "step": 16140 }, { "epoch": 9.88523335883703, "grad_norm": 3.015798568725586, "learning_rate": 6.12369871402327e-07, "loss": 1.1476, "step": 16150 }, { "epoch": 9.891354246365722, "grad_norm": 3.4522454738616943, "learning_rate": 5.783493229910867e-07, "loss": 1.1165, "step": 16160 }, { "epoch": 9.897475133894414, "grad_norm": 3.4204092025756836, "learning_rate": 5.443287745798463e-07, "loss": 1.1144, "step": 16170 }, { "epoch": 9.903596021423107, "grad_norm": 3.0444397926330566, "learning_rate": 5.103082261686058e-07, "loss": 1.1305, "step": 16180 }, { "epoch": 9.909716908951799, "grad_norm": 3.3424997329711914, "learning_rate": 4.762876777573655e-07, "loss": 1.1561, "step": 16190 }, { "epoch": 9.91583779648049, "grad_norm": 2.94327449798584, "learning_rate": 4.4226712934612505e-07, "loss": 1.1472, "step": 16200 }, { "epoch": 9.921958684009182, "grad_norm": 3.0581696033477783, "learning_rate": 4.082465809348847e-07, "loss": 1.1122, "step": 16210 }, { "epoch": 9.928079571537873, "grad_norm": 3.0790395736694336, "learning_rate": 3.742260325236443e-07, "loss": 1.1427, "step": 16220 }, { "epoch": 9.934200459066565, "grad_norm": 3.1817245483398438, "learning_rate": 3.402054841124039e-07, "loss": 1.1301, "step": 16230 }, { "epoch": 9.940321346595256, "grad_norm": 2.9370524883270264, "learning_rate": 3.061849357011635e-07, "loss": 1.1217, "step": 16240 }, { "epoch": 9.946442234123948, "grad_norm": 3.528365135192871, "learning_rate": 2.7216438728992313e-07, "loss": 1.1252, "step": 16250 }, { "epoch": 9.95256312165264, "grad_norm": 2.9326071739196777, "learning_rate": 2.3814383887868274e-07, "loss": 1.109, "step": 16260 }, { "epoch": 9.958684009181331, "grad_norm": 3.28116512298584, "learning_rate": 2.0412329046744236e-07, "loss": 1.1162, "step": 16270 }, { "epoch": 9.964804896710023, "grad_norm": 2.973410129547119, "learning_rate": 1.7010274205620195e-07, "loss": 1.112, "step": 16280 }, { "epoch": 9.970925784238714, "grad_norm": 3.1284003257751465, "learning_rate": 1.3608219364496156e-07, "loss": 1.1297, "step": 16290 }, { "epoch": 9.977046671767406, "grad_norm": 3.262115001678467, "learning_rate": 1.0206164523372118e-07, "loss": 1.1359, "step": 16300 }, { "epoch": 9.983167559296097, "grad_norm": 4.041773796081543, "learning_rate": 6.804109682248078e-08, "loss": 1.1251, "step": 16310 }, { "epoch": 9.989288446824789, "grad_norm": 3.1230151653289795, "learning_rate": 3.402054841124039e-08, "loss": 1.1696, "step": 16320 }, { "epoch": 9.99540933435348, "grad_norm": 5.236699104309082, "learning_rate": 0.0, "loss": 1.1121, "step": 16330 }, { "epoch": 9.99540933435348, "eval_accuracy": 0.5118223208478402, "eval_loss": 1.1701384782791138, "eval_runtime": 2639.5684, "eval_samples_per_second": 79.216, "eval_steps_per_second": 0.619, "step": 16330 }, { "epoch": 9.99540933435348, "step": 16330, "total_flos": 8.570305751657333e+20, "train_loss": 1.22920058554692, "train_runtime": 204809.1788, "train_samples_per_second": 40.836, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 16330, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.570305751657333e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }