diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,38577 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998410788475947, + "eval_steps": 500, + "global_step": 5505, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018162417417758305, + "grad_norm": 0.9355892539024353, + "learning_rate": 5.181347150259068e-07, + "loss": 2.4784, + "step": 1 + }, + { + "epoch": 0.0003632483483551661, + "grad_norm": 1.0276386737823486, + "learning_rate": 1.0362694300518136e-06, + "loss": 2.5847, + "step": 2 + }, + { + "epoch": 0.0005448725225327491, + "grad_norm": 0.6001347899436951, + "learning_rate": 1.5544041450777204e-06, + "loss": 2.2673, + "step": 3 + }, + { + "epoch": 0.0007264966967103322, + "grad_norm": 0.9976602792739868, + "learning_rate": 2.0725388601036273e-06, + "loss": 2.7898, + "step": 4 + }, + { + "epoch": 0.0009081208708879152, + "grad_norm": 0.819512665271759, + "learning_rate": 2.5906735751295338e-06, + "loss": 2.7476, + "step": 5 + }, + { + "epoch": 0.0010897450450654982, + "grad_norm": 0.8106259107589722, + "learning_rate": 3.1088082901554407e-06, + "loss": 2.6634, + "step": 6 + }, + { + "epoch": 0.0012713692192430812, + "grad_norm": 0.8429345488548279, + "learning_rate": 3.626943005181347e-06, + "loss": 2.375, + "step": 7 + }, + { + "epoch": 0.0014529933934206644, + "grad_norm": 0.741061806678772, + "learning_rate": 4.1450777202072546e-06, + "loss": 2.4188, + "step": 8 + }, + { + "epoch": 0.0016346175675982474, + "grad_norm": 0.8119791150093079, + "learning_rate": 4.663212435233161e-06, + "loss": 2.4477, + "step": 9 + }, + { + "epoch": 0.0018162417417758303, + "grad_norm": 0.9859192967414856, + "learning_rate": 5.1813471502590676e-06, + "loss": 2.6121, + "step": 10 + }, + { + "epoch": 0.0019978659159534135, + "grad_norm": 0.9813627600669861, + "learning_rate": 5.699481865284975e-06, + "loss": 2.5205, + "step": 11 + }, + { + "epoch": 0.0021794900901309965, + "grad_norm": 0.9706599116325378, + "learning_rate": 6.217616580310881e-06, + "loss": 2.661, + "step": 12 + }, + { + "epoch": 0.0023611142643085795, + "grad_norm": 0.8823219537734985, + "learning_rate": 6.735751295336788e-06, + "loss": 2.5312, + "step": 13 + }, + { + "epoch": 0.0025427384384861624, + "grad_norm": 0.986649215221405, + "learning_rate": 7.253886010362694e-06, + "loss": 2.4421, + "step": 14 + }, + { + "epoch": 0.0027243626126637454, + "grad_norm": 0.8093275427818298, + "learning_rate": 7.772020725388602e-06, + "loss": 2.4559, + "step": 15 + }, + { + "epoch": 0.002905986786841329, + "grad_norm": 0.7500173449516296, + "learning_rate": 8.290155440414509e-06, + "loss": 2.3828, + "step": 16 + }, + { + "epoch": 0.0030876109610189118, + "grad_norm": 1.1916862726211548, + "learning_rate": 8.808290155440415e-06, + "loss": 2.478, + "step": 17 + }, + { + "epoch": 0.0032692351351964947, + "grad_norm": 0.8217176198959351, + "learning_rate": 9.326424870466322e-06, + "loss": 2.5261, + "step": 18 + }, + { + "epoch": 0.0034508593093740777, + "grad_norm": 0.9047083258628845, + "learning_rate": 9.84455958549223e-06, + "loss": 2.5519, + "step": 19 + }, + { + "epoch": 0.0036324834835516607, + "grad_norm": 0.8520768880844116, + "learning_rate": 1.0362694300518135e-05, + "loss": 2.4592, + "step": 20 + }, + { + "epoch": 0.0038141076577292436, + "grad_norm": 0.6490415930747986, + "learning_rate": 1.0880829015544042e-05, + "loss": 2.3465, + "step": 21 + }, + { + "epoch": 0.003995731831906827, + "grad_norm": 0.9296863675117493, + "learning_rate": 1.139896373056995e-05, + "loss": 2.4501, + "step": 22 + }, + { + "epoch": 0.00417735600608441, + "grad_norm": 0.713177502155304, + "learning_rate": 1.1917098445595855e-05, + "loss": 2.4707, + "step": 23 + }, + { + "epoch": 0.004358980180261993, + "grad_norm": 0.7937615513801575, + "learning_rate": 1.2435233160621763e-05, + "loss": 2.66, + "step": 24 + }, + { + "epoch": 0.004540604354439576, + "grad_norm": 1.196436882019043, + "learning_rate": 1.2953367875647668e-05, + "loss": 2.5459, + "step": 25 + }, + { + "epoch": 0.004722228528617159, + "grad_norm": 0.6011134386062622, + "learning_rate": 1.3471502590673576e-05, + "loss": 2.7127, + "step": 26 + }, + { + "epoch": 0.004903852702794742, + "grad_norm": 0.7113818526268005, + "learning_rate": 1.3989637305699481e-05, + "loss": 2.408, + "step": 27 + }, + { + "epoch": 0.005085476876972325, + "grad_norm": 0.5616409182548523, + "learning_rate": 1.4507772020725389e-05, + "loss": 2.4343, + "step": 28 + }, + { + "epoch": 0.005267101051149908, + "grad_norm": 0.5502139925956726, + "learning_rate": 1.5025906735751296e-05, + "loss": 2.3583, + "step": 29 + }, + { + "epoch": 0.005448725225327491, + "grad_norm": 0.7829020619392395, + "learning_rate": 1.5544041450777204e-05, + "loss": 2.4652, + "step": 30 + }, + { + "epoch": 0.005630349399505074, + "grad_norm": 0.6393241286277771, + "learning_rate": 1.606217616580311e-05, + "loss": 2.424, + "step": 31 + }, + { + "epoch": 0.005811973573682658, + "grad_norm": 0.5807009339332581, + "learning_rate": 1.6580310880829018e-05, + "loss": 2.6691, + "step": 32 + }, + { + "epoch": 0.0059935977478602406, + "grad_norm": 0.7704054117202759, + "learning_rate": 1.7098445595854924e-05, + "loss": 2.7302, + "step": 33 + }, + { + "epoch": 0.0061752219220378235, + "grad_norm": 0.5449063777923584, + "learning_rate": 1.761658031088083e-05, + "loss": 2.4516, + "step": 34 + }, + { + "epoch": 0.0063568460962154065, + "grad_norm": 0.5298614501953125, + "learning_rate": 1.813471502590674e-05, + "loss": 2.8757, + "step": 35 + }, + { + "epoch": 0.0065384702703929895, + "grad_norm": 0.5003065466880798, + "learning_rate": 1.8652849740932644e-05, + "loss": 2.4964, + "step": 36 + }, + { + "epoch": 0.0067200944445705724, + "grad_norm": 0.598949670791626, + "learning_rate": 1.917098445595855e-05, + "loss": 2.4403, + "step": 37 + }, + { + "epoch": 0.006901718618748155, + "grad_norm": 0.5137653946876526, + "learning_rate": 1.968911917098446e-05, + "loss": 2.6402, + "step": 38 + }, + { + "epoch": 0.007083342792925738, + "grad_norm": 1.902198076248169, + "learning_rate": 2.0207253886010365e-05, + "loss": 2.4357, + "step": 39 + }, + { + "epoch": 0.007264966967103321, + "grad_norm": 0.5281524062156677, + "learning_rate": 2.072538860103627e-05, + "loss": 2.447, + "step": 40 + }, + { + "epoch": 0.007446591141280904, + "grad_norm": 0.5813673734664917, + "learning_rate": 2.124352331606218e-05, + "loss": 2.7036, + "step": 41 + }, + { + "epoch": 0.007628215315458487, + "grad_norm": 0.5637506246566772, + "learning_rate": 2.1761658031088085e-05, + "loss": 2.4506, + "step": 42 + }, + { + "epoch": 0.00780983948963607, + "grad_norm": 0.6267617344856262, + "learning_rate": 2.227979274611399e-05, + "loss": 2.2509, + "step": 43 + }, + { + "epoch": 0.007991463663813654, + "grad_norm": 1.1631958484649658, + "learning_rate": 2.27979274611399e-05, + "loss": 2.45, + "step": 44 + }, + { + "epoch": 0.008173087837991236, + "grad_norm": 0.572827160358429, + "learning_rate": 2.3316062176165805e-05, + "loss": 2.4291, + "step": 45 + }, + { + "epoch": 0.00835471201216882, + "grad_norm": 0.46059638261795044, + "learning_rate": 2.383419689119171e-05, + "loss": 2.2827, + "step": 46 + }, + { + "epoch": 0.008536336186346402, + "grad_norm": 0.6101712584495544, + "learning_rate": 2.4352331606217617e-05, + "loss": 2.5707, + "step": 47 + }, + { + "epoch": 0.008717960360523986, + "grad_norm": 0.669353187084198, + "learning_rate": 2.4870466321243526e-05, + "loss": 2.4078, + "step": 48 + }, + { + "epoch": 0.008899584534701568, + "grad_norm": 0.8186442852020264, + "learning_rate": 2.538860103626943e-05, + "loss": 2.4147, + "step": 49 + }, + { + "epoch": 0.009081208708879152, + "grad_norm": 0.9178618788719177, + "learning_rate": 2.5906735751295337e-05, + "loss": 2.4082, + "step": 50 + }, + { + "epoch": 0.009262832883056734, + "grad_norm": 0.5555235147476196, + "learning_rate": 2.6424870466321246e-05, + "loss": 2.5507, + "step": 51 + }, + { + "epoch": 0.009444457057234318, + "grad_norm": 1.8077261447906494, + "learning_rate": 2.694300518134715e-05, + "loss": 2.5315, + "step": 52 + }, + { + "epoch": 0.009626081231411902, + "grad_norm": 0.5049795508384705, + "learning_rate": 2.7461139896373057e-05, + "loss": 2.4552, + "step": 53 + }, + { + "epoch": 0.009807705405589484, + "grad_norm": 0.3923340141773224, + "learning_rate": 2.7979274611398963e-05, + "loss": 2.2318, + "step": 54 + }, + { + "epoch": 0.009989329579767068, + "grad_norm": 0.46768641471862793, + "learning_rate": 2.8497409326424872e-05, + "loss": 2.5398, + "step": 55 + }, + { + "epoch": 0.01017095375394465, + "grad_norm": 0.4088476300239563, + "learning_rate": 2.9015544041450778e-05, + "loss": 2.33, + "step": 56 + }, + { + "epoch": 0.010352577928122234, + "grad_norm": 0.517441987991333, + "learning_rate": 2.9533678756476683e-05, + "loss": 2.3288, + "step": 57 + }, + { + "epoch": 0.010534202102299816, + "grad_norm": 0.37423601746559143, + "learning_rate": 3.0051813471502592e-05, + "loss": 2.3812, + "step": 58 + }, + { + "epoch": 0.0107158262764774, + "grad_norm": 0.3792896270751953, + "learning_rate": 3.05699481865285e-05, + "loss": 2.3738, + "step": 59 + }, + { + "epoch": 0.010897450450654982, + "grad_norm": 0.5935635566711426, + "learning_rate": 3.108808290155441e-05, + "loss": 2.4773, + "step": 60 + }, + { + "epoch": 0.011079074624832565, + "grad_norm": 1.0870025157928467, + "learning_rate": 3.1606217616580316e-05, + "loss": 2.3544, + "step": 61 + }, + { + "epoch": 0.011260698799010148, + "grad_norm": 0.3451383113861084, + "learning_rate": 3.212435233160622e-05, + "loss": 2.3082, + "step": 62 + }, + { + "epoch": 0.011442322973187731, + "grad_norm": 0.34188321232795715, + "learning_rate": 3.264248704663213e-05, + "loss": 2.2612, + "step": 63 + }, + { + "epoch": 0.011623947147365315, + "grad_norm": 0.4865647852420807, + "learning_rate": 3.3160621761658036e-05, + "loss": 2.6019, + "step": 64 + }, + { + "epoch": 0.011805571321542897, + "grad_norm": 0.4166278541088104, + "learning_rate": 3.367875647668394e-05, + "loss": 2.4319, + "step": 65 + }, + { + "epoch": 0.011987195495720481, + "grad_norm": 0.4125998616218567, + "learning_rate": 3.419689119170985e-05, + "loss": 2.3205, + "step": 66 + }, + { + "epoch": 0.012168819669898063, + "grad_norm": 0.44595298171043396, + "learning_rate": 3.471502590673576e-05, + "loss": 2.3382, + "step": 67 + }, + { + "epoch": 0.012350443844075647, + "grad_norm": 0.396384060382843, + "learning_rate": 3.523316062176166e-05, + "loss": 2.4102, + "step": 68 + }, + { + "epoch": 0.01253206801825323, + "grad_norm": 0.40286874771118164, + "learning_rate": 3.575129533678757e-05, + "loss": 2.3261, + "step": 69 + }, + { + "epoch": 0.012713692192430813, + "grad_norm": 0.3083820044994354, + "learning_rate": 3.626943005181348e-05, + "loss": 2.301, + "step": 70 + }, + { + "epoch": 0.012895316366608395, + "grad_norm": 0.3478189706802368, + "learning_rate": 3.678756476683938e-05, + "loss": 2.1265, + "step": 71 + }, + { + "epoch": 0.013076940540785979, + "grad_norm": 0.5084174275398254, + "learning_rate": 3.730569948186529e-05, + "loss": 2.4529, + "step": 72 + }, + { + "epoch": 0.013258564714963561, + "grad_norm": 1.4283984899520874, + "learning_rate": 3.78238341968912e-05, + "loss": 2.3471, + "step": 73 + }, + { + "epoch": 0.013440188889141145, + "grad_norm": 1.1325210332870483, + "learning_rate": 3.83419689119171e-05, + "loss": 2.4232, + "step": 74 + }, + { + "epoch": 0.013621813063318727, + "grad_norm": 0.36412298679351807, + "learning_rate": 3.886010362694301e-05, + "loss": 2.2145, + "step": 75 + }, + { + "epoch": 0.01380343723749631, + "grad_norm": 0.5811281800270081, + "learning_rate": 3.937823834196892e-05, + "loss": 2.3214, + "step": 76 + }, + { + "epoch": 0.013985061411673895, + "grad_norm": 0.5909280180931091, + "learning_rate": 3.989637305699482e-05, + "loss": 2.3929, + "step": 77 + }, + { + "epoch": 0.014166685585851477, + "grad_norm": 0.654392421245575, + "learning_rate": 4.041450777202073e-05, + "loss": 2.2751, + "step": 78 + }, + { + "epoch": 0.01434830976002906, + "grad_norm": 0.30986690521240234, + "learning_rate": 4.093264248704664e-05, + "loss": 2.2492, + "step": 79 + }, + { + "epoch": 0.014529933934206643, + "grad_norm": 0.4318878650665283, + "learning_rate": 4.145077720207254e-05, + "loss": 2.2025, + "step": 80 + }, + { + "epoch": 0.014711558108384227, + "grad_norm": 2.3534741401672363, + "learning_rate": 4.196891191709845e-05, + "loss": 2.6087, + "step": 81 + }, + { + "epoch": 0.014893182282561809, + "grad_norm": 0.34861910343170166, + "learning_rate": 4.248704663212436e-05, + "loss": 2.2196, + "step": 82 + }, + { + "epoch": 0.015074806456739392, + "grad_norm": 0.4679009020328522, + "learning_rate": 4.300518134715026e-05, + "loss": 2.4186, + "step": 83 + }, + { + "epoch": 0.015256430630916975, + "grad_norm": 0.5095484852790833, + "learning_rate": 4.352331606217617e-05, + "loss": 2.4261, + "step": 84 + }, + { + "epoch": 0.015438054805094558, + "grad_norm": 0.36703184247016907, + "learning_rate": 4.404145077720208e-05, + "loss": 2.3011, + "step": 85 + }, + { + "epoch": 0.01561967897927214, + "grad_norm": 0.38229334354400635, + "learning_rate": 4.455958549222798e-05, + "loss": 2.1118, + "step": 86 + }, + { + "epoch": 0.015801303153449724, + "grad_norm": 0.36906898021698, + "learning_rate": 4.507772020725389e-05, + "loss": 2.1179, + "step": 87 + }, + { + "epoch": 0.015982927327627308, + "grad_norm": 0.3785008192062378, + "learning_rate": 4.55958549222798e-05, + "loss": 2.0318, + "step": 88 + }, + { + "epoch": 0.01616455150180489, + "grad_norm": 0.42567700147628784, + "learning_rate": 4.61139896373057e-05, + "loss": 2.2625, + "step": 89 + }, + { + "epoch": 0.016346175675982472, + "grad_norm": 0.35618677735328674, + "learning_rate": 4.663212435233161e-05, + "loss": 2.201, + "step": 90 + }, + { + "epoch": 0.016527799850160056, + "grad_norm": 0.4565955698490143, + "learning_rate": 4.715025906735751e-05, + "loss": 2.2626, + "step": 91 + }, + { + "epoch": 0.01670942402433764, + "grad_norm": 0.5555636882781982, + "learning_rate": 4.766839378238342e-05, + "loss": 2.1012, + "step": 92 + }, + { + "epoch": 0.016891048198515224, + "grad_norm": 0.4084779918193817, + "learning_rate": 4.818652849740933e-05, + "loss": 2.2521, + "step": 93 + }, + { + "epoch": 0.017072672372692804, + "grad_norm": 1.8528622388839722, + "learning_rate": 4.870466321243523e-05, + "loss": 2.2422, + "step": 94 + }, + { + "epoch": 0.017254296546870388, + "grad_norm": 0.3796592056751251, + "learning_rate": 4.922279792746114e-05, + "loss": 2.1936, + "step": 95 + }, + { + "epoch": 0.017435920721047972, + "grad_norm": 1.0143017768859863, + "learning_rate": 4.974093264248705e-05, + "loss": 2.1626, + "step": 96 + }, + { + "epoch": 0.017617544895225556, + "grad_norm": 0.9917471408843994, + "learning_rate": 5.0259067357512954e-05, + "loss": 2.2291, + "step": 97 + }, + { + "epoch": 0.017799169069403136, + "grad_norm": 1.0647225379943848, + "learning_rate": 5.077720207253886e-05, + "loss": 2.2496, + "step": 98 + }, + { + "epoch": 0.01798079324358072, + "grad_norm": 0.4856836497783661, + "learning_rate": 5.129533678756477e-05, + "loss": 2.3867, + "step": 99 + }, + { + "epoch": 0.018162417417758304, + "grad_norm": 0.4964068830013275, + "learning_rate": 5.1813471502590674e-05, + "loss": 2.1406, + "step": 100 + }, + { + "epoch": 0.018344041591935888, + "grad_norm": 1.1051304340362549, + "learning_rate": 5.233160621761658e-05, + "loss": 2.2095, + "step": 101 + }, + { + "epoch": 0.018525665766113468, + "grad_norm": 0.5281195044517517, + "learning_rate": 5.284974093264249e-05, + "loss": 2.1395, + "step": 102 + }, + { + "epoch": 0.018707289940291052, + "grad_norm": 1.481725811958313, + "learning_rate": 5.3367875647668394e-05, + "loss": 2.3895, + "step": 103 + }, + { + "epoch": 0.018888914114468636, + "grad_norm": 0.4415139853954315, + "learning_rate": 5.38860103626943e-05, + "loss": 2.062, + "step": 104 + }, + { + "epoch": 0.01907053828864622, + "grad_norm": 0.6395520567893982, + "learning_rate": 5.440414507772021e-05, + "loss": 2.2551, + "step": 105 + }, + { + "epoch": 0.019252162462823803, + "grad_norm": 0.3751755654811859, + "learning_rate": 5.4922279792746115e-05, + "loss": 2.1727, + "step": 106 + }, + { + "epoch": 0.019433786637001384, + "grad_norm": 1.847085952758789, + "learning_rate": 5.5440414507772024e-05, + "loss": 2.1582, + "step": 107 + }, + { + "epoch": 0.019615410811178968, + "grad_norm": 1.0029480457305908, + "learning_rate": 5.5958549222797926e-05, + "loss": 2.1846, + "step": 108 + }, + { + "epoch": 0.01979703498535655, + "grad_norm": 0.444780558347702, + "learning_rate": 5.6476683937823835e-05, + "loss": 2.1476, + "step": 109 + }, + { + "epoch": 0.019978659159534135, + "grad_norm": 0.9109865427017212, + "learning_rate": 5.6994818652849744e-05, + "loss": 2.3904, + "step": 110 + }, + { + "epoch": 0.020160283333711716, + "grad_norm": 0.9916377663612366, + "learning_rate": 5.751295336787565e-05, + "loss": 2.2877, + "step": 111 + }, + { + "epoch": 0.0203419075078893, + "grad_norm": 0.7947441935539246, + "learning_rate": 5.8031088082901555e-05, + "loss": 2.3243, + "step": 112 + }, + { + "epoch": 0.020523531682066883, + "grad_norm": 0.5035752654075623, + "learning_rate": 5.8549222797927464e-05, + "loss": 2.0979, + "step": 113 + }, + { + "epoch": 0.020705155856244467, + "grad_norm": 0.4561691880226135, + "learning_rate": 5.9067357512953367e-05, + "loss": 1.9182, + "step": 114 + }, + { + "epoch": 0.020886780030422047, + "grad_norm": 0.600885272026062, + "learning_rate": 5.9585492227979276e-05, + "loss": 2.0583, + "step": 115 + }, + { + "epoch": 0.02106840420459963, + "grad_norm": 0.44051992893218994, + "learning_rate": 6.0103626943005185e-05, + "loss": 2.203, + "step": 116 + }, + { + "epoch": 0.021250028378777215, + "grad_norm": 1.0905159711837769, + "learning_rate": 6.0621761658031094e-05, + "loss": 2.0277, + "step": 117 + }, + { + "epoch": 0.0214316525529548, + "grad_norm": 0.5323740839958191, + "learning_rate": 6.1139896373057e-05, + "loss": 2.0476, + "step": 118 + }, + { + "epoch": 0.021613276727132383, + "grad_norm": 0.5111187100410461, + "learning_rate": 6.16580310880829e-05, + "loss": 2.0962, + "step": 119 + }, + { + "epoch": 0.021794900901309963, + "grad_norm": 0.6293158531188965, + "learning_rate": 6.217616580310881e-05, + "loss": 1.9443, + "step": 120 + }, + { + "epoch": 0.021976525075487547, + "grad_norm": 0.9380314350128174, + "learning_rate": 6.269430051813472e-05, + "loss": 2.0789, + "step": 121 + }, + { + "epoch": 0.02215814924966513, + "grad_norm": 0.5308202505111694, + "learning_rate": 6.321243523316063e-05, + "loss": 2.2565, + "step": 122 + }, + { + "epoch": 0.022339773423842715, + "grad_norm": 0.6502010226249695, + "learning_rate": 6.373056994818653e-05, + "loss": 2.222, + "step": 123 + }, + { + "epoch": 0.022521397598020295, + "grad_norm": 0.5924925804138184, + "learning_rate": 6.424870466321244e-05, + "loss": 2.1623, + "step": 124 + }, + { + "epoch": 0.02270302177219788, + "grad_norm": 0.5103888511657715, + "learning_rate": 6.476683937823834e-05, + "loss": 2.0882, + "step": 125 + }, + { + "epoch": 0.022884645946375463, + "grad_norm": 0.8708823323249817, + "learning_rate": 6.528497409326425e-05, + "loss": 2.3389, + "step": 126 + }, + { + "epoch": 0.023066270120553047, + "grad_norm": 0.4867015779018402, + "learning_rate": 6.580310880829016e-05, + "loss": 2.2712, + "step": 127 + }, + { + "epoch": 0.02324789429473063, + "grad_norm": 0.5543134808540344, + "learning_rate": 6.632124352331607e-05, + "loss": 2.255, + "step": 128 + }, + { + "epoch": 0.02342951846890821, + "grad_norm": 0.5474322438240051, + "learning_rate": 6.683937823834198e-05, + "loss": 2.1097, + "step": 129 + }, + { + "epoch": 0.023611142643085795, + "grad_norm": 0.5004704594612122, + "learning_rate": 6.735751295336788e-05, + "loss": 2.1266, + "step": 130 + }, + { + "epoch": 0.02379276681726338, + "grad_norm": 0.523854672908783, + "learning_rate": 6.787564766839378e-05, + "loss": 2.1426, + "step": 131 + }, + { + "epoch": 0.023974390991440962, + "grad_norm": 0.5098679065704346, + "learning_rate": 6.83937823834197e-05, + "loss": 2.1159, + "step": 132 + }, + { + "epoch": 0.024156015165618543, + "grad_norm": 0.5101820826530457, + "learning_rate": 6.89119170984456e-05, + "loss": 2.1442, + "step": 133 + }, + { + "epoch": 0.024337639339796126, + "grad_norm": 0.49454033374786377, + "learning_rate": 6.943005181347151e-05, + "loss": 2.2096, + "step": 134 + }, + { + "epoch": 0.02451926351397371, + "grad_norm": 0.5890989303588867, + "learning_rate": 6.994818652849742e-05, + "loss": 2.129, + "step": 135 + }, + { + "epoch": 0.024700887688151294, + "grad_norm": 0.7449917197227478, + "learning_rate": 7.046632124352332e-05, + "loss": 2.0047, + "step": 136 + }, + { + "epoch": 0.024882511862328874, + "grad_norm": 0.4811873435974121, + "learning_rate": 7.098445595854922e-05, + "loss": 2.1288, + "step": 137 + }, + { + "epoch": 0.02506413603650646, + "grad_norm": 0.9320221543312073, + "learning_rate": 7.150259067357514e-05, + "loss": 2.1691, + "step": 138 + }, + { + "epoch": 0.025245760210684042, + "grad_norm": 0.7224787473678589, + "learning_rate": 7.202072538860104e-05, + "loss": 2.2568, + "step": 139 + }, + { + "epoch": 0.025427384384861626, + "grad_norm": 1.3435351848602295, + "learning_rate": 7.253886010362695e-05, + "loss": 2.0933, + "step": 140 + }, + { + "epoch": 0.02560900855903921, + "grad_norm": 0.7233456373214722, + "learning_rate": 7.305699481865286e-05, + "loss": 2.091, + "step": 141 + }, + { + "epoch": 0.02579063273321679, + "grad_norm": 0.5043159127235413, + "learning_rate": 7.357512953367876e-05, + "loss": 2.0361, + "step": 142 + }, + { + "epoch": 0.025972256907394374, + "grad_norm": 0.5293927192687988, + "learning_rate": 7.409326424870466e-05, + "loss": 2.121, + "step": 143 + }, + { + "epoch": 0.026153881081571958, + "grad_norm": 0.7314125895500183, + "learning_rate": 7.461139896373058e-05, + "loss": 2.1064, + "step": 144 + }, + { + "epoch": 0.02633550525574954, + "grad_norm": 1.0181902647018433, + "learning_rate": 7.512953367875648e-05, + "loss": 2.1687, + "step": 145 + }, + { + "epoch": 0.026517129429927122, + "grad_norm": 0.6310367584228516, + "learning_rate": 7.56476683937824e-05, + "loss": 2.0239, + "step": 146 + }, + { + "epoch": 0.026698753604104706, + "grad_norm": 0.5555290579795837, + "learning_rate": 7.61658031088083e-05, + "loss": 2.1226, + "step": 147 + }, + { + "epoch": 0.02688037777828229, + "grad_norm": 1.0884681940078735, + "learning_rate": 7.66839378238342e-05, + "loss": 1.998, + "step": 148 + }, + { + "epoch": 0.027062001952459874, + "grad_norm": 0.7076562643051147, + "learning_rate": 7.72020725388601e-05, + "loss": 2.2531, + "step": 149 + }, + { + "epoch": 0.027243626126637454, + "grad_norm": 0.5506603717803955, + "learning_rate": 7.772020725388602e-05, + "loss": 1.9174, + "step": 150 + }, + { + "epoch": 0.027425250300815038, + "grad_norm": 0.6282668113708496, + "learning_rate": 7.823834196891192e-05, + "loss": 2.0004, + "step": 151 + }, + { + "epoch": 0.02760687447499262, + "grad_norm": 0.730701744556427, + "learning_rate": 7.875647668393784e-05, + "loss": 2.2952, + "step": 152 + }, + { + "epoch": 0.027788498649170205, + "grad_norm": 0.5425845384597778, + "learning_rate": 7.927461139896374e-05, + "loss": 2.0512, + "step": 153 + }, + { + "epoch": 0.02797012282334779, + "grad_norm": 0.9672417640686035, + "learning_rate": 7.979274611398964e-05, + "loss": 1.9864, + "step": 154 + }, + { + "epoch": 0.02815174699752537, + "grad_norm": 0.5153308510780334, + "learning_rate": 8.031088082901554e-05, + "loss": 1.9224, + "step": 155 + }, + { + "epoch": 0.028333371171702953, + "grad_norm": 0.5459411144256592, + "learning_rate": 8.082901554404146e-05, + "loss": 2.0777, + "step": 156 + }, + { + "epoch": 0.028514995345880537, + "grad_norm": 0.5593655705451965, + "learning_rate": 8.134715025906736e-05, + "loss": 1.8415, + "step": 157 + }, + { + "epoch": 0.02869661952005812, + "grad_norm": 0.5603652000427246, + "learning_rate": 8.186528497409328e-05, + "loss": 1.892, + "step": 158 + }, + { + "epoch": 0.0288782436942357, + "grad_norm": 0.5403916239738464, + "learning_rate": 8.238341968911918e-05, + "loss": 1.9784, + "step": 159 + }, + { + "epoch": 0.029059867868413285, + "grad_norm": 1.3487857580184937, + "learning_rate": 8.290155440414508e-05, + "loss": 1.9403, + "step": 160 + }, + { + "epoch": 0.02924149204259087, + "grad_norm": 0.56437748670578, + "learning_rate": 8.341968911917098e-05, + "loss": 2.1233, + "step": 161 + }, + { + "epoch": 0.029423116216768453, + "grad_norm": 0.773360013961792, + "learning_rate": 8.39378238341969e-05, + "loss": 2.1698, + "step": 162 + }, + { + "epoch": 0.029604740390946033, + "grad_norm": 0.5230541825294495, + "learning_rate": 8.44559585492228e-05, + "loss": 1.8507, + "step": 163 + }, + { + "epoch": 0.029786364565123617, + "grad_norm": 0.558347761631012, + "learning_rate": 8.497409326424872e-05, + "loss": 2.0165, + "step": 164 + }, + { + "epoch": 0.0299679887393012, + "grad_norm": 1.2348442077636719, + "learning_rate": 8.549222797927462e-05, + "loss": 2.1044, + "step": 165 + }, + { + "epoch": 0.030149612913478785, + "grad_norm": 0.5135602355003357, + "learning_rate": 8.601036269430052e-05, + "loss": 2.0679, + "step": 166 + }, + { + "epoch": 0.03033123708765637, + "grad_norm": 0.5656648874282837, + "learning_rate": 8.652849740932642e-05, + "loss": 2.0321, + "step": 167 + }, + { + "epoch": 0.03051286126183395, + "grad_norm": 0.4788796305656433, + "learning_rate": 8.704663212435234e-05, + "loss": 1.9276, + "step": 168 + }, + { + "epoch": 0.030694485436011533, + "grad_norm": 0.4768126606941223, + "learning_rate": 8.756476683937824e-05, + "loss": 1.994, + "step": 169 + }, + { + "epoch": 0.030876109610189117, + "grad_norm": 0.5992928743362427, + "learning_rate": 8.808290155440416e-05, + "loss": 2.1367, + "step": 170 + }, + { + "epoch": 0.0310577337843667, + "grad_norm": 0.472366601228714, + "learning_rate": 8.860103626943006e-05, + "loss": 1.9625, + "step": 171 + }, + { + "epoch": 0.03123935795854428, + "grad_norm": 0.5374825596809387, + "learning_rate": 8.911917098445596e-05, + "loss": 2.1125, + "step": 172 + }, + { + "epoch": 0.03142098213272187, + "grad_norm": 0.5096084475517273, + "learning_rate": 8.963730569948186e-05, + "loss": 1.9918, + "step": 173 + }, + { + "epoch": 0.03160260630689945, + "grad_norm": 0.599746584892273, + "learning_rate": 9.015544041450778e-05, + "loss": 2.034, + "step": 174 + }, + { + "epoch": 0.03178423048107703, + "grad_norm": 0.5323488116264343, + "learning_rate": 9.067357512953368e-05, + "loss": 1.8535, + "step": 175 + }, + { + "epoch": 0.031965854655254616, + "grad_norm": 0.9212128520011902, + "learning_rate": 9.11917098445596e-05, + "loss": 1.9794, + "step": 176 + }, + { + "epoch": 0.0321474788294322, + "grad_norm": 0.5121253132820129, + "learning_rate": 9.17098445595855e-05, + "loss": 1.9265, + "step": 177 + }, + { + "epoch": 0.03232910300360978, + "grad_norm": 0.4543314278125763, + "learning_rate": 9.22279792746114e-05, + "loss": 1.8726, + "step": 178 + }, + { + "epoch": 0.032510727177787364, + "grad_norm": 0.6522256731987, + "learning_rate": 9.27461139896373e-05, + "loss": 2.1014, + "step": 179 + }, + { + "epoch": 0.032692351351964945, + "grad_norm": 0.5582627058029175, + "learning_rate": 9.326424870466322e-05, + "loss": 1.9575, + "step": 180 + }, + { + "epoch": 0.03287397552614253, + "grad_norm": 0.7073903679847717, + "learning_rate": 9.378238341968912e-05, + "loss": 2.0158, + "step": 181 + }, + { + "epoch": 0.03305559970032011, + "grad_norm": 0.7305707931518555, + "learning_rate": 9.430051813471503e-05, + "loss": 2.2171, + "step": 182 + }, + { + "epoch": 0.03323722387449769, + "grad_norm": 0.45588213205337524, + "learning_rate": 9.481865284974094e-05, + "loss": 1.8412, + "step": 183 + }, + { + "epoch": 0.03341884804867528, + "grad_norm": 0.5821022391319275, + "learning_rate": 9.533678756476684e-05, + "loss": 1.8955, + "step": 184 + }, + { + "epoch": 0.03360047222285286, + "grad_norm": 0.4976764917373657, + "learning_rate": 9.585492227979275e-05, + "loss": 1.8346, + "step": 185 + }, + { + "epoch": 0.03378209639703045, + "grad_norm": 0.7272288799285889, + "learning_rate": 9.637305699481866e-05, + "loss": 2.1006, + "step": 186 + }, + { + "epoch": 0.03396372057120803, + "grad_norm": 0.5521365404129028, + "learning_rate": 9.689119170984456e-05, + "loss": 2.0787, + "step": 187 + }, + { + "epoch": 0.03414534474538561, + "grad_norm": 0.9140254259109497, + "learning_rate": 9.740932642487047e-05, + "loss": 2.0385, + "step": 188 + }, + { + "epoch": 0.034326968919563196, + "grad_norm": 0.556168794631958, + "learning_rate": 9.792746113989638e-05, + "loss": 2.1439, + "step": 189 + }, + { + "epoch": 0.034508593093740776, + "grad_norm": 0.5846197605133057, + "learning_rate": 9.844559585492228e-05, + "loss": 2.0476, + "step": 190 + }, + { + "epoch": 0.034690217267918357, + "grad_norm": 0.8282696008682251, + "learning_rate": 9.896373056994819e-05, + "loss": 2.0304, + "step": 191 + }, + { + "epoch": 0.034871841442095944, + "grad_norm": 0.6812045574188232, + "learning_rate": 9.94818652849741e-05, + "loss": 2.1063, + "step": 192 + }, + { + "epoch": 0.035053465616273524, + "grad_norm": 0.4153490662574768, + "learning_rate": 0.0001, + "loss": 1.9792, + "step": 193 + }, + { + "epoch": 0.03523508979045111, + "grad_norm": 0.5349352359771729, + "learning_rate": 9.9999991255729e-05, + "loss": 1.9231, + "step": 194 + }, + { + "epoch": 0.03541671396462869, + "grad_norm": 0.6110925674438477, + "learning_rate": 9.999996502291907e-05, + "loss": 1.9478, + "step": 195 + }, + { + "epoch": 0.03559833813880627, + "grad_norm": 0.6445969939231873, + "learning_rate": 9.999992130157935e-05, + "loss": 2.0168, + "step": 196 + }, + { + "epoch": 0.03577996231298386, + "grad_norm": 0.5698654651641846, + "learning_rate": 9.999986009172518e-05, + "loss": 2.088, + "step": 197 + }, + { + "epoch": 0.03596158648716144, + "grad_norm": 0.4550374746322632, + "learning_rate": 9.999978139337793e-05, + "loss": 1.907, + "step": 198 + }, + { + "epoch": 0.03614321066133903, + "grad_norm": 0.5427805781364441, + "learning_rate": 9.999968520656516e-05, + "loss": 1.9499, + "step": 199 + }, + { + "epoch": 0.03632483483551661, + "grad_norm": 0.8563138246536255, + "learning_rate": 9.999957153132047e-05, + "loss": 2.0971, + "step": 200 + }, + { + "epoch": 0.03650645900969419, + "grad_norm": 0.5087870359420776, + "learning_rate": 9.999944036768366e-05, + "loss": 2.0396, + "step": 201 + }, + { + "epoch": 0.036688083183871775, + "grad_norm": 0.6134437918663025, + "learning_rate": 9.999929171570059e-05, + "loss": 2.0672, + "step": 202 + }, + { + "epoch": 0.036869707358049356, + "grad_norm": 0.5154579877853394, + "learning_rate": 9.999912557542326e-05, + "loss": 1.9605, + "step": 203 + }, + { + "epoch": 0.037051331532226936, + "grad_norm": 0.5387488603591919, + "learning_rate": 9.999894194690977e-05, + "loss": 1.9856, + "step": 204 + }, + { + "epoch": 0.03723295570640452, + "grad_norm": 0.4504241645336151, + "learning_rate": 9.999874083022437e-05, + "loss": 1.9329, + "step": 205 + }, + { + "epoch": 0.037414579880582104, + "grad_norm": 2.0094799995422363, + "learning_rate": 9.999852222543739e-05, + "loss": 1.9693, + "step": 206 + }, + { + "epoch": 0.03759620405475969, + "grad_norm": 0.7588562965393066, + "learning_rate": 9.999828613262528e-05, + "loss": 2.1319, + "step": 207 + }, + { + "epoch": 0.03777782822893727, + "grad_norm": 1.8306244611740112, + "learning_rate": 9.999803255187064e-05, + "loss": 2.1474, + "step": 208 + }, + { + "epoch": 0.03795945240311485, + "grad_norm": 0.6234582662582397, + "learning_rate": 9.999776148326216e-05, + "loss": 1.854, + "step": 209 + }, + { + "epoch": 0.03814107657729244, + "grad_norm": 0.4484318494796753, + "learning_rate": 9.999747292689462e-05, + "loss": 1.9622, + "step": 210 + }, + { + "epoch": 0.03832270075147002, + "grad_norm": 0.5201136469841003, + "learning_rate": 9.999716688286903e-05, + "loss": 1.848, + "step": 211 + }, + { + "epoch": 0.03850432492564761, + "grad_norm": 0.40048733353614807, + "learning_rate": 9.999684335129233e-05, + "loss": 1.9368, + "step": 212 + }, + { + "epoch": 0.03868594909982519, + "grad_norm": 0.5734590888023376, + "learning_rate": 9.999650233227775e-05, + "loss": 1.9985, + "step": 213 + }, + { + "epoch": 0.03886757327400277, + "grad_norm": 0.8003738522529602, + "learning_rate": 9.999614382594457e-05, + "loss": 2.0196, + "step": 214 + }, + { + "epoch": 0.039049197448180355, + "grad_norm": 0.6715677380561829, + "learning_rate": 9.999576783241815e-05, + "loss": 1.9451, + "step": 215 + }, + { + "epoch": 0.039230821622357935, + "grad_norm": 0.8733481168746948, + "learning_rate": 9.999537435183002e-05, + "loss": 1.9113, + "step": 216 + }, + { + "epoch": 0.039412445796535515, + "grad_norm": 0.5027008652687073, + "learning_rate": 9.999496338431781e-05, + "loss": 1.8876, + "step": 217 + }, + { + "epoch": 0.0395940699707131, + "grad_norm": 0.501278817653656, + "learning_rate": 9.999453493002526e-05, + "loss": 2.0054, + "step": 218 + }, + { + "epoch": 0.03977569414489068, + "grad_norm": 0.4332396388053894, + "learning_rate": 9.999408898910223e-05, + "loss": 1.9783, + "step": 219 + }, + { + "epoch": 0.03995731831906827, + "grad_norm": 0.5797857642173767, + "learning_rate": 9.999362556170471e-05, + "loss": 2.0512, + "step": 220 + }, + { + "epoch": 0.04013894249324585, + "grad_norm": 0.5610188245773315, + "learning_rate": 9.999314464799477e-05, + "loss": 2.0057, + "step": 221 + }, + { + "epoch": 0.04032056666742343, + "grad_norm": 1.9245027303695679, + "learning_rate": 9.999264624814064e-05, + "loss": 2.103, + "step": 222 + }, + { + "epoch": 0.04050219084160102, + "grad_norm": 0.8170127868652344, + "learning_rate": 9.999213036231663e-05, + "loss": 1.6707, + "step": 223 + }, + { + "epoch": 0.0406838150157786, + "grad_norm": 0.677662193775177, + "learning_rate": 9.99915969907032e-05, + "loss": 1.9779, + "step": 224 + }, + { + "epoch": 0.040865439189956186, + "grad_norm": 0.7971349954605103, + "learning_rate": 9.999104613348688e-05, + "loss": 1.7315, + "step": 225 + }, + { + "epoch": 0.041047063364133766, + "grad_norm": 0.5359565615653992, + "learning_rate": 9.99904777908604e-05, + "loss": 1.9773, + "step": 226 + }, + { + "epoch": 0.04122868753831135, + "grad_norm": 0.8242977261543274, + "learning_rate": 9.998989196302247e-05, + "loss": 2.0346, + "step": 227 + }, + { + "epoch": 0.041410311712488934, + "grad_norm": 0.533557653427124, + "learning_rate": 9.998928865017805e-05, + "loss": 1.9612, + "step": 228 + }, + { + "epoch": 0.041591935886666515, + "grad_norm": 0.6152899265289307, + "learning_rate": 9.998866785253816e-05, + "loss": 2.0139, + "step": 229 + }, + { + "epoch": 0.041773560060844095, + "grad_norm": 0.5450100302696228, + "learning_rate": 9.998802957031991e-05, + "loss": 1.8884, + "step": 230 + }, + { + "epoch": 0.04195518423502168, + "grad_norm": 1.0017800331115723, + "learning_rate": 9.998737380374656e-05, + "loss": 2.0601, + "step": 231 + }, + { + "epoch": 0.04213680840919926, + "grad_norm": 0.8007463216781616, + "learning_rate": 9.998670055304751e-05, + "loss": 1.9534, + "step": 232 + }, + { + "epoch": 0.04231843258337685, + "grad_norm": 0.5781195759773254, + "learning_rate": 9.998600981845821e-05, + "loss": 1.8906, + "step": 233 + }, + { + "epoch": 0.04250005675755443, + "grad_norm": 0.5877636075019836, + "learning_rate": 9.998530160022026e-05, + "loss": 1.9805, + "step": 234 + }, + { + "epoch": 0.04268168093173201, + "grad_norm": 0.5641177892684937, + "learning_rate": 9.998457589858138e-05, + "loss": 2.027, + "step": 235 + }, + { + "epoch": 0.0428633051059096, + "grad_norm": 0.4406791627407074, + "learning_rate": 9.998383271379541e-05, + "loss": 1.7494, + "step": 236 + }, + { + "epoch": 0.04304492928008718, + "grad_norm": 0.45656275749206543, + "learning_rate": 9.998307204612228e-05, + "loss": 1.783, + "step": 237 + }, + { + "epoch": 0.043226553454264766, + "grad_norm": 2.456749200820923, + "learning_rate": 9.998229389582806e-05, + "loss": 1.9996, + "step": 238 + }, + { + "epoch": 0.043408177628442346, + "grad_norm": 0.4090185761451721, + "learning_rate": 9.998149826318492e-05, + "loss": 1.8552, + "step": 239 + }, + { + "epoch": 0.043589801802619926, + "grad_norm": 0.5331476926803589, + "learning_rate": 9.998068514847115e-05, + "loss": 2.0985, + "step": 240 + }, + { + "epoch": 0.043771425976797514, + "grad_norm": 0.504355788230896, + "learning_rate": 9.997985455197114e-05, + "loss": 1.9726, + "step": 241 + }, + { + "epoch": 0.043953050150975094, + "grad_norm": 0.5038161277770996, + "learning_rate": 9.997900647397542e-05, + "loss": 1.9096, + "step": 242 + }, + { + "epoch": 0.044134674325152674, + "grad_norm": 0.6025775074958801, + "learning_rate": 9.997814091478063e-05, + "loss": 2.2669, + "step": 243 + }, + { + "epoch": 0.04431629849933026, + "grad_norm": 0.7067390084266663, + "learning_rate": 9.99772578746895e-05, + "loss": 2.0333, + "step": 244 + }, + { + "epoch": 0.04449792267350784, + "grad_norm": 0.4111911356449127, + "learning_rate": 9.997635735401092e-05, + "loss": 1.9079, + "step": 245 + }, + { + "epoch": 0.04467954684768543, + "grad_norm": 0.5181403756141663, + "learning_rate": 9.997543935305984e-05, + "loss": 2.2205, + "step": 246 + }, + { + "epoch": 0.04486117102186301, + "grad_norm": 0.7203893065452576, + "learning_rate": 9.997450387215737e-05, + "loss": 2.0579, + "step": 247 + }, + { + "epoch": 0.04504279519604059, + "grad_norm": 0.5655404925346375, + "learning_rate": 9.997355091163067e-05, + "loss": 1.9029, + "step": 248 + }, + { + "epoch": 0.04522441937021818, + "grad_norm": 0.6414552927017212, + "learning_rate": 9.997258047181312e-05, + "loss": 1.9435, + "step": 249 + }, + { + "epoch": 0.04540604354439576, + "grad_norm": 1.5546740293502808, + "learning_rate": 9.997159255304412e-05, + "loss": 2.0844, + "step": 250 + }, + { + "epoch": 0.045587667718573345, + "grad_norm": 0.4496568739414215, + "learning_rate": 9.99705871556692e-05, + "loss": 1.8197, + "step": 251 + }, + { + "epoch": 0.045769291892750925, + "grad_norm": 0.6394856572151184, + "learning_rate": 9.996956428004006e-05, + "loss": 1.9991, + "step": 252 + }, + { + "epoch": 0.045950916066928506, + "grad_norm": 0.4730919599533081, + "learning_rate": 9.996852392651441e-05, + "loss": 1.926, + "step": 253 + }, + { + "epoch": 0.04613254024110609, + "grad_norm": 0.9420594573020935, + "learning_rate": 9.99674660954562e-05, + "loss": 2.1053, + "step": 254 + }, + { + "epoch": 0.04631416441528367, + "grad_norm": 0.7130627632141113, + "learning_rate": 9.99663907872354e-05, + "loss": 1.9033, + "step": 255 + }, + { + "epoch": 0.04649578858946126, + "grad_norm": 1.1046788692474365, + "learning_rate": 9.996529800222811e-05, + "loss": 2.0286, + "step": 256 + }, + { + "epoch": 0.04667741276363884, + "grad_norm": 0.465515673160553, + "learning_rate": 9.996418774081658e-05, + "loss": 2.1228, + "step": 257 + }, + { + "epoch": 0.04685903693781642, + "grad_norm": 0.5146877765655518, + "learning_rate": 9.996306000338913e-05, + "loss": 2.217, + "step": 258 + }, + { + "epoch": 0.04704066111199401, + "grad_norm": 0.5746394395828247, + "learning_rate": 9.99619147903402e-05, + "loss": 1.9521, + "step": 259 + }, + { + "epoch": 0.04722228528617159, + "grad_norm": 0.48638585209846497, + "learning_rate": 9.996075210207039e-05, + "loss": 1.8195, + "step": 260 + }, + { + "epoch": 0.04740390946034917, + "grad_norm": 0.5564948916435242, + "learning_rate": 9.995957193898633e-05, + "loss": 1.9843, + "step": 261 + }, + { + "epoch": 0.04758553363452676, + "grad_norm": 0.577829897403717, + "learning_rate": 9.995837430150084e-05, + "loss": 2.0708, + "step": 262 + }, + { + "epoch": 0.04776715780870434, + "grad_norm": 0.7894048690795898, + "learning_rate": 9.99571591900328e-05, + "loss": 1.9195, + "step": 263 + }, + { + "epoch": 0.047948781982881925, + "grad_norm": 0.5667011141777039, + "learning_rate": 9.995592660500723e-05, + "loss": 2.1423, + "step": 264 + }, + { + "epoch": 0.048130406157059505, + "grad_norm": 0.6106431484222412, + "learning_rate": 9.995467654685524e-05, + "loss": 1.7233, + "step": 265 + }, + { + "epoch": 0.048312030331237085, + "grad_norm": 0.596860945224762, + "learning_rate": 9.995340901601409e-05, + "loss": 1.8656, + "step": 266 + }, + { + "epoch": 0.04849365450541467, + "grad_norm": 0.5731503367424011, + "learning_rate": 9.995212401292708e-05, + "loss": 1.9853, + "step": 267 + }, + { + "epoch": 0.04867527867959225, + "grad_norm": 0.5788452625274658, + "learning_rate": 9.995082153804372e-05, + "loss": 1.981, + "step": 268 + }, + { + "epoch": 0.04885690285376984, + "grad_norm": 0.6130810976028442, + "learning_rate": 9.994950159181955e-05, + "loss": 2.1088, + "step": 269 + }, + { + "epoch": 0.04903852702794742, + "grad_norm": 0.951172947883606, + "learning_rate": 9.994816417471625e-05, + "loss": 2.1207, + "step": 270 + }, + { + "epoch": 0.049220151202125, + "grad_norm": 0.6242237091064453, + "learning_rate": 9.994680928720159e-05, + "loss": 1.9262, + "step": 271 + }, + { + "epoch": 0.04940177537630259, + "grad_norm": 0.7647978663444519, + "learning_rate": 9.994543692974951e-05, + "loss": 1.9241, + "step": 272 + }, + { + "epoch": 0.04958339955048017, + "grad_norm": 0.537568211555481, + "learning_rate": 9.994404710283998e-05, + "loss": 2.0117, + "step": 273 + }, + { + "epoch": 0.04976502372465775, + "grad_norm": 0.5689613819122314, + "learning_rate": 9.994263980695916e-05, + "loss": 1.8471, + "step": 274 + }, + { + "epoch": 0.049946647898835336, + "grad_norm": 0.5327227711677551, + "learning_rate": 9.994121504259926e-05, + "loss": 1.8627, + "step": 275 + }, + { + "epoch": 0.05012827207301292, + "grad_norm": 0.571347713470459, + "learning_rate": 9.993977281025862e-05, + "loss": 1.863, + "step": 276 + }, + { + "epoch": 0.050309896247190504, + "grad_norm": 0.46306657791137695, + "learning_rate": 9.99383131104417e-05, + "loss": 1.8234, + "step": 277 + }, + { + "epoch": 0.050491520421368084, + "grad_norm": 0.9584365487098694, + "learning_rate": 9.993683594365906e-05, + "loss": 2.1433, + "step": 278 + }, + { + "epoch": 0.050673144595545665, + "grad_norm": 0.4813757538795471, + "learning_rate": 9.993534131042736e-05, + "loss": 1.8555, + "step": 279 + }, + { + "epoch": 0.05085476876972325, + "grad_norm": 0.501269519329071, + "learning_rate": 9.993382921126937e-05, + "loss": 2.1177, + "step": 280 + }, + { + "epoch": 0.05103639294390083, + "grad_norm": 0.49768951535224915, + "learning_rate": 9.993229964671401e-05, + "loss": 2.0618, + "step": 281 + }, + { + "epoch": 0.05121801711807842, + "grad_norm": 0.592501699924469, + "learning_rate": 9.993075261729626e-05, + "loss": 2.1432, + "step": 282 + }, + { + "epoch": 0.051399641292256, + "grad_norm": 0.7442554235458374, + "learning_rate": 9.992918812355722e-05, + "loss": 2.1114, + "step": 283 + }, + { + "epoch": 0.05158126546643358, + "grad_norm": 0.6427244544029236, + "learning_rate": 9.992760616604411e-05, + "loss": 1.9454, + "step": 284 + }, + { + "epoch": 0.05176288964061117, + "grad_norm": 0.5275068879127502, + "learning_rate": 9.992600674531025e-05, + "loss": 1.8789, + "step": 285 + }, + { + "epoch": 0.05194451381478875, + "grad_norm": 0.9334560632705688, + "learning_rate": 9.992438986191508e-05, + "loss": 2.1246, + "step": 286 + }, + { + "epoch": 0.05212613798896633, + "grad_norm": 0.7725337147712708, + "learning_rate": 9.992275551642412e-05, + "loss": 2.0579, + "step": 287 + }, + { + "epoch": 0.052307762163143916, + "grad_norm": 0.5573204755783081, + "learning_rate": 9.992110370940904e-05, + "loss": 1.8973, + "step": 288 + }, + { + "epoch": 0.052489386337321496, + "grad_norm": 0.55717533826828, + "learning_rate": 9.991943444144757e-05, + "loss": 1.9215, + "step": 289 + }, + { + "epoch": 0.05267101051149908, + "grad_norm": 0.6777409315109253, + "learning_rate": 9.99177477131236e-05, + "loss": 1.8596, + "step": 290 + }, + { + "epoch": 0.052852634685676664, + "grad_norm": 0.4369008243083954, + "learning_rate": 9.991604352502706e-05, + "loss": 2.0565, + "step": 291 + }, + { + "epoch": 0.053034258859854244, + "grad_norm": 0.4787452816963196, + "learning_rate": 9.991432187775404e-05, + "loss": 1.8195, + "step": 292 + }, + { + "epoch": 0.05321588303403183, + "grad_norm": 0.4417600929737091, + "learning_rate": 9.991258277190676e-05, + "loss": 1.9176, + "step": 293 + }, + { + "epoch": 0.05339750720820941, + "grad_norm": 0.7290786504745483, + "learning_rate": 9.991082620809346e-05, + "loss": 1.8646, + "step": 294 + }, + { + "epoch": 0.053579131382387, + "grad_norm": 0.9229206442832947, + "learning_rate": 9.990905218692856e-05, + "loss": 1.9688, + "step": 295 + }, + { + "epoch": 0.05376075555656458, + "grad_norm": 0.4611010253429413, + "learning_rate": 9.990726070903255e-05, + "loss": 1.9803, + "step": 296 + }, + { + "epoch": 0.05394237973074216, + "grad_norm": 0.7303685545921326, + "learning_rate": 9.990545177503203e-05, + "loss": 2.0098, + "step": 297 + }, + { + "epoch": 0.05412400390491975, + "grad_norm": 0.92532879114151, + "learning_rate": 9.990362538555974e-05, + "loss": 1.965, + "step": 298 + }, + { + "epoch": 0.05430562807909733, + "grad_norm": 1.031825304031372, + "learning_rate": 9.990178154125447e-05, + "loss": 1.861, + "step": 299 + }, + { + "epoch": 0.05448725225327491, + "grad_norm": 0.7958323359489441, + "learning_rate": 9.989992024276116e-05, + "loss": 1.8802, + "step": 300 + }, + { + "epoch": 0.054668876427452495, + "grad_norm": 0.7992780208587646, + "learning_rate": 9.989804149073081e-05, + "loss": 1.9898, + "step": 301 + }, + { + "epoch": 0.054850500601630076, + "grad_norm": 0.6866152882575989, + "learning_rate": 9.98961452858206e-05, + "loss": 1.9722, + "step": 302 + }, + { + "epoch": 0.05503212477580766, + "grad_norm": 1.5901784896850586, + "learning_rate": 9.989423162869373e-05, + "loss": 1.8668, + "step": 303 + }, + { + "epoch": 0.05521374894998524, + "grad_norm": 0.5362821221351624, + "learning_rate": 9.989230052001954e-05, + "loss": 1.8459, + "step": 304 + }, + { + "epoch": 0.055395373124162824, + "grad_norm": 0.5171098113059998, + "learning_rate": 9.98903519604735e-05, + "loss": 1.7798, + "step": 305 + }, + { + "epoch": 0.05557699729834041, + "grad_norm": 1.7755564451217651, + "learning_rate": 9.988838595073715e-05, + "loss": 1.8907, + "step": 306 + }, + { + "epoch": 0.05575862147251799, + "grad_norm": 1.1596213579177856, + "learning_rate": 9.988640249149814e-05, + "loss": 2.0165, + "step": 307 + }, + { + "epoch": 0.05594024564669558, + "grad_norm": 0.5094814896583557, + "learning_rate": 9.988440158345022e-05, + "loss": 1.8385, + "step": 308 + }, + { + "epoch": 0.05612186982087316, + "grad_norm": 0.4795410633087158, + "learning_rate": 9.988238322729325e-05, + "loss": 1.7478, + "step": 309 + }, + { + "epoch": 0.05630349399505074, + "grad_norm": 0.5195814967155457, + "learning_rate": 9.98803474237332e-05, + "loss": 1.8514, + "step": 310 + }, + { + "epoch": 0.05648511816922833, + "grad_norm": 0.5391933917999268, + "learning_rate": 9.987829417348213e-05, + "loss": 2.0813, + "step": 311 + }, + { + "epoch": 0.05666674234340591, + "grad_norm": 0.4565756022930145, + "learning_rate": 9.987622347725823e-05, + "loss": 1.9294, + "step": 312 + }, + { + "epoch": 0.05684836651758349, + "grad_norm": 0.8106536865234375, + "learning_rate": 9.987413533578574e-05, + "loss": 2.0036, + "step": 313 + }, + { + "epoch": 0.057029990691761075, + "grad_norm": 0.6366486549377441, + "learning_rate": 9.987202974979502e-05, + "loss": 1.978, + "step": 314 + }, + { + "epoch": 0.057211614865938655, + "grad_norm": 0.49140796065330505, + "learning_rate": 9.986990672002258e-05, + "loss": 1.8049, + "step": 315 + }, + { + "epoch": 0.05739323904011624, + "grad_norm": 0.5457502007484436, + "learning_rate": 9.986776624721097e-05, + "loss": 1.7408, + "step": 316 + }, + { + "epoch": 0.05757486321429382, + "grad_norm": 0.8576157093048096, + "learning_rate": 9.986560833210887e-05, + "loss": 2.0088, + "step": 317 + }, + { + "epoch": 0.0577564873884714, + "grad_norm": 0.44103729724884033, + "learning_rate": 9.986343297547104e-05, + "loss": 1.6961, + "step": 318 + }, + { + "epoch": 0.05793811156264899, + "grad_norm": 0.6258465647697449, + "learning_rate": 9.98612401780584e-05, + "loss": 2.0141, + "step": 319 + }, + { + "epoch": 0.05811973573682657, + "grad_norm": 0.5575562119483948, + "learning_rate": 9.98590299406379e-05, + "loss": 1.9816, + "step": 320 + }, + { + "epoch": 0.05830135991100416, + "grad_norm": 0.49205225706100464, + "learning_rate": 9.985680226398261e-05, + "loss": 1.9399, + "step": 321 + }, + { + "epoch": 0.05848298408518174, + "grad_norm": 0.4992923438549042, + "learning_rate": 9.985455714887171e-05, + "loss": 1.8168, + "step": 322 + }, + { + "epoch": 0.05866460825935932, + "grad_norm": 0.5268008708953857, + "learning_rate": 9.985229459609046e-05, + "loss": 1.9027, + "step": 323 + }, + { + "epoch": 0.058846232433536906, + "grad_norm": 0.6155356764793396, + "learning_rate": 9.985001460643028e-05, + "loss": 1.9663, + "step": 324 + }, + { + "epoch": 0.059027856607714486, + "grad_norm": 0.49179914593696594, + "learning_rate": 9.984771718068861e-05, + "loss": 1.9217, + "step": 325 + }, + { + "epoch": 0.05920948078189207, + "grad_norm": 0.545464813709259, + "learning_rate": 9.984540231966904e-05, + "loss": 1.9117, + "step": 326 + }, + { + "epoch": 0.059391104956069654, + "grad_norm": 0.4688274562358856, + "learning_rate": 9.984307002418121e-05, + "loss": 2.0378, + "step": 327 + }, + { + "epoch": 0.059572729130247234, + "grad_norm": 0.5679923295974731, + "learning_rate": 9.984072029504092e-05, + "loss": 2.0892, + "step": 328 + }, + { + "epoch": 0.05975435330442482, + "grad_norm": 0.5551508069038391, + "learning_rate": 9.983835313307002e-05, + "loss": 1.911, + "step": 329 + }, + { + "epoch": 0.0599359774786024, + "grad_norm": 0.508350670337677, + "learning_rate": 9.98359685390965e-05, + "loss": 1.9275, + "step": 330 + }, + { + "epoch": 0.06011760165277998, + "grad_norm": 0.5157881379127502, + "learning_rate": 9.983356651395436e-05, + "loss": 1.8197, + "step": 331 + }, + { + "epoch": 0.06029922582695757, + "grad_norm": 0.6892542839050293, + "learning_rate": 9.983114705848384e-05, + "loss": 1.8518, + "step": 332 + }, + { + "epoch": 0.06048085000113515, + "grad_norm": 0.3899281620979309, + "learning_rate": 9.982871017353115e-05, + "loss": 1.8268, + "step": 333 + }, + { + "epoch": 0.06066247417531274, + "grad_norm": 2.06675124168396, + "learning_rate": 9.982625585994863e-05, + "loss": 2.1146, + "step": 334 + }, + { + "epoch": 0.06084409834949032, + "grad_norm": 0.5297189950942993, + "learning_rate": 9.982378411859476e-05, + "loss": 1.8628, + "step": 335 + }, + { + "epoch": 0.0610257225236679, + "grad_norm": 1.2235521078109741, + "learning_rate": 9.982129495033406e-05, + "loss": 1.7627, + "step": 336 + }, + { + "epoch": 0.061207346697845486, + "grad_norm": 0.6300793886184692, + "learning_rate": 9.981878835603717e-05, + "loss": 1.8824, + "step": 337 + }, + { + "epoch": 0.061388970872023066, + "grad_norm": 0.4006335139274597, + "learning_rate": 9.981626433658083e-05, + "loss": 1.6601, + "step": 338 + }, + { + "epoch": 0.061570595046200646, + "grad_norm": 0.5129688382148743, + "learning_rate": 9.981372289284788e-05, + "loss": 2.0826, + "step": 339 + }, + { + "epoch": 0.061752219220378234, + "grad_norm": 0.5632669925689697, + "learning_rate": 9.981116402572722e-05, + "loss": 1.8608, + "step": 340 + }, + { + "epoch": 0.061933843394555814, + "grad_norm": 1.4150149822235107, + "learning_rate": 9.980858773611388e-05, + "loss": 2.3284, + "step": 341 + }, + { + "epoch": 0.0621154675687334, + "grad_norm": 0.538357138633728, + "learning_rate": 9.980599402490896e-05, + "loss": 2.0453, + "step": 342 + }, + { + "epoch": 0.06229709174291098, + "grad_norm": 0.5519031882286072, + "learning_rate": 9.980338289301968e-05, + "loss": 1.7887, + "step": 343 + }, + { + "epoch": 0.06247871591708856, + "grad_norm": 0.4075939655303955, + "learning_rate": 9.980075434135934e-05, + "loss": 1.7524, + "step": 344 + }, + { + "epoch": 0.06266034009126614, + "grad_norm": 0.42223554849624634, + "learning_rate": 9.979810837084731e-05, + "loss": 1.865, + "step": 345 + }, + { + "epoch": 0.06284196426544374, + "grad_norm": 0.6270445585250854, + "learning_rate": 9.979544498240908e-05, + "loss": 1.9746, + "step": 346 + }, + { + "epoch": 0.06302358843962132, + "grad_norm": 0.4365719258785248, + "learning_rate": 9.979276417697624e-05, + "loss": 2.0284, + "step": 347 + }, + { + "epoch": 0.0632052126137989, + "grad_norm": 0.8699837923049927, + "learning_rate": 9.979006595548644e-05, + "loss": 1.965, + "step": 348 + }, + { + "epoch": 0.06338683678797648, + "grad_norm": 0.5183691382408142, + "learning_rate": 9.978735031888345e-05, + "loss": 1.8619, + "step": 349 + }, + { + "epoch": 0.06356846096215406, + "grad_norm": 0.5108275413513184, + "learning_rate": 9.978461726811712e-05, + "loss": 2.104, + "step": 350 + }, + { + "epoch": 0.06375008513633165, + "grad_norm": 0.560340166091919, + "learning_rate": 9.97818668041434e-05, + "loss": 1.942, + "step": 351 + }, + { + "epoch": 0.06393170931050923, + "grad_norm": 0.5048916339874268, + "learning_rate": 9.977909892792429e-05, + "loss": 2.0062, + "step": 352 + }, + { + "epoch": 0.06411333348468681, + "grad_norm": 0.503559410572052, + "learning_rate": 9.977631364042795e-05, + "loss": 1.913, + "step": 353 + }, + { + "epoch": 0.0642949576588644, + "grad_norm": 0.5209774971008301, + "learning_rate": 9.977351094262857e-05, + "loss": 1.9194, + "step": 354 + }, + { + "epoch": 0.06447658183304197, + "grad_norm": 0.48323437571525574, + "learning_rate": 9.977069083550645e-05, + "loss": 1.8887, + "step": 355 + }, + { + "epoch": 0.06465820600721955, + "grad_norm": 0.6365177035331726, + "learning_rate": 9.976785332004801e-05, + "loss": 1.835, + "step": 356 + }, + { + "epoch": 0.06483983018139715, + "grad_norm": 1.144991159439087, + "learning_rate": 9.976499839724568e-05, + "loss": 2.122, + "step": 357 + }, + { + "epoch": 0.06502145435557473, + "grad_norm": 1.7136495113372803, + "learning_rate": 9.97621260680981e-05, + "loss": 2.1187, + "step": 358 + }, + { + "epoch": 0.06520307852975231, + "grad_norm": 0.4659644067287445, + "learning_rate": 9.975923633360985e-05, + "loss": 1.8431, + "step": 359 + }, + { + "epoch": 0.06538470270392989, + "grad_norm": 0.86777663230896, + "learning_rate": 9.975632919479172e-05, + "loss": 1.9699, + "step": 360 + }, + { + "epoch": 0.06556632687810747, + "grad_norm": 0.4893040060997009, + "learning_rate": 9.975340465266053e-05, + "loss": 1.8685, + "step": 361 + }, + { + "epoch": 0.06574795105228506, + "grad_norm": 0.5920698046684265, + "learning_rate": 9.97504627082392e-05, + "loss": 1.9624, + "step": 362 + }, + { + "epoch": 0.06592957522646264, + "grad_norm": 0.6074674725532532, + "learning_rate": 9.974750336255675e-05, + "loss": 1.8944, + "step": 363 + }, + { + "epoch": 0.06611119940064022, + "grad_norm": 0.5636250376701355, + "learning_rate": 9.974452661664825e-05, + "loss": 1.99, + "step": 364 + }, + { + "epoch": 0.0662928235748178, + "grad_norm": 0.4112483263015747, + "learning_rate": 9.974153247155489e-05, + "loss": 1.8319, + "step": 365 + }, + { + "epoch": 0.06647444774899539, + "grad_norm": 0.36125364899635315, + "learning_rate": 9.973852092832394e-05, + "loss": 1.7408, + "step": 366 + }, + { + "epoch": 0.06665607192317298, + "grad_norm": 0.6088950634002686, + "learning_rate": 9.973549198800874e-05, + "loss": 2.0506, + "step": 367 + }, + { + "epoch": 0.06683769609735056, + "grad_norm": 0.5315702557563782, + "learning_rate": 9.973244565166873e-05, + "loss": 1.9147, + "step": 368 + }, + { + "epoch": 0.06701932027152814, + "grad_norm": 1.3861491680145264, + "learning_rate": 9.972938192036944e-05, + "loss": 1.9785, + "step": 369 + }, + { + "epoch": 0.06720094444570572, + "grad_norm": 0.5531999468803406, + "learning_rate": 9.972630079518245e-05, + "loss": 1.8073, + "step": 370 + }, + { + "epoch": 0.0673825686198833, + "grad_norm": 0.6080440878868103, + "learning_rate": 9.972320227718546e-05, + "loss": 1.8597, + "step": 371 + }, + { + "epoch": 0.0675641927940609, + "grad_norm": 0.470310240983963, + "learning_rate": 9.972008636746225e-05, + "loss": 1.7832, + "step": 372 + }, + { + "epoch": 0.06774581696823848, + "grad_norm": 0.6512259244918823, + "learning_rate": 9.971695306710267e-05, + "loss": 2.0078, + "step": 373 + }, + { + "epoch": 0.06792744114241606, + "grad_norm": 0.44897764921188354, + "learning_rate": 9.971380237720264e-05, + "loss": 1.945, + "step": 374 + }, + { + "epoch": 0.06810906531659364, + "grad_norm": 0.45520028471946716, + "learning_rate": 9.97106342988642e-05, + "loss": 1.9587, + "step": 375 + }, + { + "epoch": 0.06829068949077122, + "grad_norm": 0.5400926470756531, + "learning_rate": 9.970744883319545e-05, + "loss": 1.9172, + "step": 376 + }, + { + "epoch": 0.06847231366494881, + "grad_norm": 0.5085148215293884, + "learning_rate": 9.970424598131056e-05, + "loss": 2.054, + "step": 377 + }, + { + "epoch": 0.06865393783912639, + "grad_norm": 0.8323948979377747, + "learning_rate": 9.97010257443298e-05, + "loss": 1.9645, + "step": 378 + }, + { + "epoch": 0.06883556201330397, + "grad_norm": 0.5662606954574585, + "learning_rate": 9.969778812337952e-05, + "loss": 1.8119, + "step": 379 + }, + { + "epoch": 0.06901718618748155, + "grad_norm": 0.4778348505496979, + "learning_rate": 9.969453311959214e-05, + "loss": 1.9783, + "step": 380 + }, + { + "epoch": 0.06919881036165913, + "grad_norm": 0.6256463527679443, + "learning_rate": 9.969126073410617e-05, + "loss": 1.7369, + "step": 381 + }, + { + "epoch": 0.06938043453583671, + "grad_norm": 0.42785632610321045, + "learning_rate": 9.968797096806619e-05, + "loss": 1.8356, + "step": 382 + }, + { + "epoch": 0.06956205871001431, + "grad_norm": 0.483888179063797, + "learning_rate": 9.968466382262286e-05, + "loss": 1.8284, + "step": 383 + }, + { + "epoch": 0.06974368288419189, + "grad_norm": 0.4679669737815857, + "learning_rate": 9.968133929893295e-05, + "loss": 1.6306, + "step": 384 + }, + { + "epoch": 0.06992530705836947, + "grad_norm": 0.6714125871658325, + "learning_rate": 9.967799739815925e-05, + "loss": 1.9029, + "step": 385 + }, + { + "epoch": 0.07010693123254705, + "grad_norm": 0.6736912131309509, + "learning_rate": 9.967463812147067e-05, + "loss": 1.8662, + "step": 386 + }, + { + "epoch": 0.07028855540672463, + "grad_norm": 0.482697993516922, + "learning_rate": 9.96712614700422e-05, + "loss": 1.8166, + "step": 387 + }, + { + "epoch": 0.07047017958090222, + "grad_norm": 0.4093661308288574, + "learning_rate": 9.966786744505485e-05, + "loss": 1.892, + "step": 388 + }, + { + "epoch": 0.0706518037550798, + "grad_norm": 0.4277246594429016, + "learning_rate": 9.966445604769581e-05, + "loss": 1.8756, + "step": 389 + }, + { + "epoch": 0.07083342792925738, + "grad_norm": 0.5065869688987732, + "learning_rate": 9.966102727915824e-05, + "loss": 1.8002, + "step": 390 + }, + { + "epoch": 0.07101505210343496, + "grad_norm": 0.5910537242889404, + "learning_rate": 9.965758114064147e-05, + "loss": 2.1122, + "step": 391 + }, + { + "epoch": 0.07119667627761254, + "grad_norm": 0.8893569707870483, + "learning_rate": 9.965411763335082e-05, + "loss": 1.9363, + "step": 392 + }, + { + "epoch": 0.07137830045179014, + "grad_norm": 0.4846252202987671, + "learning_rate": 9.965063675849773e-05, + "loss": 1.9339, + "step": 393 + }, + { + "epoch": 0.07155992462596772, + "grad_norm": 0.7354714274406433, + "learning_rate": 9.964713851729973e-05, + "loss": 1.9464, + "step": 394 + }, + { + "epoch": 0.0717415488001453, + "grad_norm": 0.8585404753684998, + "learning_rate": 9.964362291098036e-05, + "loss": 1.9671, + "step": 395 + }, + { + "epoch": 0.07192317297432288, + "grad_norm": 0.5193942785263062, + "learning_rate": 9.964008994076932e-05, + "loss": 1.8618, + "step": 396 + }, + { + "epoch": 0.07210479714850046, + "grad_norm": 0.5650438070297241, + "learning_rate": 9.963653960790233e-05, + "loss": 1.8053, + "step": 397 + }, + { + "epoch": 0.07228642132267805, + "grad_norm": 0.5270420908927917, + "learning_rate": 9.963297191362118e-05, + "loss": 1.8391, + "step": 398 + }, + { + "epoch": 0.07246804549685563, + "grad_norm": 0.35955381393432617, + "learning_rate": 9.962938685917374e-05, + "loss": 1.7309, + "step": 399 + }, + { + "epoch": 0.07264966967103322, + "grad_norm": 0.49446314573287964, + "learning_rate": 9.962578444581398e-05, + "loss": 1.9933, + "step": 400 + }, + { + "epoch": 0.0728312938452108, + "grad_norm": 0.5851883292198181, + "learning_rate": 9.962216467480193e-05, + "loss": 1.9596, + "step": 401 + }, + { + "epoch": 0.07301291801938838, + "grad_norm": 0.6153406500816345, + "learning_rate": 9.961852754740362e-05, + "loss": 1.8639, + "step": 402 + }, + { + "epoch": 0.07319454219356597, + "grad_norm": 0.54420006275177, + "learning_rate": 9.961487306489128e-05, + "loss": 1.9897, + "step": 403 + }, + { + "epoch": 0.07337616636774355, + "grad_norm": 0.7936567664146423, + "learning_rate": 9.961120122854311e-05, + "loss": 2.0177, + "step": 404 + }, + { + "epoch": 0.07355779054192113, + "grad_norm": 0.9743078351020813, + "learning_rate": 9.96075120396434e-05, + "loss": 1.8744, + "step": 405 + }, + { + "epoch": 0.07373941471609871, + "grad_norm": 0.5373771786689758, + "learning_rate": 9.960380549948253e-05, + "loss": 2.0266, + "step": 406 + }, + { + "epoch": 0.07392103889027629, + "grad_norm": 0.5517727732658386, + "learning_rate": 9.960008160935695e-05, + "loss": 1.8757, + "step": 407 + }, + { + "epoch": 0.07410266306445387, + "grad_norm": 0.4569588601589203, + "learning_rate": 9.959634037056917e-05, + "loss": 1.9159, + "step": 408 + }, + { + "epoch": 0.07428428723863147, + "grad_norm": 0.5683454275131226, + "learning_rate": 9.959258178442774e-05, + "loss": 1.6762, + "step": 409 + }, + { + "epoch": 0.07446591141280905, + "grad_norm": 0.5814051628112793, + "learning_rate": 9.958880585224734e-05, + "loss": 1.9397, + "step": 410 + }, + { + "epoch": 0.07464753558698663, + "grad_norm": 0.4801495373249054, + "learning_rate": 9.958501257534866e-05, + "loss": 1.9585, + "step": 411 + }, + { + "epoch": 0.07482915976116421, + "grad_norm": 1.0386015176773071, + "learning_rate": 9.958120195505846e-05, + "loss": 1.8768, + "step": 412 + }, + { + "epoch": 0.07501078393534179, + "grad_norm": 0.45565611124038696, + "learning_rate": 9.957737399270963e-05, + "loss": 2.0661, + "step": 413 + }, + { + "epoch": 0.07519240810951938, + "grad_norm": 0.42420774698257446, + "learning_rate": 9.957352868964105e-05, + "loss": 1.6841, + "step": 414 + }, + { + "epoch": 0.07537403228369696, + "grad_norm": 0.6461265087127686, + "learning_rate": 9.956966604719768e-05, + "loss": 1.9299, + "step": 415 + }, + { + "epoch": 0.07555565645787454, + "grad_norm": 0.9873509407043457, + "learning_rate": 9.956578606673059e-05, + "loss": 1.9097, + "step": 416 + }, + { + "epoch": 0.07573728063205212, + "grad_norm": 0.474287211894989, + "learning_rate": 9.956188874959687e-05, + "loss": 1.77, + "step": 417 + }, + { + "epoch": 0.0759189048062297, + "grad_norm": 0.4420859217643738, + "learning_rate": 9.95579740971597e-05, + "loss": 1.9034, + "step": 418 + }, + { + "epoch": 0.0761005289804073, + "grad_norm": 0.5015067458152771, + "learning_rate": 9.955404211078829e-05, + "loss": 1.9651, + "step": 419 + }, + { + "epoch": 0.07628215315458488, + "grad_norm": 0.5823950171470642, + "learning_rate": 9.955009279185795e-05, + "loss": 1.9897, + "step": 420 + }, + { + "epoch": 0.07646377732876246, + "grad_norm": 0.8496461510658264, + "learning_rate": 9.954612614175003e-05, + "loss": 1.7671, + "step": 421 + }, + { + "epoch": 0.07664540150294004, + "grad_norm": 0.5283600687980652, + "learning_rate": 9.954214216185194e-05, + "loss": 1.8516, + "step": 422 + }, + { + "epoch": 0.07682702567711762, + "grad_norm": 0.38519009947776794, + "learning_rate": 9.953814085355719e-05, + "loss": 1.9893, + "step": 423 + }, + { + "epoch": 0.07700864985129521, + "grad_norm": 0.9259639382362366, + "learning_rate": 9.95341222182653e-05, + "loss": 1.8278, + "step": 424 + }, + { + "epoch": 0.0771902740254728, + "grad_norm": 0.5919634103775024, + "learning_rate": 9.953008625738186e-05, + "loss": 1.802, + "step": 425 + }, + { + "epoch": 0.07737189819965037, + "grad_norm": 0.717201828956604, + "learning_rate": 9.952603297231856e-05, + "loss": 1.7037, + "step": 426 + }, + { + "epoch": 0.07755352237382795, + "grad_norm": 0.6786592602729797, + "learning_rate": 9.952196236449309e-05, + "loss": 1.6481, + "step": 427 + }, + { + "epoch": 0.07773514654800553, + "grad_norm": 0.7934308052062988, + "learning_rate": 9.951787443532926e-05, + "loss": 1.7113, + "step": 428 + }, + { + "epoch": 0.07791677072218313, + "grad_norm": 0.4602201282978058, + "learning_rate": 9.951376918625688e-05, + "loss": 2.0203, + "step": 429 + }, + { + "epoch": 0.07809839489636071, + "grad_norm": 0.5299506187438965, + "learning_rate": 9.950964661871187e-05, + "loss": 1.8794, + "step": 430 + }, + { + "epoch": 0.07828001907053829, + "grad_norm": 0.5387322902679443, + "learning_rate": 9.950550673413617e-05, + "loss": 2.0098, + "step": 431 + }, + { + "epoch": 0.07846164324471587, + "grad_norm": 0.6433892846107483, + "learning_rate": 9.950134953397779e-05, + "loss": 2.0249, + "step": 432 + }, + { + "epoch": 0.07864326741889345, + "grad_norm": 0.9116496443748474, + "learning_rate": 9.94971750196908e-05, + "loss": 1.8621, + "step": 433 + }, + { + "epoch": 0.07882489159307103, + "grad_norm": 0.45827096700668335, + "learning_rate": 9.949298319273535e-05, + "loss": 1.9264, + "step": 434 + }, + { + "epoch": 0.07900651576724863, + "grad_norm": 0.6208136677742004, + "learning_rate": 9.948877405457758e-05, + "loss": 2.0121, + "step": 435 + }, + { + "epoch": 0.0791881399414262, + "grad_norm": 0.41528651118278503, + "learning_rate": 9.948454760668973e-05, + "loss": 1.6739, + "step": 436 + }, + { + "epoch": 0.07936976411560379, + "grad_norm": 0.9112038612365723, + "learning_rate": 9.948030385055011e-05, + "loss": 2.0614, + "step": 437 + }, + { + "epoch": 0.07955138828978137, + "grad_norm": 0.672838568687439, + "learning_rate": 9.947604278764304e-05, + "loss": 1.8567, + "step": 438 + }, + { + "epoch": 0.07973301246395895, + "grad_norm": 0.5404559373855591, + "learning_rate": 9.947176441945892e-05, + "loss": 1.8998, + "step": 439 + }, + { + "epoch": 0.07991463663813654, + "grad_norm": 0.47912371158599854, + "learning_rate": 9.946746874749422e-05, + "loss": 2.0541, + "step": 440 + }, + { + "epoch": 0.08009626081231412, + "grad_norm": 0.5426422357559204, + "learning_rate": 9.94631557732514e-05, + "loss": 2.0766, + "step": 441 + }, + { + "epoch": 0.0802778849864917, + "grad_norm": 0.7020577192306519, + "learning_rate": 9.945882549823906e-05, + "loss": 2.0016, + "step": 442 + }, + { + "epoch": 0.08045950916066928, + "grad_norm": 0.6541326642036438, + "learning_rate": 9.945447792397176e-05, + "loss": 1.8973, + "step": 443 + }, + { + "epoch": 0.08064113333484686, + "grad_norm": 0.483767569065094, + "learning_rate": 9.945011305197019e-05, + "loss": 1.845, + "step": 444 + }, + { + "epoch": 0.08082275750902446, + "grad_norm": 1.3902904987335205, + "learning_rate": 9.944573088376103e-05, + "loss": 1.9728, + "step": 445 + }, + { + "epoch": 0.08100438168320204, + "grad_norm": 0.4667975604534149, + "learning_rate": 9.944133142087704e-05, + "loss": 1.975, + "step": 446 + }, + { + "epoch": 0.08118600585737962, + "grad_norm": 0.8533281683921814, + "learning_rate": 9.943691466485705e-05, + "loss": 1.7545, + "step": 447 + }, + { + "epoch": 0.0813676300315572, + "grad_norm": 0.5584127306938171, + "learning_rate": 9.943248061724588e-05, + "loss": 1.787, + "step": 448 + }, + { + "epoch": 0.08154925420573478, + "grad_norm": 0.5159483551979065, + "learning_rate": 9.942802927959443e-05, + "loss": 1.7357, + "step": 449 + }, + { + "epoch": 0.08173087837991237, + "grad_norm": 0.4320266842842102, + "learning_rate": 9.942356065345967e-05, + "loss": 1.8942, + "step": 450 + }, + { + "epoch": 0.08191250255408995, + "grad_norm": 0.634963870048523, + "learning_rate": 9.941907474040458e-05, + "loss": 1.9702, + "step": 451 + }, + { + "epoch": 0.08209412672826753, + "grad_norm": 0.4832179844379425, + "learning_rate": 9.941457154199821e-05, + "loss": 1.9224, + "step": 452 + }, + { + "epoch": 0.08227575090244511, + "grad_norm": 0.7383478879928589, + "learning_rate": 9.941005105981565e-05, + "loss": 1.815, + "step": 453 + }, + { + "epoch": 0.0824573750766227, + "grad_norm": 0.4862358868122101, + "learning_rate": 9.940551329543802e-05, + "loss": 1.8136, + "step": 454 + }, + { + "epoch": 0.08263899925080029, + "grad_norm": 0.583200216293335, + "learning_rate": 9.94009582504525e-05, + "loss": 1.5915, + "step": 455 + }, + { + "epoch": 0.08282062342497787, + "grad_norm": 0.535666823387146, + "learning_rate": 9.93963859264523e-05, + "loss": 2.1399, + "step": 456 + }, + { + "epoch": 0.08300224759915545, + "grad_norm": 1.0164158344268799, + "learning_rate": 9.939179632503674e-05, + "loss": 1.7089, + "step": 457 + }, + { + "epoch": 0.08318387177333303, + "grad_norm": 1.165727138519287, + "learning_rate": 9.938718944781107e-05, + "loss": 1.8275, + "step": 458 + }, + { + "epoch": 0.08336549594751061, + "grad_norm": 0.49638405442237854, + "learning_rate": 9.938256529638665e-05, + "loss": 2.0517, + "step": 459 + }, + { + "epoch": 0.08354712012168819, + "grad_norm": 0.4599452018737793, + "learning_rate": 9.937792387238091e-05, + "loss": 1.8824, + "step": 460 + }, + { + "epoch": 0.08372874429586578, + "grad_norm": 0.7274848818778992, + "learning_rate": 9.937326517741724e-05, + "loss": 1.9269, + "step": 461 + }, + { + "epoch": 0.08391036847004336, + "grad_norm": 0.5780855417251587, + "learning_rate": 9.936858921312515e-05, + "loss": 1.834, + "step": 462 + }, + { + "epoch": 0.08409199264422094, + "grad_norm": 0.39074239134788513, + "learning_rate": 9.936389598114013e-05, + "loss": 1.9896, + "step": 463 + }, + { + "epoch": 0.08427361681839853, + "grad_norm": 0.5945570468902588, + "learning_rate": 9.935918548310374e-05, + "loss": 2.0481, + "step": 464 + }, + { + "epoch": 0.0844552409925761, + "grad_norm": 0.5364426970481873, + "learning_rate": 9.93544577206636e-05, + "loss": 1.8784, + "step": 465 + }, + { + "epoch": 0.0846368651667537, + "grad_norm": 0.49124643206596375, + "learning_rate": 9.934971269547332e-05, + "loss": 1.8207, + "step": 466 + }, + { + "epoch": 0.08481848934093128, + "grad_norm": 0.5053133368492126, + "learning_rate": 9.934495040919258e-05, + "loss": 1.7654, + "step": 467 + }, + { + "epoch": 0.08500011351510886, + "grad_norm": 0.4308580160140991, + "learning_rate": 9.934017086348708e-05, + "loss": 1.8694, + "step": 468 + }, + { + "epoch": 0.08518173768928644, + "grad_norm": 1.035474181175232, + "learning_rate": 9.933537406002857e-05, + "loss": 2.0803, + "step": 469 + }, + { + "epoch": 0.08536336186346402, + "grad_norm": 0.820516049861908, + "learning_rate": 9.933056000049483e-05, + "loss": 1.8305, + "step": 470 + }, + { + "epoch": 0.08554498603764162, + "grad_norm": 0.5738487243652344, + "learning_rate": 9.932572868656969e-05, + "loss": 1.7798, + "step": 471 + }, + { + "epoch": 0.0857266102118192, + "grad_norm": 0.4659803807735443, + "learning_rate": 9.932088011994298e-05, + "loss": 1.7554, + "step": 472 + }, + { + "epoch": 0.08590823438599678, + "grad_norm": 1.0335673093795776, + "learning_rate": 9.931601430231062e-05, + "loss": 2.0792, + "step": 473 + }, + { + "epoch": 0.08608985856017436, + "grad_norm": 0.44645223021507263, + "learning_rate": 9.93111312353745e-05, + "loss": 1.8517, + "step": 474 + }, + { + "epoch": 0.08627148273435194, + "grad_norm": 0.4095654785633087, + "learning_rate": 9.930623092084259e-05, + "loss": 1.814, + "step": 475 + }, + { + "epoch": 0.08645310690852953, + "grad_norm": 0.3973332941532135, + "learning_rate": 9.930131336042888e-05, + "loss": 1.8892, + "step": 476 + }, + { + "epoch": 0.08663473108270711, + "grad_norm": 0.6231238842010498, + "learning_rate": 9.929637855585338e-05, + "loss": 2.0199, + "step": 477 + }, + { + "epoch": 0.08681635525688469, + "grad_norm": 0.4723554253578186, + "learning_rate": 9.929142650884213e-05, + "loss": 1.8968, + "step": 478 + }, + { + "epoch": 0.08699797943106227, + "grad_norm": 0.4837193191051483, + "learning_rate": 9.928645722112724e-05, + "loss": 1.9174, + "step": 479 + }, + { + "epoch": 0.08717960360523985, + "grad_norm": 0.6146240234375, + "learning_rate": 9.92814706944468e-05, + "loss": 1.7772, + "step": 480 + }, + { + "epoch": 0.08736122777941745, + "grad_norm": 0.5154096484184265, + "learning_rate": 9.927646693054496e-05, + "loss": 1.8146, + "step": 481 + }, + { + "epoch": 0.08754285195359503, + "grad_norm": 0.4806475341320038, + "learning_rate": 9.927144593117189e-05, + "loss": 1.736, + "step": 482 + }, + { + "epoch": 0.08772447612777261, + "grad_norm": 0.5453230142593384, + "learning_rate": 9.92664076980838e-05, + "loss": 1.9749, + "step": 483 + }, + { + "epoch": 0.08790610030195019, + "grad_norm": 0.37014394998550415, + "learning_rate": 9.926135223304289e-05, + "loss": 1.8226, + "step": 484 + }, + { + "epoch": 0.08808772447612777, + "grad_norm": 0.5835460424423218, + "learning_rate": 9.925627953781743e-05, + "loss": 1.7283, + "step": 485 + }, + { + "epoch": 0.08826934865030535, + "grad_norm": 0.6834240555763245, + "learning_rate": 9.925118961418171e-05, + "loss": 1.9699, + "step": 486 + }, + { + "epoch": 0.08845097282448294, + "grad_norm": 0.6209450364112854, + "learning_rate": 9.924608246391602e-05, + "loss": 1.7713, + "step": 487 + }, + { + "epoch": 0.08863259699866052, + "grad_norm": 0.39392030239105225, + "learning_rate": 9.924095808880671e-05, + "loss": 2.0027, + "step": 488 + }, + { + "epoch": 0.0888142211728381, + "grad_norm": 0.5256039500236511, + "learning_rate": 9.923581649064611e-05, + "loss": 1.8875, + "step": 489 + }, + { + "epoch": 0.08899584534701568, + "grad_norm": 0.6375530362129211, + "learning_rate": 9.923065767123263e-05, + "loss": 1.9313, + "step": 490 + }, + { + "epoch": 0.08917746952119326, + "grad_norm": 0.68098384141922, + "learning_rate": 9.922548163237066e-05, + "loss": 1.8268, + "step": 491 + }, + { + "epoch": 0.08935909369537086, + "grad_norm": 0.46122175455093384, + "learning_rate": 9.922028837587064e-05, + "loss": 1.8573, + "step": 492 + }, + { + "epoch": 0.08954071786954844, + "grad_norm": 0.5336621999740601, + "learning_rate": 9.9215077903549e-05, + "loss": 2.0756, + "step": 493 + }, + { + "epoch": 0.08972234204372602, + "grad_norm": 0.43433132767677307, + "learning_rate": 9.920985021722822e-05, + "loss": 1.7599, + "step": 494 + }, + { + "epoch": 0.0899039662179036, + "grad_norm": 0.568290650844574, + "learning_rate": 9.920460531873679e-05, + "loss": 1.8355, + "step": 495 + }, + { + "epoch": 0.09008559039208118, + "grad_norm": 0.3568073511123657, + "learning_rate": 9.919934320990925e-05, + "loss": 1.8415, + "step": 496 + }, + { + "epoch": 0.09026721456625877, + "grad_norm": 0.5453485250473022, + "learning_rate": 9.919406389258607e-05, + "loss": 1.7265, + "step": 497 + }, + { + "epoch": 0.09044883874043635, + "grad_norm": 0.41757825016975403, + "learning_rate": 9.918876736861387e-05, + "loss": 1.8945, + "step": 498 + }, + { + "epoch": 0.09063046291461394, + "grad_norm": 1.3820552825927734, + "learning_rate": 9.918345363984519e-05, + "loss": 1.6418, + "step": 499 + }, + { + "epoch": 0.09081208708879152, + "grad_norm": 1.2880027294158936, + "learning_rate": 9.917812270813859e-05, + "loss": 1.8972, + "step": 500 + }, + { + "epoch": 0.0909937112629691, + "grad_norm": 0.5539196133613586, + "learning_rate": 9.917277457535872e-05, + "loss": 1.7916, + "step": 501 + }, + { + "epoch": 0.09117533543714669, + "grad_norm": 0.41120895743370056, + "learning_rate": 9.916740924337617e-05, + "loss": 1.8045, + "step": 502 + }, + { + "epoch": 0.09135695961132427, + "grad_norm": 0.4457366466522217, + "learning_rate": 9.91620267140676e-05, + "loss": 1.688, + "step": 503 + }, + { + "epoch": 0.09153858378550185, + "grad_norm": 0.9961792230606079, + "learning_rate": 9.915662698931565e-05, + "loss": 1.7323, + "step": 504 + }, + { + "epoch": 0.09172020795967943, + "grad_norm": 0.43981990218162537, + "learning_rate": 9.915121007100898e-05, + "loss": 1.9314, + "step": 505 + }, + { + "epoch": 0.09190183213385701, + "grad_norm": 0.7626000642776489, + "learning_rate": 9.914577596104226e-05, + "loss": 1.8718, + "step": 506 + }, + { + "epoch": 0.0920834563080346, + "grad_norm": 0.5151670575141907, + "learning_rate": 9.914032466131623e-05, + "loss": 1.6802, + "step": 507 + }, + { + "epoch": 0.09226508048221219, + "grad_norm": 0.508184015750885, + "learning_rate": 9.913485617373756e-05, + "loss": 1.9883, + "step": 508 + }, + { + "epoch": 0.09244670465638977, + "grad_norm": 0.428376168012619, + "learning_rate": 9.912937050021896e-05, + "loss": 1.9228, + "step": 509 + }, + { + "epoch": 0.09262832883056735, + "grad_norm": 0.44472140073776245, + "learning_rate": 9.912386764267919e-05, + "loss": 1.923, + "step": 510 + }, + { + "epoch": 0.09280995300474493, + "grad_norm": 0.6215564608573914, + "learning_rate": 9.911834760304294e-05, + "loss": 2.0052, + "step": 511 + }, + { + "epoch": 0.09299157717892252, + "grad_norm": 0.46843424439430237, + "learning_rate": 9.9112810383241e-05, + "loss": 1.8646, + "step": 512 + }, + { + "epoch": 0.0931732013531001, + "grad_norm": 0.41585636138916016, + "learning_rate": 9.910725598521013e-05, + "loss": 1.8209, + "step": 513 + }, + { + "epoch": 0.09335482552727768, + "grad_norm": 0.540764331817627, + "learning_rate": 9.91016844108931e-05, + "loss": 1.9147, + "step": 514 + }, + { + "epoch": 0.09353644970145526, + "grad_norm": 0.47401127219200134, + "learning_rate": 9.909609566223863e-05, + "loss": 1.7168, + "step": 515 + }, + { + "epoch": 0.09371807387563284, + "grad_norm": 0.5102025270462036, + "learning_rate": 9.909048974120156e-05, + "loss": 1.9062, + "step": 516 + }, + { + "epoch": 0.09389969804981042, + "grad_norm": 0.5171622037887573, + "learning_rate": 9.908486664974265e-05, + "loss": 1.7451, + "step": 517 + }, + { + "epoch": 0.09408132222398802, + "grad_norm": 0.6138924956321716, + "learning_rate": 9.907922638982872e-05, + "loss": 1.9617, + "step": 518 + }, + { + "epoch": 0.0942629463981656, + "grad_norm": 0.5080596208572388, + "learning_rate": 9.907356896343253e-05, + "loss": 1.823, + "step": 519 + }, + { + "epoch": 0.09444457057234318, + "grad_norm": 0.4340963661670685, + "learning_rate": 9.90678943725329e-05, + "loss": 1.9504, + "step": 520 + }, + { + "epoch": 0.09462619474652076, + "grad_norm": 0.5290579199790955, + "learning_rate": 9.906220261911465e-05, + "loss": 1.8404, + "step": 521 + }, + { + "epoch": 0.09480781892069834, + "grad_norm": 0.7211893200874329, + "learning_rate": 9.905649370516857e-05, + "loss": 1.797, + "step": 522 + }, + { + "epoch": 0.09498944309487593, + "grad_norm": 0.47412413358688354, + "learning_rate": 9.905076763269147e-05, + "loss": 1.8536, + "step": 523 + }, + { + "epoch": 0.09517106726905351, + "grad_norm": 0.6998024582862854, + "learning_rate": 9.90450244036862e-05, + "loss": 1.8973, + "step": 524 + }, + { + "epoch": 0.0953526914432311, + "grad_norm": 0.6209399700164795, + "learning_rate": 9.903926402016153e-05, + "loss": 1.9755, + "step": 525 + }, + { + "epoch": 0.09553431561740867, + "grad_norm": 0.5058971047401428, + "learning_rate": 9.903348648413229e-05, + "loss": 1.9185, + "step": 526 + }, + { + "epoch": 0.09571593979158625, + "grad_norm": 0.46669164299964905, + "learning_rate": 9.90276917976193e-05, + "loss": 1.9264, + "step": 527 + }, + { + "epoch": 0.09589756396576385, + "grad_norm": 0.6721642017364502, + "learning_rate": 9.902187996264935e-05, + "loss": 1.9942, + "step": 528 + }, + { + "epoch": 0.09607918813994143, + "grad_norm": 0.5135471224784851, + "learning_rate": 9.901605098125528e-05, + "loss": 1.8142, + "step": 529 + }, + { + "epoch": 0.09626081231411901, + "grad_norm": 0.46989068388938904, + "learning_rate": 9.90102048554759e-05, + "loss": 1.8513, + "step": 530 + }, + { + "epoch": 0.09644243648829659, + "grad_norm": 0.4177769720554352, + "learning_rate": 9.900434158735598e-05, + "loss": 1.8854, + "step": 531 + }, + { + "epoch": 0.09662406066247417, + "grad_norm": 0.6366725564002991, + "learning_rate": 9.899846117894634e-05, + "loss": 1.8373, + "step": 532 + }, + { + "epoch": 0.09680568483665176, + "grad_norm": 0.5831615328788757, + "learning_rate": 9.899256363230378e-05, + "loss": 1.8294, + "step": 533 + }, + { + "epoch": 0.09698730901082935, + "grad_norm": 0.46682146191596985, + "learning_rate": 9.898664894949107e-05, + "loss": 2.041, + "step": 534 + }, + { + "epoch": 0.09716893318500693, + "grad_norm": 0.5806072354316711, + "learning_rate": 9.898071713257704e-05, + "loss": 1.9307, + "step": 535 + }, + { + "epoch": 0.0973505573591845, + "grad_norm": 0.5399283170700073, + "learning_rate": 9.89747681836364e-05, + "loss": 1.6624, + "step": 536 + }, + { + "epoch": 0.09753218153336209, + "grad_norm": 0.4086305499076843, + "learning_rate": 9.896880210474998e-05, + "loss": 1.9207, + "step": 537 + }, + { + "epoch": 0.09771380570753968, + "grad_norm": 0.5652099251747131, + "learning_rate": 9.896281889800449e-05, + "loss": 2.075, + "step": 538 + }, + { + "epoch": 0.09789542988171726, + "grad_norm": 0.6223917007446289, + "learning_rate": 9.895681856549272e-05, + "loss": 2.0222, + "step": 539 + }, + { + "epoch": 0.09807705405589484, + "grad_norm": 0.3551906645298004, + "learning_rate": 9.89508011093134e-05, + "loss": 1.6763, + "step": 540 + }, + { + "epoch": 0.09825867823007242, + "grad_norm": 0.3849733769893646, + "learning_rate": 9.894476653157126e-05, + "loss": 1.7514, + "step": 541 + }, + { + "epoch": 0.09844030240425, + "grad_norm": 0.5776532888412476, + "learning_rate": 9.8938714834377e-05, + "loss": 1.9226, + "step": 542 + }, + { + "epoch": 0.09862192657842758, + "grad_norm": 0.6551340222358704, + "learning_rate": 9.893264601984735e-05, + "loss": 1.9325, + "step": 543 + }, + { + "epoch": 0.09880355075260518, + "grad_norm": 0.505523145198822, + "learning_rate": 9.892656009010501e-05, + "loss": 1.9205, + "step": 544 + }, + { + "epoch": 0.09898517492678276, + "grad_norm": 0.9124334454536438, + "learning_rate": 9.892045704727864e-05, + "loss": 1.9054, + "step": 545 + }, + { + "epoch": 0.09916679910096034, + "grad_norm": 0.6240634918212891, + "learning_rate": 9.891433689350292e-05, + "loss": 1.7022, + "step": 546 + }, + { + "epoch": 0.09934842327513792, + "grad_norm": 0.42733272910118103, + "learning_rate": 9.890819963091848e-05, + "loss": 1.824, + "step": 547 + }, + { + "epoch": 0.0995300474493155, + "grad_norm": 0.43468523025512695, + "learning_rate": 9.8902045261672e-05, + "loss": 1.8958, + "step": 548 + }, + { + "epoch": 0.09971167162349309, + "grad_norm": 0.5394231081008911, + "learning_rate": 9.889587378791605e-05, + "loss": 1.5863, + "step": 549 + }, + { + "epoch": 0.09989329579767067, + "grad_norm": 0.655252993106842, + "learning_rate": 9.888968521180926e-05, + "loss": 1.8785, + "step": 550 + }, + { + "epoch": 0.10007491997184825, + "grad_norm": 0.4726557433605194, + "learning_rate": 9.88834795355162e-05, + "loss": 1.8144, + "step": 551 + }, + { + "epoch": 0.10025654414602583, + "grad_norm": 0.5020096302032471, + "learning_rate": 9.887725676120745e-05, + "loss": 1.8746, + "step": 552 + }, + { + "epoch": 0.10043816832020341, + "grad_norm": 0.6111868619918823, + "learning_rate": 9.887101689105955e-05, + "loss": 1.8357, + "step": 553 + }, + { + "epoch": 0.10061979249438101, + "grad_norm": 0.5693550109863281, + "learning_rate": 9.886475992725501e-05, + "loss": 1.9047, + "step": 554 + }, + { + "epoch": 0.10080141666855859, + "grad_norm": 0.6030381917953491, + "learning_rate": 9.885848587198234e-05, + "loss": 1.8974, + "step": 555 + }, + { + "epoch": 0.10098304084273617, + "grad_norm": 0.38341444730758667, + "learning_rate": 9.885219472743603e-05, + "loss": 1.8042, + "step": 556 + }, + { + "epoch": 0.10116466501691375, + "grad_norm": 0.6162832975387573, + "learning_rate": 9.884588649581654e-05, + "loss": 1.6651, + "step": 557 + }, + { + "epoch": 0.10134628919109133, + "grad_norm": 0.5239376425743103, + "learning_rate": 9.88395611793303e-05, + "loss": 1.959, + "step": 558 + }, + { + "epoch": 0.10152791336526892, + "grad_norm": 0.5161235332489014, + "learning_rate": 9.883321878018972e-05, + "loss": 2.0421, + "step": 559 + }, + { + "epoch": 0.1017095375394465, + "grad_norm": 0.330890417098999, + "learning_rate": 9.882685930061317e-05, + "loss": 1.8689, + "step": 560 + }, + { + "epoch": 0.10189116171362408, + "grad_norm": 0.4480641186237335, + "learning_rate": 9.882048274282505e-05, + "loss": 1.7384, + "step": 561 + }, + { + "epoch": 0.10207278588780166, + "grad_norm": 0.37208619713783264, + "learning_rate": 9.881408910905567e-05, + "loss": 1.7903, + "step": 562 + }, + { + "epoch": 0.10225441006197925, + "grad_norm": 0.5779393315315247, + "learning_rate": 9.880767840154133e-05, + "loss": 1.8184, + "step": 563 + }, + { + "epoch": 0.10243603423615684, + "grad_norm": 1.709685206413269, + "learning_rate": 9.880125062252433e-05, + "loss": 1.7902, + "step": 564 + }, + { + "epoch": 0.10261765841033442, + "grad_norm": 0.5162463784217834, + "learning_rate": 9.879480577425288e-05, + "loss": 2.0339, + "step": 565 + }, + { + "epoch": 0.102799282584512, + "grad_norm": 0.5114135146141052, + "learning_rate": 9.878834385898126e-05, + "loss": 1.9389, + "step": 566 + }, + { + "epoch": 0.10298090675868958, + "grad_norm": 0.593840479850769, + "learning_rate": 9.87818648789696e-05, + "loss": 1.7544, + "step": 567 + }, + { + "epoch": 0.10316253093286716, + "grad_norm": 0.36868590116500854, + "learning_rate": 9.877536883648409e-05, + "loss": 1.7941, + "step": 568 + }, + { + "epoch": 0.10334415510704474, + "grad_norm": 1.3723578453063965, + "learning_rate": 9.876885573379687e-05, + "loss": 1.8147, + "step": 569 + }, + { + "epoch": 0.10352577928122234, + "grad_norm": 0.49573057889938354, + "learning_rate": 9.876232557318599e-05, + "loss": 1.7819, + "step": 570 + }, + { + "epoch": 0.10370740345539992, + "grad_norm": 0.5209935307502747, + "learning_rate": 9.875577835693554e-05, + "loss": 1.9843, + "step": 571 + }, + { + "epoch": 0.1038890276295775, + "grad_norm": 0.47393864393234253, + "learning_rate": 9.874921408733555e-05, + "loss": 1.8756, + "step": 572 + }, + { + "epoch": 0.10407065180375508, + "grad_norm": 0.42813971638679504, + "learning_rate": 9.874263276668199e-05, + "loss": 1.8987, + "step": 573 + }, + { + "epoch": 0.10425227597793266, + "grad_norm": 0.735572338104248, + "learning_rate": 9.873603439727683e-05, + "loss": 1.873, + "step": 574 + }, + { + "epoch": 0.10443390015211025, + "grad_norm": 0.6372820138931274, + "learning_rate": 9.872941898142797e-05, + "loss": 1.8398, + "step": 575 + }, + { + "epoch": 0.10461552432628783, + "grad_norm": 0.3934377133846283, + "learning_rate": 9.872278652144931e-05, + "loss": 1.6957, + "step": 576 + }, + { + "epoch": 0.10479714850046541, + "grad_norm": 0.635617196559906, + "learning_rate": 9.871613701966067e-05, + "loss": 1.8458, + "step": 577 + }, + { + "epoch": 0.10497877267464299, + "grad_norm": 0.49208512902259827, + "learning_rate": 9.870947047838788e-05, + "loss": 1.8458, + "step": 578 + }, + { + "epoch": 0.10516039684882057, + "grad_norm": 0.586928129196167, + "learning_rate": 9.870278689996266e-05, + "loss": 1.8235, + "step": 579 + }, + { + "epoch": 0.10534202102299817, + "grad_norm": 0.4411647319793701, + "learning_rate": 9.869608628672278e-05, + "loss": 1.8312, + "step": 580 + }, + { + "epoch": 0.10552364519717575, + "grad_norm": 0.4876898229122162, + "learning_rate": 9.868936864101188e-05, + "loss": 2.0389, + "step": 581 + }, + { + "epoch": 0.10570526937135333, + "grad_norm": 0.47010794281959534, + "learning_rate": 9.868263396517963e-05, + "loss": 1.7846, + "step": 582 + }, + { + "epoch": 0.10588689354553091, + "grad_norm": 0.40557247400283813, + "learning_rate": 9.867588226158158e-05, + "loss": 1.9614, + "step": 583 + }, + { + "epoch": 0.10606851771970849, + "grad_norm": 0.4541774392127991, + "learning_rate": 9.866911353257932e-05, + "loss": 1.9429, + "step": 584 + }, + { + "epoch": 0.10625014189388608, + "grad_norm": 0.412641704082489, + "learning_rate": 9.866232778054034e-05, + "loss": 1.945, + "step": 585 + }, + { + "epoch": 0.10643176606806366, + "grad_norm": 0.5932005643844604, + "learning_rate": 9.865552500783809e-05, + "loss": 1.8147, + "step": 586 + }, + { + "epoch": 0.10661339024224124, + "grad_norm": 0.5321000814437866, + "learning_rate": 9.864870521685199e-05, + "loss": 1.8551, + "step": 587 + }, + { + "epoch": 0.10679501441641882, + "grad_norm": 0.4365077018737793, + "learning_rate": 9.864186840996738e-05, + "loss": 1.7516, + "step": 588 + }, + { + "epoch": 0.1069766385905964, + "grad_norm": 0.5119859576225281, + "learning_rate": 9.863501458957562e-05, + "loss": 1.7347, + "step": 589 + }, + { + "epoch": 0.107158262764774, + "grad_norm": 0.4307212829589844, + "learning_rate": 9.862814375807396e-05, + "loss": 1.7751, + "step": 590 + }, + { + "epoch": 0.10733988693895158, + "grad_norm": 0.6241722106933594, + "learning_rate": 9.86212559178656e-05, + "loss": 1.7805, + "step": 591 + }, + { + "epoch": 0.10752151111312916, + "grad_norm": 0.3887357711791992, + "learning_rate": 9.861435107135972e-05, + "loss": 1.873, + "step": 592 + }, + { + "epoch": 0.10770313528730674, + "grad_norm": 0.44180893898010254, + "learning_rate": 9.860742922097141e-05, + "loss": 1.8647, + "step": 593 + }, + { + "epoch": 0.10788475946148432, + "grad_norm": 0.6126459836959839, + "learning_rate": 9.860049036912178e-05, + "loss": 1.7066, + "step": 594 + }, + { + "epoch": 0.1080663836356619, + "grad_norm": 0.3843052089214325, + "learning_rate": 9.859353451823779e-05, + "loss": 1.7942, + "step": 595 + }, + { + "epoch": 0.1082480078098395, + "grad_norm": 0.5541086792945862, + "learning_rate": 9.858656167075242e-05, + "loss": 1.9736, + "step": 596 + }, + { + "epoch": 0.10842963198401707, + "grad_norm": 0.4613756537437439, + "learning_rate": 9.857957182910455e-05, + "loss": 1.6881, + "step": 597 + }, + { + "epoch": 0.10861125615819466, + "grad_norm": 1.256940484046936, + "learning_rate": 9.857256499573905e-05, + "loss": 2.0766, + "step": 598 + }, + { + "epoch": 0.10879288033237224, + "grad_norm": 0.8303955793380737, + "learning_rate": 9.85655411731067e-05, + "loss": 1.8934, + "step": 599 + }, + { + "epoch": 0.10897450450654982, + "grad_norm": 0.38644999265670776, + "learning_rate": 9.85585003636642e-05, + "loss": 1.6884, + "step": 600 + }, + { + "epoch": 0.10915612868072741, + "grad_norm": 0.3877352476119995, + "learning_rate": 9.855144256987423e-05, + "loss": 1.9637, + "step": 601 + }, + { + "epoch": 0.10933775285490499, + "grad_norm": 0.5214934349060059, + "learning_rate": 9.854436779420543e-05, + "loss": 1.9534, + "step": 602 + }, + { + "epoch": 0.10951937702908257, + "grad_norm": 0.4546791911125183, + "learning_rate": 9.853727603913232e-05, + "loss": 1.6619, + "step": 603 + }, + { + "epoch": 0.10970100120326015, + "grad_norm": 0.41686898469924927, + "learning_rate": 9.85301673071354e-05, + "loss": 1.8814, + "step": 604 + }, + { + "epoch": 0.10988262537743773, + "grad_norm": 1.150993824005127, + "learning_rate": 9.852304160070109e-05, + "loss": 1.7062, + "step": 605 + }, + { + "epoch": 0.11006424955161533, + "grad_norm": 0.36474210023880005, + "learning_rate": 9.851589892232178e-05, + "loss": 1.7083, + "step": 606 + }, + { + "epoch": 0.1102458737257929, + "grad_norm": 0.5870533585548401, + "learning_rate": 9.850873927449573e-05, + "loss": 1.71, + "step": 607 + }, + { + "epoch": 0.11042749789997049, + "grad_norm": 0.5769261121749878, + "learning_rate": 9.850156265972721e-05, + "loss": 2.1272, + "step": 608 + }, + { + "epoch": 0.11060912207414807, + "grad_norm": 0.48428717255592346, + "learning_rate": 9.849436908052636e-05, + "loss": 1.8758, + "step": 609 + }, + { + "epoch": 0.11079074624832565, + "grad_norm": 0.39008069038391113, + "learning_rate": 9.848715853940932e-05, + "loss": 1.8017, + "step": 610 + }, + { + "epoch": 0.11097237042250324, + "grad_norm": 0.40317434072494507, + "learning_rate": 9.84799310388981e-05, + "loss": 1.7753, + "step": 611 + }, + { + "epoch": 0.11115399459668082, + "grad_norm": 0.4900026321411133, + "learning_rate": 9.847268658152067e-05, + "loss": 1.7697, + "step": 612 + }, + { + "epoch": 0.1113356187708584, + "grad_norm": 0.4420159161090851, + "learning_rate": 9.846542516981094e-05, + "loss": 1.6895, + "step": 613 + }, + { + "epoch": 0.11151724294503598, + "grad_norm": 0.38464581966400146, + "learning_rate": 9.845814680630877e-05, + "loss": 1.8231, + "step": 614 + }, + { + "epoch": 0.11169886711921356, + "grad_norm": 1.096254825592041, + "learning_rate": 9.845085149355983e-05, + "loss": 1.7353, + "step": 615 + }, + { + "epoch": 0.11188049129339116, + "grad_norm": 0.5849656462669373, + "learning_rate": 9.844353923411592e-05, + "loss": 1.943, + "step": 616 + }, + { + "epoch": 0.11206211546756874, + "grad_norm": 0.4776785373687744, + "learning_rate": 9.843621003053455e-05, + "loss": 2.0531, + "step": 617 + }, + { + "epoch": 0.11224373964174632, + "grad_norm": 0.5481032133102417, + "learning_rate": 9.842886388537936e-05, + "loss": 1.9133, + "step": 618 + }, + { + "epoch": 0.1124253638159239, + "grad_norm": 0.508076548576355, + "learning_rate": 9.842150080121972e-05, + "loss": 1.8805, + "step": 619 + }, + { + "epoch": 0.11260698799010148, + "grad_norm": 0.5915936827659607, + "learning_rate": 9.84141207806311e-05, + "loss": 1.807, + "step": 620 + }, + { + "epoch": 0.11278861216427906, + "grad_norm": 0.5027976036071777, + "learning_rate": 9.840672382619478e-05, + "loss": 1.9352, + "step": 621 + }, + { + "epoch": 0.11297023633845665, + "grad_norm": 0.5230370163917542, + "learning_rate": 9.839930994049802e-05, + "loss": 1.7849, + "step": 622 + }, + { + "epoch": 0.11315186051263423, + "grad_norm": 0.419098436832428, + "learning_rate": 9.839187912613395e-05, + "loss": 1.9757, + "step": 623 + }, + { + "epoch": 0.11333348468681181, + "grad_norm": 0.5154158473014832, + "learning_rate": 9.838443138570167e-05, + "loss": 2.0954, + "step": 624 + }, + { + "epoch": 0.1135151088609894, + "grad_norm": 0.3862294554710388, + "learning_rate": 9.837696672180618e-05, + "loss": 1.733, + "step": 625 + }, + { + "epoch": 0.11369673303516697, + "grad_norm": 0.41402938961982727, + "learning_rate": 9.836948513705842e-05, + "loss": 1.7586, + "step": 626 + }, + { + "epoch": 0.11387835720934457, + "grad_norm": 0.7439208626747131, + "learning_rate": 9.836198663407518e-05, + "loss": 2.0041, + "step": 627 + }, + { + "epoch": 0.11405998138352215, + "grad_norm": 0.5618869662284851, + "learning_rate": 9.835447121547928e-05, + "loss": 1.7615, + "step": 628 + }, + { + "epoch": 0.11424160555769973, + "grad_norm": 0.49154865741729736, + "learning_rate": 9.834693888389936e-05, + "loss": 1.9518, + "step": 629 + }, + { + "epoch": 0.11442322973187731, + "grad_norm": 0.7175078392028809, + "learning_rate": 9.833938964197e-05, + "loss": 1.858, + "step": 630 + }, + { + "epoch": 0.11460485390605489, + "grad_norm": 0.37917521595954895, + "learning_rate": 9.833182349233174e-05, + "loss": 1.7739, + "step": 631 + }, + { + "epoch": 0.11478647808023248, + "grad_norm": 0.4734775722026825, + "learning_rate": 9.832424043763098e-05, + "loss": 1.7995, + "step": 632 + }, + { + "epoch": 0.11496810225441007, + "grad_norm": 0.3310025930404663, + "learning_rate": 9.831664048052003e-05, + "loss": 1.8522, + "step": 633 + }, + { + "epoch": 0.11514972642858765, + "grad_norm": 0.4925045073032379, + "learning_rate": 9.830902362365717e-05, + "loss": 1.7937, + "step": 634 + }, + { + "epoch": 0.11533135060276523, + "grad_norm": 0.40987586975097656, + "learning_rate": 9.830138986970651e-05, + "loss": 1.8298, + "step": 635 + }, + { + "epoch": 0.1155129747769428, + "grad_norm": 0.6148173213005066, + "learning_rate": 9.829373922133818e-05, + "loss": 1.9546, + "step": 636 + }, + { + "epoch": 0.1156945989511204, + "grad_norm": 0.4712829887866974, + "learning_rate": 9.828607168122809e-05, + "loss": 1.7923, + "step": 637 + }, + { + "epoch": 0.11587622312529798, + "grad_norm": 0.335810124874115, + "learning_rate": 9.827838725205816e-05, + "loss": 1.7753, + "step": 638 + }, + { + "epoch": 0.11605784729947556, + "grad_norm": 0.49693191051483154, + "learning_rate": 9.827068593651616e-05, + "loss": 1.8968, + "step": 639 + }, + { + "epoch": 0.11623947147365314, + "grad_norm": 1.197039008140564, + "learning_rate": 9.826296773729579e-05, + "loss": 1.7864, + "step": 640 + }, + { + "epoch": 0.11642109564783072, + "grad_norm": 0.647308349609375, + "learning_rate": 9.825523265709666e-05, + "loss": 1.8411, + "step": 641 + }, + { + "epoch": 0.11660271982200832, + "grad_norm": 0.3614049553871155, + "learning_rate": 9.824748069862428e-05, + "loss": 1.6746, + "step": 642 + }, + { + "epoch": 0.1167843439961859, + "grad_norm": 0.4875134229660034, + "learning_rate": 9.823971186459004e-05, + "loss": 1.9644, + "step": 643 + }, + { + "epoch": 0.11696596817036348, + "grad_norm": 0.4550471603870392, + "learning_rate": 9.823192615771126e-05, + "loss": 1.8252, + "step": 644 + }, + { + "epoch": 0.11714759234454106, + "grad_norm": 0.46366238594055176, + "learning_rate": 9.822412358071113e-05, + "loss": 1.9181, + "step": 645 + }, + { + "epoch": 0.11732921651871864, + "grad_norm": 0.3638553023338318, + "learning_rate": 9.821630413631881e-05, + "loss": 1.8338, + "step": 646 + }, + { + "epoch": 0.11751084069289622, + "grad_norm": 2.1675832271575928, + "learning_rate": 9.820846782726931e-05, + "loss": 1.9349, + "step": 647 + }, + { + "epoch": 0.11769246486707381, + "grad_norm": 0.4351162314414978, + "learning_rate": 9.820061465630349e-05, + "loss": 1.7882, + "step": 648 + }, + { + "epoch": 0.11787408904125139, + "grad_norm": 0.6629000902175903, + "learning_rate": 9.81927446261682e-05, + "loss": 1.8273, + "step": 649 + }, + { + "epoch": 0.11805571321542897, + "grad_norm": 0.4872469902038574, + "learning_rate": 9.818485773961614e-05, + "loss": 1.8135, + "step": 650 + }, + { + "epoch": 0.11823733738960655, + "grad_norm": 0.5557680726051331, + "learning_rate": 9.817695399940592e-05, + "loss": 1.7297, + "step": 651 + }, + { + "epoch": 0.11841896156378413, + "grad_norm": 0.5416964292526245, + "learning_rate": 9.816903340830203e-05, + "loss": 1.8076, + "step": 652 + }, + { + "epoch": 0.11860058573796173, + "grad_norm": 0.5070096254348755, + "learning_rate": 9.816109596907486e-05, + "loss": 1.8788, + "step": 653 + }, + { + "epoch": 0.11878220991213931, + "grad_norm": 0.44313010573387146, + "learning_rate": 9.81531416845007e-05, + "loss": 1.8631, + "step": 654 + }, + { + "epoch": 0.11896383408631689, + "grad_norm": 0.4395395815372467, + "learning_rate": 9.814517055736172e-05, + "loss": 1.797, + "step": 655 + }, + { + "epoch": 0.11914545826049447, + "grad_norm": 0.4504729211330414, + "learning_rate": 9.8137182590446e-05, + "loss": 1.7438, + "step": 656 + }, + { + "epoch": 0.11932708243467205, + "grad_norm": 0.3764480948448181, + "learning_rate": 9.812917778654748e-05, + "loss": 1.8668, + "step": 657 + }, + { + "epoch": 0.11950870660884964, + "grad_norm": 0.3695070445537567, + "learning_rate": 9.812115614846603e-05, + "loss": 1.7964, + "step": 658 + }, + { + "epoch": 0.11969033078302722, + "grad_norm": 0.4922740161418915, + "learning_rate": 9.811311767900737e-05, + "loss": 1.9533, + "step": 659 + }, + { + "epoch": 0.1198719549572048, + "grad_norm": 0.42497506737709045, + "learning_rate": 9.810506238098312e-05, + "loss": 1.8933, + "step": 660 + }, + { + "epoch": 0.12005357913138238, + "grad_norm": 0.5535656213760376, + "learning_rate": 9.80969902572108e-05, + "loss": 1.8989, + "step": 661 + }, + { + "epoch": 0.12023520330555997, + "grad_norm": 0.40506020188331604, + "learning_rate": 9.808890131051379e-05, + "loss": 1.6329, + "step": 662 + }, + { + "epoch": 0.12041682747973756, + "grad_norm": 0.4039515554904938, + "learning_rate": 9.808079554372136e-05, + "loss": 1.8485, + "step": 663 + }, + { + "epoch": 0.12059845165391514, + "grad_norm": 0.417152464389801, + "learning_rate": 9.80726729596687e-05, + "loss": 1.8175, + "step": 664 + }, + { + "epoch": 0.12078007582809272, + "grad_norm": 0.5348461270332336, + "learning_rate": 9.806453356119684e-05, + "loss": 1.9047, + "step": 665 + }, + { + "epoch": 0.1209617000022703, + "grad_norm": 0.42635995149612427, + "learning_rate": 9.80563773511527e-05, + "loss": 1.882, + "step": 666 + }, + { + "epoch": 0.12114332417644788, + "grad_norm": 0.5497307181358337, + "learning_rate": 9.804820433238908e-05, + "loss": 1.9889, + "step": 667 + }, + { + "epoch": 0.12132494835062548, + "grad_norm": 0.39175039529800415, + "learning_rate": 9.804001450776468e-05, + "loss": 1.8104, + "step": 668 + }, + { + "epoch": 0.12150657252480306, + "grad_norm": 0.5015182495117188, + "learning_rate": 9.803180788014403e-05, + "loss": 1.9481, + "step": 669 + }, + { + "epoch": 0.12168819669898064, + "grad_norm": 0.3927193880081177, + "learning_rate": 9.80235844523976e-05, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.12186982087315822, + "grad_norm": 0.471301406621933, + "learning_rate": 9.801534422740173e-05, + "loss": 1.8546, + "step": 671 + }, + { + "epoch": 0.1220514450473358, + "grad_norm": 0.4616087079048157, + "learning_rate": 9.800708720803855e-05, + "loss": 1.9087, + "step": 672 + }, + { + "epoch": 0.12223306922151338, + "grad_norm": 0.6628782153129578, + "learning_rate": 9.799881339719615e-05, + "loss": 1.886, + "step": 673 + }, + { + "epoch": 0.12241469339569097, + "grad_norm": 1.0236927270889282, + "learning_rate": 9.799052279776846e-05, + "loss": 2.05, + "step": 674 + }, + { + "epoch": 0.12259631756986855, + "grad_norm": 0.4582613408565521, + "learning_rate": 9.798221541265531e-05, + "loss": 1.7206, + "step": 675 + }, + { + "epoch": 0.12277794174404613, + "grad_norm": 0.7103002667427063, + "learning_rate": 9.797389124476238e-05, + "loss": 1.8742, + "step": 676 + }, + { + "epoch": 0.12295956591822371, + "grad_norm": 0.8721935749053955, + "learning_rate": 9.796555029700119e-05, + "loss": 1.9451, + "step": 677 + }, + { + "epoch": 0.12314119009240129, + "grad_norm": 1.203221082687378, + "learning_rate": 9.795719257228921e-05, + "loss": 2.0549, + "step": 678 + }, + { + "epoch": 0.12332281426657889, + "grad_norm": 0.4520948827266693, + "learning_rate": 9.794881807354968e-05, + "loss": 1.8966, + "step": 679 + }, + { + "epoch": 0.12350443844075647, + "grad_norm": 1.0113153457641602, + "learning_rate": 9.794042680371177e-05, + "loss": 1.9169, + "step": 680 + }, + { + "epoch": 0.12368606261493405, + "grad_norm": 0.7489022612571716, + "learning_rate": 9.793201876571053e-05, + "loss": 1.8256, + "step": 681 + }, + { + "epoch": 0.12386768678911163, + "grad_norm": 0.3910664916038513, + "learning_rate": 9.792359396248681e-05, + "loss": 1.7914, + "step": 682 + }, + { + "epoch": 0.12404931096328921, + "grad_norm": 0.5365206003189087, + "learning_rate": 9.791515239698736e-05, + "loss": 1.8344, + "step": 683 + }, + { + "epoch": 0.1242309351374668, + "grad_norm": 0.5591122508049011, + "learning_rate": 9.790669407216482e-05, + "loss": 2.0143, + "step": 684 + }, + { + "epoch": 0.12441255931164438, + "grad_norm": 0.39699599146842957, + "learning_rate": 9.789821899097766e-05, + "loss": 1.8573, + "step": 685 + }, + { + "epoch": 0.12459418348582196, + "grad_norm": 0.4777892529964447, + "learning_rate": 9.788972715639019e-05, + "loss": 1.777, + "step": 686 + }, + { + "epoch": 0.12477580765999954, + "grad_norm": 0.4709274172782898, + "learning_rate": 9.788121857137265e-05, + "loss": 1.7394, + "step": 687 + }, + { + "epoch": 0.12495743183417712, + "grad_norm": 0.6101868748664856, + "learning_rate": 9.787269323890104e-05, + "loss": 1.7981, + "step": 688 + }, + { + "epoch": 0.1251390560083547, + "grad_norm": 0.6419447660446167, + "learning_rate": 9.786415116195732e-05, + "loss": 1.9589, + "step": 689 + }, + { + "epoch": 0.12532068018253228, + "grad_norm": 0.6067058444023132, + "learning_rate": 9.785559234352925e-05, + "loss": 1.9349, + "step": 690 + }, + { + "epoch": 0.12550230435670987, + "grad_norm": 0.7613756060600281, + "learning_rate": 9.784701678661045e-05, + "loss": 2.02, + "step": 691 + }, + { + "epoch": 0.12568392853088747, + "grad_norm": 0.4139694273471832, + "learning_rate": 9.783842449420039e-05, + "loss": 1.9143, + "step": 692 + }, + { + "epoch": 0.12586555270506505, + "grad_norm": 0.40928635001182556, + "learning_rate": 9.78298154693044e-05, + "loss": 1.7443, + "step": 693 + }, + { + "epoch": 0.12604717687924263, + "grad_norm": 0.4126574993133545, + "learning_rate": 9.782118971493367e-05, + "loss": 1.9445, + "step": 694 + }, + { + "epoch": 0.12622880105342021, + "grad_norm": 0.4682476818561554, + "learning_rate": 9.781254723410528e-05, + "loss": 1.901, + "step": 695 + }, + { + "epoch": 0.1264104252275978, + "grad_norm": 0.45557302236557007, + "learning_rate": 9.780388802984206e-05, + "loss": 1.7949, + "step": 696 + }, + { + "epoch": 0.12659204940177538, + "grad_norm": 0.4494205415248871, + "learning_rate": 9.779521210517276e-05, + "loss": 1.7765, + "step": 697 + }, + { + "epoch": 0.12677367357595296, + "grad_norm": 0.508558988571167, + "learning_rate": 9.778651946313199e-05, + "loss": 1.6041, + "step": 698 + }, + { + "epoch": 0.12695529775013054, + "grad_norm": 0.4682636857032776, + "learning_rate": 9.777781010676015e-05, + "loss": 1.9704, + "step": 699 + }, + { + "epoch": 0.12713692192430812, + "grad_norm": 0.6863474249839783, + "learning_rate": 9.776908403910355e-05, + "loss": 2.0406, + "step": 700 + }, + { + "epoch": 0.1273185460984857, + "grad_norm": 0.4232069253921509, + "learning_rate": 9.77603412632143e-05, + "loss": 1.8425, + "step": 701 + }, + { + "epoch": 0.1275001702726633, + "grad_norm": 0.42266860604286194, + "learning_rate": 9.775158178215036e-05, + "loss": 2.0545, + "step": 702 + }, + { + "epoch": 0.12768179444684088, + "grad_norm": 0.6117460131645203, + "learning_rate": 9.774280559897555e-05, + "loss": 1.9852, + "step": 703 + }, + { + "epoch": 0.12786341862101847, + "grad_norm": 0.5095504522323608, + "learning_rate": 9.773401271675951e-05, + "loss": 1.8143, + "step": 704 + }, + { + "epoch": 0.12804504279519605, + "grad_norm": 0.4390101432800293, + "learning_rate": 9.772520313857775e-05, + "loss": 1.8234, + "step": 705 + }, + { + "epoch": 0.12822666696937363, + "grad_norm": 0.7279866337776184, + "learning_rate": 9.771637686751163e-05, + "loss": 1.8625, + "step": 706 + }, + { + "epoch": 0.1284082911435512, + "grad_norm": 0.41986462473869324, + "learning_rate": 9.770753390664827e-05, + "loss": 1.7172, + "step": 707 + }, + { + "epoch": 0.1285899153177288, + "grad_norm": 0.4279710352420807, + "learning_rate": 9.76986742590807e-05, + "loss": 1.8989, + "step": 708 + }, + { + "epoch": 0.12877153949190637, + "grad_norm": 0.4669712781906128, + "learning_rate": 9.768979792790775e-05, + "loss": 1.9169, + "step": 709 + }, + { + "epoch": 0.12895316366608395, + "grad_norm": 0.3883441984653473, + "learning_rate": 9.768090491623414e-05, + "loss": 1.9025, + "step": 710 + }, + { + "epoch": 0.12913478784026153, + "grad_norm": 0.6114716529846191, + "learning_rate": 9.767199522717036e-05, + "loss": 1.9194, + "step": 711 + }, + { + "epoch": 0.1293164120144391, + "grad_norm": 0.4525127410888672, + "learning_rate": 9.766306886383277e-05, + "loss": 1.9452, + "step": 712 + }, + { + "epoch": 0.12949803618861672, + "grad_norm": 0.5565483570098877, + "learning_rate": 9.765412582934355e-05, + "loss": 1.8839, + "step": 713 + }, + { + "epoch": 0.1296796603627943, + "grad_norm": 0.4159076511859894, + "learning_rate": 9.764516612683071e-05, + "loss": 1.8784, + "step": 714 + }, + { + "epoch": 0.12986128453697188, + "grad_norm": 0.45089900493621826, + "learning_rate": 9.763618975942807e-05, + "loss": 1.7848, + "step": 715 + }, + { + "epoch": 0.13004290871114946, + "grad_norm": 0.4366222023963928, + "learning_rate": 9.762719673027533e-05, + "loss": 1.8405, + "step": 716 + }, + { + "epoch": 0.13022453288532704, + "grad_norm": 0.7328912019729614, + "learning_rate": 9.7618187042518e-05, + "loss": 1.8042, + "step": 717 + }, + { + "epoch": 0.13040615705950462, + "grad_norm": 0.4680670499801636, + "learning_rate": 9.760916069930738e-05, + "loss": 1.8381, + "step": 718 + }, + { + "epoch": 0.1305877812336822, + "grad_norm": 0.5231205224990845, + "learning_rate": 9.760011770380065e-05, + "loss": 1.7734, + "step": 719 + }, + { + "epoch": 0.13076940540785978, + "grad_norm": 0.3835601806640625, + "learning_rate": 9.759105805916073e-05, + "loss": 1.9158, + "step": 720 + }, + { + "epoch": 0.13095102958203736, + "grad_norm": 0.6928550601005554, + "learning_rate": 9.758198176855648e-05, + "loss": 1.8544, + "step": 721 + }, + { + "epoch": 0.13113265375621494, + "grad_norm": 0.5700839161872864, + "learning_rate": 9.757288883516249e-05, + "loss": 1.7305, + "step": 722 + }, + { + "epoch": 0.13131427793039255, + "grad_norm": 1.0153053998947144, + "learning_rate": 9.756377926215921e-05, + "loss": 1.7723, + "step": 723 + }, + { + "epoch": 0.13149590210457013, + "grad_norm": 0.547630786895752, + "learning_rate": 9.75546530527329e-05, + "loss": 1.8665, + "step": 724 + }, + { + "epoch": 0.1316775262787477, + "grad_norm": 0.3850541412830353, + "learning_rate": 9.754551021007565e-05, + "loss": 1.9016, + "step": 725 + }, + { + "epoch": 0.1318591504529253, + "grad_norm": 0.7832197546958923, + "learning_rate": 9.753635073738537e-05, + "loss": 1.8421, + "step": 726 + }, + { + "epoch": 0.13204077462710287, + "grad_norm": 1.4997161626815796, + "learning_rate": 9.752717463786575e-05, + "loss": 1.8763, + "step": 727 + }, + { + "epoch": 0.13222239880128045, + "grad_norm": 0.49210405349731445, + "learning_rate": 9.751798191472633e-05, + "loss": 1.8389, + "step": 728 + }, + { + "epoch": 0.13240402297545803, + "grad_norm": 0.41312623023986816, + "learning_rate": 9.750877257118247e-05, + "loss": 1.7862, + "step": 729 + }, + { + "epoch": 0.1325856471496356, + "grad_norm": 0.4075622856616974, + "learning_rate": 9.74995466104553e-05, + "loss": 1.9966, + "step": 730 + }, + { + "epoch": 0.1327672713238132, + "grad_norm": 0.3823043704032898, + "learning_rate": 9.749030403577184e-05, + "loss": 1.5975, + "step": 731 + }, + { + "epoch": 0.13294889549799077, + "grad_norm": 0.8819665908813477, + "learning_rate": 9.748104485036483e-05, + "loss": 1.905, + "step": 732 + }, + { + "epoch": 0.13313051967216838, + "grad_norm": 0.48306867480278015, + "learning_rate": 9.747176905747289e-05, + "loss": 1.8368, + "step": 733 + }, + { + "epoch": 0.13331214384634596, + "grad_norm": 1.0866397619247437, + "learning_rate": 9.74624766603404e-05, + "loss": 2.0045, + "step": 734 + }, + { + "epoch": 0.13349376802052354, + "grad_norm": 0.46396780014038086, + "learning_rate": 9.745316766221758e-05, + "loss": 1.7937, + "step": 735 + }, + { + "epoch": 0.13367539219470112, + "grad_norm": 0.512191116809845, + "learning_rate": 9.744384206636046e-05, + "loss": 1.8045, + "step": 736 + }, + { + "epoch": 0.1338570163688787, + "grad_norm": 0.5164349675178528, + "learning_rate": 9.743449987603083e-05, + "loss": 1.9569, + "step": 737 + }, + { + "epoch": 0.13403864054305628, + "grad_norm": 0.3853932023048401, + "learning_rate": 9.742514109449634e-05, + "loss": 1.8819, + "step": 738 + }, + { + "epoch": 0.13422026471723386, + "grad_norm": 0.6969515085220337, + "learning_rate": 9.741576572503042e-05, + "loss": 2.0295, + "step": 739 + }, + { + "epoch": 0.13440188889141144, + "grad_norm": 0.42487791180610657, + "learning_rate": 9.740637377091227e-05, + "loss": 1.8249, + "step": 740 + }, + { + "epoch": 0.13458351306558902, + "grad_norm": 0.4305058717727661, + "learning_rate": 9.739696523542696e-05, + "loss": 1.6173, + "step": 741 + }, + { + "epoch": 0.1347651372397666, + "grad_norm": 0.4297633767127991, + "learning_rate": 9.73875401218653e-05, + "loss": 1.8262, + "step": 742 + }, + { + "epoch": 0.13494676141394418, + "grad_norm": 0.48297378420829773, + "learning_rate": 9.737809843352395e-05, + "loss": 1.7246, + "step": 743 + }, + { + "epoch": 0.1351283855881218, + "grad_norm": 0.3761887848377228, + "learning_rate": 9.73686401737053e-05, + "loss": 1.7408, + "step": 744 + }, + { + "epoch": 0.13531000976229937, + "grad_norm": 0.39516323804855347, + "learning_rate": 9.735916534571757e-05, + "loss": 1.6812, + "step": 745 + }, + { + "epoch": 0.13549163393647695, + "grad_norm": 0.4295628070831299, + "learning_rate": 9.734967395287482e-05, + "loss": 1.7413, + "step": 746 + }, + { + "epoch": 0.13567325811065453, + "grad_norm": 0.4724743962287903, + "learning_rate": 9.734016599849682e-05, + "loss": 1.7513, + "step": 747 + }, + { + "epoch": 0.1358548822848321, + "grad_norm": 0.38395819067955017, + "learning_rate": 9.73306414859092e-05, + "loss": 1.7049, + "step": 748 + }, + { + "epoch": 0.1360365064590097, + "grad_norm": 0.42505574226379395, + "learning_rate": 9.732110041844335e-05, + "loss": 1.8655, + "step": 749 + }, + { + "epoch": 0.13621813063318727, + "grad_norm": 0.4777648448944092, + "learning_rate": 9.731154279943646e-05, + "loss": 1.7863, + "step": 750 + }, + { + "epoch": 0.13639975480736485, + "grad_norm": 0.33793503046035767, + "learning_rate": 9.73019686322315e-05, + "loss": 1.8687, + "step": 751 + }, + { + "epoch": 0.13658137898154243, + "grad_norm": 1.096353530883789, + "learning_rate": 9.729237792017722e-05, + "loss": 1.6119, + "step": 752 + }, + { + "epoch": 0.13676300315572001, + "grad_norm": 0.4883885979652405, + "learning_rate": 9.72827706666282e-05, + "loss": 1.8857, + "step": 753 + }, + { + "epoch": 0.13694462732989762, + "grad_norm": 0.6008234024047852, + "learning_rate": 9.72731468749448e-05, + "loss": 1.8031, + "step": 754 + }, + { + "epoch": 0.1371262515040752, + "grad_norm": 0.4024490714073181, + "learning_rate": 9.726350654849307e-05, + "loss": 1.6866, + "step": 755 + }, + { + "epoch": 0.13730787567825278, + "grad_norm": 0.3494734466075897, + "learning_rate": 9.725384969064498e-05, + "loss": 1.9031, + "step": 756 + }, + { + "epoch": 0.13748949985243036, + "grad_norm": 0.5499459505081177, + "learning_rate": 9.724417630477816e-05, + "loss": 1.6744, + "step": 757 + }, + { + "epoch": 0.13767112402660794, + "grad_norm": 0.5344276428222656, + "learning_rate": 9.723448639427613e-05, + "loss": 1.9063, + "step": 758 + }, + { + "epoch": 0.13785274820078552, + "grad_norm": 0.4833744168281555, + "learning_rate": 9.72247799625281e-05, + "loss": 1.9858, + "step": 759 + }, + { + "epoch": 0.1380343723749631, + "grad_norm": 0.4017169773578644, + "learning_rate": 9.721505701292912e-05, + "loss": 1.731, + "step": 760 + }, + { + "epoch": 0.13821599654914069, + "grad_norm": 0.40553420782089233, + "learning_rate": 9.720531754888e-05, + "loss": 1.6611, + "step": 761 + }, + { + "epoch": 0.13839762072331827, + "grad_norm": 0.38127800822257996, + "learning_rate": 9.71955615737873e-05, + "loss": 1.984, + "step": 762 + }, + { + "epoch": 0.13857924489749585, + "grad_norm": 0.4023358225822449, + "learning_rate": 9.718578909106339e-05, + "loss": 1.8551, + "step": 763 + }, + { + "epoch": 0.13876086907167343, + "grad_norm": 0.3421589732170105, + "learning_rate": 9.717600010412639e-05, + "loss": 1.782, + "step": 764 + }, + { + "epoch": 0.13894249324585103, + "grad_norm": 0.42043837904930115, + "learning_rate": 9.71661946164002e-05, + "loss": 1.8505, + "step": 765 + }, + { + "epoch": 0.13912411742002861, + "grad_norm": 0.4448941648006439, + "learning_rate": 9.71563726313145e-05, + "loss": 1.622, + "step": 766 + }, + { + "epoch": 0.1393057415942062, + "grad_norm": 0.49254903197288513, + "learning_rate": 9.714653415230475e-05, + "loss": 1.8164, + "step": 767 + }, + { + "epoch": 0.13948736576838378, + "grad_norm": 0.6033391952514648, + "learning_rate": 9.713667918281212e-05, + "loss": 2.1539, + "step": 768 + }, + { + "epoch": 0.13966898994256136, + "grad_norm": 0.5121180415153503, + "learning_rate": 9.712680772628364e-05, + "loss": 1.782, + "step": 769 + }, + { + "epoch": 0.13985061411673894, + "grad_norm": 0.3872841000556946, + "learning_rate": 9.711691978617203e-05, + "loss": 1.9016, + "step": 770 + }, + { + "epoch": 0.14003223829091652, + "grad_norm": 0.5137674808502197, + "learning_rate": 9.710701536593581e-05, + "loss": 1.8, + "step": 771 + }, + { + "epoch": 0.1402138624650941, + "grad_norm": 0.39773860573768616, + "learning_rate": 9.709709446903924e-05, + "loss": 1.7946, + "step": 772 + }, + { + "epoch": 0.14039548663927168, + "grad_norm": 0.7843666076660156, + "learning_rate": 9.708715709895239e-05, + "loss": 1.9306, + "step": 773 + }, + { + "epoch": 0.14057711081344926, + "grad_norm": 0.40145331621170044, + "learning_rate": 9.707720325915104e-05, + "loss": 1.8776, + "step": 774 + }, + { + "epoch": 0.14075873498762687, + "grad_norm": 1.4665875434875488, + "learning_rate": 9.706723295311677e-05, + "loss": 1.7401, + "step": 775 + }, + { + "epoch": 0.14094035916180445, + "grad_norm": 0.6247975826263428, + "learning_rate": 9.705724618433689e-05, + "loss": 1.9344, + "step": 776 + }, + { + "epoch": 0.14112198333598203, + "grad_norm": 1.907037615776062, + "learning_rate": 9.704724295630448e-05, + "loss": 1.9421, + "step": 777 + }, + { + "epoch": 0.1413036075101596, + "grad_norm": 0.6152373552322388, + "learning_rate": 9.703722327251838e-05, + "loss": 1.8075, + "step": 778 + }, + { + "epoch": 0.1414852316843372, + "grad_norm": 0.7393584847450256, + "learning_rate": 9.70271871364832e-05, + "loss": 1.8877, + "step": 779 + }, + { + "epoch": 0.14166685585851477, + "grad_norm": 0.5452238917350769, + "learning_rate": 9.701713455170926e-05, + "loss": 1.8288, + "step": 780 + }, + { + "epoch": 0.14184848003269235, + "grad_norm": 0.551150381565094, + "learning_rate": 9.700706552171268e-05, + "loss": 1.8438, + "step": 781 + }, + { + "epoch": 0.14203010420686993, + "grad_norm": 0.7001773715019226, + "learning_rate": 9.69969800500153e-05, + "loss": 1.9158, + "step": 782 + }, + { + "epoch": 0.1422117283810475, + "grad_norm": 0.5644949674606323, + "learning_rate": 9.698687814014473e-05, + "loss": 1.8601, + "step": 783 + }, + { + "epoch": 0.1423933525552251, + "grad_norm": 0.36467617750167847, + "learning_rate": 9.697675979563433e-05, + "loss": 1.7902, + "step": 784 + }, + { + "epoch": 0.1425749767294027, + "grad_norm": 0.5796684622764587, + "learning_rate": 9.69666250200232e-05, + "loss": 1.8638, + "step": 785 + }, + { + "epoch": 0.14275660090358028, + "grad_norm": 0.4253745377063751, + "learning_rate": 9.695647381685618e-05, + "loss": 1.8136, + "step": 786 + }, + { + "epoch": 0.14293822507775786, + "grad_norm": 0.4418640732765198, + "learning_rate": 9.694630618968385e-05, + "loss": 1.8265, + "step": 787 + }, + { + "epoch": 0.14311984925193544, + "grad_norm": 0.4063173830509186, + "learning_rate": 9.69361221420626e-05, + "loss": 1.7566, + "step": 788 + }, + { + "epoch": 0.14330147342611302, + "grad_norm": 0.5488857626914978, + "learning_rate": 9.692592167755447e-05, + "loss": 1.7943, + "step": 789 + }, + { + "epoch": 0.1434830976002906, + "grad_norm": 0.4161812663078308, + "learning_rate": 9.691570479972729e-05, + "loss": 2.0003, + "step": 790 + }, + { + "epoch": 0.14366472177446818, + "grad_norm": 0.7840547561645508, + "learning_rate": 9.690547151215463e-05, + "loss": 1.7923, + "step": 791 + }, + { + "epoch": 0.14384634594864576, + "grad_norm": 0.32812535762786865, + "learning_rate": 9.689522181841582e-05, + "loss": 1.7444, + "step": 792 + }, + { + "epoch": 0.14402797012282334, + "grad_norm": 0.6270074248313904, + "learning_rate": 9.688495572209587e-05, + "loss": 1.6994, + "step": 793 + }, + { + "epoch": 0.14420959429700092, + "grad_norm": 0.5689981579780579, + "learning_rate": 9.687467322678558e-05, + "loss": 1.9332, + "step": 794 + }, + { + "epoch": 0.1443912184711785, + "grad_norm": 0.43211427330970764, + "learning_rate": 9.686437433608145e-05, + "loss": 1.8906, + "step": 795 + }, + { + "epoch": 0.1445728426453561, + "grad_norm": 0.6135892271995544, + "learning_rate": 9.685405905358574e-05, + "loss": 2.0757, + "step": 796 + }, + { + "epoch": 0.1447544668195337, + "grad_norm": 0.3287840485572815, + "learning_rate": 9.684372738290645e-05, + "loss": 1.7578, + "step": 797 + }, + { + "epoch": 0.14493609099371127, + "grad_norm": 0.35681644082069397, + "learning_rate": 9.683337932765728e-05, + "loss": 1.7455, + "step": 798 + }, + { + "epoch": 0.14511771516788885, + "grad_norm": 0.5351925492286682, + "learning_rate": 9.682301489145769e-05, + "loss": 1.713, + "step": 799 + }, + { + "epoch": 0.14529933934206643, + "grad_norm": 0.3297032415866852, + "learning_rate": 9.681263407793284e-05, + "loss": 1.8836, + "step": 800 + }, + { + "epoch": 0.145480963516244, + "grad_norm": 0.45644041895866394, + "learning_rate": 9.680223689071364e-05, + "loss": 1.9082, + "step": 801 + }, + { + "epoch": 0.1456625876904216, + "grad_norm": 0.44933751225471497, + "learning_rate": 9.679182333343675e-05, + "loss": 2.0276, + "step": 802 + }, + { + "epoch": 0.14584421186459917, + "grad_norm": 1.9766268730163574, + "learning_rate": 9.678139340974449e-05, + "loss": 1.8762, + "step": 803 + }, + { + "epoch": 0.14602583603877675, + "grad_norm": 0.42773500084877014, + "learning_rate": 9.677094712328496e-05, + "loss": 1.9334, + "step": 804 + }, + { + "epoch": 0.14620746021295433, + "grad_norm": 0.769386887550354, + "learning_rate": 9.676048447771198e-05, + "loss": 1.9839, + "step": 805 + }, + { + "epoch": 0.14638908438713194, + "grad_norm": 0.37995070219039917, + "learning_rate": 9.675000547668504e-05, + "loss": 1.936, + "step": 806 + }, + { + "epoch": 0.14657070856130952, + "grad_norm": 0.3545726239681244, + "learning_rate": 9.673951012386944e-05, + "loss": 1.8559, + "step": 807 + }, + { + "epoch": 0.1467523327354871, + "grad_norm": 1.6339722871780396, + "learning_rate": 9.672899842293612e-05, + "loss": 1.9506, + "step": 808 + }, + { + "epoch": 0.14693395690966468, + "grad_norm": 0.6583107709884644, + "learning_rate": 9.671847037756176e-05, + "loss": 1.8576, + "step": 809 + }, + { + "epoch": 0.14711558108384226, + "grad_norm": 0.45175978541374207, + "learning_rate": 9.670792599142878e-05, + "loss": 1.7756, + "step": 810 + }, + { + "epoch": 0.14729720525801984, + "grad_norm": 0.484073668718338, + "learning_rate": 9.669736526822528e-05, + "loss": 1.9721, + "step": 811 + }, + { + "epoch": 0.14747882943219742, + "grad_norm": 0.7133439183235168, + "learning_rate": 9.668678821164513e-05, + "loss": 2.0369, + "step": 812 + }, + { + "epoch": 0.147660453606375, + "grad_norm": 0.4283084273338318, + "learning_rate": 9.667619482538783e-05, + "loss": 1.9776, + "step": 813 + }, + { + "epoch": 0.14784207778055258, + "grad_norm": 0.3714464604854584, + "learning_rate": 9.666558511315866e-05, + "loss": 1.909, + "step": 814 + }, + { + "epoch": 0.14802370195473016, + "grad_norm": 0.475719690322876, + "learning_rate": 9.665495907866859e-05, + "loss": 1.6767, + "step": 815 + }, + { + "epoch": 0.14820532612890774, + "grad_norm": 0.706728994846344, + "learning_rate": 9.664431672563429e-05, + "loss": 1.8423, + "step": 816 + }, + { + "epoch": 0.14838695030308535, + "grad_norm": 0.4125927984714508, + "learning_rate": 9.663365805777814e-05, + "loss": 1.6786, + "step": 817 + }, + { + "epoch": 0.14856857447726293, + "grad_norm": 0.37798959016799927, + "learning_rate": 9.662298307882825e-05, + "loss": 1.9671, + "step": 818 + }, + { + "epoch": 0.1487501986514405, + "grad_norm": 0.3459557592868805, + "learning_rate": 9.66122917925184e-05, + "loss": 1.6846, + "step": 819 + }, + { + "epoch": 0.1489318228256181, + "grad_norm": 0.698407769203186, + "learning_rate": 9.66015842025881e-05, + "loss": 1.7831, + "step": 820 + }, + { + "epoch": 0.14911344699979567, + "grad_norm": 0.45804333686828613, + "learning_rate": 9.659086031278254e-05, + "loss": 1.9599, + "step": 821 + }, + { + "epoch": 0.14929507117397325, + "grad_norm": 0.3957540988922119, + "learning_rate": 9.658012012685265e-05, + "loss": 1.7159, + "step": 822 + }, + { + "epoch": 0.14947669534815083, + "grad_norm": 0.3771595358848572, + "learning_rate": 9.6569363648555e-05, + "loss": 1.7884, + "step": 823 + }, + { + "epoch": 0.14965831952232841, + "grad_norm": 0.5018314719200134, + "learning_rate": 9.655859088165191e-05, + "loss": 1.8137, + "step": 824 + }, + { + "epoch": 0.149839943696506, + "grad_norm": 0.3932912349700928, + "learning_rate": 9.654780182991138e-05, + "loss": 1.6858, + "step": 825 + }, + { + "epoch": 0.15002156787068358, + "grad_norm": 0.36057305335998535, + "learning_rate": 9.65369964971071e-05, + "loss": 1.8158, + "step": 826 + }, + { + "epoch": 0.15020319204486118, + "grad_norm": 0.48851120471954346, + "learning_rate": 9.652617488701847e-05, + "loss": 1.9983, + "step": 827 + }, + { + "epoch": 0.15038481621903876, + "grad_norm": 0.3967165946960449, + "learning_rate": 9.651533700343057e-05, + "loss": 1.8843, + "step": 828 + }, + { + "epoch": 0.15056644039321634, + "grad_norm": 0.38978826999664307, + "learning_rate": 9.650448285013417e-05, + "loss": 1.9271, + "step": 829 + }, + { + "epoch": 0.15074806456739392, + "grad_norm": 2.024346351623535, + "learning_rate": 9.649361243092574e-05, + "loss": 2.0712, + "step": 830 + }, + { + "epoch": 0.1509296887415715, + "grad_norm": 0.3982148766517639, + "learning_rate": 9.648272574960744e-05, + "loss": 1.734, + "step": 831 + }, + { + "epoch": 0.15111131291574909, + "grad_norm": 0.5064598321914673, + "learning_rate": 9.64718228099871e-05, + "loss": 1.8563, + "step": 832 + }, + { + "epoch": 0.15129293708992667, + "grad_norm": 0.4470858573913574, + "learning_rate": 9.646090361587827e-05, + "loss": 1.9072, + "step": 833 + }, + { + "epoch": 0.15147456126410425, + "grad_norm": 0.32001280784606934, + "learning_rate": 9.644996817110015e-05, + "loss": 1.7312, + "step": 834 + }, + { + "epoch": 0.15165618543828183, + "grad_norm": 0.416939377784729, + "learning_rate": 9.643901647947764e-05, + "loss": 2.1561, + "step": 835 + }, + { + "epoch": 0.1518378096124594, + "grad_norm": 0.417041540145874, + "learning_rate": 9.642804854484133e-05, + "loss": 1.8068, + "step": 836 + }, + { + "epoch": 0.15201943378663701, + "grad_norm": 0.5186249613761902, + "learning_rate": 9.641706437102749e-05, + "loss": 1.9144, + "step": 837 + }, + { + "epoch": 0.1522010579608146, + "grad_norm": 1.202883005142212, + "learning_rate": 9.640606396187803e-05, + "loss": 1.9426, + "step": 838 + }, + { + "epoch": 0.15238268213499218, + "grad_norm": 0.3835715651512146, + "learning_rate": 9.639504732124062e-05, + "loss": 1.852, + "step": 839 + }, + { + "epoch": 0.15256430630916976, + "grad_norm": 0.47577813267707825, + "learning_rate": 9.638401445296854e-05, + "loss": 1.7919, + "step": 840 + }, + { + "epoch": 0.15274593048334734, + "grad_norm": 0.4148072302341461, + "learning_rate": 9.637296536092075e-05, + "loss": 1.7794, + "step": 841 + }, + { + "epoch": 0.15292755465752492, + "grad_norm": 1.4667614698410034, + "learning_rate": 9.636190004896191e-05, + "loss": 1.8645, + "step": 842 + }, + { + "epoch": 0.1531091788317025, + "grad_norm": 0.4000967741012573, + "learning_rate": 9.635081852096235e-05, + "loss": 1.7065, + "step": 843 + }, + { + "epoch": 0.15329080300588008, + "grad_norm": 0.42348387837409973, + "learning_rate": 9.633972078079807e-05, + "loss": 1.7786, + "step": 844 + }, + { + "epoch": 0.15347242718005766, + "grad_norm": 0.3681666851043701, + "learning_rate": 9.632860683235072e-05, + "loss": 1.8787, + "step": 845 + }, + { + "epoch": 0.15365405135423524, + "grad_norm": 0.4971955418586731, + "learning_rate": 9.631747667950764e-05, + "loss": 1.818, + "step": 846 + }, + { + "epoch": 0.15383567552841282, + "grad_norm": 0.7733164429664612, + "learning_rate": 9.630633032616183e-05, + "loss": 1.9307, + "step": 847 + }, + { + "epoch": 0.15401729970259043, + "grad_norm": 0.4637415409088135, + "learning_rate": 9.629516777621198e-05, + "loss": 1.8186, + "step": 848 + }, + { + "epoch": 0.154198923876768, + "grad_norm": 0.39012253284454346, + "learning_rate": 9.628398903356239e-05, + "loss": 1.6841, + "step": 849 + }, + { + "epoch": 0.1543805480509456, + "grad_norm": 0.5674143433570862, + "learning_rate": 9.627279410212309e-05, + "loss": 1.7955, + "step": 850 + }, + { + "epoch": 0.15456217222512317, + "grad_norm": 0.6544170379638672, + "learning_rate": 9.626158298580973e-05, + "loss": 1.8532, + "step": 851 + }, + { + "epoch": 0.15474379639930075, + "grad_norm": 0.38837486505508423, + "learning_rate": 9.625035568854362e-05, + "loss": 1.7158, + "step": 852 + }, + { + "epoch": 0.15492542057347833, + "grad_norm": 0.6505482196807861, + "learning_rate": 9.623911221425176e-05, + "loss": 1.6775, + "step": 853 + }, + { + "epoch": 0.1551070447476559, + "grad_norm": 0.4139814078807831, + "learning_rate": 9.622785256686677e-05, + "loss": 1.884, + "step": 854 + }, + { + "epoch": 0.1552886689218335, + "grad_norm": 0.3829917907714844, + "learning_rate": 9.621657675032697e-05, + "loss": 1.7857, + "step": 855 + }, + { + "epoch": 0.15547029309601107, + "grad_norm": 0.478694349527359, + "learning_rate": 9.620528476857629e-05, + "loss": 1.8299, + "step": 856 + }, + { + "epoch": 0.15565191727018865, + "grad_norm": 0.4130479395389557, + "learning_rate": 9.619397662556435e-05, + "loss": 1.9092, + "step": 857 + }, + { + "epoch": 0.15583354144436626, + "grad_norm": 0.36216405034065247, + "learning_rate": 9.618265232524639e-05, + "loss": 1.8368, + "step": 858 + }, + { + "epoch": 0.15601516561854384, + "grad_norm": 1.3993568420410156, + "learning_rate": 9.617131187158335e-05, + "loss": 1.8602, + "step": 859 + }, + { + "epoch": 0.15619678979272142, + "grad_norm": 0.35536015033721924, + "learning_rate": 9.615995526854176e-05, + "loss": 1.8262, + "step": 860 + }, + { + "epoch": 0.156378413966899, + "grad_norm": 0.45485836267471313, + "learning_rate": 9.614858252009385e-05, + "loss": 1.8864, + "step": 861 + }, + { + "epoch": 0.15656003814107658, + "grad_norm": 0.6847628355026245, + "learning_rate": 9.613719363021744e-05, + "loss": 1.8091, + "step": 862 + }, + { + "epoch": 0.15674166231525416, + "grad_norm": 0.7234273552894592, + "learning_rate": 9.61257886028961e-05, + "loss": 1.9421, + "step": 863 + }, + { + "epoch": 0.15692328648943174, + "grad_norm": 0.41061165928840637, + "learning_rate": 9.611436744211891e-05, + "loss": 1.7859, + "step": 864 + }, + { + "epoch": 0.15710491066360932, + "grad_norm": 0.33130714297294617, + "learning_rate": 9.610293015188067e-05, + "loss": 1.7818, + "step": 865 + }, + { + "epoch": 0.1572865348377869, + "grad_norm": 0.36260101199150085, + "learning_rate": 9.609147673618186e-05, + "loss": 1.9246, + "step": 866 + }, + { + "epoch": 0.15746815901196448, + "grad_norm": 0.31886717677116394, + "learning_rate": 9.60800071990285e-05, + "loss": 1.5919, + "step": 867 + }, + { + "epoch": 0.15764978318614206, + "grad_norm": 0.35298117995262146, + "learning_rate": 9.60685215444323e-05, + "loss": 1.7245, + "step": 868 + }, + { + "epoch": 0.15783140736031967, + "grad_norm": 0.7760421633720398, + "learning_rate": 9.605701977641064e-05, + "loss": 1.7379, + "step": 869 + }, + { + "epoch": 0.15801303153449725, + "grad_norm": 0.6306815147399902, + "learning_rate": 9.604550189898648e-05, + "loss": 1.8202, + "step": 870 + }, + { + "epoch": 0.15819465570867483, + "grad_norm": 0.5046470761299133, + "learning_rate": 9.603396791618844e-05, + "loss": 1.8678, + "step": 871 + }, + { + "epoch": 0.1583762798828524, + "grad_norm": 0.4896434545516968, + "learning_rate": 9.602241783205079e-05, + "loss": 1.7891, + "step": 872 + }, + { + "epoch": 0.15855790405703, + "grad_norm": 0.4759124219417572, + "learning_rate": 9.601085165061336e-05, + "loss": 1.8312, + "step": 873 + }, + { + "epoch": 0.15873952823120757, + "grad_norm": 0.42356470227241516, + "learning_rate": 9.599926937592174e-05, + "loss": 1.5614, + "step": 874 + }, + { + "epoch": 0.15892115240538515, + "grad_norm": 0.6849737167358398, + "learning_rate": 9.598767101202702e-05, + "loss": 1.6884, + "step": 875 + }, + { + "epoch": 0.15910277657956273, + "grad_norm": 0.41982701420783997, + "learning_rate": 9.597605656298596e-05, + "loss": 1.7484, + "step": 876 + }, + { + "epoch": 0.1592844007537403, + "grad_norm": 0.48178741335868835, + "learning_rate": 9.596442603286099e-05, + "loss": 1.7342, + "step": 877 + }, + { + "epoch": 0.1594660249279179, + "grad_norm": 0.469974160194397, + "learning_rate": 9.595277942572012e-05, + "loss": 1.8111, + "step": 878 + }, + { + "epoch": 0.1596476491020955, + "grad_norm": 0.5221380591392517, + "learning_rate": 9.594111674563697e-05, + "loss": 1.6604, + "step": 879 + }, + { + "epoch": 0.15982927327627308, + "grad_norm": 0.40796607732772827, + "learning_rate": 9.592943799669085e-05, + "loss": 1.936, + "step": 880 + }, + { + "epoch": 0.16001089745045066, + "grad_norm": 0.9685256481170654, + "learning_rate": 9.591774318296661e-05, + "loss": 2.01, + "step": 881 + }, + { + "epoch": 0.16019252162462824, + "grad_norm": 0.44533684849739075, + "learning_rate": 9.590603230855477e-05, + "loss": 1.7689, + "step": 882 + }, + { + "epoch": 0.16037414579880582, + "grad_norm": 0.4118293821811676, + "learning_rate": 9.589430537755144e-05, + "loss": 1.634, + "step": 883 + }, + { + "epoch": 0.1605557699729834, + "grad_norm": 0.5277968049049377, + "learning_rate": 9.588256239405837e-05, + "loss": 1.6984, + "step": 884 + }, + { + "epoch": 0.16073739414716098, + "grad_norm": 0.5670766234397888, + "learning_rate": 9.587080336218293e-05, + "loss": 1.6354, + "step": 885 + }, + { + "epoch": 0.16091901832133856, + "grad_norm": 0.4087660014629364, + "learning_rate": 9.585902828603804e-05, + "loss": 2.0154, + "step": 886 + }, + { + "epoch": 0.16110064249551614, + "grad_norm": 0.5155079364776611, + "learning_rate": 9.584723716974232e-05, + "loss": 1.8879, + "step": 887 + }, + { + "epoch": 0.16128226666969372, + "grad_norm": 0.5080800652503967, + "learning_rate": 9.583543001741994e-05, + "loss": 1.9369, + "step": 888 + }, + { + "epoch": 0.16146389084387133, + "grad_norm": 0.47774749994277954, + "learning_rate": 9.58236068332007e-05, + "loss": 1.9816, + "step": 889 + }, + { + "epoch": 0.1616455150180489, + "grad_norm": 0.6776544451713562, + "learning_rate": 9.581176762122e-05, + "loss": 1.8104, + "step": 890 + }, + { + "epoch": 0.1618271391922265, + "grad_norm": 0.9659112095832825, + "learning_rate": 9.579991238561887e-05, + "loss": 1.8874, + "step": 891 + }, + { + "epoch": 0.16200876336640407, + "grad_norm": 0.4211593270301819, + "learning_rate": 9.57880411305439e-05, + "loss": 1.8355, + "step": 892 + }, + { + "epoch": 0.16219038754058165, + "grad_norm": 0.4826183021068573, + "learning_rate": 9.577615386014733e-05, + "loss": 1.8173, + "step": 893 + }, + { + "epoch": 0.16237201171475923, + "grad_norm": 1.6897523403167725, + "learning_rate": 9.576425057858697e-05, + "loss": 1.8587, + "step": 894 + }, + { + "epoch": 0.16255363588893681, + "grad_norm": 0.46962425112724304, + "learning_rate": 9.575233129002624e-05, + "loss": 1.9073, + "step": 895 + }, + { + "epoch": 0.1627352600631144, + "grad_norm": 0.3869972229003906, + "learning_rate": 9.574039599863417e-05, + "loss": 1.8241, + "step": 896 + }, + { + "epoch": 0.16291688423729198, + "grad_norm": 0.34141993522644043, + "learning_rate": 9.572844470858537e-05, + "loss": 1.8927, + "step": 897 + }, + { + "epoch": 0.16309850841146956, + "grad_norm": 0.3877248167991638, + "learning_rate": 9.571647742406005e-05, + "loss": 1.8454, + "step": 898 + }, + { + "epoch": 0.16328013258564714, + "grad_norm": 0.48279669880867004, + "learning_rate": 9.570449414924402e-05, + "loss": 1.7141, + "step": 899 + }, + { + "epoch": 0.16346175675982474, + "grad_norm": 0.399410605430603, + "learning_rate": 9.569249488832867e-05, + "loss": 1.6167, + "step": 900 + }, + { + "epoch": 0.16364338093400232, + "grad_norm": 0.4445226788520813, + "learning_rate": 9.568047964551102e-05, + "loss": 1.8086, + "step": 901 + }, + { + "epoch": 0.1638250051081799, + "grad_norm": 0.5148810744285583, + "learning_rate": 9.566844842499361e-05, + "loss": 1.8294, + "step": 902 + }, + { + "epoch": 0.16400662928235749, + "grad_norm": 1.015985131263733, + "learning_rate": 9.565640123098466e-05, + "loss": 1.9008, + "step": 903 + }, + { + "epoch": 0.16418825345653507, + "grad_norm": 0.3741343021392822, + "learning_rate": 9.564433806769788e-05, + "loss": 1.6942, + "step": 904 + }, + { + "epoch": 0.16436987763071265, + "grad_norm": 0.34511032700538635, + "learning_rate": 9.563225893935263e-05, + "loss": 1.6712, + "step": 905 + }, + { + "epoch": 0.16455150180489023, + "grad_norm": 0.405396968126297, + "learning_rate": 9.562016385017385e-05, + "loss": 1.8855, + "step": 906 + }, + { + "epoch": 0.1647331259790678, + "grad_norm": 0.4103500545024872, + "learning_rate": 9.560805280439204e-05, + "loss": 1.8363, + "step": 907 + }, + { + "epoch": 0.1649147501532454, + "grad_norm": 0.5853566527366638, + "learning_rate": 9.559592580624328e-05, + "loss": 1.6602, + "step": 908 + }, + { + "epoch": 0.16509637432742297, + "grad_norm": 0.6188365817070007, + "learning_rate": 9.558378285996926e-05, + "loss": 1.7915, + "step": 909 + }, + { + "epoch": 0.16527799850160058, + "grad_norm": 0.7421554923057556, + "learning_rate": 9.557162396981722e-05, + "loss": 1.9338, + "step": 910 + }, + { + "epoch": 0.16545962267577816, + "grad_norm": 0.42040136456489563, + "learning_rate": 9.555944914003998e-05, + "loss": 1.9374, + "step": 911 + }, + { + "epoch": 0.16564124684995574, + "grad_norm": 0.5066253542900085, + "learning_rate": 9.554725837489594e-05, + "loss": 1.8283, + "step": 912 + }, + { + "epoch": 0.16582287102413332, + "grad_norm": 0.8612716794013977, + "learning_rate": 9.553505167864908e-05, + "loss": 1.9474, + "step": 913 + }, + { + "epoch": 0.1660044951983109, + "grad_norm": 0.37851759791374207, + "learning_rate": 9.552282905556896e-05, + "loss": 1.9031, + "step": 914 + }, + { + "epoch": 0.16618611937248848, + "grad_norm": 0.5981343388557434, + "learning_rate": 9.551059050993065e-05, + "loss": 1.9235, + "step": 915 + }, + { + "epoch": 0.16636774354666606, + "grad_norm": 0.5144421458244324, + "learning_rate": 9.549833604601491e-05, + "loss": 1.987, + "step": 916 + }, + { + "epoch": 0.16654936772084364, + "grad_norm": 0.31909725069999695, + "learning_rate": 9.548606566810791e-05, + "loss": 1.7925, + "step": 917 + }, + { + "epoch": 0.16673099189502122, + "grad_norm": 0.6750073432922363, + "learning_rate": 9.547377938050156e-05, + "loss": 1.9703, + "step": 918 + }, + { + "epoch": 0.1669126160691988, + "grad_norm": 0.4337034225463867, + "learning_rate": 9.546147718749316e-05, + "loss": 1.8325, + "step": 919 + }, + { + "epoch": 0.16709424024337638, + "grad_norm": 0.47028475999832153, + "learning_rate": 9.54491590933857e-05, + "loss": 1.8655, + "step": 920 + }, + { + "epoch": 0.167275864417554, + "grad_norm": 2.025397300720215, + "learning_rate": 9.54368251024877e-05, + "loss": 2.0819, + "step": 921 + }, + { + "epoch": 0.16745748859173157, + "grad_norm": 0.6747413277626038, + "learning_rate": 9.542447521911322e-05, + "loss": 1.7588, + "step": 922 + }, + { + "epoch": 0.16763911276590915, + "grad_norm": 0.4474550485610962, + "learning_rate": 9.541210944758187e-05, + "loss": 1.8973, + "step": 923 + }, + { + "epoch": 0.16782073694008673, + "grad_norm": 0.3815252482891083, + "learning_rate": 9.539972779221886e-05, + "loss": 1.6376, + "step": 924 + }, + { + "epoch": 0.1680023611142643, + "grad_norm": 0.3375750780105591, + "learning_rate": 9.538733025735494e-05, + "loss": 1.8168, + "step": 925 + }, + { + "epoch": 0.1681839852884419, + "grad_norm": 0.452978253364563, + "learning_rate": 9.537491684732636e-05, + "loss": 1.7464, + "step": 926 + }, + { + "epoch": 0.16836560946261947, + "grad_norm": 0.47344791889190674, + "learning_rate": 9.536248756647501e-05, + "loss": 1.9929, + "step": 927 + }, + { + "epoch": 0.16854723363679705, + "grad_norm": 0.5255886316299438, + "learning_rate": 9.535004241914829e-05, + "loss": 1.8068, + "step": 928 + }, + { + "epoch": 0.16872885781097463, + "grad_norm": 0.6484439373016357, + "learning_rate": 9.533758140969912e-05, + "loss": 1.8472, + "step": 929 + }, + { + "epoch": 0.1689104819851522, + "grad_norm": 0.4476175010204315, + "learning_rate": 9.532510454248605e-05, + "loss": 1.8698, + "step": 930 + }, + { + "epoch": 0.16909210615932982, + "grad_norm": 0.3076722025871277, + "learning_rate": 9.531261182187308e-05, + "loss": 1.7273, + "step": 931 + }, + { + "epoch": 0.1692737303335074, + "grad_norm": 0.7686033248901367, + "learning_rate": 9.530010325222979e-05, + "loss": 1.8047, + "step": 932 + }, + { + "epoch": 0.16945535450768498, + "grad_norm": 1.4961011409759521, + "learning_rate": 9.528757883793135e-05, + "loss": 1.8016, + "step": 933 + }, + { + "epoch": 0.16963697868186256, + "grad_norm": 0.3695860803127289, + "learning_rate": 9.527503858335842e-05, + "loss": 1.6325, + "step": 934 + }, + { + "epoch": 0.16981860285604014, + "grad_norm": 0.5530641078948975, + "learning_rate": 9.52624824928972e-05, + "loss": 1.8408, + "step": 935 + }, + { + "epoch": 0.17000022703021772, + "grad_norm": 0.47769695520401, + "learning_rate": 9.524991057093946e-05, + "loss": 1.8619, + "step": 936 + }, + { + "epoch": 0.1701818512043953, + "grad_norm": 0.6717424392700195, + "learning_rate": 9.52373228218825e-05, + "loss": 1.941, + "step": 937 + }, + { + "epoch": 0.17036347537857288, + "grad_norm": 0.4013419449329376, + "learning_rate": 9.522471925012914e-05, + "loss": 1.8584, + "step": 938 + }, + { + "epoch": 0.17054509955275046, + "grad_norm": 0.48477888107299805, + "learning_rate": 9.521209986008772e-05, + "loss": 1.8113, + "step": 939 + }, + { + "epoch": 0.17072672372692804, + "grad_norm": 0.32428058981895447, + "learning_rate": 9.519946465617218e-05, + "loss": 1.7981, + "step": 940 + }, + { + "epoch": 0.17090834790110565, + "grad_norm": 0.547222912311554, + "learning_rate": 9.51868136428019e-05, + "loss": 1.9699, + "step": 941 + }, + { + "epoch": 0.17108997207528323, + "grad_norm": 0.46474048495292664, + "learning_rate": 9.517414682440186e-05, + "loss": 1.8412, + "step": 942 + }, + { + "epoch": 0.1712715962494608, + "grad_norm": 0.5962857007980347, + "learning_rate": 9.516146420540254e-05, + "loss": 1.8689, + "step": 943 + }, + { + "epoch": 0.1714532204236384, + "grad_norm": 0.34335386753082275, + "learning_rate": 9.514876579023994e-05, + "loss": 1.7506, + "step": 944 + }, + { + "epoch": 0.17163484459781597, + "grad_norm": 0.3711894750595093, + "learning_rate": 9.513605158335562e-05, + "loss": 1.7763, + "step": 945 + }, + { + "epoch": 0.17181646877199355, + "grad_norm": 1.3741792440414429, + "learning_rate": 9.512332158919661e-05, + "loss": 2.0378, + "step": 946 + }, + { + "epoch": 0.17199809294617113, + "grad_norm": 0.5785238742828369, + "learning_rate": 9.511057581221552e-05, + "loss": 1.9563, + "step": 947 + }, + { + "epoch": 0.1721797171203487, + "grad_norm": 1.0559364557266235, + "learning_rate": 9.509781425687043e-05, + "loss": 1.7781, + "step": 948 + }, + { + "epoch": 0.1723613412945263, + "grad_norm": 0.38857850432395935, + "learning_rate": 9.508503692762495e-05, + "loss": 1.7495, + "step": 949 + }, + { + "epoch": 0.17254296546870387, + "grad_norm": 0.4824320673942566, + "learning_rate": 9.507224382894826e-05, + "loss": 1.9217, + "step": 950 + }, + { + "epoch": 0.17272458964288145, + "grad_norm": 0.46631962060928345, + "learning_rate": 9.505943496531496e-05, + "loss": 1.9072, + "step": 951 + }, + { + "epoch": 0.17290621381705906, + "grad_norm": 0.6800917387008667, + "learning_rate": 9.504661034120525e-05, + "loss": 1.968, + "step": 952 + }, + { + "epoch": 0.17308783799123664, + "grad_norm": 0.7281967997550964, + "learning_rate": 9.50337699611048e-05, + "loss": 2.0012, + "step": 953 + }, + { + "epoch": 0.17326946216541422, + "grad_norm": 0.26635217666625977, + "learning_rate": 9.502091382950482e-05, + "loss": 1.8218, + "step": 954 + }, + { + "epoch": 0.1734510863395918, + "grad_norm": 0.32601046562194824, + "learning_rate": 9.500804195090198e-05, + "loss": 1.7831, + "step": 955 + }, + { + "epoch": 0.17363271051376938, + "grad_norm": 0.44298943877220154, + "learning_rate": 9.499515432979849e-05, + "loss": 2.0326, + "step": 956 + }, + { + "epoch": 0.17381433468794696, + "grad_norm": 0.4366621971130371, + "learning_rate": 9.498225097070209e-05, + "loss": 1.946, + "step": 957 + }, + { + "epoch": 0.17399595886212454, + "grad_norm": 0.3234955072402954, + "learning_rate": 9.496933187812598e-05, + "loss": 1.8666, + "step": 958 + }, + { + "epoch": 0.17417758303630212, + "grad_norm": 0.527773380279541, + "learning_rate": 9.495639705658888e-05, + "loss": 1.972, + "step": 959 + }, + { + "epoch": 0.1743592072104797, + "grad_norm": 0.47820261120796204, + "learning_rate": 9.494344651061502e-05, + "loss": 1.9658, + "step": 960 + }, + { + "epoch": 0.17454083138465729, + "grad_norm": 0.6668755412101746, + "learning_rate": 9.493048024473412e-05, + "loss": 1.7126, + "step": 961 + }, + { + "epoch": 0.1747224555588349, + "grad_norm": 0.45200446248054504, + "learning_rate": 9.491749826348139e-05, + "loss": 1.7843, + "step": 962 + }, + { + "epoch": 0.17490407973301247, + "grad_norm": 1.8832879066467285, + "learning_rate": 9.490450057139758e-05, + "loss": 2.09, + "step": 963 + }, + { + "epoch": 0.17508570390719005, + "grad_norm": 1.0262620449066162, + "learning_rate": 9.489148717302888e-05, + "loss": 1.9822, + "step": 964 + }, + { + "epoch": 0.17526732808136763, + "grad_norm": 0.34405824542045593, + "learning_rate": 9.487845807292701e-05, + "loss": 1.7749, + "step": 965 + }, + { + "epoch": 0.17544895225554522, + "grad_norm": 0.5030402541160583, + "learning_rate": 9.486541327564916e-05, + "loss": 1.9306, + "step": 966 + }, + { + "epoch": 0.1756305764297228, + "grad_norm": 0.5356628894805908, + "learning_rate": 9.485235278575801e-05, + "loss": 1.8365, + "step": 967 + }, + { + "epoch": 0.17581220060390038, + "grad_norm": 0.5114620923995972, + "learning_rate": 9.483927660782176e-05, + "loss": 1.7721, + "step": 968 + }, + { + "epoch": 0.17599382477807796, + "grad_norm": 0.48758184909820557, + "learning_rate": 9.482618474641407e-05, + "loss": 1.6607, + "step": 969 + }, + { + "epoch": 0.17617544895225554, + "grad_norm": 0.45499250292778015, + "learning_rate": 9.481307720611408e-05, + "loss": 1.8592, + "step": 970 + }, + { + "epoch": 0.17635707312643312, + "grad_norm": 0.4692058265209198, + "learning_rate": 9.479995399150644e-05, + "loss": 1.9149, + "step": 971 + }, + { + "epoch": 0.1765386973006107, + "grad_norm": 0.6165156364440918, + "learning_rate": 9.478681510718124e-05, + "loss": 1.8526, + "step": 972 + }, + { + "epoch": 0.1767203214747883, + "grad_norm": 0.8035709261894226, + "learning_rate": 9.477366055773412e-05, + "loss": 1.8805, + "step": 973 + }, + { + "epoch": 0.17690194564896589, + "grad_norm": 0.5022410750389099, + "learning_rate": 9.476049034776613e-05, + "loss": 1.7292, + "step": 974 + }, + { + "epoch": 0.17708356982314347, + "grad_norm": 0.42469725012779236, + "learning_rate": 9.474730448188383e-05, + "loss": 1.8942, + "step": 975 + }, + { + "epoch": 0.17726519399732105, + "grad_norm": 1.3032971620559692, + "learning_rate": 9.473410296469924e-05, + "loss": 1.5477, + "step": 976 + }, + { + "epoch": 0.17744681817149863, + "grad_norm": 0.5435225963592529, + "learning_rate": 9.47208858008299e-05, + "loss": 1.7255, + "step": 977 + }, + { + "epoch": 0.1776284423456762, + "grad_norm": 0.4849220812320709, + "learning_rate": 9.470765299489877e-05, + "loss": 1.8483, + "step": 978 + }, + { + "epoch": 0.1778100665198538, + "grad_norm": 0.4345886707305908, + "learning_rate": 9.469440455153429e-05, + "loss": 1.6656, + "step": 979 + }, + { + "epoch": 0.17799169069403137, + "grad_norm": 0.7653133273124695, + "learning_rate": 9.468114047537039e-05, + "loss": 2.0267, + "step": 980 + }, + { + "epoch": 0.17817331486820895, + "grad_norm": 0.45317980647087097, + "learning_rate": 9.466786077104646e-05, + "loss": 1.8438, + "step": 981 + }, + { + "epoch": 0.17835493904238653, + "grad_norm": 0.3719966411590576, + "learning_rate": 9.465456544320733e-05, + "loss": 1.8779, + "step": 982 + }, + { + "epoch": 0.17853656321656414, + "grad_norm": 1.0564510822296143, + "learning_rate": 9.464125449650334e-05, + "loss": 1.8634, + "step": 983 + }, + { + "epoch": 0.17871818739074172, + "grad_norm": 0.46412840485572815, + "learning_rate": 9.462792793559028e-05, + "loss": 1.6395, + "step": 984 + }, + { + "epoch": 0.1788998115649193, + "grad_norm": 0.38892626762390137, + "learning_rate": 9.461458576512935e-05, + "loss": 1.8724, + "step": 985 + }, + { + "epoch": 0.17908143573909688, + "grad_norm": 0.46058911085128784, + "learning_rate": 9.460122798978731e-05, + "loss": 2.0241, + "step": 986 + }, + { + "epoch": 0.17926305991327446, + "grad_norm": 0.45922932028770447, + "learning_rate": 9.458785461423628e-05, + "loss": 1.8477, + "step": 987 + }, + { + "epoch": 0.17944468408745204, + "grad_norm": 0.4526157081127167, + "learning_rate": 9.457446564315388e-05, + "loss": 1.9376, + "step": 988 + }, + { + "epoch": 0.17962630826162962, + "grad_norm": 0.4078746438026428, + "learning_rate": 9.45610610812232e-05, + "loss": 1.9312, + "step": 989 + }, + { + "epoch": 0.1798079324358072, + "grad_norm": 0.4951235353946686, + "learning_rate": 9.454764093313275e-05, + "loss": 1.8202, + "step": 990 + }, + { + "epoch": 0.17998955660998478, + "grad_norm": 0.5562660694122314, + "learning_rate": 9.453420520357652e-05, + "loss": 1.9217, + "step": 991 + }, + { + "epoch": 0.18017118078416236, + "grad_norm": 0.4128775894641876, + "learning_rate": 9.452075389725392e-05, + "loss": 1.9405, + "step": 992 + }, + { + "epoch": 0.18035280495833997, + "grad_norm": 0.34307560324668884, + "learning_rate": 9.450728701886983e-05, + "loss": 1.8073, + "step": 993 + }, + { + "epoch": 0.18053442913251755, + "grad_norm": 0.46063369512557983, + "learning_rate": 9.449380457313458e-05, + "loss": 1.9199, + "step": 994 + }, + { + "epoch": 0.18071605330669513, + "grad_norm": 0.41959720849990845, + "learning_rate": 9.448030656476392e-05, + "loss": 1.8779, + "step": 995 + }, + { + "epoch": 0.1808976774808727, + "grad_norm": 1.8200995922088623, + "learning_rate": 9.446679299847908e-05, + "loss": 2.0139, + "step": 996 + }, + { + "epoch": 0.1810793016550503, + "grad_norm": 0.4536880552768707, + "learning_rate": 9.44532638790067e-05, + "loss": 1.8742, + "step": 997 + }, + { + "epoch": 0.18126092582922787, + "grad_norm": 1.0376689434051514, + "learning_rate": 9.443971921107886e-05, + "loss": 1.7954, + "step": 998 + }, + { + "epoch": 0.18144255000340545, + "grad_norm": 0.6142174005508423, + "learning_rate": 9.44261589994331e-05, + "loss": 1.7901, + "step": 999 + }, + { + "epoch": 0.18162417417758303, + "grad_norm": 0.36804619431495667, + "learning_rate": 9.441258324881241e-05, + "loss": 1.8166, + "step": 1000 + }, + { + "epoch": 0.1818057983517606, + "grad_norm": 0.5921896696090698, + "learning_rate": 9.439899196396515e-05, + "loss": 1.878, + "step": 1001 + }, + { + "epoch": 0.1819874225259382, + "grad_norm": 0.6065016388893127, + "learning_rate": 9.43853851496452e-05, + "loss": 1.8131, + "step": 1002 + }, + { + "epoch": 0.18216904670011577, + "grad_norm": 0.4648057818412781, + "learning_rate": 9.437176281061179e-05, + "loss": 1.7545, + "step": 1003 + }, + { + "epoch": 0.18235067087429338, + "grad_norm": 0.7126734256744385, + "learning_rate": 9.435812495162962e-05, + "loss": 1.7749, + "step": 1004 + }, + { + "epoch": 0.18253229504847096, + "grad_norm": 0.6523189544677734, + "learning_rate": 9.434447157746884e-05, + "loss": 1.7227, + "step": 1005 + }, + { + "epoch": 0.18271391922264854, + "grad_norm": 0.9972248077392578, + "learning_rate": 9.433080269290497e-05, + "loss": 1.7569, + "step": 1006 + }, + { + "epoch": 0.18289554339682612, + "grad_norm": 0.5643760561943054, + "learning_rate": 9.4317118302719e-05, + "loss": 1.9752, + "step": 1007 + }, + { + "epoch": 0.1830771675710037, + "grad_norm": 0.3208830654621124, + "learning_rate": 9.430341841169736e-05, + "loss": 1.8633, + "step": 1008 + }, + { + "epoch": 0.18325879174518128, + "grad_norm": 0.6847470998764038, + "learning_rate": 9.428970302463185e-05, + "loss": 1.8746, + "step": 1009 + }, + { + "epoch": 0.18344041591935886, + "grad_norm": 0.6316227912902832, + "learning_rate": 9.427597214631969e-05, + "loss": 1.7985, + "step": 1010 + }, + { + "epoch": 0.18362204009353644, + "grad_norm": 0.548018753528595, + "learning_rate": 9.426222578156356e-05, + "loss": 1.7727, + "step": 1011 + }, + { + "epoch": 0.18380366426771402, + "grad_norm": 0.4175761938095093, + "learning_rate": 9.424846393517155e-05, + "loss": 1.9415, + "step": 1012 + }, + { + "epoch": 0.1839852884418916, + "grad_norm": 0.4125344157218933, + "learning_rate": 9.423468661195713e-05, + "loss": 1.9009, + "step": 1013 + }, + { + "epoch": 0.1841669126160692, + "grad_norm": 0.8531230688095093, + "learning_rate": 9.422089381673923e-05, + "loss": 1.7222, + "step": 1014 + }, + { + "epoch": 0.1843485367902468, + "grad_norm": 1.213919758796692, + "learning_rate": 9.420708555434215e-05, + "loss": 1.8022, + "step": 1015 + }, + { + "epoch": 0.18453016096442437, + "grad_norm": 0.44588109850883484, + "learning_rate": 9.41932618295956e-05, + "loss": 1.8059, + "step": 1016 + }, + { + "epoch": 0.18471178513860195, + "grad_norm": 0.6506812572479248, + "learning_rate": 9.417942264733477e-05, + "loss": 1.8746, + "step": 1017 + }, + { + "epoch": 0.18489340931277953, + "grad_norm": 0.3283126950263977, + "learning_rate": 9.416556801240015e-05, + "loss": 1.7875, + "step": 1018 + }, + { + "epoch": 0.1850750334869571, + "grad_norm": 0.4236765205860138, + "learning_rate": 9.415169792963772e-05, + "loss": 1.9203, + "step": 1019 + }, + { + "epoch": 0.1852566576611347, + "grad_norm": 0.5342426300048828, + "learning_rate": 9.41378124038988e-05, + "loss": 2.0594, + "step": 1020 + }, + { + "epoch": 0.18543828183531227, + "grad_norm": 1.1219714879989624, + "learning_rate": 9.412391144004017e-05, + "loss": 1.8785, + "step": 1021 + }, + { + "epoch": 0.18561990600948985, + "grad_norm": 0.43214836716651917, + "learning_rate": 9.410999504292397e-05, + "loss": 1.6654, + "step": 1022 + }, + { + "epoch": 0.18580153018366743, + "grad_norm": 0.37878844141960144, + "learning_rate": 9.409606321741775e-05, + "loss": 1.8269, + "step": 1023 + }, + { + "epoch": 0.18598315435784504, + "grad_norm": 0.5087562799453735, + "learning_rate": 9.408211596839447e-05, + "loss": 1.965, + "step": 1024 + }, + { + "epoch": 0.18616477853202262, + "grad_norm": 0.4644775986671448, + "learning_rate": 9.406815330073244e-05, + "loss": 1.6623, + "step": 1025 + }, + { + "epoch": 0.1863464027062002, + "grad_norm": 0.5558162927627563, + "learning_rate": 9.405417521931543e-05, + "loss": 1.9202, + "step": 1026 + }, + { + "epoch": 0.18652802688037778, + "grad_norm": 0.37288567423820496, + "learning_rate": 9.404018172903254e-05, + "loss": 1.8772, + "step": 1027 + }, + { + "epoch": 0.18670965105455536, + "grad_norm": 0.45209193229675293, + "learning_rate": 9.402617283477829e-05, + "loss": 1.8162, + "step": 1028 + }, + { + "epoch": 0.18689127522873294, + "grad_norm": 0.5454465746879578, + "learning_rate": 9.40121485414526e-05, + "loss": 1.9186, + "step": 1029 + }, + { + "epoch": 0.18707289940291053, + "grad_norm": 0.9245274662971497, + "learning_rate": 9.399810885396072e-05, + "loss": 1.9671, + "step": 1030 + }, + { + "epoch": 0.1872545235770881, + "grad_norm": 0.6407060623168945, + "learning_rate": 9.398405377721338e-05, + "loss": 1.8057, + "step": 1031 + }, + { + "epoch": 0.18743614775126569, + "grad_norm": 0.4011857211589813, + "learning_rate": 9.396998331612657e-05, + "loss": 1.79, + "step": 1032 + }, + { + "epoch": 0.18761777192544327, + "grad_norm": 0.36899104714393616, + "learning_rate": 9.395589747562178e-05, + "loss": 1.8757, + "step": 1033 + }, + { + "epoch": 0.18779939609962085, + "grad_norm": 0.39424315094947815, + "learning_rate": 9.394179626062581e-05, + "loss": 1.8454, + "step": 1034 + }, + { + "epoch": 0.18798102027379845, + "grad_norm": 0.6563200950622559, + "learning_rate": 9.392767967607083e-05, + "loss": 1.7474, + "step": 1035 + }, + { + "epoch": 0.18816264444797604, + "grad_norm": 0.61153644323349, + "learning_rate": 9.391354772689445e-05, + "loss": 1.9002, + "step": 1036 + }, + { + "epoch": 0.18834426862215362, + "grad_norm": 0.4084111750125885, + "learning_rate": 9.389940041803959e-05, + "loss": 1.7591, + "step": 1037 + }, + { + "epoch": 0.1885258927963312, + "grad_norm": 0.7175431847572327, + "learning_rate": 9.388523775445457e-05, + "loss": 1.8888, + "step": 1038 + }, + { + "epoch": 0.18870751697050878, + "grad_norm": 0.8773576617240906, + "learning_rate": 9.387105974109306e-05, + "loss": 1.8434, + "step": 1039 + }, + { + "epoch": 0.18888914114468636, + "grad_norm": 0.4064632058143616, + "learning_rate": 9.385686638291417e-05, + "loss": 1.8648, + "step": 1040 + }, + { + "epoch": 0.18907076531886394, + "grad_norm": 0.3635701835155487, + "learning_rate": 9.384265768488225e-05, + "loss": 1.6632, + "step": 1041 + }, + { + "epoch": 0.18925238949304152, + "grad_norm": 0.35890886187553406, + "learning_rate": 9.382843365196716e-05, + "loss": 1.7616, + "step": 1042 + }, + { + "epoch": 0.1894340136672191, + "grad_norm": 0.4231317639350891, + "learning_rate": 9.381419428914397e-05, + "loss": 1.8131, + "step": 1043 + }, + { + "epoch": 0.18961563784139668, + "grad_norm": 0.48375535011291504, + "learning_rate": 9.379993960139327e-05, + "loss": 1.8629, + "step": 1044 + }, + { + "epoch": 0.1897972620155743, + "grad_norm": 0.44520241022109985, + "learning_rate": 9.37856695937009e-05, + "loss": 2.0289, + "step": 1045 + }, + { + "epoch": 0.18997888618975187, + "grad_norm": 0.41420361399650574, + "learning_rate": 9.37713842710581e-05, + "loss": 1.7706, + "step": 1046 + }, + { + "epoch": 0.19016051036392945, + "grad_norm": 0.6058051586151123, + "learning_rate": 9.375708363846145e-05, + "loss": 2.0743, + "step": 1047 + }, + { + "epoch": 0.19034213453810703, + "grad_norm": 0.6832868456840515, + "learning_rate": 9.374276770091289e-05, + "loss": 1.9056, + "step": 1048 + }, + { + "epoch": 0.1905237587122846, + "grad_norm": 0.3501856029033661, + "learning_rate": 9.372843646341974e-05, + "loss": 1.8838, + "step": 1049 + }, + { + "epoch": 0.1907053828864622, + "grad_norm": 0.6183803081512451, + "learning_rate": 9.371408993099464e-05, + "loss": 1.8907, + "step": 1050 + }, + { + "epoch": 0.19088700706063977, + "grad_norm": 1.2447444200515747, + "learning_rate": 9.369972810865557e-05, + "loss": 1.781, + "step": 1051 + }, + { + "epoch": 0.19106863123481735, + "grad_norm": 0.4630773365497589, + "learning_rate": 9.36853510014259e-05, + "loss": 1.8797, + "step": 1052 + }, + { + "epoch": 0.19125025540899493, + "grad_norm": 0.45398128032684326, + "learning_rate": 9.367095861433433e-05, + "loss": 1.7944, + "step": 1053 + }, + { + "epoch": 0.1914318795831725, + "grad_norm": 0.3916375935077667, + "learning_rate": 9.365655095241486e-05, + "loss": 1.7752, + "step": 1054 + }, + { + "epoch": 0.1916135037573501, + "grad_norm": 0.3886646032333374, + "learning_rate": 9.364212802070689e-05, + "loss": 1.6843, + "step": 1055 + }, + { + "epoch": 0.1917951279315277, + "grad_norm": 0.4763382375240326, + "learning_rate": 9.362768982425515e-05, + "loss": 1.7247, + "step": 1056 + }, + { + "epoch": 0.19197675210570528, + "grad_norm": 0.43076983094215393, + "learning_rate": 9.36132363681097e-05, + "loss": 1.7816, + "step": 1057 + }, + { + "epoch": 0.19215837627988286, + "grad_norm": 0.3599191904067993, + "learning_rate": 9.359876765732591e-05, + "loss": 1.847, + "step": 1058 + }, + { + "epoch": 0.19234000045406044, + "grad_norm": 0.5085789561271667, + "learning_rate": 9.358428369696457e-05, + "loss": 1.8556, + "step": 1059 + }, + { + "epoch": 0.19252162462823802, + "grad_norm": 0.4748426377773285, + "learning_rate": 9.356978449209167e-05, + "loss": 1.8244, + "step": 1060 + }, + { + "epoch": 0.1927032488024156, + "grad_norm": 1.909062147140503, + "learning_rate": 9.355527004777868e-05, + "loss": 1.8255, + "step": 1061 + }, + { + "epoch": 0.19288487297659318, + "grad_norm": 0.4526641368865967, + "learning_rate": 9.354074036910228e-05, + "loss": 1.8213, + "step": 1062 + }, + { + "epoch": 0.19306649715077076, + "grad_norm": 0.357756644487381, + "learning_rate": 9.352619546114456e-05, + "loss": 1.6179, + "step": 1063 + }, + { + "epoch": 0.19324812132494834, + "grad_norm": 0.4681167006492615, + "learning_rate": 9.351163532899287e-05, + "loss": 1.8496, + "step": 1064 + }, + { + "epoch": 0.19342974549912592, + "grad_norm": 0.33375781774520874, + "learning_rate": 9.349705997773997e-05, + "loss": 1.8009, + "step": 1065 + }, + { + "epoch": 0.19361136967330353, + "grad_norm": 0.42066332697868347, + "learning_rate": 9.348246941248384e-05, + "loss": 1.6782, + "step": 1066 + }, + { + "epoch": 0.1937929938474811, + "grad_norm": 0.7040932774543762, + "learning_rate": 9.346786363832788e-05, + "loss": 1.8747, + "step": 1067 + }, + { + "epoch": 0.1939746180216587, + "grad_norm": 0.4969865679740906, + "learning_rate": 9.345324266038074e-05, + "loss": 1.7512, + "step": 1068 + }, + { + "epoch": 0.19415624219583627, + "grad_norm": 0.38950279355049133, + "learning_rate": 9.34386064837564e-05, + "loss": 1.6321, + "step": 1069 + }, + { + "epoch": 0.19433786637001385, + "grad_norm": 0.7214657068252563, + "learning_rate": 9.342395511357418e-05, + "loss": 2.1113, + "step": 1070 + }, + { + "epoch": 0.19451949054419143, + "grad_norm": 0.42650076746940613, + "learning_rate": 9.340928855495872e-05, + "loss": 1.8698, + "step": 1071 + }, + { + "epoch": 0.194701114718369, + "grad_norm": 0.5472752451896667, + "learning_rate": 9.339460681303991e-05, + "loss": 1.798, + "step": 1072 + }, + { + "epoch": 0.1948827388925466, + "grad_norm": 0.5318459868431091, + "learning_rate": 9.337990989295306e-05, + "loss": 1.9489, + "step": 1073 + }, + { + "epoch": 0.19506436306672417, + "grad_norm": 0.4898572564125061, + "learning_rate": 9.336519779983867e-05, + "loss": 1.9164, + "step": 1074 + }, + { + "epoch": 0.19524598724090175, + "grad_norm": 0.7074351906776428, + "learning_rate": 9.335047053884261e-05, + "loss": 1.6992, + "step": 1075 + }, + { + "epoch": 0.19542761141507936, + "grad_norm": 0.34628209471702576, + "learning_rate": 9.333572811511608e-05, + "loss": 1.9169, + "step": 1076 + }, + { + "epoch": 0.19560923558925694, + "grad_norm": 0.4935842454433441, + "learning_rate": 9.33209705338155e-05, + "loss": 1.8374, + "step": 1077 + }, + { + "epoch": 0.19579085976343452, + "grad_norm": 0.4069223701953888, + "learning_rate": 9.330619780010268e-05, + "loss": 1.8123, + "step": 1078 + }, + { + "epoch": 0.1959724839376121, + "grad_norm": 0.34681081771850586, + "learning_rate": 9.329140991914467e-05, + "loss": 1.8658, + "step": 1079 + }, + { + "epoch": 0.19615410811178968, + "grad_norm": 0.5117766857147217, + "learning_rate": 9.327660689611386e-05, + "loss": 1.7999, + "step": 1080 + }, + { + "epoch": 0.19633573228596726, + "grad_norm": 0.44824445247650146, + "learning_rate": 9.32617887361879e-05, + "loss": 1.8599, + "step": 1081 + }, + { + "epoch": 0.19651735646014484, + "grad_norm": 0.4067938029766083, + "learning_rate": 9.324695544454974e-05, + "loss": 1.8745, + "step": 1082 + }, + { + "epoch": 0.19669898063432242, + "grad_norm": 0.41253358125686646, + "learning_rate": 9.323210702638766e-05, + "loss": 1.7731, + "step": 1083 + }, + { + "epoch": 0.1968806048085, + "grad_norm": 0.5918768048286438, + "learning_rate": 9.32172434868952e-05, + "loss": 1.8372, + "step": 1084 + }, + { + "epoch": 0.19706222898267758, + "grad_norm": 0.45955726504325867, + "learning_rate": 9.320236483127116e-05, + "loss": 1.8135, + "step": 1085 + }, + { + "epoch": 0.19724385315685516, + "grad_norm": 0.5816416144371033, + "learning_rate": 9.31874710647197e-05, + "loss": 1.951, + "step": 1086 + }, + { + "epoch": 0.19742547733103277, + "grad_norm": 0.3051910102367401, + "learning_rate": 9.31725621924502e-05, + "loss": 1.7796, + "step": 1087 + }, + { + "epoch": 0.19760710150521035, + "grad_norm": 0.4630873501300812, + "learning_rate": 9.315763821967736e-05, + "loss": 1.9826, + "step": 1088 + }, + { + "epoch": 0.19778872567938793, + "grad_norm": 0.3847540318965912, + "learning_rate": 9.314269915162114e-05, + "loss": 1.7959, + "step": 1089 + }, + { + "epoch": 0.1979703498535655, + "grad_norm": 0.3900764286518097, + "learning_rate": 9.312774499350682e-05, + "loss": 1.9418, + "step": 1090 + }, + { + "epoch": 0.1981519740277431, + "grad_norm": 0.8319585919380188, + "learning_rate": 9.311277575056489e-05, + "loss": 1.6176, + "step": 1091 + }, + { + "epoch": 0.19833359820192067, + "grad_norm": 1.2933127880096436, + "learning_rate": 9.309779142803116e-05, + "loss": 1.9365, + "step": 1092 + }, + { + "epoch": 0.19851522237609825, + "grad_norm": 0.9875990152359009, + "learning_rate": 9.308279203114674e-05, + "loss": 1.8654, + "step": 1093 + }, + { + "epoch": 0.19869684655027584, + "grad_norm": 0.3539016842842102, + "learning_rate": 9.306777756515795e-05, + "loss": 1.7664, + "step": 1094 + }, + { + "epoch": 0.19887847072445342, + "grad_norm": 1.098407506942749, + "learning_rate": 9.305274803531643e-05, + "loss": 1.6632, + "step": 1095 + }, + { + "epoch": 0.199060094898631, + "grad_norm": 0.38691574335098267, + "learning_rate": 9.303770344687906e-05, + "loss": 1.9799, + "step": 1096 + }, + { + "epoch": 0.1992417190728086, + "grad_norm": 0.5562824606895447, + "learning_rate": 9.302264380510801e-05, + "loss": 1.8481, + "step": 1097 + }, + { + "epoch": 0.19942334324698618, + "grad_norm": 0.3759826719760895, + "learning_rate": 9.30075691152707e-05, + "loss": 1.9244, + "step": 1098 + }, + { + "epoch": 0.19960496742116376, + "grad_norm": 0.5436197519302368, + "learning_rate": 9.29924793826398e-05, + "loss": 1.9693, + "step": 1099 + }, + { + "epoch": 0.19978659159534135, + "grad_norm": 0.5110308527946472, + "learning_rate": 9.297737461249329e-05, + "loss": 1.6382, + "step": 1100 + }, + { + "epoch": 0.19996821576951893, + "grad_norm": 0.6337760090827942, + "learning_rate": 9.296225481011436e-05, + "loss": 1.8621, + "step": 1101 + }, + { + "epoch": 0.2001498399436965, + "grad_norm": 0.42948660254478455, + "learning_rate": 9.294711998079146e-05, + "loss": 1.6966, + "step": 1102 + }, + { + "epoch": 0.2003314641178741, + "grad_norm": 0.44947415590286255, + "learning_rate": 9.293197012981834e-05, + "loss": 1.8621, + "step": 1103 + }, + { + "epoch": 0.20051308829205167, + "grad_norm": 0.3406376540660858, + "learning_rate": 9.291680526249396e-05, + "loss": 1.5984, + "step": 1104 + }, + { + "epoch": 0.20069471246622925, + "grad_norm": 0.4128546118736267, + "learning_rate": 9.290162538412256e-05, + "loss": 1.8673, + "step": 1105 + }, + { + "epoch": 0.20087633664040683, + "grad_norm": 1.5366406440734863, + "learning_rate": 9.288643050001361e-05, + "loss": 1.8021, + "step": 1106 + }, + { + "epoch": 0.2010579608145844, + "grad_norm": 0.4825185239315033, + "learning_rate": 9.287122061548184e-05, + "loss": 1.8772, + "step": 1107 + }, + { + "epoch": 0.20123958498876202, + "grad_norm": 1.675241231918335, + "learning_rate": 9.285599573584723e-05, + "loss": 1.8914, + "step": 1108 + }, + { + "epoch": 0.2014212091629396, + "grad_norm": 0.4106290340423584, + "learning_rate": 9.284075586643497e-05, + "loss": 1.8219, + "step": 1109 + }, + { + "epoch": 0.20160283333711718, + "grad_norm": 0.3895004689693451, + "learning_rate": 9.282550101257556e-05, + "loss": 1.7417, + "step": 1110 + }, + { + "epoch": 0.20178445751129476, + "grad_norm": 0.40775299072265625, + "learning_rate": 9.281023117960468e-05, + "loss": 1.7149, + "step": 1111 + }, + { + "epoch": 0.20196608168547234, + "grad_norm": 0.4186157286167145, + "learning_rate": 9.27949463728633e-05, + "loss": 1.7772, + "step": 1112 + }, + { + "epoch": 0.20214770585964992, + "grad_norm": 0.5769818425178528, + "learning_rate": 9.277964659769756e-05, + "loss": 1.7219, + "step": 1113 + }, + { + "epoch": 0.2023293300338275, + "grad_norm": 1.1607717275619507, + "learning_rate": 9.27643318594589e-05, + "loss": 2.079, + "step": 1114 + }, + { + "epoch": 0.20251095420800508, + "grad_norm": 0.42951518297195435, + "learning_rate": 9.274900216350396e-05, + "loss": 1.8583, + "step": 1115 + }, + { + "epoch": 0.20269257838218266, + "grad_norm": 0.558089554309845, + "learning_rate": 9.273365751519463e-05, + "loss": 1.923, + "step": 1116 + }, + { + "epoch": 0.20287420255636024, + "grad_norm": 0.39736244082450867, + "learning_rate": 9.271829791989801e-05, + "loss": 1.7313, + "step": 1117 + }, + { + "epoch": 0.20305582673053785, + "grad_norm": 0.36517056822776794, + "learning_rate": 9.270292338298645e-05, + "loss": 1.6769, + "step": 1118 + }, + { + "epoch": 0.20323745090471543, + "grad_norm": 0.6462035775184631, + "learning_rate": 9.26875339098375e-05, + "loss": 1.8008, + "step": 1119 + }, + { + "epoch": 0.203419075078893, + "grad_norm": 0.39897534251213074, + "learning_rate": 9.267212950583396e-05, + "loss": 1.8163, + "step": 1120 + }, + { + "epoch": 0.2036006992530706, + "grad_norm": 0.41678476333618164, + "learning_rate": 9.265671017636383e-05, + "loss": 1.7376, + "step": 1121 + }, + { + "epoch": 0.20378232342724817, + "grad_norm": 0.4451177716255188, + "learning_rate": 9.264127592682037e-05, + "loss": 1.6901, + "step": 1122 + }, + { + "epoch": 0.20396394760142575, + "grad_norm": 0.36556610465049744, + "learning_rate": 9.2625826762602e-05, + "loss": 1.9191, + "step": 1123 + }, + { + "epoch": 0.20414557177560333, + "grad_norm": 0.3452771306037903, + "learning_rate": 9.26103626891124e-05, + "loss": 1.6324, + "step": 1124 + }, + { + "epoch": 0.2043271959497809, + "grad_norm": 0.4905783534049988, + "learning_rate": 9.259488371176044e-05, + "loss": 1.8516, + "step": 1125 + }, + { + "epoch": 0.2045088201239585, + "grad_norm": 0.5711867809295654, + "learning_rate": 9.257938983596023e-05, + "loss": 1.7144, + "step": 1126 + }, + { + "epoch": 0.20469044429813607, + "grad_norm": 0.4529739320278168, + "learning_rate": 9.256388106713108e-05, + "loss": 1.8712, + "step": 1127 + }, + { + "epoch": 0.20487206847231368, + "grad_norm": 0.44811761379241943, + "learning_rate": 9.254835741069747e-05, + "loss": 1.7759, + "step": 1128 + }, + { + "epoch": 0.20505369264649126, + "grad_norm": 0.42048779129981995, + "learning_rate": 9.253281887208918e-05, + "loss": 1.6816, + "step": 1129 + }, + { + "epoch": 0.20523531682066884, + "grad_norm": 0.59731525182724, + "learning_rate": 9.251726545674108e-05, + "loss": 1.7628, + "step": 1130 + }, + { + "epoch": 0.20541694099484642, + "grad_norm": 0.3691215515136719, + "learning_rate": 9.250169717009334e-05, + "loss": 1.9319, + "step": 1131 + }, + { + "epoch": 0.205598565169024, + "grad_norm": 0.3778510093688965, + "learning_rate": 9.248611401759129e-05, + "loss": 2.0436, + "step": 1132 + }, + { + "epoch": 0.20578018934320158, + "grad_norm": 0.6171512007713318, + "learning_rate": 9.247051600468542e-05, + "loss": 2.0343, + "step": 1133 + }, + { + "epoch": 0.20596181351737916, + "grad_norm": 0.40991804003715515, + "learning_rate": 9.245490313683152e-05, + "loss": 1.7485, + "step": 1134 + }, + { + "epoch": 0.20614343769155674, + "grad_norm": 0.676139771938324, + "learning_rate": 9.243927541949046e-05, + "loss": 1.8931, + "step": 1135 + }, + { + "epoch": 0.20632506186573432, + "grad_norm": 0.3919764459133148, + "learning_rate": 9.242363285812842e-05, + "loss": 1.9253, + "step": 1136 + }, + { + "epoch": 0.2065066860399119, + "grad_norm": 0.4862998425960541, + "learning_rate": 9.240797545821667e-05, + "loss": 1.8735, + "step": 1137 + }, + { + "epoch": 0.20668831021408948, + "grad_norm": 0.3826025426387787, + "learning_rate": 9.239230322523171e-05, + "loss": 1.8665, + "step": 1138 + }, + { + "epoch": 0.2068699343882671, + "grad_norm": 0.4867055118083954, + "learning_rate": 9.237661616465525e-05, + "loss": 1.7689, + "step": 1139 + }, + { + "epoch": 0.20705155856244467, + "grad_norm": 0.43172499537467957, + "learning_rate": 9.236091428197415e-05, + "loss": 1.8634, + "step": 1140 + }, + { + "epoch": 0.20723318273662225, + "grad_norm": 0.3795762062072754, + "learning_rate": 9.234519758268049e-05, + "loss": 1.8223, + "step": 1141 + }, + { + "epoch": 0.20741480691079983, + "grad_norm": 0.611106812953949, + "learning_rate": 9.232946607227149e-05, + "loss": 1.7779, + "step": 1142 + }, + { + "epoch": 0.2075964310849774, + "grad_norm": 0.4657542407512665, + "learning_rate": 9.231371975624959e-05, + "loss": 1.8831, + "step": 1143 + }, + { + "epoch": 0.207778055259155, + "grad_norm": 0.3946947455406189, + "learning_rate": 9.229795864012239e-05, + "loss": 1.4365, + "step": 1144 + }, + { + "epoch": 0.20795967943333257, + "grad_norm": 0.5791359543800354, + "learning_rate": 9.228218272940265e-05, + "loss": 1.7069, + "step": 1145 + }, + { + "epoch": 0.20814130360751015, + "grad_norm": 0.4610961973667145, + "learning_rate": 9.226639202960836e-05, + "loss": 1.8215, + "step": 1146 + }, + { + "epoch": 0.20832292778168773, + "grad_norm": 0.41314300894737244, + "learning_rate": 9.225058654626263e-05, + "loss": 1.6886, + "step": 1147 + }, + { + "epoch": 0.2085045519558653, + "grad_norm": 0.4235198199748993, + "learning_rate": 9.223476628489373e-05, + "loss": 1.7021, + "step": 1148 + }, + { + "epoch": 0.20868617613004292, + "grad_norm": 0.5403158068656921, + "learning_rate": 9.221893125103518e-05, + "loss": 1.7773, + "step": 1149 + }, + { + "epoch": 0.2088678003042205, + "grad_norm": 0.4853987395763397, + "learning_rate": 9.220308145022556e-05, + "loss": 1.8314, + "step": 1150 + }, + { + "epoch": 0.20904942447839808, + "grad_norm": 1.149945855140686, + "learning_rate": 9.218721688800868e-05, + "loss": 2.0415, + "step": 1151 + }, + { + "epoch": 0.20923104865257566, + "grad_norm": 0.6981385350227356, + "learning_rate": 9.217133756993355e-05, + "loss": 1.7763, + "step": 1152 + }, + { + "epoch": 0.20941267282675324, + "grad_norm": 0.4831432104110718, + "learning_rate": 9.215544350155422e-05, + "loss": 1.8266, + "step": 1153 + }, + { + "epoch": 0.20959429700093082, + "grad_norm": 0.4029510021209717, + "learning_rate": 9.213953468843001e-05, + "loss": 1.9793, + "step": 1154 + }, + { + "epoch": 0.2097759211751084, + "grad_norm": 0.5245060920715332, + "learning_rate": 9.212361113612537e-05, + "loss": 1.6279, + "step": 1155 + }, + { + "epoch": 0.20995754534928598, + "grad_norm": 0.6977173686027527, + "learning_rate": 9.210767285020987e-05, + "loss": 1.8116, + "step": 1156 + }, + { + "epoch": 0.21013916952346356, + "grad_norm": 0.4026670455932617, + "learning_rate": 9.209171983625828e-05, + "loss": 1.8252, + "step": 1157 + }, + { + "epoch": 0.21032079369764115, + "grad_norm": 0.4475344717502594, + "learning_rate": 9.207575209985046e-05, + "loss": 1.9209, + "step": 1158 + }, + { + "epoch": 0.21050241787181873, + "grad_norm": 0.5432537198066711, + "learning_rate": 9.20597696465715e-05, + "loss": 1.8895, + "step": 1159 + }, + { + "epoch": 0.21068404204599633, + "grad_norm": 0.5444387197494507, + "learning_rate": 9.20437724820116e-05, + "loss": 1.8384, + "step": 1160 + }, + { + "epoch": 0.21086566622017391, + "grad_norm": 1.1847500801086426, + "learning_rate": 9.202776061176605e-05, + "loss": 2.013, + "step": 1161 + }, + { + "epoch": 0.2110472903943515, + "grad_norm": 1.405774474143982, + "learning_rate": 9.201173404143538e-05, + "loss": 1.7045, + "step": 1162 + }, + { + "epoch": 0.21122891456852907, + "grad_norm": 0.7657690644264221, + "learning_rate": 9.199569277662521e-05, + "loss": 1.6768, + "step": 1163 + }, + { + "epoch": 0.21141053874270666, + "grad_norm": 0.6287493109703064, + "learning_rate": 9.197963682294629e-05, + "loss": 2.0363, + "step": 1164 + }, + { + "epoch": 0.21159216291688424, + "grad_norm": 0.3797636330127716, + "learning_rate": 9.196356618601454e-05, + "loss": 1.8136, + "step": 1165 + }, + { + "epoch": 0.21177378709106182, + "grad_norm": 0.42388197779655457, + "learning_rate": 9.1947480871451e-05, + "loss": 1.6607, + "step": 1166 + }, + { + "epoch": 0.2119554112652394, + "grad_norm": 0.4848020076751709, + "learning_rate": 9.193138088488183e-05, + "loss": 1.8204, + "step": 1167 + }, + { + "epoch": 0.21213703543941698, + "grad_norm": 0.46670740842819214, + "learning_rate": 9.191526623193835e-05, + "loss": 1.843, + "step": 1168 + }, + { + "epoch": 0.21231865961359456, + "grad_norm": 0.7461701035499573, + "learning_rate": 9.1899136918257e-05, + "loss": 1.8213, + "step": 1169 + }, + { + "epoch": 0.21250028378777217, + "grad_norm": 0.46547478437423706, + "learning_rate": 9.188299294947932e-05, + "loss": 1.9027, + "step": 1170 + }, + { + "epoch": 0.21268190796194975, + "grad_norm": 0.42618006467819214, + "learning_rate": 9.186683433125203e-05, + "loss": 1.5502, + "step": 1171 + }, + { + "epoch": 0.21286353213612733, + "grad_norm": 0.3423370122909546, + "learning_rate": 9.185066106922693e-05, + "loss": 1.7343, + "step": 1172 + }, + { + "epoch": 0.2130451563103049, + "grad_norm": 1.3849879503250122, + "learning_rate": 9.183447316906093e-05, + "loss": 1.8506, + "step": 1173 + }, + { + "epoch": 0.2132267804844825, + "grad_norm": 0.5819071531295776, + "learning_rate": 9.181827063641613e-05, + "loss": 1.8143, + "step": 1174 + }, + { + "epoch": 0.21340840465866007, + "grad_norm": 0.4653153419494629, + "learning_rate": 9.180205347695968e-05, + "loss": 1.9617, + "step": 1175 + }, + { + "epoch": 0.21359002883283765, + "grad_norm": 0.5760759115219116, + "learning_rate": 9.178582169636385e-05, + "loss": 1.8091, + "step": 1176 + }, + { + "epoch": 0.21377165300701523, + "grad_norm": 0.4264085590839386, + "learning_rate": 9.176957530030609e-05, + "loss": 1.8206, + "step": 1177 + }, + { + "epoch": 0.2139532771811928, + "grad_norm": 0.38793668150901794, + "learning_rate": 9.175331429446887e-05, + "loss": 1.7846, + "step": 1178 + }, + { + "epoch": 0.2141349013553704, + "grad_norm": 0.34236007928848267, + "learning_rate": 9.173703868453986e-05, + "loss": 1.6795, + "step": 1179 + }, + { + "epoch": 0.214316525529548, + "grad_norm": 0.4392995238304138, + "learning_rate": 9.172074847621176e-05, + "loss": 1.8676, + "step": 1180 + }, + { + "epoch": 0.21449814970372558, + "grad_norm": 0.3785783648490906, + "learning_rate": 9.170444367518241e-05, + "loss": 1.7787, + "step": 1181 + }, + { + "epoch": 0.21467977387790316, + "grad_norm": 0.48478075861930847, + "learning_rate": 9.168812428715478e-05, + "loss": 1.7777, + "step": 1182 + }, + { + "epoch": 0.21486139805208074, + "grad_norm": 0.8389139175415039, + "learning_rate": 9.167179031783689e-05, + "loss": 1.7474, + "step": 1183 + }, + { + "epoch": 0.21504302222625832, + "grad_norm": 0.42488160729408264, + "learning_rate": 9.16554417729419e-05, + "loss": 1.7876, + "step": 1184 + }, + { + "epoch": 0.2152246464004359, + "grad_norm": 0.5318560600280762, + "learning_rate": 9.163907865818806e-05, + "loss": 1.7525, + "step": 1185 + }, + { + "epoch": 0.21540627057461348, + "grad_norm": 0.4764162003993988, + "learning_rate": 9.162270097929868e-05, + "loss": 1.8229, + "step": 1186 + }, + { + "epoch": 0.21558789474879106, + "grad_norm": 0.32771724462509155, + "learning_rate": 9.160630874200222e-05, + "loss": 1.7522, + "step": 1187 + }, + { + "epoch": 0.21576951892296864, + "grad_norm": 0.7233152985572815, + "learning_rate": 9.158990195203222e-05, + "loss": 1.85, + "step": 1188 + }, + { + "epoch": 0.21595114309714622, + "grad_norm": 0.41001489758491516, + "learning_rate": 9.157348061512727e-05, + "loss": 1.7499, + "step": 1189 + }, + { + "epoch": 0.2161327672713238, + "grad_norm": 0.37446266412734985, + "learning_rate": 9.155704473703109e-05, + "loss": 1.8523, + "step": 1190 + }, + { + "epoch": 0.2163143914455014, + "grad_norm": 0.44794270396232605, + "learning_rate": 9.154059432349245e-05, + "loss": 1.8538, + "step": 1191 + }, + { + "epoch": 0.216496015619679, + "grad_norm": 0.4953394830226898, + "learning_rate": 9.152412938026525e-05, + "loss": 1.8142, + "step": 1192 + }, + { + "epoch": 0.21667763979385657, + "grad_norm": 0.37274011969566345, + "learning_rate": 9.150764991310841e-05, + "loss": 1.5939, + "step": 1193 + }, + { + "epoch": 0.21685926396803415, + "grad_norm": 0.35870596766471863, + "learning_rate": 9.149115592778602e-05, + "loss": 1.9129, + "step": 1194 + }, + { + "epoch": 0.21704088814221173, + "grad_norm": 0.38615337014198303, + "learning_rate": 9.147464743006717e-05, + "loss": 1.6376, + "step": 1195 + }, + { + "epoch": 0.2172225123163893, + "grad_norm": 0.3782939314842224, + "learning_rate": 9.145812442572603e-05, + "loss": 1.9768, + "step": 1196 + }, + { + "epoch": 0.2174041364905669, + "grad_norm": 0.41374483704566956, + "learning_rate": 9.14415869205419e-05, + "loss": 1.7846, + "step": 1197 + }, + { + "epoch": 0.21758576066474447, + "grad_norm": 0.45065975189208984, + "learning_rate": 9.14250349202991e-05, + "loss": 1.8818, + "step": 1198 + }, + { + "epoch": 0.21776738483892205, + "grad_norm": 0.4766313433647156, + "learning_rate": 9.140846843078706e-05, + "loss": 1.7233, + "step": 1199 + }, + { + "epoch": 0.21794900901309963, + "grad_norm": 0.39583462476730347, + "learning_rate": 9.13918874578002e-05, + "loss": 1.6082, + "step": 1200 + }, + { + "epoch": 0.21813063318727724, + "grad_norm": 0.5321093797683716, + "learning_rate": 9.13752920071381e-05, + "loss": 1.9717, + "step": 1201 + }, + { + "epoch": 0.21831225736145482, + "grad_norm": 0.3590918183326721, + "learning_rate": 9.135868208460538e-05, + "loss": 1.7819, + "step": 1202 + }, + { + "epoch": 0.2184938815356324, + "grad_norm": 0.80759596824646, + "learning_rate": 9.134205769601167e-05, + "loss": 1.6651, + "step": 1203 + }, + { + "epoch": 0.21867550570980998, + "grad_norm": 0.7484925985336304, + "learning_rate": 9.132541884717172e-05, + "loss": 1.5954, + "step": 1204 + }, + { + "epoch": 0.21885712988398756, + "grad_norm": 0.4565178155899048, + "learning_rate": 9.13087655439053e-05, + "loss": 1.81, + "step": 1205 + }, + { + "epoch": 0.21903875405816514, + "grad_norm": 0.3506205976009369, + "learning_rate": 9.129209779203725e-05, + "loss": 2.0249, + "step": 1206 + }, + { + "epoch": 0.21922037823234272, + "grad_norm": 0.5716496109962463, + "learning_rate": 9.127541559739748e-05, + "loss": 1.9159, + "step": 1207 + }, + { + "epoch": 0.2194020024065203, + "grad_norm": 1.381463646888733, + "learning_rate": 9.125871896582092e-05, + "loss": 1.8532, + "step": 1208 + }, + { + "epoch": 0.21958362658069788, + "grad_norm": 0.5210908055305481, + "learning_rate": 9.124200790314758e-05, + "loss": 1.6809, + "step": 1209 + }, + { + "epoch": 0.21976525075487546, + "grad_norm": 0.48417213559150696, + "learning_rate": 9.122528241522248e-05, + "loss": 1.8111, + "step": 1210 + }, + { + "epoch": 0.21994687492905304, + "grad_norm": 0.5590111017227173, + "learning_rate": 9.120854250789573e-05, + "loss": 1.6587, + "step": 1211 + }, + { + "epoch": 0.22012849910323065, + "grad_norm": 0.4451367259025574, + "learning_rate": 9.119178818702246e-05, + "loss": 1.7058, + "step": 1212 + }, + { + "epoch": 0.22031012327740823, + "grad_norm": 0.4449574053287506, + "learning_rate": 9.11750194584628e-05, + "loss": 1.9764, + "step": 1213 + }, + { + "epoch": 0.2204917474515858, + "grad_norm": 1.0577818155288696, + "learning_rate": 9.115823632808202e-05, + "loss": 1.9971, + "step": 1214 + }, + { + "epoch": 0.2206733716257634, + "grad_norm": 0.4718782603740692, + "learning_rate": 9.114143880175036e-05, + "loss": 1.7425, + "step": 1215 + }, + { + "epoch": 0.22085499579994097, + "grad_norm": 0.5399010181427002, + "learning_rate": 9.112462688534308e-05, + "loss": 1.7552, + "step": 1216 + }, + { + "epoch": 0.22103661997411855, + "grad_norm": 0.5991796851158142, + "learning_rate": 9.110780058474052e-05, + "loss": 1.6524, + "step": 1217 + }, + { + "epoch": 0.22121824414829613, + "grad_norm": 0.4937182366847992, + "learning_rate": 9.109095990582798e-05, + "loss": 1.6643, + "step": 1218 + }, + { + "epoch": 0.22139986832247371, + "grad_norm": 0.44616031646728516, + "learning_rate": 9.107410485449592e-05, + "loss": 1.9143, + "step": 1219 + }, + { + "epoch": 0.2215814924966513, + "grad_norm": 0.4759426414966583, + "learning_rate": 9.105723543663969e-05, + "loss": 1.8986, + "step": 1220 + }, + { + "epoch": 0.22176311667082887, + "grad_norm": 0.315208375453949, + "learning_rate": 9.104035165815971e-05, + "loss": 1.7163, + "step": 1221 + }, + { + "epoch": 0.22194474084500648, + "grad_norm": 1.810794711112976, + "learning_rate": 9.10234535249615e-05, + "loss": 1.815, + "step": 1222 + }, + { + "epoch": 0.22212636501918406, + "grad_norm": 0.401417076587677, + "learning_rate": 9.100654104295546e-05, + "loss": 1.781, + "step": 1223 + }, + { + "epoch": 0.22230798919336164, + "grad_norm": 0.44618648290634155, + "learning_rate": 9.098961421805712e-05, + "loss": 1.7954, + "step": 1224 + }, + { + "epoch": 0.22248961336753922, + "grad_norm": 0.4736192226409912, + "learning_rate": 9.097267305618699e-05, + "loss": 1.757, + "step": 1225 + }, + { + "epoch": 0.2226712375417168, + "grad_norm": 0.6690506339073181, + "learning_rate": 9.095571756327059e-05, + "loss": 1.855, + "step": 1226 + }, + { + "epoch": 0.22285286171589438, + "grad_norm": 1.3945692777633667, + "learning_rate": 9.093874774523844e-05, + "loss": 2.0126, + "step": 1227 + }, + { + "epoch": 0.22303448589007197, + "grad_norm": 0.3743654787540436, + "learning_rate": 9.092176360802611e-05, + "loss": 1.9431, + "step": 1228 + }, + { + "epoch": 0.22321611006424955, + "grad_norm": 0.9722837805747986, + "learning_rate": 9.090476515757415e-05, + "loss": 1.8633, + "step": 1229 + }, + { + "epoch": 0.22339773423842713, + "grad_norm": 0.32934048771858215, + "learning_rate": 9.088775239982812e-05, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 0.2235793584126047, + "grad_norm": 0.42320239543914795, + "learning_rate": 9.087072534073859e-05, + "loss": 1.6718, + "step": 1231 + }, + { + "epoch": 0.22376098258678231, + "grad_norm": 0.3820490539073944, + "learning_rate": 9.085368398626112e-05, + "loss": 1.7504, + "step": 1232 + }, + { + "epoch": 0.2239426067609599, + "grad_norm": 0.3363991677761078, + "learning_rate": 9.08366283423563e-05, + "loss": 1.9616, + "step": 1233 + }, + { + "epoch": 0.22412423093513748, + "grad_norm": 0.45497575402259827, + "learning_rate": 9.081955841498966e-05, + "loss": 1.9675, + "step": 1234 + }, + { + "epoch": 0.22430585510931506, + "grad_norm": 0.356789231300354, + "learning_rate": 9.08024742101318e-05, + "loss": 1.8014, + "step": 1235 + }, + { + "epoch": 0.22448747928349264, + "grad_norm": 0.7202264666557312, + "learning_rate": 9.078537573375827e-05, + "loss": 1.7419, + "step": 1236 + }, + { + "epoch": 0.22466910345767022, + "grad_norm": 0.4640064239501953, + "learning_rate": 9.07682629918496e-05, + "loss": 1.6586, + "step": 1237 + }, + { + "epoch": 0.2248507276318478, + "grad_norm": 0.5959523916244507, + "learning_rate": 9.075113599039134e-05, + "loss": 1.7877, + "step": 1238 + }, + { + "epoch": 0.22503235180602538, + "grad_norm": 0.4715270400047302, + "learning_rate": 9.0733994735374e-05, + "loss": 1.7516, + "step": 1239 + }, + { + "epoch": 0.22521397598020296, + "grad_norm": 0.37684300541877747, + "learning_rate": 9.07168392327931e-05, + "loss": 1.5452, + "step": 1240 + }, + { + "epoch": 0.22539560015438054, + "grad_norm": 1.4524266719818115, + "learning_rate": 9.069966948864916e-05, + "loss": 1.8292, + "step": 1241 + }, + { + "epoch": 0.22557722432855812, + "grad_norm": 0.3977339565753937, + "learning_rate": 9.068248550894763e-05, + "loss": 1.7122, + "step": 1242 + }, + { + "epoch": 0.22575884850273573, + "grad_norm": 0.44066688418388367, + "learning_rate": 9.066528729969898e-05, + "loss": 1.6559, + "step": 1243 + }, + { + "epoch": 0.2259404726769133, + "grad_norm": 0.4816541373729706, + "learning_rate": 9.064807486691862e-05, + "loss": 1.8302, + "step": 1244 + }, + { + "epoch": 0.2261220968510909, + "grad_norm": 0.4477475583553314, + "learning_rate": 9.063084821662697e-05, + "loss": 1.8295, + "step": 1245 + }, + { + "epoch": 0.22630372102526847, + "grad_norm": 0.3535521328449249, + "learning_rate": 9.06136073548494e-05, + "loss": 1.8348, + "step": 1246 + }, + { + "epoch": 0.22648534519944605, + "grad_norm": 0.3537195920944214, + "learning_rate": 9.05963522876163e-05, + "loss": 1.796, + "step": 1247 + }, + { + "epoch": 0.22666696937362363, + "grad_norm": 0.3810422420501709, + "learning_rate": 9.057908302096294e-05, + "loss": 1.7939, + "step": 1248 + }, + { + "epoch": 0.2268485935478012, + "grad_norm": 0.3628976047039032, + "learning_rate": 9.056179956092962e-05, + "loss": 2.0375, + "step": 1249 + }, + { + "epoch": 0.2270302177219788, + "grad_norm": 0.5311593413352966, + "learning_rate": 9.054450191356161e-05, + "loss": 1.7495, + "step": 1250 + }, + { + "epoch": 0.22721184189615637, + "grad_norm": 0.47682297229766846, + "learning_rate": 9.052719008490909e-05, + "loss": 1.8839, + "step": 1251 + }, + { + "epoch": 0.22739346607033395, + "grad_norm": 0.382924348115921, + "learning_rate": 9.050986408102727e-05, + "loss": 1.6772, + "step": 1252 + }, + { + "epoch": 0.22757509024451156, + "grad_norm": 0.5718294382095337, + "learning_rate": 9.049252390797625e-05, + "loss": 1.8658, + "step": 1253 + }, + { + "epoch": 0.22775671441868914, + "grad_norm": 0.4122014045715332, + "learning_rate": 9.047516957182113e-05, + "loss": 1.9878, + "step": 1254 + }, + { + "epoch": 0.22793833859286672, + "grad_norm": 0.40650036931037903, + "learning_rate": 9.045780107863195e-05, + "loss": 1.7277, + "step": 1255 + }, + { + "epoch": 0.2281199627670443, + "grad_norm": 0.5534829497337341, + "learning_rate": 9.044041843448371e-05, + "loss": 2.006, + "step": 1256 + }, + { + "epoch": 0.22830158694122188, + "grad_norm": 0.49496176838874817, + "learning_rate": 9.042302164545633e-05, + "loss": 1.9565, + "step": 1257 + }, + { + "epoch": 0.22848321111539946, + "grad_norm": 0.4448326528072357, + "learning_rate": 9.040561071763472e-05, + "loss": 1.8059, + "step": 1258 + }, + { + "epoch": 0.22866483528957704, + "grad_norm": 0.5283184051513672, + "learning_rate": 9.038818565710872e-05, + "loss": 1.9942, + "step": 1259 + }, + { + "epoch": 0.22884645946375462, + "grad_norm": 0.3746737539768219, + "learning_rate": 9.037074646997309e-05, + "loss": 1.6198, + "step": 1260 + }, + { + "epoch": 0.2290280836379322, + "grad_norm": 0.5174587965011597, + "learning_rate": 9.035329316232755e-05, + "loss": 1.6584, + "step": 1261 + }, + { + "epoch": 0.22920970781210978, + "grad_norm": 0.37457743287086487, + "learning_rate": 9.033582574027677e-05, + "loss": 1.8997, + "step": 1262 + }, + { + "epoch": 0.22939133198628736, + "grad_norm": 0.3836328387260437, + "learning_rate": 9.031834420993033e-05, + "loss": 1.8269, + "step": 1263 + }, + { + "epoch": 0.22957295616046497, + "grad_norm": 0.3823738992214203, + "learning_rate": 9.030084857740278e-05, + "loss": 1.8184, + "step": 1264 + }, + { + "epoch": 0.22975458033464255, + "grad_norm": 0.35497957468032837, + "learning_rate": 9.028333884881357e-05, + "loss": 1.8525, + "step": 1265 + }, + { + "epoch": 0.22993620450882013, + "grad_norm": 0.3836558759212494, + "learning_rate": 9.026581503028708e-05, + "loss": 1.7373, + "step": 1266 + }, + { + "epoch": 0.2301178286829977, + "grad_norm": 1.0468482971191406, + "learning_rate": 9.024827712795265e-05, + "loss": 1.7489, + "step": 1267 + }, + { + "epoch": 0.2302994528571753, + "grad_norm": 0.556947648525238, + "learning_rate": 9.023072514794453e-05, + "loss": 1.7519, + "step": 1268 + }, + { + "epoch": 0.23048107703135287, + "grad_norm": 0.5769675374031067, + "learning_rate": 9.021315909640186e-05, + "loss": 1.8049, + "step": 1269 + }, + { + "epoch": 0.23066270120553045, + "grad_norm": 0.873871386051178, + "learning_rate": 9.019557897946878e-05, + "loss": 1.6399, + "step": 1270 + }, + { + "epoch": 0.23084432537970803, + "grad_norm": 0.37445878982543945, + "learning_rate": 9.017798480329427e-05, + "loss": 1.6783, + "step": 1271 + }, + { + "epoch": 0.2310259495538856, + "grad_norm": 0.42965948581695557, + "learning_rate": 9.016037657403224e-05, + "loss": 1.6429, + "step": 1272 + }, + { + "epoch": 0.2312075737280632, + "grad_norm": 1.1930856704711914, + "learning_rate": 9.014275429784159e-05, + "loss": 1.657, + "step": 1273 + }, + { + "epoch": 0.2313891979022408, + "grad_norm": 1.2174304723739624, + "learning_rate": 9.012511798088603e-05, + "loss": 1.9955, + "step": 1274 + }, + { + "epoch": 0.23157082207641838, + "grad_norm": 0.7174996137619019, + "learning_rate": 9.010746762933426e-05, + "loss": 1.9778, + "step": 1275 + }, + { + "epoch": 0.23175244625059596, + "grad_norm": 0.4167729616165161, + "learning_rate": 9.008980324935985e-05, + "loss": 1.8288, + "step": 1276 + }, + { + "epoch": 0.23193407042477354, + "grad_norm": 0.3597445785999298, + "learning_rate": 9.007212484714128e-05, + "loss": 1.655, + "step": 1277 + }, + { + "epoch": 0.23211569459895112, + "grad_norm": 0.5776665806770325, + "learning_rate": 9.005443242886194e-05, + "loss": 1.6891, + "step": 1278 + }, + { + "epoch": 0.2322973187731287, + "grad_norm": 0.35558223724365234, + "learning_rate": 9.003672600071013e-05, + "loss": 1.7698, + "step": 1279 + }, + { + "epoch": 0.23247894294730628, + "grad_norm": 0.38485708832740784, + "learning_rate": 9.001900556887902e-05, + "loss": 1.8349, + "step": 1280 + }, + { + "epoch": 0.23266056712148386, + "grad_norm": 0.36458003520965576, + "learning_rate": 9.000127113956674e-05, + "loss": 1.8361, + "step": 1281 + }, + { + "epoch": 0.23284219129566144, + "grad_norm": 0.38464903831481934, + "learning_rate": 8.998352271897623e-05, + "loss": 1.8032, + "step": 1282 + }, + { + "epoch": 0.23302381546983902, + "grad_norm": 0.38519811630249023, + "learning_rate": 8.99657603133154e-05, + "loss": 1.6358, + "step": 1283 + }, + { + "epoch": 0.23320543964401663, + "grad_norm": 0.4791126847267151, + "learning_rate": 8.994798392879701e-05, + "loss": 1.9693, + "step": 1284 + }, + { + "epoch": 0.2333870638181942, + "grad_norm": 0.46464812755584717, + "learning_rate": 8.993019357163873e-05, + "loss": 1.8347, + "step": 1285 + }, + { + "epoch": 0.2335686879923718, + "grad_norm": 0.46753305196762085, + "learning_rate": 8.99123892480631e-05, + "loss": 1.8466, + "step": 1286 + }, + { + "epoch": 0.23375031216654937, + "grad_norm": 0.3163427710533142, + "learning_rate": 8.989457096429756e-05, + "loss": 1.6808, + "step": 1287 + }, + { + "epoch": 0.23393193634072695, + "grad_norm": 0.584472119808197, + "learning_rate": 8.987673872657442e-05, + "loss": 1.7649, + "step": 1288 + }, + { + "epoch": 0.23411356051490453, + "grad_norm": 0.70416659116745, + "learning_rate": 8.985889254113088e-05, + "loss": 1.6798, + "step": 1289 + }, + { + "epoch": 0.23429518468908211, + "grad_norm": 0.39810335636138916, + "learning_rate": 8.984103241420902e-05, + "loss": 1.7275, + "step": 1290 + }, + { + "epoch": 0.2344768088632597, + "grad_norm": 0.3868221640586853, + "learning_rate": 8.982315835205578e-05, + "loss": 1.8893, + "step": 1291 + }, + { + "epoch": 0.23465843303743728, + "grad_norm": 0.47449931502342224, + "learning_rate": 8.980527036092299e-05, + "loss": 1.8842, + "step": 1292 + }, + { + "epoch": 0.23484005721161486, + "grad_norm": 0.4042537808418274, + "learning_rate": 8.978736844706736e-05, + "loss": 1.8095, + "step": 1293 + }, + { + "epoch": 0.23502168138579244, + "grad_norm": 0.36834603548049927, + "learning_rate": 8.976945261675042e-05, + "loss": 1.8752, + "step": 1294 + }, + { + "epoch": 0.23520330555997004, + "grad_norm": 0.4575563073158264, + "learning_rate": 8.975152287623867e-05, + "loss": 1.8539, + "step": 1295 + }, + { + "epoch": 0.23538492973414762, + "grad_norm": 0.4804386496543884, + "learning_rate": 8.973357923180334e-05, + "loss": 1.8371, + "step": 1296 + }, + { + "epoch": 0.2355665539083252, + "grad_norm": 0.38728946447372437, + "learning_rate": 8.971562168972064e-05, + "loss": 1.8838, + "step": 1297 + }, + { + "epoch": 0.23574817808250279, + "grad_norm": 0.36056190729141235, + "learning_rate": 8.969765025627158e-05, + "loss": 1.5917, + "step": 1298 + }, + { + "epoch": 0.23592980225668037, + "grad_norm": 0.3245345652103424, + "learning_rate": 8.967966493774205e-05, + "loss": 1.6633, + "step": 1299 + }, + { + "epoch": 0.23611142643085795, + "grad_norm": 0.3496229946613312, + "learning_rate": 8.966166574042277e-05, + "loss": 1.8946, + "step": 1300 + }, + { + "epoch": 0.23629305060503553, + "grad_norm": 0.7146358489990234, + "learning_rate": 8.964365267060935e-05, + "loss": 1.9097, + "step": 1301 + }, + { + "epoch": 0.2364746747792131, + "grad_norm": 1.2955214977264404, + "learning_rate": 8.962562573460225e-05, + "loss": 2.0666, + "step": 1302 + }, + { + "epoch": 0.2366562989533907, + "grad_norm": 2.074798345565796, + "learning_rate": 8.960758493870674e-05, + "loss": 2.0224, + "step": 1303 + }, + { + "epoch": 0.23683792312756827, + "grad_norm": 0.5233278274536133, + "learning_rate": 8.958953028923297e-05, + "loss": 1.8417, + "step": 1304 + }, + { + "epoch": 0.23701954730174588, + "grad_norm": 0.49418407678604126, + "learning_rate": 8.957146179249595e-05, + "loss": 1.8161, + "step": 1305 + }, + { + "epoch": 0.23720117147592346, + "grad_norm": 0.4487641453742981, + "learning_rate": 8.95533794548155e-05, + "loss": 1.9578, + "step": 1306 + }, + { + "epoch": 0.23738279565010104, + "grad_norm": 1.1594767570495605, + "learning_rate": 8.953528328251628e-05, + "loss": 1.6662, + "step": 1307 + }, + { + "epoch": 0.23756441982427862, + "grad_norm": 0.42648500204086304, + "learning_rate": 8.951717328192782e-05, + "loss": 1.7569, + "step": 1308 + }, + { + "epoch": 0.2377460439984562, + "grad_norm": 0.4575806260108948, + "learning_rate": 8.949904945938448e-05, + "loss": 1.8113, + "step": 1309 + }, + { + "epoch": 0.23792766817263378, + "grad_norm": 1.1227073669433594, + "learning_rate": 8.948091182122541e-05, + "loss": 1.9913, + "step": 1310 + }, + { + "epoch": 0.23810929234681136, + "grad_norm": 0.3796898424625397, + "learning_rate": 8.946276037379467e-05, + "loss": 2.0226, + "step": 1311 + }, + { + "epoch": 0.23829091652098894, + "grad_norm": 0.38645967841148376, + "learning_rate": 8.944459512344105e-05, + "loss": 1.6957, + "step": 1312 + }, + { + "epoch": 0.23847254069516652, + "grad_norm": 0.5727745890617371, + "learning_rate": 8.94264160765183e-05, + "loss": 1.8749, + "step": 1313 + }, + { + "epoch": 0.2386541648693441, + "grad_norm": 0.3445509374141693, + "learning_rate": 8.940822323938487e-05, + "loss": 1.896, + "step": 1314 + }, + { + "epoch": 0.23883578904352168, + "grad_norm": 0.4235701858997345, + "learning_rate": 8.939001661840408e-05, + "loss": 1.8093, + "step": 1315 + }, + { + "epoch": 0.2390174132176993, + "grad_norm": 0.3422987759113312, + "learning_rate": 8.93717962199441e-05, + "loss": 1.9405, + "step": 1316 + }, + { + "epoch": 0.23919903739187687, + "grad_norm": 0.36512377858161926, + "learning_rate": 8.935356205037789e-05, + "loss": 1.7158, + "step": 1317 + }, + { + "epoch": 0.23938066156605445, + "grad_norm": 0.467244029045105, + "learning_rate": 8.933531411608321e-05, + "loss": 1.8037, + "step": 1318 + }, + { + "epoch": 0.23956228574023203, + "grad_norm": 0.8343355059623718, + "learning_rate": 8.931705242344269e-05, + "loss": 1.7325, + "step": 1319 + }, + { + "epoch": 0.2397439099144096, + "grad_norm": 0.3280513286590576, + "learning_rate": 8.92987769788437e-05, + "loss": 1.8348, + "step": 1320 + }, + { + "epoch": 0.2399255340885872, + "grad_norm": 0.7833452224731445, + "learning_rate": 8.928048778867848e-05, + "loss": 1.6836, + "step": 1321 + }, + { + "epoch": 0.24010715826276477, + "grad_norm": 0.6535219550132751, + "learning_rate": 8.926218485934405e-05, + "loss": 2.0573, + "step": 1322 + }, + { + "epoch": 0.24028878243694235, + "grad_norm": 0.630365252494812, + "learning_rate": 8.924386819724225e-05, + "loss": 1.8903, + "step": 1323 + }, + { + "epoch": 0.24047040661111993, + "grad_norm": 0.5708134770393372, + "learning_rate": 8.922553780877969e-05, + "loss": 1.8288, + "step": 1324 + }, + { + "epoch": 0.2406520307852975, + "grad_norm": 0.5112709403038025, + "learning_rate": 8.920719370036783e-05, + "loss": 2.0027, + "step": 1325 + }, + { + "epoch": 0.24083365495947512, + "grad_norm": 0.37752243876457214, + "learning_rate": 8.91888358784229e-05, + "loss": 1.995, + "step": 1326 + }, + { + "epoch": 0.2410152791336527, + "grad_norm": 2.311835289001465, + "learning_rate": 8.917046434936591e-05, + "loss": 1.7968, + "step": 1327 + }, + { + "epoch": 0.24119690330783028, + "grad_norm": 0.44001007080078125, + "learning_rate": 8.915207911962271e-05, + "loss": 1.6921, + "step": 1328 + }, + { + "epoch": 0.24137852748200786, + "grad_norm": 1.1666814088821411, + "learning_rate": 8.913368019562391e-05, + "loss": 1.8676, + "step": 1329 + }, + { + "epoch": 0.24156015165618544, + "grad_norm": 0.9295823574066162, + "learning_rate": 8.911526758380493e-05, + "loss": 1.8226, + "step": 1330 + }, + { + "epoch": 0.24174177583036302, + "grad_norm": 1.7259984016418457, + "learning_rate": 8.909684129060593e-05, + "loss": 1.9469, + "step": 1331 + }, + { + "epoch": 0.2419234000045406, + "grad_norm": 0.3543682396411896, + "learning_rate": 8.907840132247192e-05, + "loss": 1.8287, + "step": 1332 + }, + { + "epoch": 0.24210502417871818, + "grad_norm": 0.3775070011615753, + "learning_rate": 8.905994768585266e-05, + "loss": 1.7684, + "step": 1333 + }, + { + "epoch": 0.24228664835289576, + "grad_norm": 0.5182675123214722, + "learning_rate": 8.904148038720268e-05, + "loss": 1.8592, + "step": 1334 + }, + { + "epoch": 0.24246827252707334, + "grad_norm": 0.6678969860076904, + "learning_rate": 8.902299943298131e-05, + "loss": 1.764, + "step": 1335 + }, + { + "epoch": 0.24264989670125095, + "grad_norm": 0.946260929107666, + "learning_rate": 8.900450482965264e-05, + "loss": 1.7558, + "step": 1336 + }, + { + "epoch": 0.24283152087542853, + "grad_norm": 0.46883752942085266, + "learning_rate": 8.898599658368556e-05, + "loss": 1.6703, + "step": 1337 + }, + { + "epoch": 0.2430131450496061, + "grad_norm": 0.38558968901634216, + "learning_rate": 8.89674747015537e-05, + "loss": 1.8392, + "step": 1338 + }, + { + "epoch": 0.2431947692237837, + "grad_norm": 0.5213986039161682, + "learning_rate": 8.894893918973551e-05, + "loss": 1.8051, + "step": 1339 + }, + { + "epoch": 0.24337639339796127, + "grad_norm": 0.4627891480922699, + "learning_rate": 8.89303900547141e-05, + "loss": 1.6084, + "step": 1340 + }, + { + "epoch": 0.24355801757213885, + "grad_norm": 0.5082706212997437, + "learning_rate": 8.89118273029775e-05, + "loss": 1.6441, + "step": 1341 + }, + { + "epoch": 0.24373964174631643, + "grad_norm": 0.4255543649196625, + "learning_rate": 8.889325094101835e-05, + "loss": 1.7555, + "step": 1342 + }, + { + "epoch": 0.243921265920494, + "grad_norm": 0.5370838046073914, + "learning_rate": 8.887466097533416e-05, + "loss": 1.6671, + "step": 1343 + }, + { + "epoch": 0.2441028900946716, + "grad_norm": 0.8110207319259644, + "learning_rate": 8.885605741242714e-05, + "loss": 1.9354, + "step": 1344 + }, + { + "epoch": 0.24428451426884917, + "grad_norm": 0.8565096855163574, + "learning_rate": 8.883744025880428e-05, + "loss": 1.7112, + "step": 1345 + }, + { + "epoch": 0.24446613844302675, + "grad_norm": 0.36345285177230835, + "learning_rate": 8.881880952097731e-05, + "loss": 1.7775, + "step": 1346 + }, + { + "epoch": 0.24464776261720436, + "grad_norm": 0.45891082286834717, + "learning_rate": 8.880016520546274e-05, + "loss": 1.8203, + "step": 1347 + }, + { + "epoch": 0.24482938679138194, + "grad_norm": 0.5802310705184937, + "learning_rate": 8.87815073187818e-05, + "loss": 1.9002, + "step": 1348 + }, + { + "epoch": 0.24501101096555952, + "grad_norm": 0.4701487720012665, + "learning_rate": 8.876283586746045e-05, + "loss": 1.6003, + "step": 1349 + }, + { + "epoch": 0.2451926351397371, + "grad_norm": 0.5230256915092468, + "learning_rate": 8.874415085802945e-05, + "loss": 1.6929, + "step": 1350 + }, + { + "epoch": 0.24537425931391468, + "grad_norm": 0.45331263542175293, + "learning_rate": 8.872545229702426e-05, + "loss": 1.8484, + "step": 1351 + }, + { + "epoch": 0.24555588348809226, + "grad_norm": 0.42038583755493164, + "learning_rate": 8.870674019098508e-05, + "loss": 1.8111, + "step": 1352 + }, + { + "epoch": 0.24573750766226984, + "grad_norm": 0.4338943660259247, + "learning_rate": 8.868801454645688e-05, + "loss": 1.9673, + "step": 1353 + }, + { + "epoch": 0.24591913183644742, + "grad_norm": 0.3554174304008484, + "learning_rate": 8.866927536998933e-05, + "loss": 1.8662, + "step": 1354 + }, + { + "epoch": 0.246100756010625, + "grad_norm": 0.43689262866973877, + "learning_rate": 8.865052266813685e-05, + "loss": 1.829, + "step": 1355 + }, + { + "epoch": 0.24628238018480259, + "grad_norm": 1.138323187828064, + "learning_rate": 8.863175644745859e-05, + "loss": 1.8556, + "step": 1356 + }, + { + "epoch": 0.2464640043589802, + "grad_norm": 0.41493895649909973, + "learning_rate": 8.861297671451844e-05, + "loss": 1.9054, + "step": 1357 + }, + { + "epoch": 0.24664562853315777, + "grad_norm": 0.4579445719718933, + "learning_rate": 8.859418347588497e-05, + "loss": 1.8102, + "step": 1358 + }, + { + "epoch": 0.24682725270733535, + "grad_norm": 0.34973251819610596, + "learning_rate": 8.857537673813153e-05, + "loss": 1.598, + "step": 1359 + }, + { + "epoch": 0.24700887688151293, + "grad_norm": 0.38782113790512085, + "learning_rate": 8.855655650783618e-05, + "loss": 2.0214, + "step": 1360 + }, + { + "epoch": 0.24719050105569051, + "grad_norm": 0.47396978735923767, + "learning_rate": 8.853772279158166e-05, + "loss": 1.6317, + "step": 1361 + }, + { + "epoch": 0.2473721252298681, + "grad_norm": 0.3889757990837097, + "learning_rate": 8.851887559595546e-05, + "loss": 1.8092, + "step": 1362 + }, + { + "epoch": 0.24755374940404568, + "grad_norm": 0.6568871140480042, + "learning_rate": 8.85000149275498e-05, + "loss": 1.7613, + "step": 1363 + }, + { + "epoch": 0.24773537357822326, + "grad_norm": 0.3259899318218231, + "learning_rate": 8.84811407929616e-05, + "loss": 1.641, + "step": 1364 + }, + { + "epoch": 0.24791699775240084, + "grad_norm": 0.9534372687339783, + "learning_rate": 8.846225319879243e-05, + "loss": 1.5891, + "step": 1365 + }, + { + "epoch": 0.24809862192657842, + "grad_norm": 0.41037997603416443, + "learning_rate": 8.844335215164866e-05, + "loss": 1.7379, + "step": 1366 + }, + { + "epoch": 0.248280246100756, + "grad_norm": 0.3401070237159729, + "learning_rate": 8.84244376581413e-05, + "loss": 1.865, + "step": 1367 + }, + { + "epoch": 0.2484618702749336, + "grad_norm": 0.3997363746166229, + "learning_rate": 8.840550972488612e-05, + "loss": 1.9505, + "step": 1368 + }, + { + "epoch": 0.24864349444911119, + "grad_norm": 0.5347998142242432, + "learning_rate": 8.838656835850353e-05, + "loss": 1.8737, + "step": 1369 + }, + { + "epoch": 0.24882511862328877, + "grad_norm": 0.866124153137207, + "learning_rate": 8.836761356561868e-05, + "loss": 1.8136, + "step": 1370 + }, + { + "epoch": 0.24900674279746635, + "grad_norm": 0.4944067895412445, + "learning_rate": 8.834864535286143e-05, + "loss": 1.8852, + "step": 1371 + }, + { + "epoch": 0.24918836697164393, + "grad_norm": 0.49751466512680054, + "learning_rate": 8.832966372686626e-05, + "loss": 1.9184, + "step": 1372 + }, + { + "epoch": 0.2493699911458215, + "grad_norm": 0.39150771498680115, + "learning_rate": 8.831066869427243e-05, + "loss": 1.7509, + "step": 1373 + }, + { + "epoch": 0.2495516153199991, + "grad_norm": 0.5467283129692078, + "learning_rate": 8.829166026172382e-05, + "loss": 1.8648, + "step": 1374 + }, + { + "epoch": 0.24973323949417667, + "grad_norm": 0.6361364722251892, + "learning_rate": 8.827263843586904e-05, + "loss": 2.0749, + "step": 1375 + }, + { + "epoch": 0.24991486366835425, + "grad_norm": 0.39558953046798706, + "learning_rate": 8.825360322336134e-05, + "loss": 1.7451, + "step": 1376 + }, + { + "epoch": 0.25009648784253186, + "grad_norm": 0.426521360874176, + "learning_rate": 8.823455463085873e-05, + "loss": 1.8085, + "step": 1377 + }, + { + "epoch": 0.2502781120167094, + "grad_norm": 0.45536383986473083, + "learning_rate": 8.821549266502383e-05, + "loss": 1.6781, + "step": 1378 + }, + { + "epoch": 0.250459736190887, + "grad_norm": 0.3901159465312958, + "learning_rate": 8.819641733252396e-05, + "loss": 1.7684, + "step": 1379 + }, + { + "epoch": 0.25064136036506457, + "grad_norm": 0.4756334722042084, + "learning_rate": 8.817732864003112e-05, + "loss": 1.9061, + "step": 1380 + }, + { + "epoch": 0.2508229845392422, + "grad_norm": 0.48614779114723206, + "learning_rate": 8.815822659422195e-05, + "loss": 1.6694, + "step": 1381 + }, + { + "epoch": 0.25100460871341973, + "grad_norm": 0.35622861981391907, + "learning_rate": 8.813911120177783e-05, + "loss": 1.5813, + "step": 1382 + }, + { + "epoch": 0.25118623288759734, + "grad_norm": 0.8388331532478333, + "learning_rate": 8.811998246938474e-05, + "loss": 1.8111, + "step": 1383 + }, + { + "epoch": 0.25136785706177495, + "grad_norm": 0.4043489098548889, + "learning_rate": 8.810084040373337e-05, + "loss": 1.6161, + "step": 1384 + }, + { + "epoch": 0.2515494812359525, + "grad_norm": 0.434173583984375, + "learning_rate": 8.808168501151904e-05, + "loss": 1.6906, + "step": 1385 + }, + { + "epoch": 0.2517311054101301, + "grad_norm": 0.3593811094760895, + "learning_rate": 8.806251629944176e-05, + "loss": 1.8377, + "step": 1386 + }, + { + "epoch": 0.25191272958430766, + "grad_norm": 1.8792071342468262, + "learning_rate": 8.804333427420617e-05, + "loss": 1.9506, + "step": 1387 + }, + { + "epoch": 0.25209435375848527, + "grad_norm": 0.49741142988204956, + "learning_rate": 8.80241389425216e-05, + "loss": 1.7815, + "step": 1388 + }, + { + "epoch": 0.2522759779326628, + "grad_norm": 1.1014071702957153, + "learning_rate": 8.800493031110202e-05, + "loss": 1.8673, + "step": 1389 + }, + { + "epoch": 0.25245760210684043, + "grad_norm": 0.3132319450378418, + "learning_rate": 8.798570838666602e-05, + "loss": 1.6818, + "step": 1390 + }, + { + "epoch": 0.252639226281018, + "grad_norm": 0.3532993197441101, + "learning_rate": 8.796647317593691e-05, + "loss": 1.6502, + "step": 1391 + }, + { + "epoch": 0.2528208504551956, + "grad_norm": 1.5560842752456665, + "learning_rate": 8.794722468564259e-05, + "loss": 1.8057, + "step": 1392 + }, + { + "epoch": 0.25300247462937314, + "grad_norm": 0.37145233154296875, + "learning_rate": 8.79279629225156e-05, + "loss": 1.6863, + "step": 1393 + }, + { + "epoch": 0.25318409880355075, + "grad_norm": 0.36171379685401917, + "learning_rate": 8.790868789329316e-05, + "loss": 2.0036, + "step": 1394 + }, + { + "epoch": 0.25336572297772836, + "grad_norm": 0.35242295265197754, + "learning_rate": 8.788939960471711e-05, + "loss": 1.912, + "step": 1395 + }, + { + "epoch": 0.2535473471519059, + "grad_norm": 0.3315792679786682, + "learning_rate": 8.787009806353395e-05, + "loss": 1.8098, + "step": 1396 + }, + { + "epoch": 0.2537289713260835, + "grad_norm": 0.5059865117073059, + "learning_rate": 8.785078327649477e-05, + "loss": 1.8668, + "step": 1397 + }, + { + "epoch": 0.25391059550026107, + "grad_norm": 0.3742939531803131, + "learning_rate": 8.783145525035532e-05, + "loss": 1.9148, + "step": 1398 + }, + { + "epoch": 0.2540922196744387, + "grad_norm": 0.8489415645599365, + "learning_rate": 8.781211399187602e-05, + "loss": 2.0839, + "step": 1399 + }, + { + "epoch": 0.25427384384861623, + "grad_norm": 0.36933979392051697, + "learning_rate": 8.77927595078218e-05, + "loss": 1.5535, + "step": 1400 + }, + { + "epoch": 0.25445546802279384, + "grad_norm": 0.43533822894096375, + "learning_rate": 8.777339180496238e-05, + "loss": 1.787, + "step": 1401 + }, + { + "epoch": 0.2546370921969714, + "grad_norm": 0.39339393377304077, + "learning_rate": 8.775401089007195e-05, + "loss": 1.8177, + "step": 1402 + }, + { + "epoch": 0.254818716371149, + "grad_norm": 0.6424275040626526, + "learning_rate": 8.773461676992945e-05, + "loss": 1.8205, + "step": 1403 + }, + { + "epoch": 0.2550003405453266, + "grad_norm": 0.35281631350517273, + "learning_rate": 8.771520945131832e-05, + "loss": 1.5955, + "step": 1404 + }, + { + "epoch": 0.25518196471950416, + "grad_norm": 0.9081873297691345, + "learning_rate": 8.76957889410267e-05, + "loss": 1.8953, + "step": 1405 + }, + { + "epoch": 0.25536358889368177, + "grad_norm": 0.513404369354248, + "learning_rate": 8.767635524584733e-05, + "loss": 1.8109, + "step": 1406 + }, + { + "epoch": 0.2555452130678593, + "grad_norm": 0.4581167697906494, + "learning_rate": 8.765690837257753e-05, + "loss": 2.0032, + "step": 1407 + }, + { + "epoch": 0.25572683724203693, + "grad_norm": 0.3656148314476013, + "learning_rate": 8.763744832801926e-05, + "loss": 1.5707, + "step": 1408 + }, + { + "epoch": 0.2559084614162145, + "grad_norm": 1.3142421245574951, + "learning_rate": 8.761797511897906e-05, + "loss": 1.9614, + "step": 1409 + }, + { + "epoch": 0.2560900855903921, + "grad_norm": 0.3925953805446625, + "learning_rate": 8.759848875226812e-05, + "loss": 1.7041, + "step": 1410 + }, + { + "epoch": 0.25627170976456964, + "grad_norm": 0.6808822751045227, + "learning_rate": 8.757898923470218e-05, + "loss": 1.7578, + "step": 1411 + }, + { + "epoch": 0.25645333393874725, + "grad_norm": 0.37422892451286316, + "learning_rate": 8.755947657310161e-05, + "loss": 1.6533, + "step": 1412 + }, + { + "epoch": 0.2566349581129248, + "grad_norm": 0.8154920339584351, + "learning_rate": 8.753995077429139e-05, + "loss": 1.9779, + "step": 1413 + }, + { + "epoch": 0.2568165822871024, + "grad_norm": 0.3473200798034668, + "learning_rate": 8.752041184510102e-05, + "loss": 1.6572, + "step": 1414 + }, + { + "epoch": 0.25699820646128, + "grad_norm": 0.4293542504310608, + "learning_rate": 8.750085979236469e-05, + "loss": 1.8444, + "step": 1415 + }, + { + "epoch": 0.2571798306354576, + "grad_norm": 0.7634435296058655, + "learning_rate": 8.748129462292114e-05, + "loss": 1.8888, + "step": 1416 + }, + { + "epoch": 0.2573614548096352, + "grad_norm": 0.48981615900993347, + "learning_rate": 8.746171634361368e-05, + "loss": 1.8218, + "step": 1417 + }, + { + "epoch": 0.25754307898381273, + "grad_norm": 0.39355990290641785, + "learning_rate": 8.744212496129022e-05, + "loss": 1.8138, + "step": 1418 + }, + { + "epoch": 0.25772470315799034, + "grad_norm": 0.36046916246414185, + "learning_rate": 8.742252048280328e-05, + "loss": 1.8849, + "step": 1419 + }, + { + "epoch": 0.2579063273321679, + "grad_norm": 0.5767477750778198, + "learning_rate": 8.74029029150099e-05, + "loss": 1.7638, + "step": 1420 + }, + { + "epoch": 0.2580879515063455, + "grad_norm": 0.37916961312294006, + "learning_rate": 8.738327226477176e-05, + "loss": 1.7989, + "step": 1421 + }, + { + "epoch": 0.25826957568052306, + "grad_norm": 0.48592230677604675, + "learning_rate": 8.736362853895508e-05, + "loss": 1.7046, + "step": 1422 + }, + { + "epoch": 0.25845119985470066, + "grad_norm": 0.45891353487968445, + "learning_rate": 8.734397174443064e-05, + "loss": 1.7817, + "step": 1423 + }, + { + "epoch": 0.2586328240288782, + "grad_norm": 0.36914366483688354, + "learning_rate": 8.732430188807384e-05, + "loss": 1.7019, + "step": 1424 + }, + { + "epoch": 0.2588144482030558, + "grad_norm": 0.37879806756973267, + "learning_rate": 8.730461897676464e-05, + "loss": 1.599, + "step": 1425 + }, + { + "epoch": 0.25899607237723343, + "grad_norm": 0.6337646245956421, + "learning_rate": 8.728492301738748e-05, + "loss": 1.7977, + "step": 1426 + }, + { + "epoch": 0.259177696551411, + "grad_norm": 0.365852028131485, + "learning_rate": 8.726521401683152e-05, + "loss": 1.842, + "step": 1427 + }, + { + "epoch": 0.2593593207255886, + "grad_norm": 0.5315985083580017, + "learning_rate": 8.724549198199034e-05, + "loss": 1.8826, + "step": 1428 + }, + { + "epoch": 0.25954094489976615, + "grad_norm": 0.4757029116153717, + "learning_rate": 8.722575691976213e-05, + "loss": 1.7849, + "step": 1429 + }, + { + "epoch": 0.25972256907394375, + "grad_norm": 0.40101388096809387, + "learning_rate": 8.720600883704965e-05, + "loss": 1.855, + "step": 1430 + }, + { + "epoch": 0.2599041932481213, + "grad_norm": 0.3816615641117096, + "learning_rate": 8.718624774076023e-05, + "loss": 1.8257, + "step": 1431 + }, + { + "epoch": 0.2600858174222989, + "grad_norm": 0.33440008759498596, + "learning_rate": 8.716647363780568e-05, + "loss": 1.7898, + "step": 1432 + }, + { + "epoch": 0.26026744159647647, + "grad_norm": 0.4474031329154968, + "learning_rate": 8.714668653510244e-05, + "loss": 1.7705, + "step": 1433 + }, + { + "epoch": 0.2604490657706541, + "grad_norm": 0.3366563320159912, + "learning_rate": 8.712688643957144e-05, + "loss": 1.8119, + "step": 1434 + }, + { + "epoch": 0.2606306899448317, + "grad_norm": 0.2957102954387665, + "learning_rate": 8.710707335813819e-05, + "loss": 1.7853, + "step": 1435 + }, + { + "epoch": 0.26081231411900924, + "grad_norm": 0.3422200679779053, + "learning_rate": 8.708724729773272e-05, + "loss": 1.8332, + "step": 1436 + }, + { + "epoch": 0.26099393829318684, + "grad_norm": 0.6829003691673279, + "learning_rate": 8.70674082652896e-05, + "loss": 1.8882, + "step": 1437 + }, + { + "epoch": 0.2611755624673644, + "grad_norm": 0.39767855405807495, + "learning_rate": 8.704755626774796e-05, + "loss": 1.8553, + "step": 1438 + }, + { + "epoch": 0.261357186641542, + "grad_norm": 1.068233847618103, + "learning_rate": 8.702769131205145e-05, + "loss": 1.7592, + "step": 1439 + }, + { + "epoch": 0.26153881081571956, + "grad_norm": 0.46263381838798523, + "learning_rate": 8.700781340514822e-05, + "loss": 1.875, + "step": 1440 + }, + { + "epoch": 0.26172043498989717, + "grad_norm": 0.43104514479637146, + "learning_rate": 8.698792255399104e-05, + "loss": 1.7989, + "step": 1441 + }, + { + "epoch": 0.2619020591640747, + "grad_norm": 0.46701791882514954, + "learning_rate": 8.696801876553711e-05, + "loss": 1.7127, + "step": 1442 + }, + { + "epoch": 0.2620836833382523, + "grad_norm": 0.566098690032959, + "learning_rate": 8.69481020467482e-05, + "loss": 1.8382, + "step": 1443 + }, + { + "epoch": 0.2622653075124299, + "grad_norm": 0.33358657360076904, + "learning_rate": 8.692817240459061e-05, + "loss": 1.8407, + "step": 1444 + }, + { + "epoch": 0.2624469316866075, + "grad_norm": 0.40457800030708313, + "learning_rate": 8.690822984603512e-05, + "loss": 1.7291, + "step": 1445 + }, + { + "epoch": 0.2626285558607851, + "grad_norm": 0.2878446877002716, + "learning_rate": 8.688827437805708e-05, + "loss": 1.6849, + "step": 1446 + }, + { + "epoch": 0.26281018003496265, + "grad_norm": 0.3388986587524414, + "learning_rate": 8.686830600763634e-05, + "loss": 1.4867, + "step": 1447 + }, + { + "epoch": 0.26299180420914026, + "grad_norm": 0.44893673062324524, + "learning_rate": 8.684832474175724e-05, + "loss": 1.7684, + "step": 1448 + }, + { + "epoch": 0.2631734283833178, + "grad_norm": 0.5829174518585205, + "learning_rate": 8.682833058740862e-05, + "loss": 1.6305, + "step": 1449 + }, + { + "epoch": 0.2633550525574954, + "grad_norm": 0.3476617932319641, + "learning_rate": 8.680832355158388e-05, + "loss": 1.8235, + "step": 1450 + }, + { + "epoch": 0.26353667673167297, + "grad_norm": 0.43227750062942505, + "learning_rate": 8.67883036412809e-05, + "loss": 2.0221, + "step": 1451 + }, + { + "epoch": 0.2637183009058506, + "grad_norm": 0.41027089953422546, + "learning_rate": 8.676827086350206e-05, + "loss": 1.7178, + "step": 1452 + }, + { + "epoch": 0.26389992508002813, + "grad_norm": 0.8589214086532593, + "learning_rate": 8.674822522525422e-05, + "loss": 1.942, + "step": 1453 + }, + { + "epoch": 0.26408154925420574, + "grad_norm": 0.4910159111022949, + "learning_rate": 8.672816673354878e-05, + "loss": 1.7875, + "step": 1454 + }, + { + "epoch": 0.2642631734283833, + "grad_norm": 0.6464884877204895, + "learning_rate": 8.670809539540162e-05, + "loss": 1.7374, + "step": 1455 + }, + { + "epoch": 0.2644447976025609, + "grad_norm": 0.3828637897968292, + "learning_rate": 8.668801121783308e-05, + "loss": 1.6545, + "step": 1456 + }, + { + "epoch": 0.2646264217767385, + "grad_norm": 0.6814947128295898, + "learning_rate": 8.666791420786803e-05, + "loss": 1.6902, + "step": 1457 + }, + { + "epoch": 0.26480804595091606, + "grad_norm": 0.41468775272369385, + "learning_rate": 8.664780437253586e-05, + "loss": 1.7243, + "step": 1458 + }, + { + "epoch": 0.26498967012509367, + "grad_norm": 0.9007341861724854, + "learning_rate": 8.662768171887034e-05, + "loss": 1.7549, + "step": 1459 + }, + { + "epoch": 0.2651712942992712, + "grad_norm": 0.54853755235672, + "learning_rate": 8.660754625390984e-05, + "loss": 1.9098, + "step": 1460 + }, + { + "epoch": 0.26535291847344883, + "grad_norm": 0.417065292596817, + "learning_rate": 8.658739798469712e-05, + "loss": 1.7397, + "step": 1461 + }, + { + "epoch": 0.2655345426476264, + "grad_norm": 0.421535462141037, + "learning_rate": 8.656723691827951e-05, + "loss": 1.7692, + "step": 1462 + }, + { + "epoch": 0.265716166821804, + "grad_norm": 0.476529061794281, + "learning_rate": 8.654706306170868e-05, + "loss": 1.7416, + "step": 1463 + }, + { + "epoch": 0.26589779099598154, + "grad_norm": 0.38049396872520447, + "learning_rate": 8.652687642204093e-05, + "loss": 1.8689, + "step": 1464 + }, + { + "epoch": 0.26607941517015915, + "grad_norm": 0.5481051206588745, + "learning_rate": 8.650667700633692e-05, + "loss": 1.8318, + "step": 1465 + }, + { + "epoch": 0.26626103934433676, + "grad_norm": 0.9362296462059021, + "learning_rate": 8.648646482166183e-05, + "loss": 1.8841, + "step": 1466 + }, + { + "epoch": 0.2664426635185143, + "grad_norm": 0.4077291190624237, + "learning_rate": 8.646623987508528e-05, + "loss": 1.8705, + "step": 1467 + }, + { + "epoch": 0.2666242876926919, + "grad_norm": 0.4437808394432068, + "learning_rate": 8.64460021736814e-05, + "loss": 1.8136, + "step": 1468 + }, + { + "epoch": 0.26680591186686947, + "grad_norm": 0.49600428342819214, + "learning_rate": 8.642575172452871e-05, + "loss": 1.8944, + "step": 1469 + }, + { + "epoch": 0.2669875360410471, + "grad_norm": 0.6178036332130432, + "learning_rate": 8.640548853471025e-05, + "loss": 2.0507, + "step": 1470 + }, + { + "epoch": 0.26716916021522463, + "grad_norm": 0.5106523633003235, + "learning_rate": 8.638521261131349e-05, + "loss": 1.9094, + "step": 1471 + }, + { + "epoch": 0.26735078438940224, + "grad_norm": 0.4348966181278229, + "learning_rate": 8.636492396143034e-05, + "loss": 1.751, + "step": 1472 + }, + { + "epoch": 0.2675324085635798, + "grad_norm": 0.4189602732658386, + "learning_rate": 8.634462259215719e-05, + "loss": 1.6349, + "step": 1473 + }, + { + "epoch": 0.2677140327377574, + "grad_norm": 0.4347527027130127, + "learning_rate": 8.632430851059487e-05, + "loss": 1.7882, + "step": 1474 + }, + { + "epoch": 0.26789565691193495, + "grad_norm": 0.3965340256690979, + "learning_rate": 8.630398172384865e-05, + "loss": 1.8054, + "step": 1475 + }, + { + "epoch": 0.26807728108611256, + "grad_norm": 0.4262984097003937, + "learning_rate": 8.628364223902825e-05, + "loss": 1.8195, + "step": 1476 + }, + { + "epoch": 0.26825890526029017, + "grad_norm": 0.40779852867126465, + "learning_rate": 8.626329006324782e-05, + "loss": 1.7362, + "step": 1477 + }, + { + "epoch": 0.2684405294344677, + "grad_norm": 0.3628137409687042, + "learning_rate": 8.624292520362596e-05, + "loss": 1.7078, + "step": 1478 + }, + { + "epoch": 0.26862215360864533, + "grad_norm": 0.400200754404068, + "learning_rate": 8.62225476672857e-05, + "loss": 1.917, + "step": 1479 + }, + { + "epoch": 0.2688037777828229, + "grad_norm": 0.5642498731613159, + "learning_rate": 8.620215746135454e-05, + "loss": 1.8916, + "step": 1480 + }, + { + "epoch": 0.2689854019570005, + "grad_norm": 0.3959648609161377, + "learning_rate": 8.618175459296433e-05, + "loss": 1.7588, + "step": 1481 + }, + { + "epoch": 0.26916702613117804, + "grad_norm": 0.5438734292984009, + "learning_rate": 8.616133906925145e-05, + "loss": 2.0303, + "step": 1482 + }, + { + "epoch": 0.26934865030535565, + "grad_norm": 0.37093105912208557, + "learning_rate": 8.61409108973566e-05, + "loss": 1.7656, + "step": 1483 + }, + { + "epoch": 0.2695302744795332, + "grad_norm": 0.6146934628486633, + "learning_rate": 8.612047008442501e-05, + "loss": 1.7954, + "step": 1484 + }, + { + "epoch": 0.2697118986537108, + "grad_norm": 0.39958831667900085, + "learning_rate": 8.610001663760626e-05, + "loss": 1.8056, + "step": 1485 + }, + { + "epoch": 0.26989352282788837, + "grad_norm": 0.30180105566978455, + "learning_rate": 8.607955056405435e-05, + "loss": 1.6453, + "step": 1486 + }, + { + "epoch": 0.270075147002066, + "grad_norm": 0.4860895872116089, + "learning_rate": 8.605907187092774e-05, + "loss": 1.957, + "step": 1487 + }, + { + "epoch": 0.2702567711762436, + "grad_norm": 0.4076760411262512, + "learning_rate": 8.603858056538927e-05, + "loss": 1.6576, + "step": 1488 + }, + { + "epoch": 0.27043839535042113, + "grad_norm": 0.4006046950817108, + "learning_rate": 8.60180766546062e-05, + "loss": 1.7824, + "step": 1489 + }, + { + "epoch": 0.27062001952459874, + "grad_norm": 0.38567915558815, + "learning_rate": 8.59975601457502e-05, + "loss": 1.8516, + "step": 1490 + }, + { + "epoch": 0.2708016436987763, + "grad_norm": 0.38249075412750244, + "learning_rate": 8.597703104599736e-05, + "loss": 1.6766, + "step": 1491 + }, + { + "epoch": 0.2709832678729539, + "grad_norm": 0.4776468873023987, + "learning_rate": 8.595648936252816e-05, + "loss": 1.8288, + "step": 1492 + }, + { + "epoch": 0.27116489204713146, + "grad_norm": 0.4284636676311493, + "learning_rate": 8.593593510252746e-05, + "loss": 1.8303, + "step": 1493 + }, + { + "epoch": 0.27134651622130906, + "grad_norm": 7.1767191886901855, + "learning_rate": 8.591536827318454e-05, + "loss": 1.9351, + "step": 1494 + }, + { + "epoch": 0.2715281403954866, + "grad_norm": 0.47047650814056396, + "learning_rate": 8.589478888169311e-05, + "loss": 1.8069, + "step": 1495 + }, + { + "epoch": 0.2717097645696642, + "grad_norm": 0.3791496157646179, + "learning_rate": 8.58741969352512e-05, + "loss": 1.7878, + "step": 1496 + }, + { + "epoch": 0.27189138874384183, + "grad_norm": 0.30033430457115173, + "learning_rate": 8.585359244106132e-05, + "loss": 1.6807, + "step": 1497 + }, + { + "epoch": 0.2720730129180194, + "grad_norm": 0.4274178147315979, + "learning_rate": 8.583297540633029e-05, + "loss": 1.698, + "step": 1498 + }, + { + "epoch": 0.272254637092197, + "grad_norm": 0.8717056512832642, + "learning_rate": 8.581234583826934e-05, + "loss": 1.7782, + "step": 1499 + }, + { + "epoch": 0.27243626126637455, + "grad_norm": 0.4252730906009674, + "learning_rate": 8.57917037440941e-05, + "loss": 1.7904, + "step": 1500 + }, + { + "epoch": 0.27261788544055215, + "grad_norm": 0.5986876487731934, + "learning_rate": 8.577104913102458e-05, + "loss": 1.9593, + "step": 1501 + }, + { + "epoch": 0.2727995096147297, + "grad_norm": 0.4603033661842346, + "learning_rate": 8.575038200628518e-05, + "loss": 1.9866, + "step": 1502 + }, + { + "epoch": 0.2729811337889073, + "grad_norm": 0.48111748695373535, + "learning_rate": 8.57297023771046e-05, + "loss": 1.8277, + "step": 1503 + }, + { + "epoch": 0.27316275796308487, + "grad_norm": 0.5980393886566162, + "learning_rate": 8.570901025071604e-05, + "loss": 1.815, + "step": 1504 + }, + { + "epoch": 0.2733443821372625, + "grad_norm": 0.38133129477500916, + "learning_rate": 8.568830563435694e-05, + "loss": 1.7003, + "step": 1505 + }, + { + "epoch": 0.27352600631144003, + "grad_norm": 0.7250569462776184, + "learning_rate": 8.566758853526923e-05, + "loss": 1.7994, + "step": 1506 + }, + { + "epoch": 0.27370763048561764, + "grad_norm": 0.34541594982147217, + "learning_rate": 8.56468589606991e-05, + "loss": 1.7173, + "step": 1507 + }, + { + "epoch": 0.27388925465979524, + "grad_norm": 0.43276166915893555, + "learning_rate": 8.562611691789717e-05, + "loss": 1.679, + "step": 1508 + }, + { + "epoch": 0.2740708788339728, + "grad_norm": 0.652392566204071, + "learning_rate": 8.56053624141184e-05, + "loss": 1.6934, + "step": 1509 + }, + { + "epoch": 0.2742525030081504, + "grad_norm": 0.3247782588005066, + "learning_rate": 8.55845954566221e-05, + "loss": 1.701, + "step": 1510 + }, + { + "epoch": 0.27443412718232796, + "grad_norm": 0.4163033664226532, + "learning_rate": 8.556381605267196e-05, + "loss": 1.8382, + "step": 1511 + }, + { + "epoch": 0.27461575135650557, + "grad_norm": 0.47123977541923523, + "learning_rate": 8.554302420953602e-05, + "loss": 1.863, + "step": 1512 + }, + { + "epoch": 0.2747973755306831, + "grad_norm": 0.4229269027709961, + "learning_rate": 8.552221993448664e-05, + "loss": 1.7846, + "step": 1513 + }, + { + "epoch": 0.2749789997048607, + "grad_norm": 0.26079288125038147, + "learning_rate": 8.550140323480056e-05, + "loss": 1.6644, + "step": 1514 + }, + { + "epoch": 0.2751606238790383, + "grad_norm": 0.41258901357650757, + "learning_rate": 8.548057411775883e-05, + "loss": 1.9173, + "step": 1515 + }, + { + "epoch": 0.2753422480532159, + "grad_norm": 0.31931260228157043, + "learning_rate": 8.54597325906469e-05, + "loss": 1.6645, + "step": 1516 + }, + { + "epoch": 0.27552387222739344, + "grad_norm": 0.3815561830997467, + "learning_rate": 8.543887866075451e-05, + "loss": 1.7489, + "step": 1517 + }, + { + "epoch": 0.27570549640157105, + "grad_norm": 0.3961819112300873, + "learning_rate": 8.541801233537578e-05, + "loss": 1.8446, + "step": 1518 + }, + { + "epoch": 0.27588712057574866, + "grad_norm": 0.310563325881958, + "learning_rate": 8.53971336218091e-05, + "loss": 1.7479, + "step": 1519 + }, + { + "epoch": 0.2760687447499262, + "grad_norm": 0.660724401473999, + "learning_rate": 8.537624252735728e-05, + "loss": 1.9004, + "step": 1520 + }, + { + "epoch": 0.2762503689241038, + "grad_norm": 0.6176539659500122, + "learning_rate": 8.535533905932738e-05, + "loss": 1.6536, + "step": 1521 + }, + { + "epoch": 0.27643199309828137, + "grad_norm": 0.39009881019592285, + "learning_rate": 8.533442322503085e-05, + "loss": 1.7921, + "step": 1522 + }, + { + "epoch": 0.276613617272459, + "grad_norm": 0.3938928246498108, + "learning_rate": 8.531349503178342e-05, + "loss": 1.7879, + "step": 1523 + }, + { + "epoch": 0.27679524144663653, + "grad_norm": 0.546247124671936, + "learning_rate": 8.529255448690517e-05, + "loss": 1.5495, + "step": 1524 + }, + { + "epoch": 0.27697686562081414, + "grad_norm": 0.6985185742378235, + "learning_rate": 8.52716015977205e-05, + "loss": 1.8005, + "step": 1525 + }, + { + "epoch": 0.2771584897949917, + "grad_norm": 0.6116211414337158, + "learning_rate": 8.52506363715581e-05, + "loss": 1.7663, + "step": 1526 + }, + { + "epoch": 0.2773401139691693, + "grad_norm": 0.5191812515258789, + "learning_rate": 8.5229658815751e-05, + "loss": 1.9486, + "step": 1527 + }, + { + "epoch": 0.27752173814334685, + "grad_norm": 0.46621444821357727, + "learning_rate": 8.520866893763655e-05, + "loss": 1.718, + "step": 1528 + }, + { + "epoch": 0.27770336231752446, + "grad_norm": 0.390455424785614, + "learning_rate": 8.51876667445564e-05, + "loss": 2.05, + "step": 1529 + }, + { + "epoch": 0.27788498649170207, + "grad_norm": 0.3839137852191925, + "learning_rate": 8.516665224385649e-05, + "loss": 1.7831, + "step": 1530 + }, + { + "epoch": 0.2780666106658796, + "grad_norm": 0.31432050466537476, + "learning_rate": 8.51456254428871e-05, + "loss": 1.8555, + "step": 1531 + }, + { + "epoch": 0.27824823484005723, + "grad_norm": 0.4558294117450714, + "learning_rate": 8.512458634900275e-05, + "loss": 1.5947, + "step": 1532 + }, + { + "epoch": 0.2784298590142348, + "grad_norm": 0.835339367389679, + "learning_rate": 8.510353496956234e-05, + "loss": 1.9337, + "step": 1533 + }, + { + "epoch": 0.2786114831884124, + "grad_norm": 0.4013761878013611, + "learning_rate": 8.508247131192902e-05, + "loss": 1.713, + "step": 1534 + }, + { + "epoch": 0.27879310736258994, + "grad_norm": 0.5864201784133911, + "learning_rate": 8.506139538347022e-05, + "loss": 1.8323, + "step": 1535 + }, + { + "epoch": 0.27897473153676755, + "grad_norm": 0.5541375875473022, + "learning_rate": 8.504030719155773e-05, + "loss": 1.9137, + "step": 1536 + }, + { + "epoch": 0.2791563557109451, + "grad_norm": 0.37119126319885254, + "learning_rate": 8.501920674356754e-05, + "loss": 1.8021, + "step": 1537 + }, + { + "epoch": 0.2793379798851227, + "grad_norm": 0.4635416269302368, + "learning_rate": 8.499809404688e-05, + "loss": 1.757, + "step": 1538 + }, + { + "epoch": 0.2795196040593003, + "grad_norm": 0.3179568350315094, + "learning_rate": 8.497696910887971e-05, + "loss": 1.6588, + "step": 1539 + }, + { + "epoch": 0.27970122823347787, + "grad_norm": 1.2767635583877563, + "learning_rate": 8.495583193695555e-05, + "loss": 1.851, + "step": 1540 + }, + { + "epoch": 0.2798828524076555, + "grad_norm": 0.43684694170951843, + "learning_rate": 8.49346825385007e-05, + "loss": 1.6097, + "step": 1541 + }, + { + "epoch": 0.28006447658183303, + "grad_norm": 0.4840036928653717, + "learning_rate": 8.491352092091258e-05, + "loss": 1.817, + "step": 1542 + }, + { + "epoch": 0.28024610075601064, + "grad_norm": 0.4346402883529663, + "learning_rate": 8.489234709159293e-05, + "loss": 1.7097, + "step": 1543 + }, + { + "epoch": 0.2804277249301882, + "grad_norm": 0.6161524057388306, + "learning_rate": 8.487116105794772e-05, + "loss": 2.096, + "step": 1544 + }, + { + "epoch": 0.2806093491043658, + "grad_norm": 0.5817924737930298, + "learning_rate": 8.484996282738722e-05, + "loss": 1.8926, + "step": 1545 + }, + { + "epoch": 0.28079097327854335, + "grad_norm": 0.343446284532547, + "learning_rate": 8.482875240732595e-05, + "loss": 1.8569, + "step": 1546 + }, + { + "epoch": 0.28097259745272096, + "grad_norm": 0.3547852635383606, + "learning_rate": 8.480752980518269e-05, + "loss": 1.7206, + "step": 1547 + }, + { + "epoch": 0.2811542216268985, + "grad_norm": 1.213689923286438, + "learning_rate": 8.47862950283805e-05, + "loss": 1.918, + "step": 1548 + }, + { + "epoch": 0.2813358458010761, + "grad_norm": 0.49426499009132385, + "learning_rate": 8.476504808434666e-05, + "loss": 1.7531, + "step": 1549 + }, + { + "epoch": 0.28151746997525373, + "grad_norm": 0.4561198651790619, + "learning_rate": 8.474378898051277e-05, + "loss": 1.9094, + "step": 1550 + }, + { + "epoch": 0.2816990941494313, + "grad_norm": 0.43311965465545654, + "learning_rate": 8.472251772431461e-05, + "loss": 1.8284, + "step": 1551 + }, + { + "epoch": 0.2818807183236089, + "grad_norm": 0.5331430435180664, + "learning_rate": 8.470123432319227e-05, + "loss": 1.8557, + "step": 1552 + }, + { + "epoch": 0.28206234249778644, + "grad_norm": 0.3101229965686798, + "learning_rate": 8.467993878459004e-05, + "loss": 1.6687, + "step": 1553 + }, + { + "epoch": 0.28224396667196405, + "grad_norm": 0.4086327850818634, + "learning_rate": 8.46586311159565e-05, + "loss": 1.5861, + "step": 1554 + }, + { + "epoch": 0.2824255908461416, + "grad_norm": 0.40725627541542053, + "learning_rate": 8.463731132474442e-05, + "loss": 1.8887, + "step": 1555 + }, + { + "epoch": 0.2826072150203192, + "grad_norm": 0.32979118824005127, + "learning_rate": 8.461597941841089e-05, + "loss": 1.8009, + "step": 1556 + }, + { + "epoch": 0.28278883919449677, + "grad_norm": 0.3970656991004944, + "learning_rate": 8.459463540441716e-05, + "loss": 1.8472, + "step": 1557 + }, + { + "epoch": 0.2829704633686744, + "grad_norm": 0.3560042083263397, + "learning_rate": 8.457327929022873e-05, + "loss": 1.8586, + "step": 1558 + }, + { + "epoch": 0.2831520875428519, + "grad_norm": 0.7143173813819885, + "learning_rate": 8.455191108331536e-05, + "loss": 1.7244, + "step": 1559 + }, + { + "epoch": 0.28333371171702953, + "grad_norm": 0.37010252475738525, + "learning_rate": 8.453053079115103e-05, + "loss": 1.7249, + "step": 1560 + }, + { + "epoch": 0.28351533589120714, + "grad_norm": 1.2078853845596313, + "learning_rate": 8.450913842121396e-05, + "loss": 1.7154, + "step": 1561 + }, + { + "epoch": 0.2836969600653847, + "grad_norm": 0.38228315114974976, + "learning_rate": 8.448773398098652e-05, + "loss": 1.8153, + "step": 1562 + }, + { + "epoch": 0.2838785842395623, + "grad_norm": 0.36804670095443726, + "learning_rate": 8.446631747795541e-05, + "loss": 1.8788, + "step": 1563 + }, + { + "epoch": 0.28406020841373986, + "grad_norm": 0.3285600543022156, + "learning_rate": 8.444488891961148e-05, + "loss": 1.7926, + "step": 1564 + }, + { + "epoch": 0.28424183258791746, + "grad_norm": 0.5678157210350037, + "learning_rate": 8.442344831344985e-05, + "loss": 1.8999, + "step": 1565 + }, + { + "epoch": 0.284423456762095, + "grad_norm": 0.509946882724762, + "learning_rate": 8.440199566696976e-05, + "loss": 1.8953, + "step": 1566 + }, + { + "epoch": 0.2846050809362726, + "grad_norm": 0.4109271764755249, + "learning_rate": 8.438053098767476e-05, + "loss": 1.6724, + "step": 1567 + }, + { + "epoch": 0.2847867051104502, + "grad_norm": 0.4237106740474701, + "learning_rate": 8.435905428307254e-05, + "loss": 1.6754, + "step": 1568 + }, + { + "epoch": 0.2849683292846278, + "grad_norm": 0.3430403470993042, + "learning_rate": 8.433756556067506e-05, + "loss": 1.6567, + "step": 1569 + }, + { + "epoch": 0.2851499534588054, + "grad_norm": 0.3556921184062958, + "learning_rate": 8.431606482799844e-05, + "loss": 1.6, + "step": 1570 + }, + { + "epoch": 0.28533157763298295, + "grad_norm": 1.2429702281951904, + "learning_rate": 8.429455209256297e-05, + "loss": 1.8691, + "step": 1571 + }, + { + "epoch": 0.28551320180716055, + "grad_norm": 0.5194343328475952, + "learning_rate": 8.427302736189323e-05, + "loss": 1.6934, + "step": 1572 + }, + { + "epoch": 0.2856948259813381, + "grad_norm": 0.5521764755249023, + "learning_rate": 8.42514906435179e-05, + "loss": 1.8232, + "step": 1573 + }, + { + "epoch": 0.2858764501555157, + "grad_norm": 0.47152310609817505, + "learning_rate": 8.422994194496991e-05, + "loss": 1.7638, + "step": 1574 + }, + { + "epoch": 0.28605807432969327, + "grad_norm": 0.4865815341472626, + "learning_rate": 8.420838127378639e-05, + "loss": 1.68, + "step": 1575 + }, + { + "epoch": 0.2862396985038709, + "grad_norm": 0.3550148904323578, + "learning_rate": 8.418680863750863e-05, + "loss": 1.7371, + "step": 1576 + }, + { + "epoch": 0.28642132267804843, + "grad_norm": 0.36257898807525635, + "learning_rate": 8.416522404368208e-05, + "loss": 1.8279, + "step": 1577 + }, + { + "epoch": 0.28660294685222604, + "grad_norm": 0.3876727223396301, + "learning_rate": 8.414362749985641e-05, + "loss": 1.6454, + "step": 1578 + }, + { + "epoch": 0.2867845710264036, + "grad_norm": 0.49917516112327576, + "learning_rate": 8.412201901358548e-05, + "loss": 1.8719, + "step": 1579 + }, + { + "epoch": 0.2869661952005812, + "grad_norm": 0.3595849275588989, + "learning_rate": 8.41003985924273e-05, + "loss": 1.6881, + "step": 1580 + }, + { + "epoch": 0.2871478193747588, + "grad_norm": 0.6796814203262329, + "learning_rate": 8.407876624394406e-05, + "loss": 1.7272, + "step": 1581 + }, + { + "epoch": 0.28732944354893636, + "grad_norm": 0.5059986710548401, + "learning_rate": 8.405712197570212e-05, + "loss": 1.7779, + "step": 1582 + }, + { + "epoch": 0.28751106772311397, + "grad_norm": 0.38144367933273315, + "learning_rate": 8.403546579527201e-05, + "loss": 1.6944, + "step": 1583 + }, + { + "epoch": 0.2876926918972915, + "grad_norm": 0.4057732820510864, + "learning_rate": 8.401379771022845e-05, + "loss": 1.8904, + "step": 1584 + }, + { + "epoch": 0.2878743160714691, + "grad_norm": 0.43234217166900635, + "learning_rate": 8.39921177281503e-05, + "loss": 1.7744, + "step": 1585 + }, + { + "epoch": 0.2880559402456467, + "grad_norm": 1.3444961309432983, + "learning_rate": 8.397042585662055e-05, + "loss": 1.647, + "step": 1586 + }, + { + "epoch": 0.2882375644198243, + "grad_norm": 0.29893288016319275, + "learning_rate": 8.394872210322645e-05, + "loss": 1.8355, + "step": 1587 + }, + { + "epoch": 0.28841918859400184, + "grad_norm": 0.4034648835659027, + "learning_rate": 8.392700647555929e-05, + "loss": 1.6613, + "step": 1588 + }, + { + "epoch": 0.28860081276817945, + "grad_norm": 0.5174705386161804, + "learning_rate": 8.390527898121456e-05, + "loss": 1.8074, + "step": 1589 + }, + { + "epoch": 0.288782436942357, + "grad_norm": 0.43445155024528503, + "learning_rate": 8.388353962779194e-05, + "loss": 1.7936, + "step": 1590 + }, + { + "epoch": 0.2889640611165346, + "grad_norm": 0.41943734884262085, + "learning_rate": 8.38617884228952e-05, + "loss": 1.8598, + "step": 1591 + }, + { + "epoch": 0.2891456852907122, + "grad_norm": 0.48723137378692627, + "learning_rate": 8.384002537413226e-05, + "loss": 1.7406, + "step": 1592 + }, + { + "epoch": 0.28932730946488977, + "grad_norm": 0.34488582611083984, + "learning_rate": 8.381825048911525e-05, + "loss": 1.6636, + "step": 1593 + }, + { + "epoch": 0.2895089336390674, + "grad_norm": 0.3721930980682373, + "learning_rate": 8.379646377546033e-05, + "loss": 1.7223, + "step": 1594 + }, + { + "epoch": 0.28969055781324493, + "grad_norm": 0.4354974925518036, + "learning_rate": 8.37746652407879e-05, + "loss": 1.9318, + "step": 1595 + }, + { + "epoch": 0.28987218198742254, + "grad_norm": 0.36430492997169495, + "learning_rate": 8.375285489272244e-05, + "loss": 1.7616, + "step": 1596 + }, + { + "epoch": 0.2900538061616001, + "grad_norm": 0.39568206667900085, + "learning_rate": 8.373103273889257e-05, + "loss": 1.8492, + "step": 1597 + }, + { + "epoch": 0.2902354303357777, + "grad_norm": 0.37203580141067505, + "learning_rate": 8.370919878693104e-05, + "loss": 1.5693, + "step": 1598 + }, + { + "epoch": 0.29041705450995525, + "grad_norm": 0.35803666710853577, + "learning_rate": 8.368735304447474e-05, + "loss": 1.6839, + "step": 1599 + }, + { + "epoch": 0.29059867868413286, + "grad_norm": 0.42503756284713745, + "learning_rate": 8.366549551916467e-05, + "loss": 1.83, + "step": 1600 + }, + { + "epoch": 0.29078030285831047, + "grad_norm": 0.9337709546089172, + "learning_rate": 8.364362621864595e-05, + "loss": 1.7747, + "step": 1601 + }, + { + "epoch": 0.290961927032488, + "grad_norm": 0.43157505989074707, + "learning_rate": 8.362174515056783e-05, + "loss": 1.8031, + "step": 1602 + }, + { + "epoch": 0.29114355120666563, + "grad_norm": 0.4516918361186981, + "learning_rate": 8.359985232258366e-05, + "loss": 1.8484, + "step": 1603 + }, + { + "epoch": 0.2913251753808432, + "grad_norm": 0.5206003189086914, + "learning_rate": 8.357794774235092e-05, + "loss": 1.8437, + "step": 1604 + }, + { + "epoch": 0.2915067995550208, + "grad_norm": 0.42459869384765625, + "learning_rate": 8.355603141753121e-05, + "loss": 1.7026, + "step": 1605 + }, + { + "epoch": 0.29168842372919834, + "grad_norm": 0.3845681846141815, + "learning_rate": 8.353410335579017e-05, + "loss": 1.5707, + "step": 1606 + }, + { + "epoch": 0.29187004790337595, + "grad_norm": 0.38807427883148193, + "learning_rate": 8.351216356479766e-05, + "loss": 1.6192, + "step": 1607 + }, + { + "epoch": 0.2920516720775535, + "grad_norm": 0.45921534299850464, + "learning_rate": 8.349021205222753e-05, + "loss": 1.9679, + "step": 1608 + }, + { + "epoch": 0.2922332962517311, + "grad_norm": 0.3885156214237213, + "learning_rate": 8.346824882575782e-05, + "loss": 1.8109, + "step": 1609 + }, + { + "epoch": 0.29241492042590866, + "grad_norm": 0.4315706193447113, + "learning_rate": 8.344627389307059e-05, + "loss": 1.8886, + "step": 1610 + }, + { + "epoch": 0.2925965446000863, + "grad_norm": 0.3100855350494385, + "learning_rate": 8.342428726185205e-05, + "loss": 1.657, + "step": 1611 + }, + { + "epoch": 0.2927781687742639, + "grad_norm": 0.4668561816215515, + "learning_rate": 8.340228893979247e-05, + "loss": 1.5896, + "step": 1612 + }, + { + "epoch": 0.29295979294844143, + "grad_norm": 0.4969610273838043, + "learning_rate": 8.338027893458625e-05, + "loss": 1.7778, + "step": 1613 + }, + { + "epoch": 0.29314141712261904, + "grad_norm": 0.3728976547718048, + "learning_rate": 8.33582572539318e-05, + "loss": 1.6292, + "step": 1614 + }, + { + "epoch": 0.2933230412967966, + "grad_norm": 0.3736809194087982, + "learning_rate": 8.33362239055317e-05, + "loss": 1.8759, + "step": 1615 + }, + { + "epoch": 0.2935046654709742, + "grad_norm": 0.5033447742462158, + "learning_rate": 8.331417889709258e-05, + "loss": 1.5234, + "step": 1616 + }, + { + "epoch": 0.29368628964515175, + "grad_norm": 0.42289623618125916, + "learning_rate": 8.329212223632511e-05, + "loss": 1.7941, + "step": 1617 + }, + { + "epoch": 0.29386791381932936, + "grad_norm": 0.46857261657714844, + "learning_rate": 8.32700539309441e-05, + "loss": 1.9086, + "step": 1618 + }, + { + "epoch": 0.2940495379935069, + "grad_norm": 0.6607900261878967, + "learning_rate": 8.324797398866835e-05, + "loss": 1.8309, + "step": 1619 + }, + { + "epoch": 0.2942311621676845, + "grad_norm": 1.01975417137146, + "learning_rate": 8.322588241722081e-05, + "loss": 1.8372, + "step": 1620 + }, + { + "epoch": 0.2944127863418621, + "grad_norm": 0.4104550778865814, + "learning_rate": 8.320377922432848e-05, + "loss": 1.5555, + "step": 1621 + }, + { + "epoch": 0.2945944105160397, + "grad_norm": 0.3480367958545685, + "learning_rate": 8.31816644177224e-05, + "loss": 1.7209, + "step": 1622 + }, + { + "epoch": 0.2947760346902173, + "grad_norm": 0.3571288287639618, + "learning_rate": 8.315953800513767e-05, + "loss": 1.6642, + "step": 1623 + }, + { + "epoch": 0.29495765886439484, + "grad_norm": 0.4660075604915619, + "learning_rate": 8.31373999943135e-05, + "loss": 1.8501, + "step": 1624 + }, + { + "epoch": 0.29513928303857245, + "grad_norm": 0.4009435176849365, + "learning_rate": 8.311525039299309e-05, + "loss": 1.6345, + "step": 1625 + }, + { + "epoch": 0.29532090721275, + "grad_norm": 0.37951433658599854, + "learning_rate": 8.309308920892371e-05, + "loss": 1.6316, + "step": 1626 + }, + { + "epoch": 0.2955025313869276, + "grad_norm": 0.28781551122665405, + "learning_rate": 8.307091644985673e-05, + "loss": 1.8333, + "step": 1627 + }, + { + "epoch": 0.29568415556110517, + "grad_norm": 0.37242501974105835, + "learning_rate": 8.304873212354756e-05, + "loss": 1.8091, + "step": 1628 + }, + { + "epoch": 0.2958657797352828, + "grad_norm": 0.42822158336639404, + "learning_rate": 8.302653623775556e-05, + "loss": 1.6991, + "step": 1629 + }, + { + "epoch": 0.2960474039094603, + "grad_norm": 0.6217488646507263, + "learning_rate": 8.300432880024424e-05, + "loss": 1.9121, + "step": 1630 + }, + { + "epoch": 0.29622902808363794, + "grad_norm": 0.543077290058136, + "learning_rate": 8.298210981878112e-05, + "loss": 1.8381, + "step": 1631 + }, + { + "epoch": 0.2964106522578155, + "grad_norm": 0.39732640981674194, + "learning_rate": 8.295987930113775e-05, + "loss": 1.8969, + "step": 1632 + }, + { + "epoch": 0.2965922764319931, + "grad_norm": 7.987948894500732, + "learning_rate": 8.293763725508969e-05, + "loss": 1.8503, + "step": 1633 + }, + { + "epoch": 0.2967739006061707, + "grad_norm": 0.38470903038978577, + "learning_rate": 8.29153836884166e-05, + "loss": 1.8432, + "step": 1634 + }, + { + "epoch": 0.29695552478034826, + "grad_norm": 0.7630496621131897, + "learning_rate": 8.28931186089021e-05, + "loss": 1.7184, + "step": 1635 + }, + { + "epoch": 0.29713714895452586, + "grad_norm": 0.4988643527030945, + "learning_rate": 8.287084202433385e-05, + "loss": 1.7823, + "step": 1636 + }, + { + "epoch": 0.2973187731287034, + "grad_norm": 0.4503086507320404, + "learning_rate": 8.284855394250362e-05, + "loss": 1.6591, + "step": 1637 + }, + { + "epoch": 0.297500397302881, + "grad_norm": 0.49368536472320557, + "learning_rate": 8.282625437120706e-05, + "loss": 1.855, + "step": 1638 + }, + { + "epoch": 0.2976820214770586, + "grad_norm": 0.33147287368774414, + "learning_rate": 8.280394331824393e-05, + "loss": 1.6912, + "step": 1639 + }, + { + "epoch": 0.2978636456512362, + "grad_norm": 0.3278810679912567, + "learning_rate": 8.2781620791418e-05, + "loss": 1.6929, + "step": 1640 + }, + { + "epoch": 0.29804526982541374, + "grad_norm": 0.5630048513412476, + "learning_rate": 8.275928679853703e-05, + "loss": 1.8836, + "step": 1641 + }, + { + "epoch": 0.29822689399959135, + "grad_norm": 0.7501957416534424, + "learning_rate": 8.273694134741278e-05, + "loss": 1.7583, + "step": 1642 + }, + { + "epoch": 0.29840851817376896, + "grad_norm": 0.6428157091140747, + "learning_rate": 8.271458444586107e-05, + "loss": 1.833, + "step": 1643 + }, + { + "epoch": 0.2985901423479465, + "grad_norm": 0.34124475717544556, + "learning_rate": 8.269221610170169e-05, + "loss": 1.6487, + "step": 1644 + }, + { + "epoch": 0.2987717665221241, + "grad_norm": 0.3392082452774048, + "learning_rate": 8.26698363227584e-05, + "loss": 1.7372, + "step": 1645 + }, + { + "epoch": 0.29895339069630167, + "grad_norm": 1.3782391548156738, + "learning_rate": 8.264744511685904e-05, + "loss": 1.9649, + "step": 1646 + }, + { + "epoch": 0.2991350148704793, + "grad_norm": 0.37580615282058716, + "learning_rate": 8.262504249183536e-05, + "loss": 1.8222, + "step": 1647 + }, + { + "epoch": 0.29931663904465683, + "grad_norm": 0.37503352761268616, + "learning_rate": 8.260262845552318e-05, + "loss": 1.7331, + "step": 1648 + }, + { + "epoch": 0.29949826321883444, + "grad_norm": 0.447391152381897, + "learning_rate": 8.258020301576224e-05, + "loss": 1.7843, + "step": 1649 + }, + { + "epoch": 0.299679887393012, + "grad_norm": 0.36880865693092346, + "learning_rate": 8.255776618039634e-05, + "loss": 1.7237, + "step": 1650 + }, + { + "epoch": 0.2998615115671896, + "grad_norm": 0.494592547416687, + "learning_rate": 8.253531795727319e-05, + "loss": 1.5702, + "step": 1651 + }, + { + "epoch": 0.30004313574136715, + "grad_norm": 0.39566588401794434, + "learning_rate": 8.251285835424459e-05, + "loss": 1.6889, + "step": 1652 + }, + { + "epoch": 0.30022475991554476, + "grad_norm": 0.35588473081588745, + "learning_rate": 8.249038737916617e-05, + "loss": 1.7813, + "step": 1653 + }, + { + "epoch": 0.30040638408972237, + "grad_norm": 0.32622307538986206, + "learning_rate": 8.246790503989767e-05, + "loss": 1.5334, + "step": 1654 + }, + { + "epoch": 0.3005880082638999, + "grad_norm": 0.6359313130378723, + "learning_rate": 8.244541134430276e-05, + "loss": 1.7064, + "step": 1655 + }, + { + "epoch": 0.3007696324380775, + "grad_norm": 0.3656284511089325, + "learning_rate": 8.242290630024909e-05, + "loss": 1.7534, + "step": 1656 + }, + { + "epoch": 0.3009512566122551, + "grad_norm": 0.7258945107460022, + "learning_rate": 8.240038991560823e-05, + "loss": 1.8054, + "step": 1657 + }, + { + "epoch": 0.3011328807864327, + "grad_norm": 0.396989107131958, + "learning_rate": 8.237786219825577e-05, + "loss": 1.7694, + "step": 1658 + }, + { + "epoch": 0.30131450496061024, + "grad_norm": 0.3892711400985718, + "learning_rate": 8.235532315607126e-05, + "loss": 1.5191, + "step": 1659 + }, + { + "epoch": 0.30149612913478785, + "grad_norm": 0.4993894100189209, + "learning_rate": 8.233277279693819e-05, + "loss": 1.7215, + "step": 1660 + }, + { + "epoch": 0.3016777533089654, + "grad_norm": 0.4352872967720032, + "learning_rate": 8.231021112874402e-05, + "loss": 1.7418, + "step": 1661 + }, + { + "epoch": 0.301859377483143, + "grad_norm": 0.4212697744369507, + "learning_rate": 8.228763815938014e-05, + "loss": 1.7753, + "step": 1662 + }, + { + "epoch": 0.30204100165732056, + "grad_norm": 0.409452348947525, + "learning_rate": 8.226505389674197e-05, + "loss": 1.6342, + "step": 1663 + }, + { + "epoch": 0.30222262583149817, + "grad_norm": 0.9205015897750854, + "learning_rate": 8.224245834872879e-05, + "loss": 1.772, + "step": 1664 + }, + { + "epoch": 0.3024042500056758, + "grad_norm": 0.4376778304576874, + "learning_rate": 8.221985152324385e-05, + "loss": 1.7322, + "step": 1665 + }, + { + "epoch": 0.30258587417985333, + "grad_norm": 0.5797874927520752, + "learning_rate": 8.21972334281944e-05, + "loss": 1.7802, + "step": 1666 + }, + { + "epoch": 0.30276749835403094, + "grad_norm": 0.41904348134994507, + "learning_rate": 8.217460407149156e-05, + "loss": 1.9213, + "step": 1667 + }, + { + "epoch": 0.3029491225282085, + "grad_norm": 0.425210565328598, + "learning_rate": 8.215196346105044e-05, + "loss": 1.7837, + "step": 1668 + }, + { + "epoch": 0.3031307467023861, + "grad_norm": 0.5702981352806091, + "learning_rate": 8.212931160479003e-05, + "loss": 1.763, + "step": 1669 + }, + { + "epoch": 0.30331237087656365, + "grad_norm": 0.5334601402282715, + "learning_rate": 8.210664851063333e-05, + "loss": 1.8934, + "step": 1670 + }, + { + "epoch": 0.30349399505074126, + "grad_norm": 0.4593403935432434, + "learning_rate": 8.20839741865072e-05, + "loss": 1.62, + "step": 1671 + }, + { + "epoch": 0.3036756192249188, + "grad_norm": 0.33287620544433594, + "learning_rate": 8.206128864034246e-05, + "loss": 1.5136, + "step": 1672 + }, + { + "epoch": 0.3038572433990964, + "grad_norm": 0.3537622392177582, + "learning_rate": 8.203859188007387e-05, + "loss": 1.7746, + "step": 1673 + }, + { + "epoch": 0.30403886757327403, + "grad_norm": 0.3839857578277588, + "learning_rate": 8.20158839136401e-05, + "loss": 1.7692, + "step": 1674 + }, + { + "epoch": 0.3042204917474516, + "grad_norm": 0.41766229271888733, + "learning_rate": 8.19931647489837e-05, + "loss": 1.7819, + "step": 1675 + }, + { + "epoch": 0.3044021159216292, + "grad_norm": 0.46632665395736694, + "learning_rate": 8.19704343940512e-05, + "loss": 1.6304, + "step": 1676 + }, + { + "epoch": 0.30458374009580674, + "grad_norm": 0.4375346899032593, + "learning_rate": 8.1947692856793e-05, + "loss": 1.8528, + "step": 1677 + }, + { + "epoch": 0.30476536426998435, + "grad_norm": 0.3301321566104889, + "learning_rate": 8.192494014516344e-05, + "loss": 1.7325, + "step": 1678 + }, + { + "epoch": 0.3049469884441619, + "grad_norm": 0.26222899556159973, + "learning_rate": 8.190217626712076e-05, + "loss": 1.6133, + "step": 1679 + }, + { + "epoch": 0.3051286126183395, + "grad_norm": 0.4180654287338257, + "learning_rate": 8.187940123062707e-05, + "loss": 1.9258, + "step": 1680 + }, + { + "epoch": 0.30531023679251706, + "grad_norm": 0.38665586709976196, + "learning_rate": 8.185661504364844e-05, + "loss": 1.9082, + "step": 1681 + }, + { + "epoch": 0.3054918609666947, + "grad_norm": 0.3522675931453705, + "learning_rate": 8.18338177141548e-05, + "loss": 1.6109, + "step": 1682 + }, + { + "epoch": 0.3056734851408722, + "grad_norm": 0.5746785998344421, + "learning_rate": 8.181100925012002e-05, + "loss": 1.8288, + "step": 1683 + }, + { + "epoch": 0.30585510931504983, + "grad_norm": 0.3774076998233795, + "learning_rate": 8.178818965952178e-05, + "loss": 1.8255, + "step": 1684 + }, + { + "epoch": 0.30603673348922744, + "grad_norm": 0.5806548595428467, + "learning_rate": 8.176535895034177e-05, + "loss": 1.7035, + "step": 1685 + }, + { + "epoch": 0.306218357663405, + "grad_norm": 0.31942322850227356, + "learning_rate": 8.174251713056547e-05, + "loss": 1.6803, + "step": 1686 + }, + { + "epoch": 0.3063999818375826, + "grad_norm": 0.5038524866104126, + "learning_rate": 8.171966420818228e-05, + "loss": 1.7035, + "step": 1687 + }, + { + "epoch": 0.30658160601176015, + "grad_norm": 0.37012773752212524, + "learning_rate": 8.16968001911855e-05, + "loss": 1.6698, + "step": 1688 + }, + { + "epoch": 0.30676323018593776, + "grad_norm": 1.2442718744277954, + "learning_rate": 8.16739250875723e-05, + "loss": 1.9469, + "step": 1689 + }, + { + "epoch": 0.3069448543601153, + "grad_norm": 0.6164734959602356, + "learning_rate": 8.165103890534372e-05, + "loss": 1.716, + "step": 1690 + }, + { + "epoch": 0.3071264785342929, + "grad_norm": 0.40922996401786804, + "learning_rate": 8.162814165250464e-05, + "loss": 1.8667, + "step": 1691 + }, + { + "epoch": 0.3073081027084705, + "grad_norm": 0.48859402537345886, + "learning_rate": 8.160523333706392e-05, + "loss": 1.7718, + "step": 1692 + }, + { + "epoch": 0.3074897268826481, + "grad_norm": 0.4092088043689728, + "learning_rate": 8.158231396703418e-05, + "loss": 1.7186, + "step": 1693 + }, + { + "epoch": 0.30767135105682564, + "grad_norm": 0.37130144238471985, + "learning_rate": 8.155938355043194e-05, + "loss": 1.6843, + "step": 1694 + }, + { + "epoch": 0.30785297523100325, + "grad_norm": 0.9312444925308228, + "learning_rate": 8.153644209527762e-05, + "loss": 1.8256, + "step": 1695 + }, + { + "epoch": 0.30803459940518085, + "grad_norm": 0.5170516967773438, + "learning_rate": 8.151348960959546e-05, + "loss": 1.7719, + "step": 1696 + }, + { + "epoch": 0.3082162235793584, + "grad_norm": 1.3815181255340576, + "learning_rate": 8.149052610141357e-05, + "loss": 1.9556, + "step": 1697 + }, + { + "epoch": 0.308397847753536, + "grad_norm": 0.3735514283180237, + "learning_rate": 8.146755157876392e-05, + "loss": 1.6568, + "step": 1698 + }, + { + "epoch": 0.30857947192771357, + "grad_norm": 0.4914175271987915, + "learning_rate": 8.14445660496823e-05, + "loss": 1.8183, + "step": 1699 + }, + { + "epoch": 0.3087610961018912, + "grad_norm": 0.37792912125587463, + "learning_rate": 8.142156952220841e-05, + "loss": 1.7114, + "step": 1700 + }, + { + "epoch": 0.3089427202760687, + "grad_norm": 0.38091933727264404, + "learning_rate": 8.139856200438575e-05, + "loss": 1.8503, + "step": 1701 + }, + { + "epoch": 0.30912434445024634, + "grad_norm": 0.4470556378364563, + "learning_rate": 8.137554350426167e-05, + "loss": 1.8102, + "step": 1702 + }, + { + "epoch": 0.3093059686244239, + "grad_norm": 0.5216695666313171, + "learning_rate": 8.135251402988741e-05, + "loss": 1.8336, + "step": 1703 + }, + { + "epoch": 0.3094875927986015, + "grad_norm": 0.350356787443161, + "learning_rate": 8.132947358931797e-05, + "loss": 1.7349, + "step": 1704 + }, + { + "epoch": 0.3096692169727791, + "grad_norm": 0.4206382930278778, + "learning_rate": 8.130642219061225e-05, + "loss": 1.8545, + "step": 1705 + }, + { + "epoch": 0.30985084114695666, + "grad_norm": 0.40645888447761536, + "learning_rate": 8.128335984183291e-05, + "loss": 1.7681, + "step": 1706 + }, + { + "epoch": 0.31003246532113427, + "grad_norm": 0.4699512720108032, + "learning_rate": 8.126028655104654e-05, + "loss": 1.9764, + "step": 1707 + }, + { + "epoch": 0.3102140894953118, + "grad_norm": 0.35773059725761414, + "learning_rate": 8.123720232632348e-05, + "loss": 1.7353, + "step": 1708 + }, + { + "epoch": 0.3103957136694894, + "grad_norm": 0.43512722849845886, + "learning_rate": 8.121410717573794e-05, + "loss": 1.847, + "step": 1709 + }, + { + "epoch": 0.310577337843667, + "grad_norm": 0.3780210614204407, + "learning_rate": 8.119100110736789e-05, + "loss": 1.7712, + "step": 1710 + }, + { + "epoch": 0.3107589620178446, + "grad_norm": 0.4100992679595947, + "learning_rate": 8.11678841292952e-05, + "loss": 1.5897, + "step": 1711 + }, + { + "epoch": 0.31094058619202214, + "grad_norm": 0.3409011960029602, + "learning_rate": 8.11447562496055e-05, + "loss": 1.785, + "step": 1712 + }, + { + "epoch": 0.31112221036619975, + "grad_norm": 0.4051324129104614, + "learning_rate": 8.112161747638823e-05, + "loss": 1.744, + "step": 1713 + }, + { + "epoch": 0.3113038345403773, + "grad_norm": 0.4095343053340912, + "learning_rate": 8.109846781773667e-05, + "loss": 1.8379, + "step": 1714 + }, + { + "epoch": 0.3114854587145549, + "grad_norm": 0.748431921005249, + "learning_rate": 8.10753072817479e-05, + "loss": 1.7764, + "step": 1715 + }, + { + "epoch": 0.3116670828887325, + "grad_norm": 0.417363703250885, + "learning_rate": 8.105213587652281e-05, + "loss": 1.8997, + "step": 1716 + }, + { + "epoch": 0.31184870706291007, + "grad_norm": 0.4905109405517578, + "learning_rate": 8.102895361016606e-05, + "loss": 1.9995, + "step": 1717 + }, + { + "epoch": 0.3120303312370877, + "grad_norm": 0.600260317325592, + "learning_rate": 8.100576049078616e-05, + "loss": 1.8361, + "step": 1718 + }, + { + "epoch": 0.31221195541126523, + "grad_norm": 0.471113920211792, + "learning_rate": 8.098255652649536e-05, + "loss": 1.601, + "step": 1719 + }, + { + "epoch": 0.31239357958544284, + "grad_norm": 0.40983596444129944, + "learning_rate": 8.095934172540974e-05, + "loss": 1.8926, + "step": 1720 + }, + { + "epoch": 0.3125752037596204, + "grad_norm": 0.7379222512245178, + "learning_rate": 8.093611609564913e-05, + "loss": 1.8733, + "step": 1721 + }, + { + "epoch": 0.312756827933798, + "grad_norm": 0.32716625928878784, + "learning_rate": 8.091287964533724e-05, + "loss": 1.6517, + "step": 1722 + }, + { + "epoch": 0.31293845210797555, + "grad_norm": 0.3813991844654083, + "learning_rate": 8.088963238260146e-05, + "loss": 1.7592, + "step": 1723 + }, + { + "epoch": 0.31312007628215316, + "grad_norm": 0.37484660744667053, + "learning_rate": 8.086637431557301e-05, + "loss": 1.7167, + "step": 1724 + }, + { + "epoch": 0.3133017004563307, + "grad_norm": 0.8108272552490234, + "learning_rate": 8.084310545238689e-05, + "loss": 1.8774, + "step": 1725 + }, + { + "epoch": 0.3134833246305083, + "grad_norm": 0.3170906603336334, + "learning_rate": 8.081982580118188e-05, + "loss": 1.7006, + "step": 1726 + }, + { + "epoch": 0.31366494880468593, + "grad_norm": 0.31348371505737305, + "learning_rate": 8.07965353701005e-05, + "loss": 1.6296, + "step": 1727 + }, + { + "epoch": 0.3138465729788635, + "grad_norm": 0.523552656173706, + "learning_rate": 8.077323416728908e-05, + "loss": 2.048, + "step": 1728 + }, + { + "epoch": 0.3140281971530411, + "grad_norm": 0.41044628620147705, + "learning_rate": 8.074992220089769e-05, + "loss": 1.9065, + "step": 1729 + }, + { + "epoch": 0.31420982132721864, + "grad_norm": 0.7035326957702637, + "learning_rate": 8.072659947908017e-05, + "loss": 1.6843, + "step": 1730 + }, + { + "epoch": 0.31439144550139625, + "grad_norm": 0.3855143189430237, + "learning_rate": 8.070326600999416e-05, + "loss": 1.8051, + "step": 1731 + }, + { + "epoch": 0.3145730696755738, + "grad_norm": 0.3537168502807617, + "learning_rate": 8.067992180180099e-05, + "loss": 1.9487, + "step": 1732 + }, + { + "epoch": 0.3147546938497514, + "grad_norm": 0.36071687936782837, + "learning_rate": 8.065656686266582e-05, + "loss": 1.69, + "step": 1733 + }, + { + "epoch": 0.31493631802392896, + "grad_norm": 0.36559250950813293, + "learning_rate": 8.06332012007575e-05, + "loss": 1.7844, + "step": 1734 + }, + { + "epoch": 0.31511794219810657, + "grad_norm": 0.5247604250907898, + "learning_rate": 8.060982482424866e-05, + "loss": 1.8759, + "step": 1735 + }, + { + "epoch": 0.3152995663722841, + "grad_norm": 0.402179479598999, + "learning_rate": 8.058643774131569e-05, + "loss": 1.8046, + "step": 1736 + }, + { + "epoch": 0.31548119054646173, + "grad_norm": 0.4598861038684845, + "learning_rate": 8.056303996013868e-05, + "loss": 1.8359, + "step": 1737 + }, + { + "epoch": 0.31566281472063934, + "grad_norm": 0.34515225887298584, + "learning_rate": 8.053963148890152e-05, + "loss": 1.8715, + "step": 1738 + }, + { + "epoch": 0.3158444388948169, + "grad_norm": 0.39699092507362366, + "learning_rate": 8.051621233579181e-05, + "loss": 1.9022, + "step": 1739 + }, + { + "epoch": 0.3160260630689945, + "grad_norm": 0.8651358485221863, + "learning_rate": 8.049278250900085e-05, + "loss": 1.9944, + "step": 1740 + }, + { + "epoch": 0.31620768724317205, + "grad_norm": 0.6047308444976807, + "learning_rate": 8.046934201672376e-05, + "loss": 1.5811, + "step": 1741 + }, + { + "epoch": 0.31638931141734966, + "grad_norm": 0.2858513593673706, + "learning_rate": 8.044589086715932e-05, + "loss": 1.8627, + "step": 1742 + }, + { + "epoch": 0.3165709355915272, + "grad_norm": 0.3781892657279968, + "learning_rate": 8.042242906851005e-05, + "loss": 1.7084, + "step": 1743 + }, + { + "epoch": 0.3167525597657048, + "grad_norm": 0.35374823212623596, + "learning_rate": 8.03989566289822e-05, + "loss": 1.7796, + "step": 1744 + }, + { + "epoch": 0.3169341839398824, + "grad_norm": 0.4294878840446472, + "learning_rate": 8.037547355678577e-05, + "loss": 1.717, + "step": 1745 + }, + { + "epoch": 0.31711580811406, + "grad_norm": 0.41374754905700684, + "learning_rate": 8.035197986013444e-05, + "loss": 1.655, + "step": 1746 + }, + { + "epoch": 0.3172974322882376, + "grad_norm": 0.3856959342956543, + "learning_rate": 8.032847554724562e-05, + "loss": 1.7267, + "step": 1747 + }, + { + "epoch": 0.31747905646241514, + "grad_norm": 0.3866060674190521, + "learning_rate": 8.030496062634042e-05, + "loss": 1.7225, + "step": 1748 + }, + { + "epoch": 0.31766068063659275, + "grad_norm": 0.3292600214481354, + "learning_rate": 8.02814351056437e-05, + "loss": 1.7564, + "step": 1749 + }, + { + "epoch": 0.3178423048107703, + "grad_norm": 0.4129892885684967, + "learning_rate": 8.025789899338397e-05, + "loss": 1.863, + "step": 1750 + }, + { + "epoch": 0.3180239289849479, + "grad_norm": 0.5928601622581482, + "learning_rate": 8.023435229779351e-05, + "loss": 1.8089, + "step": 1751 + }, + { + "epoch": 0.31820555315912546, + "grad_norm": 0.36589503288269043, + "learning_rate": 8.021079502710823e-05, + "loss": 1.7587, + "step": 1752 + }, + { + "epoch": 0.3183871773333031, + "grad_norm": 0.39332228899002075, + "learning_rate": 8.01872271895678e-05, + "loss": 1.8534, + "step": 1753 + }, + { + "epoch": 0.3185688015074806, + "grad_norm": 0.549704372882843, + "learning_rate": 8.016364879341557e-05, + "loss": 1.8221, + "step": 1754 + }, + { + "epoch": 0.31875042568165823, + "grad_norm": 0.40019214153289795, + "learning_rate": 8.014005984689856e-05, + "loss": 1.9095, + "step": 1755 + }, + { + "epoch": 0.3189320498558358, + "grad_norm": 0.3618466854095459, + "learning_rate": 8.01164603582675e-05, + "loss": 1.853, + "step": 1756 + }, + { + "epoch": 0.3191136740300134, + "grad_norm": 0.3509480059146881, + "learning_rate": 8.00928503357768e-05, + "loss": 1.8509, + "step": 1757 + }, + { + "epoch": 0.319295298204191, + "grad_norm": 0.3961155116558075, + "learning_rate": 8.006922978768456e-05, + "loss": 1.7335, + "step": 1758 + }, + { + "epoch": 0.31947692237836856, + "grad_norm": 0.322031170129776, + "learning_rate": 8.004559872225257e-05, + "loss": 1.6768, + "step": 1759 + }, + { + "epoch": 0.31965854655254616, + "grad_norm": 0.49394679069519043, + "learning_rate": 8.002195714774626e-05, + "loss": 1.8834, + "step": 1760 + }, + { + "epoch": 0.3198401707267237, + "grad_norm": 0.3530486822128296, + "learning_rate": 7.999830507243478e-05, + "loss": 1.8086, + "step": 1761 + }, + { + "epoch": 0.3200217949009013, + "grad_norm": 1.1861177682876587, + "learning_rate": 7.997464250459096e-05, + "loss": 1.8727, + "step": 1762 + }, + { + "epoch": 0.3202034190750789, + "grad_norm": 0.3873712122440338, + "learning_rate": 7.995096945249123e-05, + "loss": 1.881, + "step": 1763 + }, + { + "epoch": 0.3203850432492565, + "grad_norm": 0.7059659361839294, + "learning_rate": 7.992728592441576e-05, + "loss": 1.7464, + "step": 1764 + }, + { + "epoch": 0.32056666742343404, + "grad_norm": 0.4527912437915802, + "learning_rate": 7.990359192864836e-05, + "loss": 1.6934, + "step": 1765 + }, + { + "epoch": 0.32074829159761165, + "grad_norm": 0.8842293620109558, + "learning_rate": 7.987988747347647e-05, + "loss": 1.8314, + "step": 1766 + }, + { + "epoch": 0.3209299157717892, + "grad_norm": 0.4798884391784668, + "learning_rate": 7.985617256719127e-05, + "loss": 1.8109, + "step": 1767 + }, + { + "epoch": 0.3211115399459668, + "grad_norm": 0.7886636853218079, + "learning_rate": 7.983244721808749e-05, + "loss": 1.8171, + "step": 1768 + }, + { + "epoch": 0.3212931641201444, + "grad_norm": 0.347822368144989, + "learning_rate": 7.98087114344636e-05, + "loss": 1.5598, + "step": 1769 + }, + { + "epoch": 0.32147478829432197, + "grad_norm": 0.4227261543273926, + "learning_rate": 7.978496522462167e-05, + "loss": 1.8852, + "step": 1770 + }, + { + "epoch": 0.3216564124684996, + "grad_norm": 0.43332213163375854, + "learning_rate": 7.976120859686744e-05, + "loss": 1.8121, + "step": 1771 + }, + { + "epoch": 0.3218380366426771, + "grad_norm": 0.44609448313713074, + "learning_rate": 7.973744155951027e-05, + "loss": 1.8579, + "step": 1772 + }, + { + "epoch": 0.32201966081685474, + "grad_norm": 0.43425247073173523, + "learning_rate": 7.971366412086319e-05, + "loss": 1.6873, + "step": 1773 + }, + { + "epoch": 0.3222012849910323, + "grad_norm": 0.8733965158462524, + "learning_rate": 7.968987628924284e-05, + "loss": 1.8873, + "step": 1774 + }, + { + "epoch": 0.3223829091652099, + "grad_norm": 0.5272225141525269, + "learning_rate": 7.966607807296954e-05, + "loss": 1.869, + "step": 1775 + }, + { + "epoch": 0.32256453333938745, + "grad_norm": 0.48014530539512634, + "learning_rate": 7.964226948036717e-05, + "loss": 1.8084, + "step": 1776 + }, + { + "epoch": 0.32274615751356506, + "grad_norm": 0.49583685398101807, + "learning_rate": 7.961845051976334e-05, + "loss": 1.8808, + "step": 1777 + }, + { + "epoch": 0.32292778168774267, + "grad_norm": 1.2715961933135986, + "learning_rate": 7.959462119948914e-05, + "loss": 1.8361, + "step": 1778 + }, + { + "epoch": 0.3231094058619202, + "grad_norm": 0.3721200227737427, + "learning_rate": 7.957078152787947e-05, + "loss": 1.6525, + "step": 1779 + }, + { + "epoch": 0.3232910300360978, + "grad_norm": 0.3394365906715393, + "learning_rate": 7.954693151327269e-05, + "loss": 1.6239, + "step": 1780 + }, + { + "epoch": 0.3234726542102754, + "grad_norm": 0.791668176651001, + "learning_rate": 7.952307116401086e-05, + "loss": 1.8165, + "step": 1781 + }, + { + "epoch": 0.323654278384453, + "grad_norm": 1.4395864009857178, + "learning_rate": 7.949920048843962e-05, + "loss": 1.7569, + "step": 1782 + }, + { + "epoch": 0.32383590255863054, + "grad_norm": 0.917224645614624, + "learning_rate": 7.947531949490825e-05, + "loss": 1.8861, + "step": 1783 + }, + { + "epoch": 0.32401752673280815, + "grad_norm": 0.7737044095993042, + "learning_rate": 7.945142819176963e-05, + "loss": 1.8819, + "step": 1784 + }, + { + "epoch": 0.3241991509069857, + "grad_norm": 0.603383481502533, + "learning_rate": 7.942752658738022e-05, + "loss": 1.6953, + "step": 1785 + }, + { + "epoch": 0.3243807750811633, + "grad_norm": 0.8548228144645691, + "learning_rate": 7.940361469010012e-05, + "loss": 1.8128, + "step": 1786 + }, + { + "epoch": 0.32456239925534086, + "grad_norm": 0.33729177713394165, + "learning_rate": 7.9379692508293e-05, + "loss": 1.699, + "step": 1787 + }, + { + "epoch": 0.32474402342951847, + "grad_norm": 0.35402169823646545, + "learning_rate": 7.935576005032617e-05, + "loss": 1.7214, + "step": 1788 + }, + { + "epoch": 0.3249256476036961, + "grad_norm": 0.3761714994907379, + "learning_rate": 7.933181732457047e-05, + "loss": 1.7538, + "step": 1789 + }, + { + "epoch": 0.32510727177787363, + "grad_norm": 0.3775179386138916, + "learning_rate": 7.93078643394004e-05, + "loss": 1.6169, + "step": 1790 + }, + { + "epoch": 0.32528889595205124, + "grad_norm": 0.5310118794441223, + "learning_rate": 7.9283901103194e-05, + "loss": 1.7871, + "step": 1791 + }, + { + "epoch": 0.3254705201262288, + "grad_norm": 0.4098902940750122, + "learning_rate": 7.925992762433292e-05, + "loss": 1.7525, + "step": 1792 + }, + { + "epoch": 0.3256521443004064, + "grad_norm": 0.2991105020046234, + "learning_rate": 7.923594391120236e-05, + "loss": 1.6855, + "step": 1793 + }, + { + "epoch": 0.32583376847458395, + "grad_norm": 0.8670941591262817, + "learning_rate": 7.921194997219115e-05, + "loss": 1.9704, + "step": 1794 + }, + { + "epoch": 0.32601539264876156, + "grad_norm": 0.4419000744819641, + "learning_rate": 7.918794581569166e-05, + "loss": 1.7738, + "step": 1795 + }, + { + "epoch": 0.3261970168229391, + "grad_norm": 0.901620626449585, + "learning_rate": 7.916393145009983e-05, + "loss": 1.7736, + "step": 1796 + }, + { + "epoch": 0.3263786409971167, + "grad_norm": 0.4655294120311737, + "learning_rate": 7.913990688381522e-05, + "loss": 1.6957, + "step": 1797 + }, + { + "epoch": 0.3265602651712943, + "grad_norm": 0.6715974807739258, + "learning_rate": 7.91158721252409e-05, + "loss": 1.9584, + "step": 1798 + }, + { + "epoch": 0.3267418893454719, + "grad_norm": 0.38885679841041565, + "learning_rate": 7.909182718278352e-05, + "loss": 1.6076, + "step": 1799 + }, + { + "epoch": 0.3269235135196495, + "grad_norm": 0.44869279861450195, + "learning_rate": 7.90677720648533e-05, + "loss": 1.8233, + "step": 1800 + }, + { + "epoch": 0.32710513769382704, + "grad_norm": 0.48726481199264526, + "learning_rate": 7.904370677986404e-05, + "loss": 2.1069, + "step": 1801 + }, + { + "epoch": 0.32728676186800465, + "grad_norm": 0.39313414692878723, + "learning_rate": 7.901963133623307e-05, + "loss": 1.9382, + "step": 1802 + }, + { + "epoch": 0.3274683860421822, + "grad_norm": 0.536598801612854, + "learning_rate": 7.899554574238126e-05, + "loss": 1.7393, + "step": 1803 + }, + { + "epoch": 0.3276500102163598, + "grad_norm": 0.32724764943122864, + "learning_rate": 7.897145000673306e-05, + "loss": 1.5862, + "step": 1804 + }, + { + "epoch": 0.32783163439053736, + "grad_norm": 0.5881091952323914, + "learning_rate": 7.894734413771647e-05, + "loss": 1.8216, + "step": 1805 + }, + { + "epoch": 0.32801325856471497, + "grad_norm": 0.34817707538604736, + "learning_rate": 7.892322814376299e-05, + "loss": 1.7216, + "step": 1806 + }, + { + "epoch": 0.3281948827388925, + "grad_norm": 0.4413134455680847, + "learning_rate": 7.88991020333077e-05, + "loss": 1.8164, + "step": 1807 + }, + { + "epoch": 0.32837650691307013, + "grad_norm": 0.418634295463562, + "learning_rate": 7.887496581478923e-05, + "loss": 1.7289, + "step": 1808 + }, + { + "epoch": 0.32855813108724774, + "grad_norm": 0.4966290295124054, + "learning_rate": 7.88508194966497e-05, + "loss": 2.0008, + "step": 1809 + }, + { + "epoch": 0.3287397552614253, + "grad_norm": 0.43934524059295654, + "learning_rate": 7.882666308733482e-05, + "loss": 1.7941, + "step": 1810 + }, + { + "epoch": 0.3289213794356029, + "grad_norm": 0.4956071674823761, + "learning_rate": 7.880249659529376e-05, + "loss": 1.9321, + "step": 1811 + }, + { + "epoch": 0.32910300360978045, + "grad_norm": 0.3527243137359619, + "learning_rate": 7.87783200289793e-05, + "loss": 1.6903, + "step": 1812 + }, + { + "epoch": 0.32928462778395806, + "grad_norm": 0.3939318060874939, + "learning_rate": 7.875413339684763e-05, + "loss": 1.4228, + "step": 1813 + }, + { + "epoch": 0.3294662519581356, + "grad_norm": 0.4396166503429413, + "learning_rate": 7.872993670735858e-05, + "loss": 1.6713, + "step": 1814 + }, + { + "epoch": 0.3296478761323132, + "grad_norm": 1.497362732887268, + "learning_rate": 7.870572996897546e-05, + "loss": 1.7012, + "step": 1815 + }, + { + "epoch": 0.3298295003064908, + "grad_norm": 0.44147589802742004, + "learning_rate": 7.868151319016503e-05, + "loss": 1.6867, + "step": 1816 + }, + { + "epoch": 0.3300111244806684, + "grad_norm": 0.3562192916870117, + "learning_rate": 7.865728637939764e-05, + "loss": 1.8312, + "step": 1817 + }, + { + "epoch": 0.33019274865484594, + "grad_norm": 0.466450572013855, + "learning_rate": 7.863304954514714e-05, + "loss": 1.7412, + "step": 1818 + }, + { + "epoch": 0.33037437282902354, + "grad_norm": 0.303743451833725, + "learning_rate": 7.860880269589082e-05, + "loss": 1.6358, + "step": 1819 + }, + { + "epoch": 0.33055599700320115, + "grad_norm": 0.37766799330711365, + "learning_rate": 7.858454584010957e-05, + "loss": 1.7729, + "step": 1820 + }, + { + "epoch": 0.3307376211773787, + "grad_norm": 0.9616601467132568, + "learning_rate": 7.85602789862877e-05, + "loss": 1.7582, + "step": 1821 + }, + { + "epoch": 0.3309192453515563, + "grad_norm": 0.6607717871665955, + "learning_rate": 7.853600214291307e-05, + "loss": 1.8693, + "step": 1822 + }, + { + "epoch": 0.33110086952573387, + "grad_norm": 0.3538808524608612, + "learning_rate": 7.8511715318477e-05, + "loss": 1.6771, + "step": 1823 + }, + { + "epoch": 0.3312824936999115, + "grad_norm": 0.3912052810192108, + "learning_rate": 7.84874185214743e-05, + "loss": 1.9287, + "step": 1824 + }, + { + "epoch": 0.331464117874089, + "grad_norm": 0.35144710540771484, + "learning_rate": 7.846311176040331e-05, + "loss": 1.5881, + "step": 1825 + }, + { + "epoch": 0.33164574204826663, + "grad_norm": 0.33676397800445557, + "learning_rate": 7.843879504376579e-05, + "loss": 1.6456, + "step": 1826 + }, + { + "epoch": 0.3318273662224442, + "grad_norm": 0.352111279964447, + "learning_rate": 7.841446838006706e-05, + "loss": 1.8421, + "step": 1827 + }, + { + "epoch": 0.3320089903966218, + "grad_norm": 0.38674047589302063, + "learning_rate": 7.839013177781585e-05, + "loss": 1.6937, + "step": 1828 + }, + { + "epoch": 0.33219061457079935, + "grad_norm": 0.37708884477615356, + "learning_rate": 7.836578524552439e-05, + "loss": 1.8594, + "step": 1829 + }, + { + "epoch": 0.33237223874497696, + "grad_norm": 0.6731727123260498, + "learning_rate": 7.834142879170841e-05, + "loss": 1.8878, + "step": 1830 + }, + { + "epoch": 0.33255386291915456, + "grad_norm": 0.3866626024246216, + "learning_rate": 7.831706242488708e-05, + "loss": 1.8623, + "step": 1831 + }, + { + "epoch": 0.3327354870933321, + "grad_norm": 1.4541821479797363, + "learning_rate": 7.829268615358302e-05, + "loss": 1.8578, + "step": 1832 + }, + { + "epoch": 0.3329171112675097, + "grad_norm": 0.347703754901886, + "learning_rate": 7.826829998632237e-05, + "loss": 1.6997, + "step": 1833 + }, + { + "epoch": 0.3330987354416873, + "grad_norm": 0.4498690962791443, + "learning_rate": 7.824390393163469e-05, + "loss": 1.7049, + "step": 1834 + }, + { + "epoch": 0.3332803596158649, + "grad_norm": 0.5252015590667725, + "learning_rate": 7.821949799805301e-05, + "loss": 1.9958, + "step": 1835 + }, + { + "epoch": 0.33346198379004244, + "grad_norm": 0.390277236700058, + "learning_rate": 7.81950821941138e-05, + "loss": 1.7309, + "step": 1836 + }, + { + "epoch": 0.33364360796422005, + "grad_norm": 0.44444331526756287, + "learning_rate": 7.8170656528357e-05, + "loss": 1.8048, + "step": 1837 + }, + { + "epoch": 0.3338252321383976, + "grad_norm": 0.29955506324768066, + "learning_rate": 7.814622100932603e-05, + "loss": 1.9011, + "step": 1838 + }, + { + "epoch": 0.3340068563125752, + "grad_norm": 0.4231862425804138, + "learning_rate": 7.812177564556766e-05, + "loss": 1.8913, + "step": 1839 + }, + { + "epoch": 0.33418848048675276, + "grad_norm": 0.34459352493286133, + "learning_rate": 7.809732044563222e-05, + "loss": 1.736, + "step": 1840 + }, + { + "epoch": 0.33437010466093037, + "grad_norm": 0.7202975749969482, + "learning_rate": 7.80728554180734e-05, + "loss": 1.891, + "step": 1841 + }, + { + "epoch": 0.334551728835108, + "grad_norm": 0.4019789397716522, + "learning_rate": 7.804838057144839e-05, + "loss": 1.7882, + "step": 1842 + }, + { + "epoch": 0.33473335300928553, + "grad_norm": 0.2965039014816284, + "learning_rate": 7.802389591431772e-05, + "loss": 1.8223, + "step": 1843 + }, + { + "epoch": 0.33491497718346314, + "grad_norm": 0.366365909576416, + "learning_rate": 7.799940145524544e-05, + "loss": 1.7326, + "step": 1844 + }, + { + "epoch": 0.3350966013576407, + "grad_norm": 0.4295107126235962, + "learning_rate": 7.797489720279899e-05, + "loss": 1.8331, + "step": 1845 + }, + { + "epoch": 0.3352782255318183, + "grad_norm": 0.3314023017883301, + "learning_rate": 7.795038316554924e-05, + "loss": 1.6441, + "step": 1846 + }, + { + "epoch": 0.33545984970599585, + "grad_norm": 0.4658968448638916, + "learning_rate": 7.792585935207051e-05, + "loss": 1.8595, + "step": 1847 + }, + { + "epoch": 0.33564147388017346, + "grad_norm": 0.7156379818916321, + "learning_rate": 7.790132577094047e-05, + "loss": 1.7157, + "step": 1848 + }, + { + "epoch": 0.335823098054351, + "grad_norm": 0.3959856331348419, + "learning_rate": 7.78767824307403e-05, + "loss": 1.5264, + "step": 1849 + }, + { + "epoch": 0.3360047222285286, + "grad_norm": 0.646386981010437, + "learning_rate": 7.785222934005451e-05, + "loss": 1.6231, + "step": 1850 + }, + { + "epoch": 0.3361863464027062, + "grad_norm": 0.3195229172706604, + "learning_rate": 7.782766650747108e-05, + "loss": 1.9045, + "step": 1851 + }, + { + "epoch": 0.3363679705768838, + "grad_norm": 0.4228200614452362, + "learning_rate": 7.780309394158136e-05, + "loss": 1.9058, + "step": 1852 + }, + { + "epoch": 0.3365495947510614, + "grad_norm": 0.539542019367218, + "learning_rate": 7.777851165098012e-05, + "loss": 1.6611, + "step": 1853 + }, + { + "epoch": 0.33673121892523894, + "grad_norm": 0.34176090359687805, + "learning_rate": 7.775391964426551e-05, + "loss": 2.0279, + "step": 1854 + }, + { + "epoch": 0.33691284309941655, + "grad_norm": 0.4318482279777527, + "learning_rate": 7.772931793003912e-05, + "loss": 1.8528, + "step": 1855 + }, + { + "epoch": 0.3370944672735941, + "grad_norm": 0.3378757834434509, + "learning_rate": 7.77047065169059e-05, + "loss": 1.6981, + "step": 1856 + }, + { + "epoch": 0.3372760914477717, + "grad_norm": 0.6022688150405884, + "learning_rate": 7.768008541347423e-05, + "loss": 1.6896, + "step": 1857 + }, + { + "epoch": 0.33745771562194926, + "grad_norm": 0.5112354755401611, + "learning_rate": 7.765545462835582e-05, + "loss": 1.5985, + "step": 1858 + }, + { + "epoch": 0.33763933979612687, + "grad_norm": 1.1433141231536865, + "learning_rate": 7.763081417016582e-05, + "loss": 1.9083, + "step": 1859 + }, + { + "epoch": 0.3378209639703044, + "grad_norm": 0.7313655018806458, + "learning_rate": 7.760616404752272e-05, + "loss": 1.7073, + "step": 1860 + }, + { + "epoch": 0.33800258814448203, + "grad_norm": 0.9527460336685181, + "learning_rate": 7.758150426904845e-05, + "loss": 1.896, + "step": 1861 + }, + { + "epoch": 0.33818421231865964, + "grad_norm": 0.5776299238204956, + "learning_rate": 7.755683484336826e-05, + "loss": 1.825, + "step": 1862 + }, + { + "epoch": 0.3383658364928372, + "grad_norm": 0.5876672863960266, + "learning_rate": 7.753215577911079e-05, + "loss": 1.7412, + "step": 1863 + }, + { + "epoch": 0.3385474606670148, + "grad_norm": 0.4385468065738678, + "learning_rate": 7.750746708490808e-05, + "loss": 1.5397, + "step": 1864 + }, + { + "epoch": 0.33872908484119235, + "grad_norm": 0.3211999833583832, + "learning_rate": 7.74827687693955e-05, + "loss": 1.8936, + "step": 1865 + }, + { + "epoch": 0.33891070901536996, + "grad_norm": 0.42754116654396057, + "learning_rate": 7.745806084121179e-05, + "loss": 1.7557, + "step": 1866 + }, + { + "epoch": 0.3390923331895475, + "grad_norm": 0.4303213059902191, + "learning_rate": 7.743334330899908e-05, + "loss": 1.7897, + "step": 1867 + }, + { + "epoch": 0.3392739573637251, + "grad_norm": 1.335537314414978, + "learning_rate": 7.740861618140283e-05, + "loss": 1.7246, + "step": 1868 + }, + { + "epoch": 0.3394555815379027, + "grad_norm": 0.38739725947380066, + "learning_rate": 7.73838794670719e-05, + "loss": 1.5712, + "step": 1869 + }, + { + "epoch": 0.3396372057120803, + "grad_norm": 0.3921976089477539, + "learning_rate": 7.735913317465841e-05, + "loss": 1.9336, + "step": 1870 + }, + { + "epoch": 0.33981882988625783, + "grad_norm": 0.40459930896759033, + "learning_rate": 7.733437731281797e-05, + "loss": 1.7644, + "step": 1871 + }, + { + "epoch": 0.34000045406043544, + "grad_norm": 0.4171179234981537, + "learning_rate": 7.730961189020937e-05, + "loss": 1.6538, + "step": 1872 + }, + { + "epoch": 0.34018207823461305, + "grad_norm": 0.35432156920433044, + "learning_rate": 7.728483691549491e-05, + "loss": 1.8252, + "step": 1873 + }, + { + "epoch": 0.3403637024087906, + "grad_norm": 0.375660240650177, + "learning_rate": 7.726005239734012e-05, + "loss": 1.8752, + "step": 1874 + }, + { + "epoch": 0.3405453265829682, + "grad_norm": 0.36790624260902405, + "learning_rate": 7.72352583444139e-05, + "loss": 1.6648, + "step": 1875 + }, + { + "epoch": 0.34072695075714576, + "grad_norm": 0.4306769073009491, + "learning_rate": 7.721045476538849e-05, + "loss": 1.8478, + "step": 1876 + }, + { + "epoch": 0.34090857493132337, + "grad_norm": 0.421220600605011, + "learning_rate": 7.718564166893947e-05, + "loss": 1.9925, + "step": 1877 + }, + { + "epoch": 0.3410901991055009, + "grad_norm": 0.4341421127319336, + "learning_rate": 7.716081906374571e-05, + "loss": 1.8907, + "step": 1878 + }, + { + "epoch": 0.34127182327967853, + "grad_norm": 0.3992176353931427, + "learning_rate": 7.713598695848946e-05, + "loss": 1.8178, + "step": 1879 + }, + { + "epoch": 0.3414534474538561, + "grad_norm": 0.6412755846977234, + "learning_rate": 7.711114536185626e-05, + "loss": 1.8364, + "step": 1880 + }, + { + "epoch": 0.3416350716280337, + "grad_norm": 0.5427390933036804, + "learning_rate": 7.708629428253497e-05, + "loss": 1.9935, + "step": 1881 + }, + { + "epoch": 0.3418166958022113, + "grad_norm": 0.33292245864868164, + "learning_rate": 7.706143372921778e-05, + "loss": 1.8934, + "step": 1882 + }, + { + "epoch": 0.34199831997638885, + "grad_norm": 0.3104175627231598, + "learning_rate": 7.703656371060017e-05, + "loss": 1.7009, + "step": 1883 + }, + { + "epoch": 0.34217994415056646, + "grad_norm": 0.377848356962204, + "learning_rate": 7.701168423538099e-05, + "loss": 1.7108, + "step": 1884 + }, + { + "epoch": 0.342361568324744, + "grad_norm": 0.35984233021736145, + "learning_rate": 7.69867953122623e-05, + "loss": 1.6556, + "step": 1885 + }, + { + "epoch": 0.3425431924989216, + "grad_norm": 0.41129767894744873, + "learning_rate": 7.696189694994955e-05, + "loss": 1.7339, + "step": 1886 + }, + { + "epoch": 0.3427248166730992, + "grad_norm": 0.41722187399864197, + "learning_rate": 7.693698915715143e-05, + "loss": 1.8666, + "step": 1887 + }, + { + "epoch": 0.3429064408472768, + "grad_norm": 0.4802427291870117, + "learning_rate": 7.691207194258004e-05, + "loss": 1.978, + "step": 1888 + }, + { + "epoch": 0.34308806502145434, + "grad_norm": 0.33957645297050476, + "learning_rate": 7.688714531495061e-05, + "loss": 1.5438, + "step": 1889 + }, + { + "epoch": 0.34326968919563194, + "grad_norm": 0.3268083930015564, + "learning_rate": 7.686220928298178e-05, + "loss": 1.6527, + "step": 1890 + }, + { + "epoch": 0.3434513133698095, + "grad_norm": 0.3714955151081085, + "learning_rate": 7.683726385539544e-05, + "loss": 1.6682, + "step": 1891 + }, + { + "epoch": 0.3436329375439871, + "grad_norm": 0.4447016716003418, + "learning_rate": 7.681230904091678e-05, + "loss": 1.7318, + "step": 1892 + }, + { + "epoch": 0.3438145617181647, + "grad_norm": 0.33092358708381653, + "learning_rate": 7.678734484827428e-05, + "loss": 1.8584, + "step": 1893 + }, + { + "epoch": 0.34399618589234227, + "grad_norm": 0.479988157749176, + "learning_rate": 7.676237128619966e-05, + "loss": 1.6374, + "step": 1894 + }, + { + "epoch": 0.3441778100665199, + "grad_norm": 0.4738563299179077, + "learning_rate": 7.673738836342794e-05, + "loss": 1.8634, + "step": 1895 + }, + { + "epoch": 0.3443594342406974, + "grad_norm": 0.5624983906745911, + "learning_rate": 7.671239608869745e-05, + "loss": 1.5274, + "step": 1896 + }, + { + "epoch": 0.34454105841487503, + "grad_norm": 1.1549746990203857, + "learning_rate": 7.668739447074975e-05, + "loss": 2.0425, + "step": 1897 + }, + { + "epoch": 0.3447226825890526, + "grad_norm": 1.3563902378082275, + "learning_rate": 7.666238351832964e-05, + "loss": 1.9436, + "step": 1898 + }, + { + "epoch": 0.3449043067632302, + "grad_norm": 0.4628377854824066, + "learning_rate": 7.663736324018526e-05, + "loss": 1.7942, + "step": 1899 + }, + { + "epoch": 0.34508593093740775, + "grad_norm": 0.42579931020736694, + "learning_rate": 7.661233364506799e-05, + "loss": 1.5908, + "step": 1900 + }, + { + "epoch": 0.34526755511158536, + "grad_norm": 0.4293278753757477, + "learning_rate": 7.658729474173241e-05, + "loss": 1.7526, + "step": 1901 + }, + { + "epoch": 0.3454491792857629, + "grad_norm": 0.39985138177871704, + "learning_rate": 7.65622465389364e-05, + "loss": 1.6762, + "step": 1902 + }, + { + "epoch": 0.3456308034599405, + "grad_norm": 0.42846256494522095, + "learning_rate": 7.653718904544111e-05, + "loss": 1.8842, + "step": 1903 + }, + { + "epoch": 0.3458124276341181, + "grad_norm": 0.2938719391822815, + "learning_rate": 7.651212227001093e-05, + "loss": 1.8558, + "step": 1904 + }, + { + "epoch": 0.3459940518082957, + "grad_norm": 0.428912490606308, + "learning_rate": 7.648704622141347e-05, + "loss": 1.7401, + "step": 1905 + }, + { + "epoch": 0.3461756759824733, + "grad_norm": 0.3805762827396393, + "learning_rate": 7.646196090841962e-05, + "loss": 1.6943, + "step": 1906 + }, + { + "epoch": 0.34635730015665084, + "grad_norm": 0.37036699056625366, + "learning_rate": 7.643686633980344e-05, + "loss": 1.7528, + "step": 1907 + }, + { + "epoch": 0.34653892433082845, + "grad_norm": 0.3755590617656708, + "learning_rate": 7.641176252434233e-05, + "loss": 1.6728, + "step": 1908 + }, + { + "epoch": 0.346720548505006, + "grad_norm": 0.6397170424461365, + "learning_rate": 7.638664947081686e-05, + "loss": 1.6994, + "step": 1909 + }, + { + "epoch": 0.3469021726791836, + "grad_norm": 0.41290947794914246, + "learning_rate": 7.636152718801084e-05, + "loss": 1.8862, + "step": 1910 + }, + { + "epoch": 0.34708379685336116, + "grad_norm": 0.410111665725708, + "learning_rate": 7.63363956847113e-05, + "loss": 1.9622, + "step": 1911 + }, + { + "epoch": 0.34726542102753877, + "grad_norm": 0.41404417157173157, + "learning_rate": 7.631125496970854e-05, + "loss": 1.7822, + "step": 1912 + }, + { + "epoch": 0.3474470452017164, + "grad_norm": 0.4176514148712158, + "learning_rate": 7.628610505179602e-05, + "loss": 1.7585, + "step": 1913 + }, + { + "epoch": 0.34762866937589393, + "grad_norm": 0.47517165541648865, + "learning_rate": 7.626094593977045e-05, + "loss": 1.7531, + "step": 1914 + }, + { + "epoch": 0.34781029355007154, + "grad_norm": 1.062737226486206, + "learning_rate": 7.623577764243175e-05, + "loss": 1.7898, + "step": 1915 + }, + { + "epoch": 0.3479919177242491, + "grad_norm": 0.3843521475791931, + "learning_rate": 7.621060016858308e-05, + "loss": 1.8139, + "step": 1916 + }, + { + "epoch": 0.3481735418984267, + "grad_norm": 0.7029268145561218, + "learning_rate": 7.618541352703076e-05, + "loss": 1.8493, + "step": 1917 + }, + { + "epoch": 0.34835516607260425, + "grad_norm": 0.42538872361183167, + "learning_rate": 7.616021772658438e-05, + "loss": 1.8772, + "step": 1918 + }, + { + "epoch": 0.34853679024678186, + "grad_norm": 0.42931264638900757, + "learning_rate": 7.613501277605665e-05, + "loss": 1.7103, + "step": 1919 + }, + { + "epoch": 0.3487184144209594, + "grad_norm": 0.5315203070640564, + "learning_rate": 7.610979868426353e-05, + "loss": 1.6226, + "step": 1920 + }, + { + "epoch": 0.348900038595137, + "grad_norm": 0.38471972942352295, + "learning_rate": 7.608457546002424e-05, + "loss": 1.55, + "step": 1921 + }, + { + "epoch": 0.34908166276931457, + "grad_norm": 0.46782997250556946, + "learning_rate": 7.605934311216105e-05, + "loss": 1.7002, + "step": 1922 + }, + { + "epoch": 0.3492632869434922, + "grad_norm": 2.0748534202575684, + "learning_rate": 7.603410164949954e-05, + "loss": 1.884, + "step": 1923 + }, + { + "epoch": 0.3494449111176698, + "grad_norm": 0.36991992592811584, + "learning_rate": 7.600885108086841e-05, + "loss": 1.551, + "step": 1924 + }, + { + "epoch": 0.34962653529184734, + "grad_norm": 0.45522916316986084, + "learning_rate": 7.598359141509961e-05, + "loss": 1.8425, + "step": 1925 + }, + { + "epoch": 0.34980815946602495, + "grad_norm": 0.377135306596756, + "learning_rate": 7.59583226610282e-05, + "loss": 1.5753, + "step": 1926 + }, + { + "epoch": 0.3499897836402025, + "grad_norm": 0.3489382565021515, + "learning_rate": 7.593304482749247e-05, + "loss": 1.6424, + "step": 1927 + }, + { + "epoch": 0.3501714078143801, + "grad_norm": 0.3456614911556244, + "learning_rate": 7.590775792333389e-05, + "loss": 1.7102, + "step": 1928 + }, + { + "epoch": 0.35035303198855766, + "grad_norm": 1.0586925745010376, + "learning_rate": 7.588246195739703e-05, + "loss": 1.9916, + "step": 1929 + }, + { + "epoch": 0.35053465616273527, + "grad_norm": 0.9637052416801453, + "learning_rate": 7.585715693852973e-05, + "loss": 1.8644, + "step": 1930 + }, + { + "epoch": 0.3507162803369128, + "grad_norm": 0.4218997359275818, + "learning_rate": 7.58318428755829e-05, + "loss": 1.8364, + "step": 1931 + }, + { + "epoch": 0.35089790451109043, + "grad_norm": 0.4560905992984772, + "learning_rate": 7.580651977741071e-05, + "loss": 1.5022, + "step": 1932 + }, + { + "epoch": 0.351079528685268, + "grad_norm": 0.38443416357040405, + "learning_rate": 7.578118765287041e-05, + "loss": 1.7193, + "step": 1933 + }, + { + "epoch": 0.3512611528594456, + "grad_norm": 0.45235398411750793, + "learning_rate": 7.575584651082245e-05, + "loss": 1.8579, + "step": 1934 + }, + { + "epoch": 0.3514427770336232, + "grad_norm": 0.36086124181747437, + "learning_rate": 7.573049636013044e-05, + "loss": 1.7047, + "step": 1935 + }, + { + "epoch": 0.35162440120780075, + "grad_norm": 0.5537468791007996, + "learning_rate": 7.570513720966108e-05, + "loss": 1.8366, + "step": 1936 + }, + { + "epoch": 0.35180602538197836, + "grad_norm": 0.3868761658668518, + "learning_rate": 7.567976906828431e-05, + "loss": 1.7944, + "step": 1937 + }, + { + "epoch": 0.3519876495561559, + "grad_norm": 0.35049140453338623, + "learning_rate": 7.565439194487314e-05, + "loss": 1.6958, + "step": 1938 + }, + { + "epoch": 0.3521692737303335, + "grad_norm": 0.39969712495803833, + "learning_rate": 7.562900584830372e-05, + "loss": 1.9422, + "step": 1939 + }, + { + "epoch": 0.3523508979045111, + "grad_norm": 0.6938232183456421, + "learning_rate": 7.560361078745542e-05, + "loss": 1.8125, + "step": 1940 + }, + { + "epoch": 0.3525325220786887, + "grad_norm": 0.38648030161857605, + "learning_rate": 7.557820677121067e-05, + "loss": 1.5998, + "step": 1941 + }, + { + "epoch": 0.35271414625286623, + "grad_norm": 0.6122918128967285, + "learning_rate": 7.555279380845504e-05, + "loss": 1.7765, + "step": 1942 + }, + { + "epoch": 0.35289577042704384, + "grad_norm": 0.3944215476512909, + "learning_rate": 7.552737190807726e-05, + "loss": 1.7234, + "step": 1943 + }, + { + "epoch": 0.3530773946012214, + "grad_norm": 0.3669028580188751, + "learning_rate": 7.550194107896915e-05, + "loss": 1.6233, + "step": 1944 + }, + { + "epoch": 0.353259018775399, + "grad_norm": 0.6138156056404114, + "learning_rate": 7.54765013300257e-05, + "loss": 1.8512, + "step": 1945 + }, + { + "epoch": 0.3534406429495766, + "grad_norm": 0.42637693881988525, + "learning_rate": 7.545105267014499e-05, + "loss": 1.8107, + "step": 1946 + }, + { + "epoch": 0.35362226712375416, + "grad_norm": 0.33955273032188416, + "learning_rate": 7.54255951082282e-05, + "loss": 1.8642, + "step": 1947 + }, + { + "epoch": 0.35380389129793177, + "grad_norm": 0.494689017534256, + "learning_rate": 7.540012865317965e-05, + "loss": 1.8786, + "step": 1948 + }, + { + "epoch": 0.3539855154721093, + "grad_norm": 0.4117850363254547, + "learning_rate": 7.537465331390676e-05, + "loss": 1.7994, + "step": 1949 + }, + { + "epoch": 0.35416713964628693, + "grad_norm": 0.4999876320362091, + "learning_rate": 7.534916909932008e-05, + "loss": 1.6498, + "step": 1950 + }, + { + "epoch": 0.3543487638204645, + "grad_norm": 0.33305442333221436, + "learning_rate": 7.532367601833321e-05, + "loss": 1.7976, + "step": 1951 + }, + { + "epoch": 0.3545303879946421, + "grad_norm": 0.5412551164627075, + "learning_rate": 7.529817407986293e-05, + "loss": 1.9498, + "step": 1952 + }, + { + "epoch": 0.35471201216881965, + "grad_norm": 0.3238583505153656, + "learning_rate": 7.527266329282905e-05, + "loss": 1.6248, + "step": 1953 + }, + { + "epoch": 0.35489363634299725, + "grad_norm": 0.9848827123641968, + "learning_rate": 7.524714366615449e-05, + "loss": 1.7676, + "step": 1954 + }, + { + "epoch": 0.35507526051717486, + "grad_norm": 1.3679877519607544, + "learning_rate": 7.522161520876527e-05, + "loss": 1.8685, + "step": 1955 + }, + { + "epoch": 0.3552568846913524, + "grad_norm": 0.48305729031562805, + "learning_rate": 7.519607792959055e-05, + "loss": 1.7262, + "step": 1956 + }, + { + "epoch": 0.35543850886553, + "grad_norm": 0.3622657358646393, + "learning_rate": 7.517053183756246e-05, + "loss": 1.7174, + "step": 1957 + }, + { + "epoch": 0.3556201330397076, + "grad_norm": 0.3504152297973633, + "learning_rate": 7.51449769416163e-05, + "loss": 1.6077, + "step": 1958 + }, + { + "epoch": 0.3558017572138852, + "grad_norm": 0.4269239902496338, + "learning_rate": 7.511941325069045e-05, + "loss": 1.6968, + "step": 1959 + }, + { + "epoch": 0.35598338138806274, + "grad_norm": 0.7403337955474854, + "learning_rate": 7.509384077372632e-05, + "loss": 2.0059, + "step": 1960 + }, + { + "epoch": 0.35616500556224034, + "grad_norm": 0.5240353941917419, + "learning_rate": 7.506825951966843e-05, + "loss": 2.0018, + "step": 1961 + }, + { + "epoch": 0.3563466297364179, + "grad_norm": 0.47867724299430847, + "learning_rate": 7.504266949746435e-05, + "loss": 1.9673, + "step": 1962 + }, + { + "epoch": 0.3565282539105955, + "grad_norm": 0.5930111408233643, + "learning_rate": 7.501707071606472e-05, + "loss": 1.8528, + "step": 1963 + }, + { + "epoch": 0.35670987808477306, + "grad_norm": 0.39674296975135803, + "learning_rate": 7.499146318442324e-05, + "loss": 1.7739, + "step": 1964 + }, + { + "epoch": 0.35689150225895067, + "grad_norm": 0.7264280915260315, + "learning_rate": 7.49658469114967e-05, + "loss": 1.7426, + "step": 1965 + }, + { + "epoch": 0.3570731264331283, + "grad_norm": 0.3622196316719055, + "learning_rate": 7.494022190624492e-05, + "loss": 1.5879, + "step": 1966 + }, + { + "epoch": 0.3572547506073058, + "grad_norm": 0.3618766665458679, + "learning_rate": 7.491458817763077e-05, + "loss": 1.6993, + "step": 1967 + }, + { + "epoch": 0.35743637478148343, + "grad_norm": 0.7590869665145874, + "learning_rate": 7.488894573462018e-05, + "loss": 1.974, + "step": 1968 + }, + { + "epoch": 0.357617998955661, + "grad_norm": 1.6753489971160889, + "learning_rate": 7.486329458618215e-05, + "loss": 2.0123, + "step": 1969 + }, + { + "epoch": 0.3577996231298386, + "grad_norm": 0.43250593543052673, + "learning_rate": 7.483763474128867e-05, + "loss": 1.8028, + "step": 1970 + }, + { + "epoch": 0.35798124730401615, + "grad_norm": 0.2996358275413513, + "learning_rate": 7.481196620891482e-05, + "loss": 1.7729, + "step": 1971 + }, + { + "epoch": 0.35816287147819376, + "grad_norm": 0.36093026399612427, + "learning_rate": 7.478628899803873e-05, + "loss": 1.7874, + "step": 1972 + }, + { + "epoch": 0.3583444956523713, + "grad_norm": 0.369123637676239, + "learning_rate": 7.476060311764149e-05, + "loss": 1.7745, + "step": 1973 + }, + { + "epoch": 0.3585261198265489, + "grad_norm": 0.43757203221321106, + "learning_rate": 7.473490857670731e-05, + "loss": 1.8134, + "step": 1974 + }, + { + "epoch": 0.35870774400072647, + "grad_norm": 0.3923088312149048, + "learning_rate": 7.470920538422339e-05, + "loss": 1.7759, + "step": 1975 + }, + { + "epoch": 0.3588893681749041, + "grad_norm": 0.4013530910015106, + "learning_rate": 7.468349354917992e-05, + "loss": 1.8408, + "step": 1976 + }, + { + "epoch": 0.3590709923490817, + "grad_norm": 0.31479984521865845, + "learning_rate": 7.465777308057021e-05, + "loss": 1.8651, + "step": 1977 + }, + { + "epoch": 0.35925261652325924, + "grad_norm": 0.5002437233924866, + "learning_rate": 7.463204398739047e-05, + "loss": 1.7891, + "step": 1978 + }, + { + "epoch": 0.35943424069743685, + "grad_norm": 0.697327196598053, + "learning_rate": 7.460630627864002e-05, + "loss": 1.9176, + "step": 1979 + }, + { + "epoch": 0.3596158648716144, + "grad_norm": 0.3923889100551605, + "learning_rate": 7.458055996332118e-05, + "loss": 1.7604, + "step": 1980 + }, + { + "epoch": 0.359797489045792, + "grad_norm": 0.32724666595458984, + "learning_rate": 7.45548050504392e-05, + "loss": 1.8127, + "step": 1981 + }, + { + "epoch": 0.35997911321996956, + "grad_norm": 0.37425556778907776, + "learning_rate": 7.452904154900244e-05, + "loss": 1.8419, + "step": 1982 + }, + { + "epoch": 0.36016073739414717, + "grad_norm": 0.6986770629882812, + "learning_rate": 7.450326946802222e-05, + "loss": 1.9143, + "step": 1983 + }, + { + "epoch": 0.3603423615683247, + "grad_norm": 0.38532862067222595, + "learning_rate": 7.447748881651286e-05, + "loss": 1.8455, + "step": 1984 + }, + { + "epoch": 0.36052398574250233, + "grad_norm": 0.3622274398803711, + "learning_rate": 7.445169960349167e-05, + "loss": 1.515, + "step": 1985 + }, + { + "epoch": 0.36070560991667994, + "grad_norm": 0.4465091824531555, + "learning_rate": 7.442590183797896e-05, + "loss": 1.785, + "step": 1986 + }, + { + "epoch": 0.3608872340908575, + "grad_norm": 0.3934672176837921, + "learning_rate": 7.440009552899808e-05, + "loss": 1.8859, + "step": 1987 + }, + { + "epoch": 0.3610688582650351, + "grad_norm": 0.4587102234363556, + "learning_rate": 7.437428068557525e-05, + "loss": 1.678, + "step": 1988 + }, + { + "epoch": 0.36125048243921265, + "grad_norm": 0.3741019070148468, + "learning_rate": 7.43484573167398e-05, + "loss": 1.8111, + "step": 1989 + }, + { + "epoch": 0.36143210661339026, + "grad_norm": 1.8630338907241821, + "learning_rate": 7.432262543152399e-05, + "loss": 2.0343, + "step": 1990 + }, + { + "epoch": 0.3616137307875678, + "grad_norm": 0.31784775853157043, + "learning_rate": 7.429678503896304e-05, + "loss": 1.9505, + "step": 1991 + }, + { + "epoch": 0.3617953549617454, + "grad_norm": 0.35365164279937744, + "learning_rate": 7.427093614809519e-05, + "loss": 1.6159, + "step": 1992 + }, + { + "epoch": 0.36197697913592297, + "grad_norm": 0.35670703649520874, + "learning_rate": 7.424507876796163e-05, + "loss": 1.7116, + "step": 1993 + }, + { + "epoch": 0.3621586033101006, + "grad_norm": 0.4452590048313141, + "learning_rate": 7.421921290760648e-05, + "loss": 1.7515, + "step": 1994 + }, + { + "epoch": 0.36234022748427813, + "grad_norm": 0.38918641209602356, + "learning_rate": 7.419333857607688e-05, + "loss": 1.9466, + "step": 1995 + }, + { + "epoch": 0.36252185165845574, + "grad_norm": 0.39567258954048157, + "learning_rate": 7.416745578242296e-05, + "loss": 1.6938, + "step": 1996 + }, + { + "epoch": 0.36270347583263335, + "grad_norm": 0.3581540584564209, + "learning_rate": 7.414156453569771e-05, + "loss": 1.6312, + "step": 1997 + }, + { + "epoch": 0.3628851000068109, + "grad_norm": 1.9052141904830933, + "learning_rate": 7.411566484495714e-05, + "loss": 1.9657, + "step": 1998 + }, + { + "epoch": 0.3630667241809885, + "grad_norm": 0.4643442928791046, + "learning_rate": 7.408975671926024e-05, + "loss": 1.7506, + "step": 1999 + }, + { + "epoch": 0.36324834835516606, + "grad_norm": 0.8145013451576233, + "learning_rate": 7.40638401676689e-05, + "loss": 1.6697, + "step": 2000 + }, + { + "epoch": 0.36342997252934367, + "grad_norm": 1.3285554647445679, + "learning_rate": 7.403791519924794e-05, + "loss": 1.963, + "step": 2001 + }, + { + "epoch": 0.3636115967035212, + "grad_norm": 0.30230116844177246, + "learning_rate": 7.401198182306521e-05, + "loss": 1.7772, + "step": 2002 + }, + { + "epoch": 0.36379322087769883, + "grad_norm": 0.3377138674259186, + "learning_rate": 7.398604004819143e-05, + "loss": 1.556, + "step": 2003 + }, + { + "epoch": 0.3639748450518764, + "grad_norm": 0.2881469428539276, + "learning_rate": 7.396008988370027e-05, + "loss": 1.6648, + "step": 2004 + }, + { + "epoch": 0.364156469226054, + "grad_norm": 0.4425044357776642, + "learning_rate": 7.393413133866834e-05, + "loss": 1.8753, + "step": 2005 + }, + { + "epoch": 0.36433809340023154, + "grad_norm": 0.4343259036540985, + "learning_rate": 7.39081644221752e-05, + "loss": 2.0007, + "step": 2006 + }, + { + "epoch": 0.36451971757440915, + "grad_norm": 0.3887958824634552, + "learning_rate": 7.38821891433033e-05, + "loss": 1.635, + "step": 2007 + }, + { + "epoch": 0.36470134174858676, + "grad_norm": 1.1045604944229126, + "learning_rate": 7.385620551113803e-05, + "loss": 1.8985, + "step": 2008 + }, + { + "epoch": 0.3648829659227643, + "grad_norm": 0.47911664843559265, + "learning_rate": 7.383021353476774e-05, + "loss": 1.877, + "step": 2009 + }, + { + "epoch": 0.3650645900969419, + "grad_norm": 0.42727452516555786, + "learning_rate": 7.380421322328363e-05, + "loss": 1.7621, + "step": 2010 + }, + { + "epoch": 0.3652462142711195, + "grad_norm": 0.3689686954021454, + "learning_rate": 7.377820458577987e-05, + "loss": 1.6575, + "step": 2011 + }, + { + "epoch": 0.3654278384452971, + "grad_norm": 0.556098461151123, + "learning_rate": 7.375218763135352e-05, + "loss": 1.7471, + "step": 2012 + }, + { + "epoch": 0.36560946261947463, + "grad_norm": 0.4018637239933014, + "learning_rate": 7.372616236910456e-05, + "loss": 1.806, + "step": 2013 + }, + { + "epoch": 0.36579108679365224, + "grad_norm": 0.36069953441619873, + "learning_rate": 7.370012880813583e-05, + "loss": 1.8676, + "step": 2014 + }, + { + "epoch": 0.3659727109678298, + "grad_norm": 0.3542514145374298, + "learning_rate": 7.367408695755318e-05, + "loss": 1.7112, + "step": 2015 + }, + { + "epoch": 0.3661543351420074, + "grad_norm": 0.42891770601272583, + "learning_rate": 7.364803682646521e-05, + "loss": 1.8409, + "step": 2016 + }, + { + "epoch": 0.366335959316185, + "grad_norm": 0.4147874712944031, + "learning_rate": 7.362197842398355e-05, + "loss": 1.9204, + "step": 2017 + }, + { + "epoch": 0.36651758349036256, + "grad_norm": 0.7206998467445374, + "learning_rate": 7.359591175922266e-05, + "loss": 1.7545, + "step": 2018 + }, + { + "epoch": 0.36669920766454017, + "grad_norm": 0.3375997841358185, + "learning_rate": 7.35698368412999e-05, + "loss": 1.754, + "step": 2019 + }, + { + "epoch": 0.3668808318387177, + "grad_norm": 0.4528868496417999, + "learning_rate": 7.354375367933549e-05, + "loss": 1.5865, + "step": 2020 + }, + { + "epoch": 0.36706245601289533, + "grad_norm": 0.5421501398086548, + "learning_rate": 7.351766228245259e-05, + "loss": 1.7637, + "step": 2021 + }, + { + "epoch": 0.3672440801870729, + "grad_norm": 0.473322331905365, + "learning_rate": 7.349156265977719e-05, + "loss": 1.8855, + "step": 2022 + }, + { + "epoch": 0.3674257043612505, + "grad_norm": 0.35664573311805725, + "learning_rate": 7.346545482043819e-05, + "loss": 1.7701, + "step": 2023 + }, + { + "epoch": 0.36760732853542805, + "grad_norm": 0.42007988691329956, + "learning_rate": 7.343933877356734e-05, + "loss": 1.7349, + "step": 2024 + }, + { + "epoch": 0.36778895270960565, + "grad_norm": 0.6213206052780151, + "learning_rate": 7.34132145282993e-05, + "loss": 1.6343, + "step": 2025 + }, + { + "epoch": 0.3679705768837832, + "grad_norm": 0.48256585001945496, + "learning_rate": 7.338708209377153e-05, + "loss": 1.7393, + "step": 2026 + }, + { + "epoch": 0.3681522010579608, + "grad_norm": 0.5694558620452881, + "learning_rate": 7.33609414791244e-05, + "loss": 1.6801, + "step": 2027 + }, + { + "epoch": 0.3683338252321384, + "grad_norm": 1.2263097763061523, + "learning_rate": 7.333479269350117e-05, + "loss": 1.6916, + "step": 2028 + }, + { + "epoch": 0.368515449406316, + "grad_norm": 0.41886547207832336, + "learning_rate": 7.330863574604787e-05, + "loss": 1.7223, + "step": 2029 + }, + { + "epoch": 0.3686970735804936, + "grad_norm": 0.3144065737724304, + "learning_rate": 7.32824706459135e-05, + "loss": 1.5931, + "step": 2030 + }, + { + "epoch": 0.36887869775467114, + "grad_norm": 0.33857300877571106, + "learning_rate": 7.325629740224979e-05, + "loss": 1.6184, + "step": 2031 + }, + { + "epoch": 0.36906032192884874, + "grad_norm": 0.4139133393764496, + "learning_rate": 7.323011602421141e-05, + "loss": 1.7073, + "step": 2032 + }, + { + "epoch": 0.3692419461030263, + "grad_norm": 0.7628881335258484, + "learning_rate": 7.320392652095585e-05, + "loss": 1.7303, + "step": 2033 + }, + { + "epoch": 0.3694235702772039, + "grad_norm": 0.43699002265930176, + "learning_rate": 7.31777289016434e-05, + "loss": 1.8834, + "step": 2034 + }, + { + "epoch": 0.36960519445138146, + "grad_norm": 0.44640082120895386, + "learning_rate": 7.315152317543724e-05, + "loss": 1.8431, + "step": 2035 + }, + { + "epoch": 0.36978681862555907, + "grad_norm": 1.3358705043792725, + "learning_rate": 7.312530935150337e-05, + "loss": 1.8735, + "step": 2036 + }, + { + "epoch": 0.3699684427997366, + "grad_norm": 1.9224785566329956, + "learning_rate": 7.309908743901065e-05, + "loss": 1.7393, + "step": 2037 + }, + { + "epoch": 0.3701500669739142, + "grad_norm": 0.4071877598762512, + "learning_rate": 7.307285744713068e-05, + "loss": 1.9296, + "step": 2038 + }, + { + "epoch": 0.37033169114809183, + "grad_norm": 0.47320154309272766, + "learning_rate": 7.3046619385038e-05, + "loss": 1.9263, + "step": 2039 + }, + { + "epoch": 0.3705133153222694, + "grad_norm": 0.3988693058490753, + "learning_rate": 7.302037326190991e-05, + "loss": 1.8671, + "step": 2040 + }, + { + "epoch": 0.370694939496447, + "grad_norm": 0.3835192322731018, + "learning_rate": 7.299411908692649e-05, + "loss": 1.9278, + "step": 2041 + }, + { + "epoch": 0.37087656367062455, + "grad_norm": 0.47421783208847046, + "learning_rate": 7.296785686927075e-05, + "loss": 1.834, + "step": 2042 + }, + { + "epoch": 0.37105818784480216, + "grad_norm": 0.4219299256801605, + "learning_rate": 7.294158661812843e-05, + "loss": 1.7258, + "step": 2043 + }, + { + "epoch": 0.3712398120189797, + "grad_norm": 0.40113237500190735, + "learning_rate": 7.291530834268805e-05, + "loss": 1.6274, + "step": 2044 + }, + { + "epoch": 0.3714214361931573, + "grad_norm": 0.31899523735046387, + "learning_rate": 7.288902205214104e-05, + "loss": 1.7521, + "step": 2045 + }, + { + "epoch": 0.37160306036733487, + "grad_norm": 0.4019249975681305, + "learning_rate": 7.286272775568159e-05, + "loss": 1.9008, + "step": 2046 + }, + { + "epoch": 0.3717846845415125, + "grad_norm": 0.42321082949638367, + "learning_rate": 7.283642546250661e-05, + "loss": 1.7964, + "step": 2047 + }, + { + "epoch": 0.3719663087156901, + "grad_norm": 0.32307761907577515, + "learning_rate": 7.281011518181592e-05, + "loss": 1.8435, + "step": 2048 + }, + { + "epoch": 0.37214793288986764, + "grad_norm": 0.41982585191726685, + "learning_rate": 7.278379692281208e-05, + "loss": 1.7478, + "step": 2049 + }, + { + "epoch": 0.37232955706404525, + "grad_norm": 0.47847700119018555, + "learning_rate": 7.275747069470046e-05, + "loss": 1.8587, + "step": 2050 + }, + { + "epoch": 0.3725111812382228, + "grad_norm": 0.30819833278656006, + "learning_rate": 7.273113650668919e-05, + "loss": 1.6159, + "step": 2051 + }, + { + "epoch": 0.3726928054124004, + "grad_norm": 0.481187641620636, + "learning_rate": 7.270479436798922e-05, + "loss": 1.81, + "step": 2052 + }, + { + "epoch": 0.37287442958657796, + "grad_norm": 0.3713380992412567, + "learning_rate": 7.267844428781425e-05, + "loss": 1.7106, + "step": 2053 + }, + { + "epoch": 0.37305605376075557, + "grad_norm": 0.3637194037437439, + "learning_rate": 7.265208627538078e-05, + "loss": 1.6774, + "step": 2054 + }, + { + "epoch": 0.3732376779349331, + "grad_norm": 0.45125430822372437, + "learning_rate": 7.262572033990806e-05, + "loss": 1.6994, + "step": 2055 + }, + { + "epoch": 0.37341930210911073, + "grad_norm": 0.3305191695690155, + "learning_rate": 7.259934649061813e-05, + "loss": 1.7881, + "step": 2056 + }, + { + "epoch": 0.3736009262832883, + "grad_norm": 0.3148484230041504, + "learning_rate": 7.257296473673578e-05, + "loss": 1.714, + "step": 2057 + }, + { + "epoch": 0.3737825504574659, + "grad_norm": 0.6208624243736267, + "learning_rate": 7.254657508748861e-05, + "loss": 1.8637, + "step": 2058 + }, + { + "epoch": 0.3739641746316435, + "grad_norm": 0.3332320749759674, + "learning_rate": 7.252017755210693e-05, + "loss": 1.6568, + "step": 2059 + }, + { + "epoch": 0.37414579880582105, + "grad_norm": 0.44153183698654175, + "learning_rate": 7.249377213982383e-05, + "loss": 1.6107, + "step": 2060 + }, + { + "epoch": 0.37432742297999866, + "grad_norm": 0.40015000104904175, + "learning_rate": 7.246735885987515e-05, + "loss": 1.5697, + "step": 2061 + }, + { + "epoch": 0.3745090471541762, + "grad_norm": 0.3796318471431732, + "learning_rate": 7.24409377214995e-05, + "loss": 1.6216, + "step": 2062 + }, + { + "epoch": 0.3746906713283538, + "grad_norm": 0.4106977581977844, + "learning_rate": 7.24145087339382e-05, + "loss": 1.8918, + "step": 2063 + }, + { + "epoch": 0.37487229550253137, + "grad_norm": 0.5582180023193359, + "learning_rate": 7.238807190643535e-05, + "loss": 1.8113, + "step": 2064 + }, + { + "epoch": 0.375053919676709, + "grad_norm": 0.40471023321151733, + "learning_rate": 7.23616272482378e-05, + "loss": 1.8835, + "step": 2065 + }, + { + "epoch": 0.37523554385088653, + "grad_norm": 0.7257471680641174, + "learning_rate": 7.233517476859507e-05, + "loss": 1.8243, + "step": 2066 + }, + { + "epoch": 0.37541716802506414, + "grad_norm": 0.8905494213104248, + "learning_rate": 7.230871447675953e-05, + "loss": 1.7535, + "step": 2067 + }, + { + "epoch": 0.3755987921992417, + "grad_norm": 0.3805006742477417, + "learning_rate": 7.22822463819862e-05, + "loss": 1.6871, + "step": 2068 + }, + { + "epoch": 0.3757804163734193, + "grad_norm": 0.3619759976863861, + "learning_rate": 7.225577049353279e-05, + "loss": 1.8952, + "step": 2069 + }, + { + "epoch": 0.3759620405475969, + "grad_norm": 0.37048909068107605, + "learning_rate": 7.222928682065988e-05, + "loss": 1.7228, + "step": 2070 + }, + { + "epoch": 0.37614366472177446, + "grad_norm": 0.3264259994029999, + "learning_rate": 7.220279537263063e-05, + "loss": 1.5046, + "step": 2071 + }, + { + "epoch": 0.37632528889595207, + "grad_norm": 0.38617372512817383, + "learning_rate": 7.2176296158711e-05, + "loss": 1.6832, + "step": 2072 + }, + { + "epoch": 0.3765069130701296, + "grad_norm": 0.520910918712616, + "learning_rate": 7.214978918816961e-05, + "loss": 1.8532, + "step": 2073 + }, + { + "epoch": 0.37668853724430723, + "grad_norm": 1.3047138452529907, + "learning_rate": 7.212327447027789e-05, + "loss": 1.8216, + "step": 2074 + }, + { + "epoch": 0.3768701614184848, + "grad_norm": 0.36616113781929016, + "learning_rate": 7.209675201430986e-05, + "loss": 1.8097, + "step": 2075 + }, + { + "epoch": 0.3770517855926624, + "grad_norm": 0.6841278672218323, + "learning_rate": 7.207022182954229e-05, + "loss": 1.7092, + "step": 2076 + }, + { + "epoch": 0.37723340976683994, + "grad_norm": 0.486497163772583, + "learning_rate": 7.204368392525471e-05, + "loss": 1.6527, + "step": 2077 + }, + { + "epoch": 0.37741503394101755, + "grad_norm": 1.4018718004226685, + "learning_rate": 7.20171383107293e-05, + "loss": 1.786, + "step": 2078 + }, + { + "epoch": 0.3775966581151951, + "grad_norm": 0.35615256428718567, + "learning_rate": 7.199058499525092e-05, + "loss": 1.7951, + "step": 2079 + }, + { + "epoch": 0.3777782822893727, + "grad_norm": 0.4319935142993927, + "learning_rate": 7.196402398810716e-05, + "loss": 1.8712, + "step": 2080 + }, + { + "epoch": 0.3779599064635503, + "grad_norm": 0.6475411653518677, + "learning_rate": 7.193745529858826e-05, + "loss": 1.9245, + "step": 2081 + }, + { + "epoch": 0.3781415306377279, + "grad_norm": 0.5579408407211304, + "learning_rate": 7.19108789359872e-05, + "loss": 1.6537, + "step": 2082 + }, + { + "epoch": 0.3783231548119055, + "grad_norm": 0.7122630476951599, + "learning_rate": 7.188429490959962e-05, + "loss": 1.8024, + "step": 2083 + }, + { + "epoch": 0.37850477898608303, + "grad_norm": 0.3606022596359253, + "learning_rate": 7.185770322872383e-05, + "loss": 1.7206, + "step": 2084 + }, + { + "epoch": 0.37868640316026064, + "grad_norm": 0.5910794734954834, + "learning_rate": 7.18311039026608e-05, + "loss": 1.8693, + "step": 2085 + }, + { + "epoch": 0.3788680273344382, + "grad_norm": 0.4407062828540802, + "learning_rate": 7.180449694071424e-05, + "loss": 1.5298, + "step": 2086 + }, + { + "epoch": 0.3790496515086158, + "grad_norm": 0.3445195257663727, + "learning_rate": 7.177788235219046e-05, + "loss": 1.815, + "step": 2087 + }, + { + "epoch": 0.37923127568279336, + "grad_norm": 0.3710117042064667, + "learning_rate": 7.175126014639847e-05, + "loss": 1.5716, + "step": 2088 + }, + { + "epoch": 0.37941289985697096, + "grad_norm": 0.5525126457214355, + "learning_rate": 7.172463033264996e-05, + "loss": 1.8277, + "step": 2089 + }, + { + "epoch": 0.3795945240311486, + "grad_norm": 0.37456047534942627, + "learning_rate": 7.169799292025925e-05, + "loss": 1.7623, + "step": 2090 + }, + { + "epoch": 0.3797761482053261, + "grad_norm": 0.34927424788475037, + "learning_rate": 7.167134791854333e-05, + "loss": 1.594, + "step": 2091 + }, + { + "epoch": 0.37995777237950373, + "grad_norm": 0.3264741599559784, + "learning_rate": 7.164469533682183e-05, + "loss": 1.7194, + "step": 2092 + }, + { + "epoch": 0.3801393965536813, + "grad_norm": 0.4133761525154114, + "learning_rate": 7.161803518441707e-05, + "loss": 1.6982, + "step": 2093 + }, + { + "epoch": 0.3803210207278589, + "grad_norm": 0.4528980255126953, + "learning_rate": 7.159136747065398e-05, + "loss": 1.6624, + "step": 2094 + }, + { + "epoch": 0.38050264490203645, + "grad_norm": 0.39494484663009644, + "learning_rate": 7.156469220486016e-05, + "loss": 1.7321, + "step": 2095 + }, + { + "epoch": 0.38068426907621405, + "grad_norm": 0.4944459795951843, + "learning_rate": 7.153800939636585e-05, + "loss": 1.9799, + "step": 2096 + }, + { + "epoch": 0.3808658932503916, + "grad_norm": 0.351252943277359, + "learning_rate": 7.151131905450386e-05, + "loss": 1.6909, + "step": 2097 + }, + { + "epoch": 0.3810475174245692, + "grad_norm": 0.4569198489189148, + "learning_rate": 7.148462118860977e-05, + "loss": 1.7953, + "step": 2098 + }, + { + "epoch": 0.38122914159874677, + "grad_norm": 0.4190079867839813, + "learning_rate": 7.145791580802165e-05, + "loss": 1.6931, + "step": 2099 + }, + { + "epoch": 0.3814107657729244, + "grad_norm": 0.3377469778060913, + "learning_rate": 7.143120292208032e-05, + "loss": 1.8106, + "step": 2100 + }, + { + "epoch": 0.381592389947102, + "grad_norm": 0.9633534550666809, + "learning_rate": 7.140448254012912e-05, + "loss": 1.8842, + "step": 2101 + }, + { + "epoch": 0.38177401412127954, + "grad_norm": 0.8164104223251343, + "learning_rate": 7.137775467151411e-05, + "loss": 1.8591, + "step": 2102 + }, + { + "epoch": 0.38195563829545714, + "grad_norm": 0.2948068678379059, + "learning_rate": 7.135101932558387e-05, + "loss": 1.655, + "step": 2103 + }, + { + "epoch": 0.3821372624696347, + "grad_norm": 0.35634955763816833, + "learning_rate": 7.132427651168967e-05, + "loss": 1.7125, + "step": 2104 + }, + { + "epoch": 0.3823188866438123, + "grad_norm": 0.3574617803096771, + "learning_rate": 7.129752623918537e-05, + "loss": 1.7128, + "step": 2105 + }, + { + "epoch": 0.38250051081798986, + "grad_norm": 0.3420691192150116, + "learning_rate": 7.127076851742742e-05, + "loss": 1.7287, + "step": 2106 + }, + { + "epoch": 0.38268213499216747, + "grad_norm": 0.31610989570617676, + "learning_rate": 7.12440033557749e-05, + "loss": 1.8795, + "step": 2107 + }, + { + "epoch": 0.382863759166345, + "grad_norm": 0.43567606806755066, + "learning_rate": 7.121723076358948e-05, + "loss": 1.8909, + "step": 2108 + }, + { + "epoch": 0.3830453833405226, + "grad_norm": 0.3029618561267853, + "learning_rate": 7.119045075023542e-05, + "loss": 1.8337, + "step": 2109 + }, + { + "epoch": 0.3832270075147002, + "grad_norm": 0.34153884649276733, + "learning_rate": 7.116366332507962e-05, + "loss": 1.8354, + "step": 2110 + }, + { + "epoch": 0.3834086316888778, + "grad_norm": 0.9145708680152893, + "learning_rate": 7.113686849749153e-05, + "loss": 1.7822, + "step": 2111 + }, + { + "epoch": 0.3835902558630554, + "grad_norm": 0.3457806706428528, + "learning_rate": 7.111006627684317e-05, + "loss": 1.7422, + "step": 2112 + }, + { + "epoch": 0.38377188003723295, + "grad_norm": 1.2494697570800781, + "learning_rate": 7.10832566725092e-05, + "loss": 1.8164, + "step": 2113 + }, + { + "epoch": 0.38395350421141056, + "grad_norm": 0.36807066202163696, + "learning_rate": 7.105643969386685e-05, + "loss": 1.7413, + "step": 2114 + }, + { + "epoch": 0.3841351283855881, + "grad_norm": 0.3917871117591858, + "learning_rate": 7.102961535029589e-05, + "loss": 1.9289, + "step": 2115 + }, + { + "epoch": 0.3843167525597657, + "grad_norm": 0.3582775592803955, + "learning_rate": 7.10027836511787e-05, + "loss": 1.8283, + "step": 2116 + }, + { + "epoch": 0.38449837673394327, + "grad_norm": 0.33837971091270447, + "learning_rate": 7.097594460590023e-05, + "loss": 1.714, + "step": 2117 + }, + { + "epoch": 0.3846800009081209, + "grad_norm": 0.44809168577194214, + "learning_rate": 7.0949098223848e-05, + "loss": 1.7204, + "step": 2118 + }, + { + "epoch": 0.38486162508229843, + "grad_norm": 0.3948080539703369, + "learning_rate": 7.092224451441208e-05, + "loss": 1.9199, + "step": 2119 + }, + { + "epoch": 0.38504324925647604, + "grad_norm": 0.3608780801296234, + "learning_rate": 7.089538348698512e-05, + "loss": 1.7939, + "step": 2120 + }, + { + "epoch": 0.38522487343065365, + "grad_norm": 0.3923419713973999, + "learning_rate": 7.086851515096233e-05, + "loss": 1.7707, + "step": 2121 + }, + { + "epoch": 0.3854064976048312, + "grad_norm": 0.5077818036079407, + "learning_rate": 7.084163951574146e-05, + "loss": 1.78, + "step": 2122 + }, + { + "epoch": 0.3855881217790088, + "grad_norm": 0.35794204473495483, + "learning_rate": 7.081475659072284e-05, + "loss": 1.8171, + "step": 2123 + }, + { + "epoch": 0.38576974595318636, + "grad_norm": 0.759251594543457, + "learning_rate": 7.078786638530933e-05, + "loss": 1.8057, + "step": 2124 + }, + { + "epoch": 0.38595137012736397, + "grad_norm": 0.5275394916534424, + "learning_rate": 7.07609689089063e-05, + "loss": 1.946, + "step": 2125 + }, + { + "epoch": 0.3861329943015415, + "grad_norm": 0.511479914188385, + "learning_rate": 7.073406417092178e-05, + "loss": 1.6371, + "step": 2126 + }, + { + "epoch": 0.38631461847571913, + "grad_norm": 0.3902972340583801, + "learning_rate": 7.07071521807662e-05, + "loss": 1.8427, + "step": 2127 + }, + { + "epoch": 0.3864962426498967, + "grad_norm": 0.6306132078170776, + "learning_rate": 7.068023294785258e-05, + "loss": 1.8058, + "step": 2128 + }, + { + "epoch": 0.3866778668240743, + "grad_norm": 0.2761375904083252, + "learning_rate": 7.065330648159656e-05, + "loss": 1.705, + "step": 2129 + }, + { + "epoch": 0.38685949099825184, + "grad_norm": 0.38106122612953186, + "learning_rate": 7.062637279141616e-05, + "loss": 1.7045, + "step": 2130 + }, + { + "epoch": 0.38704111517242945, + "grad_norm": 0.42503851652145386, + "learning_rate": 7.0599431886732e-05, + "loss": 1.8766, + "step": 2131 + }, + { + "epoch": 0.38722273934660706, + "grad_norm": 0.44965189695358276, + "learning_rate": 7.057248377696727e-05, + "loss": 1.8317, + "step": 2132 + }, + { + "epoch": 0.3874043635207846, + "grad_norm": 0.3616545498371124, + "learning_rate": 7.05455284715476e-05, + "loss": 1.8508, + "step": 2133 + }, + { + "epoch": 0.3875859876949622, + "grad_norm": 0.3645125925540924, + "learning_rate": 7.05185659799012e-05, + "loss": 1.7963, + "step": 2134 + }, + { + "epoch": 0.38776761186913977, + "grad_norm": 0.33430933952331543, + "learning_rate": 7.049159631145872e-05, + "loss": 1.7094, + "step": 2135 + }, + { + "epoch": 0.3879492360433174, + "grad_norm": 0.31295010447502136, + "learning_rate": 7.046461947565339e-05, + "loss": 1.5944, + "step": 2136 + }, + { + "epoch": 0.38813086021749493, + "grad_norm": 0.5572431087493896, + "learning_rate": 7.04376354819209e-05, + "loss": 1.817, + "step": 2137 + }, + { + "epoch": 0.38831248439167254, + "grad_norm": 0.4785575866699219, + "learning_rate": 7.04106443396995e-05, + "loss": 1.6409, + "step": 2138 + }, + { + "epoch": 0.3884941085658501, + "grad_norm": 0.5061618685722351, + "learning_rate": 7.038364605842989e-05, + "loss": 1.7885, + "step": 2139 + }, + { + "epoch": 0.3886757327400277, + "grad_norm": 0.48401740193367004, + "learning_rate": 7.035664064755526e-05, + "loss": 1.7045, + "step": 2140 + }, + { + "epoch": 0.38885735691420525, + "grad_norm": 0.6620643734931946, + "learning_rate": 7.032962811652133e-05, + "loss": 1.7998, + "step": 2141 + }, + { + "epoch": 0.38903898108838286, + "grad_norm": 0.39755627512931824, + "learning_rate": 7.030260847477631e-05, + "loss": 1.7357, + "step": 2142 + }, + { + "epoch": 0.38922060526256047, + "grad_norm": 0.5020614862442017, + "learning_rate": 7.027558173177087e-05, + "loss": 1.7965, + "step": 2143 + }, + { + "epoch": 0.389402229436738, + "grad_norm": 0.3829381465911865, + "learning_rate": 7.024854789695816e-05, + "loss": 1.6214, + "step": 2144 + }, + { + "epoch": 0.38958385361091563, + "grad_norm": 0.5170410871505737, + "learning_rate": 7.022150697979384e-05, + "loss": 1.9145, + "step": 2145 + }, + { + "epoch": 0.3897654777850932, + "grad_norm": 0.39996492862701416, + "learning_rate": 7.019445898973607e-05, + "loss": 1.6734, + "step": 2146 + }, + { + "epoch": 0.3899471019592708, + "grad_norm": 1.2337815761566162, + "learning_rate": 7.01674039362454e-05, + "loss": 1.8838, + "step": 2147 + }, + { + "epoch": 0.39012872613344834, + "grad_norm": 0.5079956650733948, + "learning_rate": 7.014034182878491e-05, + "loss": 1.7825, + "step": 2148 + }, + { + "epoch": 0.39031035030762595, + "grad_norm": 0.6083911061286926, + "learning_rate": 7.011327267682013e-05, + "loss": 1.9132, + "step": 2149 + }, + { + "epoch": 0.3904919744818035, + "grad_norm": 1.7887481451034546, + "learning_rate": 7.008619648981908e-05, + "loss": 1.6928, + "step": 2150 + }, + { + "epoch": 0.3906735986559811, + "grad_norm": 0.4164898693561554, + "learning_rate": 7.005911327725222e-05, + "loss": 1.793, + "step": 2151 + }, + { + "epoch": 0.3908552228301587, + "grad_norm": 0.6848735213279724, + "learning_rate": 7.003202304859245e-05, + "loss": 1.5719, + "step": 2152 + }, + { + "epoch": 0.3910368470043363, + "grad_norm": 0.6052497625350952, + "learning_rate": 7.000492581331516e-05, + "loss": 1.8276, + "step": 2153 + }, + { + "epoch": 0.3912184711785139, + "grad_norm": 0.7914738059043884, + "learning_rate": 6.997782158089815e-05, + "loss": 1.9078, + "step": 2154 + }, + { + "epoch": 0.39140009535269144, + "grad_norm": 0.47066614031791687, + "learning_rate": 6.995071036082173e-05, + "loss": 1.9329, + "step": 2155 + }, + { + "epoch": 0.39158171952686904, + "grad_norm": 0.4073311984539032, + "learning_rate": 6.992359216256855e-05, + "loss": 1.6451, + "step": 2156 + }, + { + "epoch": 0.3917633437010466, + "grad_norm": 0.442801833152771, + "learning_rate": 6.989646699562384e-05, + "loss": 1.8179, + "step": 2157 + }, + { + "epoch": 0.3919449678752242, + "grad_norm": 0.2907392680644989, + "learning_rate": 6.986933486947513e-05, + "loss": 1.7647, + "step": 2158 + }, + { + "epoch": 0.39212659204940176, + "grad_norm": 0.34170281887054443, + "learning_rate": 6.984219579361248e-05, + "loss": 1.6948, + "step": 2159 + }, + { + "epoch": 0.39230821622357936, + "grad_norm": 0.5229573845863342, + "learning_rate": 6.981504977752834e-05, + "loss": 1.9535, + "step": 2160 + }, + { + "epoch": 0.3924898403977569, + "grad_norm": 0.712793231010437, + "learning_rate": 6.97878968307176e-05, + "loss": 1.8778, + "step": 2161 + }, + { + "epoch": 0.3926714645719345, + "grad_norm": 0.6705805659294128, + "learning_rate": 6.976073696267757e-05, + "loss": 1.6923, + "step": 2162 + }, + { + "epoch": 0.39285308874611213, + "grad_norm": 0.39574000239372253, + "learning_rate": 6.973357018290796e-05, + "loss": 1.7201, + "step": 2163 + }, + { + "epoch": 0.3930347129202897, + "grad_norm": 0.3999633491039276, + "learning_rate": 6.970639650091095e-05, + "loss": 1.8819, + "step": 2164 + }, + { + "epoch": 0.3932163370944673, + "grad_norm": 0.4300141930580139, + "learning_rate": 6.967921592619104e-05, + "loss": 1.7685, + "step": 2165 + }, + { + "epoch": 0.39339796126864485, + "grad_norm": 1.3083165884017944, + "learning_rate": 6.965202846825529e-05, + "loss": 1.7166, + "step": 2166 + }, + { + "epoch": 0.39357958544282245, + "grad_norm": 0.48443445563316345, + "learning_rate": 6.962483413661301e-05, + "loss": 1.6994, + "step": 2167 + }, + { + "epoch": 0.393761209617, + "grad_norm": 0.5183535218238831, + "learning_rate": 6.959763294077602e-05, + "loss": 1.8272, + "step": 2168 + }, + { + "epoch": 0.3939428337911776, + "grad_norm": 0.7813742160797119, + "learning_rate": 6.957042489025849e-05, + "loss": 1.6645, + "step": 2169 + }, + { + "epoch": 0.39412445796535517, + "grad_norm": 0.4502968192100525, + "learning_rate": 6.954320999457702e-05, + "loss": 1.5638, + "step": 2170 + }, + { + "epoch": 0.3943060821395328, + "grad_norm": 0.34565645456314087, + "learning_rate": 6.951598826325056e-05, + "loss": 1.7605, + "step": 2171 + }, + { + "epoch": 0.39448770631371033, + "grad_norm": 0.4068540036678314, + "learning_rate": 6.948875970580049e-05, + "loss": 1.9529, + "step": 2172 + }, + { + "epoch": 0.39466933048788794, + "grad_norm": 0.3750416934490204, + "learning_rate": 6.946152433175058e-05, + "loss": 1.6194, + "step": 2173 + }, + { + "epoch": 0.39485095466206555, + "grad_norm": 0.3799959123134613, + "learning_rate": 6.943428215062695e-05, + "loss": 1.8714, + "step": 2174 + }, + { + "epoch": 0.3950325788362431, + "grad_norm": 0.9225115180015564, + "learning_rate": 6.940703317195812e-05, + "loss": 1.7108, + "step": 2175 + }, + { + "epoch": 0.3952142030104207, + "grad_norm": 0.34909629821777344, + "learning_rate": 6.9379777405275e-05, + "loss": 1.9022, + "step": 2176 + }, + { + "epoch": 0.39539582718459826, + "grad_norm": 1.14286470413208, + "learning_rate": 6.935251486011087e-05, + "loss": 2.1061, + "step": 2177 + }, + { + "epoch": 0.39557745135877587, + "grad_norm": 0.4109739363193512, + "learning_rate": 6.932524554600134e-05, + "loss": 1.7165, + "step": 2178 + }, + { + "epoch": 0.3957590755329534, + "grad_norm": 0.4729115962982178, + "learning_rate": 6.929796947248445e-05, + "loss": 1.7919, + "step": 2179 + }, + { + "epoch": 0.395940699707131, + "grad_norm": 0.5388248562812805, + "learning_rate": 6.927068664910058e-05, + "loss": 1.9217, + "step": 2180 + }, + { + "epoch": 0.3961223238813086, + "grad_norm": 0.39753592014312744, + "learning_rate": 6.924339708539244e-05, + "loss": 1.8241, + "step": 2181 + }, + { + "epoch": 0.3963039480554862, + "grad_norm": 0.3766293525695801, + "learning_rate": 6.921610079090513e-05, + "loss": 1.722, + "step": 2182 + }, + { + "epoch": 0.39648557222966374, + "grad_norm": 0.3090784251689911, + "learning_rate": 6.918879777518614e-05, + "loss": 1.8316, + "step": 2183 + }, + { + "epoch": 0.39666719640384135, + "grad_norm": 0.30846890807151794, + "learning_rate": 6.916148804778518e-05, + "loss": 1.6654, + "step": 2184 + }, + { + "epoch": 0.39684882057801896, + "grad_norm": 0.34221041202545166, + "learning_rate": 6.91341716182545e-05, + "loss": 1.9297, + "step": 2185 + }, + { + "epoch": 0.3970304447521965, + "grad_norm": 0.3193320333957672, + "learning_rate": 6.910684849614853e-05, + "loss": 1.7412, + "step": 2186 + }, + { + "epoch": 0.3972120689263741, + "grad_norm": 0.3544865548610687, + "learning_rate": 6.907951869102409e-05, + "loss": 1.7825, + "step": 2187 + }, + { + "epoch": 0.39739369310055167, + "grad_norm": 0.3137286603450775, + "learning_rate": 6.905218221244038e-05, + "loss": 1.8894, + "step": 2188 + }, + { + "epoch": 0.3975753172747293, + "grad_norm": 0.3521876037120819, + "learning_rate": 6.90248390699589e-05, + "loss": 1.7161, + "step": 2189 + }, + { + "epoch": 0.39775694144890683, + "grad_norm": 0.46548840403556824, + "learning_rate": 6.899748927314346e-05, + "loss": 1.7907, + "step": 2190 + }, + { + "epoch": 0.39793856562308444, + "grad_norm": 0.6324931383132935, + "learning_rate": 6.897013283156026e-05, + "loss": 1.759, + "step": 2191 + }, + { + "epoch": 0.398120189797262, + "grad_norm": 1.9628968238830566, + "learning_rate": 6.894276975477776e-05, + "loss": 1.9913, + "step": 2192 + }, + { + "epoch": 0.3983018139714396, + "grad_norm": 0.43245401978492737, + "learning_rate": 6.891540005236675e-05, + "loss": 1.697, + "step": 2193 + }, + { + "epoch": 0.3984834381456172, + "grad_norm": 0.6427620649337769, + "learning_rate": 6.88880237339004e-05, + "loss": 1.7914, + "step": 2194 + }, + { + "epoch": 0.39866506231979476, + "grad_norm": 0.35636720061302185, + "learning_rate": 6.886064080895412e-05, + "loss": 1.7612, + "step": 2195 + }, + { + "epoch": 0.39884668649397237, + "grad_norm": 0.45053014159202576, + "learning_rate": 6.883325128710565e-05, + "loss": 1.7607, + "step": 2196 + }, + { + "epoch": 0.3990283106681499, + "grad_norm": 0.4914367198944092, + "learning_rate": 6.880585517793507e-05, + "loss": 1.6959, + "step": 2197 + }, + { + "epoch": 0.39920993484232753, + "grad_norm": 0.3305487036705017, + "learning_rate": 6.877845249102472e-05, + "loss": 1.7379, + "step": 2198 + }, + { + "epoch": 0.3993915590165051, + "grad_norm": 0.4273001253604889, + "learning_rate": 6.875104323595927e-05, + "loss": 1.7546, + "step": 2199 + }, + { + "epoch": 0.3995731831906827, + "grad_norm": 1.2408090829849243, + "learning_rate": 6.872362742232568e-05, + "loss": 1.8795, + "step": 2200 + }, + { + "epoch": 0.39975480736486024, + "grad_norm": 0.4054024815559387, + "learning_rate": 6.869620505971321e-05, + "loss": 1.6489, + "step": 2201 + }, + { + "epoch": 0.39993643153903785, + "grad_norm": 0.9999191164970398, + "learning_rate": 6.866877615771336e-05, + "loss": 1.8218, + "step": 2202 + }, + { + "epoch": 0.4001180557132154, + "grad_norm": 0.37730956077575684, + "learning_rate": 6.864134072592001e-05, + "loss": 1.8203, + "step": 2203 + }, + { + "epoch": 0.400299679887393, + "grad_norm": 0.3234674632549286, + "learning_rate": 6.861389877392925e-05, + "loss": 1.6376, + "step": 2204 + }, + { + "epoch": 0.4004813040615706, + "grad_norm": 0.7312324047088623, + "learning_rate": 6.85864503113395e-05, + "loss": 1.7334, + "step": 2205 + }, + { + "epoch": 0.4006629282357482, + "grad_norm": 0.3776227831840515, + "learning_rate": 6.855899534775138e-05, + "loss": 1.7481, + "step": 2206 + }, + { + "epoch": 0.4008445524099258, + "grad_norm": 0.4556482434272766, + "learning_rate": 6.85315338927679e-05, + "loss": 1.7959, + "step": 2207 + }, + { + "epoch": 0.40102617658410333, + "grad_norm": 0.4360094368457794, + "learning_rate": 6.850406595599421e-05, + "loss": 1.809, + "step": 2208 + }, + { + "epoch": 0.40120780075828094, + "grad_norm": 0.4798350930213928, + "learning_rate": 6.847659154703785e-05, + "loss": 1.8526, + "step": 2209 + }, + { + "epoch": 0.4013894249324585, + "grad_norm": 0.689530611038208, + "learning_rate": 6.844911067550855e-05, + "loss": 1.7495, + "step": 2210 + }, + { + "epoch": 0.4015710491066361, + "grad_norm": 0.44987282156944275, + "learning_rate": 6.842162335101829e-05, + "loss": 1.9244, + "step": 2211 + }, + { + "epoch": 0.40175267328081365, + "grad_norm": 0.47981634736061096, + "learning_rate": 6.839412958318137e-05, + "loss": 1.8798, + "step": 2212 + }, + { + "epoch": 0.40193429745499126, + "grad_norm": 0.4148099422454834, + "learning_rate": 6.836662938161429e-05, + "loss": 1.8719, + "step": 2213 + }, + { + "epoch": 0.4021159216291688, + "grad_norm": 0.4042699337005615, + "learning_rate": 6.833912275593584e-05, + "loss": 1.6108, + "step": 2214 + }, + { + "epoch": 0.4022975458033464, + "grad_norm": 0.40106531977653503, + "learning_rate": 6.831160971576697e-05, + "loss": 1.7769, + "step": 2215 + }, + { + "epoch": 0.40247916997752403, + "grad_norm": 0.43849995732307434, + "learning_rate": 6.828409027073103e-05, + "loss": 1.6951, + "step": 2216 + }, + { + "epoch": 0.4026607941517016, + "grad_norm": 0.4882762134075165, + "learning_rate": 6.825656443045347e-05, + "loss": 1.9587, + "step": 2217 + }, + { + "epoch": 0.4028424183258792, + "grad_norm": 0.5033427476882935, + "learning_rate": 6.822903220456204e-05, + "loss": 1.7703, + "step": 2218 + }, + { + "epoch": 0.40302404250005675, + "grad_norm": 0.42220258712768555, + "learning_rate": 6.82014936026867e-05, + "loss": 1.5641, + "step": 2219 + }, + { + "epoch": 0.40320566667423435, + "grad_norm": 0.35014596581459045, + "learning_rate": 6.817394863445965e-05, + "loss": 1.699, + "step": 2220 + }, + { + "epoch": 0.4033872908484119, + "grad_norm": 0.36348533630371094, + "learning_rate": 6.814639730951532e-05, + "loss": 1.7574, + "step": 2221 + }, + { + "epoch": 0.4035689150225895, + "grad_norm": 0.4646308124065399, + "learning_rate": 6.811883963749037e-05, + "loss": 1.6934, + "step": 2222 + }, + { + "epoch": 0.40375053919676707, + "grad_norm": 0.3145178258419037, + "learning_rate": 6.809127562802364e-05, + "loss": 1.7524, + "step": 2223 + }, + { + "epoch": 0.4039321633709447, + "grad_norm": 0.36656272411346436, + "learning_rate": 6.806370529075625e-05, + "loss": 1.6645, + "step": 2224 + }, + { + "epoch": 0.4041137875451223, + "grad_norm": 0.37823110818862915, + "learning_rate": 6.803612863533148e-05, + "loss": 1.7822, + "step": 2225 + }, + { + "epoch": 0.40429541171929984, + "grad_norm": 0.48924437165260315, + "learning_rate": 6.800854567139485e-05, + "loss": 1.7577, + "step": 2226 + }, + { + "epoch": 0.40447703589347744, + "grad_norm": 0.423355370759964, + "learning_rate": 6.798095640859408e-05, + "loss": 1.9491, + "step": 2227 + }, + { + "epoch": 0.404658660067655, + "grad_norm": 0.35371506214141846, + "learning_rate": 6.795336085657907e-05, + "loss": 1.6949, + "step": 2228 + }, + { + "epoch": 0.4048402842418326, + "grad_norm": 0.30740195512771606, + "learning_rate": 6.792575902500197e-05, + "loss": 1.7093, + "step": 2229 + }, + { + "epoch": 0.40502190841601016, + "grad_norm": 0.33203569054603577, + "learning_rate": 6.789815092351706e-05, + "loss": 1.7991, + "step": 2230 + }, + { + "epoch": 0.40520353259018776, + "grad_norm": 0.29283925890922546, + "learning_rate": 6.787053656178087e-05, + "loss": 1.5856, + "step": 2231 + }, + { + "epoch": 0.4053851567643653, + "grad_norm": 0.4273466467857361, + "learning_rate": 6.78429159494521e-05, + "loss": 1.7891, + "step": 2232 + }, + { + "epoch": 0.4055667809385429, + "grad_norm": 0.3595409691333771, + "learning_rate": 6.781528909619163e-05, + "loss": 1.7992, + "step": 2233 + }, + { + "epoch": 0.4057484051127205, + "grad_norm": 0.36114946007728577, + "learning_rate": 6.778765601166253e-05, + "loss": 1.7101, + "step": 2234 + }, + { + "epoch": 0.4059300292868981, + "grad_norm": 0.9585410356521606, + "learning_rate": 6.776001670553005e-05, + "loss": 1.6458, + "step": 2235 + }, + { + "epoch": 0.4061116534610757, + "grad_norm": 0.3503116965293884, + "learning_rate": 6.77323711874616e-05, + "loss": 1.9144, + "step": 2236 + }, + { + "epoch": 0.40629327763525325, + "grad_norm": 0.6715781092643738, + "learning_rate": 6.770471946712679e-05, + "loss": 1.8044, + "step": 2237 + }, + { + "epoch": 0.40647490180943086, + "grad_norm": 0.3724325895309448, + "learning_rate": 6.767706155419738e-05, + "loss": 1.7626, + "step": 2238 + }, + { + "epoch": 0.4066565259836084, + "grad_norm": 0.34956610202789307, + "learning_rate": 6.76493974583473e-05, + "loss": 1.7573, + "step": 2239 + }, + { + "epoch": 0.406838150157786, + "grad_norm": 0.3933010995388031, + "learning_rate": 6.762172718925264e-05, + "loss": 1.7788, + "step": 2240 + }, + { + "epoch": 0.40701977433196357, + "grad_norm": 0.4806971848011017, + "learning_rate": 6.759405075659166e-05, + "loss": 1.746, + "step": 2241 + }, + { + "epoch": 0.4072013985061412, + "grad_norm": 0.4133935570716858, + "learning_rate": 6.756636817004478e-05, + "loss": 1.4704, + "step": 2242 + }, + { + "epoch": 0.40738302268031873, + "grad_norm": 0.7301498651504517, + "learning_rate": 6.753867943929453e-05, + "loss": 1.6006, + "step": 2243 + }, + { + "epoch": 0.40756464685449634, + "grad_norm": 0.3848535716533661, + "learning_rate": 6.751098457402564e-05, + "loss": 1.7025, + "step": 2244 + }, + { + "epoch": 0.4077462710286739, + "grad_norm": 0.3402934968471527, + "learning_rate": 6.7483283583925e-05, + "loss": 1.6747, + "step": 2245 + }, + { + "epoch": 0.4079278952028515, + "grad_norm": 0.5041416883468628, + "learning_rate": 6.745557647868153e-05, + "loss": 1.9029, + "step": 2246 + }, + { + "epoch": 0.4081095193770291, + "grad_norm": 0.343868613243103, + "learning_rate": 6.742786326798645e-05, + "loss": 1.7195, + "step": 2247 + }, + { + "epoch": 0.40829114355120666, + "grad_norm": 0.456773579120636, + "learning_rate": 6.740014396153297e-05, + "loss": 1.7701, + "step": 2248 + }, + { + "epoch": 0.40847276772538427, + "grad_norm": 0.46982482075691223, + "learning_rate": 6.737241856901653e-05, + "loss": 2.0512, + "step": 2249 + }, + { + "epoch": 0.4086543918995618, + "grad_norm": 0.48741814494132996, + "learning_rate": 6.734468710013465e-05, + "loss": 1.9322, + "step": 2250 + }, + { + "epoch": 0.40883601607373943, + "grad_norm": 0.8469735383987427, + "learning_rate": 6.731694956458701e-05, + "loss": 1.7529, + "step": 2251 + }, + { + "epoch": 0.409017640247917, + "grad_norm": 0.6116758584976196, + "learning_rate": 6.728920597207536e-05, + "loss": 1.7028, + "step": 2252 + }, + { + "epoch": 0.4091992644220946, + "grad_norm": 0.3362194299697876, + "learning_rate": 6.72614563323036e-05, + "loss": 1.7922, + "step": 2253 + }, + { + "epoch": 0.40938088859627214, + "grad_norm": 0.727567195892334, + "learning_rate": 6.723370065497779e-05, + "loss": 1.7933, + "step": 2254 + }, + { + "epoch": 0.40956251277044975, + "grad_norm": 0.31798166036605835, + "learning_rate": 6.7205938949806e-05, + "loss": 1.621, + "step": 2255 + }, + { + "epoch": 0.40974413694462736, + "grad_norm": 0.9914749264717102, + "learning_rate": 6.71781712264985e-05, + "loss": 1.87, + "step": 2256 + }, + { + "epoch": 0.4099257611188049, + "grad_norm": 0.3395661413669586, + "learning_rate": 6.715039749476763e-05, + "loss": 1.8063, + "step": 2257 + }, + { + "epoch": 0.4101073852929825, + "grad_norm": 0.46002137660980225, + "learning_rate": 6.712261776432782e-05, + "loss": 1.8653, + "step": 2258 + }, + { + "epoch": 0.41028900946716007, + "grad_norm": 1.5534108877182007, + "learning_rate": 6.709483204489559e-05, + "loss": 1.9489, + "step": 2259 + }, + { + "epoch": 0.4104706336413377, + "grad_norm": 0.4689635634422302, + "learning_rate": 6.706704034618962e-05, + "loss": 1.7589, + "step": 2260 + }, + { + "epoch": 0.41065225781551523, + "grad_norm": 0.456772118806839, + "learning_rate": 6.703924267793061e-05, + "loss": 1.7133, + "step": 2261 + }, + { + "epoch": 0.41083388198969284, + "grad_norm": 0.35180938243865967, + "learning_rate": 6.701143904984138e-05, + "loss": 1.7093, + "step": 2262 + }, + { + "epoch": 0.4110155061638704, + "grad_norm": 0.5188599228858948, + "learning_rate": 6.69836294716468e-05, + "loss": 1.7943, + "step": 2263 + }, + { + "epoch": 0.411197130338048, + "grad_norm": 0.37782058119773865, + "learning_rate": 6.695581395307389e-05, + "loss": 1.6543, + "step": 2264 + }, + { + "epoch": 0.41137875451222555, + "grad_norm": 0.42485857009887695, + "learning_rate": 6.692799250385168e-05, + "loss": 1.7446, + "step": 2265 + }, + { + "epoch": 0.41156037868640316, + "grad_norm": 0.3332241475582123, + "learning_rate": 6.690016513371132e-05, + "loss": 1.6562, + "step": 2266 + }, + { + "epoch": 0.41174200286058077, + "grad_norm": 0.298382431268692, + "learning_rate": 6.6872331852386e-05, + "loss": 1.6522, + "step": 2267 + }, + { + "epoch": 0.4119236270347583, + "grad_norm": 0.3389226794242859, + "learning_rate": 6.6844492669611e-05, + "loss": 1.672, + "step": 2268 + }, + { + "epoch": 0.41210525120893593, + "grad_norm": 0.4247990846633911, + "learning_rate": 6.681664759512366e-05, + "loss": 1.7494, + "step": 2269 + }, + { + "epoch": 0.4122868753831135, + "grad_norm": 0.3196603059768677, + "learning_rate": 6.678879663866336e-05, + "loss": 1.8455, + "step": 2270 + }, + { + "epoch": 0.4124684995572911, + "grad_norm": 0.3994220495223999, + "learning_rate": 6.676093980997155e-05, + "loss": 1.8891, + "step": 2271 + }, + { + "epoch": 0.41265012373146864, + "grad_norm": 1.4463670253753662, + "learning_rate": 6.673307711879173e-05, + "loss": 1.665, + "step": 2272 + }, + { + "epoch": 0.41283174790564625, + "grad_norm": 0.3565225303173065, + "learning_rate": 6.67052085748695e-05, + "loss": 1.8166, + "step": 2273 + }, + { + "epoch": 0.4130133720798238, + "grad_norm": 0.3561110496520996, + "learning_rate": 6.667733418795242e-05, + "loss": 1.6353, + "step": 2274 + }, + { + "epoch": 0.4131949962540014, + "grad_norm": 0.3797493278980255, + "learning_rate": 6.664945396779015e-05, + "loss": 1.7568, + "step": 2275 + }, + { + "epoch": 0.41337662042817896, + "grad_norm": 0.32113975286483765, + "learning_rate": 6.662156792413438e-05, + "loss": 1.8314, + "step": 2276 + }, + { + "epoch": 0.4135582446023566, + "grad_norm": 0.42882078886032104, + "learning_rate": 6.659367606673882e-05, + "loss": 1.6557, + "step": 2277 + }, + { + "epoch": 0.4137398687765342, + "grad_norm": 0.454691618680954, + "learning_rate": 6.656577840535926e-05, + "loss": 1.6821, + "step": 2278 + }, + { + "epoch": 0.41392149295071173, + "grad_norm": 0.4323587417602539, + "learning_rate": 6.653787494975348e-05, + "loss": 1.7575, + "step": 2279 + }, + { + "epoch": 0.41410311712488934, + "grad_norm": 0.4747733771800995, + "learning_rate": 6.650996570968129e-05, + "loss": 1.7988, + "step": 2280 + }, + { + "epoch": 0.4142847412990669, + "grad_norm": 0.48346513509750366, + "learning_rate": 6.648205069490451e-05, + "loss": 1.8987, + "step": 2281 + }, + { + "epoch": 0.4144663654732445, + "grad_norm": 0.7901358008384705, + "learning_rate": 6.645412991518702e-05, + "loss": 1.7997, + "step": 2282 + }, + { + "epoch": 0.41464798964742206, + "grad_norm": 0.3922833502292633, + "learning_rate": 6.642620338029468e-05, + "loss": 1.861, + "step": 2283 + }, + { + "epoch": 0.41482961382159966, + "grad_norm": 0.3686610758304596, + "learning_rate": 6.63982710999954e-05, + "loss": 1.6827, + "step": 2284 + }, + { + "epoch": 0.4150112379957772, + "grad_norm": 0.7720385789871216, + "learning_rate": 6.637033308405905e-05, + "loss": 1.6939, + "step": 2285 + }, + { + "epoch": 0.4151928621699548, + "grad_norm": 0.47562727332115173, + "learning_rate": 6.634238934225755e-05, + "loss": 1.697, + "step": 2286 + }, + { + "epoch": 0.4153744863441324, + "grad_norm": 0.9161959290504456, + "learning_rate": 6.63144398843648e-05, + "loss": 1.7381, + "step": 2287 + }, + { + "epoch": 0.41555611051831, + "grad_norm": 0.43848729133605957, + "learning_rate": 6.628648472015672e-05, + "loss": 1.9125, + "step": 2288 + }, + { + "epoch": 0.4157377346924876, + "grad_norm": 0.3412732183933258, + "learning_rate": 6.625852385941119e-05, + "loss": 1.6318, + "step": 2289 + }, + { + "epoch": 0.41591935886666515, + "grad_norm": 0.3965975344181061, + "learning_rate": 6.62305573119081e-05, + "loss": 1.7014, + "step": 2290 + }, + { + "epoch": 0.41610098304084275, + "grad_norm": 1.069249153137207, + "learning_rate": 6.620258508742935e-05, + "loss": 1.9928, + "step": 2291 + }, + { + "epoch": 0.4162826072150203, + "grad_norm": 0.45243364572525024, + "learning_rate": 6.617460719575884e-05, + "loss": 1.8375, + "step": 2292 + }, + { + "epoch": 0.4164642313891979, + "grad_norm": 0.40674883127212524, + "learning_rate": 6.614662364668235e-05, + "loss": 1.7535, + "step": 2293 + }, + { + "epoch": 0.41664585556337547, + "grad_norm": 0.46788856387138367, + "learning_rate": 6.611863444998775e-05, + "loss": 1.9503, + "step": 2294 + }, + { + "epoch": 0.4168274797375531, + "grad_norm": 0.4412577152252197, + "learning_rate": 6.609063961546484e-05, + "loss": 1.7147, + "step": 2295 + }, + { + "epoch": 0.4170091039117306, + "grad_norm": 0.4114397466182709, + "learning_rate": 6.606263915290538e-05, + "loss": 1.8556, + "step": 2296 + }, + { + "epoch": 0.41719072808590824, + "grad_norm": 0.39450541138648987, + "learning_rate": 6.603463307210316e-05, + "loss": 1.7252, + "step": 2297 + }, + { + "epoch": 0.41737235226008584, + "grad_norm": 0.5031891465187073, + "learning_rate": 6.600662138285384e-05, + "loss": 1.6749, + "step": 2298 + }, + { + "epoch": 0.4175539764342634, + "grad_norm": 0.34434232115745544, + "learning_rate": 6.597860409495513e-05, + "loss": 1.5839, + "step": 2299 + }, + { + "epoch": 0.417735600608441, + "grad_norm": 0.4032187759876251, + "learning_rate": 6.595058121820662e-05, + "loss": 1.7251, + "step": 2300 + }, + { + "epoch": 0.41791722478261856, + "grad_norm": 0.47533273696899414, + "learning_rate": 6.592255276240994e-05, + "loss": 1.7863, + "step": 2301 + }, + { + "epoch": 0.41809884895679617, + "grad_norm": 0.33092233538627625, + "learning_rate": 6.589451873736859e-05, + "loss": 1.653, + "step": 2302 + }, + { + "epoch": 0.4182804731309737, + "grad_norm": 0.350396066904068, + "learning_rate": 6.586647915288808e-05, + "loss": 1.7071, + "step": 2303 + }, + { + "epoch": 0.4184620973051513, + "grad_norm": 0.5889908075332642, + "learning_rate": 6.583843401877584e-05, + "loss": 1.8519, + "step": 2304 + }, + { + "epoch": 0.4186437214793289, + "grad_norm": 0.483749121427536, + "learning_rate": 6.58103833448412e-05, + "loss": 1.9066, + "step": 2305 + }, + { + "epoch": 0.4188253456535065, + "grad_norm": 0.2735447585582733, + "learning_rate": 6.578232714089551e-05, + "loss": 1.9553, + "step": 2306 + }, + { + "epoch": 0.41900696982768404, + "grad_norm": 0.367914617061615, + "learning_rate": 6.5754265416752e-05, + "loss": 1.7239, + "step": 2307 + }, + { + "epoch": 0.41918859400186165, + "grad_norm": 0.37746110558509827, + "learning_rate": 6.572619818222585e-05, + "loss": 1.74, + "step": 2308 + }, + { + "epoch": 0.41937021817603926, + "grad_norm": 0.5195026397705078, + "learning_rate": 6.569812544713415e-05, + "loss": 1.9042, + "step": 2309 + }, + { + "epoch": 0.4195518423502168, + "grad_norm": 0.45533159375190735, + "learning_rate": 6.567004722129591e-05, + "loss": 1.8502, + "step": 2310 + }, + { + "epoch": 0.4197334665243944, + "grad_norm": 0.29701051115989685, + "learning_rate": 6.564196351453209e-05, + "loss": 1.6585, + "step": 2311 + }, + { + "epoch": 0.41991509069857197, + "grad_norm": 0.3349330723285675, + "learning_rate": 6.561387433666558e-05, + "loss": 1.9703, + "step": 2312 + }, + { + "epoch": 0.4200967148727496, + "grad_norm": 0.4905124008655548, + "learning_rate": 6.558577969752111e-05, + "loss": 2.0711, + "step": 2313 + }, + { + "epoch": 0.42027833904692713, + "grad_norm": 0.2982684075832367, + "learning_rate": 6.555767960692538e-05, + "loss": 1.994, + "step": 2314 + }, + { + "epoch": 0.42045996322110474, + "grad_norm": 0.3548983931541443, + "learning_rate": 6.552957407470699e-05, + "loss": 1.4865, + "step": 2315 + }, + { + "epoch": 0.4206415873952823, + "grad_norm": 0.37099364399909973, + "learning_rate": 6.550146311069643e-05, + "loss": 1.8621, + "step": 2316 + }, + { + "epoch": 0.4208232115694599, + "grad_norm": 0.42554375529289246, + "learning_rate": 6.54733467247261e-05, + "loss": 1.8678, + "step": 2317 + }, + { + "epoch": 0.42100483574363745, + "grad_norm": 0.3795839846134186, + "learning_rate": 6.544522492663028e-05, + "loss": 1.7916, + "step": 2318 + }, + { + "epoch": 0.42118645991781506, + "grad_norm": 0.573942244052887, + "learning_rate": 6.541709772624517e-05, + "loss": 1.9229, + "step": 2319 + }, + { + "epoch": 0.42136808409199267, + "grad_norm": 0.2982983887195587, + "learning_rate": 6.538896513340884e-05, + "loss": 1.858, + "step": 2320 + }, + { + "epoch": 0.4215497082661702, + "grad_norm": 0.40819987654685974, + "learning_rate": 6.536082715796125e-05, + "loss": 1.6255, + "step": 2321 + }, + { + "epoch": 0.42173133244034783, + "grad_norm": 0.5527991652488708, + "learning_rate": 6.533268380974424e-05, + "loss": 1.6984, + "step": 2322 + }, + { + "epoch": 0.4219129566145254, + "grad_norm": 0.3869839608669281, + "learning_rate": 6.530453509860153e-05, + "loss": 1.6857, + "step": 2323 + }, + { + "epoch": 0.422094580788703, + "grad_norm": 0.33319738507270813, + "learning_rate": 6.527638103437872e-05, + "loss": 1.4931, + "step": 2324 + }, + { + "epoch": 0.42227620496288054, + "grad_norm": 0.3095543682575226, + "learning_rate": 6.524822162692328e-05, + "loss": 1.667, + "step": 2325 + }, + { + "epoch": 0.42245782913705815, + "grad_norm": 0.48349517583847046, + "learning_rate": 6.522005688608456e-05, + "loss": 1.6331, + "step": 2326 + }, + { + "epoch": 0.4226394533112357, + "grad_norm": 0.339102178812027, + "learning_rate": 6.519188682171377e-05, + "loss": 1.6968, + "step": 2327 + }, + { + "epoch": 0.4228210774854133, + "grad_norm": 0.2920970320701599, + "learning_rate": 6.516371144366395e-05, + "loss": 1.4801, + "step": 2328 + }, + { + "epoch": 0.4230027016595909, + "grad_norm": 0.5081005692481995, + "learning_rate": 6.513553076179005e-05, + "loss": 1.9193, + "step": 2329 + }, + { + "epoch": 0.42318432583376847, + "grad_norm": 0.370382159948349, + "learning_rate": 6.510734478594884e-05, + "loss": 1.7528, + "step": 2330 + }, + { + "epoch": 0.4233659500079461, + "grad_norm": 0.6081043481826782, + "learning_rate": 6.507915352599895e-05, + "loss": 1.728, + "step": 2331 + }, + { + "epoch": 0.42354757418212363, + "grad_norm": 0.3403717577457428, + "learning_rate": 6.505095699180088e-05, + "loss": 1.6737, + "step": 2332 + }, + { + "epoch": 0.42372919835630124, + "grad_norm": 0.9446196556091309, + "learning_rate": 6.50227551932169e-05, + "loss": 1.7914, + "step": 2333 + }, + { + "epoch": 0.4239108225304788, + "grad_norm": 0.382093608379364, + "learning_rate": 6.499454814011126e-05, + "loss": 1.7792, + "step": 2334 + }, + { + "epoch": 0.4240924467046564, + "grad_norm": 0.31605127453804016, + "learning_rate": 6.49663358423499e-05, + "loss": 1.8648, + "step": 2335 + }, + { + "epoch": 0.42427407087883395, + "grad_norm": 0.45468422770500183, + "learning_rate": 6.493811830980067e-05, + "loss": 1.902, + "step": 2336 + }, + { + "epoch": 0.42445569505301156, + "grad_norm": 0.40957608819007874, + "learning_rate": 6.490989555233327e-05, + "loss": 2.0634, + "step": 2337 + }, + { + "epoch": 0.4246373192271891, + "grad_norm": 0.4399091899394989, + "learning_rate": 6.488166757981919e-05, + "loss": 1.7121, + "step": 2338 + }, + { + "epoch": 0.4248189434013667, + "grad_norm": 0.3772726356983185, + "learning_rate": 6.485343440213171e-05, + "loss": 1.6362, + "step": 2339 + }, + { + "epoch": 0.42500056757554433, + "grad_norm": 0.3675272464752197, + "learning_rate": 6.482519602914603e-05, + "loss": 1.8137, + "step": 2340 + }, + { + "epoch": 0.4251821917497219, + "grad_norm": 0.5758700370788574, + "learning_rate": 6.479695247073907e-05, + "loss": 1.9288, + "step": 2341 + }, + { + "epoch": 0.4253638159238995, + "grad_norm": 0.36714816093444824, + "learning_rate": 6.47687037367896e-05, + "loss": 1.624, + "step": 2342 + }, + { + "epoch": 0.42554544009807704, + "grad_norm": 0.5979301929473877, + "learning_rate": 6.474044983717824e-05, + "loss": 2.1203, + "step": 2343 + }, + { + "epoch": 0.42572706427225465, + "grad_norm": 0.35251933336257935, + "learning_rate": 6.471219078178735e-05, + "loss": 1.6645, + "step": 2344 + }, + { + "epoch": 0.4259086884464322, + "grad_norm": 0.8911857604980469, + "learning_rate": 6.468392658050113e-05, + "loss": 1.7985, + "step": 2345 + }, + { + "epoch": 0.4260903126206098, + "grad_norm": 0.6252380609512329, + "learning_rate": 6.465565724320558e-05, + "loss": 1.9426, + "step": 2346 + }, + { + "epoch": 0.42627193679478736, + "grad_norm": 0.3900127708911896, + "learning_rate": 6.462738277978849e-05, + "loss": 1.797, + "step": 2347 + }, + { + "epoch": 0.426453560968965, + "grad_norm": 0.36386042833328247, + "learning_rate": 6.459910320013942e-05, + "loss": 1.8762, + "step": 2348 + }, + { + "epoch": 0.4266351851431425, + "grad_norm": 0.3853292465209961, + "learning_rate": 6.457081851414977e-05, + "loss": 1.7378, + "step": 2349 + }, + { + "epoch": 0.42681680931732013, + "grad_norm": 0.4155346155166626, + "learning_rate": 6.454252873171269e-05, + "loss": 1.7926, + "step": 2350 + }, + { + "epoch": 0.42699843349149774, + "grad_norm": 0.3408297598361969, + "learning_rate": 6.451423386272312e-05, + "loss": 1.6713, + "step": 2351 + }, + { + "epoch": 0.4271800576656753, + "grad_norm": 0.3424580693244934, + "learning_rate": 6.448593391707779e-05, + "loss": 1.9659, + "step": 2352 + }, + { + "epoch": 0.4273616818398529, + "grad_norm": 0.4230232238769531, + "learning_rate": 6.445762890467517e-05, + "loss": 1.779, + "step": 2353 + }, + { + "epoch": 0.42754330601403046, + "grad_norm": 0.3702353239059448, + "learning_rate": 6.442931883541554e-05, + "loss": 1.7283, + "step": 2354 + }, + { + "epoch": 0.42772493018820806, + "grad_norm": 0.38063743710517883, + "learning_rate": 6.440100371920095e-05, + "loss": 1.8043, + "step": 2355 + }, + { + "epoch": 0.4279065543623856, + "grad_norm": 0.36361294984817505, + "learning_rate": 6.43726835659352e-05, + "loss": 1.5331, + "step": 2356 + }, + { + "epoch": 0.4280881785365632, + "grad_norm": 0.3519216477870941, + "learning_rate": 6.434435838552385e-05, + "loss": 1.7485, + "step": 2357 + }, + { + "epoch": 0.4282698027107408, + "grad_norm": 0.4753742218017578, + "learning_rate": 6.43160281878742e-05, + "loss": 1.83, + "step": 2358 + }, + { + "epoch": 0.4284514268849184, + "grad_norm": 0.3569190800189972, + "learning_rate": 6.428769298289534e-05, + "loss": 1.7546, + "step": 2359 + }, + { + "epoch": 0.428633051059096, + "grad_norm": 0.3084384799003601, + "learning_rate": 6.425935278049813e-05, + "loss": 1.6105, + "step": 2360 + }, + { + "epoch": 0.42881467523327355, + "grad_norm": 0.46572986245155334, + "learning_rate": 6.423100759059509e-05, + "loss": 1.8223, + "step": 2361 + }, + { + "epoch": 0.42899629940745115, + "grad_norm": 0.37671148777008057, + "learning_rate": 6.42026574231006e-05, + "loss": 1.6686, + "step": 2362 + }, + { + "epoch": 0.4291779235816287, + "grad_norm": 0.46332183480262756, + "learning_rate": 6.417430228793069e-05, + "loss": 1.7364, + "step": 2363 + }, + { + "epoch": 0.4293595477558063, + "grad_norm": 0.4585730731487274, + "learning_rate": 6.414594219500313e-05, + "loss": 1.6993, + "step": 2364 + }, + { + "epoch": 0.42954117192998387, + "grad_norm": 1.0186278820037842, + "learning_rate": 6.411757715423751e-05, + "loss": 1.8018, + "step": 2365 + }, + { + "epoch": 0.4297227961041615, + "grad_norm": 0.291267067193985, + "learning_rate": 6.408920717555507e-05, + "loss": 1.8182, + "step": 2366 + }, + { + "epoch": 0.42990442027833903, + "grad_norm": 0.4304133355617523, + "learning_rate": 6.406083226887881e-05, + "loss": 1.7682, + "step": 2367 + }, + { + "epoch": 0.43008604445251664, + "grad_norm": 0.4137377142906189, + "learning_rate": 6.40324524441334e-05, + "loss": 1.5798, + "step": 2368 + }, + { + "epoch": 0.4302676686266942, + "grad_norm": 0.604885995388031, + "learning_rate": 6.400406771124536e-05, + "loss": 1.855, + "step": 2369 + }, + { + "epoch": 0.4304492928008718, + "grad_norm": 0.40429478883743286, + "learning_rate": 6.397567808014276e-05, + "loss": 1.7721, + "step": 2370 + }, + { + "epoch": 0.4306309169750494, + "grad_norm": 0.276467889547348, + "learning_rate": 6.394728356075551e-05, + "loss": 1.7535, + "step": 2371 + }, + { + "epoch": 0.43081254114922696, + "grad_norm": 0.8406744599342346, + "learning_rate": 6.391888416301518e-05, + "loss": 1.7104, + "step": 2372 + }, + { + "epoch": 0.43099416532340457, + "grad_norm": 0.6406803131103516, + "learning_rate": 6.389047989685503e-05, + "loss": 1.711, + "step": 2373 + }, + { + "epoch": 0.4311757894975821, + "grad_norm": 0.47679397463798523, + "learning_rate": 6.386207077221005e-05, + "loss": 1.7212, + "step": 2374 + }, + { + "epoch": 0.4313574136717597, + "grad_norm": 0.5304102897644043, + "learning_rate": 6.383365679901696e-05, + "loss": 1.8356, + "step": 2375 + }, + { + "epoch": 0.4315390378459373, + "grad_norm": 0.30921950936317444, + "learning_rate": 6.38052379872141e-05, + "loss": 1.7164, + "step": 2376 + }, + { + "epoch": 0.4317206620201149, + "grad_norm": 0.4314732849597931, + "learning_rate": 6.377681434674154e-05, + "loss": 1.9203, + "step": 2377 + }, + { + "epoch": 0.43190228619429244, + "grad_norm": 0.32476410269737244, + "learning_rate": 6.374838588754108e-05, + "loss": 1.7086, + "step": 2378 + }, + { + "epoch": 0.43208391036847005, + "grad_norm": 2.053415536880493, + "learning_rate": 6.371995261955612e-05, + "loss": 1.929, + "step": 2379 + }, + { + "epoch": 0.4322655345426476, + "grad_norm": 0.462141215801239, + "learning_rate": 6.36915145527318e-05, + "loss": 1.8901, + "step": 2380 + }, + { + "epoch": 0.4324471587168252, + "grad_norm": 0.42606332898139954, + "learning_rate": 6.366307169701496e-05, + "loss": 1.8622, + "step": 2381 + }, + { + "epoch": 0.4326287828910028, + "grad_norm": 0.36178579926490784, + "learning_rate": 6.363462406235403e-05, + "loss": 1.708, + "step": 2382 + }, + { + "epoch": 0.43281040706518037, + "grad_norm": 1.08635675907135, + "learning_rate": 6.360617165869919e-05, + "loss": 1.7909, + "step": 2383 + }, + { + "epoch": 0.432992031239358, + "grad_norm": 0.3858637809753418, + "learning_rate": 6.357771449600227e-05, + "loss": 1.6361, + "step": 2384 + }, + { + "epoch": 0.43317365541353553, + "grad_norm": 0.48989787697792053, + "learning_rate": 6.354925258421675e-05, + "loss": 1.794, + "step": 2385 + }, + { + "epoch": 0.43335527958771314, + "grad_norm": 0.5650635957717896, + "learning_rate": 6.352078593329776e-05, + "loss": 1.9358, + "step": 2386 + }, + { + "epoch": 0.4335369037618907, + "grad_norm": 0.521224319934845, + "learning_rate": 6.349231455320214e-05, + "loss": 1.795, + "step": 2387 + }, + { + "epoch": 0.4337185279360683, + "grad_norm": 0.3765721321105957, + "learning_rate": 6.346383845388831e-05, + "loss": 1.7945, + "step": 2388 + }, + { + "epoch": 0.43390015211024585, + "grad_norm": 0.3259618580341339, + "learning_rate": 6.343535764531639e-05, + "loss": 1.8722, + "step": 2389 + }, + { + "epoch": 0.43408177628442346, + "grad_norm": 0.32835668325424194, + "learning_rate": 6.340687213744814e-05, + "loss": 1.6558, + "step": 2390 + }, + { + "epoch": 0.434263400458601, + "grad_norm": 0.35318729281425476, + "learning_rate": 6.337838194024697e-05, + "loss": 1.6496, + "step": 2391 + }, + { + "epoch": 0.4344450246327786, + "grad_norm": 0.4071395993232727, + "learning_rate": 6.334988706367788e-05, + "loss": 1.6869, + "step": 2392 + }, + { + "epoch": 0.43462664880695623, + "grad_norm": 0.5782719254493713, + "learning_rate": 6.332138751770762e-05, + "loss": 1.8465, + "step": 2393 + }, + { + "epoch": 0.4348082729811338, + "grad_norm": 0.3708358407020569, + "learning_rate": 6.329288331230443e-05, + "loss": 1.3687, + "step": 2394 + }, + { + "epoch": 0.4349898971553114, + "grad_norm": 0.4711517095565796, + "learning_rate": 6.326437445743829e-05, + "loss": 1.8816, + "step": 2395 + }, + { + "epoch": 0.43517152132948894, + "grad_norm": 0.3840806484222412, + "learning_rate": 6.323586096308075e-05, + "loss": 1.6608, + "step": 2396 + }, + { + "epoch": 0.43535314550366655, + "grad_norm": 0.37548530101776123, + "learning_rate": 6.320734283920502e-05, + "loss": 1.8066, + "step": 2397 + }, + { + "epoch": 0.4355347696778441, + "grad_norm": 0.7099406719207764, + "learning_rate": 6.317882009578586e-05, + "loss": 1.9055, + "step": 2398 + }, + { + "epoch": 0.4357163938520217, + "grad_norm": 0.46456077694892883, + "learning_rate": 6.315029274279976e-05, + "loss": 1.7758, + "step": 2399 + }, + { + "epoch": 0.43589801802619926, + "grad_norm": 0.35314643383026123, + "learning_rate": 6.31217607902247e-05, + "loss": 1.6695, + "step": 2400 + }, + { + "epoch": 0.43607964220037687, + "grad_norm": 0.9103403687477112, + "learning_rate": 6.309322424804034e-05, + "loss": 1.6601, + "step": 2401 + }, + { + "epoch": 0.4362612663745545, + "grad_norm": 0.3183012306690216, + "learning_rate": 6.306468312622795e-05, + "loss": 1.7659, + "step": 2402 + }, + { + "epoch": 0.43644289054873203, + "grad_norm": 0.5552078485488892, + "learning_rate": 6.303613743477036e-05, + "loss": 1.8556, + "step": 2403 + }, + { + "epoch": 0.43662451472290964, + "grad_norm": 0.33087271451950073, + "learning_rate": 6.300758718365203e-05, + "loss": 1.9257, + "step": 2404 + }, + { + "epoch": 0.4368061388970872, + "grad_norm": 0.4281154274940491, + "learning_rate": 6.2979032382859e-05, + "loss": 1.7186, + "step": 2405 + }, + { + "epoch": 0.4369877630712648, + "grad_norm": 0.5242148637771606, + "learning_rate": 6.295047304237893e-05, + "loss": 1.9079, + "step": 2406 + }, + { + "epoch": 0.43716938724544235, + "grad_norm": 0.4214757978916168, + "learning_rate": 6.292190917220101e-05, + "loss": 1.6174, + "step": 2407 + }, + { + "epoch": 0.43735101141961996, + "grad_norm": 1.0559252500534058, + "learning_rate": 6.289334078231609e-05, + "loss": 2.092, + "step": 2408 + }, + { + "epoch": 0.4375326355937975, + "grad_norm": 0.33853915333747864, + "learning_rate": 6.286476788271649e-05, + "loss": 1.7252, + "step": 2409 + }, + { + "epoch": 0.4377142597679751, + "grad_norm": 0.8812536001205444, + "learning_rate": 6.283619048339623e-05, + "loss": 1.9597, + "step": 2410 + }, + { + "epoch": 0.4378958839421527, + "grad_norm": 0.44149577617645264, + "learning_rate": 6.280760859435087e-05, + "loss": 1.7163, + "step": 2411 + }, + { + "epoch": 0.4380775081163303, + "grad_norm": 0.5489981770515442, + "learning_rate": 6.277902222557749e-05, + "loss": 1.9892, + "step": 2412 + }, + { + "epoch": 0.4382591322905079, + "grad_norm": 1.3137460947036743, + "learning_rate": 6.275043138707475e-05, + "loss": 1.8622, + "step": 2413 + }, + { + "epoch": 0.43844075646468544, + "grad_norm": 0.35573503375053406, + "learning_rate": 6.272183608884292e-05, + "loss": 1.756, + "step": 2414 + }, + { + "epoch": 0.43862238063886305, + "grad_norm": 0.39394116401672363, + "learning_rate": 6.269323634088383e-05, + "loss": 1.6955, + "step": 2415 + }, + { + "epoch": 0.4388040048130406, + "grad_norm": 0.3642912209033966, + "learning_rate": 6.266463215320075e-05, + "loss": 1.834, + "step": 2416 + }, + { + "epoch": 0.4389856289872182, + "grad_norm": 0.34496936202049255, + "learning_rate": 6.263602353579868e-05, + "loss": 2.011, + "step": 2417 + }, + { + "epoch": 0.43916725316139577, + "grad_norm": 0.3905300796031952, + "learning_rate": 6.260741049868401e-05, + "loss": 1.9158, + "step": 2418 + }, + { + "epoch": 0.4393488773355734, + "grad_norm": 0.4928402006626129, + "learning_rate": 6.257879305186482e-05, + "loss": 1.8061, + "step": 2419 + }, + { + "epoch": 0.4395305015097509, + "grad_norm": 0.5031622052192688, + "learning_rate": 6.255017120535059e-05, + "loss": 1.7105, + "step": 2420 + }, + { + "epoch": 0.43971212568392853, + "grad_norm": 0.2907533645629883, + "learning_rate": 6.252154496915244e-05, + "loss": 1.8001, + "step": 2421 + }, + { + "epoch": 0.4398937498581061, + "grad_norm": 0.628521740436554, + "learning_rate": 6.2492914353283e-05, + "loss": 1.9542, + "step": 2422 + }, + { + "epoch": 0.4400753740322837, + "grad_norm": 0.36374348402023315, + "learning_rate": 6.246427936775639e-05, + "loss": 1.7943, + "step": 2423 + }, + { + "epoch": 0.4402569982064613, + "grad_norm": 0.4003053605556488, + "learning_rate": 6.243564002258833e-05, + "loss": 1.6393, + "step": 2424 + }, + { + "epoch": 0.44043862238063886, + "grad_norm": 0.357403039932251, + "learning_rate": 6.240699632779602e-05, + "loss": 1.8367, + "step": 2425 + }, + { + "epoch": 0.44062024655481646, + "grad_norm": 0.39514589309692383, + "learning_rate": 6.237834829339817e-05, + "loss": 1.504, + "step": 2426 + }, + { + "epoch": 0.440801870728994, + "grad_norm": 0.47692182660102844, + "learning_rate": 6.234969592941504e-05, + "loss": 1.741, + "step": 2427 + }, + { + "epoch": 0.4409834949031716, + "grad_norm": 0.36549925804138184, + "learning_rate": 6.232103924586841e-05, + "loss": 1.8018, + "step": 2428 + }, + { + "epoch": 0.4411651190773492, + "grad_norm": 0.4405839145183563, + "learning_rate": 6.22923782527815e-05, + "loss": 1.624, + "step": 2429 + }, + { + "epoch": 0.4413467432515268, + "grad_norm": 0.3338333070278168, + "learning_rate": 6.226371296017916e-05, + "loss": 1.9147, + "step": 2430 + }, + { + "epoch": 0.44152836742570434, + "grad_norm": 0.3854905962944031, + "learning_rate": 6.223504337808761e-05, + "loss": 1.7115, + "step": 2431 + }, + { + "epoch": 0.44170999159988195, + "grad_norm": 0.3366553485393524, + "learning_rate": 6.220636951653467e-05, + "loss": 1.7548, + "step": 2432 + }, + { + "epoch": 0.44189161577405955, + "grad_norm": 0.41535547375679016, + "learning_rate": 6.21776913855496e-05, + "loss": 1.7746, + "step": 2433 + }, + { + "epoch": 0.4420732399482371, + "grad_norm": 0.3960694968700409, + "learning_rate": 6.21490089951632e-05, + "loss": 1.5385, + "step": 2434 + }, + { + "epoch": 0.4422548641224147, + "grad_norm": 0.3237675726413727, + "learning_rate": 6.212032235540772e-05, + "loss": 1.6607, + "step": 2435 + }, + { + "epoch": 0.44243648829659227, + "grad_norm": 0.8351730704307556, + "learning_rate": 6.209163147631689e-05, + "loss": 1.969, + "step": 2436 + }, + { + "epoch": 0.4426181124707699, + "grad_norm": 0.9061951637268066, + "learning_rate": 6.206293636792599e-05, + "loss": 1.978, + "step": 2437 + }, + { + "epoch": 0.44279973664494743, + "grad_norm": 0.44693824648857117, + "learning_rate": 6.203423704027167e-05, + "loss": 1.5579, + "step": 2438 + }, + { + "epoch": 0.44298136081912504, + "grad_norm": 0.5655565857887268, + "learning_rate": 6.200553350339218e-05, + "loss": 1.9517, + "step": 2439 + }, + { + "epoch": 0.4431629849933026, + "grad_norm": 0.45996424555778503, + "learning_rate": 6.197682576732713e-05, + "loss": 1.7876, + "step": 2440 + }, + { + "epoch": 0.4433446091674802, + "grad_norm": 0.3130119740962982, + "learning_rate": 6.194811384211768e-05, + "loss": 1.8374, + "step": 2441 + }, + { + "epoch": 0.44352623334165775, + "grad_norm": 0.4193412959575653, + "learning_rate": 6.191939773780642e-05, + "loss": 1.7432, + "step": 2442 + }, + { + "epoch": 0.44370785751583536, + "grad_norm": 0.396968275308609, + "learning_rate": 6.189067746443739e-05, + "loss": 1.9854, + "step": 2443 + }, + { + "epoch": 0.44388948169001297, + "grad_norm": 0.6756458878517151, + "learning_rate": 6.186195303205613e-05, + "loss": 1.8951, + "step": 2444 + }, + { + "epoch": 0.4440711058641905, + "grad_norm": 0.3834454417228699, + "learning_rate": 6.183322445070958e-05, + "loss": 1.818, + "step": 2445 + }, + { + "epoch": 0.4442527300383681, + "grad_norm": 0.38103610277175903, + "learning_rate": 6.180449173044619e-05, + "loss": 1.657, + "step": 2446 + }, + { + "epoch": 0.4444343542125457, + "grad_norm": 0.35326722264289856, + "learning_rate": 6.17757548813158e-05, + "loss": 1.7231, + "step": 2447 + }, + { + "epoch": 0.4446159783867233, + "grad_norm": 0.5894297361373901, + "learning_rate": 6.174701391336973e-05, + "loss": 1.9493, + "step": 2448 + }, + { + "epoch": 0.44479760256090084, + "grad_norm": 0.5921090841293335, + "learning_rate": 6.171826883666074e-05, + "loss": 1.6245, + "step": 2449 + }, + { + "epoch": 0.44497922673507845, + "grad_norm": 0.45089468359947205, + "learning_rate": 6.168951966124303e-05, + "loss": 1.7685, + "step": 2450 + }, + { + "epoch": 0.445160850909256, + "grad_norm": 0.40302541851997375, + "learning_rate": 6.166076639717218e-05, + "loss": 1.707, + "step": 2451 + }, + { + "epoch": 0.4453424750834336, + "grad_norm": 0.3937315344810486, + "learning_rate": 6.16320090545053e-05, + "loss": 1.7377, + "step": 2452 + }, + { + "epoch": 0.44552409925761116, + "grad_norm": 0.40113329887390137, + "learning_rate": 6.160324764330083e-05, + "loss": 1.6123, + "step": 2453 + }, + { + "epoch": 0.44570572343178877, + "grad_norm": 1.1789071559906006, + "learning_rate": 6.157448217361869e-05, + "loss": 1.9311, + "step": 2454 + }, + { + "epoch": 0.4458873476059664, + "grad_norm": 2.255734443664551, + "learning_rate": 6.154571265552019e-05, + "loss": 1.7224, + "step": 2455 + }, + { + "epoch": 0.44606897178014393, + "grad_norm": 0.5961069464683533, + "learning_rate": 6.151693909906808e-05, + "loss": 1.8341, + "step": 2456 + }, + { + "epoch": 0.44625059595432154, + "grad_norm": 0.4627784788608551, + "learning_rate": 6.14881615143265e-05, + "loss": 1.8042, + "step": 2457 + }, + { + "epoch": 0.4464322201284991, + "grad_norm": 0.37597742676734924, + "learning_rate": 6.145937991136102e-05, + "loss": 1.8158, + "step": 2458 + }, + { + "epoch": 0.4466138443026767, + "grad_norm": 0.6265427470207214, + "learning_rate": 6.143059430023862e-05, + "loss": 1.6132, + "step": 2459 + }, + { + "epoch": 0.44679546847685425, + "grad_norm": 0.6470116376876831, + "learning_rate": 6.140180469102761e-05, + "loss": 1.6342, + "step": 2460 + }, + { + "epoch": 0.44697709265103186, + "grad_norm": 0.3816601634025574, + "learning_rate": 6.137301109379783e-05, + "loss": 1.5984, + "step": 2461 + }, + { + "epoch": 0.4471587168252094, + "grad_norm": 0.3855556845664978, + "learning_rate": 6.13442135186204e-05, + "loss": 1.8602, + "step": 2462 + }, + { + "epoch": 0.447340340999387, + "grad_norm": 0.5694381594657898, + "learning_rate": 6.131541197556788e-05, + "loss": 1.7719, + "step": 2463 + }, + { + "epoch": 0.44752196517356463, + "grad_norm": 0.32834741473197937, + "learning_rate": 6.128660647471421e-05, + "loss": 1.6701, + "step": 2464 + }, + { + "epoch": 0.4477035893477422, + "grad_norm": 0.3885763883590698, + "learning_rate": 6.125779702613471e-05, + "loss": 1.6793, + "step": 2465 + }, + { + "epoch": 0.4478852135219198, + "grad_norm": 0.4274684488773346, + "learning_rate": 6.122898363990608e-05, + "loss": 1.6911, + "step": 2466 + }, + { + "epoch": 0.44806683769609734, + "grad_norm": 0.42507362365722656, + "learning_rate": 6.120016632610641e-05, + "loss": 1.5916, + "step": 2467 + }, + { + "epoch": 0.44824846187027495, + "grad_norm": 0.4668820798397064, + "learning_rate": 6.117134509481517e-05, + "loss": 1.7714, + "step": 2468 + }, + { + "epoch": 0.4484300860444525, + "grad_norm": 0.5645016431808472, + "learning_rate": 6.114251995611315e-05, + "loss": 1.7872, + "step": 2469 + }, + { + "epoch": 0.4486117102186301, + "grad_norm": 0.33259811997413635, + "learning_rate": 6.111369092008259e-05, + "loss": 1.8036, + "step": 2470 + }, + { + "epoch": 0.44879333439280766, + "grad_norm": 0.380506694316864, + "learning_rate": 6.108485799680701e-05, + "loss": 1.8078, + "step": 2471 + }, + { + "epoch": 0.44897495856698527, + "grad_norm": 0.4594610929489136, + "learning_rate": 6.105602119637134e-05, + "loss": 2.0313, + "step": 2472 + }, + { + "epoch": 0.4491565827411628, + "grad_norm": 0.8625211715698242, + "learning_rate": 6.1027180528861835e-05, + "loss": 1.8422, + "step": 2473 + }, + { + "epoch": 0.44933820691534043, + "grad_norm": 0.3555734157562256, + "learning_rate": 6.099833600436615e-05, + "loss": 1.9051, + "step": 2474 + }, + { + "epoch": 0.44951983108951804, + "grad_norm": 0.9155495762825012, + "learning_rate": 6.0969487632973245e-05, + "loss": 1.7569, + "step": 2475 + }, + { + "epoch": 0.4497014552636956, + "grad_norm": 0.3986756503582001, + "learning_rate": 6.0940635424773416e-05, + "loss": 1.6871, + "step": 2476 + }, + { + "epoch": 0.4498830794378732, + "grad_norm": 0.4267003536224365, + "learning_rate": 6.091177938985836e-05, + "loss": 1.7859, + "step": 2477 + }, + { + "epoch": 0.45006470361205075, + "grad_norm": 0.40857309103012085, + "learning_rate": 6.088291953832107e-05, + "loss": 1.8281, + "step": 2478 + }, + { + "epoch": 0.45024632778622836, + "grad_norm": 0.32222798466682434, + "learning_rate": 6.0854055880255844e-05, + "loss": 1.7295, + "step": 2479 + }, + { + "epoch": 0.4504279519604059, + "grad_norm": 0.32412323355674744, + "learning_rate": 6.0825188425758396e-05, + "loss": 1.6395, + "step": 2480 + }, + { + "epoch": 0.4506095761345835, + "grad_norm": 0.3946205973625183, + "learning_rate": 6.079631718492569e-05, + "loss": 1.7989, + "step": 2481 + }, + { + "epoch": 0.4507912003087611, + "grad_norm": 0.30933690071105957, + "learning_rate": 6.076744216785606e-05, + "loss": 1.6739, + "step": 2482 + }, + { + "epoch": 0.4509728244829387, + "grad_norm": 0.3473057448863983, + "learning_rate": 6.073856338464914e-05, + "loss": 1.6188, + "step": 2483 + }, + { + "epoch": 0.45115444865711624, + "grad_norm": 0.9501147866249084, + "learning_rate": 6.070968084540588e-05, + "loss": 1.8971, + "step": 2484 + }, + { + "epoch": 0.45133607283129384, + "grad_norm": 0.35653847455978394, + "learning_rate": 6.068079456022855e-05, + "loss": 1.7624, + "step": 2485 + }, + { + "epoch": 0.45151769700547145, + "grad_norm": 0.3123989403247833, + "learning_rate": 6.065190453922074e-05, + "loss": 1.6594, + "step": 2486 + }, + { + "epoch": 0.451699321179649, + "grad_norm": 0.37111198902130127, + "learning_rate": 6.062301079248733e-05, + "loss": 1.9126, + "step": 2487 + }, + { + "epoch": 0.4518809453538266, + "grad_norm": 0.6131197214126587, + "learning_rate": 6.0594113330134505e-05, + "loss": 1.6659, + "step": 2488 + }, + { + "epoch": 0.45206256952800417, + "grad_norm": 0.43477246165275574, + "learning_rate": 6.056521216226978e-05, + "loss": 1.7283, + "step": 2489 + }, + { + "epoch": 0.4522441937021818, + "grad_norm": 0.37334582209587097, + "learning_rate": 6.05363072990019e-05, + "loss": 1.7102, + "step": 2490 + }, + { + "epoch": 0.4524258178763593, + "grad_norm": 0.5178270936012268, + "learning_rate": 6.050739875044098e-05, + "loss": 1.7833, + "step": 2491 + }, + { + "epoch": 0.45260744205053693, + "grad_norm": 0.42796051502227783, + "learning_rate": 6.0478486526698363e-05, + "loss": 1.7048, + "step": 2492 + }, + { + "epoch": 0.4527890662247145, + "grad_norm": 1.2404965162277222, + "learning_rate": 6.044957063788673e-05, + "loss": 1.7848, + "step": 2493 + }, + { + "epoch": 0.4529706903988921, + "grad_norm": 0.5364905595779419, + "learning_rate": 6.0420651094119985e-05, + "loss": 1.9553, + "step": 2494 + }, + { + "epoch": 0.4531523145730697, + "grad_norm": 0.3782094419002533, + "learning_rate": 6.039172790551335e-05, + "loss": 1.5045, + "step": 2495 + }, + { + "epoch": 0.45333393874724726, + "grad_norm": 0.3657017648220062, + "learning_rate": 6.036280108218333e-05, + "loss": 1.7447, + "step": 2496 + }, + { + "epoch": 0.45351556292142486, + "grad_norm": 0.8037468791007996, + "learning_rate": 6.0333870634247645e-05, + "loss": 1.8259, + "step": 2497 + }, + { + "epoch": 0.4536971870956024, + "grad_norm": 0.39461299777030945, + "learning_rate": 6.0304936571825374e-05, + "loss": 1.5067, + "step": 2498 + }, + { + "epoch": 0.45387881126978, + "grad_norm": 0.6715894341468811, + "learning_rate": 6.0275998905036765e-05, + "loss": 1.5824, + "step": 2499 + }, + { + "epoch": 0.4540604354439576, + "grad_norm": 0.3832332193851471, + "learning_rate": 6.02470576440034e-05, + "loss": 1.6662, + "step": 2500 + }, + { + "epoch": 0.4542420596181352, + "grad_norm": 0.41623854637145996, + "learning_rate": 6.021811279884807e-05, + "loss": 1.7172, + "step": 2501 + }, + { + "epoch": 0.45442368379231274, + "grad_norm": 0.4735119938850403, + "learning_rate": 6.018916437969485e-05, + "loss": 1.5831, + "step": 2502 + }, + { + "epoch": 0.45460530796649035, + "grad_norm": 0.4732547402381897, + "learning_rate": 6.016021239666903e-05, + "loss": 1.6833, + "step": 2503 + }, + { + "epoch": 0.4547869321406679, + "grad_norm": 0.4351743161678314, + "learning_rate": 6.01312568598972e-05, + "loss": 1.5621, + "step": 2504 + }, + { + "epoch": 0.4549685563148455, + "grad_norm": 0.5131111741065979, + "learning_rate": 6.0102297779507136e-05, + "loss": 1.7195, + "step": 2505 + }, + { + "epoch": 0.4551501804890231, + "grad_norm": 0.3738043010234833, + "learning_rate": 6.007333516562791e-05, + "loss": 1.937, + "step": 2506 + }, + { + "epoch": 0.45533180466320067, + "grad_norm": 0.4055161476135254, + "learning_rate": 6.0044369028389765e-05, + "loss": 1.7783, + "step": 2507 + }, + { + "epoch": 0.4555134288373783, + "grad_norm": 0.8561174869537354, + "learning_rate": 6.001539937792423e-05, + "loss": 1.7771, + "step": 2508 + }, + { + "epoch": 0.45569505301155583, + "grad_norm": 0.8170810341835022, + "learning_rate": 5.9986426224364056e-05, + "loss": 1.7016, + "step": 2509 + }, + { + "epoch": 0.45587667718573344, + "grad_norm": 0.37274405360221863, + "learning_rate": 5.995744957784316e-05, + "loss": 1.9256, + "step": 2510 + }, + { + "epoch": 0.456058301359911, + "grad_norm": 0.39041775465011597, + "learning_rate": 5.992846944849679e-05, + "loss": 1.8285, + "step": 2511 + }, + { + "epoch": 0.4562399255340886, + "grad_norm": 0.6285026669502258, + "learning_rate": 5.989948584646131e-05, + "loss": 1.8796, + "step": 2512 + }, + { + "epoch": 0.45642154970826615, + "grad_norm": 0.3802514672279358, + "learning_rate": 5.9870498781874365e-05, + "loss": 1.74, + "step": 2513 + }, + { + "epoch": 0.45660317388244376, + "grad_norm": 0.35906240344047546, + "learning_rate": 5.9841508264874746e-05, + "loss": 1.8229, + "step": 2514 + }, + { + "epoch": 0.4567847980566213, + "grad_norm": 0.9665001630783081, + "learning_rate": 5.981251430560253e-05, + "loss": 1.8063, + "step": 2515 + }, + { + "epoch": 0.4569664222307989, + "grad_norm": 0.504569947719574, + "learning_rate": 5.978351691419893e-05, + "loss": 1.6688, + "step": 2516 + }, + { + "epoch": 0.4571480464049765, + "grad_norm": 0.3263058662414551, + "learning_rate": 5.9754516100806423e-05, + "loss": 1.6273, + "step": 2517 + }, + { + "epoch": 0.4573296705791541, + "grad_norm": 0.4843224585056305, + "learning_rate": 5.972551187556862e-05, + "loss": 1.6907, + "step": 2518 + }, + { + "epoch": 0.4575112947533317, + "grad_norm": 0.3492094576358795, + "learning_rate": 5.969650424863034e-05, + "loss": 1.6776, + "step": 2519 + }, + { + "epoch": 0.45769291892750924, + "grad_norm": 0.5201913118362427, + "learning_rate": 5.966749323013764e-05, + "loss": 1.8867, + "step": 2520 + }, + { + "epoch": 0.45787454310168685, + "grad_norm": 0.4940626919269562, + "learning_rate": 5.96384788302377e-05, + "loss": 1.7419, + "step": 2521 + }, + { + "epoch": 0.4580561672758644, + "grad_norm": 0.3463269770145416, + "learning_rate": 5.960946105907893e-05, + "loss": 1.639, + "step": 2522 + }, + { + "epoch": 0.458237791450042, + "grad_norm": 0.43187084794044495, + "learning_rate": 5.958043992681089e-05, + "loss": 1.6687, + "step": 2523 + }, + { + "epoch": 0.45841941562421956, + "grad_norm": 0.33705389499664307, + "learning_rate": 5.9551415443584346e-05, + "loss": 1.6284, + "step": 2524 + }, + { + "epoch": 0.45860103979839717, + "grad_norm": 0.3545244336128235, + "learning_rate": 5.9522387619551166e-05, + "loss": 1.8853, + "step": 2525 + }, + { + "epoch": 0.4587826639725747, + "grad_norm": 0.356995552778244, + "learning_rate": 5.9493356464864504e-05, + "loss": 1.6011, + "step": 2526 + }, + { + "epoch": 0.45896428814675233, + "grad_norm": 0.34074491262435913, + "learning_rate": 5.9464321989678564e-05, + "loss": 1.8814, + "step": 2527 + }, + { + "epoch": 0.45914591232092994, + "grad_norm": 0.5004465579986572, + "learning_rate": 5.9435284204148763e-05, + "loss": 1.6676, + "step": 2528 + }, + { + "epoch": 0.4593275364951075, + "grad_norm": 0.3206934332847595, + "learning_rate": 5.940624311843169e-05, + "loss": 1.5719, + "step": 2529 + }, + { + "epoch": 0.4595091606692851, + "grad_norm": 0.5434625148773193, + "learning_rate": 5.937719874268506e-05, + "loss": 1.8206, + "step": 2530 + }, + { + "epoch": 0.45969078484346265, + "grad_norm": 0.4434613585472107, + "learning_rate": 5.934815108706775e-05, + "loss": 1.5694, + "step": 2531 + }, + { + "epoch": 0.45987240901764026, + "grad_norm": 0.4680669903755188, + "learning_rate": 5.931910016173977e-05, + "loss": 1.7293, + "step": 2532 + }, + { + "epoch": 0.4600540331918178, + "grad_norm": 0.3426927924156189, + "learning_rate": 5.929004597686232e-05, + "loss": 1.6077, + "step": 2533 + }, + { + "epoch": 0.4602356573659954, + "grad_norm": 0.3258073329925537, + "learning_rate": 5.926098854259767e-05, + "loss": 2.0001, + "step": 2534 + }, + { + "epoch": 0.460417281540173, + "grad_norm": 0.33389583230018616, + "learning_rate": 5.9231927869109274e-05, + "loss": 1.5052, + "step": 2535 + }, + { + "epoch": 0.4605989057143506, + "grad_norm": 0.3394566774368286, + "learning_rate": 5.920286396656172e-05, + "loss": 1.7619, + "step": 2536 + }, + { + "epoch": 0.4607805298885282, + "grad_norm": 0.3685675859451294, + "learning_rate": 5.917379684512071e-05, + "loss": 1.7063, + "step": 2537 + }, + { + "epoch": 0.46096215406270574, + "grad_norm": 0.42925500869750977, + "learning_rate": 5.914472651495305e-05, + "loss": 1.6524, + "step": 2538 + }, + { + "epoch": 0.46114377823688335, + "grad_norm": 0.27306675910949707, + "learning_rate": 5.911565298622674e-05, + "loss": 1.6077, + "step": 2539 + }, + { + "epoch": 0.4613254024110609, + "grad_norm": 0.4810583293437958, + "learning_rate": 5.908657626911083e-05, + "loss": 1.8973, + "step": 2540 + }, + { + "epoch": 0.4615070265852385, + "grad_norm": 0.6337254643440247, + "learning_rate": 5.905749637377549e-05, + "loss": 1.8176, + "step": 2541 + }, + { + "epoch": 0.46168865075941606, + "grad_norm": 0.45548516511917114, + "learning_rate": 5.902841331039204e-05, + "loss": 1.6808, + "step": 2542 + }, + { + "epoch": 0.46187027493359367, + "grad_norm": 0.38243281841278076, + "learning_rate": 5.899932708913288e-05, + "loss": 1.7181, + "step": 2543 + }, + { + "epoch": 0.4620518991077712, + "grad_norm": 0.49814197421073914, + "learning_rate": 5.897023772017153e-05, + "loss": 1.6092, + "step": 2544 + }, + { + "epoch": 0.46223352328194883, + "grad_norm": 0.5524522662162781, + "learning_rate": 5.8941145213682594e-05, + "loss": 1.5886, + "step": 2545 + }, + { + "epoch": 0.4624151474561264, + "grad_norm": 0.9081956744194031, + "learning_rate": 5.8912049579841786e-05, + "loss": 1.701, + "step": 2546 + }, + { + "epoch": 0.462596771630304, + "grad_norm": 0.34799322485923767, + "learning_rate": 5.88829508288259e-05, + "loss": 1.5523, + "step": 2547 + }, + { + "epoch": 0.4627783958044816, + "grad_norm": 0.35038360953330994, + "learning_rate": 5.885384897081287e-05, + "loss": 1.7962, + "step": 2548 + }, + { + "epoch": 0.46296001997865915, + "grad_norm": 0.36654090881347656, + "learning_rate": 5.882474401598163e-05, + "loss": 1.7966, + "step": 2549 + }, + { + "epoch": 0.46314164415283676, + "grad_norm": 0.3268367350101471, + "learning_rate": 5.879563597451225e-05, + "loss": 1.6408, + "step": 2550 + }, + { + "epoch": 0.4633232683270143, + "grad_norm": 0.4617006778717041, + "learning_rate": 5.87665248565859e-05, + "loss": 1.7699, + "step": 2551 + }, + { + "epoch": 0.4635048925011919, + "grad_norm": 0.4217093586921692, + "learning_rate": 5.8737410672384794e-05, + "loss": 1.8419, + "step": 2552 + }, + { + "epoch": 0.4636865166753695, + "grad_norm": 0.41762039065361023, + "learning_rate": 5.870829343209221e-05, + "loss": 1.7041, + "step": 2553 + }, + { + "epoch": 0.4638681408495471, + "grad_norm": 0.33267512917518616, + "learning_rate": 5.867917314589252e-05, + "loss": 1.9666, + "step": 2554 + }, + { + "epoch": 0.46404976502372464, + "grad_norm": 0.450063019990921, + "learning_rate": 5.865004982397115e-05, + "loss": 1.7322, + "step": 2555 + }, + { + "epoch": 0.46423138919790224, + "grad_norm": 0.4956912696361542, + "learning_rate": 5.8620923476514576e-05, + "loss": 1.7077, + "step": 2556 + }, + { + "epoch": 0.4644130133720798, + "grad_norm": 0.6563782095909119, + "learning_rate": 5.859179411371037e-05, + "loss": 1.7561, + "step": 2557 + }, + { + "epoch": 0.4645946375462574, + "grad_norm": 0.4208507835865021, + "learning_rate": 5.856266174574711e-05, + "loss": 1.6288, + "step": 2558 + }, + { + "epoch": 0.464776261720435, + "grad_norm": 0.8876340985298157, + "learning_rate": 5.853352638281446e-05, + "loss": 1.7082, + "step": 2559 + }, + { + "epoch": 0.46495788589461257, + "grad_norm": 0.654487133026123, + "learning_rate": 5.850438803510312e-05, + "loss": 1.6397, + "step": 2560 + }, + { + "epoch": 0.4651395100687902, + "grad_norm": 0.5370750427246094, + "learning_rate": 5.847524671280484e-05, + "loss": 1.9079, + "step": 2561 + }, + { + "epoch": 0.4653211342429677, + "grad_norm": 0.4445587396621704, + "learning_rate": 5.8446102426112394e-05, + "loss": 1.8073, + "step": 2562 + }, + { + "epoch": 0.46550275841714533, + "grad_norm": 0.416720986366272, + "learning_rate": 5.84169551852196e-05, + "loss": 1.6642, + "step": 2563 + }, + { + "epoch": 0.4656843825913229, + "grad_norm": 0.29684385657310486, + "learning_rate": 5.838780500032133e-05, + "loss": 1.8566, + "step": 2564 + }, + { + "epoch": 0.4658660067655005, + "grad_norm": 0.4545023441314697, + "learning_rate": 5.835865188161346e-05, + "loss": 1.7393, + "step": 2565 + }, + { + "epoch": 0.46604763093967805, + "grad_norm": 0.3288765549659729, + "learning_rate": 5.832949583929289e-05, + "loss": 1.8895, + "step": 2566 + }, + { + "epoch": 0.46622925511385566, + "grad_norm": 0.3366550803184509, + "learning_rate": 5.830033688355757e-05, + "loss": 1.5972, + "step": 2567 + }, + { + "epoch": 0.46641087928803326, + "grad_norm": 0.39360249042510986, + "learning_rate": 5.827117502460644e-05, + "loss": 1.6574, + "step": 2568 + }, + { + "epoch": 0.4665925034622108, + "grad_norm": 0.3341945707798004, + "learning_rate": 5.824201027263948e-05, + "loss": 1.8313, + "step": 2569 + }, + { + "epoch": 0.4667741276363884, + "grad_norm": 0.5761505365371704, + "learning_rate": 5.821284263785767e-05, + "loss": 1.7319, + "step": 2570 + }, + { + "epoch": 0.466955751810566, + "grad_norm": 0.3439394235610962, + "learning_rate": 5.818367213046298e-05, + "loss": 1.7843, + "step": 2571 + }, + { + "epoch": 0.4671373759847436, + "grad_norm": 0.5826895236968994, + "learning_rate": 5.815449876065842e-05, + "loss": 1.6917, + "step": 2572 + }, + { + "epoch": 0.46731900015892114, + "grad_norm": 0.34965792298316956, + "learning_rate": 5.8125322538647974e-05, + "loss": 1.6514, + "step": 2573 + }, + { + "epoch": 0.46750062433309875, + "grad_norm": 0.4185847043991089, + "learning_rate": 5.809614347463665e-05, + "loss": 1.9274, + "step": 2574 + }, + { + "epoch": 0.4676822485072763, + "grad_norm": 0.34972187876701355, + "learning_rate": 5.8066961578830405e-05, + "loss": 1.7051, + "step": 2575 + }, + { + "epoch": 0.4678638726814539, + "grad_norm": 0.2878931164741516, + "learning_rate": 5.803777686143626e-05, + "loss": 1.7521, + "step": 2576 + }, + { + "epoch": 0.46804549685563146, + "grad_norm": 1.0576481819152832, + "learning_rate": 5.8008589332662136e-05, + "loss": 1.9222, + "step": 2577 + }, + { + "epoch": 0.46822712102980907, + "grad_norm": 0.5341198444366455, + "learning_rate": 5.797939900271697e-05, + "loss": 1.9363, + "step": 2578 + }, + { + "epoch": 0.4684087452039867, + "grad_norm": 0.4173814654350281, + "learning_rate": 5.795020588181075e-05, + "loss": 1.7124, + "step": 2579 + }, + { + "epoch": 0.46859036937816423, + "grad_norm": 0.47301238775253296, + "learning_rate": 5.792100998015432e-05, + "loss": 1.662, + "step": 2580 + }, + { + "epoch": 0.46877199355234184, + "grad_norm": 0.3419872224330902, + "learning_rate": 5.7891811307959574e-05, + "loss": 1.6318, + "step": 2581 + }, + { + "epoch": 0.4689536177265194, + "grad_norm": 0.34300127625465393, + "learning_rate": 5.786260987543936e-05, + "loss": 1.7538, + "step": 2582 + }, + { + "epoch": 0.469135241900697, + "grad_norm": 0.40087971091270447, + "learning_rate": 5.7833405692807493e-05, + "loss": 1.871, + "step": 2583 + }, + { + "epoch": 0.46931686607487455, + "grad_norm": 0.5803071856498718, + "learning_rate": 5.780419877027872e-05, + "loss": 1.8812, + "step": 2584 + }, + { + "epoch": 0.46949849024905216, + "grad_norm": 0.3707352876663208, + "learning_rate": 5.77749891180688e-05, + "loss": 1.7746, + "step": 2585 + }, + { + "epoch": 0.4696801144232297, + "grad_norm": 1.219076156616211, + "learning_rate": 5.774577674639441e-05, + "loss": 1.8943, + "step": 2586 + }, + { + "epoch": 0.4698617385974073, + "grad_norm": 0.5204890370368958, + "learning_rate": 5.7716561665473165e-05, + "loss": 1.6908, + "step": 2587 + }, + { + "epoch": 0.47004336277158487, + "grad_norm": 0.6196826100349426, + "learning_rate": 5.768734388552368e-05, + "loss": 1.8811, + "step": 2588 + }, + { + "epoch": 0.4702249869457625, + "grad_norm": 0.40200942754745483, + "learning_rate": 5.7658123416765464e-05, + "loss": 1.6479, + "step": 2589 + }, + { + "epoch": 0.4704066111199401, + "grad_norm": 0.31278547644615173, + "learning_rate": 5.762890026941898e-05, + "loss": 1.6518, + "step": 2590 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.6924257278442383, + "learning_rate": 5.759967445370564e-05, + "loss": 1.9108, + "step": 2591 + }, + { + "epoch": 0.47076985946829525, + "grad_norm": 1.4424809217453003, + "learning_rate": 5.75704459798478e-05, + "loss": 1.7538, + "step": 2592 + }, + { + "epoch": 0.4709514836424728, + "grad_norm": 0.38621243834495544, + "learning_rate": 5.7541214858068705e-05, + "loss": 1.6963, + "step": 2593 + }, + { + "epoch": 0.4711331078166504, + "grad_norm": 0.4075799286365509, + "learning_rate": 5.751198109859254e-05, + "loss": 1.7448, + "step": 2594 + }, + { + "epoch": 0.47131473199082796, + "grad_norm": 0.323247492313385, + "learning_rate": 5.7482744711644446e-05, + "loss": 1.8165, + "step": 2595 + }, + { + "epoch": 0.47149635616500557, + "grad_norm": 0.3995501399040222, + "learning_rate": 5.745350570745045e-05, + "loss": 1.8812, + "step": 2596 + }, + { + "epoch": 0.4716779803391831, + "grad_norm": 0.34212103486061096, + "learning_rate": 5.742426409623749e-05, + "loss": 1.7201, + "step": 2597 + }, + { + "epoch": 0.47185960451336073, + "grad_norm": 0.3639693856239319, + "learning_rate": 5.739501988823346e-05, + "loss": 1.8507, + "step": 2598 + }, + { + "epoch": 0.47204122868753834, + "grad_norm": 0.40119677782058716, + "learning_rate": 5.73657730936671e-05, + "loss": 1.8284, + "step": 2599 + }, + { + "epoch": 0.4722228528617159, + "grad_norm": 0.3793904781341553, + "learning_rate": 5.733652372276809e-05, + "loss": 1.6453, + "step": 2600 + }, + { + "epoch": 0.4724044770358935, + "grad_norm": 0.6714252233505249, + "learning_rate": 5.7307271785767034e-05, + "loss": 1.833, + "step": 2601 + }, + { + "epoch": 0.47258610121007105, + "grad_norm": 0.44084471464157104, + "learning_rate": 5.727801729289537e-05, + "loss": 1.8066, + "step": 2602 + }, + { + "epoch": 0.47276772538424866, + "grad_norm": 0.7439478635787964, + "learning_rate": 5.724876025438549e-05, + "loss": 1.7297, + "step": 2603 + }, + { + "epoch": 0.4729493495584262, + "grad_norm": 0.699898898601532, + "learning_rate": 5.721950068047065e-05, + "loss": 1.7457, + "step": 2604 + }, + { + "epoch": 0.4731309737326038, + "grad_norm": 0.6715090870857239, + "learning_rate": 5.7190238581384994e-05, + "loss": 1.948, + "step": 2605 + }, + { + "epoch": 0.4733125979067814, + "grad_norm": 0.401113897562027, + "learning_rate": 5.716097396736354e-05, + "loss": 1.8577, + "step": 2606 + }, + { + "epoch": 0.473494222080959, + "grad_norm": 0.3589351773262024, + "learning_rate": 5.713170684864222e-05, + "loss": 1.7012, + "step": 2607 + }, + { + "epoch": 0.47367584625513653, + "grad_norm": 0.7842576503753662, + "learning_rate": 5.71024372354578e-05, + "loss": 1.675, + "step": 2608 + }, + { + "epoch": 0.47385747042931414, + "grad_norm": 0.3716539144515991, + "learning_rate": 5.7073165138047924e-05, + "loss": 1.6087, + "step": 2609 + }, + { + "epoch": 0.47403909460349175, + "grad_norm": 0.3986756205558777, + "learning_rate": 5.704389056665116e-05, + "loss": 1.4657, + "step": 2610 + }, + { + "epoch": 0.4742207187776693, + "grad_norm": 0.4061032831668854, + "learning_rate": 5.701461353150687e-05, + "loss": 1.6322, + "step": 2611 + }, + { + "epoch": 0.4744023429518469, + "grad_norm": 0.5833276510238647, + "learning_rate": 5.698533404285531e-05, + "loss": 1.842, + "step": 2612 + }, + { + "epoch": 0.47458396712602446, + "grad_norm": 0.7427014708518982, + "learning_rate": 5.695605211093758e-05, + "loss": 2.0144, + "step": 2613 + }, + { + "epoch": 0.47476559130020207, + "grad_norm": 0.47940415143966675, + "learning_rate": 5.692676774599569e-05, + "loss": 1.7989, + "step": 2614 + }, + { + "epoch": 0.4749472154743796, + "grad_norm": 0.35378071665763855, + "learning_rate": 5.6897480958272396e-05, + "loss": 1.5511, + "step": 2615 + }, + { + "epoch": 0.47512883964855723, + "grad_norm": 0.42513006925582886, + "learning_rate": 5.6868191758011425e-05, + "loss": 1.6716, + "step": 2616 + }, + { + "epoch": 0.4753104638227348, + "grad_norm": 0.33174529671669006, + "learning_rate": 5.683890015545723e-05, + "loss": 1.8067, + "step": 2617 + }, + { + "epoch": 0.4754920879969124, + "grad_norm": 0.2803780734539032, + "learning_rate": 5.680960616085519e-05, + "loss": 1.747, + "step": 2618 + }, + { + "epoch": 0.47567371217108995, + "grad_norm": 0.4567261338233948, + "learning_rate": 5.678030978445148e-05, + "loss": 1.7066, + "step": 2619 + }, + { + "epoch": 0.47585533634526755, + "grad_norm": 0.42615506052970886, + "learning_rate": 5.675101103649313e-05, + "loss": 1.7468, + "step": 2620 + }, + { + "epoch": 0.47603696051944516, + "grad_norm": 0.4551263153553009, + "learning_rate": 5.6721709927227974e-05, + "loss": 1.9956, + "step": 2621 + }, + { + "epoch": 0.4762185846936227, + "grad_norm": 0.5006154179573059, + "learning_rate": 5.669240646690469e-05, + "loss": 1.4407, + "step": 2622 + }, + { + "epoch": 0.4764002088678003, + "grad_norm": 0.31792908906936646, + "learning_rate": 5.666310066577277e-05, + "loss": 1.6236, + "step": 2623 + }, + { + "epoch": 0.4765818330419779, + "grad_norm": 0.3417931795120239, + "learning_rate": 5.663379253408254e-05, + "loss": 1.7822, + "step": 2624 + }, + { + "epoch": 0.4767634572161555, + "grad_norm": 0.43639707565307617, + "learning_rate": 5.660448208208513e-05, + "loss": 1.6201, + "step": 2625 + }, + { + "epoch": 0.47694508139033304, + "grad_norm": 0.4140772819519043, + "learning_rate": 5.657516932003246e-05, + "loss": 1.7749, + "step": 2626 + }, + { + "epoch": 0.47712670556451064, + "grad_norm": 0.40097710490226746, + "learning_rate": 5.6545854258177287e-05, + "loss": 1.6069, + "step": 2627 + }, + { + "epoch": 0.4773083297386882, + "grad_norm": 0.4223651587963104, + "learning_rate": 5.651653690677319e-05, + "loss": 1.6971, + "step": 2628 + }, + { + "epoch": 0.4774899539128658, + "grad_norm": 0.38230133056640625, + "learning_rate": 5.64872172760745e-05, + "loss": 1.569, + "step": 2629 + }, + { + "epoch": 0.47767157808704336, + "grad_norm": 0.5578054189682007, + "learning_rate": 5.645789537633638e-05, + "loss": 1.8423, + "step": 2630 + }, + { + "epoch": 0.47785320226122097, + "grad_norm": 0.39643850922584534, + "learning_rate": 5.642857121781475e-05, + "loss": 1.6461, + "step": 2631 + }, + { + "epoch": 0.4780348264353986, + "grad_norm": 0.4129710793495178, + "learning_rate": 5.6399244810766385e-05, + "loss": 1.8674, + "step": 2632 + }, + { + "epoch": 0.4782164506095761, + "grad_norm": 0.45838168263435364, + "learning_rate": 5.636991616544878e-05, + "loss": 1.9063, + "step": 2633 + }, + { + "epoch": 0.47839807478375374, + "grad_norm": 0.42233988642692566, + "learning_rate": 5.634058529212024e-05, + "loss": 1.7253, + "step": 2634 + }, + { + "epoch": 0.4785796989579313, + "grad_norm": 0.4370240569114685, + "learning_rate": 5.631125220103987e-05, + "loss": 1.7815, + "step": 2635 + }, + { + "epoch": 0.4787613231321089, + "grad_norm": 0.8381924033164978, + "learning_rate": 5.628191690246751e-05, + "loss": 1.8375, + "step": 2636 + }, + { + "epoch": 0.47894294730628645, + "grad_norm": 0.3237784802913666, + "learning_rate": 5.6252579406663784e-05, + "loss": 1.744, + "step": 2637 + }, + { + "epoch": 0.47912457148046406, + "grad_norm": 0.3706619441509247, + "learning_rate": 5.622323972389013e-05, + "loss": 1.7852, + "step": 2638 + }, + { + "epoch": 0.4793061956546416, + "grad_norm": 0.34983178973197937, + "learning_rate": 5.6193897864408686e-05, + "loss": 1.7955, + "step": 2639 + }, + { + "epoch": 0.4794878198288192, + "grad_norm": 0.3924939036369324, + "learning_rate": 5.616455383848237e-05, + "loss": 1.7941, + "step": 2640 + }, + { + "epoch": 0.4796694440029968, + "grad_norm": 0.3438774645328522, + "learning_rate": 5.613520765637489e-05, + "loss": 1.7966, + "step": 2641 + }, + { + "epoch": 0.4798510681771744, + "grad_norm": 0.40586915612220764, + "learning_rate": 5.610585932835067e-05, + "loss": 1.8819, + "step": 2642 + }, + { + "epoch": 0.480032692351352, + "grad_norm": 0.4116702079772949, + "learning_rate": 5.60765088646749e-05, + "loss": 1.5039, + "step": 2643 + }, + { + "epoch": 0.48021431652552954, + "grad_norm": 0.37006744742393494, + "learning_rate": 5.604715627561353e-05, + "loss": 1.6931, + "step": 2644 + }, + { + "epoch": 0.48039594069970715, + "grad_norm": 0.3480381369590759, + "learning_rate": 5.601780157143323e-05, + "loss": 1.8085, + "step": 2645 + }, + { + "epoch": 0.4805775648738847, + "grad_norm": 0.35109370946884155, + "learning_rate": 5.59884447624014e-05, + "loss": 1.3842, + "step": 2646 + }, + { + "epoch": 0.4807591890480623, + "grad_norm": 0.4702397286891937, + "learning_rate": 5.5959085858786244e-05, + "loss": 1.6776, + "step": 2647 + }, + { + "epoch": 0.48094081322223986, + "grad_norm": 0.3446997106075287, + "learning_rate": 5.5929724870856616e-05, + "loss": 1.6328, + "step": 2648 + }, + { + "epoch": 0.48112243739641747, + "grad_norm": 0.40302202105522156, + "learning_rate": 5.590036180888212e-05, + "loss": 1.6459, + "step": 2649 + }, + { + "epoch": 0.481304061570595, + "grad_norm": 0.3687279224395752, + "learning_rate": 5.5870996683133126e-05, + "loss": 1.765, + "step": 2650 + }, + { + "epoch": 0.48148568574477263, + "grad_norm": 0.45990726351737976, + "learning_rate": 5.58416295038807e-05, + "loss": 1.6285, + "step": 2651 + }, + { + "epoch": 0.48166730991895024, + "grad_norm": 0.25262874364852905, + "learning_rate": 5.581226028139661e-05, + "loss": 1.8076, + "step": 2652 + }, + { + "epoch": 0.4818489340931278, + "grad_norm": 1.241579294204712, + "learning_rate": 5.5782889025953355e-05, + "loss": 1.7148, + "step": 2653 + }, + { + "epoch": 0.4820305582673054, + "grad_norm": 0.38248828053474426, + "learning_rate": 5.575351574782415e-05, + "loss": 1.8923, + "step": 2654 + }, + { + "epoch": 0.48221218244148295, + "grad_norm": 1.379368782043457, + "learning_rate": 5.57241404572829e-05, + "loss": 1.8407, + "step": 2655 + }, + { + "epoch": 0.48239380661566056, + "grad_norm": 0.5920520424842834, + "learning_rate": 5.5694763164604244e-05, + "loss": 1.9931, + "step": 2656 + }, + { + "epoch": 0.4825754307898381, + "grad_norm": 0.417731374502182, + "learning_rate": 5.56653838800635e-05, + "loss": 1.5503, + "step": 2657 + }, + { + "epoch": 0.4827570549640157, + "grad_norm": 0.46463072299957275, + "learning_rate": 5.563600261393667e-05, + "loss": 1.6891, + "step": 2658 + }, + { + "epoch": 0.48293867913819327, + "grad_norm": 0.4815327525138855, + "learning_rate": 5.560661937650047e-05, + "loss": 1.6877, + "step": 2659 + }, + { + "epoch": 0.4831203033123709, + "grad_norm": 0.37310361862182617, + "learning_rate": 5.55772341780323e-05, + "loss": 1.7804, + "step": 2660 + }, + { + "epoch": 0.48330192748654843, + "grad_norm": 0.34655681252479553, + "learning_rate": 5.554784702881025e-05, + "loss": 1.7273, + "step": 2661 + }, + { + "epoch": 0.48348355166072604, + "grad_norm": 0.9903326630592346, + "learning_rate": 5.5518457939113075e-05, + "loss": 1.6611, + "step": 2662 + }, + { + "epoch": 0.48366517583490365, + "grad_norm": 0.291898638010025, + "learning_rate": 5.548906691922024e-05, + "loss": 1.6538, + "step": 2663 + }, + { + "epoch": 0.4838468000090812, + "grad_norm": 1.9235401153564453, + "learning_rate": 5.5459673979411864e-05, + "loss": 1.8776, + "step": 2664 + }, + { + "epoch": 0.4840284241832588, + "grad_norm": 0.38301882147789, + "learning_rate": 5.543027912996872e-05, + "loss": 1.7572, + "step": 2665 + }, + { + "epoch": 0.48421004835743636, + "grad_norm": 0.40141576528549194, + "learning_rate": 5.540088238117229e-05, + "loss": 1.8794, + "step": 2666 + }, + { + "epoch": 0.48439167253161397, + "grad_norm": 0.4457523822784424, + "learning_rate": 5.53714837433047e-05, + "loss": 1.8913, + "step": 2667 + }, + { + "epoch": 0.4845732967057915, + "grad_norm": 0.35130220651626587, + "learning_rate": 5.534208322664871e-05, + "loss": 1.6205, + "step": 2668 + }, + { + "epoch": 0.48475492087996913, + "grad_norm": 0.5073100924491882, + "learning_rate": 5.53126808414878e-05, + "loss": 1.9216, + "step": 2669 + }, + { + "epoch": 0.4849365450541467, + "grad_norm": 0.49080130457878113, + "learning_rate": 5.528327659810605e-05, + "loss": 1.7219, + "step": 2670 + }, + { + "epoch": 0.4851181692283243, + "grad_norm": 0.5346786975860596, + "learning_rate": 5.525387050678819e-05, + "loss": 1.5435, + "step": 2671 + }, + { + "epoch": 0.4852997934025019, + "grad_norm": 0.7149062752723694, + "learning_rate": 5.522446257781965e-05, + "loss": 1.7696, + "step": 2672 + }, + { + "epoch": 0.48548141757667945, + "grad_norm": 0.4365018606185913, + "learning_rate": 5.519505282148644e-05, + "loss": 1.4964, + "step": 2673 + }, + { + "epoch": 0.48566304175085706, + "grad_norm": 0.3724755048751831, + "learning_rate": 5.516564124807522e-05, + "loss": 1.6802, + "step": 2674 + }, + { + "epoch": 0.4858446659250346, + "grad_norm": 0.9912995100021362, + "learning_rate": 5.513622786787335e-05, + "loss": 1.8903, + "step": 2675 + }, + { + "epoch": 0.4860262900992122, + "grad_norm": 0.35261255502700806, + "learning_rate": 5.510681269116873e-05, + "loss": 1.9221, + "step": 2676 + }, + { + "epoch": 0.4862079142733898, + "grad_norm": 0.5018541216850281, + "learning_rate": 5.507739572824995e-05, + "loss": 1.7803, + "step": 2677 + }, + { + "epoch": 0.4863895384475674, + "grad_norm": 0.5001533627510071, + "learning_rate": 5.504797698940619e-05, + "loss": 1.4867, + "step": 2678 + }, + { + "epoch": 0.48657116262174493, + "grad_norm": 1.6012896299362183, + "learning_rate": 5.50185564849273e-05, + "loss": 1.7597, + "step": 2679 + }, + { + "epoch": 0.48675278679592254, + "grad_norm": 0.5943880081176758, + "learning_rate": 5.4989134225103666e-05, + "loss": 1.8699, + "step": 2680 + }, + { + "epoch": 0.4869344109701001, + "grad_norm": 0.3092082440853119, + "learning_rate": 5.495971022022638e-05, + "loss": 1.5874, + "step": 2681 + }, + { + "epoch": 0.4871160351442777, + "grad_norm": 0.3420947790145874, + "learning_rate": 5.493028448058708e-05, + "loss": 1.6601, + "step": 2682 + }, + { + "epoch": 0.4872976593184553, + "grad_norm": 0.3592749536037445, + "learning_rate": 5.490085701647805e-05, + "loss": 1.8634, + "step": 2683 + }, + { + "epoch": 0.48747928349263286, + "grad_norm": 0.4408836364746094, + "learning_rate": 5.4871427838192124e-05, + "loss": 1.619, + "step": 2684 + }, + { + "epoch": 0.4876609076668105, + "grad_norm": 0.2642973065376282, + "learning_rate": 5.484199695602279e-05, + "loss": 1.541, + "step": 2685 + }, + { + "epoch": 0.487842531840988, + "grad_norm": 0.41182941198349, + "learning_rate": 5.481256438026412e-05, + "loss": 1.5318, + "step": 2686 + }, + { + "epoch": 0.48802415601516563, + "grad_norm": 0.3876485824584961, + "learning_rate": 5.478313012121077e-05, + "loss": 1.7176, + "step": 2687 + }, + { + "epoch": 0.4882057801893432, + "grad_norm": 0.3642648160457611, + "learning_rate": 5.4753694189157976e-05, + "loss": 1.6678, + "step": 2688 + }, + { + "epoch": 0.4883874043635208, + "grad_norm": 0.36821502447128296, + "learning_rate": 5.472425659440157e-05, + "loss": 1.6613, + "step": 2689 + }, + { + "epoch": 0.48856902853769835, + "grad_norm": 0.31391987204551697, + "learning_rate": 5.469481734723797e-05, + "loss": 1.7065, + "step": 2690 + }, + { + "epoch": 0.48875065271187595, + "grad_norm": 0.6137616038322449, + "learning_rate": 5.466537645796416e-05, + "loss": 1.7823, + "step": 2691 + }, + { + "epoch": 0.4889322768860535, + "grad_norm": 0.4284909963607788, + "learning_rate": 5.463593393687771e-05, + "loss": 1.8421, + "step": 2692 + }, + { + "epoch": 0.4891139010602311, + "grad_norm": 0.34355640411376953, + "learning_rate": 5.4606489794276736e-05, + "loss": 1.7299, + "step": 2693 + }, + { + "epoch": 0.4892955252344087, + "grad_norm": 0.28843599557876587, + "learning_rate": 5.457704404045998e-05, + "loss": 1.6867, + "step": 2694 + }, + { + "epoch": 0.4894771494085863, + "grad_norm": 0.3358434736728668, + "learning_rate": 5.4547596685726685e-05, + "loss": 1.607, + "step": 2695 + }, + { + "epoch": 0.4896587735827639, + "grad_norm": 0.3860785961151123, + "learning_rate": 5.451814774037666e-05, + "loss": 1.7383, + "step": 2696 + }, + { + "epoch": 0.48984039775694144, + "grad_norm": 0.3157297670841217, + "learning_rate": 5.448869721471033e-05, + "loss": 1.7343, + "step": 2697 + }, + { + "epoch": 0.49002202193111905, + "grad_norm": 0.5644932985305786, + "learning_rate": 5.445924511902858e-05, + "loss": 1.897, + "step": 2698 + }, + { + "epoch": 0.4902036461052966, + "grad_norm": 0.7902799844741821, + "learning_rate": 5.442979146363293e-05, + "loss": 1.8249, + "step": 2699 + }, + { + "epoch": 0.4903852702794742, + "grad_norm": 0.7166121006011963, + "learning_rate": 5.440033625882539e-05, + "loss": 1.5999, + "step": 2700 + }, + { + "epoch": 0.49056689445365176, + "grad_norm": 0.3844698965549469, + "learning_rate": 5.437087951490856e-05, + "loss": 1.8642, + "step": 2701 + }, + { + "epoch": 0.49074851862782937, + "grad_norm": 0.6881653666496277, + "learning_rate": 5.4341421242185495e-05, + "loss": 1.6257, + "step": 2702 + }, + { + "epoch": 0.490930142802007, + "grad_norm": 0.6296918988227844, + "learning_rate": 5.431196145095991e-05, + "loss": 1.8427, + "step": 2703 + }, + { + "epoch": 0.4911117669761845, + "grad_norm": 0.3402605354785919, + "learning_rate": 5.428250015153593e-05, + "loss": 1.759, + "step": 2704 + }, + { + "epoch": 0.49129339115036214, + "grad_norm": 0.7583489418029785, + "learning_rate": 5.425303735421828e-05, + "loss": 1.7263, + "step": 2705 + }, + { + "epoch": 0.4914750153245397, + "grad_norm": 0.30228927731513977, + "learning_rate": 5.4223573069312184e-05, + "loss": 1.6521, + "step": 2706 + }, + { + "epoch": 0.4916566394987173, + "grad_norm": 0.3075658977031708, + "learning_rate": 5.419410730712339e-05, + "loss": 1.6865, + "step": 2707 + }, + { + "epoch": 0.49183826367289485, + "grad_norm": 0.367009699344635, + "learning_rate": 5.416464007795815e-05, + "loss": 1.7765, + "step": 2708 + }, + { + "epoch": 0.49201988784707246, + "grad_norm": 0.29035261273384094, + "learning_rate": 5.413517139212326e-05, + "loss": 1.605, + "step": 2709 + }, + { + "epoch": 0.49220151202125, + "grad_norm": 0.4735918939113617, + "learning_rate": 5.4105701259926e-05, + "loss": 1.8224, + "step": 2710 + }, + { + "epoch": 0.4923831361954276, + "grad_norm": 0.3734782934188843, + "learning_rate": 5.4076229691674164e-05, + "loss": 1.6862, + "step": 2711 + }, + { + "epoch": 0.49256476036960517, + "grad_norm": 0.5866392254829407, + "learning_rate": 5.4046756697676026e-05, + "loss": 1.7678, + "step": 2712 + }, + { + "epoch": 0.4927463845437828, + "grad_norm": 0.5155999660491943, + "learning_rate": 5.401728228824041e-05, + "loss": 1.7142, + "step": 2713 + }, + { + "epoch": 0.4929280087179604, + "grad_norm": 0.521765947341919, + "learning_rate": 5.3987806473676594e-05, + "loss": 1.768, + "step": 2714 + }, + { + "epoch": 0.49310963289213794, + "grad_norm": 1.2853446006774902, + "learning_rate": 5.395832926429435e-05, + "loss": 1.9386, + "step": 2715 + }, + { + "epoch": 0.49329125706631555, + "grad_norm": 0.3297964632511139, + "learning_rate": 5.392885067040397e-05, + "loss": 1.7476, + "step": 2716 + }, + { + "epoch": 0.4934728812404931, + "grad_norm": 0.41895636916160583, + "learning_rate": 5.389937070231619e-05, + "loss": 1.7587, + "step": 2717 + }, + { + "epoch": 0.4936545054146707, + "grad_norm": 0.3960736095905304, + "learning_rate": 5.386988937034223e-05, + "loss": 1.7551, + "step": 2718 + }, + { + "epoch": 0.49383612958884826, + "grad_norm": 0.39063718914985657, + "learning_rate": 5.384040668479383e-05, + "loss": 1.7843, + "step": 2719 + }, + { + "epoch": 0.49401775376302587, + "grad_norm": 0.3320341408252716, + "learning_rate": 5.3810922655983145e-05, + "loss": 1.6974, + "step": 2720 + }, + { + "epoch": 0.4941993779372034, + "grad_norm": 0.33217349648475647, + "learning_rate": 5.3781437294222845e-05, + "loss": 1.6881, + "step": 2721 + }, + { + "epoch": 0.49438100211138103, + "grad_norm": 0.3247702717781067, + "learning_rate": 5.375195060982604e-05, + "loss": 1.7218, + "step": 2722 + }, + { + "epoch": 0.4945626262855586, + "grad_norm": 0.8122655153274536, + "learning_rate": 5.372246261310634e-05, + "loss": 1.8187, + "step": 2723 + }, + { + "epoch": 0.4947442504597362, + "grad_norm": 0.3445173501968384, + "learning_rate": 5.3692973314377724e-05, + "loss": 1.7943, + "step": 2724 + }, + { + "epoch": 0.4949258746339138, + "grad_norm": 0.3348889648914337, + "learning_rate": 5.3663482723954774e-05, + "loss": 1.7909, + "step": 2725 + }, + { + "epoch": 0.49510749880809135, + "grad_norm": 0.4341531991958618, + "learning_rate": 5.3633990852152375e-05, + "loss": 1.6002, + "step": 2726 + }, + { + "epoch": 0.49528912298226896, + "grad_norm": 0.37302637100219727, + "learning_rate": 5.360449770928594e-05, + "loss": 1.7485, + "step": 2727 + }, + { + "epoch": 0.4954707471564465, + "grad_norm": 0.4303012192249298, + "learning_rate": 5.357500330567131e-05, + "loss": 1.8266, + "step": 2728 + }, + { + "epoch": 0.4956523713306241, + "grad_norm": 0.3042967617511749, + "learning_rate": 5.3545507651624794e-05, + "loss": 1.6243, + "step": 2729 + }, + { + "epoch": 0.4958339955048017, + "grad_norm": 0.36651867628097534, + "learning_rate": 5.3516010757463057e-05, + "loss": 1.676, + "step": 2730 + }, + { + "epoch": 0.4960156196789793, + "grad_norm": 0.36293599009513855, + "learning_rate": 5.3486512633503303e-05, + "loss": 1.5028, + "step": 2731 + }, + { + "epoch": 0.49619724385315683, + "grad_norm": 0.7723996043205261, + "learning_rate": 5.345701329006311e-05, + "loss": 1.7118, + "step": 2732 + }, + { + "epoch": 0.49637886802733444, + "grad_norm": 0.43266090750694275, + "learning_rate": 5.3427512737460436e-05, + "loss": 1.6889, + "step": 2733 + }, + { + "epoch": 0.496560492201512, + "grad_norm": 0.6108726859092712, + "learning_rate": 5.339801098601379e-05, + "loss": 1.8325, + "step": 2734 + }, + { + "epoch": 0.4967421163756896, + "grad_norm": 0.5286149978637695, + "learning_rate": 5.3368508046041964e-05, + "loss": 2.014, + "step": 2735 + }, + { + "epoch": 0.4969237405498672, + "grad_norm": 0.2946939766407013, + "learning_rate": 5.3339003927864265e-05, + "loss": 1.6866, + "step": 2736 + }, + { + "epoch": 0.49710536472404476, + "grad_norm": 0.4414820671081543, + "learning_rate": 5.3309498641800337e-05, + "loss": 1.7728, + "step": 2737 + }, + { + "epoch": 0.49728698889822237, + "grad_norm": 0.3513832986354828, + "learning_rate": 5.3279992198170313e-05, + "loss": 1.7776, + "step": 2738 + }, + { + "epoch": 0.4974686130723999, + "grad_norm": 0.3432561159133911, + "learning_rate": 5.325048460729465e-05, + "loss": 1.7289, + "step": 2739 + }, + { + "epoch": 0.49765023724657753, + "grad_norm": 0.40738445520401, + "learning_rate": 5.322097587949425e-05, + "loss": 1.9042, + "step": 2740 + }, + { + "epoch": 0.4978318614207551, + "grad_norm": 0.41008618474006653, + "learning_rate": 5.319146602509042e-05, + "loss": 1.88, + "step": 2741 + }, + { + "epoch": 0.4980134855949327, + "grad_norm": 0.43215155601501465, + "learning_rate": 5.316195505440483e-05, + "loss": 1.7619, + "step": 2742 + }, + { + "epoch": 0.49819510976911024, + "grad_norm": 0.4417966604232788, + "learning_rate": 5.3132442977759575e-05, + "loss": 1.6443, + "step": 2743 + }, + { + "epoch": 0.49837673394328785, + "grad_norm": 0.7243943810462952, + "learning_rate": 5.3102929805477106e-05, + "loss": 1.737, + "step": 2744 + }, + { + "epoch": 0.49855835811746546, + "grad_norm": 0.4576661288738251, + "learning_rate": 5.307341554788027e-05, + "loss": 1.759, + "step": 2745 + }, + { + "epoch": 0.498739982291643, + "grad_norm": 0.40402933955192566, + "learning_rate": 5.3043900215292284e-05, + "loss": 1.6163, + "step": 2746 + }, + { + "epoch": 0.4989216064658206, + "grad_norm": 0.409685879945755, + "learning_rate": 5.301438381803679e-05, + "loss": 1.9215, + "step": 2747 + }, + { + "epoch": 0.4991032306399982, + "grad_norm": 0.3290952146053314, + "learning_rate": 5.298486636643771e-05, + "loss": 1.5819, + "step": 2748 + }, + { + "epoch": 0.4992848548141758, + "grad_norm": 0.44784656167030334, + "learning_rate": 5.295534787081943e-05, + "loss": 1.618, + "step": 2749 + }, + { + "epoch": 0.49946647898835334, + "grad_norm": 0.440998911857605, + "learning_rate": 5.292582834150663e-05, + "loss": 1.6607, + "step": 2750 + }, + { + "epoch": 0.49964810316253094, + "grad_norm": 0.44840186834335327, + "learning_rate": 5.289630778882442e-05, + "loss": 1.6878, + "step": 2751 + }, + { + "epoch": 0.4998297273367085, + "grad_norm": 0.5628447532653809, + "learning_rate": 5.286678622309817e-05, + "loss": 1.7366, + "step": 2752 + }, + { + "epoch": 0.500011351510886, + "grad_norm": 0.34844061732292175, + "learning_rate": 5.2837263654653715e-05, + "loss": 1.6455, + "step": 2753 + }, + { + "epoch": 0.5001929756850637, + "grad_norm": 0.3329295814037323, + "learning_rate": 5.280774009381715e-05, + "loss": 1.6057, + "step": 2754 + }, + { + "epoch": 0.5003745998592413, + "grad_norm": 0.39739951491355896, + "learning_rate": 5.2778215550914976e-05, + "loss": 1.6229, + "step": 2755 + }, + { + "epoch": 0.5005562240334188, + "grad_norm": 0.807303249835968, + "learning_rate": 5.274869003627404e-05, + "loss": 1.9598, + "step": 2756 + }, + { + "epoch": 0.5007378482075965, + "grad_norm": 0.35275474190711975, + "learning_rate": 5.2719163560221466e-05, + "loss": 1.6895, + "step": 2757 + }, + { + "epoch": 0.500919472381774, + "grad_norm": 0.34519726037979126, + "learning_rate": 5.268963613308475e-05, + "loss": 1.8842, + "step": 2758 + }, + { + "epoch": 0.5011010965559516, + "grad_norm": 1.066236972808838, + "learning_rate": 5.266010776519177e-05, + "loss": 1.6598, + "step": 2759 + }, + { + "epoch": 0.5012827207301291, + "grad_norm": 0.635984480381012, + "learning_rate": 5.263057846687066e-05, + "loss": 1.7448, + "step": 2760 + }, + { + "epoch": 0.5014643449043068, + "grad_norm": 0.559842050075531, + "learning_rate": 5.260104824844989e-05, + "loss": 1.8731, + "step": 2761 + }, + { + "epoch": 0.5016459690784844, + "grad_norm": 0.3441810607910156, + "learning_rate": 5.257151712025832e-05, + "loss": 1.6222, + "step": 2762 + }, + { + "epoch": 0.5018275932526619, + "grad_norm": 0.3385317325592041, + "learning_rate": 5.254198509262502e-05, + "loss": 1.6572, + "step": 2763 + }, + { + "epoch": 0.5020092174268395, + "grad_norm": 0.4592202305793762, + "learning_rate": 5.251245217587947e-05, + "loss": 1.741, + "step": 2764 + }, + { + "epoch": 0.5021908416010171, + "grad_norm": 0.35400402545928955, + "learning_rate": 5.248291838035141e-05, + "loss": 1.7544, + "step": 2765 + }, + { + "epoch": 0.5023724657751947, + "grad_norm": 0.4047846496105194, + "learning_rate": 5.245338371637091e-05, + "loss": 1.5765, + "step": 2766 + }, + { + "epoch": 0.5025540899493722, + "grad_norm": 0.3841531574726105, + "learning_rate": 5.2423848194268323e-05, + "loss": 1.6001, + "step": 2767 + }, + { + "epoch": 0.5027357141235499, + "grad_norm": 0.42585060000419617, + "learning_rate": 5.239431182437431e-05, + "loss": 1.9258, + "step": 2768 + }, + { + "epoch": 0.5029173382977274, + "grad_norm": 0.44597962498664856, + "learning_rate": 5.236477461701985e-05, + "loss": 1.6404, + "step": 2769 + }, + { + "epoch": 0.503098962471905, + "grad_norm": 0.32767581939697266, + "learning_rate": 5.233523658253616e-05, + "loss": 1.8665, + "step": 2770 + }, + { + "epoch": 0.5032805866460826, + "grad_norm": 0.37264811992645264, + "learning_rate": 5.230569773125484e-05, + "loss": 1.6392, + "step": 2771 + }, + { + "epoch": 0.5034622108202602, + "grad_norm": 0.45817798376083374, + "learning_rate": 5.227615807350767e-05, + "loss": 1.7356, + "step": 2772 + }, + { + "epoch": 0.5036438349944378, + "grad_norm": 0.5583562254905701, + "learning_rate": 5.2246617619626795e-05, + "loss": 1.6561, + "step": 2773 + }, + { + "epoch": 0.5038254591686153, + "grad_norm": 0.4904632866382599, + "learning_rate": 5.221707637994456e-05, + "loss": 1.9675, + "step": 2774 + }, + { + "epoch": 0.5040070833427929, + "grad_norm": 0.6503326296806335, + "learning_rate": 5.2187534364793686e-05, + "loss": 1.9269, + "step": 2775 + }, + { + "epoch": 0.5041887075169705, + "grad_norm": 0.36856263875961304, + "learning_rate": 5.215799158450707e-05, + "loss": 1.7779, + "step": 2776 + }, + { + "epoch": 0.5043703316911481, + "grad_norm": 0.5082567930221558, + "learning_rate": 5.212844804941792e-05, + "loss": 2.0107, + "step": 2777 + }, + { + "epoch": 0.5045519558653256, + "grad_norm": 0.851468026638031, + "learning_rate": 5.209890376985972e-05, + "loss": 1.7846, + "step": 2778 + }, + { + "epoch": 0.5047335800395033, + "grad_norm": 0.28268980979919434, + "learning_rate": 5.206935875616618e-05, + "loss": 1.8633, + "step": 2779 + }, + { + "epoch": 0.5049152042136809, + "grad_norm": 0.37341731786727905, + "learning_rate": 5.203981301867128e-05, + "loss": 1.7612, + "step": 2780 + }, + { + "epoch": 0.5050968283878584, + "grad_norm": 0.4578326344490051, + "learning_rate": 5.201026656770926e-05, + "loss": 1.7016, + "step": 2781 + }, + { + "epoch": 0.505278452562036, + "grad_norm": 0.3837997615337372, + "learning_rate": 5.1980719413614645e-05, + "loss": 1.8774, + "step": 2782 + }, + { + "epoch": 0.5054600767362136, + "grad_norm": 0.46896713972091675, + "learning_rate": 5.1951171566722104e-05, + "loss": 1.8885, + "step": 2783 + }, + { + "epoch": 0.5056417009103912, + "grad_norm": 0.41199901700019836, + "learning_rate": 5.192162303736667e-05, + "loss": 1.7295, + "step": 2784 + }, + { + "epoch": 0.5058233250845687, + "grad_norm": 0.4138210415840149, + "learning_rate": 5.1892073835883524e-05, + "loss": 1.8544, + "step": 2785 + }, + { + "epoch": 0.5060049492587463, + "grad_norm": 0.6284353137016296, + "learning_rate": 5.186252397260811e-05, + "loss": 1.9949, + "step": 2786 + }, + { + "epoch": 0.506186573432924, + "grad_norm": 0.4697243273258209, + "learning_rate": 5.183297345787613e-05, + "loss": 1.8048, + "step": 2787 + }, + { + "epoch": 0.5063681976071015, + "grad_norm": 0.526177704334259, + "learning_rate": 5.1803422302023495e-05, + "loss": 1.75, + "step": 2788 + }, + { + "epoch": 0.506549821781279, + "grad_norm": 0.34434396028518677, + "learning_rate": 5.177387051538631e-05, + "loss": 1.6678, + "step": 2789 + }, + { + "epoch": 0.5067314459554567, + "grad_norm": 0.501967191696167, + "learning_rate": 5.174431810830096e-05, + "loss": 1.8817, + "step": 2790 + }, + { + "epoch": 0.5069130701296343, + "grad_norm": 0.4013645648956299, + "learning_rate": 5.1714765091104003e-05, + "loss": 1.8502, + "step": 2791 + }, + { + "epoch": 0.5070946943038118, + "grad_norm": 0.3378397226333618, + "learning_rate": 5.16852114741322e-05, + "loss": 1.8946, + "step": 2792 + }, + { + "epoch": 0.5072763184779894, + "grad_norm": 0.7363464832305908, + "learning_rate": 5.165565726772258e-05, + "loss": 1.8355, + "step": 2793 + }, + { + "epoch": 0.507457942652167, + "grad_norm": 0.5177839994430542, + "learning_rate": 5.162610248221232e-05, + "loss": 1.6775, + "step": 2794 + }, + { + "epoch": 0.5076395668263446, + "grad_norm": 0.41134512424468994, + "learning_rate": 5.159654712793882e-05, + "loss": 1.8211, + "step": 2795 + }, + { + "epoch": 0.5078211910005221, + "grad_norm": 0.36441606283187866, + "learning_rate": 5.15669912152397e-05, + "loss": 1.7089, + "step": 2796 + }, + { + "epoch": 0.5080028151746998, + "grad_norm": 0.3411926031112671, + "learning_rate": 5.153743475445276e-05, + "loss": 1.681, + "step": 2797 + }, + { + "epoch": 0.5081844393488774, + "grad_norm": 0.35284823179244995, + "learning_rate": 5.150787775591596e-05, + "loss": 1.7799, + "step": 2798 + }, + { + "epoch": 0.5083660635230549, + "grad_norm": 0.42299309372901917, + "learning_rate": 5.147832022996748e-05, + "loss": 1.7665, + "step": 2799 + }, + { + "epoch": 0.5085476876972325, + "grad_norm": 0.4419465661048889, + "learning_rate": 5.144876218694571e-05, + "loss": 1.67, + "step": 2800 + }, + { + "epoch": 0.5087293118714101, + "grad_norm": 0.3332909047603607, + "learning_rate": 5.141920363718916e-05, + "loss": 1.7101, + "step": 2801 + }, + { + "epoch": 0.5089109360455877, + "grad_norm": 0.3410511612892151, + "learning_rate": 5.138964459103658e-05, + "loss": 1.6198, + "step": 2802 + }, + { + "epoch": 0.5090925602197652, + "grad_norm": 0.7086399793624878, + "learning_rate": 5.1360085058826827e-05, + "loss": 1.6621, + "step": 2803 + }, + { + "epoch": 0.5092741843939428, + "grad_norm": 0.644344687461853, + "learning_rate": 5.133052505089898e-05, + "loss": 1.6765, + "step": 2804 + }, + { + "epoch": 0.5094558085681204, + "grad_norm": 0.3445694148540497, + "learning_rate": 5.130096457759227e-05, + "loss": 1.561, + "step": 2805 + }, + { + "epoch": 0.509637432742298, + "grad_norm": 0.39053332805633545, + "learning_rate": 5.12714036492461e-05, + "loss": 1.9529, + "step": 2806 + }, + { + "epoch": 0.5098190569164756, + "grad_norm": 0.5176990628242493, + "learning_rate": 5.124184227619999e-05, + "loss": 1.7135, + "step": 2807 + }, + { + "epoch": 0.5100006810906532, + "grad_norm": 0.3503478169441223, + "learning_rate": 5.1212280468793674e-05, + "loss": 1.7929, + "step": 2808 + }, + { + "epoch": 0.5101823052648308, + "grad_norm": 0.3715820014476776, + "learning_rate": 5.118271823736699e-05, + "loss": 1.5789, + "step": 2809 + }, + { + "epoch": 0.5103639294390083, + "grad_norm": 0.3965926766395569, + "learning_rate": 5.115315559225997e-05, + "loss": 1.719, + "step": 2810 + }, + { + "epoch": 0.5105455536131859, + "grad_norm": 0.49202096462249756, + "learning_rate": 5.1123592543812734e-05, + "loss": 1.9482, + "step": 2811 + }, + { + "epoch": 0.5107271777873635, + "grad_norm": 0.48637381196022034, + "learning_rate": 5.10940291023656e-05, + "loss": 1.8647, + "step": 2812 + }, + { + "epoch": 0.5109088019615411, + "grad_norm": 0.4545120596885681, + "learning_rate": 5.1064465278258986e-05, + "loss": 1.7831, + "step": 2813 + }, + { + "epoch": 0.5110904261357186, + "grad_norm": 0.35235223174095154, + "learning_rate": 5.103490108183345e-05, + "loss": 1.5748, + "step": 2814 + }, + { + "epoch": 0.5112720503098962, + "grad_norm": 0.418613463640213, + "learning_rate": 5.100533652342971e-05, + "loss": 1.7236, + "step": 2815 + }, + { + "epoch": 0.5114536744840739, + "grad_norm": 0.4045114815235138, + "learning_rate": 5.0975771613388566e-05, + "loss": 1.7243, + "step": 2816 + }, + { + "epoch": 0.5116352986582514, + "grad_norm": 0.43919461965560913, + "learning_rate": 5.094620636205095e-05, + "loss": 1.6051, + "step": 2817 + }, + { + "epoch": 0.511816922832429, + "grad_norm": 0.34949395060539246, + "learning_rate": 5.0916640779757954e-05, + "loss": 1.7193, + "step": 2818 + }, + { + "epoch": 0.5119985470066066, + "grad_norm": 0.9224593639373779, + "learning_rate": 5.088707487685075e-05, + "loss": 1.5237, + "step": 2819 + }, + { + "epoch": 0.5121801711807842, + "grad_norm": 0.4211752712726593, + "learning_rate": 5.0857508663670596e-05, + "loss": 1.6705, + "step": 2820 + }, + { + "epoch": 0.5123617953549617, + "grad_norm": 0.3695501685142517, + "learning_rate": 5.082794215055894e-05, + "loss": 1.7852, + "step": 2821 + }, + { + "epoch": 0.5125434195291393, + "grad_norm": 0.3376506567001343, + "learning_rate": 5.0798375347857244e-05, + "loss": 1.7163, + "step": 2822 + }, + { + "epoch": 0.512725043703317, + "grad_norm": 0.3512142300605774, + "learning_rate": 5.0768808265907145e-05, + "loss": 1.5831, + "step": 2823 + }, + { + "epoch": 0.5129066678774945, + "grad_norm": 0.7823832631111145, + "learning_rate": 5.073924091505032e-05, + "loss": 1.9701, + "step": 2824 + }, + { + "epoch": 0.5130882920516721, + "grad_norm": 0.4613237977027893, + "learning_rate": 5.070967330562859e-05, + "loss": 1.6807, + "step": 2825 + }, + { + "epoch": 0.5132699162258496, + "grad_norm": 0.4309426248073578, + "learning_rate": 5.068010544798383e-05, + "loss": 1.6576, + "step": 2826 + }, + { + "epoch": 0.5134515404000273, + "grad_norm": 0.4744395315647125, + "learning_rate": 5.065053735245802e-05, + "loss": 1.7003, + "step": 2827 + }, + { + "epoch": 0.5136331645742048, + "grad_norm": 0.4285747706890106, + "learning_rate": 5.062096902939322e-05, + "loss": 1.5686, + "step": 2828 + }, + { + "epoch": 0.5138147887483824, + "grad_norm": 0.3092881739139557, + "learning_rate": 5.059140048913153e-05, + "loss": 1.5905, + "step": 2829 + }, + { + "epoch": 0.51399641292256, + "grad_norm": 0.3975408971309662, + "learning_rate": 5.056183174201522e-05, + "loss": 1.712, + "step": 2830 + }, + { + "epoch": 0.5141780370967376, + "grad_norm": 0.33092576265335083, + "learning_rate": 5.0532262798386544e-05, + "loss": 1.6738, + "step": 2831 + }, + { + "epoch": 0.5143596612709151, + "grad_norm": 0.7227604389190674, + "learning_rate": 5.050269366858787e-05, + "loss": 1.7984, + "step": 2832 + }, + { + "epoch": 0.5145412854450927, + "grad_norm": 0.44575902819633484, + "learning_rate": 5.047312436296159e-05, + "loss": 1.8844, + "step": 2833 + }, + { + "epoch": 0.5147229096192704, + "grad_norm": 0.32791855931282043, + "learning_rate": 5.044355489185022e-05, + "loss": 1.8189, + "step": 2834 + }, + { + "epoch": 0.5149045337934479, + "grad_norm": 0.47892484068870544, + "learning_rate": 5.0413985265596275e-05, + "loss": 1.6392, + "step": 2835 + }, + { + "epoch": 0.5150861579676255, + "grad_norm": 0.33402329683303833, + "learning_rate": 5.038441549454236e-05, + "loss": 1.7477, + "step": 2836 + }, + { + "epoch": 0.515267782141803, + "grad_norm": 0.42285028100013733, + "learning_rate": 5.035484558903111e-05, + "loss": 1.5562, + "step": 2837 + }, + { + "epoch": 0.5154494063159807, + "grad_norm": 0.3708788752555847, + "learning_rate": 5.0325275559405226e-05, + "loss": 1.7468, + "step": 2838 + }, + { + "epoch": 0.5156310304901582, + "grad_norm": 0.33388784527778625, + "learning_rate": 5.029570541600743e-05, + "loss": 1.8551, + "step": 2839 + }, + { + "epoch": 0.5158126546643358, + "grad_norm": 0.465767502784729, + "learning_rate": 5.0266135169180505e-05, + "loss": 1.7406, + "step": 2840 + }, + { + "epoch": 0.5159942788385135, + "grad_norm": 0.6672598719596863, + "learning_rate": 5.023656482926727e-05, + "loss": 1.6688, + "step": 2841 + }, + { + "epoch": 0.516175903012691, + "grad_norm": 0.3987756073474884, + "learning_rate": 5.020699440661054e-05, + "loss": 1.6192, + "step": 2842 + }, + { + "epoch": 0.5163575271868686, + "grad_norm": 0.5646878480911255, + "learning_rate": 5.017742391155321e-05, + "loss": 1.7918, + "step": 2843 + }, + { + "epoch": 0.5165391513610461, + "grad_norm": 2.0278446674346924, + "learning_rate": 5.0147853354438165e-05, + "loss": 1.889, + "step": 2844 + }, + { + "epoch": 0.5167207755352238, + "grad_norm": 0.3603026270866394, + "learning_rate": 5.0118282745608336e-05, + "loss": 1.7995, + "step": 2845 + }, + { + "epoch": 0.5169023997094013, + "grad_norm": 0.33142009377479553, + "learning_rate": 5.008871209540664e-05, + "loss": 1.8246, + "step": 2846 + }, + { + "epoch": 0.5170840238835789, + "grad_norm": 0.4178944528102875, + "learning_rate": 5.005914141417606e-05, + "loss": 1.7066, + "step": 2847 + }, + { + "epoch": 0.5172656480577564, + "grad_norm": 0.4205268919467926, + "learning_rate": 5.002957071225951e-05, + "loss": 1.8358, + "step": 2848 + }, + { + "epoch": 0.5174472722319341, + "grad_norm": 1.1215673685073853, + "learning_rate": 5e-05, + "loss": 2.0027, + "step": 2849 + }, + { + "epoch": 0.5176288964061116, + "grad_norm": 0.6708070635795593, + "learning_rate": 4.9970429287740505e-05, + "loss": 1.7981, + "step": 2850 + }, + { + "epoch": 0.5178105205802892, + "grad_norm": 0.3974936902523041, + "learning_rate": 4.994085858582397e-05, + "loss": 1.8397, + "step": 2851 + }, + { + "epoch": 0.5179921447544669, + "grad_norm": 0.39815935492515564, + "learning_rate": 4.9911287904593365e-05, + "loss": 1.7971, + "step": 2852 + }, + { + "epoch": 0.5181737689286444, + "grad_norm": 0.3397134244441986, + "learning_rate": 4.988171725439168e-05, + "loss": 1.6208, + "step": 2853 + }, + { + "epoch": 0.518355393102822, + "grad_norm": 0.40135064721107483, + "learning_rate": 4.985214664556184e-05, + "loss": 1.855, + "step": 2854 + }, + { + "epoch": 0.5185370172769995, + "grad_norm": 0.3672015070915222, + "learning_rate": 4.982257608844681e-05, + "loss": 1.7499, + "step": 2855 + }, + { + "epoch": 0.5187186414511772, + "grad_norm": 0.4846999943256378, + "learning_rate": 4.979300559338946e-05, + "loss": 2.089, + "step": 2856 + }, + { + "epoch": 0.5189002656253547, + "grad_norm": 0.38221490383148193, + "learning_rate": 4.976343517073274e-05, + "loss": 1.9163, + "step": 2857 + }, + { + "epoch": 0.5190818897995323, + "grad_norm": 0.33765316009521484, + "learning_rate": 4.97338648308195e-05, + "loss": 1.8587, + "step": 2858 + }, + { + "epoch": 0.51926351397371, + "grad_norm": 0.3140835165977478, + "learning_rate": 4.9704294583992586e-05, + "loss": 1.8236, + "step": 2859 + }, + { + "epoch": 0.5194451381478875, + "grad_norm": 0.442600816488266, + "learning_rate": 4.967472444059478e-05, + "loss": 1.597, + "step": 2860 + }, + { + "epoch": 0.5196267623220651, + "grad_norm": 0.3485032618045807, + "learning_rate": 4.964515441096889e-05, + "loss": 1.7018, + "step": 2861 + }, + { + "epoch": 0.5198083864962426, + "grad_norm": 0.32942289113998413, + "learning_rate": 4.961558450545765e-05, + "loss": 1.6579, + "step": 2862 + }, + { + "epoch": 0.5199900106704203, + "grad_norm": 0.41698604822158813, + "learning_rate": 4.9586014734403736e-05, + "loss": 1.6355, + "step": 2863 + }, + { + "epoch": 0.5201716348445978, + "grad_norm": 0.6701741218566895, + "learning_rate": 4.95564451081498e-05, + "loss": 1.8248, + "step": 2864 + }, + { + "epoch": 0.5203532590187754, + "grad_norm": 0.4028608202934265, + "learning_rate": 4.952687563703841e-05, + "loss": 1.5726, + "step": 2865 + }, + { + "epoch": 0.5205348831929529, + "grad_norm": 0.34862902760505676, + "learning_rate": 4.949730633141215e-05, + "loss": 1.7904, + "step": 2866 + }, + { + "epoch": 0.5207165073671306, + "grad_norm": 0.43819811940193176, + "learning_rate": 4.946773720161347e-05, + "loss": 1.6943, + "step": 2867 + }, + { + "epoch": 0.5208981315413082, + "grad_norm": 0.36282825469970703, + "learning_rate": 4.94381682579848e-05, + "loss": 1.663, + "step": 2868 + }, + { + "epoch": 0.5210797557154857, + "grad_norm": 0.43907052278518677, + "learning_rate": 4.940859951086847e-05, + "loss": 1.7575, + "step": 2869 + }, + { + "epoch": 0.5212613798896634, + "grad_norm": 0.6785220503807068, + "learning_rate": 4.93790309706068e-05, + "loss": 1.7652, + "step": 2870 + }, + { + "epoch": 0.5214430040638409, + "grad_norm": 0.399495929479599, + "learning_rate": 4.934946264754199e-05, + "loss": 1.7454, + "step": 2871 + }, + { + "epoch": 0.5216246282380185, + "grad_norm": 0.339959979057312, + "learning_rate": 4.9319894552016175e-05, + "loss": 1.647, + "step": 2872 + }, + { + "epoch": 0.521806252412196, + "grad_norm": 0.6537312865257263, + "learning_rate": 4.929032669437142e-05, + "loss": 1.9947, + "step": 2873 + }, + { + "epoch": 0.5219878765863737, + "grad_norm": 0.7766566276550293, + "learning_rate": 4.926075908494968e-05, + "loss": 1.7759, + "step": 2874 + }, + { + "epoch": 0.5221695007605512, + "grad_norm": 0.4378495216369629, + "learning_rate": 4.923119173409287e-05, + "loss": 1.7783, + "step": 2875 + }, + { + "epoch": 0.5223511249347288, + "grad_norm": 0.8541765213012695, + "learning_rate": 4.920162465214277e-05, + "loss": 1.7105, + "step": 2876 + }, + { + "epoch": 0.5225327491089063, + "grad_norm": 0.47026684880256653, + "learning_rate": 4.917205784944109e-05, + "loss": 1.766, + "step": 2877 + }, + { + "epoch": 0.522714373283084, + "grad_norm": 0.4407563805580139, + "learning_rate": 4.914249133632941e-05, + "loss": 1.6347, + "step": 2878 + }, + { + "epoch": 0.5228959974572616, + "grad_norm": 0.4099004864692688, + "learning_rate": 4.911292512314927e-05, + "loss": 1.8861, + "step": 2879 + }, + { + "epoch": 0.5230776216314391, + "grad_norm": 0.378995805978775, + "learning_rate": 4.908335922024206e-05, + "loss": 1.7136, + "step": 2880 + }, + { + "epoch": 0.5232592458056168, + "grad_norm": 0.41718965768814087, + "learning_rate": 4.9053793637949067e-05, + "loss": 1.6326, + "step": 2881 + }, + { + "epoch": 0.5234408699797943, + "grad_norm": 0.3791438639163971, + "learning_rate": 4.9024228386611445e-05, + "loss": 1.6962, + "step": 2882 + }, + { + "epoch": 0.5236224941539719, + "grad_norm": 0.8232355713844299, + "learning_rate": 4.899466347657029e-05, + "loss": 1.9319, + "step": 2883 + }, + { + "epoch": 0.5238041183281494, + "grad_norm": 0.6122906804084778, + "learning_rate": 4.8965098918166555e-05, + "loss": 1.9456, + "step": 2884 + }, + { + "epoch": 0.5239857425023271, + "grad_norm": 0.8749732375144958, + "learning_rate": 4.8935534721741025e-05, + "loss": 1.7752, + "step": 2885 + }, + { + "epoch": 0.5241673666765047, + "grad_norm": 0.3208705484867096, + "learning_rate": 4.890597089763442e-05, + "loss": 1.6108, + "step": 2886 + }, + { + "epoch": 0.5243489908506822, + "grad_norm": 0.4424605071544647, + "learning_rate": 4.887640745618727e-05, + "loss": 1.6995, + "step": 2887 + }, + { + "epoch": 0.5245306150248598, + "grad_norm": 0.38798078894615173, + "learning_rate": 4.884684440774004e-05, + "loss": 1.9152, + "step": 2888 + }, + { + "epoch": 0.5247122391990374, + "grad_norm": 0.3605513572692871, + "learning_rate": 4.881728176263302e-05, + "loss": 1.7251, + "step": 2889 + }, + { + "epoch": 0.524893863373215, + "grad_norm": 0.3897262215614319, + "learning_rate": 4.878771953120635e-05, + "loss": 1.8836, + "step": 2890 + }, + { + "epoch": 0.5250754875473925, + "grad_norm": 0.41708269715309143, + "learning_rate": 4.875815772380002e-05, + "loss": 1.7017, + "step": 2891 + }, + { + "epoch": 0.5252571117215702, + "grad_norm": 0.38209208846092224, + "learning_rate": 4.872859635075391e-05, + "loss": 1.664, + "step": 2892 + }, + { + "epoch": 0.5254387358957477, + "grad_norm": 0.37980273365974426, + "learning_rate": 4.869903542240774e-05, + "loss": 1.6587, + "step": 2893 + }, + { + "epoch": 0.5256203600699253, + "grad_norm": 0.43226686120033264, + "learning_rate": 4.8669474949101035e-05, + "loss": 1.6224, + "step": 2894 + }, + { + "epoch": 0.5258019842441028, + "grad_norm": 0.40736526250839233, + "learning_rate": 4.863991494117318e-05, + "loss": 1.7136, + "step": 2895 + }, + { + "epoch": 0.5259836084182805, + "grad_norm": 0.541822075843811, + "learning_rate": 4.861035540896344e-05, + "loss": 1.5985, + "step": 2896 + }, + { + "epoch": 0.5261652325924581, + "grad_norm": 0.4571339190006256, + "learning_rate": 4.858079636281085e-05, + "loss": 1.7237, + "step": 2897 + }, + { + "epoch": 0.5263468567666356, + "grad_norm": 0.4957195222377777, + "learning_rate": 4.855123781305431e-05, + "loss": 1.8243, + "step": 2898 + }, + { + "epoch": 0.5265284809408132, + "grad_norm": 0.43145057559013367, + "learning_rate": 4.852167977003253e-05, + "loss": 1.84, + "step": 2899 + }, + { + "epoch": 0.5267101051149908, + "grad_norm": 0.3903335928916931, + "learning_rate": 4.849212224408405e-05, + "loss": 1.802, + "step": 2900 + }, + { + "epoch": 0.5268917292891684, + "grad_norm": 1.3803681135177612, + "learning_rate": 4.846256524554725e-05, + "loss": 1.8478, + "step": 2901 + }, + { + "epoch": 0.5270733534633459, + "grad_norm": 0.315991073846817, + "learning_rate": 4.843300878476031e-05, + "loss": 1.8497, + "step": 2902 + }, + { + "epoch": 0.5272549776375236, + "grad_norm": 0.3225935101509094, + "learning_rate": 4.8403452872061186e-05, + "loss": 1.7651, + "step": 2903 + }, + { + "epoch": 0.5274366018117012, + "grad_norm": 0.6784349679946899, + "learning_rate": 4.837389751778768e-05, + "loss": 1.7633, + "step": 2904 + }, + { + "epoch": 0.5276182259858787, + "grad_norm": 0.45341062545776367, + "learning_rate": 4.834434273227743e-05, + "loss": 1.8234, + "step": 2905 + }, + { + "epoch": 0.5277998501600563, + "grad_norm": 0.4998728334903717, + "learning_rate": 4.831478852586781e-05, + "loss": 1.7743, + "step": 2906 + }, + { + "epoch": 0.5279814743342339, + "grad_norm": 0.40720534324645996, + "learning_rate": 4.8285234908896015e-05, + "loss": 1.7316, + "step": 2907 + }, + { + "epoch": 0.5281630985084115, + "grad_norm": 0.37080660462379456, + "learning_rate": 4.8255681891699035e-05, + "loss": 1.6782, + "step": 2908 + }, + { + "epoch": 0.528344722682589, + "grad_norm": 0.34969615936279297, + "learning_rate": 4.8226129484613694e-05, + "loss": 1.5468, + "step": 2909 + }, + { + "epoch": 0.5285263468567666, + "grad_norm": 0.39518314599990845, + "learning_rate": 4.819657769797651e-05, + "loss": 1.5138, + "step": 2910 + }, + { + "epoch": 0.5287079710309442, + "grad_norm": 0.3760863244533539, + "learning_rate": 4.8167026542123874e-05, + "loss": 1.6192, + "step": 2911 + }, + { + "epoch": 0.5288895952051218, + "grad_norm": 0.36826586723327637, + "learning_rate": 4.8137476027391906e-05, + "loss": 1.7566, + "step": 2912 + }, + { + "epoch": 0.5290712193792994, + "grad_norm": 0.39331546425819397, + "learning_rate": 4.810792616411649e-05, + "loss": 1.6127, + "step": 2913 + }, + { + "epoch": 0.529252843553477, + "grad_norm": 0.35108184814453125, + "learning_rate": 4.8078376962633346e-05, + "loss": 1.6441, + "step": 2914 + }, + { + "epoch": 0.5294344677276546, + "grad_norm": 0.39733704924583435, + "learning_rate": 4.80488284332779e-05, + "loss": 1.7607, + "step": 2915 + }, + { + "epoch": 0.5296160919018321, + "grad_norm": 0.6716375946998596, + "learning_rate": 4.801928058638538e-05, + "loss": 1.7501, + "step": 2916 + }, + { + "epoch": 0.5297977160760097, + "grad_norm": 0.36340561509132385, + "learning_rate": 4.798973343229073e-05, + "loss": 1.894, + "step": 2917 + }, + { + "epoch": 0.5299793402501873, + "grad_norm": 0.3745698034763336, + "learning_rate": 4.796018698132873e-05, + "loss": 1.7472, + "step": 2918 + }, + { + "epoch": 0.5301609644243649, + "grad_norm": 0.7620311975479126, + "learning_rate": 4.793064124383383e-05, + "loss": 1.7076, + "step": 2919 + }, + { + "epoch": 0.5303425885985424, + "grad_norm": 0.8714175224304199, + "learning_rate": 4.79010962301403e-05, + "loss": 1.635, + "step": 2920 + }, + { + "epoch": 0.53052421277272, + "grad_norm": 0.46562302112579346, + "learning_rate": 4.78715519505821e-05, + "loss": 1.9189, + "step": 2921 + }, + { + "epoch": 0.5307058369468977, + "grad_norm": 0.39596402645111084, + "learning_rate": 4.784200841549294e-05, + "loss": 1.7143, + "step": 2922 + }, + { + "epoch": 0.5308874611210752, + "grad_norm": 0.3948782682418823, + "learning_rate": 4.781246563520632e-05, + "loss": 1.6245, + "step": 2923 + }, + { + "epoch": 0.5310690852952528, + "grad_norm": 0.5284755229949951, + "learning_rate": 4.778292362005544e-05, + "loss": 1.726, + "step": 2924 + }, + { + "epoch": 0.5312507094694304, + "grad_norm": 0.35158050060272217, + "learning_rate": 4.775338238037322e-05, + "loss": 1.8536, + "step": 2925 + }, + { + "epoch": 0.531432333643608, + "grad_norm": 0.30244722962379456, + "learning_rate": 4.7723841926492326e-05, + "loss": 1.7343, + "step": 2926 + }, + { + "epoch": 0.5316139578177855, + "grad_norm": 0.3213544189929962, + "learning_rate": 4.769430226874517e-05, + "loss": 1.7078, + "step": 2927 + }, + { + "epoch": 0.5317955819919631, + "grad_norm": 0.7023633718490601, + "learning_rate": 4.766476341746385e-05, + "loss": 1.8127, + "step": 2928 + }, + { + "epoch": 0.5319772061661407, + "grad_norm": 0.39560666680336, + "learning_rate": 4.7635225382980176e-05, + "loss": 1.6988, + "step": 2929 + }, + { + "epoch": 0.5321588303403183, + "grad_norm": 0.3848857879638672, + "learning_rate": 4.760568817562569e-05, + "loss": 1.7863, + "step": 2930 + }, + { + "epoch": 0.5323404545144959, + "grad_norm": 0.6247043013572693, + "learning_rate": 4.7576151805731695e-05, + "loss": 1.7132, + "step": 2931 + }, + { + "epoch": 0.5325220786886735, + "grad_norm": 0.33924803137779236, + "learning_rate": 4.7546616283629105e-05, + "loss": 1.6253, + "step": 2932 + }, + { + "epoch": 0.5327037028628511, + "grad_norm": 1.196282982826233, + "learning_rate": 4.751708161964861e-05, + "loss": 1.7044, + "step": 2933 + }, + { + "epoch": 0.5328853270370286, + "grad_norm": 1.3817503452301025, + "learning_rate": 4.748754782412054e-05, + "loss": 1.849, + "step": 2934 + }, + { + "epoch": 0.5330669512112062, + "grad_norm": 0.4345369338989258, + "learning_rate": 4.745801490737498e-05, + "loss": 1.7218, + "step": 2935 + }, + { + "epoch": 0.5332485753853838, + "grad_norm": 0.4485497772693634, + "learning_rate": 4.74284828797417e-05, + "loss": 1.7147, + "step": 2936 + }, + { + "epoch": 0.5334301995595614, + "grad_norm": 0.42817434668540955, + "learning_rate": 4.739895175155012e-05, + "loss": 1.6612, + "step": 2937 + }, + { + "epoch": 0.5336118237337389, + "grad_norm": 0.42060843110084534, + "learning_rate": 4.736942153312936e-05, + "loss": 1.9614, + "step": 2938 + }, + { + "epoch": 0.5337934479079165, + "grad_norm": 0.3778499662876129, + "learning_rate": 4.733989223480823e-05, + "loss": 1.7671, + "step": 2939 + }, + { + "epoch": 0.5339750720820942, + "grad_norm": 0.4409087002277374, + "learning_rate": 4.7310363866915256e-05, + "loss": 1.756, + "step": 2940 + }, + { + "epoch": 0.5341566962562717, + "grad_norm": 1.8601369857788086, + "learning_rate": 4.728083643977855e-05, + "loss": 1.914, + "step": 2941 + }, + { + "epoch": 0.5343383204304493, + "grad_norm": 0.4209557771682739, + "learning_rate": 4.725130996372599e-05, + "loss": 1.7012, + "step": 2942 + }, + { + "epoch": 0.5345199446046269, + "grad_norm": 0.32352036237716675, + "learning_rate": 4.722178444908502e-05, + "loss": 1.8166, + "step": 2943 + }, + { + "epoch": 0.5347015687788045, + "grad_norm": 0.33558061718940735, + "learning_rate": 4.719225990618285e-05, + "loss": 1.7401, + "step": 2944 + }, + { + "epoch": 0.534883192952982, + "grad_norm": 0.43910759687423706, + "learning_rate": 4.7162736345346303e-05, + "loss": 1.8214, + "step": 2945 + }, + { + "epoch": 0.5350648171271596, + "grad_norm": 0.3253006041049957, + "learning_rate": 4.713321377690185e-05, + "loss": 1.7224, + "step": 2946 + }, + { + "epoch": 0.5352464413013373, + "grad_norm": 0.5381451845169067, + "learning_rate": 4.710369221117561e-05, + "loss": 1.7173, + "step": 2947 + }, + { + "epoch": 0.5354280654755148, + "grad_norm": 0.36698704957962036, + "learning_rate": 4.7074171658493366e-05, + "loss": 1.7013, + "step": 2948 + }, + { + "epoch": 0.5356096896496924, + "grad_norm": 0.3765612244606018, + "learning_rate": 4.7044652129180584e-05, + "loss": 1.6466, + "step": 2949 + }, + { + "epoch": 0.5357913138238699, + "grad_norm": 0.2852771580219269, + "learning_rate": 4.7015133633562295e-05, + "loss": 1.8676, + "step": 2950 + }, + { + "epoch": 0.5359729379980476, + "grad_norm": 0.35236647725105286, + "learning_rate": 4.698561618196323e-05, + "loss": 1.5934, + "step": 2951 + }, + { + "epoch": 0.5361545621722251, + "grad_norm": 1.1398124694824219, + "learning_rate": 4.695609978470771e-05, + "loss": 1.6869, + "step": 2952 + }, + { + "epoch": 0.5363361863464027, + "grad_norm": 0.34798505902290344, + "learning_rate": 4.692658445211974e-05, + "loss": 1.7062, + "step": 2953 + }, + { + "epoch": 0.5365178105205803, + "grad_norm": 0.37446799874305725, + "learning_rate": 4.6897070194522905e-05, + "loss": 1.8125, + "step": 2954 + }, + { + "epoch": 0.5366994346947579, + "grad_norm": 0.33341214060783386, + "learning_rate": 4.686755702224044e-05, + "loss": 1.6866, + "step": 2955 + }, + { + "epoch": 0.5368810588689354, + "grad_norm": 0.527295708656311, + "learning_rate": 4.683804494559518e-05, + "loss": 1.7112, + "step": 2956 + }, + { + "epoch": 0.537062683043113, + "grad_norm": 0.7665203213691711, + "learning_rate": 4.680853397490958e-05, + "loss": 1.7512, + "step": 2957 + }, + { + "epoch": 0.5372443072172907, + "grad_norm": 0.39253002405166626, + "learning_rate": 4.677902412050576e-05, + "loss": 1.6781, + "step": 2958 + }, + { + "epoch": 0.5374259313914682, + "grad_norm": 0.39547768235206604, + "learning_rate": 4.6749515392705363e-05, + "loss": 1.7967, + "step": 2959 + }, + { + "epoch": 0.5376075555656458, + "grad_norm": 0.3258834779262543, + "learning_rate": 4.6720007801829705e-05, + "loss": 1.5368, + "step": 2960 + }, + { + "epoch": 0.5377891797398233, + "grad_norm": 0.3889782428741455, + "learning_rate": 4.669050135819966e-05, + "loss": 1.8644, + "step": 2961 + }, + { + "epoch": 0.537970803914001, + "grad_norm": 0.5215182304382324, + "learning_rate": 4.6660996072135753e-05, + "loss": 2.0288, + "step": 2962 + }, + { + "epoch": 0.5381524280881785, + "grad_norm": 1.4264978170394897, + "learning_rate": 4.663149195395805e-05, + "loss": 1.6352, + "step": 2963 + }, + { + "epoch": 0.5383340522623561, + "grad_norm": 0.5044140219688416, + "learning_rate": 4.660198901398624e-05, + "loss": 1.7508, + "step": 2964 + }, + { + "epoch": 0.5385156764365338, + "grad_norm": 0.469123512506485, + "learning_rate": 4.657248726253956e-05, + "loss": 1.7447, + "step": 2965 + }, + { + "epoch": 0.5386973006107113, + "grad_norm": 0.5130043625831604, + "learning_rate": 4.6542986709936904e-05, + "loss": 1.5906, + "step": 2966 + }, + { + "epoch": 0.5388789247848889, + "grad_norm": 0.4137539565563202, + "learning_rate": 4.651348736649671e-05, + "loss": 1.662, + "step": 2967 + }, + { + "epoch": 0.5390605489590664, + "grad_norm": 0.31328219175338745, + "learning_rate": 4.6483989242536955e-05, + "loss": 1.8943, + "step": 2968 + }, + { + "epoch": 0.5392421731332441, + "grad_norm": 0.31315726041793823, + "learning_rate": 4.645449234837523e-05, + "loss": 1.7822, + "step": 2969 + }, + { + "epoch": 0.5394237973074216, + "grad_norm": 0.37687674164772034, + "learning_rate": 4.642499669432869e-05, + "loss": 1.8474, + "step": 2970 + }, + { + "epoch": 0.5396054214815992, + "grad_norm": 0.3897865414619446, + "learning_rate": 4.639550229071407e-05, + "loss": 1.7015, + "step": 2971 + }, + { + "epoch": 0.5397870456557767, + "grad_norm": 0.7218741774559021, + "learning_rate": 4.636600914784764e-05, + "loss": 1.6151, + "step": 2972 + }, + { + "epoch": 0.5399686698299544, + "grad_norm": 0.48132094740867615, + "learning_rate": 4.633651727604525e-05, + "loss": 1.9258, + "step": 2973 + }, + { + "epoch": 0.540150294004132, + "grad_norm": 0.5032230019569397, + "learning_rate": 4.630702668562227e-05, + "loss": 1.8365, + "step": 2974 + }, + { + "epoch": 0.5403319181783095, + "grad_norm": 0.707777202129364, + "learning_rate": 4.6277537386893676e-05, + "loss": 1.6928, + "step": 2975 + }, + { + "epoch": 0.5405135423524872, + "grad_norm": 0.6744298934936523, + "learning_rate": 4.624804939017397e-05, + "loss": 1.7318, + "step": 2976 + }, + { + "epoch": 0.5406951665266647, + "grad_norm": 0.455518513917923, + "learning_rate": 4.621856270577718e-05, + "loss": 1.6765, + "step": 2977 + }, + { + "epoch": 0.5408767907008423, + "grad_norm": 0.3832658529281616, + "learning_rate": 4.6189077344016867e-05, + "loss": 1.8102, + "step": 2978 + }, + { + "epoch": 0.5410584148750198, + "grad_norm": 0.3849180340766907, + "learning_rate": 4.6159593315206186e-05, + "loss": 1.659, + "step": 2979 + }, + { + "epoch": 0.5412400390491975, + "grad_norm": 0.46578994393348694, + "learning_rate": 4.6130110629657786e-05, + "loss": 1.8465, + "step": 2980 + }, + { + "epoch": 0.541421663223375, + "grad_norm": 0.44555699825286865, + "learning_rate": 4.610062929768383e-05, + "loss": 1.5825, + "step": 2981 + }, + { + "epoch": 0.5416032873975526, + "grad_norm": 0.4938177466392517, + "learning_rate": 4.6071149329596045e-05, + "loss": 1.767, + "step": 2982 + }, + { + "epoch": 0.5417849115717301, + "grad_norm": 0.5701612830162048, + "learning_rate": 4.6041670735705646e-05, + "loss": 1.8279, + "step": 2983 + }, + { + "epoch": 0.5419665357459078, + "grad_norm": 0.413324773311615, + "learning_rate": 4.6012193526323424e-05, + "loss": 1.8168, + "step": 2984 + }, + { + "epoch": 0.5421481599200854, + "grad_norm": 0.3314409852027893, + "learning_rate": 4.59827177117596e-05, + "loss": 2.0089, + "step": 2985 + }, + { + "epoch": 0.5423297840942629, + "grad_norm": 1.7619380950927734, + "learning_rate": 4.595324330232399e-05, + "loss": 1.7205, + "step": 2986 + }, + { + "epoch": 0.5425114082684406, + "grad_norm": 0.2812374234199524, + "learning_rate": 4.5923770308325855e-05, + "loss": 1.8112, + "step": 2987 + }, + { + "epoch": 0.5426930324426181, + "grad_norm": 0.41959112882614136, + "learning_rate": 4.589429874007401e-05, + "loss": 1.8539, + "step": 2988 + }, + { + "epoch": 0.5428746566167957, + "grad_norm": 0.4328691363334656, + "learning_rate": 4.586482860787675e-05, + "loss": 1.715, + "step": 2989 + }, + { + "epoch": 0.5430562807909732, + "grad_norm": 0.3959183394908905, + "learning_rate": 4.5835359922041854e-05, + "loss": 1.7931, + "step": 2990 + }, + { + "epoch": 0.5432379049651509, + "grad_norm": 1.3084266185760498, + "learning_rate": 4.580589269287661e-05, + "loss": 2.0072, + "step": 2991 + }, + { + "epoch": 0.5434195291393284, + "grad_norm": 2.4215168952941895, + "learning_rate": 4.5776426930687814e-05, + "loss": 1.6566, + "step": 2992 + }, + { + "epoch": 0.543601153313506, + "grad_norm": 0.341755747795105, + "learning_rate": 4.5746962645781724e-05, + "loss": 1.6671, + "step": 2993 + }, + { + "epoch": 0.5437827774876837, + "grad_norm": 0.471945196390152, + "learning_rate": 4.5717499848464075e-05, + "loss": 1.7632, + "step": 2994 + }, + { + "epoch": 0.5439644016618612, + "grad_norm": 0.4539923071861267, + "learning_rate": 4.5688038549040106e-05, + "loss": 1.9153, + "step": 2995 + }, + { + "epoch": 0.5441460258360388, + "grad_norm": 0.5195289254188538, + "learning_rate": 4.5658578757814496e-05, + "loss": 1.754, + "step": 2996 + }, + { + "epoch": 0.5443276500102163, + "grad_norm": 0.38594508171081543, + "learning_rate": 4.5629120485091454e-05, + "loss": 1.6478, + "step": 2997 + }, + { + "epoch": 0.544509274184394, + "grad_norm": 0.3982070982456207, + "learning_rate": 4.559966374117462e-05, + "loss": 1.5914, + "step": 2998 + }, + { + "epoch": 0.5446908983585715, + "grad_norm": 0.3899438977241516, + "learning_rate": 4.5570208536367095e-05, + "loss": 1.8398, + "step": 2999 + }, + { + "epoch": 0.5448725225327491, + "grad_norm": 0.3706665337085724, + "learning_rate": 4.554075488097143e-05, + "loss": 1.6374, + "step": 3000 + }, + { + "epoch": 0.5450541467069266, + "grad_norm": 0.4129939079284668, + "learning_rate": 4.5511302785289685e-05, + "loss": 1.7997, + "step": 3001 + }, + { + "epoch": 0.5452357708811043, + "grad_norm": 0.5824579000473022, + "learning_rate": 4.548185225962335e-05, + "loss": 1.7685, + "step": 3002 + }, + { + "epoch": 0.5454173950552819, + "grad_norm": 0.28057196736335754, + "learning_rate": 4.545240331427333e-05, + "loss": 1.7745, + "step": 3003 + }, + { + "epoch": 0.5455990192294594, + "grad_norm": 0.31444117426872253, + "learning_rate": 4.5422955959540036e-05, + "loss": 1.6253, + "step": 3004 + }, + { + "epoch": 0.5457806434036371, + "grad_norm": 0.39628198742866516, + "learning_rate": 4.539351020572326e-05, + "loss": 1.7583, + "step": 3005 + }, + { + "epoch": 0.5459622675778146, + "grad_norm": 0.6715196371078491, + "learning_rate": 4.53640660631223e-05, + "loss": 1.788, + "step": 3006 + }, + { + "epoch": 0.5461438917519922, + "grad_norm": 0.3895531892776489, + "learning_rate": 4.533462354203586e-05, + "loss": 1.7539, + "step": 3007 + }, + { + "epoch": 0.5463255159261697, + "grad_norm": 1.0019886493682861, + "learning_rate": 4.5305182652762057e-05, + "loss": 1.9565, + "step": 3008 + }, + { + "epoch": 0.5465071401003474, + "grad_norm": 0.42348575592041016, + "learning_rate": 4.527574340559844e-05, + "loss": 1.8162, + "step": 3009 + }, + { + "epoch": 0.546688764274525, + "grad_norm": 2.2442355155944824, + "learning_rate": 4.524630581084203e-05, + "loss": 1.8566, + "step": 3010 + }, + { + "epoch": 0.5468703884487025, + "grad_norm": 0.48699700832366943, + "learning_rate": 4.521686987878925e-05, + "loss": 1.8174, + "step": 3011 + }, + { + "epoch": 0.5470520126228801, + "grad_norm": 0.43209177255630493, + "learning_rate": 4.5187435619735894e-05, + "loss": 1.7264, + "step": 3012 + }, + { + "epoch": 0.5472336367970577, + "grad_norm": 0.452869176864624, + "learning_rate": 4.515800304397721e-05, + "loss": 1.4897, + "step": 3013 + }, + { + "epoch": 0.5474152609712353, + "grad_norm": 0.4100572168827057, + "learning_rate": 4.5128572161807894e-05, + "loss": 1.6594, + "step": 3014 + }, + { + "epoch": 0.5475968851454128, + "grad_norm": 0.27166029810905457, + "learning_rate": 4.509914298352197e-05, + "loss": 1.7331, + "step": 3015 + }, + { + "epoch": 0.5477785093195905, + "grad_norm": 0.3632904887199402, + "learning_rate": 4.506971551941294e-05, + "loss": 1.6356, + "step": 3016 + }, + { + "epoch": 0.547960133493768, + "grad_norm": 0.4976261258125305, + "learning_rate": 4.5040289779773645e-05, + "loss": 1.845, + "step": 3017 + }, + { + "epoch": 0.5481417576679456, + "grad_norm": 0.5234648585319519, + "learning_rate": 4.501086577489634e-05, + "loss": 1.8458, + "step": 3018 + }, + { + "epoch": 0.5483233818421231, + "grad_norm": 0.36320754885673523, + "learning_rate": 4.498144351507272e-05, + "loss": 1.7093, + "step": 3019 + }, + { + "epoch": 0.5485050060163008, + "grad_norm": 0.3720032572746277, + "learning_rate": 4.495202301059382e-05, + "loss": 1.6817, + "step": 3020 + }, + { + "epoch": 0.5486866301904784, + "grad_norm": 0.37255483865737915, + "learning_rate": 4.492260427175007e-05, + "loss": 1.5922, + "step": 3021 + }, + { + "epoch": 0.5488682543646559, + "grad_norm": 0.8174536824226379, + "learning_rate": 4.489318730883127e-05, + "loss": 1.8362, + "step": 3022 + }, + { + "epoch": 0.5490498785388335, + "grad_norm": 0.4399205148220062, + "learning_rate": 4.486377213212666e-05, + "loss": 1.7618, + "step": 3023 + }, + { + "epoch": 0.5492315027130111, + "grad_norm": 0.4642428755760193, + "learning_rate": 4.4834358751924785e-05, + "loss": 1.7557, + "step": 3024 + }, + { + "epoch": 0.5494131268871887, + "grad_norm": 0.37069132924079895, + "learning_rate": 4.480494717851359e-05, + "loss": 1.7165, + "step": 3025 + }, + { + "epoch": 0.5495947510613662, + "grad_norm": 0.5889238119125366, + "learning_rate": 4.477553742218035e-05, + "loss": 1.7065, + "step": 3026 + }, + { + "epoch": 0.5497763752355439, + "grad_norm": 0.40545952320098877, + "learning_rate": 4.4746129493211816e-05, + "loss": 1.6491, + "step": 3027 + }, + { + "epoch": 0.5499579994097215, + "grad_norm": 0.3318103849887848, + "learning_rate": 4.471672340189396e-05, + "loss": 1.7286, + "step": 3028 + }, + { + "epoch": 0.550139623583899, + "grad_norm": 0.4129540026187897, + "learning_rate": 4.4687319158512215e-05, + "loss": 1.8589, + "step": 3029 + }, + { + "epoch": 0.5503212477580766, + "grad_norm": 0.38841700553894043, + "learning_rate": 4.4657916773351295e-05, + "loss": 1.7562, + "step": 3030 + }, + { + "epoch": 0.5505028719322542, + "grad_norm": 0.36347073316574097, + "learning_rate": 4.4628516256695305e-05, + "loss": 1.8081, + "step": 3031 + }, + { + "epoch": 0.5506844961064318, + "grad_norm": 0.3503294587135315, + "learning_rate": 4.4599117618827714e-05, + "loss": 1.6039, + "step": 3032 + }, + { + "epoch": 0.5508661202806093, + "grad_norm": 0.3276011049747467, + "learning_rate": 4.45697208700313e-05, + "loss": 1.8396, + "step": 3033 + }, + { + "epoch": 0.5510477444547869, + "grad_norm": 0.47073838114738464, + "learning_rate": 4.4540326020588154e-05, + "loss": 2.0044, + "step": 3034 + }, + { + "epoch": 0.5512293686289645, + "grad_norm": 0.33397236466407776, + "learning_rate": 4.451093308077976e-05, + "loss": 1.7842, + "step": 3035 + }, + { + "epoch": 0.5514109928031421, + "grad_norm": 0.4012407064437866, + "learning_rate": 4.448154206088693e-05, + "loss": 1.864, + "step": 3036 + }, + { + "epoch": 0.5515926169773196, + "grad_norm": 1.1681228876113892, + "learning_rate": 4.445215297118976e-05, + "loss": 1.9198, + "step": 3037 + }, + { + "epoch": 0.5517742411514973, + "grad_norm": 0.4616161584854126, + "learning_rate": 4.442276582196771e-05, + "loss": 1.8005, + "step": 3038 + }, + { + "epoch": 0.5519558653256749, + "grad_norm": 0.6042470932006836, + "learning_rate": 4.4393380623499556e-05, + "loss": 1.7137, + "step": 3039 + }, + { + "epoch": 0.5521374894998524, + "grad_norm": 0.4424363076686859, + "learning_rate": 4.436399738606334e-05, + "loss": 1.8482, + "step": 3040 + }, + { + "epoch": 0.55231911367403, + "grad_norm": 0.36305463314056396, + "learning_rate": 4.433461611993651e-05, + "loss": 1.6194, + "step": 3041 + }, + { + "epoch": 0.5525007378482076, + "grad_norm": 0.42508465051651, + "learning_rate": 4.430523683539577e-05, + "loss": 1.7381, + "step": 3042 + }, + { + "epoch": 0.5526823620223852, + "grad_norm": 0.36984720826148987, + "learning_rate": 4.4275859542717105e-05, + "loss": 1.6148, + "step": 3043 + }, + { + "epoch": 0.5528639861965627, + "grad_norm": 0.3983752131462097, + "learning_rate": 4.424648425217585e-05, + "loss": 1.7639, + "step": 3044 + }, + { + "epoch": 0.5530456103707403, + "grad_norm": 0.9604812264442444, + "learning_rate": 4.421711097404666e-05, + "loss": 1.7538, + "step": 3045 + }, + { + "epoch": 0.553227234544918, + "grad_norm": 0.531287431716919, + "learning_rate": 4.41877397186034e-05, + "loss": 1.6599, + "step": 3046 + }, + { + "epoch": 0.5534088587190955, + "grad_norm": 0.5279927253723145, + "learning_rate": 4.415837049611932e-05, + "loss": 1.7581, + "step": 3047 + }, + { + "epoch": 0.5535904828932731, + "grad_norm": 0.3729476034641266, + "learning_rate": 4.412900331686687e-05, + "loss": 1.7134, + "step": 3048 + }, + { + "epoch": 0.5537721070674507, + "grad_norm": 0.4706531763076782, + "learning_rate": 4.4099638191117885e-05, + "loss": 1.7073, + "step": 3049 + }, + { + "epoch": 0.5539537312416283, + "grad_norm": 0.3671492338180542, + "learning_rate": 4.40702751291434e-05, + "loss": 1.8931, + "step": 3050 + }, + { + "epoch": 0.5541353554158058, + "grad_norm": 0.5275436043739319, + "learning_rate": 4.4040914141213774e-05, + "loss": 1.6944, + "step": 3051 + }, + { + "epoch": 0.5543169795899834, + "grad_norm": 0.42474934458732605, + "learning_rate": 4.4011555237598604e-05, + "loss": 1.6966, + "step": 3052 + }, + { + "epoch": 0.554498603764161, + "grad_norm": 0.373788058757782, + "learning_rate": 4.398219842856677e-05, + "loss": 1.9961, + "step": 3053 + }, + { + "epoch": 0.5546802279383386, + "grad_norm": 0.5257490873336792, + "learning_rate": 4.395284372438648e-05, + "loss": 1.786, + "step": 3054 + }, + { + "epoch": 0.5548618521125162, + "grad_norm": 0.37978705763816833, + "learning_rate": 4.392349113532511e-05, + "loss": 1.6179, + "step": 3055 + }, + { + "epoch": 0.5550434762866937, + "grad_norm": 0.31648018956184387, + "learning_rate": 4.389414067164935e-05, + "loss": 1.9276, + "step": 3056 + }, + { + "epoch": 0.5552251004608714, + "grad_norm": 0.6888502240180969, + "learning_rate": 4.386479234362512e-05, + "loss": 1.8434, + "step": 3057 + }, + { + "epoch": 0.5554067246350489, + "grad_norm": 0.349650502204895, + "learning_rate": 4.383544616151764e-05, + "loss": 1.7804, + "step": 3058 + }, + { + "epoch": 0.5555883488092265, + "grad_norm": 0.3757147192955017, + "learning_rate": 4.3806102135591326e-05, + "loss": 1.684, + "step": 3059 + }, + { + "epoch": 0.5557699729834041, + "grad_norm": 0.3596339821815491, + "learning_rate": 4.3776760276109886e-05, + "loss": 1.7197, + "step": 3060 + }, + { + "epoch": 0.5559515971575817, + "grad_norm": 0.596581220626831, + "learning_rate": 4.374742059333621e-05, + "loss": 1.8448, + "step": 3061 + }, + { + "epoch": 0.5561332213317592, + "grad_norm": 0.41029781103134155, + "learning_rate": 4.3718083097532494e-05, + "loss": 1.7261, + "step": 3062 + }, + { + "epoch": 0.5563148455059368, + "grad_norm": 0.33367544412612915, + "learning_rate": 4.3688747798960144e-05, + "loss": 1.7165, + "step": 3063 + }, + { + "epoch": 0.5564964696801145, + "grad_norm": 0.3735615611076355, + "learning_rate": 4.3659414707879775e-05, + "loss": 1.7183, + "step": 3064 + }, + { + "epoch": 0.556678093854292, + "grad_norm": 0.3822565972805023, + "learning_rate": 4.363008383455124e-05, + "loss": 1.8202, + "step": 3065 + }, + { + "epoch": 0.5568597180284696, + "grad_norm": 0.32675305008888245, + "learning_rate": 4.360075518923362e-05, + "loss": 1.7073, + "step": 3066 + }, + { + "epoch": 0.5570413422026472, + "grad_norm": 0.39790624380111694, + "learning_rate": 4.3571428782185254e-05, + "loss": 1.6414, + "step": 3067 + }, + { + "epoch": 0.5572229663768248, + "grad_norm": 0.38399696350097656, + "learning_rate": 4.354210462366364e-05, + "loss": 1.6777, + "step": 3068 + }, + { + "epoch": 0.5574045905510023, + "grad_norm": 0.4035964012145996, + "learning_rate": 4.3512782723925516e-05, + "loss": 1.8626, + "step": 3069 + }, + { + "epoch": 0.5575862147251799, + "grad_norm": 0.6398143768310547, + "learning_rate": 4.3483463093226815e-05, + "loss": 1.9636, + "step": 3070 + }, + { + "epoch": 0.5577678388993575, + "grad_norm": 0.35253065824508667, + "learning_rate": 4.345414574182272e-05, + "loss": 1.7514, + "step": 3071 + }, + { + "epoch": 0.5579494630735351, + "grad_norm": 0.4357300102710724, + "learning_rate": 4.342483067996756e-05, + "loss": 1.7492, + "step": 3072 + }, + { + "epoch": 0.5581310872477127, + "grad_norm": 0.31384000182151794, + "learning_rate": 4.3395517917914895e-05, + "loss": 1.6363, + "step": 3073 + }, + { + "epoch": 0.5583127114218902, + "grad_norm": 0.3592666685581207, + "learning_rate": 4.336620746591746e-05, + "loss": 1.6125, + "step": 3074 + }, + { + "epoch": 0.5584943355960679, + "grad_norm": 0.3414047062397003, + "learning_rate": 4.333689933422723e-05, + "loss": 1.8255, + "step": 3075 + }, + { + "epoch": 0.5586759597702454, + "grad_norm": 0.4308973252773285, + "learning_rate": 4.330759353309532e-05, + "loss": 1.6917, + "step": 3076 + }, + { + "epoch": 0.558857583944423, + "grad_norm": 0.9381508827209473, + "learning_rate": 4.327829007277204e-05, + "loss": 1.9667, + "step": 3077 + }, + { + "epoch": 0.5590392081186006, + "grad_norm": 0.5937433242797852, + "learning_rate": 4.324898896350689e-05, + "loss": 1.7481, + "step": 3078 + }, + { + "epoch": 0.5592208322927782, + "grad_norm": 1.2213655710220337, + "learning_rate": 4.321969021554852e-05, + "loss": 1.8065, + "step": 3079 + }, + { + "epoch": 0.5594024564669557, + "grad_norm": 0.7346343398094177, + "learning_rate": 4.319039383914482e-05, + "loss": 2.0194, + "step": 3080 + }, + { + "epoch": 0.5595840806411333, + "grad_norm": 0.45549121499061584, + "learning_rate": 4.316109984454278e-05, + "loss": 1.7332, + "step": 3081 + }, + { + "epoch": 0.559765704815311, + "grad_norm": 0.40990790724754333, + "learning_rate": 4.31318082419886e-05, + "loss": 1.9136, + "step": 3082 + }, + { + "epoch": 0.5599473289894885, + "grad_norm": 0.6663644313812256, + "learning_rate": 4.3102519041727596e-05, + "loss": 1.7457, + "step": 3083 + }, + { + "epoch": 0.5601289531636661, + "grad_norm": 1.0199326276779175, + "learning_rate": 4.307323225400432e-05, + "loss": 1.6279, + "step": 3084 + }, + { + "epoch": 0.5603105773378436, + "grad_norm": 0.35918015241622925, + "learning_rate": 4.304394788906242e-05, + "loss": 1.691, + "step": 3085 + }, + { + "epoch": 0.5604922015120213, + "grad_norm": 0.47701966762542725, + "learning_rate": 4.301466595714472e-05, + "loss": 1.8062, + "step": 3086 + }, + { + "epoch": 0.5606738256861988, + "grad_norm": 0.36877626180648804, + "learning_rate": 4.298538646849315e-05, + "loss": 1.8047, + "step": 3087 + }, + { + "epoch": 0.5608554498603764, + "grad_norm": 0.42370444536209106, + "learning_rate": 4.2956109433348844e-05, + "loss": 1.9133, + "step": 3088 + }, + { + "epoch": 0.561037074034554, + "grad_norm": 0.47274646162986755, + "learning_rate": 4.292683486195208e-05, + "loss": 1.8605, + "step": 3089 + }, + { + "epoch": 0.5612186982087316, + "grad_norm": 0.4934017062187195, + "learning_rate": 4.289756276454222e-05, + "loss": 1.721, + "step": 3090 + }, + { + "epoch": 0.5614003223829092, + "grad_norm": 1.3612183332443237, + "learning_rate": 4.2868293151357806e-05, + "loss": 1.7816, + "step": 3091 + }, + { + "epoch": 0.5615819465570867, + "grad_norm": 0.46443653106689453, + "learning_rate": 4.283902603263646e-05, + "loss": 1.6706, + "step": 3092 + }, + { + "epoch": 0.5617635707312644, + "grad_norm": 0.36967116594314575, + "learning_rate": 4.280976141861501e-05, + "loss": 1.699, + "step": 3093 + }, + { + "epoch": 0.5619451949054419, + "grad_norm": 0.506899893283844, + "learning_rate": 4.278049931952937e-05, + "loss": 1.8362, + "step": 3094 + }, + { + "epoch": 0.5621268190796195, + "grad_norm": 0.4592292904853821, + "learning_rate": 4.275123974561453e-05, + "loss": 1.8751, + "step": 3095 + }, + { + "epoch": 0.562308443253797, + "grad_norm": 0.7786725163459778, + "learning_rate": 4.2721982707104635e-05, + "loss": 1.7194, + "step": 3096 + }, + { + "epoch": 0.5624900674279747, + "grad_norm": 0.31210023164749146, + "learning_rate": 4.269272821423298e-05, + "loss": 1.8617, + "step": 3097 + }, + { + "epoch": 0.5626716916021522, + "grad_norm": 0.4271332025527954, + "learning_rate": 4.2663476277231915e-05, + "loss": 1.7421, + "step": 3098 + }, + { + "epoch": 0.5628533157763298, + "grad_norm": 0.3322439193725586, + "learning_rate": 4.263422690633292e-05, + "loss": 1.6639, + "step": 3099 + }, + { + "epoch": 0.5630349399505075, + "grad_norm": 0.6006413102149963, + "learning_rate": 4.260498011176657e-05, + "loss": 1.6851, + "step": 3100 + }, + { + "epoch": 0.563216564124685, + "grad_norm": 0.3098527193069458, + "learning_rate": 4.2575735903762513e-05, + "loss": 1.6995, + "step": 3101 + }, + { + "epoch": 0.5633981882988626, + "grad_norm": 0.39940145611763, + "learning_rate": 4.254649429254956e-05, + "loss": 1.7699, + "step": 3102 + }, + { + "epoch": 0.5635798124730401, + "grad_norm": 0.7841716408729553, + "learning_rate": 4.2517255288355566e-05, + "loss": 1.8228, + "step": 3103 + }, + { + "epoch": 0.5637614366472178, + "grad_norm": 0.37807977199554443, + "learning_rate": 4.2488018901407475e-05, + "loss": 1.6928, + "step": 3104 + }, + { + "epoch": 0.5639430608213953, + "grad_norm": 0.4170122742652893, + "learning_rate": 4.2458785141931314e-05, + "loss": 1.7446, + "step": 3105 + }, + { + "epoch": 0.5641246849955729, + "grad_norm": 0.3593517243862152, + "learning_rate": 4.242955402015221e-05, + "loss": 1.6609, + "step": 3106 + }, + { + "epoch": 0.5643063091697504, + "grad_norm": 0.3077159523963928, + "learning_rate": 4.240032554629436e-05, + "loss": 1.5956, + "step": 3107 + }, + { + "epoch": 0.5644879333439281, + "grad_norm": 0.42333143949508667, + "learning_rate": 4.2371099730581024e-05, + "loss": 1.6585, + "step": 3108 + }, + { + "epoch": 0.5646695575181057, + "grad_norm": 0.6731040477752686, + "learning_rate": 4.2341876583234534e-05, + "loss": 2.0455, + "step": 3109 + }, + { + "epoch": 0.5648511816922832, + "grad_norm": 0.44226107001304626, + "learning_rate": 4.2312656114476325e-05, + "loss": 1.6948, + "step": 3110 + }, + { + "epoch": 0.5650328058664609, + "grad_norm": 0.36984243988990784, + "learning_rate": 4.228343833452684e-05, + "loss": 1.6548, + "step": 3111 + }, + { + "epoch": 0.5652144300406384, + "grad_norm": 0.6221312284469604, + "learning_rate": 4.2254223253605604e-05, + "loss": 1.643, + "step": 3112 + }, + { + "epoch": 0.565396054214816, + "grad_norm": 0.3887101709842682, + "learning_rate": 4.222501088193122e-05, + "loss": 1.6681, + "step": 3113 + }, + { + "epoch": 0.5655776783889935, + "grad_norm": 0.48307082056999207, + "learning_rate": 4.219580122972128e-05, + "loss": 1.5971, + "step": 3114 + }, + { + "epoch": 0.5657593025631712, + "grad_norm": 0.3757035732269287, + "learning_rate": 4.216659430719252e-05, + "loss": 1.761, + "step": 3115 + }, + { + "epoch": 0.5659409267373487, + "grad_norm": 0.38280725479125977, + "learning_rate": 4.2137390124560654e-05, + "loss": 1.7518, + "step": 3116 + }, + { + "epoch": 0.5661225509115263, + "grad_norm": 0.3861446678638458, + "learning_rate": 4.210818869204044e-05, + "loss": 1.8221, + "step": 3117 + }, + { + "epoch": 0.5663041750857039, + "grad_norm": 0.3802427649497986, + "learning_rate": 4.2078990019845685e-05, + "loss": 1.7708, + "step": 3118 + }, + { + "epoch": 0.5664857992598815, + "grad_norm": 0.3916220963001251, + "learning_rate": 4.204979411818927e-05, + "loss": 1.7036, + "step": 3119 + }, + { + "epoch": 0.5666674234340591, + "grad_norm": 1.064565658569336, + "learning_rate": 4.2020600997283035e-05, + "loss": 1.8059, + "step": 3120 + }, + { + "epoch": 0.5668490476082366, + "grad_norm": 0.40302518010139465, + "learning_rate": 4.1991410667337896e-05, + "loss": 1.5994, + "step": 3121 + }, + { + "epoch": 0.5670306717824143, + "grad_norm": 0.35955309867858887, + "learning_rate": 4.1962223138563774e-05, + "loss": 1.7329, + "step": 3122 + }, + { + "epoch": 0.5672122959565918, + "grad_norm": 0.42209064960479736, + "learning_rate": 4.193303842116959e-05, + "loss": 1.7667, + "step": 3123 + }, + { + "epoch": 0.5673939201307694, + "grad_norm": 0.26646435260772705, + "learning_rate": 4.190385652536336e-05, + "loss": 1.6579, + "step": 3124 + }, + { + "epoch": 0.5675755443049469, + "grad_norm": 0.3734237551689148, + "learning_rate": 4.187467746135204e-05, + "loss": 1.8069, + "step": 3125 + }, + { + "epoch": 0.5677571684791246, + "grad_norm": 0.36509665846824646, + "learning_rate": 4.18455012393416e-05, + "loss": 1.5594, + "step": 3126 + }, + { + "epoch": 0.5679387926533022, + "grad_norm": 0.3226165473461151, + "learning_rate": 4.181632786953702e-05, + "loss": 1.7206, + "step": 3127 + }, + { + "epoch": 0.5681204168274797, + "grad_norm": 0.8226976990699768, + "learning_rate": 4.178715736214234e-05, + "loss": 1.7855, + "step": 3128 + }, + { + "epoch": 0.5683020410016573, + "grad_norm": 0.33051368594169617, + "learning_rate": 4.175798972736053e-05, + "loss": 1.8385, + "step": 3129 + }, + { + "epoch": 0.5684836651758349, + "grad_norm": 0.45441320538520813, + "learning_rate": 4.1728824975393565e-05, + "loss": 1.826, + "step": 3130 + }, + { + "epoch": 0.5686652893500125, + "grad_norm": 0.5435267686843872, + "learning_rate": 4.1699663116442434e-05, + "loss": 1.9078, + "step": 3131 + }, + { + "epoch": 0.56884691352419, + "grad_norm": 0.36156409978866577, + "learning_rate": 4.167050416070712e-05, + "loss": 1.7229, + "step": 3132 + }, + { + "epoch": 0.5690285376983677, + "grad_norm": 0.41830435395240784, + "learning_rate": 4.164134811838655e-05, + "loss": 1.9401, + "step": 3133 + }, + { + "epoch": 0.5692101618725453, + "grad_norm": 0.5447048544883728, + "learning_rate": 4.161219499967869e-05, + "loss": 1.6283, + "step": 3134 + }, + { + "epoch": 0.5693917860467228, + "grad_norm": 0.439449280500412, + "learning_rate": 4.158304481478042e-05, + "loss": 1.7906, + "step": 3135 + }, + { + "epoch": 0.5695734102209004, + "grad_norm": 0.5891101360321045, + "learning_rate": 4.155389757388762e-05, + "loss": 1.8998, + "step": 3136 + }, + { + "epoch": 0.569755034395078, + "grad_norm": 0.4076891839504242, + "learning_rate": 4.1524753287195165e-05, + "loss": 1.856, + "step": 3137 + }, + { + "epoch": 0.5699366585692556, + "grad_norm": 0.3714604675769806, + "learning_rate": 4.149561196489689e-05, + "loss": 1.8063, + "step": 3138 + }, + { + "epoch": 0.5701182827434331, + "grad_norm": 0.7613690495491028, + "learning_rate": 4.1466473617185556e-05, + "loss": 1.7565, + "step": 3139 + }, + { + "epoch": 0.5702999069176108, + "grad_norm": 0.33339470624923706, + "learning_rate": 4.143733825425289e-05, + "loss": 1.6238, + "step": 3140 + }, + { + "epoch": 0.5704815310917883, + "grad_norm": 0.3866981863975525, + "learning_rate": 4.140820588628964e-05, + "loss": 1.7835, + "step": 3141 + }, + { + "epoch": 0.5706631552659659, + "grad_norm": 0.32379111647605896, + "learning_rate": 4.1379076523485436e-05, + "loss": 1.6483, + "step": 3142 + }, + { + "epoch": 0.5708447794401434, + "grad_norm": 0.38136905431747437, + "learning_rate": 4.134995017602887e-05, + "loss": 1.5246, + "step": 3143 + }, + { + "epoch": 0.5710264036143211, + "grad_norm": 0.39482972025871277, + "learning_rate": 4.132082685410748e-05, + "loss": 1.7716, + "step": 3144 + }, + { + "epoch": 0.5712080277884987, + "grad_norm": 0.5119278430938721, + "learning_rate": 4.1291706567907794e-05, + "loss": 1.6681, + "step": 3145 + }, + { + "epoch": 0.5713896519626762, + "grad_norm": 0.29233333468437195, + "learning_rate": 4.126258932761522e-05, + "loss": 1.7991, + "step": 3146 + }, + { + "epoch": 0.5715712761368538, + "grad_norm": 0.6026432514190674, + "learning_rate": 4.1233475143414105e-05, + "loss": 1.9085, + "step": 3147 + }, + { + "epoch": 0.5717529003110314, + "grad_norm": 0.9535061120986938, + "learning_rate": 4.120436402548776e-05, + "loss": 1.7077, + "step": 3148 + }, + { + "epoch": 0.571934524485209, + "grad_norm": 0.3570280075073242, + "learning_rate": 4.117525598401838e-05, + "loss": 1.8077, + "step": 3149 + }, + { + "epoch": 0.5721161486593865, + "grad_norm": 0.4747987389564514, + "learning_rate": 4.1146151029187144e-05, + "loss": 1.6077, + "step": 3150 + }, + { + "epoch": 0.5722977728335642, + "grad_norm": 0.44376784563064575, + "learning_rate": 4.1117049171174104e-05, + "loss": 2.0593, + "step": 3151 + }, + { + "epoch": 0.5724793970077418, + "grad_norm": 0.47592246532440186, + "learning_rate": 4.1087950420158225e-05, + "loss": 1.6527, + "step": 3152 + }, + { + "epoch": 0.5726610211819193, + "grad_norm": 0.49360495805740356, + "learning_rate": 4.105885478631741e-05, + "loss": 1.743, + "step": 3153 + }, + { + "epoch": 0.5728426453560969, + "grad_norm": 0.39720043540000916, + "learning_rate": 4.102976227982848e-05, + "loss": 1.6537, + "step": 3154 + }, + { + "epoch": 0.5730242695302745, + "grad_norm": 0.3961147964000702, + "learning_rate": 4.1000672910867124e-05, + "loss": 1.8362, + "step": 3155 + }, + { + "epoch": 0.5732058937044521, + "grad_norm": 0.44962048530578613, + "learning_rate": 4.097158668960798e-05, + "loss": 1.8642, + "step": 3156 + }, + { + "epoch": 0.5733875178786296, + "grad_norm": 0.43676361441612244, + "learning_rate": 4.0942503626224514e-05, + "loss": 1.816, + "step": 3157 + }, + { + "epoch": 0.5735691420528072, + "grad_norm": 1.0404932498931885, + "learning_rate": 4.091342373088919e-05, + "loss": 1.8149, + "step": 3158 + }, + { + "epoch": 0.5737507662269848, + "grad_norm": 0.37696680426597595, + "learning_rate": 4.088434701377326e-05, + "loss": 1.7948, + "step": 3159 + }, + { + "epoch": 0.5739323904011624, + "grad_norm": 0.38608360290527344, + "learning_rate": 4.085527348504696e-05, + "loss": 1.6473, + "step": 3160 + }, + { + "epoch": 0.57411401457534, + "grad_norm": 0.7836238145828247, + "learning_rate": 4.082620315487931e-05, + "loss": 1.823, + "step": 3161 + }, + { + "epoch": 0.5742956387495176, + "grad_norm": 0.6360437870025635, + "learning_rate": 4.079713603343828e-05, + "loss": 1.938, + "step": 3162 + }, + { + "epoch": 0.5744772629236952, + "grad_norm": 0.514967143535614, + "learning_rate": 4.076807213089073e-05, + "loss": 1.6954, + "step": 3163 + }, + { + "epoch": 0.5746588870978727, + "grad_norm": 0.4463537931442261, + "learning_rate": 4.0739011457402346e-05, + "loss": 1.7663, + "step": 3164 + }, + { + "epoch": 0.5748405112720503, + "grad_norm": 0.36427947878837585, + "learning_rate": 4.0709954023137703e-05, + "loss": 1.8707, + "step": 3165 + }, + { + "epoch": 0.5750221354462279, + "grad_norm": 0.34270310401916504, + "learning_rate": 4.068089983826023e-05, + "loss": 1.653, + "step": 3166 + }, + { + "epoch": 0.5752037596204055, + "grad_norm": 0.4984973073005676, + "learning_rate": 4.065184891293227e-05, + "loss": 1.7958, + "step": 3167 + }, + { + "epoch": 0.575385383794583, + "grad_norm": 0.6870991587638855, + "learning_rate": 4.0622801257314945e-05, + "loss": 1.6833, + "step": 3168 + }, + { + "epoch": 0.5755670079687606, + "grad_norm": 0.9280707836151123, + "learning_rate": 4.059375688156832e-05, + "loss": 1.8825, + "step": 3169 + }, + { + "epoch": 0.5757486321429383, + "grad_norm": 0.41666123270988464, + "learning_rate": 4.056471579585125e-05, + "loss": 1.5798, + "step": 3170 + }, + { + "epoch": 0.5759302563171158, + "grad_norm": 0.5221550464630127, + "learning_rate": 4.053567801032144e-05, + "loss": 1.6189, + "step": 3171 + }, + { + "epoch": 0.5761118804912934, + "grad_norm": 0.3321922719478607, + "learning_rate": 4.050664353513551e-05, + "loss": 1.5503, + "step": 3172 + }, + { + "epoch": 0.576293504665471, + "grad_norm": 0.3960914611816406, + "learning_rate": 4.047761238044884e-05, + "loss": 1.7552, + "step": 3173 + }, + { + "epoch": 0.5764751288396486, + "grad_norm": 0.322042316198349, + "learning_rate": 4.044858455641568e-05, + "loss": 1.7192, + "step": 3174 + }, + { + "epoch": 0.5766567530138261, + "grad_norm": 0.3711080849170685, + "learning_rate": 4.041956007318911e-05, + "loss": 1.5052, + "step": 3175 + }, + { + "epoch": 0.5768383771880037, + "grad_norm": 0.3592276871204376, + "learning_rate": 4.039053894092108e-05, + "loss": 1.836, + "step": 3176 + }, + { + "epoch": 0.5770200013621813, + "grad_norm": 0.37354862689971924, + "learning_rate": 4.03615211697623e-05, + "loss": 1.7455, + "step": 3177 + }, + { + "epoch": 0.5772016255363589, + "grad_norm": 0.42526692152023315, + "learning_rate": 4.033250676986238e-05, + "loss": 1.808, + "step": 3178 + }, + { + "epoch": 0.5773832497105365, + "grad_norm": 0.4479105472564697, + "learning_rate": 4.030349575136967e-05, + "loss": 1.5272, + "step": 3179 + }, + { + "epoch": 0.577564873884714, + "grad_norm": 0.535372793674469, + "learning_rate": 4.027448812443139e-05, + "loss": 1.6989, + "step": 3180 + }, + { + "epoch": 0.5777464980588917, + "grad_norm": 0.40864700078964233, + "learning_rate": 4.0245483899193595e-05, + "loss": 1.7446, + "step": 3181 + }, + { + "epoch": 0.5779281222330692, + "grad_norm": 0.3029618561267853, + "learning_rate": 4.021648308580108e-05, + "loss": 1.5344, + "step": 3182 + }, + { + "epoch": 0.5781097464072468, + "grad_norm": 0.495913565158844, + "learning_rate": 4.018748569439749e-05, + "loss": 1.5826, + "step": 3183 + }, + { + "epoch": 0.5782913705814244, + "grad_norm": 0.42684775590896606, + "learning_rate": 4.015849173512525e-05, + "loss": 1.8485, + "step": 3184 + }, + { + "epoch": 0.578472994755602, + "grad_norm": 0.387961745262146, + "learning_rate": 4.012950121812565e-05, + "loss": 1.6993, + "step": 3185 + }, + { + "epoch": 0.5786546189297795, + "grad_norm": 0.38299426436424255, + "learning_rate": 4.010051415353869e-05, + "loss": 1.6826, + "step": 3186 + }, + { + "epoch": 0.5788362431039571, + "grad_norm": 0.7178893685340881, + "learning_rate": 4.0071530551503226e-05, + "loss": 1.7987, + "step": 3187 + }, + { + "epoch": 0.5790178672781348, + "grad_norm": 0.31592512130737305, + "learning_rate": 4.0042550422156835e-05, + "loss": 1.8703, + "step": 3188 + }, + { + "epoch": 0.5791994914523123, + "grad_norm": 0.3410983681678772, + "learning_rate": 4.001357377563596e-05, + "loss": 1.7465, + "step": 3189 + }, + { + "epoch": 0.5793811156264899, + "grad_norm": 0.5199748277664185, + "learning_rate": 3.998460062207578e-05, + "loss": 1.7003, + "step": 3190 + }, + { + "epoch": 0.5795627398006674, + "grad_norm": 0.502461314201355, + "learning_rate": 3.995563097161026e-05, + "loss": 1.6619, + "step": 3191 + }, + { + "epoch": 0.5797443639748451, + "grad_norm": 0.6543825268745422, + "learning_rate": 3.99266648343721e-05, + "loss": 1.7041, + "step": 3192 + }, + { + "epoch": 0.5799259881490226, + "grad_norm": 0.5346996784210205, + "learning_rate": 3.989770222049286e-05, + "loss": 1.8658, + "step": 3193 + }, + { + "epoch": 0.5801076123232002, + "grad_norm": 0.3722122013568878, + "learning_rate": 3.986874314010282e-05, + "loss": 1.7043, + "step": 3194 + }, + { + "epoch": 0.5802892364973778, + "grad_norm": 1.1967912912368774, + "learning_rate": 3.983978760333097e-05, + "loss": 2.0266, + "step": 3195 + }, + { + "epoch": 0.5804708606715554, + "grad_norm": 0.4607936143875122, + "learning_rate": 3.9810835620305176e-05, + "loss": 1.8145, + "step": 3196 + }, + { + "epoch": 0.580652484845733, + "grad_norm": 0.5419071316719055, + "learning_rate": 3.978188720115194e-05, + "loss": 1.8601, + "step": 3197 + }, + { + "epoch": 0.5808341090199105, + "grad_norm": 0.3670607805252075, + "learning_rate": 3.9752942355996616e-05, + "loss": 1.7268, + "step": 3198 + }, + { + "epoch": 0.5810157331940882, + "grad_norm": 0.3478156626224518, + "learning_rate": 3.972400109496324e-05, + "loss": 1.6886, + "step": 3199 + }, + { + "epoch": 0.5811973573682657, + "grad_norm": 1.3344593048095703, + "learning_rate": 3.9695063428174644e-05, + "loss": 1.8563, + "step": 3200 + }, + { + "epoch": 0.5813789815424433, + "grad_norm": 0.3498849868774414, + "learning_rate": 3.966612936575235e-05, + "loss": 1.8313, + "step": 3201 + }, + { + "epoch": 0.5815606057166209, + "grad_norm": 0.34745803475379944, + "learning_rate": 3.963719891781668e-05, + "loss": 1.5506, + "step": 3202 + }, + { + "epoch": 0.5817422298907985, + "grad_norm": 0.6178663372993469, + "learning_rate": 3.960827209448666e-05, + "loss": 1.6006, + "step": 3203 + }, + { + "epoch": 0.581923854064976, + "grad_norm": 0.39800578355789185, + "learning_rate": 3.9579348905880026e-05, + "loss": 1.7849, + "step": 3204 + }, + { + "epoch": 0.5821054782391536, + "grad_norm": 0.3593049943447113, + "learning_rate": 3.9550429362113286e-05, + "loss": 1.714, + "step": 3205 + }, + { + "epoch": 0.5822871024133313, + "grad_norm": 0.4393908381462097, + "learning_rate": 3.952151347330163e-05, + "loss": 1.8166, + "step": 3206 + }, + { + "epoch": 0.5824687265875088, + "grad_norm": 0.2860795557498932, + "learning_rate": 3.949260124955903e-05, + "loss": 1.6224, + "step": 3207 + }, + { + "epoch": 0.5826503507616864, + "grad_norm": 0.42104923725128174, + "learning_rate": 3.946369270099811e-05, + "loss": 1.6887, + "step": 3208 + }, + { + "epoch": 0.5828319749358639, + "grad_norm": 0.36777228116989136, + "learning_rate": 3.943478783773025e-05, + "loss": 1.7068, + "step": 3209 + }, + { + "epoch": 0.5830135991100416, + "grad_norm": 0.3724530041217804, + "learning_rate": 3.940588666986549e-05, + "loss": 1.6301, + "step": 3210 + }, + { + "epoch": 0.5831952232842191, + "grad_norm": 0.38686349987983704, + "learning_rate": 3.937698920751268e-05, + "loss": 1.8674, + "step": 3211 + }, + { + "epoch": 0.5833768474583967, + "grad_norm": 0.36016014218330383, + "learning_rate": 3.934809546077928e-05, + "loss": 1.8308, + "step": 3212 + }, + { + "epoch": 0.5835584716325743, + "grad_norm": 0.5937172174453735, + "learning_rate": 3.931920543977147e-05, + "loss": 1.7824, + "step": 3213 + }, + { + "epoch": 0.5837400958067519, + "grad_norm": 0.5479726195335388, + "learning_rate": 3.9290319154594136e-05, + "loss": 1.6413, + "step": 3214 + }, + { + "epoch": 0.5839217199809295, + "grad_norm": 0.38819602131843567, + "learning_rate": 3.926143661535087e-05, + "loss": 1.7652, + "step": 3215 + }, + { + "epoch": 0.584103344155107, + "grad_norm": 0.3445228636264801, + "learning_rate": 3.9232557832143955e-05, + "loss": 1.6225, + "step": 3216 + }, + { + "epoch": 0.5842849683292847, + "grad_norm": 0.6998990774154663, + "learning_rate": 3.9203682815074316e-05, + "loss": 1.5225, + "step": 3217 + }, + { + "epoch": 0.5844665925034622, + "grad_norm": 0.344250351190567, + "learning_rate": 3.917481157424163e-05, + "loss": 1.6992, + "step": 3218 + }, + { + "epoch": 0.5846482166776398, + "grad_norm": 0.36061426997184753, + "learning_rate": 3.914594411974416e-05, + "loss": 1.6319, + "step": 3219 + }, + { + "epoch": 0.5848298408518173, + "grad_norm": 0.3509247303009033, + "learning_rate": 3.9117080461678944e-05, + "loss": 1.7385, + "step": 3220 + }, + { + "epoch": 0.585011465025995, + "grad_norm": 0.4389633536338806, + "learning_rate": 3.9088220610141655e-05, + "loss": 1.506, + "step": 3221 + }, + { + "epoch": 0.5851930892001725, + "grad_norm": 0.41941842436790466, + "learning_rate": 3.9059364575226596e-05, + "loss": 1.6015, + "step": 3222 + }, + { + "epoch": 0.5853747133743501, + "grad_norm": 0.44490745663642883, + "learning_rate": 3.9030512367026774e-05, + "loss": 1.8996, + "step": 3223 + }, + { + "epoch": 0.5855563375485278, + "grad_norm": 0.5549112558364868, + "learning_rate": 3.9001663995633855e-05, + "loss": 1.7448, + "step": 3224 + }, + { + "epoch": 0.5857379617227053, + "grad_norm": 0.782598078250885, + "learning_rate": 3.897281947113817e-05, + "loss": 1.8186, + "step": 3225 + }, + { + "epoch": 0.5859195858968829, + "grad_norm": 0.3501138389110565, + "learning_rate": 3.894397880362868e-05, + "loss": 1.893, + "step": 3226 + }, + { + "epoch": 0.5861012100710604, + "grad_norm": 0.4228905439376831, + "learning_rate": 3.891514200319299e-05, + "loss": 1.6235, + "step": 3227 + }, + { + "epoch": 0.5862828342452381, + "grad_norm": 0.43510961532592773, + "learning_rate": 3.8886309079917415e-05, + "loss": 1.8345, + "step": 3228 + }, + { + "epoch": 0.5864644584194156, + "grad_norm": 0.39325910806655884, + "learning_rate": 3.8857480043886854e-05, + "loss": 1.6439, + "step": 3229 + }, + { + "epoch": 0.5866460825935932, + "grad_norm": 0.4605485796928406, + "learning_rate": 3.8828654905184846e-05, + "loss": 1.5645, + "step": 3230 + }, + { + "epoch": 0.5868277067677707, + "grad_norm": 0.363793283700943, + "learning_rate": 3.87998336738936e-05, + "loss": 1.7458, + "step": 3231 + }, + { + "epoch": 0.5870093309419484, + "grad_norm": 0.3468109667301178, + "learning_rate": 3.877101636009393e-05, + "loss": 1.7712, + "step": 3232 + }, + { + "epoch": 0.587190955116126, + "grad_norm": 0.5899457931518555, + "learning_rate": 3.87422029738653e-05, + "loss": 1.7007, + "step": 3233 + }, + { + "epoch": 0.5873725792903035, + "grad_norm": 0.42132118344306946, + "learning_rate": 3.871339352528581e-05, + "loss": 1.4704, + "step": 3234 + }, + { + "epoch": 0.5875542034644812, + "grad_norm": 1.085288643836975, + "learning_rate": 3.868458802443213e-05, + "loss": 1.9485, + "step": 3235 + }, + { + "epoch": 0.5877358276386587, + "grad_norm": 0.5399613976478577, + "learning_rate": 3.865578648137959e-05, + "loss": 1.73, + "step": 3236 + }, + { + "epoch": 0.5879174518128363, + "grad_norm": 0.4336874186992645, + "learning_rate": 3.8626988906202165e-05, + "loss": 1.6299, + "step": 3237 + }, + { + "epoch": 0.5880990759870138, + "grad_norm": 0.39436566829681396, + "learning_rate": 3.859819530897239e-05, + "loss": 1.7216, + "step": 3238 + }, + { + "epoch": 0.5882807001611915, + "grad_norm": 0.7763102054595947, + "learning_rate": 3.85694056997614e-05, + "loss": 1.7128, + "step": 3239 + }, + { + "epoch": 0.588462324335369, + "grad_norm": 0.35827556252479553, + "learning_rate": 3.854062008863897e-05, + "loss": 1.7361, + "step": 3240 + }, + { + "epoch": 0.5886439485095466, + "grad_norm": 0.39082083106040955, + "learning_rate": 3.851183848567351e-05, + "loss": 1.7116, + "step": 3241 + }, + { + "epoch": 0.5888255726837242, + "grad_norm": 0.5121601819992065, + "learning_rate": 3.848306090093193e-05, + "loss": 1.6372, + "step": 3242 + }, + { + "epoch": 0.5890071968579018, + "grad_norm": 0.49307599663734436, + "learning_rate": 3.845428734447983e-05, + "loss": 1.8841, + "step": 3243 + }, + { + "epoch": 0.5891888210320794, + "grad_norm": 0.504595160484314, + "learning_rate": 3.842551782638134e-05, + "loss": 1.6925, + "step": 3244 + }, + { + "epoch": 0.5893704452062569, + "grad_norm": 0.39040353894233704, + "learning_rate": 3.839675235669918e-05, + "loss": 1.5498, + "step": 3245 + }, + { + "epoch": 0.5895520693804346, + "grad_norm": 0.4950917959213257, + "learning_rate": 3.8367990945494715e-05, + "loss": 1.8611, + "step": 3246 + }, + { + "epoch": 0.5897336935546121, + "grad_norm": 0.4270734488964081, + "learning_rate": 3.833923360282783e-05, + "loss": 1.78, + "step": 3247 + }, + { + "epoch": 0.5899153177287897, + "grad_norm": 1.341448426246643, + "learning_rate": 3.8310480338756994e-05, + "loss": 1.8103, + "step": 3248 + }, + { + "epoch": 0.5900969419029672, + "grad_norm": 1.9807018041610718, + "learning_rate": 3.828173116333925e-05, + "loss": 1.7296, + "step": 3249 + }, + { + "epoch": 0.5902785660771449, + "grad_norm": 0.4910638928413391, + "learning_rate": 3.825298608663028e-05, + "loss": 1.7937, + "step": 3250 + }, + { + "epoch": 0.5904601902513225, + "grad_norm": 0.3633494973182678, + "learning_rate": 3.822424511868421e-05, + "loss": 1.7745, + "step": 3251 + }, + { + "epoch": 0.5906418144255, + "grad_norm": 0.5556860566139221, + "learning_rate": 3.819550826955383e-05, + "loss": 1.8891, + "step": 3252 + }, + { + "epoch": 0.5908234385996776, + "grad_norm": 0.31251266598701477, + "learning_rate": 3.816677554929044e-05, + "loss": 1.8013, + "step": 3253 + }, + { + "epoch": 0.5910050627738552, + "grad_norm": 0.4251961410045624, + "learning_rate": 3.813804696794388e-05, + "loss": 1.7449, + "step": 3254 + }, + { + "epoch": 0.5911866869480328, + "grad_norm": 0.3884718418121338, + "learning_rate": 3.8109322535562607e-05, + "loss": 1.9231, + "step": 3255 + }, + { + "epoch": 0.5913683111222103, + "grad_norm": 0.3590061366558075, + "learning_rate": 3.8080602262193595e-05, + "loss": 1.7965, + "step": 3256 + }, + { + "epoch": 0.591549935296388, + "grad_norm": 0.30909547209739685, + "learning_rate": 3.8051886157882336e-05, + "loss": 1.4671, + "step": 3257 + }, + { + "epoch": 0.5917315594705655, + "grad_norm": 0.345048189163208, + "learning_rate": 3.8023174232672864e-05, + "loss": 1.6261, + "step": 3258 + }, + { + "epoch": 0.5919131836447431, + "grad_norm": 0.3202994763851166, + "learning_rate": 3.799446649660784e-05, + "loss": 1.8111, + "step": 3259 + }, + { + "epoch": 0.5920948078189207, + "grad_norm": 0.4438817799091339, + "learning_rate": 3.796576295972835e-05, + "loss": 1.8826, + "step": 3260 + }, + { + "epoch": 0.5922764319930983, + "grad_norm": 0.38444438576698303, + "learning_rate": 3.7937063632074036e-05, + "loss": 1.7749, + "step": 3261 + }, + { + "epoch": 0.5924580561672759, + "grad_norm": 0.3833353519439697, + "learning_rate": 3.790836852368311e-05, + "loss": 1.8388, + "step": 3262 + }, + { + "epoch": 0.5926396803414534, + "grad_norm": 0.4431373178958893, + "learning_rate": 3.7879677644592296e-05, + "loss": 1.6966, + "step": 3263 + }, + { + "epoch": 0.592821304515631, + "grad_norm": 0.29732707142829895, + "learning_rate": 3.785099100483681e-05, + "loss": 1.5885, + "step": 3264 + }, + { + "epoch": 0.5930029286898086, + "grad_norm": 0.5733619332313538, + "learning_rate": 3.7822308614450406e-05, + "loss": 1.714, + "step": 3265 + }, + { + "epoch": 0.5931845528639862, + "grad_norm": 0.45433586835861206, + "learning_rate": 3.7793630483465345e-05, + "loss": 1.8788, + "step": 3266 + }, + { + "epoch": 0.5933661770381637, + "grad_norm": 0.41477152705192566, + "learning_rate": 3.7764956621912394e-05, + "loss": 1.6727, + "step": 3267 + }, + { + "epoch": 0.5935478012123414, + "grad_norm": 0.4479207992553711, + "learning_rate": 3.773628703982086e-05, + "loss": 1.5858, + "step": 3268 + }, + { + "epoch": 0.593729425386519, + "grad_norm": 0.40840932726860046, + "learning_rate": 3.7707621747218506e-05, + "loss": 1.8089, + "step": 3269 + }, + { + "epoch": 0.5939110495606965, + "grad_norm": 0.33433252573013306, + "learning_rate": 3.7678960754131614e-05, + "loss": 1.7592, + "step": 3270 + }, + { + "epoch": 0.5940926737348741, + "grad_norm": 0.32882314920425415, + "learning_rate": 3.7650304070584955e-05, + "loss": 1.8243, + "step": 3271 + }, + { + "epoch": 0.5942742979090517, + "grad_norm": 0.4276122748851776, + "learning_rate": 3.762165170660184e-05, + "loss": 1.7036, + "step": 3272 + }, + { + "epoch": 0.5944559220832293, + "grad_norm": 0.34730902314186096, + "learning_rate": 3.7593003672204e-05, + "loss": 1.7141, + "step": 3273 + }, + { + "epoch": 0.5946375462574068, + "grad_norm": 0.4075720012187958, + "learning_rate": 3.7564359977411684e-05, + "loss": 1.7601, + "step": 3274 + }, + { + "epoch": 0.5948191704315845, + "grad_norm": 0.29629239439964294, + "learning_rate": 3.753572063224361e-05, + "loss": 1.7655, + "step": 3275 + }, + { + "epoch": 0.595000794605762, + "grad_norm": 0.30448877811431885, + "learning_rate": 3.750708564671701e-05, + "loss": 1.7627, + "step": 3276 + }, + { + "epoch": 0.5951824187799396, + "grad_norm": 0.367159366607666, + "learning_rate": 3.747845503084757e-05, + "loss": 1.6587, + "step": 3277 + }, + { + "epoch": 0.5953640429541172, + "grad_norm": 0.3898000419139862, + "learning_rate": 3.744982879464943e-05, + "loss": 1.7958, + "step": 3278 + }, + { + "epoch": 0.5955456671282948, + "grad_norm": 0.3978586196899414, + "learning_rate": 3.742120694813521e-05, + "loss": 1.5953, + "step": 3279 + }, + { + "epoch": 0.5957272913024724, + "grad_norm": 0.42545056343078613, + "learning_rate": 3.7392589501315984e-05, + "loss": 1.7766, + "step": 3280 + }, + { + "epoch": 0.5959089154766499, + "grad_norm": 0.4680168032646179, + "learning_rate": 3.736397646420135e-05, + "loss": 1.8103, + "step": 3281 + }, + { + "epoch": 0.5960905396508275, + "grad_norm": 0.45215165615081787, + "learning_rate": 3.733536784679925e-05, + "loss": 1.5738, + "step": 3282 + }, + { + "epoch": 0.5962721638250051, + "grad_norm": 0.47365114092826843, + "learning_rate": 3.7306763659116207e-05, + "loss": 1.6734, + "step": 3283 + }, + { + "epoch": 0.5964537879991827, + "grad_norm": 0.39974725246429443, + "learning_rate": 3.727816391115707e-05, + "loss": 1.72, + "step": 3284 + }, + { + "epoch": 0.5966354121733602, + "grad_norm": 0.33161699771881104, + "learning_rate": 3.7249568612925254e-05, + "loss": 1.7702, + "step": 3285 + }, + { + "epoch": 0.5968170363475379, + "grad_norm": 0.3110460937023163, + "learning_rate": 3.722097777442253e-05, + "loss": 1.6268, + "step": 3286 + }, + { + "epoch": 0.5969986605217155, + "grad_norm": 0.36931705474853516, + "learning_rate": 3.719239140564914e-05, + "loss": 1.6582, + "step": 3287 + }, + { + "epoch": 0.597180284695893, + "grad_norm": 0.34550848603248596, + "learning_rate": 3.7163809516603764e-05, + "loss": 1.7185, + "step": 3288 + }, + { + "epoch": 0.5973619088700706, + "grad_norm": 0.46524032950401306, + "learning_rate": 3.7135232117283506e-05, + "loss": 1.6096, + "step": 3289 + }, + { + "epoch": 0.5975435330442482, + "grad_norm": 0.48363450169563293, + "learning_rate": 3.710665921768394e-05, + "loss": 1.699, + "step": 3290 + }, + { + "epoch": 0.5977251572184258, + "grad_norm": 0.5256975293159485, + "learning_rate": 3.7078090827799e-05, + "loss": 1.7502, + "step": 3291 + }, + { + "epoch": 0.5979067813926033, + "grad_norm": 0.4381980299949646, + "learning_rate": 3.7049526957621084e-05, + "loss": 1.7704, + "step": 3292 + }, + { + "epoch": 0.5980884055667809, + "grad_norm": 0.4008888006210327, + "learning_rate": 3.702096761714099e-05, + "loss": 1.8165, + "step": 3293 + }, + { + "epoch": 0.5982700297409586, + "grad_norm": 0.34640738368034363, + "learning_rate": 3.6992412816347974e-05, + "loss": 1.5687, + "step": 3294 + }, + { + "epoch": 0.5984516539151361, + "grad_norm": 0.3249538540840149, + "learning_rate": 3.696386256522964e-05, + "loss": 1.7601, + "step": 3295 + }, + { + "epoch": 0.5986332780893137, + "grad_norm": 0.43844231963157654, + "learning_rate": 3.693531687377207e-05, + "loss": 1.8256, + "step": 3296 + }, + { + "epoch": 0.5988149022634913, + "grad_norm": 0.35670849680900574, + "learning_rate": 3.690677575195967e-05, + "loss": 1.6806, + "step": 3297 + }, + { + "epoch": 0.5989965264376689, + "grad_norm": 0.434283584356308, + "learning_rate": 3.6878239209775314e-05, + "loss": 1.5321, + "step": 3298 + }, + { + "epoch": 0.5991781506118464, + "grad_norm": 0.7681728601455688, + "learning_rate": 3.684970725720026e-05, + "loss": 1.9593, + "step": 3299 + }, + { + "epoch": 0.599359774786024, + "grad_norm": 0.4835064113140106, + "learning_rate": 3.682117990421415e-05, + "loss": 1.7749, + "step": 3300 + }, + { + "epoch": 0.5995413989602016, + "grad_norm": 0.400931715965271, + "learning_rate": 3.679265716079501e-05, + "loss": 1.7265, + "step": 3301 + }, + { + "epoch": 0.5997230231343792, + "grad_norm": 0.3914940655231476, + "learning_rate": 3.676413903691924e-05, + "loss": 1.5451, + "step": 3302 + }, + { + "epoch": 0.5999046473085567, + "grad_norm": 0.3174111545085907, + "learning_rate": 3.673562554256171e-05, + "loss": 1.7085, + "step": 3303 + }, + { + "epoch": 0.6000862714827343, + "grad_norm": 0.3781942129135132, + "learning_rate": 3.670711668769558e-05, + "loss": 1.7569, + "step": 3304 + }, + { + "epoch": 0.600267895656912, + "grad_norm": 0.5930119752883911, + "learning_rate": 3.66786124822924e-05, + "loss": 1.6009, + "step": 3305 + }, + { + "epoch": 0.6004495198310895, + "grad_norm": 0.32191282510757446, + "learning_rate": 3.6650112936322115e-05, + "loss": 1.8651, + "step": 3306 + }, + { + "epoch": 0.6006311440052671, + "grad_norm": 0.37134599685668945, + "learning_rate": 3.662161805975305e-05, + "loss": 1.7665, + "step": 3307 + }, + { + "epoch": 0.6008127681794447, + "grad_norm": 0.3092164993286133, + "learning_rate": 3.659312786255188e-05, + "loss": 1.564, + "step": 3308 + }, + { + "epoch": 0.6009943923536223, + "grad_norm": 0.3440742790699005, + "learning_rate": 3.656464235468364e-05, + "loss": 1.6019, + "step": 3309 + }, + { + "epoch": 0.6011760165277998, + "grad_norm": 0.4397311210632324, + "learning_rate": 3.653616154611171e-05, + "loss": 1.7948, + "step": 3310 + }, + { + "epoch": 0.6013576407019774, + "grad_norm": 0.30041366815567017, + "learning_rate": 3.650768544679788e-05, + "loss": 1.8247, + "step": 3311 + }, + { + "epoch": 0.601539264876155, + "grad_norm": 0.47520899772644043, + "learning_rate": 3.647921406670225e-05, + "loss": 1.7476, + "step": 3312 + }, + { + "epoch": 0.6017208890503326, + "grad_norm": 0.6460574865341187, + "learning_rate": 3.645074741578326e-05, + "loss": 1.7806, + "step": 3313 + }, + { + "epoch": 0.6019025132245102, + "grad_norm": 0.41146448254585266, + "learning_rate": 3.642228550399775e-05, + "loss": 1.5104, + "step": 3314 + }, + { + "epoch": 0.6020841373986877, + "grad_norm": 0.6904112696647644, + "learning_rate": 3.6393828341300807e-05, + "loss": 1.6852, + "step": 3315 + }, + { + "epoch": 0.6022657615728654, + "grad_norm": 0.4753240942955017, + "learning_rate": 3.6365375937645985e-05, + "loss": 1.8335, + "step": 3316 + }, + { + "epoch": 0.6024473857470429, + "grad_norm": 0.5325042009353638, + "learning_rate": 3.6336928302985065e-05, + "loss": 1.766, + "step": 3317 + }, + { + "epoch": 0.6026290099212205, + "grad_norm": 0.4925849139690399, + "learning_rate": 3.630848544726821e-05, + "loss": 1.8549, + "step": 3318 + }, + { + "epoch": 0.6028106340953981, + "grad_norm": 0.36259379982948303, + "learning_rate": 3.628004738044389e-05, + "loss": 1.5812, + "step": 3319 + }, + { + "epoch": 0.6029922582695757, + "grad_norm": 0.4266106188297272, + "learning_rate": 3.625161411245893e-05, + "loss": 1.6165, + "step": 3320 + }, + { + "epoch": 0.6031738824437533, + "grad_norm": 0.4788222014904022, + "learning_rate": 3.622318565325847e-05, + "loss": 1.5864, + "step": 3321 + }, + { + "epoch": 0.6033555066179308, + "grad_norm": 0.38492777943611145, + "learning_rate": 3.619476201278592e-05, + "loss": 1.8065, + "step": 3322 + }, + { + "epoch": 0.6035371307921085, + "grad_norm": 0.37821248173713684, + "learning_rate": 3.6166343200983047e-05, + "loss": 1.7972, + "step": 3323 + }, + { + "epoch": 0.603718754966286, + "grad_norm": 0.3498292565345764, + "learning_rate": 3.6137929227789946e-05, + "loss": 1.5662, + "step": 3324 + }, + { + "epoch": 0.6039003791404636, + "grad_norm": 0.6213298439979553, + "learning_rate": 3.610952010314499e-05, + "loss": 1.5576, + "step": 3325 + }, + { + "epoch": 0.6040820033146411, + "grad_norm": 0.41625961661338806, + "learning_rate": 3.608111583698484e-05, + "loss": 1.5004, + "step": 3326 + }, + { + "epoch": 0.6042636274888188, + "grad_norm": 0.5363500714302063, + "learning_rate": 3.605271643924451e-05, + "loss": 1.8978, + "step": 3327 + }, + { + "epoch": 0.6044452516629963, + "grad_norm": 1.378745675086975, + "learning_rate": 3.6024321919857246e-05, + "loss": 1.8581, + "step": 3328 + }, + { + "epoch": 0.6046268758371739, + "grad_norm": 0.6033296585083008, + "learning_rate": 3.599593228875465e-05, + "loss": 1.8218, + "step": 3329 + }, + { + "epoch": 0.6048085000113516, + "grad_norm": 0.34095895290374756, + "learning_rate": 3.59675475558666e-05, + "loss": 1.716, + "step": 3330 + }, + { + "epoch": 0.6049901241855291, + "grad_norm": 0.3666979670524597, + "learning_rate": 3.593916773112122e-05, + "loss": 1.9557, + "step": 3331 + }, + { + "epoch": 0.6051717483597067, + "grad_norm": 1.5356837511062622, + "learning_rate": 3.5910792824444937e-05, + "loss": 1.7718, + "step": 3332 + }, + { + "epoch": 0.6053533725338842, + "grad_norm": 0.4250616431236267, + "learning_rate": 3.5882422845762493e-05, + "loss": 1.7059, + "step": 3333 + }, + { + "epoch": 0.6055349967080619, + "grad_norm": 0.40631404519081116, + "learning_rate": 3.585405780499688e-05, + "loss": 1.8473, + "step": 3334 + }, + { + "epoch": 0.6057166208822394, + "grad_norm": 0.36123213171958923, + "learning_rate": 3.5825697712069336e-05, + "loss": 1.6753, + "step": 3335 + }, + { + "epoch": 0.605898245056417, + "grad_norm": 0.3753015100955963, + "learning_rate": 3.579734257689943e-05, + "loss": 1.7491, + "step": 3336 + }, + { + "epoch": 0.6060798692305945, + "grad_norm": 0.43641865253448486, + "learning_rate": 3.576899240940491e-05, + "loss": 1.5488, + "step": 3337 + }, + { + "epoch": 0.6062614934047722, + "grad_norm": 1.0176668167114258, + "learning_rate": 3.574064721950188e-05, + "loss": 1.9446, + "step": 3338 + }, + { + "epoch": 0.6064431175789498, + "grad_norm": 0.34702152013778687, + "learning_rate": 3.5712307017104664e-05, + "loss": 1.7745, + "step": 3339 + }, + { + "epoch": 0.6066247417531273, + "grad_norm": 0.3846658766269684, + "learning_rate": 3.5683971812125825e-05, + "loss": 1.6778, + "step": 3340 + }, + { + "epoch": 0.606806365927305, + "grad_norm": 0.3291984498500824, + "learning_rate": 3.565564161447617e-05, + "loss": 1.6974, + "step": 3341 + }, + { + "epoch": 0.6069879901014825, + "grad_norm": 0.6117645502090454, + "learning_rate": 3.5627316434064806e-05, + "loss": 1.7711, + "step": 3342 + }, + { + "epoch": 0.6071696142756601, + "grad_norm": 0.37497034668922424, + "learning_rate": 3.559899628079906e-05, + "loss": 1.8196, + "step": 3343 + }, + { + "epoch": 0.6073512384498376, + "grad_norm": 0.33548957109451294, + "learning_rate": 3.557068116458446e-05, + "loss": 1.8151, + "step": 3344 + }, + { + "epoch": 0.6075328626240153, + "grad_norm": 0.4898432195186615, + "learning_rate": 3.554237109532483e-05, + "loss": 1.5718, + "step": 3345 + }, + { + "epoch": 0.6077144867981928, + "grad_norm": 0.4317809045314789, + "learning_rate": 3.551406608292223e-05, + "loss": 1.7057, + "step": 3346 + }, + { + "epoch": 0.6078961109723704, + "grad_norm": 1.2717902660369873, + "learning_rate": 3.5485766137276894e-05, + "loss": 1.5514, + "step": 3347 + }, + { + "epoch": 0.6080777351465481, + "grad_norm": 0.3416529595851898, + "learning_rate": 3.545747126828732e-05, + "loss": 1.6997, + "step": 3348 + }, + { + "epoch": 0.6082593593207256, + "grad_norm": 0.9565325975418091, + "learning_rate": 3.542918148585025e-05, + "loss": 1.78, + "step": 3349 + }, + { + "epoch": 0.6084409834949032, + "grad_norm": 0.4282439053058624, + "learning_rate": 3.540089679986058e-05, + "loss": 1.6308, + "step": 3350 + }, + { + "epoch": 0.6086226076690807, + "grad_norm": 0.32846757769584656, + "learning_rate": 3.5372617220211525e-05, + "loss": 1.7775, + "step": 3351 + }, + { + "epoch": 0.6088042318432584, + "grad_norm": 0.39696604013442993, + "learning_rate": 3.5344342756794436e-05, + "loss": 1.7651, + "step": 3352 + }, + { + "epoch": 0.6089858560174359, + "grad_norm": 1.0639774799346924, + "learning_rate": 3.5316073419498886e-05, + "loss": 1.8482, + "step": 3353 + }, + { + "epoch": 0.6091674801916135, + "grad_norm": 0.5177795886993408, + "learning_rate": 3.528780921821265e-05, + "loss": 1.6951, + "step": 3354 + }, + { + "epoch": 0.609349104365791, + "grad_norm": 0.2884437143802643, + "learning_rate": 3.525955016282177e-05, + "loss": 1.7374, + "step": 3355 + }, + { + "epoch": 0.6095307285399687, + "grad_norm": 2.0127780437469482, + "learning_rate": 3.523129626321041e-05, + "loss": 1.7393, + "step": 3356 + }, + { + "epoch": 0.6097123527141463, + "grad_norm": 0.8145685791969299, + "learning_rate": 3.520304752926095e-05, + "loss": 1.5901, + "step": 3357 + }, + { + "epoch": 0.6098939768883238, + "grad_norm": 0.48498061299324036, + "learning_rate": 3.5174803970853974e-05, + "loss": 1.6577, + "step": 3358 + }, + { + "epoch": 0.6100756010625015, + "grad_norm": 0.4216499328613281, + "learning_rate": 3.514656559786829e-05, + "loss": 1.6178, + "step": 3359 + }, + { + "epoch": 0.610257225236679, + "grad_norm": 0.441108375787735, + "learning_rate": 3.5118332420180824e-05, + "loss": 1.8482, + "step": 3360 + }, + { + "epoch": 0.6104388494108566, + "grad_norm": 0.3214413523674011, + "learning_rate": 3.509010444766674e-05, + "loss": 1.8203, + "step": 3361 + }, + { + "epoch": 0.6106204735850341, + "grad_norm": 0.4354317784309387, + "learning_rate": 3.506188169019933e-05, + "loss": 1.7853, + "step": 3362 + }, + { + "epoch": 0.6108020977592118, + "grad_norm": 0.9063714146614075, + "learning_rate": 3.50336641576501e-05, + "loss": 1.8154, + "step": 3363 + }, + { + "epoch": 0.6109837219333893, + "grad_norm": 0.3250770568847656, + "learning_rate": 3.5005451859888754e-05, + "loss": 1.5717, + "step": 3364 + }, + { + "epoch": 0.6111653461075669, + "grad_norm": 0.4407554864883423, + "learning_rate": 3.49772448067831e-05, + "loss": 1.764, + "step": 3365 + }, + { + "epoch": 0.6113469702817445, + "grad_norm": 0.5454978942871094, + "learning_rate": 3.494904300819915e-05, + "loss": 1.7413, + "step": 3366 + }, + { + "epoch": 0.6115285944559221, + "grad_norm": 0.35305073857307434, + "learning_rate": 3.492084647400106e-05, + "loss": 1.7021, + "step": 3367 + }, + { + "epoch": 0.6117102186300997, + "grad_norm": 0.37028181552886963, + "learning_rate": 3.489265521405117e-05, + "loss": 1.7079, + "step": 3368 + }, + { + "epoch": 0.6118918428042772, + "grad_norm": 1.053606629371643, + "learning_rate": 3.486446923820996e-05, + "loss": 1.8912, + "step": 3369 + }, + { + "epoch": 0.6120734669784549, + "grad_norm": 0.4372890889644623, + "learning_rate": 3.483628855633606e-05, + "loss": 1.8199, + "step": 3370 + }, + { + "epoch": 0.6122550911526324, + "grad_norm": 0.3291037976741791, + "learning_rate": 3.480811317828625e-05, + "loss": 1.7006, + "step": 3371 + }, + { + "epoch": 0.61243671532681, + "grad_norm": 0.5920373201370239, + "learning_rate": 3.477994311391544e-05, + "loss": 1.8037, + "step": 3372 + }, + { + "epoch": 0.6126183395009875, + "grad_norm": 1.6128413677215576, + "learning_rate": 3.475177837307671e-05, + "loss": 1.9099, + "step": 3373 + }, + { + "epoch": 0.6127999636751652, + "grad_norm": 0.39965078234672546, + "learning_rate": 3.47236189656213e-05, + "loss": 1.8221, + "step": 3374 + }, + { + "epoch": 0.6129815878493428, + "grad_norm": 0.36222633719444275, + "learning_rate": 3.469546490139849e-05, + "loss": 1.6965, + "step": 3375 + }, + { + "epoch": 0.6131632120235203, + "grad_norm": 0.35010984539985657, + "learning_rate": 3.4667316190255766e-05, + "loss": 1.5717, + "step": 3376 + }, + { + "epoch": 0.6133448361976979, + "grad_norm": 0.4812450408935547, + "learning_rate": 3.463917284203876e-05, + "loss": 1.7827, + "step": 3377 + }, + { + "epoch": 0.6135264603718755, + "grad_norm": 0.5444563627243042, + "learning_rate": 3.4611034866591166e-05, + "loss": 1.755, + "step": 3378 + }, + { + "epoch": 0.6137080845460531, + "grad_norm": 0.4972471594810486, + "learning_rate": 3.4582902273754844e-05, + "loss": 1.6554, + "step": 3379 + }, + { + "epoch": 0.6138897087202306, + "grad_norm": 0.9881906509399414, + "learning_rate": 3.455477507336972e-05, + "loss": 1.7724, + "step": 3380 + }, + { + "epoch": 0.6140713328944083, + "grad_norm": 0.9628702402114868, + "learning_rate": 3.452665327527391e-05, + "loss": 1.9018, + "step": 3381 + }, + { + "epoch": 0.6142529570685858, + "grad_norm": 1.1831761598587036, + "learning_rate": 3.449853688930358e-05, + "loss": 1.831, + "step": 3382 + }, + { + "epoch": 0.6144345812427634, + "grad_norm": 0.5470401644706726, + "learning_rate": 3.447042592529303e-05, + "loss": 1.7168, + "step": 3383 + }, + { + "epoch": 0.614616205416941, + "grad_norm": 0.520492672920227, + "learning_rate": 3.444232039307463e-05, + "loss": 1.7445, + "step": 3384 + }, + { + "epoch": 0.6147978295911186, + "grad_norm": 0.4397439956665039, + "learning_rate": 3.4414220302478896e-05, + "loss": 1.6292, + "step": 3385 + }, + { + "epoch": 0.6149794537652962, + "grad_norm": 0.3522368371486664, + "learning_rate": 3.438612566333443e-05, + "loss": 1.6056, + "step": 3386 + }, + { + "epoch": 0.6151610779394737, + "grad_norm": 0.5144216418266296, + "learning_rate": 3.435803648546791e-05, + "loss": 1.9944, + "step": 3387 + }, + { + "epoch": 0.6153427021136513, + "grad_norm": 0.6787732839584351, + "learning_rate": 3.43299527787041e-05, + "loss": 1.6931, + "step": 3388 + }, + { + "epoch": 0.6155243262878289, + "grad_norm": 0.5286373496055603, + "learning_rate": 3.430187455286586e-05, + "loss": 1.811, + "step": 3389 + }, + { + "epoch": 0.6157059504620065, + "grad_norm": 1.8390388488769531, + "learning_rate": 3.4273801817774166e-05, + "loss": 1.6736, + "step": 3390 + }, + { + "epoch": 0.615887574636184, + "grad_norm": 0.431353896856308, + "learning_rate": 3.4245734583248e-05, + "loss": 1.7088, + "step": 3391 + }, + { + "epoch": 0.6160691988103617, + "grad_norm": 0.5567449331283569, + "learning_rate": 3.42176728591045e-05, + "loss": 1.7783, + "step": 3392 + }, + { + "epoch": 0.6162508229845393, + "grad_norm": 0.4123658835887909, + "learning_rate": 3.41896166551588e-05, + "loss": 1.6714, + "step": 3393 + }, + { + "epoch": 0.6164324471587168, + "grad_norm": 0.3482390344142914, + "learning_rate": 3.4161565981224175e-05, + "loss": 1.6397, + "step": 3394 + }, + { + "epoch": 0.6166140713328944, + "grad_norm": 0.38009965419769287, + "learning_rate": 3.4133520847111934e-05, + "loss": 1.7907, + "step": 3395 + }, + { + "epoch": 0.616795695507072, + "grad_norm": 0.3605816662311554, + "learning_rate": 3.4105481262631424e-05, + "loss": 1.7523, + "step": 3396 + }, + { + "epoch": 0.6169773196812496, + "grad_norm": 0.4533000886440277, + "learning_rate": 3.4077447237590074e-05, + "loss": 1.8567, + "step": 3397 + }, + { + "epoch": 0.6171589438554271, + "grad_norm": 0.36026808619499207, + "learning_rate": 3.404941878179338e-05, + "loss": 1.6155, + "step": 3398 + }, + { + "epoch": 0.6173405680296047, + "grad_norm": 0.48293793201446533, + "learning_rate": 3.40213959050449e-05, + "loss": 1.6219, + "step": 3399 + }, + { + "epoch": 0.6175221922037823, + "grad_norm": 0.38771313428878784, + "learning_rate": 3.3993378617146164e-05, + "loss": 1.827, + "step": 3400 + }, + { + "epoch": 0.6177038163779599, + "grad_norm": 0.3759533762931824, + "learning_rate": 3.3965366927896864e-05, + "loss": 1.7182, + "step": 3401 + }, + { + "epoch": 0.6178854405521375, + "grad_norm": 0.5564964413642883, + "learning_rate": 3.393736084709461e-05, + "loss": 1.6646, + "step": 3402 + }, + { + "epoch": 0.6180670647263151, + "grad_norm": 0.3732925057411194, + "learning_rate": 3.3909360384535185e-05, + "loss": 1.7003, + "step": 3403 + }, + { + "epoch": 0.6182486889004927, + "grad_norm": 1.1266400814056396, + "learning_rate": 3.388136555001227e-05, + "loss": 1.881, + "step": 3404 + }, + { + "epoch": 0.6184303130746702, + "grad_norm": 0.9194115400314331, + "learning_rate": 3.3853376353317674e-05, + "loss": 1.8462, + "step": 3405 + }, + { + "epoch": 0.6186119372488478, + "grad_norm": 1.0411486625671387, + "learning_rate": 3.3825392804241176e-05, + "loss": 1.7436, + "step": 3406 + }, + { + "epoch": 0.6187935614230254, + "grad_norm": 0.36940082907676697, + "learning_rate": 3.379741491257064e-05, + "loss": 1.7383, + "step": 3407 + }, + { + "epoch": 0.618975185597203, + "grad_norm": 0.3950757682323456, + "learning_rate": 3.37694426880919e-05, + "loss": 1.8713, + "step": 3408 + }, + { + "epoch": 0.6191568097713805, + "grad_norm": 1.5666948556900024, + "learning_rate": 3.3741476140588824e-05, + "loss": 2.0853, + "step": 3409 + }, + { + "epoch": 0.6193384339455582, + "grad_norm": 0.9109283685684204, + "learning_rate": 3.37135152798433e-05, + "loss": 1.9575, + "step": 3410 + }, + { + "epoch": 0.6195200581197358, + "grad_norm": 0.5373722314834595, + "learning_rate": 3.3685560115635195e-05, + "loss": 1.8618, + "step": 3411 + }, + { + "epoch": 0.6197016822939133, + "grad_norm": 0.3790779411792755, + "learning_rate": 3.365761065774246e-05, + "loss": 1.5579, + "step": 3412 + }, + { + "epoch": 0.6198833064680909, + "grad_norm": 0.3736322522163391, + "learning_rate": 3.362966691594096e-05, + "loss": 1.6572, + "step": 3413 + }, + { + "epoch": 0.6200649306422685, + "grad_norm": 0.34655502438545227, + "learning_rate": 3.360172890000462e-05, + "loss": 1.649, + "step": 3414 + }, + { + "epoch": 0.6202465548164461, + "grad_norm": 0.5891299843788147, + "learning_rate": 3.357379661970532e-05, + "loss": 1.8347, + "step": 3415 + }, + { + "epoch": 0.6204281789906236, + "grad_norm": 0.32791924476623535, + "learning_rate": 3.354587008481298e-05, + "loss": 1.6307, + "step": 3416 + }, + { + "epoch": 0.6206098031648012, + "grad_norm": 0.32938411831855774, + "learning_rate": 3.3517949305095495e-05, + "loss": 1.5815, + "step": 3417 + }, + { + "epoch": 0.6207914273389789, + "grad_norm": 0.484225869178772, + "learning_rate": 3.349003429031873e-05, + "loss": 1.6866, + "step": 3418 + }, + { + "epoch": 0.6209730515131564, + "grad_norm": 0.3753473460674286, + "learning_rate": 3.346212505024653e-05, + "loss": 1.7252, + "step": 3419 + }, + { + "epoch": 0.621154675687334, + "grad_norm": 1.0095387697219849, + "learning_rate": 3.343422159464073e-05, + "loss": 1.5943, + "step": 3420 + }, + { + "epoch": 0.6213362998615116, + "grad_norm": 1.5640485286712646, + "learning_rate": 3.340632393326118e-05, + "loss": 1.857, + "step": 3421 + }, + { + "epoch": 0.6215179240356892, + "grad_norm": 0.35337033867836, + "learning_rate": 3.337843207586564e-05, + "loss": 1.873, + "step": 3422 + }, + { + "epoch": 0.6216995482098667, + "grad_norm": 0.3772355914115906, + "learning_rate": 3.3350546032209876e-05, + "loss": 1.8285, + "step": 3423 + }, + { + "epoch": 0.6218811723840443, + "grad_norm": 0.8368522524833679, + "learning_rate": 3.3322665812047596e-05, + "loss": 1.8419, + "step": 3424 + }, + { + "epoch": 0.6220627965582219, + "grad_norm": 0.3277459442615509, + "learning_rate": 3.329479142513051e-05, + "loss": 1.6946, + "step": 3425 + }, + { + "epoch": 0.6222444207323995, + "grad_norm": 1.2708301544189453, + "learning_rate": 3.326692288120827e-05, + "loss": 1.886, + "step": 3426 + }, + { + "epoch": 0.622426044906577, + "grad_norm": 0.3888011574745178, + "learning_rate": 3.3239060190028476e-05, + "loss": 1.9134, + "step": 3427 + }, + { + "epoch": 0.6226076690807546, + "grad_norm": 0.3452543616294861, + "learning_rate": 3.321120336133666e-05, + "loss": 1.8456, + "step": 3428 + }, + { + "epoch": 0.6227892932549323, + "grad_norm": 0.9706088900566101, + "learning_rate": 3.318335240487634e-05, + "loss": 1.8497, + "step": 3429 + }, + { + "epoch": 0.6229709174291098, + "grad_norm": 0.7653040885925293, + "learning_rate": 3.3155507330389e-05, + "loss": 1.8759, + "step": 3430 + }, + { + "epoch": 0.6231525416032874, + "grad_norm": 0.3317773640155792, + "learning_rate": 3.3127668147614e-05, + "loss": 1.747, + "step": 3431 + }, + { + "epoch": 0.623334165777465, + "grad_norm": 0.3604030907154083, + "learning_rate": 3.3099834866288694e-05, + "loss": 1.478, + "step": 3432 + }, + { + "epoch": 0.6235157899516426, + "grad_norm": 0.3628806173801422, + "learning_rate": 3.307200749614832e-05, + "loss": 1.8408, + "step": 3433 + }, + { + "epoch": 0.6236974141258201, + "grad_norm": 0.42315149307250977, + "learning_rate": 3.304418604692612e-05, + "loss": 1.6036, + "step": 3434 + }, + { + "epoch": 0.6238790382999977, + "grad_norm": 0.6001616716384888, + "learning_rate": 3.3016370528353215e-05, + "loss": 1.8223, + "step": 3435 + }, + { + "epoch": 0.6240606624741754, + "grad_norm": 0.37654611468315125, + "learning_rate": 3.2988560950158655e-05, + "loss": 1.7643, + "step": 3436 + }, + { + "epoch": 0.6242422866483529, + "grad_norm": 0.5442213416099548, + "learning_rate": 3.2960757322069405e-05, + "loss": 1.7134, + "step": 3437 + }, + { + "epoch": 0.6244239108225305, + "grad_norm": 0.33103787899017334, + "learning_rate": 3.293295965381038e-05, + "loss": 1.7047, + "step": 3438 + }, + { + "epoch": 0.624605534996708, + "grad_norm": 0.4884653687477112, + "learning_rate": 3.290516795510441e-05, + "loss": 1.8696, + "step": 3439 + }, + { + "epoch": 0.6247871591708857, + "grad_norm": 0.5534495115280151, + "learning_rate": 3.2877382235672195e-05, + "loss": 1.7113, + "step": 3440 + }, + { + "epoch": 0.6249687833450632, + "grad_norm": 0.5338122844696045, + "learning_rate": 3.284960250523237e-05, + "loss": 1.7189, + "step": 3441 + }, + { + "epoch": 0.6251504075192408, + "grad_norm": 0.31529319286346436, + "learning_rate": 3.28218287735015e-05, + "loss": 1.6949, + "step": 3442 + }, + { + "epoch": 0.6253320316934184, + "grad_norm": 0.34978532791137695, + "learning_rate": 3.2794061050194005e-05, + "loss": 1.5999, + "step": 3443 + }, + { + "epoch": 0.625513655867596, + "grad_norm": 1.4083174467086792, + "learning_rate": 3.2766299345022224e-05, + "loss": 1.7598, + "step": 3444 + }, + { + "epoch": 0.6256952800417735, + "grad_norm": 0.37362316250801086, + "learning_rate": 3.273854366769641e-05, + "loss": 1.5897, + "step": 3445 + }, + { + "epoch": 0.6258769042159511, + "grad_norm": 0.39247623085975647, + "learning_rate": 3.271079402792465e-05, + "loss": 1.7323, + "step": 3446 + }, + { + "epoch": 0.6260585283901288, + "grad_norm": 0.37939178943634033, + "learning_rate": 3.2683050435413e-05, + "loss": 1.8313, + "step": 3447 + }, + { + "epoch": 0.6262401525643063, + "grad_norm": 0.3929939270019531, + "learning_rate": 3.265531289986535e-05, + "loss": 1.7077, + "step": 3448 + }, + { + "epoch": 0.6264217767384839, + "grad_norm": 0.45553454756736755, + "learning_rate": 3.2627581430983476e-05, + "loss": 1.8046, + "step": 3449 + }, + { + "epoch": 0.6266034009126614, + "grad_norm": 0.267913281917572, + "learning_rate": 3.2599856038467025e-05, + "loss": 1.9338, + "step": 3450 + }, + { + "epoch": 0.6267850250868391, + "grad_norm": 0.3601245582103729, + "learning_rate": 3.2572136732013555e-05, + "loss": 1.9162, + "step": 3451 + }, + { + "epoch": 0.6269666492610166, + "grad_norm": 0.4046638607978821, + "learning_rate": 3.254442352131847e-05, + "loss": 1.7821, + "step": 3452 + }, + { + "epoch": 0.6271482734351942, + "grad_norm": 0.37537309527397156, + "learning_rate": 3.251671641607502e-05, + "loss": 1.6363, + "step": 3453 + }, + { + "epoch": 0.6273298976093719, + "grad_norm": 0.47147199511528015, + "learning_rate": 3.248901542597437e-05, + "loss": 1.9336, + "step": 3454 + }, + { + "epoch": 0.6275115217835494, + "grad_norm": 0.3221721053123474, + "learning_rate": 3.2461320560705476e-05, + "loss": 1.7271, + "step": 3455 + }, + { + "epoch": 0.627693145957727, + "grad_norm": 0.4282022714614868, + "learning_rate": 3.243363182995524e-05, + "loss": 1.7464, + "step": 3456 + }, + { + "epoch": 0.6278747701319045, + "grad_norm": 0.4354749917984009, + "learning_rate": 3.240594924340835e-05, + "loss": 1.6202, + "step": 3457 + }, + { + "epoch": 0.6280563943060822, + "grad_norm": 0.42293816804885864, + "learning_rate": 3.237827281074738e-05, + "loss": 1.7433, + "step": 3458 + }, + { + "epoch": 0.6282380184802597, + "grad_norm": 0.4070383906364441, + "learning_rate": 3.235060254165272e-05, + "loss": 1.7696, + "step": 3459 + }, + { + "epoch": 0.6284196426544373, + "grad_norm": 0.59364914894104, + "learning_rate": 3.232293844580263e-05, + "loss": 1.7964, + "step": 3460 + }, + { + "epoch": 0.6286012668286148, + "grad_norm": 0.30130040645599365, + "learning_rate": 3.2295280532873226e-05, + "loss": 1.7656, + "step": 3461 + }, + { + "epoch": 0.6287828910027925, + "grad_norm": 0.44314417243003845, + "learning_rate": 3.226762881253841e-05, + "loss": 1.6771, + "step": 3462 + }, + { + "epoch": 0.62896451517697, + "grad_norm": 0.4816541075706482, + "learning_rate": 3.223998329446996e-05, + "loss": 1.6898, + "step": 3463 + }, + { + "epoch": 0.6291461393511476, + "grad_norm": 0.856191873550415, + "learning_rate": 3.221234398833747e-05, + "loss": 1.7014, + "step": 3464 + }, + { + "epoch": 0.6293277635253253, + "grad_norm": 0.5019277930259705, + "learning_rate": 3.218471090380837e-05, + "loss": 1.7229, + "step": 3465 + }, + { + "epoch": 0.6295093876995028, + "grad_norm": 0.5622115135192871, + "learning_rate": 3.215708405054791e-05, + "loss": 1.7772, + "step": 3466 + }, + { + "epoch": 0.6296910118736804, + "grad_norm": 1.0550681352615356, + "learning_rate": 3.2129463438219146e-05, + "loss": 1.948, + "step": 3467 + }, + { + "epoch": 0.6298726360478579, + "grad_norm": 0.3776564598083496, + "learning_rate": 3.210184907648295e-05, + "loss": 1.7534, + "step": 3468 + }, + { + "epoch": 0.6300542602220356, + "grad_norm": 0.33458855748176575, + "learning_rate": 3.207424097499805e-05, + "loss": 1.7337, + "step": 3469 + }, + { + "epoch": 0.6302358843962131, + "grad_norm": 0.3147968053817749, + "learning_rate": 3.204663914342094e-05, + "loss": 1.6687, + "step": 3470 + }, + { + "epoch": 0.6304175085703907, + "grad_norm": 0.35003361105918884, + "learning_rate": 3.2019043591405936e-05, + "loss": 1.8125, + "step": 3471 + }, + { + "epoch": 0.6305991327445682, + "grad_norm": 0.374064564704895, + "learning_rate": 3.199145432860515e-05, + "loss": 1.7109, + "step": 3472 + }, + { + "epoch": 0.6307807569187459, + "grad_norm": 0.32393091917037964, + "learning_rate": 3.196387136466853e-05, + "loss": 1.4498, + "step": 3473 + }, + { + "epoch": 0.6309623810929235, + "grad_norm": 0.8735793828964233, + "learning_rate": 3.193629470924377e-05, + "loss": 1.7784, + "step": 3474 + }, + { + "epoch": 0.631144005267101, + "grad_norm": 0.42786839604377747, + "learning_rate": 3.1908724371976376e-05, + "loss": 1.789, + "step": 3475 + }, + { + "epoch": 0.6313256294412787, + "grad_norm": 0.3395615518093109, + "learning_rate": 3.1881160362509643e-05, + "loss": 1.6862, + "step": 3476 + }, + { + "epoch": 0.6315072536154562, + "grad_norm": 0.39592552185058594, + "learning_rate": 3.185360269048469e-05, + "loss": 1.7542, + "step": 3477 + }, + { + "epoch": 0.6316888777896338, + "grad_norm": 0.34075668454170227, + "learning_rate": 3.182605136554036e-05, + "loss": 1.7361, + "step": 3478 + }, + { + "epoch": 0.6318705019638113, + "grad_norm": 0.35441136360168457, + "learning_rate": 3.179850639731331e-05, + "loss": 1.7793, + "step": 3479 + }, + { + "epoch": 0.632052126137989, + "grad_norm": 0.4088059067726135, + "learning_rate": 3.177096779543797e-05, + "loss": 1.5843, + "step": 3480 + }, + { + "epoch": 0.6322337503121666, + "grad_norm": 0.5560126900672913, + "learning_rate": 3.174343556954652e-05, + "loss": 1.5757, + "step": 3481 + }, + { + "epoch": 0.6324153744863441, + "grad_norm": 0.33600232005119324, + "learning_rate": 3.1715909729268964e-05, + "loss": 1.9928, + "step": 3482 + }, + { + "epoch": 0.6325969986605218, + "grad_norm": 0.7601543664932251, + "learning_rate": 3.1688390284233024e-05, + "loss": 1.8777, + "step": 3483 + }, + { + "epoch": 0.6327786228346993, + "grad_norm": 0.4251508116722107, + "learning_rate": 3.166087724406419e-05, + "loss": 1.6407, + "step": 3484 + }, + { + "epoch": 0.6329602470088769, + "grad_norm": 0.46276700496673584, + "learning_rate": 3.163337061838571e-05, + "loss": 1.6855, + "step": 3485 + }, + { + "epoch": 0.6331418711830544, + "grad_norm": 0.3864264488220215, + "learning_rate": 3.160587041681864e-05, + "loss": 1.5572, + "step": 3486 + }, + { + "epoch": 0.6333234953572321, + "grad_norm": 0.33785271644592285, + "learning_rate": 3.157837664898172e-05, + "loss": 1.7919, + "step": 3487 + }, + { + "epoch": 0.6335051195314096, + "grad_norm": 0.3367176949977875, + "learning_rate": 3.155088932449147e-05, + "loss": 1.7087, + "step": 3488 + }, + { + "epoch": 0.6336867437055872, + "grad_norm": 0.5323870778083801, + "learning_rate": 3.152340845296216e-05, + "loss": 1.678, + "step": 3489 + }, + { + "epoch": 0.6338683678797647, + "grad_norm": 0.389884889125824, + "learning_rate": 3.149593404400579e-05, + "loss": 1.6912, + "step": 3490 + }, + { + "epoch": 0.6340499920539424, + "grad_norm": 0.3234858810901642, + "learning_rate": 3.146846610723212e-05, + "loss": 1.6491, + "step": 3491 + }, + { + "epoch": 0.63423161622812, + "grad_norm": 0.48343726992607117, + "learning_rate": 3.144100465224863e-05, + "loss": 1.8398, + "step": 3492 + }, + { + "epoch": 0.6344132404022975, + "grad_norm": 0.36413347721099854, + "learning_rate": 3.141354968866053e-05, + "loss": 1.808, + "step": 3493 + }, + { + "epoch": 0.6345948645764752, + "grad_norm": 0.4677601158618927, + "learning_rate": 3.1386101226070746e-05, + "loss": 1.7064, + "step": 3494 + }, + { + "epoch": 0.6347764887506527, + "grad_norm": 0.35651862621307373, + "learning_rate": 3.135865927408e-05, + "loss": 1.8482, + "step": 3495 + }, + { + "epoch": 0.6349581129248303, + "grad_norm": 0.4648207128047943, + "learning_rate": 3.133122384228665e-05, + "loss": 1.911, + "step": 3496 + }, + { + "epoch": 0.6351397370990078, + "grad_norm": 0.32545334100723267, + "learning_rate": 3.130379494028682e-05, + "loss": 1.8677, + "step": 3497 + }, + { + "epoch": 0.6353213612731855, + "grad_norm": 0.5703951716423035, + "learning_rate": 3.127637257767432e-05, + "loss": 1.8178, + "step": 3498 + }, + { + "epoch": 0.6355029854473631, + "grad_norm": 0.5233213305473328, + "learning_rate": 3.124895676404074e-05, + "loss": 1.8629, + "step": 3499 + }, + { + "epoch": 0.6356846096215406, + "grad_norm": 0.3390849530696869, + "learning_rate": 3.122154750897528e-05, + "loss": 1.7577, + "step": 3500 + }, + { + "epoch": 0.6358662337957182, + "grad_norm": 1.264968991279602, + "learning_rate": 3.1194144822064944e-05, + "loss": 1.9033, + "step": 3501 + }, + { + "epoch": 0.6360478579698958, + "grad_norm": 0.49303385615348816, + "learning_rate": 3.1166748712894356e-05, + "loss": 1.7944, + "step": 3502 + }, + { + "epoch": 0.6362294821440734, + "grad_norm": 0.4814928472042084, + "learning_rate": 3.113935919104588e-05, + "loss": 1.7051, + "step": 3503 + }, + { + "epoch": 0.6364111063182509, + "grad_norm": 0.4097898006439209, + "learning_rate": 3.1111976266099606e-05, + "loss": 1.7083, + "step": 3504 + }, + { + "epoch": 0.6365927304924286, + "grad_norm": 0.33019545674324036, + "learning_rate": 3.1084599947633256e-05, + "loss": 1.7185, + "step": 3505 + }, + { + "epoch": 0.6367743546666061, + "grad_norm": 0.571073055267334, + "learning_rate": 3.105723024522226e-05, + "loss": 1.6363, + "step": 3506 + }, + { + "epoch": 0.6369559788407837, + "grad_norm": 0.3672334849834442, + "learning_rate": 3.102986716843974e-05, + "loss": 1.6929, + "step": 3507 + }, + { + "epoch": 0.6371376030149613, + "grad_norm": 0.3666480481624603, + "learning_rate": 3.100251072685655e-05, + "loss": 1.7767, + "step": 3508 + }, + { + "epoch": 0.6373192271891389, + "grad_norm": 0.34782376885414124, + "learning_rate": 3.097516093004111e-05, + "loss": 1.6146, + "step": 3509 + }, + { + "epoch": 0.6375008513633165, + "grad_norm": 0.45957791805267334, + "learning_rate": 3.094781778755964e-05, + "loss": 1.9637, + "step": 3510 + }, + { + "epoch": 0.637682475537494, + "grad_norm": 0.49048423767089844, + "learning_rate": 3.0920481308975926e-05, + "loss": 1.7302, + "step": 3511 + }, + { + "epoch": 0.6378640997116716, + "grad_norm": 0.4185323119163513, + "learning_rate": 3.0893151503851494e-05, + "loss": 1.7911, + "step": 3512 + }, + { + "epoch": 0.6380457238858492, + "grad_norm": 0.513258159160614, + "learning_rate": 3.086582838174551e-05, + "loss": 1.9447, + "step": 3513 + }, + { + "epoch": 0.6382273480600268, + "grad_norm": 0.5047030448913574, + "learning_rate": 3.083851195221482e-05, + "loss": 1.7337, + "step": 3514 + }, + { + "epoch": 0.6384089722342043, + "grad_norm": 0.2850649058818817, + "learning_rate": 3.081120222481389e-05, + "loss": 1.6495, + "step": 3515 + }, + { + "epoch": 0.638590596408382, + "grad_norm": 0.4228608310222626, + "learning_rate": 3.0783899209094866e-05, + "loss": 1.6313, + "step": 3516 + }, + { + "epoch": 0.6387722205825596, + "grad_norm": 0.5512344837188721, + "learning_rate": 3.075660291460757e-05, + "loss": 1.6115, + "step": 3517 + }, + { + "epoch": 0.6389538447567371, + "grad_norm": 0.3985038697719574, + "learning_rate": 3.072931335089944e-05, + "loss": 1.7089, + "step": 3518 + }, + { + "epoch": 0.6391354689309147, + "grad_norm": 0.4051051735877991, + "learning_rate": 3.0702030527515566e-05, + "loss": 1.8352, + "step": 3519 + }, + { + "epoch": 0.6393170931050923, + "grad_norm": 0.3719717562198639, + "learning_rate": 3.067475445399867e-05, + "loss": 1.6405, + "step": 3520 + }, + { + "epoch": 0.6394987172792699, + "grad_norm": 0.32680124044418335, + "learning_rate": 3.0647485139889145e-05, + "loss": 1.6977, + "step": 3521 + }, + { + "epoch": 0.6396803414534474, + "grad_norm": 0.3427116572856903, + "learning_rate": 3.062022259472501e-05, + "loss": 1.7987, + "step": 3522 + }, + { + "epoch": 0.639861965627625, + "grad_norm": 0.29063987731933594, + "learning_rate": 3.0592966828041896e-05, + "loss": 1.5237, + "step": 3523 + }, + { + "epoch": 0.6400435898018026, + "grad_norm": 0.31771740317344666, + "learning_rate": 3.0565717849373066e-05, + "loss": 1.7192, + "step": 3524 + }, + { + "epoch": 0.6402252139759802, + "grad_norm": 1.3625543117523193, + "learning_rate": 3.053847566824943e-05, + "loss": 1.7709, + "step": 3525 + }, + { + "epoch": 0.6404068381501578, + "grad_norm": 0.31008008122444153, + "learning_rate": 3.0511240294199516e-05, + "loss": 1.6663, + "step": 3526 + }, + { + "epoch": 0.6405884623243354, + "grad_norm": 0.5661123991012573, + "learning_rate": 3.0484011736749452e-05, + "loss": 1.9441, + "step": 3527 + }, + { + "epoch": 0.640770086498513, + "grad_norm": 0.39413022994995117, + "learning_rate": 3.0456790005423e-05, + "loss": 1.7621, + "step": 3528 + }, + { + "epoch": 0.6409517106726905, + "grad_norm": 0.42092204093933105, + "learning_rate": 3.0429575109741503e-05, + "loss": 1.6243, + "step": 3529 + }, + { + "epoch": 0.6411333348468681, + "grad_norm": 1.2673561573028564, + "learning_rate": 3.040236705922399e-05, + "loss": 1.9178, + "step": 3530 + }, + { + "epoch": 0.6413149590210457, + "grad_norm": 0.4786038100719452, + "learning_rate": 3.037516586338699e-05, + "loss": 1.6566, + "step": 3531 + }, + { + "epoch": 0.6414965831952233, + "grad_norm": 0.35530900955200195, + "learning_rate": 3.0347971531744728e-05, + "loss": 1.7829, + "step": 3532 + }, + { + "epoch": 0.6416782073694008, + "grad_norm": 0.29047560691833496, + "learning_rate": 3.032078407380895e-05, + "loss": 1.7162, + "step": 3533 + }, + { + "epoch": 0.6418598315435784, + "grad_norm": 0.4913342297077179, + "learning_rate": 3.0293603499089064e-05, + "loss": 1.8024, + "step": 3534 + }, + { + "epoch": 0.6420414557177561, + "grad_norm": 0.4070332646369934, + "learning_rate": 3.0266429817092045e-05, + "loss": 1.7607, + "step": 3535 + }, + { + "epoch": 0.6422230798919336, + "grad_norm": 0.5638381242752075, + "learning_rate": 3.023926303732244e-05, + "loss": 1.8244, + "step": 3536 + }, + { + "epoch": 0.6424047040661112, + "grad_norm": 0.3682697117328644, + "learning_rate": 3.0212103169282414e-05, + "loss": 1.6768, + "step": 3537 + }, + { + "epoch": 0.6425863282402888, + "grad_norm": 0.36565983295440674, + "learning_rate": 3.018495022247165e-05, + "loss": 1.683, + "step": 3538 + }, + { + "epoch": 0.6427679524144664, + "grad_norm": 0.39643222093582153, + "learning_rate": 3.0157804206387528e-05, + "loss": 1.7747, + "step": 3539 + }, + { + "epoch": 0.6429495765886439, + "grad_norm": 1.11576509475708, + "learning_rate": 3.013066513052488e-05, + "loss": 1.7484, + "step": 3540 + }, + { + "epoch": 0.6431312007628215, + "grad_norm": 0.9141802191734314, + "learning_rate": 3.0103533004376183e-05, + "loss": 1.6152, + "step": 3541 + }, + { + "epoch": 0.6433128249369992, + "grad_norm": 0.35925644636154175, + "learning_rate": 3.0076407837431454e-05, + "loss": 1.6214, + "step": 3542 + }, + { + "epoch": 0.6434944491111767, + "grad_norm": 0.45830294489860535, + "learning_rate": 3.004928963917829e-05, + "loss": 1.7787, + "step": 3543 + }, + { + "epoch": 0.6436760732853543, + "grad_norm": 0.722534716129303, + "learning_rate": 3.002217841910186e-05, + "loss": 1.7929, + "step": 3544 + }, + { + "epoch": 0.6438576974595319, + "grad_norm": 0.5889890789985657, + "learning_rate": 2.9995074186684868e-05, + "loss": 1.7963, + "step": 3545 + }, + { + "epoch": 0.6440393216337095, + "grad_norm": 0.5406284928321838, + "learning_rate": 2.9967976951407555e-05, + "loss": 1.8899, + "step": 3546 + }, + { + "epoch": 0.644220945807887, + "grad_norm": 0.5256069898605347, + "learning_rate": 2.9940886722747784e-05, + "loss": 1.9822, + "step": 3547 + }, + { + "epoch": 0.6444025699820646, + "grad_norm": 0.37428081035614014, + "learning_rate": 2.9913803510180927e-05, + "loss": 1.6045, + "step": 3548 + }, + { + "epoch": 0.6445841941562422, + "grad_norm": 0.6238949298858643, + "learning_rate": 2.9886727323179875e-05, + "loss": 1.7419, + "step": 3549 + }, + { + "epoch": 0.6447658183304198, + "grad_norm": 0.7119675874710083, + "learning_rate": 2.985965817121512e-05, + "loss": 1.8466, + "step": 3550 + }, + { + "epoch": 0.6449474425045973, + "grad_norm": 0.5012040138244629, + "learning_rate": 2.9832596063754613e-05, + "loss": 1.8475, + "step": 3551 + }, + { + "epoch": 0.6451290666787749, + "grad_norm": 1.8932859897613525, + "learning_rate": 2.980554101026394e-05, + "loss": 1.719, + "step": 3552 + }, + { + "epoch": 0.6453106908529526, + "grad_norm": 0.6732601523399353, + "learning_rate": 2.9778493020206154e-05, + "loss": 1.8569, + "step": 3553 + }, + { + "epoch": 0.6454923150271301, + "grad_norm": 0.796323299407959, + "learning_rate": 2.9751452103041856e-05, + "loss": 1.5953, + "step": 3554 + }, + { + "epoch": 0.6456739392013077, + "grad_norm": 0.48024535179138184, + "learning_rate": 2.972441826822915e-05, + "loss": 1.5242, + "step": 3555 + }, + { + "epoch": 0.6458555633754853, + "grad_norm": 0.3972235918045044, + "learning_rate": 2.9697391525223694e-05, + "loss": 1.6589, + "step": 3556 + }, + { + "epoch": 0.6460371875496629, + "grad_norm": 0.34082531929016113, + "learning_rate": 2.9670371883478675e-05, + "loss": 1.7547, + "step": 3557 + }, + { + "epoch": 0.6462188117238404, + "grad_norm": 0.3848145604133606, + "learning_rate": 2.9643359352444754e-05, + "loss": 1.856, + "step": 3558 + }, + { + "epoch": 0.646400435898018, + "grad_norm": 0.3222949802875519, + "learning_rate": 2.961635394157012e-05, + "loss": 1.7928, + "step": 3559 + }, + { + "epoch": 0.6465820600721957, + "grad_norm": 0.44019556045532227, + "learning_rate": 2.95893556603005e-05, + "loss": 1.7785, + "step": 3560 + }, + { + "epoch": 0.6467636842463732, + "grad_norm": 0.41749146580696106, + "learning_rate": 2.9562364518079105e-05, + "loss": 1.6219, + "step": 3561 + }, + { + "epoch": 0.6469453084205508, + "grad_norm": 1.1551686525344849, + "learning_rate": 2.953538052434663e-05, + "loss": 1.826, + "step": 3562 + }, + { + "epoch": 0.6471269325947283, + "grad_norm": 0.5280522108078003, + "learning_rate": 2.9508403688541307e-05, + "loss": 1.6681, + "step": 3563 + }, + { + "epoch": 0.647308556768906, + "grad_norm": 0.46009716391563416, + "learning_rate": 2.948143402009882e-05, + "loss": 1.7527, + "step": 3564 + }, + { + "epoch": 0.6474901809430835, + "grad_norm": 0.3663176894187927, + "learning_rate": 2.94544715284524e-05, + "loss": 1.7333, + "step": 3565 + }, + { + "epoch": 0.6476718051172611, + "grad_norm": 0.31207865476608276, + "learning_rate": 2.9427516223032736e-05, + "loss": 1.5442, + "step": 3566 + }, + { + "epoch": 0.6478534292914387, + "grad_norm": 0.3871380388736725, + "learning_rate": 2.9400568113268e-05, + "loss": 1.9012, + "step": 3567 + }, + { + "epoch": 0.6480350534656163, + "grad_norm": 0.28625836968421936, + "learning_rate": 2.9373627208583852e-05, + "loss": 1.645, + "step": 3568 + }, + { + "epoch": 0.6482166776397938, + "grad_norm": 0.3514483869075775, + "learning_rate": 2.9346693518403458e-05, + "loss": 1.7764, + "step": 3569 + }, + { + "epoch": 0.6483983018139714, + "grad_norm": 0.4612262547016144, + "learning_rate": 2.9319767052147417e-05, + "loss": 1.6788, + "step": 3570 + }, + { + "epoch": 0.6485799259881491, + "grad_norm": 0.5171740651130676, + "learning_rate": 2.929284781923382e-05, + "loss": 1.9261, + "step": 3571 + }, + { + "epoch": 0.6487615501623266, + "grad_norm": 0.45161890983581543, + "learning_rate": 2.9265935829078227e-05, + "loss": 1.6946, + "step": 3572 + }, + { + "epoch": 0.6489431743365042, + "grad_norm": 0.4884694814682007, + "learning_rate": 2.9239031091093695e-05, + "loss": 1.848, + "step": 3573 + }, + { + "epoch": 0.6491247985106817, + "grad_norm": 1.0825523138046265, + "learning_rate": 2.9212133614690683e-05, + "loss": 1.8531, + "step": 3574 + }, + { + "epoch": 0.6493064226848594, + "grad_norm": 0.2933640778064728, + "learning_rate": 2.918524340927717e-05, + "loss": 1.7238, + "step": 3575 + }, + { + "epoch": 0.6494880468590369, + "grad_norm": 0.46100619435310364, + "learning_rate": 2.915836048425855e-05, + "loss": 1.6282, + "step": 3576 + }, + { + "epoch": 0.6496696710332145, + "grad_norm": 0.4292515218257904, + "learning_rate": 2.913148484903768e-05, + "loss": 1.7264, + "step": 3577 + }, + { + "epoch": 0.6498512952073922, + "grad_norm": 0.4340824782848358, + "learning_rate": 2.91046165130149e-05, + "loss": 1.6625, + "step": 3578 + }, + { + "epoch": 0.6500329193815697, + "grad_norm": 0.40076714754104614, + "learning_rate": 2.907775548558793e-05, + "loss": 1.719, + "step": 3579 + }, + { + "epoch": 0.6502145435557473, + "grad_norm": 0.3704671561717987, + "learning_rate": 2.9050901776152023e-05, + "loss": 1.8673, + "step": 3580 + }, + { + "epoch": 0.6503961677299248, + "grad_norm": 0.3615882992744446, + "learning_rate": 2.902405539409978e-05, + "loss": 1.508, + "step": 3581 + }, + { + "epoch": 0.6505777919041025, + "grad_norm": 0.5248231291770935, + "learning_rate": 2.899721634882132e-05, + "loss": 1.6097, + "step": 3582 + }, + { + "epoch": 0.65075941607828, + "grad_norm": 0.5523951649665833, + "learning_rate": 2.897038464970414e-05, + "loss": 1.6902, + "step": 3583 + }, + { + "epoch": 0.6509410402524576, + "grad_norm": 0.3148176670074463, + "learning_rate": 2.8943560306133183e-05, + "loss": 1.6906, + "step": 3584 + }, + { + "epoch": 0.6511226644266351, + "grad_norm": 0.43345457315444946, + "learning_rate": 2.8916743327490803e-05, + "loss": 1.8147, + "step": 3585 + }, + { + "epoch": 0.6513042886008128, + "grad_norm": 0.4376501142978668, + "learning_rate": 2.8889933723156825e-05, + "loss": 1.7103, + "step": 3586 + }, + { + "epoch": 0.6514859127749904, + "grad_norm": 1.5950593948364258, + "learning_rate": 2.886313150250848e-05, + "loss": 1.8195, + "step": 3587 + }, + { + "epoch": 0.6516675369491679, + "grad_norm": 0.5814979076385498, + "learning_rate": 2.8836336674920385e-05, + "loss": 1.677, + "step": 3588 + }, + { + "epoch": 0.6518491611233456, + "grad_norm": 0.7540147304534912, + "learning_rate": 2.8809549249764588e-05, + "loss": 1.8052, + "step": 3589 + }, + { + "epoch": 0.6520307852975231, + "grad_norm": 0.53880375623703, + "learning_rate": 2.8782769236410535e-05, + "loss": 1.6283, + "step": 3590 + }, + { + "epoch": 0.6522124094717007, + "grad_norm": 0.3496240973472595, + "learning_rate": 2.8755996644225097e-05, + "loss": 1.5022, + "step": 3591 + }, + { + "epoch": 0.6523940336458782, + "grad_norm": 0.7825833559036255, + "learning_rate": 2.8729231482572584e-05, + "loss": 1.8349, + "step": 3592 + }, + { + "epoch": 0.6525756578200559, + "grad_norm": 0.9617693424224854, + "learning_rate": 2.870247376081464e-05, + "loss": 1.7706, + "step": 3593 + }, + { + "epoch": 0.6527572819942334, + "grad_norm": 0.3869241774082184, + "learning_rate": 2.8675723488310323e-05, + "loss": 1.7661, + "step": 3594 + }, + { + "epoch": 0.652938906168411, + "grad_norm": 0.31282371282577515, + "learning_rate": 2.864898067441614e-05, + "loss": 1.6701, + "step": 3595 + }, + { + "epoch": 0.6531205303425885, + "grad_norm": 0.6413426399230957, + "learning_rate": 2.8622245328485907e-05, + "loss": 1.6581, + "step": 3596 + }, + { + "epoch": 0.6533021545167662, + "grad_norm": 0.3139571249485016, + "learning_rate": 2.8595517459870868e-05, + "loss": 1.8853, + "step": 3597 + }, + { + "epoch": 0.6534837786909438, + "grad_norm": 0.43221452832221985, + "learning_rate": 2.856879707791969e-05, + "loss": 1.7057, + "step": 3598 + }, + { + "epoch": 0.6536654028651213, + "grad_norm": 0.41614291071891785, + "learning_rate": 2.8542084191978336e-05, + "loss": 1.8151, + "step": 3599 + }, + { + "epoch": 0.653847027039299, + "grad_norm": 1.3850685358047485, + "learning_rate": 2.8515378811390243e-05, + "loss": 1.7621, + "step": 3600 + }, + { + "epoch": 0.6540286512134765, + "grad_norm": 0.5765554904937744, + "learning_rate": 2.848868094549615e-05, + "loss": 1.5359, + "step": 3601 + }, + { + "epoch": 0.6542102753876541, + "grad_norm": 0.3640555143356323, + "learning_rate": 2.8461990603634193e-05, + "loss": 1.8288, + "step": 3602 + }, + { + "epoch": 0.6543918995618316, + "grad_norm": 0.46745386719703674, + "learning_rate": 2.8435307795139848e-05, + "loss": 1.8423, + "step": 3603 + }, + { + "epoch": 0.6545735237360093, + "grad_norm": 0.3952290415763855, + "learning_rate": 2.8408632529346012e-05, + "loss": 1.6054, + "step": 3604 + }, + { + "epoch": 0.6547551479101869, + "grad_norm": 1.1179530620574951, + "learning_rate": 2.8381964815582934e-05, + "loss": 1.653, + "step": 3605 + }, + { + "epoch": 0.6549367720843644, + "grad_norm": 0.40477800369262695, + "learning_rate": 2.8355304663178185e-05, + "loss": 1.6123, + "step": 3606 + }, + { + "epoch": 0.655118396258542, + "grad_norm": 0.3106452226638794, + "learning_rate": 2.832865208145668e-05, + "loss": 1.4433, + "step": 3607 + }, + { + "epoch": 0.6553000204327196, + "grad_norm": 0.7606632709503174, + "learning_rate": 2.8302007079740766e-05, + "loss": 1.8322, + "step": 3608 + }, + { + "epoch": 0.6554816446068972, + "grad_norm": 0.4153408408164978, + "learning_rate": 2.827536966735006e-05, + "loss": 1.7436, + "step": 3609 + }, + { + "epoch": 0.6556632687810747, + "grad_norm": 0.38457900285720825, + "learning_rate": 2.824873985360153e-05, + "loss": 1.8534, + "step": 3610 + }, + { + "epoch": 0.6558448929552524, + "grad_norm": 0.47730302810668945, + "learning_rate": 2.8222117647809553e-05, + "loss": 1.7454, + "step": 3611 + }, + { + "epoch": 0.6560265171294299, + "grad_norm": 0.4376366138458252, + "learning_rate": 2.8195503059285767e-05, + "loss": 1.7272, + "step": 3612 + }, + { + "epoch": 0.6562081413036075, + "grad_norm": 0.39260032773017883, + "learning_rate": 2.8168896097339203e-05, + "loss": 1.6697, + "step": 3613 + }, + { + "epoch": 0.656389765477785, + "grad_norm": 0.38519397377967834, + "learning_rate": 2.8142296771276193e-05, + "loss": 1.7513, + "step": 3614 + }, + { + "epoch": 0.6565713896519627, + "grad_norm": 0.43893563747406006, + "learning_rate": 2.8115705090400384e-05, + "loss": 1.7019, + "step": 3615 + }, + { + "epoch": 0.6567530138261403, + "grad_norm": 0.3695445656776428, + "learning_rate": 2.8089121064012786e-05, + "loss": 1.7376, + "step": 3616 + }, + { + "epoch": 0.6569346380003178, + "grad_norm": 0.4757639467716217, + "learning_rate": 2.8062544701411742e-05, + "loss": 1.586, + "step": 3617 + }, + { + "epoch": 0.6571162621744955, + "grad_norm": 0.4526097774505615, + "learning_rate": 2.8035976011892863e-05, + "loss": 1.8958, + "step": 3618 + }, + { + "epoch": 0.657297886348673, + "grad_norm": 1.5158528089523315, + "learning_rate": 2.8009415004749094e-05, + "loss": 1.7872, + "step": 3619 + }, + { + "epoch": 0.6574795105228506, + "grad_norm": 0.35384029150009155, + "learning_rate": 2.7982861689270722e-05, + "loss": 1.8339, + "step": 3620 + }, + { + "epoch": 0.6576611346970281, + "grad_norm": 0.503014862537384, + "learning_rate": 2.7956316074745293e-05, + "loss": 1.6316, + "step": 3621 + }, + { + "epoch": 0.6578427588712058, + "grad_norm": 0.5150052309036255, + "learning_rate": 2.7929778170457698e-05, + "loss": 1.8388, + "step": 3622 + }, + { + "epoch": 0.6580243830453834, + "grad_norm": 0.5930125117301941, + "learning_rate": 2.7903247985690163e-05, + "loss": 1.7469, + "step": 3623 + }, + { + "epoch": 0.6582060072195609, + "grad_norm": 0.37005847692489624, + "learning_rate": 2.7876725529722135e-05, + "loss": 1.7704, + "step": 3624 + }, + { + "epoch": 0.6583876313937385, + "grad_norm": 0.42368775606155396, + "learning_rate": 2.785021081183038e-05, + "loss": 1.9073, + "step": 3625 + }, + { + "epoch": 0.6585692555679161, + "grad_norm": 0.9337003827095032, + "learning_rate": 2.7823703841289018e-05, + "loss": 1.6907, + "step": 3626 + }, + { + "epoch": 0.6587508797420937, + "grad_norm": 0.3932209610939026, + "learning_rate": 2.779720462736939e-05, + "loss": 1.8343, + "step": 3627 + }, + { + "epoch": 0.6589325039162712, + "grad_norm": 0.4807223081588745, + "learning_rate": 2.7770713179340128e-05, + "loss": 1.7128, + "step": 3628 + }, + { + "epoch": 0.6591141280904489, + "grad_norm": 0.37218761444091797, + "learning_rate": 2.7744229506467197e-05, + "loss": 1.7901, + "step": 3629 + }, + { + "epoch": 0.6592957522646264, + "grad_norm": 0.3071860074996948, + "learning_rate": 2.771775361801382e-05, + "loss": 1.6194, + "step": 3630 + }, + { + "epoch": 0.659477376438804, + "grad_norm": 0.32147690653800964, + "learning_rate": 2.7691285523240474e-05, + "loss": 1.45, + "step": 3631 + }, + { + "epoch": 0.6596590006129815, + "grad_norm": 0.48563721776008606, + "learning_rate": 2.7664825231404934e-05, + "loss": 1.9763, + "step": 3632 + }, + { + "epoch": 0.6598406247871592, + "grad_norm": 0.41948702931404114, + "learning_rate": 2.763837275176224e-05, + "loss": 1.759, + "step": 3633 + }, + { + "epoch": 0.6600222489613368, + "grad_norm": 0.35556134581565857, + "learning_rate": 2.7611928093564664e-05, + "loss": 1.6597, + "step": 3634 + }, + { + "epoch": 0.6602038731355143, + "grad_norm": 0.35066190361976624, + "learning_rate": 2.7585491266061808e-05, + "loss": 1.6342, + "step": 3635 + }, + { + "epoch": 0.6603854973096919, + "grad_norm": 0.4621551036834717, + "learning_rate": 2.7559062278500524e-05, + "loss": 1.769, + "step": 3636 + }, + { + "epoch": 0.6605671214838695, + "grad_norm": 0.34428638219833374, + "learning_rate": 2.753264114012487e-05, + "loss": 1.7868, + "step": 3637 + }, + { + "epoch": 0.6607487456580471, + "grad_norm": 0.3598118722438812, + "learning_rate": 2.7506227860176183e-05, + "loss": 1.6287, + "step": 3638 + }, + { + "epoch": 0.6609303698322246, + "grad_norm": 0.39492061734199524, + "learning_rate": 2.747982244789309e-05, + "loss": 1.8145, + "step": 3639 + }, + { + "epoch": 0.6611119940064023, + "grad_norm": 0.3363431990146637, + "learning_rate": 2.745342491251141e-05, + "loss": 1.6476, + "step": 3640 + }, + { + "epoch": 0.6612936181805799, + "grad_norm": 0.48884791135787964, + "learning_rate": 2.7427035263264222e-05, + "loss": 1.8512, + "step": 3641 + }, + { + "epoch": 0.6614752423547574, + "grad_norm": 0.4921896457672119, + "learning_rate": 2.7400653509381875e-05, + "loss": 1.6407, + "step": 3642 + }, + { + "epoch": 0.661656866528935, + "grad_norm": 0.3730270266532898, + "learning_rate": 2.737427966009195e-05, + "loss": 1.6068, + "step": 3643 + }, + { + "epoch": 0.6618384907031126, + "grad_norm": 0.481446236371994, + "learning_rate": 2.7347913724619232e-05, + "loss": 1.7354, + "step": 3644 + }, + { + "epoch": 0.6620201148772902, + "grad_norm": 0.410196453332901, + "learning_rate": 2.7321555712185766e-05, + "loss": 1.6478, + "step": 3645 + }, + { + "epoch": 0.6622017390514677, + "grad_norm": 0.382311075925827, + "learning_rate": 2.7295205632010777e-05, + "loss": 1.6866, + "step": 3646 + }, + { + "epoch": 0.6623833632256453, + "grad_norm": 0.979132890701294, + "learning_rate": 2.7268863493310794e-05, + "loss": 1.853, + "step": 3647 + }, + { + "epoch": 0.662564987399823, + "grad_norm": 0.513227105140686, + "learning_rate": 2.7242529305299543e-05, + "loss": 1.5968, + "step": 3648 + }, + { + "epoch": 0.6627466115740005, + "grad_norm": 0.5699822306632996, + "learning_rate": 2.721620307718793e-05, + "loss": 1.7731, + "step": 3649 + }, + { + "epoch": 0.662928235748178, + "grad_norm": 0.4429536461830139, + "learning_rate": 2.71898848181841e-05, + "loss": 1.5861, + "step": 3650 + }, + { + "epoch": 0.6631098599223557, + "grad_norm": 0.3877580463886261, + "learning_rate": 2.7163574537493407e-05, + "loss": 1.8525, + "step": 3651 + }, + { + "epoch": 0.6632914840965333, + "grad_norm": 0.7645485997200012, + "learning_rate": 2.7137272244318446e-05, + "loss": 1.8148, + "step": 3652 + }, + { + "epoch": 0.6634731082707108, + "grad_norm": 0.3316713273525238, + "learning_rate": 2.711097794785895e-05, + "loss": 1.677, + "step": 3653 + }, + { + "epoch": 0.6636547324448884, + "grad_norm": 0.4044135808944702, + "learning_rate": 2.7084691657311957e-05, + "loss": 1.858, + "step": 3654 + }, + { + "epoch": 0.663836356619066, + "grad_norm": 0.762764036655426, + "learning_rate": 2.7058413381871584e-05, + "loss": 1.6901, + "step": 3655 + }, + { + "epoch": 0.6640179807932436, + "grad_norm": 1.4210572242736816, + "learning_rate": 2.7032143130729255e-05, + "loss": 1.9538, + "step": 3656 + }, + { + "epoch": 0.6641996049674211, + "grad_norm": 0.3540404736995697, + "learning_rate": 2.700588091307351e-05, + "loss": 1.4961, + "step": 3657 + }, + { + "epoch": 0.6643812291415987, + "grad_norm": 0.35310885310173035, + "learning_rate": 2.6979626738090124e-05, + "loss": 1.5723, + "step": 3658 + }, + { + "epoch": 0.6645628533157764, + "grad_norm": 0.4240778088569641, + "learning_rate": 2.6953380614962004e-05, + "loss": 1.7365, + "step": 3659 + }, + { + "epoch": 0.6647444774899539, + "grad_norm": 0.5094699263572693, + "learning_rate": 2.692714255286931e-05, + "loss": 1.6968, + "step": 3660 + }, + { + "epoch": 0.6649261016641315, + "grad_norm": 0.3629837930202484, + "learning_rate": 2.690091256098936e-05, + "loss": 1.7568, + "step": 3661 + }, + { + "epoch": 0.6651077258383091, + "grad_norm": 1.1700624227523804, + "learning_rate": 2.6874690648496632e-05, + "loss": 1.9199, + "step": 3662 + }, + { + "epoch": 0.6652893500124867, + "grad_norm": 0.36596113443374634, + "learning_rate": 2.6848476824562772e-05, + "loss": 1.7784, + "step": 3663 + }, + { + "epoch": 0.6654709741866642, + "grad_norm": 0.4069920480251312, + "learning_rate": 2.682227109835661e-05, + "loss": 1.8016, + "step": 3664 + }, + { + "epoch": 0.6656525983608418, + "grad_norm": 0.4748948812484741, + "learning_rate": 2.6796073479044174e-05, + "loss": 1.8424, + "step": 3665 + }, + { + "epoch": 0.6658342225350194, + "grad_norm": 2.0147876739501953, + "learning_rate": 2.676988397578859e-05, + "loss": 1.956, + "step": 3666 + }, + { + "epoch": 0.666015846709197, + "grad_norm": 0.3715302348136902, + "learning_rate": 2.674370259775022e-05, + "loss": 1.6772, + "step": 3667 + }, + { + "epoch": 0.6661974708833746, + "grad_norm": 0.39338651299476624, + "learning_rate": 2.6717529354086524e-05, + "loss": 1.8452, + "step": 3668 + }, + { + "epoch": 0.6663790950575521, + "grad_norm": 0.494728684425354, + "learning_rate": 2.6691364253952124e-05, + "loss": 1.6342, + "step": 3669 + }, + { + "epoch": 0.6665607192317298, + "grad_norm": 0.42761194705963135, + "learning_rate": 2.666520730649885e-05, + "loss": 1.8867, + "step": 3670 + }, + { + "epoch": 0.6667423434059073, + "grad_norm": 0.5211673974990845, + "learning_rate": 2.6639058520875615e-05, + "loss": 1.6735, + "step": 3671 + }, + { + "epoch": 0.6669239675800849, + "grad_norm": 0.8390423059463501, + "learning_rate": 2.661291790622849e-05, + "loss": 1.9962, + "step": 3672 + }, + { + "epoch": 0.6671055917542625, + "grad_norm": 0.44559329748153687, + "learning_rate": 2.658678547170071e-05, + "loss": 1.7669, + "step": 3673 + }, + { + "epoch": 0.6672872159284401, + "grad_norm": 0.40113404393196106, + "learning_rate": 2.656066122643266e-05, + "loss": 1.7215, + "step": 3674 + }, + { + "epoch": 0.6674688401026176, + "grad_norm": 0.3960772156715393, + "learning_rate": 2.6534545179561825e-05, + "loss": 1.9361, + "step": 3675 + }, + { + "epoch": 0.6676504642767952, + "grad_norm": 0.4492075741291046, + "learning_rate": 2.6508437340222835e-05, + "loss": 1.6879, + "step": 3676 + }, + { + "epoch": 0.6678320884509729, + "grad_norm": 0.4822429418563843, + "learning_rate": 2.6482337717547427e-05, + "loss": 1.6658, + "step": 3677 + }, + { + "epoch": 0.6680137126251504, + "grad_norm": 0.5393883585929871, + "learning_rate": 2.6456246320664514e-05, + "loss": 1.8526, + "step": 3678 + }, + { + "epoch": 0.668195336799328, + "grad_norm": 0.32822102308273315, + "learning_rate": 2.6430163158700115e-05, + "loss": 1.7051, + "step": 3679 + }, + { + "epoch": 0.6683769609735055, + "grad_norm": 0.45794522762298584, + "learning_rate": 2.6404088240777352e-05, + "loss": 1.7239, + "step": 3680 + }, + { + "epoch": 0.6685585851476832, + "grad_norm": 0.3477844297885895, + "learning_rate": 2.6378021576016466e-05, + "loss": 1.814, + "step": 3681 + }, + { + "epoch": 0.6687402093218607, + "grad_norm": 0.8864353895187378, + "learning_rate": 2.6351963173534794e-05, + "loss": 1.8188, + "step": 3682 + }, + { + "epoch": 0.6689218334960383, + "grad_norm": 0.2682989537715912, + "learning_rate": 2.6325913042446847e-05, + "loss": 1.8624, + "step": 3683 + }, + { + "epoch": 0.669103457670216, + "grad_norm": 0.46772506833076477, + "learning_rate": 2.6299871191864163e-05, + "loss": 1.678, + "step": 3684 + }, + { + "epoch": 0.6692850818443935, + "grad_norm": 0.48050233721733093, + "learning_rate": 2.627383763089546e-05, + "loss": 1.8227, + "step": 3685 + }, + { + "epoch": 0.6694667060185711, + "grad_norm": 0.2711726427078247, + "learning_rate": 2.6247812368646475e-05, + "loss": 1.5815, + "step": 3686 + }, + { + "epoch": 0.6696483301927486, + "grad_norm": 0.3554350435733795, + "learning_rate": 2.622179541422013e-05, + "loss": 1.8446, + "step": 3687 + }, + { + "epoch": 0.6698299543669263, + "grad_norm": 0.7588464021682739, + "learning_rate": 2.619578677671638e-05, + "loss": 1.7356, + "step": 3688 + }, + { + "epoch": 0.6700115785411038, + "grad_norm": 0.41006985306739807, + "learning_rate": 2.6169786465232284e-05, + "loss": 1.6817, + "step": 3689 + }, + { + "epoch": 0.6701932027152814, + "grad_norm": 0.36677494645118713, + "learning_rate": 2.614379448886197e-05, + "loss": 1.5982, + "step": 3690 + }, + { + "epoch": 0.670374826889459, + "grad_norm": 0.4502516984939575, + "learning_rate": 2.6117810856696702e-05, + "loss": 1.7839, + "step": 3691 + }, + { + "epoch": 0.6705564510636366, + "grad_norm": 0.3863702118396759, + "learning_rate": 2.6091835577824808e-05, + "loss": 1.8254, + "step": 3692 + }, + { + "epoch": 0.6707380752378141, + "grad_norm": 0.3510890007019043, + "learning_rate": 2.6065868661331673e-05, + "loss": 1.7329, + "step": 3693 + }, + { + "epoch": 0.6709196994119917, + "grad_norm": 0.33847129344940186, + "learning_rate": 2.6039910116299753e-05, + "loss": 1.6493, + "step": 3694 + }, + { + "epoch": 0.6711013235861694, + "grad_norm": 0.7752748131752014, + "learning_rate": 2.6013959951808585e-05, + "loss": 1.9045, + "step": 3695 + }, + { + "epoch": 0.6712829477603469, + "grad_norm": 0.3767830729484558, + "learning_rate": 2.5988018176934803e-05, + "loss": 1.9544, + "step": 3696 + }, + { + "epoch": 0.6714645719345245, + "grad_norm": 0.4244510233402252, + "learning_rate": 2.5962084800752063e-05, + "loss": 1.6599, + "step": 3697 + }, + { + "epoch": 0.671646196108702, + "grad_norm": 0.4101501405239105, + "learning_rate": 2.593615983233113e-05, + "loss": 1.6538, + "step": 3698 + }, + { + "epoch": 0.6718278202828797, + "grad_norm": 0.34569650888442993, + "learning_rate": 2.5910243280739766e-05, + "loss": 1.5432, + "step": 3699 + }, + { + "epoch": 0.6720094444570572, + "grad_norm": 0.6445664167404175, + "learning_rate": 2.5884335155042867e-05, + "loss": 1.8156, + "step": 3700 + }, + { + "epoch": 0.6721910686312348, + "grad_norm": 0.35771629214286804, + "learning_rate": 2.5858435464302315e-05, + "loss": 1.8072, + "step": 3701 + }, + { + "epoch": 0.6723726928054125, + "grad_norm": 0.32047533988952637, + "learning_rate": 2.583254421757705e-05, + "loss": 1.7534, + "step": 3702 + }, + { + "epoch": 0.67255431697959, + "grad_norm": 0.40096551179885864, + "learning_rate": 2.580666142392312e-05, + "loss": 1.6398, + "step": 3703 + }, + { + "epoch": 0.6727359411537676, + "grad_norm": 0.311082661151886, + "learning_rate": 2.578078709239352e-05, + "loss": 1.7293, + "step": 3704 + }, + { + "epoch": 0.6729175653279451, + "grad_norm": 0.383953332901001, + "learning_rate": 2.5754921232038388e-05, + "loss": 1.9609, + "step": 3705 + }, + { + "epoch": 0.6730991895021228, + "grad_norm": 0.397897332906723, + "learning_rate": 2.5729063851904816e-05, + "loss": 1.8329, + "step": 3706 + }, + { + "epoch": 0.6732808136763003, + "grad_norm": 1.8524385690689087, + "learning_rate": 2.570321496103697e-05, + "loss": 2.0167, + "step": 3707 + }, + { + "epoch": 0.6734624378504779, + "grad_norm": 0.5721729397773743, + "learning_rate": 2.567737456847602e-05, + "loss": 1.7711, + "step": 3708 + }, + { + "epoch": 0.6736440620246554, + "grad_norm": 0.4179374575614929, + "learning_rate": 2.5651542683260192e-05, + "loss": 1.6148, + "step": 3709 + }, + { + "epoch": 0.6738256861988331, + "grad_norm": 0.44549626111984253, + "learning_rate": 2.5625719314424755e-05, + "loss": 1.897, + "step": 3710 + }, + { + "epoch": 0.6740073103730106, + "grad_norm": 0.7184817790985107, + "learning_rate": 2.559990447100195e-05, + "loss": 1.6262, + "step": 3711 + }, + { + "epoch": 0.6741889345471882, + "grad_norm": 0.5328118801116943, + "learning_rate": 2.5574098162021032e-05, + "loss": 1.6442, + "step": 3712 + }, + { + "epoch": 0.6743705587213659, + "grad_norm": 0.3804372251033783, + "learning_rate": 2.554830039650834e-05, + "loss": 1.5663, + "step": 3713 + }, + { + "epoch": 0.6745521828955434, + "grad_norm": 0.39976412057876587, + "learning_rate": 2.552251118348716e-05, + "loss": 1.6387, + "step": 3714 + }, + { + "epoch": 0.674733807069721, + "grad_norm": 0.36907002329826355, + "learning_rate": 2.549673053197778e-05, + "loss": 1.7488, + "step": 3715 + }, + { + "epoch": 0.6749154312438985, + "grad_norm": 0.36284488439559937, + "learning_rate": 2.5470958450997562e-05, + "loss": 1.8009, + "step": 3716 + }, + { + "epoch": 0.6750970554180762, + "grad_norm": 0.6194390654563904, + "learning_rate": 2.5445194949560795e-05, + "loss": 1.5786, + "step": 3717 + }, + { + "epoch": 0.6752786795922537, + "grad_norm": 0.3902406692504883, + "learning_rate": 2.5419440036678836e-05, + "loss": 1.8518, + "step": 3718 + }, + { + "epoch": 0.6754603037664313, + "grad_norm": 0.5929348468780518, + "learning_rate": 2.5393693721359985e-05, + "loss": 1.9427, + "step": 3719 + }, + { + "epoch": 0.6756419279406088, + "grad_norm": 0.34327125549316406, + "learning_rate": 2.536795601260955e-05, + "loss": 1.5854, + "step": 3720 + }, + { + "epoch": 0.6758235521147865, + "grad_norm": 0.5788311958312988, + "learning_rate": 2.5342226919429806e-05, + "loss": 1.5765, + "step": 3721 + }, + { + "epoch": 0.6760051762889641, + "grad_norm": 0.37283676862716675, + "learning_rate": 2.531650645082007e-05, + "loss": 1.9163, + "step": 3722 + }, + { + "epoch": 0.6761868004631416, + "grad_norm": 0.36711111664772034, + "learning_rate": 2.5290794615776624e-05, + "loss": 1.7086, + "step": 3723 + }, + { + "epoch": 0.6763684246373193, + "grad_norm": 0.3637526333332062, + "learning_rate": 2.5265091423292703e-05, + "loss": 1.6884, + "step": 3724 + }, + { + "epoch": 0.6765500488114968, + "grad_norm": 0.49599647521972656, + "learning_rate": 2.5239396882358514e-05, + "loss": 1.6774, + "step": 3725 + }, + { + "epoch": 0.6767316729856744, + "grad_norm": 0.4217991232872009, + "learning_rate": 2.5213711001961294e-05, + "loss": 1.7934, + "step": 3726 + }, + { + "epoch": 0.6769132971598519, + "grad_norm": 0.47039228677749634, + "learning_rate": 2.51880337910852e-05, + "loss": 1.7519, + "step": 3727 + }, + { + "epoch": 0.6770949213340296, + "grad_norm": 0.6201013922691345, + "learning_rate": 2.516236525871134e-05, + "loss": 1.9426, + "step": 3728 + }, + { + "epoch": 0.6772765455082072, + "grad_norm": 0.42070847749710083, + "learning_rate": 2.5136705413817875e-05, + "loss": 1.7694, + "step": 3729 + }, + { + "epoch": 0.6774581696823847, + "grad_norm": 0.38301393389701843, + "learning_rate": 2.511105426537982e-05, + "loss": 1.7428, + "step": 3730 + }, + { + "epoch": 0.6776397938565623, + "grad_norm": 0.43644383549690247, + "learning_rate": 2.5085411822369244e-05, + "loss": 1.8949, + "step": 3731 + }, + { + "epoch": 0.6778214180307399, + "grad_norm": 0.535317063331604, + "learning_rate": 2.5059778093755092e-05, + "loss": 1.6528, + "step": 3732 + }, + { + "epoch": 0.6780030422049175, + "grad_norm": 0.4340539872646332, + "learning_rate": 2.5034153088503298e-05, + "loss": 1.641, + "step": 3733 + }, + { + "epoch": 0.678184666379095, + "grad_norm": 0.7050914764404297, + "learning_rate": 2.5008536815576744e-05, + "loss": 1.7886, + "step": 3734 + }, + { + "epoch": 0.6783662905532727, + "grad_norm": 0.3197711706161499, + "learning_rate": 2.4982929283935287e-05, + "loss": 1.6119, + "step": 3735 + }, + { + "epoch": 0.6785479147274502, + "grad_norm": 0.49305489659309387, + "learning_rate": 2.4957330502535663e-05, + "loss": 1.6243, + "step": 3736 + }, + { + "epoch": 0.6787295389016278, + "grad_norm": 0.39605382084846497, + "learning_rate": 2.4931740480331588e-05, + "loss": 1.7723, + "step": 3737 + }, + { + "epoch": 0.6789111630758053, + "grad_norm": 0.3538605272769928, + "learning_rate": 2.4906159226273685e-05, + "loss": 1.8815, + "step": 3738 + }, + { + "epoch": 0.679092787249983, + "grad_norm": 0.33815130591392517, + "learning_rate": 2.488058674930956e-05, + "loss": 1.6709, + "step": 3739 + }, + { + "epoch": 0.6792744114241606, + "grad_norm": 1.302302598953247, + "learning_rate": 2.4855023058383692e-05, + "loss": 1.7055, + "step": 3740 + }, + { + "epoch": 0.6794560355983381, + "grad_norm": 0.31927549839019775, + "learning_rate": 2.4829468162437554e-05, + "loss": 1.621, + "step": 3741 + }, + { + "epoch": 0.6796376597725157, + "grad_norm": 0.3847543001174927, + "learning_rate": 2.4803922070409473e-05, + "loss": 1.6249, + "step": 3742 + }, + { + "epoch": 0.6798192839466933, + "grad_norm": 0.4099222421646118, + "learning_rate": 2.4778384791234722e-05, + "loss": 1.7133, + "step": 3743 + }, + { + "epoch": 0.6800009081208709, + "grad_norm": 0.3297675549983978, + "learning_rate": 2.4752856333845525e-05, + "loss": 1.5762, + "step": 3744 + }, + { + "epoch": 0.6801825322950484, + "grad_norm": 0.34864020347595215, + "learning_rate": 2.4727336707170973e-05, + "loss": 1.6802, + "step": 3745 + }, + { + "epoch": 0.6803641564692261, + "grad_norm": 0.29313376545906067, + "learning_rate": 2.4701825920137074e-05, + "loss": 1.7264, + "step": 3746 + }, + { + "epoch": 0.6805457806434037, + "grad_norm": 0.6469292640686035, + "learning_rate": 2.467632398166677e-05, + "loss": 1.7885, + "step": 3747 + }, + { + "epoch": 0.6807274048175812, + "grad_norm": 0.4410405158996582, + "learning_rate": 2.465083090067992e-05, + "loss": 1.7321, + "step": 3748 + }, + { + "epoch": 0.6809090289917588, + "grad_norm": 0.2748869061470032, + "learning_rate": 2.4625346686093244e-05, + "loss": 1.6662, + "step": 3749 + }, + { + "epoch": 0.6810906531659364, + "grad_norm": 0.4099177122116089, + "learning_rate": 2.4599871346820363e-05, + "loss": 1.8389, + "step": 3750 + }, + { + "epoch": 0.681272277340114, + "grad_norm": 0.5982104539871216, + "learning_rate": 2.4574404891771826e-05, + "loss": 1.7338, + "step": 3751 + }, + { + "epoch": 0.6814539015142915, + "grad_norm": 0.4212876558303833, + "learning_rate": 2.454894732985502e-05, + "loss": 1.8269, + "step": 3752 + }, + { + "epoch": 0.6816355256884692, + "grad_norm": 0.992641270160675, + "learning_rate": 2.4523498669974294e-05, + "loss": 1.7796, + "step": 3753 + }, + { + "epoch": 0.6818171498626467, + "grad_norm": 0.5142229199409485, + "learning_rate": 2.4498058921030847e-05, + "loss": 1.7786, + "step": 3754 + }, + { + "epoch": 0.6819987740368243, + "grad_norm": 0.8492749929428101, + "learning_rate": 2.447262809192276e-05, + "loss": 2.0043, + "step": 3755 + }, + { + "epoch": 0.6821803982110018, + "grad_norm": 0.6872209906578064, + "learning_rate": 2.444720619154497e-05, + "loss": 1.6472, + "step": 3756 + }, + { + "epoch": 0.6823620223851795, + "grad_norm": 0.584552526473999, + "learning_rate": 2.4421793228789354e-05, + "loss": 1.7768, + "step": 3757 + }, + { + "epoch": 0.6825436465593571, + "grad_norm": 0.4288613796234131, + "learning_rate": 2.43963892125446e-05, + "loss": 1.9162, + "step": 3758 + }, + { + "epoch": 0.6827252707335346, + "grad_norm": 0.3864923119544983, + "learning_rate": 2.4370994151696287e-05, + "loss": 1.6913, + "step": 3759 + }, + { + "epoch": 0.6829068949077122, + "grad_norm": 0.42785993218421936, + "learning_rate": 2.4345608055126874e-05, + "loss": 1.8861, + "step": 3760 + }, + { + "epoch": 0.6830885190818898, + "grad_norm": 0.8030163049697876, + "learning_rate": 2.43202309317157e-05, + "loss": 1.6821, + "step": 3761 + }, + { + "epoch": 0.6832701432560674, + "grad_norm": 0.37061038613319397, + "learning_rate": 2.4294862790338917e-05, + "loss": 1.5597, + "step": 3762 + }, + { + "epoch": 0.6834517674302449, + "grad_norm": 0.8454883694648743, + "learning_rate": 2.426950363986958e-05, + "loss": 1.7767, + "step": 3763 + }, + { + "epoch": 0.6836333916044226, + "grad_norm": 0.5000087022781372, + "learning_rate": 2.4244153489177545e-05, + "loss": 1.8421, + "step": 3764 + }, + { + "epoch": 0.6838150157786002, + "grad_norm": 0.37381336092948914, + "learning_rate": 2.4218812347129578e-05, + "loss": 1.6695, + "step": 3765 + }, + { + "epoch": 0.6839966399527777, + "grad_norm": 0.4053579866886139, + "learning_rate": 2.4193480222589295e-05, + "loss": 1.7735, + "step": 3766 + }, + { + "epoch": 0.6841782641269553, + "grad_norm": 0.5134552121162415, + "learning_rate": 2.4168157124417108e-05, + "loss": 1.7642, + "step": 3767 + }, + { + "epoch": 0.6843598883011329, + "grad_norm": 0.5469010472297668, + "learning_rate": 2.41428430614703e-05, + "loss": 1.764, + "step": 3768 + }, + { + "epoch": 0.6845415124753105, + "grad_norm": 0.3211790919303894, + "learning_rate": 2.4117538042602977e-05, + "loss": 1.657, + "step": 3769 + }, + { + "epoch": 0.684723136649488, + "grad_norm": 0.35741114616394043, + "learning_rate": 2.409224207666614e-05, + "loss": 1.645, + "step": 3770 + }, + { + "epoch": 0.6849047608236656, + "grad_norm": 0.3645443320274353, + "learning_rate": 2.406695517250753e-05, + "loss": 1.5983, + "step": 3771 + }, + { + "epoch": 0.6850863849978432, + "grad_norm": 0.4320109486579895, + "learning_rate": 2.404167733897181e-05, + "loss": 1.654, + "step": 3772 + }, + { + "epoch": 0.6852680091720208, + "grad_norm": 0.40356317162513733, + "learning_rate": 2.4016408584900395e-05, + "loss": 1.8402, + "step": 3773 + }, + { + "epoch": 0.6854496333461984, + "grad_norm": 0.4030505418777466, + "learning_rate": 2.3991148919131595e-05, + "loss": 1.9403, + "step": 3774 + }, + { + "epoch": 0.685631257520376, + "grad_norm": 1.6213847398757935, + "learning_rate": 2.3965898350500484e-05, + "loss": 2.0133, + "step": 3775 + }, + { + "epoch": 0.6858128816945536, + "grad_norm": 0.3481433391571045, + "learning_rate": 2.3940656887838975e-05, + "loss": 1.7454, + "step": 3776 + }, + { + "epoch": 0.6859945058687311, + "grad_norm": 0.3363727033138275, + "learning_rate": 2.391542453997578e-05, + "loss": 1.8448, + "step": 3777 + }, + { + "epoch": 0.6861761300429087, + "grad_norm": 0.36488208174705505, + "learning_rate": 2.389020131573645e-05, + "loss": 1.7203, + "step": 3778 + }, + { + "epoch": 0.6863577542170863, + "grad_norm": 0.36540400981903076, + "learning_rate": 2.3864987223943365e-05, + "loss": 1.7354, + "step": 3779 + }, + { + "epoch": 0.6865393783912639, + "grad_norm": 0.4198780059814453, + "learning_rate": 2.3839782273415645e-05, + "loss": 1.7169, + "step": 3780 + }, + { + "epoch": 0.6867210025654414, + "grad_norm": 0.45003369450569153, + "learning_rate": 2.381458647296925e-05, + "loss": 1.8364, + "step": 3781 + }, + { + "epoch": 0.686902626739619, + "grad_norm": 0.3605731725692749, + "learning_rate": 2.378939983141693e-05, + "loss": 1.4973, + "step": 3782 + }, + { + "epoch": 0.6870842509137967, + "grad_norm": 0.35536181926727295, + "learning_rate": 2.3764222357568266e-05, + "loss": 1.5922, + "step": 3783 + }, + { + "epoch": 0.6872658750879742, + "grad_norm": 0.4191442131996155, + "learning_rate": 2.3739054060229564e-05, + "loss": 1.6456, + "step": 3784 + }, + { + "epoch": 0.6874474992621518, + "grad_norm": 0.6703386306762695, + "learning_rate": 2.3713894948204003e-05, + "loss": 1.8911, + "step": 3785 + }, + { + "epoch": 0.6876291234363294, + "grad_norm": 0.3737078905105591, + "learning_rate": 2.3688745030291486e-05, + "loss": 1.8019, + "step": 3786 + }, + { + "epoch": 0.687810747610507, + "grad_norm": 0.5523988008499146, + "learning_rate": 2.36636043152887e-05, + "loss": 1.9043, + "step": 3787 + }, + { + "epoch": 0.6879923717846845, + "grad_norm": 0.534428060054779, + "learning_rate": 2.3638472811989176e-05, + "loss": 1.6837, + "step": 3788 + }, + { + "epoch": 0.6881739959588621, + "grad_norm": 0.37295374274253845, + "learning_rate": 2.3613350529183142e-05, + "loss": 1.7327, + "step": 3789 + }, + { + "epoch": 0.6883556201330397, + "grad_norm": 0.4401415288448334, + "learning_rate": 2.3588237475657677e-05, + "loss": 1.6797, + "step": 3790 + }, + { + "epoch": 0.6885372443072173, + "grad_norm": 0.799808144569397, + "learning_rate": 2.3563133660196556e-05, + "loss": 1.7533, + "step": 3791 + }, + { + "epoch": 0.6887188684813949, + "grad_norm": 0.3116327226161957, + "learning_rate": 2.35380390915804e-05, + "loss": 1.6127, + "step": 3792 + }, + { + "epoch": 0.6889004926555724, + "grad_norm": 0.5932947993278503, + "learning_rate": 2.3512953778586537e-05, + "loss": 1.7008, + "step": 3793 + }, + { + "epoch": 0.6890821168297501, + "grad_norm": 0.5408570766448975, + "learning_rate": 2.3487877729989084e-05, + "loss": 1.5244, + "step": 3794 + }, + { + "epoch": 0.6892637410039276, + "grad_norm": 1.7298539876937866, + "learning_rate": 2.346281095455889e-05, + "loss": 1.8003, + "step": 3795 + }, + { + "epoch": 0.6894453651781052, + "grad_norm": 0.41897276043891907, + "learning_rate": 2.3437753461063593e-05, + "loss": 1.7362, + "step": 3796 + }, + { + "epoch": 0.6896269893522828, + "grad_norm": 0.6875098347663879, + "learning_rate": 2.3412705258267604e-05, + "loss": 1.7909, + "step": 3797 + }, + { + "epoch": 0.6898086135264604, + "grad_norm": 0.3823298215866089, + "learning_rate": 2.338766635493203e-05, + "loss": 1.6275, + "step": 3798 + }, + { + "epoch": 0.6899902377006379, + "grad_norm": 0.900367796421051, + "learning_rate": 2.3362636759814748e-05, + "loss": 1.7036, + "step": 3799 + }, + { + "epoch": 0.6901718618748155, + "grad_norm": 0.420887291431427, + "learning_rate": 2.3337616481670366e-05, + "loss": 1.866, + "step": 3800 + }, + { + "epoch": 0.6903534860489932, + "grad_norm": 0.35067054629325867, + "learning_rate": 2.3312605529250276e-05, + "loss": 1.7655, + "step": 3801 + }, + { + "epoch": 0.6905351102231707, + "grad_norm": 1.4487292766571045, + "learning_rate": 2.3287603911302553e-05, + "loss": 1.7648, + "step": 3802 + }, + { + "epoch": 0.6907167343973483, + "grad_norm": 0.3478233516216278, + "learning_rate": 2.3262611636572067e-05, + "loss": 1.8595, + "step": 3803 + }, + { + "epoch": 0.6908983585715258, + "grad_norm": 0.5204758048057556, + "learning_rate": 2.3237628713800352e-05, + "loss": 1.6098, + "step": 3804 + }, + { + "epoch": 0.6910799827457035, + "grad_norm": 0.35520848631858826, + "learning_rate": 2.3212655151725738e-05, + "loss": 1.5322, + "step": 3805 + }, + { + "epoch": 0.691261606919881, + "grad_norm": 0.4085613787174225, + "learning_rate": 2.318769095908323e-05, + "loss": 1.7328, + "step": 3806 + }, + { + "epoch": 0.6914432310940586, + "grad_norm": 0.41932618618011475, + "learning_rate": 2.316273614460458e-05, + "loss": 1.5727, + "step": 3807 + }, + { + "epoch": 0.6916248552682362, + "grad_norm": 0.5504932403564453, + "learning_rate": 2.3137790717018238e-05, + "loss": 1.6756, + "step": 3808 + }, + { + "epoch": 0.6918064794424138, + "grad_norm": 0.37336885929107666, + "learning_rate": 2.3112854685049397e-05, + "loss": 1.8712, + "step": 3809 + }, + { + "epoch": 0.6919881036165914, + "grad_norm": 0.4401828646659851, + "learning_rate": 2.3087928057419973e-05, + "loss": 1.51, + "step": 3810 + }, + { + "epoch": 0.6921697277907689, + "grad_norm": 0.3953062891960144, + "learning_rate": 2.3063010842848564e-05, + "loss": 1.8694, + "step": 3811 + }, + { + "epoch": 0.6923513519649466, + "grad_norm": 0.3225861191749573, + "learning_rate": 2.3038103050050476e-05, + "loss": 1.844, + "step": 3812 + }, + { + "epoch": 0.6925329761391241, + "grad_norm": 0.7811111807823181, + "learning_rate": 2.3013204687737715e-05, + "loss": 1.6529, + "step": 3813 + }, + { + "epoch": 0.6927146003133017, + "grad_norm": 0.4427846074104309, + "learning_rate": 2.298831576461904e-05, + "loss": 1.7801, + "step": 3814 + }, + { + "epoch": 0.6928962244874792, + "grad_norm": 0.3977075219154358, + "learning_rate": 2.2963436289399824e-05, + "loss": 1.8149, + "step": 3815 + }, + { + "epoch": 0.6930778486616569, + "grad_norm": 0.5572588443756104, + "learning_rate": 2.2938566270782235e-05, + "loss": 1.7238, + "step": 3816 + }, + { + "epoch": 0.6932594728358344, + "grad_norm": 0.41032615303993225, + "learning_rate": 2.291370571746503e-05, + "loss": 1.8395, + "step": 3817 + }, + { + "epoch": 0.693441097010012, + "grad_norm": 0.5512093305587769, + "learning_rate": 2.2888854638143748e-05, + "loss": 1.7017, + "step": 3818 + }, + { + "epoch": 0.6936227211841897, + "grad_norm": 0.4830801486968994, + "learning_rate": 2.2864013041510553e-05, + "loss": 1.7143, + "step": 3819 + }, + { + "epoch": 0.6938043453583672, + "grad_norm": 0.5850067138671875, + "learning_rate": 2.283918093625429e-05, + "loss": 1.4267, + "step": 3820 + }, + { + "epoch": 0.6939859695325448, + "grad_norm": 0.40191054344177246, + "learning_rate": 2.2814358331060532e-05, + "loss": 1.831, + "step": 3821 + }, + { + "epoch": 0.6941675937067223, + "grad_norm": 0.42241016030311584, + "learning_rate": 2.278954523461151e-05, + "loss": 1.8327, + "step": 3822 + }, + { + "epoch": 0.6943492178809, + "grad_norm": 0.34547895193099976, + "learning_rate": 2.2764741655586108e-05, + "loss": 1.6495, + "step": 3823 + }, + { + "epoch": 0.6945308420550775, + "grad_norm": 0.6157140135765076, + "learning_rate": 2.2739947602659894e-05, + "loss": 1.8921, + "step": 3824 + }, + { + "epoch": 0.6947124662292551, + "grad_norm": 0.45738446712493896, + "learning_rate": 2.271516308450511e-05, + "loss": 1.7904, + "step": 3825 + }, + { + "epoch": 0.6948940904034328, + "grad_norm": 0.4610554873943329, + "learning_rate": 2.2690388109790628e-05, + "loss": 1.7596, + "step": 3826 + }, + { + "epoch": 0.6950757145776103, + "grad_norm": 0.6973397731781006, + "learning_rate": 2.2665622687182044e-05, + "loss": 1.9144, + "step": 3827 + }, + { + "epoch": 0.6952573387517879, + "grad_norm": 0.38503777980804443, + "learning_rate": 2.2640866825341588e-05, + "loss": 1.6051, + "step": 3828 + }, + { + "epoch": 0.6954389629259654, + "grad_norm": 0.34028372168540955, + "learning_rate": 2.2616120532928126e-05, + "loss": 1.6608, + "step": 3829 + }, + { + "epoch": 0.6956205871001431, + "grad_norm": 0.35655155777931213, + "learning_rate": 2.2591383818597166e-05, + "loss": 1.611, + "step": 3830 + }, + { + "epoch": 0.6958022112743206, + "grad_norm": 0.6442535519599915, + "learning_rate": 2.2566656691000932e-05, + "loss": 1.8859, + "step": 3831 + }, + { + "epoch": 0.6959838354484982, + "grad_norm": 0.37632399797439575, + "learning_rate": 2.2541939158788227e-05, + "loss": 1.775, + "step": 3832 + }, + { + "epoch": 0.6961654596226757, + "grad_norm": 0.4902356266975403, + "learning_rate": 2.2517231230604513e-05, + "loss": 1.7718, + "step": 3833 + }, + { + "epoch": 0.6963470837968534, + "grad_norm": 1.0558276176452637, + "learning_rate": 2.2492532915091936e-05, + "loss": 1.7319, + "step": 3834 + }, + { + "epoch": 0.696528707971031, + "grad_norm": 0.3605803847312927, + "learning_rate": 2.2467844220889207e-05, + "loss": 1.5731, + "step": 3835 + }, + { + "epoch": 0.6967103321452085, + "grad_norm": 0.4328206777572632, + "learning_rate": 2.244316515663175e-05, + "loss": 1.6734, + "step": 3836 + }, + { + "epoch": 0.6968919563193862, + "grad_norm": 0.41567543148994446, + "learning_rate": 2.2418495730951566e-05, + "loss": 1.8063, + "step": 3837 + }, + { + "epoch": 0.6970735804935637, + "grad_norm": 0.3211617171764374, + "learning_rate": 2.23938359524773e-05, + "loss": 1.6934, + "step": 3838 + }, + { + "epoch": 0.6972552046677413, + "grad_norm": 0.6185224056243896, + "learning_rate": 2.23691858298342e-05, + "loss": 1.6865, + "step": 3839 + }, + { + "epoch": 0.6974368288419188, + "grad_norm": 0.3324646055698395, + "learning_rate": 2.2344545371644182e-05, + "loss": 1.8106, + "step": 3840 + }, + { + "epoch": 0.6976184530160965, + "grad_norm": 0.3568219840526581, + "learning_rate": 2.2319914586525777e-05, + "loss": 2.0302, + "step": 3841 + }, + { + "epoch": 0.697800077190274, + "grad_norm": 0.3210183382034302, + "learning_rate": 2.2295293483094105e-05, + "loss": 1.7719, + "step": 3842 + }, + { + "epoch": 0.6979817013644516, + "grad_norm": 0.472869336605072, + "learning_rate": 2.227068206996088e-05, + "loss": 1.7583, + "step": 3843 + }, + { + "epoch": 0.6981633255386291, + "grad_norm": 0.587960422039032, + "learning_rate": 2.2246080355734494e-05, + "loss": 1.6294, + "step": 3844 + }, + { + "epoch": 0.6983449497128068, + "grad_norm": 0.786527156829834, + "learning_rate": 2.2221488349019903e-05, + "loss": 1.9192, + "step": 3845 + }, + { + "epoch": 0.6985265738869844, + "grad_norm": 0.43787094950675964, + "learning_rate": 2.2196906058418643e-05, + "loss": 1.8211, + "step": 3846 + }, + { + "epoch": 0.6987081980611619, + "grad_norm": 0.9651280045509338, + "learning_rate": 2.2172333492528928e-05, + "loss": 1.6751, + "step": 3847 + }, + { + "epoch": 0.6988898222353396, + "grad_norm": 0.3655005097389221, + "learning_rate": 2.2147770659945483e-05, + "loss": 1.5088, + "step": 3848 + }, + { + "epoch": 0.6990714464095171, + "grad_norm": 0.3637803792953491, + "learning_rate": 2.212321756925971e-05, + "loss": 1.8837, + "step": 3849 + }, + { + "epoch": 0.6992530705836947, + "grad_norm": 1.6883597373962402, + "learning_rate": 2.2098674229059537e-05, + "loss": 1.9328, + "step": 3850 + }, + { + "epoch": 0.6994346947578722, + "grad_norm": 0.4251827895641327, + "learning_rate": 2.2074140647929503e-05, + "loss": 1.8993, + "step": 3851 + }, + { + "epoch": 0.6996163189320499, + "grad_norm": 0.45079970359802246, + "learning_rate": 2.2049616834450754e-05, + "loss": 1.6713, + "step": 3852 + }, + { + "epoch": 0.6997979431062274, + "grad_norm": 0.4839218258857727, + "learning_rate": 2.202510279720102e-05, + "loss": 1.8312, + "step": 3853 + }, + { + "epoch": 0.699979567280405, + "grad_norm": 0.36552420258522034, + "learning_rate": 2.2000598544754575e-05, + "loss": 1.5616, + "step": 3854 + }, + { + "epoch": 0.7001611914545826, + "grad_norm": 0.3364499807357788, + "learning_rate": 2.1976104085682304e-05, + "loss": 1.7084, + "step": 3855 + }, + { + "epoch": 0.7003428156287602, + "grad_norm": 0.4655369818210602, + "learning_rate": 2.1951619428551624e-05, + "loss": 1.6816, + "step": 3856 + }, + { + "epoch": 0.7005244398029378, + "grad_norm": 0.36903512477874756, + "learning_rate": 2.1927144581926596e-05, + "loss": 1.6712, + "step": 3857 + }, + { + "epoch": 0.7007060639771153, + "grad_norm": 0.3903234004974365, + "learning_rate": 2.1902679554367765e-05, + "loss": 1.5809, + "step": 3858 + }, + { + "epoch": 0.700887688151293, + "grad_norm": 0.36547115445137024, + "learning_rate": 2.1878224354432337e-05, + "loss": 1.5965, + "step": 3859 + }, + { + "epoch": 0.7010693123254705, + "grad_norm": 0.36237454414367676, + "learning_rate": 2.185377899067399e-05, + "loss": 1.6257, + "step": 3860 + }, + { + "epoch": 0.7012509364996481, + "grad_norm": 0.4105582535266876, + "learning_rate": 2.1829343471642994e-05, + "loss": 1.7666, + "step": 3861 + }, + { + "epoch": 0.7014325606738256, + "grad_norm": 0.3480347990989685, + "learning_rate": 2.1804917805886216e-05, + "loss": 1.4528, + "step": 3862 + }, + { + "epoch": 0.7016141848480033, + "grad_norm": 0.40663161873817444, + "learning_rate": 2.178050200194702e-05, + "loss": 1.6609, + "step": 3863 + }, + { + "epoch": 0.7017958090221809, + "grad_norm": 0.3302551805973053, + "learning_rate": 2.1756096068365316e-05, + "loss": 1.7953, + "step": 3864 + }, + { + "epoch": 0.7019774331963584, + "grad_norm": 0.462695449590683, + "learning_rate": 2.1731700013677624e-05, + "loss": 1.9899, + "step": 3865 + }, + { + "epoch": 0.702159057370536, + "grad_norm": 0.3502143621444702, + "learning_rate": 2.1707313846416982e-05, + "loss": 1.8173, + "step": 3866 + }, + { + "epoch": 0.7023406815447136, + "grad_norm": 0.4024147689342499, + "learning_rate": 2.1682937575112938e-05, + "loss": 1.7573, + "step": 3867 + }, + { + "epoch": 0.7025223057188912, + "grad_norm": 0.5103248357772827, + "learning_rate": 2.1658571208291606e-05, + "loss": 1.8046, + "step": 3868 + }, + { + "epoch": 0.7027039298930687, + "grad_norm": 0.35663479566574097, + "learning_rate": 2.163421475447563e-05, + "loss": 1.8286, + "step": 3869 + }, + { + "epoch": 0.7028855540672464, + "grad_norm": 0.3213939070701599, + "learning_rate": 2.1609868222184166e-05, + "loss": 1.6335, + "step": 3870 + }, + { + "epoch": 0.703067178241424, + "grad_norm": 0.3262680768966675, + "learning_rate": 2.158553161993294e-05, + "loss": 1.7351, + "step": 3871 + }, + { + "epoch": 0.7032488024156015, + "grad_norm": 0.3402324616909027, + "learning_rate": 2.1561204956234214e-05, + "loss": 1.7357, + "step": 3872 + }, + { + "epoch": 0.7034304265897791, + "grad_norm": 0.557905375957489, + "learning_rate": 2.1536888239596714e-05, + "loss": 1.7928, + "step": 3873 + }, + { + "epoch": 0.7036120507639567, + "grad_norm": 1.4567310810089111, + "learning_rate": 2.1512581478525706e-05, + "loss": 1.8293, + "step": 3874 + }, + { + "epoch": 0.7037936749381343, + "grad_norm": 0.5219042301177979, + "learning_rate": 2.148828468152302e-05, + "loss": 1.5974, + "step": 3875 + }, + { + "epoch": 0.7039752991123118, + "grad_norm": 0.38405415415763855, + "learning_rate": 2.1463997857086932e-05, + "loss": 1.6587, + "step": 3876 + }, + { + "epoch": 0.7041569232864894, + "grad_norm": 0.4648084342479706, + "learning_rate": 2.1439721013712306e-05, + "loss": 1.8136, + "step": 3877 + }, + { + "epoch": 0.704338547460667, + "grad_norm": 0.3524421453475952, + "learning_rate": 2.141545415989043e-05, + "loss": 1.6526, + "step": 3878 + }, + { + "epoch": 0.7045201716348446, + "grad_norm": 0.516213595867157, + "learning_rate": 2.139119730410918e-05, + "loss": 1.6633, + "step": 3879 + }, + { + "epoch": 0.7047017958090221, + "grad_norm": 0.37498700618743896, + "learning_rate": 2.1366950454852885e-05, + "loss": 1.7019, + "step": 3880 + }, + { + "epoch": 0.7048834199831998, + "grad_norm": 0.3541460335254669, + "learning_rate": 2.134271362060238e-05, + "loss": 1.8154, + "step": 3881 + }, + { + "epoch": 0.7050650441573774, + "grad_norm": 0.29681992530822754, + "learning_rate": 2.131848680983498e-05, + "loss": 1.8262, + "step": 3882 + }, + { + "epoch": 0.7052466683315549, + "grad_norm": 0.9753704071044922, + "learning_rate": 2.129427003102455e-05, + "loss": 1.7149, + "step": 3883 + }, + { + "epoch": 0.7054282925057325, + "grad_norm": 0.5128195881843567, + "learning_rate": 2.1270063292641417e-05, + "loss": 1.6539, + "step": 3884 + }, + { + "epoch": 0.7056099166799101, + "grad_norm": 0.377507746219635, + "learning_rate": 2.124586660315238e-05, + "loss": 1.5946, + "step": 3885 + }, + { + "epoch": 0.7057915408540877, + "grad_norm": 0.42025288939476013, + "learning_rate": 2.1221679971020735e-05, + "loss": 1.6447, + "step": 3886 + }, + { + "epoch": 0.7059731650282652, + "grad_norm": 0.6911914944648743, + "learning_rate": 2.1197503404706243e-05, + "loss": 1.8275, + "step": 3887 + }, + { + "epoch": 0.7061547892024428, + "grad_norm": 0.39656466245651245, + "learning_rate": 2.117333691266519e-05, + "loss": 1.7684, + "step": 3888 + }, + { + "epoch": 0.7063364133766205, + "grad_norm": 0.45368918776512146, + "learning_rate": 2.114918050335029e-05, + "loss": 1.8586, + "step": 3889 + }, + { + "epoch": 0.706518037550798, + "grad_norm": 0.47124505043029785, + "learning_rate": 2.112503418521078e-05, + "loss": 1.8808, + "step": 3890 + }, + { + "epoch": 0.7066996617249756, + "grad_norm": 0.4053756296634674, + "learning_rate": 2.1100897966692297e-05, + "loss": 1.5419, + "step": 3891 + }, + { + "epoch": 0.7068812858991532, + "grad_norm": 0.5621311664581299, + "learning_rate": 2.107677185623702e-05, + "loss": 1.5633, + "step": 3892 + }, + { + "epoch": 0.7070629100733308, + "grad_norm": 1.1969174146652222, + "learning_rate": 2.1052655862283548e-05, + "loss": 1.8213, + "step": 3893 + }, + { + "epoch": 0.7072445342475083, + "grad_norm": 0.47229042649269104, + "learning_rate": 2.1028549993266956e-05, + "loss": 1.7048, + "step": 3894 + }, + { + "epoch": 0.7074261584216859, + "grad_norm": 0.7403913736343384, + "learning_rate": 2.100445425761875e-05, + "loss": 1.8565, + "step": 3895 + }, + { + "epoch": 0.7076077825958635, + "grad_norm": 0.3527761399745941, + "learning_rate": 2.098036866376693e-05, + "loss": 1.4443, + "step": 3896 + }, + { + "epoch": 0.7077894067700411, + "grad_norm": 0.785034716129303, + "learning_rate": 2.0956293220135957e-05, + "loss": 1.7838, + "step": 3897 + }, + { + "epoch": 0.7079710309442186, + "grad_norm": 0.40770792961120605, + "learning_rate": 2.0932227935146708e-05, + "loss": 1.5923, + "step": 3898 + }, + { + "epoch": 0.7081526551183963, + "grad_norm": 0.3775874972343445, + "learning_rate": 2.0908172817216504e-05, + "loss": 1.805, + "step": 3899 + }, + { + "epoch": 0.7083342792925739, + "grad_norm": 0.7825526595115662, + "learning_rate": 2.088412787475912e-05, + "loss": 1.7858, + "step": 3900 + }, + { + "epoch": 0.7085159034667514, + "grad_norm": 0.6640651822090149, + "learning_rate": 2.0860093116184797e-05, + "loss": 1.6786, + "step": 3901 + }, + { + "epoch": 0.708697527640929, + "grad_norm": 0.8286015391349792, + "learning_rate": 2.0836068549900167e-05, + "loss": 1.9654, + "step": 3902 + }, + { + "epoch": 0.7088791518151066, + "grad_norm": 0.4485858082771301, + "learning_rate": 2.0812054184308356e-05, + "loss": 1.8735, + "step": 3903 + }, + { + "epoch": 0.7090607759892842, + "grad_norm": 0.47265082597732544, + "learning_rate": 2.0788050027808852e-05, + "loss": 1.5906, + "step": 3904 + }, + { + "epoch": 0.7092424001634617, + "grad_norm": 0.4206588566303253, + "learning_rate": 2.0764056088797645e-05, + "loss": 1.6135, + "step": 3905 + }, + { + "epoch": 0.7094240243376393, + "grad_norm": 0.3237096667289734, + "learning_rate": 2.0740072375667103e-05, + "loss": 1.6179, + "step": 3906 + }, + { + "epoch": 0.709605648511817, + "grad_norm": 1.3852214813232422, + "learning_rate": 2.0716098896806003e-05, + "loss": 1.644, + "step": 3907 + }, + { + "epoch": 0.7097872726859945, + "grad_norm": 0.3905258774757385, + "learning_rate": 2.069213566059961e-05, + "loss": 1.5796, + "step": 3908 + }, + { + "epoch": 0.7099688968601721, + "grad_norm": 0.42961981892585754, + "learning_rate": 2.0668182675429528e-05, + "loss": 1.7332, + "step": 3909 + }, + { + "epoch": 0.7101505210343497, + "grad_norm": 0.8276214599609375, + "learning_rate": 2.0644239949673843e-05, + "loss": 1.7369, + "step": 3910 + }, + { + "epoch": 0.7103321452085273, + "grad_norm": 0.34182009100914, + "learning_rate": 2.0620307491707012e-05, + "loss": 1.7488, + "step": 3911 + }, + { + "epoch": 0.7105137693827048, + "grad_norm": 0.33718442916870117, + "learning_rate": 2.0596385309899906e-05, + "loss": 1.715, + "step": 3912 + }, + { + "epoch": 0.7106953935568824, + "grad_norm": 0.29390469193458557, + "learning_rate": 2.0572473412619797e-05, + "loss": 1.669, + "step": 3913 + }, + { + "epoch": 0.71087701773106, + "grad_norm": 0.4331960082054138, + "learning_rate": 2.0548571808230384e-05, + "loss": 1.6957, + "step": 3914 + }, + { + "epoch": 0.7110586419052376, + "grad_norm": 0.3848959505558014, + "learning_rate": 2.052468050509176e-05, + "loss": 1.5997, + "step": 3915 + }, + { + "epoch": 0.7112402660794152, + "grad_norm": 0.49063247442245483, + "learning_rate": 2.050079951156039e-05, + "loss": 1.8536, + "step": 3916 + }, + { + "epoch": 0.7114218902535927, + "grad_norm": 0.3565727472305298, + "learning_rate": 2.0476928835989166e-05, + "loss": 1.7898, + "step": 3917 + }, + { + "epoch": 0.7116035144277704, + "grad_norm": 0.4938317537307739, + "learning_rate": 2.045306848672732e-05, + "loss": 1.6969, + "step": 3918 + }, + { + "epoch": 0.7117851386019479, + "grad_norm": 1.0890432596206665, + "learning_rate": 2.042921847212055e-05, + "loss": 1.9689, + "step": 3919 + }, + { + "epoch": 0.7119667627761255, + "grad_norm": 0.42845484614372253, + "learning_rate": 2.0405378800510848e-05, + "loss": 1.8189, + "step": 3920 + }, + { + "epoch": 0.7121483869503031, + "grad_norm": 0.3759959936141968, + "learning_rate": 2.0381549480236685e-05, + "loss": 1.6842, + "step": 3921 + }, + { + "epoch": 0.7123300111244807, + "grad_norm": 0.4006047546863556, + "learning_rate": 2.035773051963282e-05, + "loss": 1.5154, + "step": 3922 + }, + { + "epoch": 0.7125116352986582, + "grad_norm": 0.422373503446579, + "learning_rate": 2.0333921927030475e-05, + "loss": 1.7149, + "step": 3923 + }, + { + "epoch": 0.7126932594728358, + "grad_norm": 0.5878560543060303, + "learning_rate": 2.0310123710757167e-05, + "loss": 1.7106, + "step": 3924 + }, + { + "epoch": 0.7128748836470135, + "grad_norm": 0.34539610147476196, + "learning_rate": 2.0286335879136836e-05, + "loss": 1.8063, + "step": 3925 + }, + { + "epoch": 0.713056507821191, + "grad_norm": 0.3223040699958801, + "learning_rate": 2.0262558440489743e-05, + "loss": 1.8951, + "step": 3926 + }, + { + "epoch": 0.7132381319953686, + "grad_norm": 0.4847765862941742, + "learning_rate": 2.0238791403132567e-05, + "loss": 1.7029, + "step": 3927 + }, + { + "epoch": 0.7134197561695461, + "grad_norm": 0.7378479838371277, + "learning_rate": 2.0215034775378332e-05, + "loss": 1.6852, + "step": 3928 + }, + { + "epoch": 0.7136013803437238, + "grad_norm": 0.9753962159156799, + "learning_rate": 2.019128856553641e-05, + "loss": 1.5605, + "step": 3929 + }, + { + "epoch": 0.7137830045179013, + "grad_norm": 0.32335227727890015, + "learning_rate": 2.0167552781912524e-05, + "loss": 1.7153, + "step": 3930 + }, + { + "epoch": 0.7139646286920789, + "grad_norm": 0.3533690869808197, + "learning_rate": 2.0143827432808743e-05, + "loss": 1.7101, + "step": 3931 + }, + { + "epoch": 0.7141462528662565, + "grad_norm": 0.4341740608215332, + "learning_rate": 2.0120112526523517e-05, + "loss": 1.6367, + "step": 3932 + }, + { + "epoch": 0.7143278770404341, + "grad_norm": 0.5084637999534607, + "learning_rate": 2.009640807135165e-05, + "loss": 1.8908, + "step": 3933 + }, + { + "epoch": 0.7145095012146117, + "grad_norm": 0.3869413733482361, + "learning_rate": 2.0072714075584253e-05, + "loss": 1.6995, + "step": 3934 + }, + { + "epoch": 0.7146911253887892, + "grad_norm": 0.44110745191574097, + "learning_rate": 2.0049030547508774e-05, + "loss": 1.6543, + "step": 3935 + }, + { + "epoch": 0.7148727495629669, + "grad_norm": 0.40112459659576416, + "learning_rate": 2.0025357495409058e-05, + "loss": 1.765, + "step": 3936 + }, + { + "epoch": 0.7150543737371444, + "grad_norm": 0.4374082088470459, + "learning_rate": 2.000169492756523e-05, + "loss": 1.8209, + "step": 3937 + }, + { + "epoch": 0.715235997911322, + "grad_norm": 0.4214734435081482, + "learning_rate": 1.997804285225375e-05, + "loss": 1.8357, + "step": 3938 + }, + { + "epoch": 0.7154176220854995, + "grad_norm": 0.6786261200904846, + "learning_rate": 1.9954401277747432e-05, + "loss": 1.7469, + "step": 3939 + }, + { + "epoch": 0.7155992462596772, + "grad_norm": 0.6167196035385132, + "learning_rate": 1.9930770212315442e-05, + "loss": 1.8445, + "step": 3940 + }, + { + "epoch": 0.7157808704338547, + "grad_norm": 0.38104137778282166, + "learning_rate": 1.990714966422321e-05, + "loss": 1.6891, + "step": 3941 + }, + { + "epoch": 0.7159624946080323, + "grad_norm": 0.4175965189933777, + "learning_rate": 1.9883539641732517e-05, + "loss": 1.6191, + "step": 3942 + }, + { + "epoch": 0.71614411878221, + "grad_norm": 0.4070028066635132, + "learning_rate": 1.9859940153101465e-05, + "loss": 1.6975, + "step": 3943 + }, + { + "epoch": 0.7163257429563875, + "grad_norm": 0.3498595058917999, + "learning_rate": 1.9836351206584442e-05, + "loss": 1.6092, + "step": 3944 + }, + { + "epoch": 0.7165073671305651, + "grad_norm": 0.666684627532959, + "learning_rate": 1.9812772810432194e-05, + "loss": 2.003, + "step": 3945 + }, + { + "epoch": 0.7166889913047426, + "grad_norm": 0.41960519552230835, + "learning_rate": 1.978920497289178e-05, + "loss": 1.9697, + "step": 3946 + }, + { + "epoch": 0.7168706154789203, + "grad_norm": 0.38588088750839233, + "learning_rate": 1.976564770220652e-05, + "loss": 1.4813, + "step": 3947 + }, + { + "epoch": 0.7170522396530978, + "grad_norm": 0.36951377987861633, + "learning_rate": 1.9742101006616037e-05, + "loss": 1.5891, + "step": 3948 + }, + { + "epoch": 0.7172338638272754, + "grad_norm": 0.39202842116355896, + "learning_rate": 1.971856489435632e-05, + "loss": 1.7498, + "step": 3949 + }, + { + "epoch": 0.7174154880014529, + "grad_norm": 0.42060762643814087, + "learning_rate": 1.9695039373659596e-05, + "loss": 1.7737, + "step": 3950 + }, + { + "epoch": 0.7175971121756306, + "grad_norm": 0.38676345348358154, + "learning_rate": 1.9671524452754393e-05, + "loss": 1.8284, + "step": 3951 + }, + { + "epoch": 0.7177787363498082, + "grad_norm": 0.3023572862148285, + "learning_rate": 1.9648020139865574e-05, + "loss": 1.7556, + "step": 3952 + }, + { + "epoch": 0.7179603605239857, + "grad_norm": 0.542769730091095, + "learning_rate": 1.9624526443214224e-05, + "loss": 1.6858, + "step": 3953 + }, + { + "epoch": 0.7181419846981634, + "grad_norm": 0.4516908824443817, + "learning_rate": 1.9601043371017804e-05, + "loss": 1.7344, + "step": 3954 + }, + { + "epoch": 0.7183236088723409, + "grad_norm": 0.3573784828186035, + "learning_rate": 1.957757093148997e-05, + "loss": 1.6293, + "step": 3955 + }, + { + "epoch": 0.7185052330465185, + "grad_norm": 1.7843668460845947, + "learning_rate": 1.955410913284071e-05, + "loss": 1.8209, + "step": 3956 + }, + { + "epoch": 0.718686857220696, + "grad_norm": 0.3990734815597534, + "learning_rate": 1.953065798327625e-05, + "loss": 1.882, + "step": 3957 + }, + { + "epoch": 0.7188684813948737, + "grad_norm": 0.5187407732009888, + "learning_rate": 1.9507217490999146e-05, + "loss": 1.6486, + "step": 3958 + }, + { + "epoch": 0.7190501055690512, + "grad_norm": 0.4391959011554718, + "learning_rate": 1.948378766420821e-05, + "loss": 1.7744, + "step": 3959 + }, + { + "epoch": 0.7192317297432288, + "grad_norm": 0.377516508102417, + "learning_rate": 1.9460368511098498e-05, + "loss": 1.765, + "step": 3960 + }, + { + "epoch": 0.7194133539174065, + "grad_norm": 0.45972439646720886, + "learning_rate": 1.9436960039861324e-05, + "loss": 1.7457, + "step": 3961 + }, + { + "epoch": 0.719594978091584, + "grad_norm": 0.47720760107040405, + "learning_rate": 1.9413562258684332e-05, + "loss": 1.7402, + "step": 3962 + }, + { + "epoch": 0.7197766022657616, + "grad_norm": 0.36355283856391907, + "learning_rate": 1.939017517575134e-05, + "loss": 1.8009, + "step": 3963 + }, + { + "epoch": 0.7199582264399391, + "grad_norm": 0.3478757441043854, + "learning_rate": 1.9366798799242508e-05, + "loss": 1.6899, + "step": 3964 + }, + { + "epoch": 0.7201398506141168, + "grad_norm": 0.35619980096817017, + "learning_rate": 1.9343433137334194e-05, + "loss": 1.686, + "step": 3965 + }, + { + "epoch": 0.7203214747882943, + "grad_norm": 0.36106422543525696, + "learning_rate": 1.9320078198199003e-05, + "loss": 1.7619, + "step": 3966 + }, + { + "epoch": 0.7205030989624719, + "grad_norm": 0.657001793384552, + "learning_rate": 1.929673399000585e-05, + "loss": 1.8045, + "step": 3967 + }, + { + "epoch": 0.7206847231366494, + "grad_norm": 0.33529841899871826, + "learning_rate": 1.927340052091984e-05, + "loss": 1.5886, + "step": 3968 + }, + { + "epoch": 0.7208663473108271, + "grad_norm": 0.36188751459121704, + "learning_rate": 1.9250077799102322e-05, + "loss": 1.6718, + "step": 3969 + }, + { + "epoch": 0.7210479714850047, + "grad_norm": 0.3667532205581665, + "learning_rate": 1.9226765832710926e-05, + "loss": 1.6989, + "step": 3970 + }, + { + "epoch": 0.7212295956591822, + "grad_norm": 0.4154311418533325, + "learning_rate": 1.9203464629899502e-05, + "loss": 1.5948, + "step": 3971 + }, + { + "epoch": 0.7214112198333599, + "grad_norm": 0.6346042156219482, + "learning_rate": 1.9180174198818133e-05, + "loss": 1.9388, + "step": 3972 + }, + { + "epoch": 0.7215928440075374, + "grad_norm": 0.36216065287590027, + "learning_rate": 1.915689454761312e-05, + "loss": 1.9003, + "step": 3973 + }, + { + "epoch": 0.721774468181715, + "grad_norm": 0.3416580259799957, + "learning_rate": 1.9133625684426993e-05, + "loss": 1.6592, + "step": 3974 + }, + { + "epoch": 0.7219560923558925, + "grad_norm": 1.247453212738037, + "learning_rate": 1.911036761739855e-05, + "loss": 1.8284, + "step": 3975 + }, + { + "epoch": 0.7221377165300702, + "grad_norm": 0.5800723433494568, + "learning_rate": 1.908712035466276e-05, + "loss": 1.5038, + "step": 3976 + }, + { + "epoch": 0.7223193407042477, + "grad_norm": 0.8043866157531738, + "learning_rate": 1.906388390435087e-05, + "loss": 1.6338, + "step": 3977 + }, + { + "epoch": 0.7225009648784253, + "grad_norm": 0.4391559064388275, + "learning_rate": 1.904065827459029e-05, + "loss": 1.8298, + "step": 3978 + }, + { + "epoch": 0.7226825890526029, + "grad_norm": 0.41768714785575867, + "learning_rate": 1.9017443473504654e-05, + "loss": 1.6661, + "step": 3979 + }, + { + "epoch": 0.7228642132267805, + "grad_norm": 0.3770376443862915, + "learning_rate": 1.8994239509213858e-05, + "loss": 1.7566, + "step": 3980 + }, + { + "epoch": 0.7230458374009581, + "grad_norm": 0.45210275053977966, + "learning_rate": 1.8971046389833952e-05, + "loss": 1.7151, + "step": 3981 + }, + { + "epoch": 0.7232274615751356, + "grad_norm": 0.36073940992355347, + "learning_rate": 1.8947864123477194e-05, + "loss": 1.6906, + "step": 3982 + }, + { + "epoch": 0.7234090857493133, + "grad_norm": 0.45705604553222656, + "learning_rate": 1.892469271825209e-05, + "loss": 1.5285, + "step": 3983 + }, + { + "epoch": 0.7235907099234908, + "grad_norm": 0.3256854712963104, + "learning_rate": 1.8901532182263333e-05, + "loss": 1.9405, + "step": 3984 + }, + { + "epoch": 0.7237723340976684, + "grad_norm": 0.33251988887786865, + "learning_rate": 1.8878382523611786e-05, + "loss": 1.7889, + "step": 3985 + }, + { + "epoch": 0.7239539582718459, + "grad_norm": 0.34114742279052734, + "learning_rate": 1.885524375039453e-05, + "loss": 1.6563, + "step": 3986 + }, + { + "epoch": 0.7241355824460236, + "grad_norm": 0.596284806728363, + "learning_rate": 1.8832115870704807e-05, + "loss": 1.7436, + "step": 3987 + }, + { + "epoch": 0.7243172066202012, + "grad_norm": 0.35528838634490967, + "learning_rate": 1.880899889263212e-05, + "loss": 1.5693, + "step": 3988 + }, + { + "epoch": 0.7244988307943787, + "grad_norm": 0.6344895362854004, + "learning_rate": 1.878589282426207e-05, + "loss": 1.7543, + "step": 3989 + }, + { + "epoch": 0.7246804549685563, + "grad_norm": 0.35488390922546387, + "learning_rate": 1.8762797673676526e-05, + "loss": 1.7009, + "step": 3990 + }, + { + "epoch": 0.7248620791427339, + "grad_norm": 0.379031240940094, + "learning_rate": 1.873971344895347e-05, + "loss": 1.828, + "step": 3991 + }, + { + "epoch": 0.7250437033169115, + "grad_norm": 0.33757132291793823, + "learning_rate": 1.871664015816709e-05, + "loss": 1.8013, + "step": 3992 + }, + { + "epoch": 0.725225327491089, + "grad_norm": 0.4579141139984131, + "learning_rate": 1.869357780938778e-05, + "loss": 1.6888, + "step": 3993 + }, + { + "epoch": 0.7254069516652667, + "grad_norm": 0.3263653516769409, + "learning_rate": 1.8670526410682032e-05, + "loss": 1.6562, + "step": 3994 + }, + { + "epoch": 0.7255885758394442, + "grad_norm": 0.37161728739738464, + "learning_rate": 1.86474859701126e-05, + "loss": 1.7556, + "step": 3995 + }, + { + "epoch": 0.7257702000136218, + "grad_norm": 0.40120553970336914, + "learning_rate": 1.862445649573832e-05, + "loss": 1.6214, + "step": 3996 + }, + { + "epoch": 0.7259518241877994, + "grad_norm": 0.417508989572525, + "learning_rate": 1.8601437995614262e-05, + "loss": 1.8197, + "step": 3997 + }, + { + "epoch": 0.726133448361977, + "grad_norm": 0.4185163676738739, + "learning_rate": 1.8578430477791614e-05, + "loss": 1.7321, + "step": 3998 + }, + { + "epoch": 0.7263150725361546, + "grad_norm": 0.8456979990005493, + "learning_rate": 1.8555433950317725e-05, + "loss": 1.6852, + "step": 3999 + }, + { + "epoch": 0.7264966967103321, + "grad_norm": 0.9052025079727173, + "learning_rate": 1.8532448421236105e-05, + "loss": 1.7936, + "step": 4000 + }, + { + "epoch": 0.7266783208845097, + "grad_norm": 0.4781523048877716, + "learning_rate": 1.850947389858643e-05, + "loss": 1.7868, + "step": 4001 + }, + { + "epoch": 0.7268599450586873, + "grad_norm": 0.3725859224796295, + "learning_rate": 1.8486510390404544e-05, + "loss": 1.7168, + "step": 4002 + }, + { + "epoch": 0.7270415692328649, + "grad_norm": 0.6500416398048401, + "learning_rate": 1.8463557904722382e-05, + "loss": 1.7434, + "step": 4003 + }, + { + "epoch": 0.7272231934070424, + "grad_norm": 0.5128167867660522, + "learning_rate": 1.844061644956807e-05, + "loss": 1.7021, + "step": 4004 + }, + { + "epoch": 0.7274048175812201, + "grad_norm": 0.49076318740844727, + "learning_rate": 1.841768603296583e-05, + "loss": 1.8712, + "step": 4005 + }, + { + "epoch": 0.7275864417553977, + "grad_norm": 0.4494345188140869, + "learning_rate": 1.83947666629361e-05, + "loss": 1.8017, + "step": 4006 + }, + { + "epoch": 0.7277680659295752, + "grad_norm": 0.3524850606918335, + "learning_rate": 1.837185834749536e-05, + "loss": 1.8606, + "step": 4007 + }, + { + "epoch": 0.7279496901037528, + "grad_norm": 0.8879585862159729, + "learning_rate": 1.8348961094656308e-05, + "loss": 1.7656, + "step": 4008 + }, + { + "epoch": 0.7281313142779304, + "grad_norm": 0.433942973613739, + "learning_rate": 1.8326074912427705e-05, + "loss": 1.6256, + "step": 4009 + }, + { + "epoch": 0.728312938452108, + "grad_norm": 0.3379595875740051, + "learning_rate": 1.8303199808814507e-05, + "loss": 1.6514, + "step": 4010 + }, + { + "epoch": 0.7284945626262855, + "grad_norm": 0.3674887418746948, + "learning_rate": 1.8280335791817733e-05, + "loss": 1.6173, + "step": 4011 + }, + { + "epoch": 0.7286761868004631, + "grad_norm": 0.3227197825908661, + "learning_rate": 1.8257482869434556e-05, + "loss": 1.7735, + "step": 4012 + }, + { + "epoch": 0.7288578109746408, + "grad_norm": 0.4534490406513214, + "learning_rate": 1.823464104965824e-05, + "loss": 1.7969, + "step": 4013 + }, + { + "epoch": 0.7290394351488183, + "grad_norm": 0.33658063411712646, + "learning_rate": 1.8211810340478207e-05, + "loss": 1.6389, + "step": 4014 + }, + { + "epoch": 0.7292210593229959, + "grad_norm": 0.45910942554473877, + "learning_rate": 1.818899074987999e-05, + "loss": 1.7018, + "step": 4015 + }, + { + "epoch": 0.7294026834971735, + "grad_norm": 0.3372138738632202, + "learning_rate": 1.8166182285845207e-05, + "loss": 1.8285, + "step": 4016 + }, + { + "epoch": 0.7295843076713511, + "grad_norm": 1.3891503810882568, + "learning_rate": 1.8143384956351578e-05, + "loss": 1.6431, + "step": 4017 + }, + { + "epoch": 0.7297659318455286, + "grad_norm": 0.47550955414772034, + "learning_rate": 1.8120598769372937e-05, + "loss": 1.8795, + "step": 4018 + }, + { + "epoch": 0.7299475560197062, + "grad_norm": 1.186001181602478, + "learning_rate": 1.8097823732879243e-05, + "loss": 1.7628, + "step": 4019 + }, + { + "epoch": 0.7301291801938838, + "grad_norm": 0.388323038816452, + "learning_rate": 1.8075059854836564e-05, + "loss": 1.7353, + "step": 4020 + }, + { + "epoch": 0.7303108043680614, + "grad_norm": 0.389908105134964, + "learning_rate": 1.805230714320701e-05, + "loss": 1.8192, + "step": 4021 + }, + { + "epoch": 0.730492428542239, + "grad_norm": 0.3813144266605377, + "learning_rate": 1.8029565605948802e-05, + "loss": 1.7229, + "step": 4022 + }, + { + "epoch": 0.7306740527164165, + "grad_norm": 0.44931066036224365, + "learning_rate": 1.8006835251016307e-05, + "loss": 1.7239, + "step": 4023 + }, + { + "epoch": 0.7308556768905942, + "grad_norm": 0.847997784614563, + "learning_rate": 1.798411608635992e-05, + "loss": 1.8178, + "step": 4024 + }, + { + "epoch": 0.7310373010647717, + "grad_norm": 0.413612961769104, + "learning_rate": 1.7961408119926132e-05, + "loss": 1.6774, + "step": 4025 + }, + { + "epoch": 0.7312189252389493, + "grad_norm": 0.35099324584007263, + "learning_rate": 1.7938711359657547e-05, + "loss": 1.6855, + "step": 4026 + }, + { + "epoch": 0.7314005494131269, + "grad_norm": 0.7201859354972839, + "learning_rate": 1.791602581349281e-05, + "loss": 1.7978, + "step": 4027 + }, + { + "epoch": 0.7315821735873045, + "grad_norm": 0.4115089178085327, + "learning_rate": 1.789335148936669e-05, + "loss": 1.6706, + "step": 4028 + }, + { + "epoch": 0.731763797761482, + "grad_norm": 0.29868656396865845, + "learning_rate": 1.7870688395209983e-05, + "loss": 1.7339, + "step": 4029 + }, + { + "epoch": 0.7319454219356596, + "grad_norm": 0.41203486919403076, + "learning_rate": 1.7848036538949593e-05, + "loss": 1.6668, + "step": 4030 + }, + { + "epoch": 0.7321270461098373, + "grad_norm": 0.3769657015800476, + "learning_rate": 1.7825395928508447e-05, + "loss": 1.6477, + "step": 4031 + }, + { + "epoch": 0.7323086702840148, + "grad_norm": 0.4847228229045868, + "learning_rate": 1.7802766571805602e-05, + "loss": 1.7809, + "step": 4032 + }, + { + "epoch": 0.7324902944581924, + "grad_norm": 0.37708672881126404, + "learning_rate": 1.7780148476756147e-05, + "loss": 1.6393, + "step": 4033 + }, + { + "epoch": 0.73267191863237, + "grad_norm": 0.45900559425354004, + "learning_rate": 1.7757541651271232e-05, + "loss": 1.5648, + "step": 4034 + }, + { + "epoch": 0.7328535428065476, + "grad_norm": 0.3198724687099457, + "learning_rate": 1.7734946103258047e-05, + "loss": 1.7067, + "step": 4035 + }, + { + "epoch": 0.7330351669807251, + "grad_norm": 0.6120316982269287, + "learning_rate": 1.7712361840619858e-05, + "loss": 1.8604, + "step": 4036 + }, + { + "epoch": 0.7332167911549027, + "grad_norm": 0.41048115491867065, + "learning_rate": 1.7689788871256e-05, + "loss": 1.6233, + "step": 4037 + }, + { + "epoch": 0.7333984153290803, + "grad_norm": 0.6390156149864197, + "learning_rate": 1.766722720306182e-05, + "loss": 1.6116, + "step": 4038 + }, + { + "epoch": 0.7335800395032579, + "grad_norm": 0.41870975494384766, + "learning_rate": 1.7644676843928753e-05, + "loss": 1.6578, + "step": 4039 + }, + { + "epoch": 0.7337616636774354, + "grad_norm": 0.44182220101356506, + "learning_rate": 1.7622137801744233e-05, + "loss": 1.7468, + "step": 4040 + }, + { + "epoch": 0.733943287851613, + "grad_norm": 0.40923696756362915, + "learning_rate": 1.7599610084391784e-05, + "loss": 1.7362, + "step": 4041 + }, + { + "epoch": 0.7341249120257907, + "grad_norm": 0.3672255277633667, + "learning_rate": 1.757709369975093e-05, + "loss": 1.8991, + "step": 4042 + }, + { + "epoch": 0.7343065361999682, + "grad_norm": 0.37017983198165894, + "learning_rate": 1.7554588655697248e-05, + "loss": 1.6529, + "step": 4043 + }, + { + "epoch": 0.7344881603741458, + "grad_norm": 0.4056765139102936, + "learning_rate": 1.753209496010233e-05, + "loss": 1.8377, + "step": 4044 + }, + { + "epoch": 0.7346697845483234, + "grad_norm": 0.4102430045604706, + "learning_rate": 1.750961262083383e-05, + "loss": 1.6701, + "step": 4045 + }, + { + "epoch": 0.734851408722501, + "grad_norm": 0.3856065273284912, + "learning_rate": 1.7487141645755435e-05, + "loss": 1.6688, + "step": 4046 + }, + { + "epoch": 0.7350330328966785, + "grad_norm": 0.48595964908599854, + "learning_rate": 1.7464682042726815e-05, + "loss": 1.8452, + "step": 4047 + }, + { + "epoch": 0.7352146570708561, + "grad_norm": 0.6532465219497681, + "learning_rate": 1.7442233819603687e-05, + "loss": 1.7688, + "step": 4048 + }, + { + "epoch": 0.7353962812450338, + "grad_norm": 0.4151637852191925, + "learning_rate": 1.7419796984237768e-05, + "loss": 1.5654, + "step": 4049 + }, + { + "epoch": 0.7355779054192113, + "grad_norm": 0.6366296410560608, + "learning_rate": 1.7397371544476825e-05, + "loss": 1.8147, + "step": 4050 + }, + { + "epoch": 0.7357595295933889, + "grad_norm": 0.36441928148269653, + "learning_rate": 1.737495750816464e-05, + "loss": 1.6101, + "step": 4051 + }, + { + "epoch": 0.7359411537675664, + "grad_norm": 0.37510207295417786, + "learning_rate": 1.7352554883140977e-05, + "loss": 1.8342, + "step": 4052 + }, + { + "epoch": 0.7361227779417441, + "grad_norm": 0.5651563405990601, + "learning_rate": 1.733016367724159e-05, + "loss": 1.7438, + "step": 4053 + }, + { + "epoch": 0.7363044021159216, + "grad_norm": 0.4439482092857361, + "learning_rate": 1.730778389829832e-05, + "loss": 1.5688, + "step": 4054 + }, + { + "epoch": 0.7364860262900992, + "grad_norm": 0.5134232044219971, + "learning_rate": 1.7285415554138935e-05, + "loss": 1.7749, + "step": 4055 + }, + { + "epoch": 0.7366676504642768, + "grad_norm": 0.8027551174163818, + "learning_rate": 1.7263058652587216e-05, + "loss": 1.892, + "step": 4056 + }, + { + "epoch": 0.7368492746384544, + "grad_norm": 0.41066989302635193, + "learning_rate": 1.7240713201462973e-05, + "loss": 1.8085, + "step": 4057 + }, + { + "epoch": 0.737030898812632, + "grad_norm": 0.4081187844276428, + "learning_rate": 1.7218379208582002e-05, + "loss": 1.6832, + "step": 4058 + }, + { + "epoch": 0.7372125229868095, + "grad_norm": 0.35432490706443787, + "learning_rate": 1.719605668175608e-05, + "loss": 1.7247, + "step": 4059 + }, + { + "epoch": 0.7373941471609872, + "grad_norm": 0.41471248865127563, + "learning_rate": 1.7173745628792958e-05, + "loss": 1.6084, + "step": 4060 + }, + { + "epoch": 0.7375757713351647, + "grad_norm": 0.37682878971099854, + "learning_rate": 1.7151446057496406e-05, + "loss": 1.8194, + "step": 4061 + }, + { + "epoch": 0.7377573955093423, + "grad_norm": 0.6947411894798279, + "learning_rate": 1.7129157975666143e-05, + "loss": 1.6867, + "step": 4062 + }, + { + "epoch": 0.7379390196835198, + "grad_norm": 0.5173976421356201, + "learning_rate": 1.7106881391097906e-05, + "loss": 1.8626, + "step": 4063 + }, + { + "epoch": 0.7381206438576975, + "grad_norm": 0.33689457178115845, + "learning_rate": 1.7084616311583414e-05, + "loss": 1.954, + "step": 4064 + }, + { + "epoch": 0.738302268031875, + "grad_norm": 0.3583717346191406, + "learning_rate": 1.7062362744910322e-05, + "loss": 1.7539, + "step": 4065 + }, + { + "epoch": 0.7384838922060526, + "grad_norm": 0.40497785806655884, + "learning_rate": 1.7040120698862268e-05, + "loss": 1.6395, + "step": 4066 + }, + { + "epoch": 0.7386655163802303, + "grad_norm": 0.5250065326690674, + "learning_rate": 1.7017890181218892e-05, + "loss": 2.0143, + "step": 4067 + }, + { + "epoch": 0.7388471405544078, + "grad_norm": 0.29535168409347534, + "learning_rate": 1.6995671199755774e-05, + "loss": 1.7049, + "step": 4068 + }, + { + "epoch": 0.7390287647285854, + "grad_norm": 0.33136895298957825, + "learning_rate": 1.6973463762244453e-05, + "loss": 1.8051, + "step": 4069 + }, + { + "epoch": 0.7392103889027629, + "grad_norm": 0.3494846224784851, + "learning_rate": 1.695126787645245e-05, + "loss": 1.8671, + "step": 4070 + }, + { + "epoch": 0.7393920130769406, + "grad_norm": 0.38031989336013794, + "learning_rate": 1.6929083550143255e-05, + "loss": 1.6787, + "step": 4071 + }, + { + "epoch": 0.7395736372511181, + "grad_norm": 0.3923795521259308, + "learning_rate": 1.690691079107629e-05, + "loss": 1.8481, + "step": 4072 + }, + { + "epoch": 0.7397552614252957, + "grad_norm": 0.40789178013801575, + "learning_rate": 1.688474960700694e-05, + "loss": 1.7716, + "step": 4073 + }, + { + "epoch": 0.7399368855994732, + "grad_norm": 0.7172250151634216, + "learning_rate": 1.686260000568653e-05, + "loss": 1.6596, + "step": 4074 + }, + { + "epoch": 0.7401185097736509, + "grad_norm": 0.5707058906555176, + "learning_rate": 1.684046199486234e-05, + "loss": 1.6936, + "step": 4075 + }, + { + "epoch": 0.7403001339478285, + "grad_norm": 0.4323611259460449, + "learning_rate": 1.681833558227761e-05, + "loss": 1.6256, + "step": 4076 + }, + { + "epoch": 0.740481758122006, + "grad_norm": 0.4075111150741577, + "learning_rate": 1.6796220775671534e-05, + "loss": 1.8647, + "step": 4077 + }, + { + "epoch": 0.7406633822961837, + "grad_norm": 0.4243597984313965, + "learning_rate": 1.6774117582779202e-05, + "loss": 1.8315, + "step": 4078 + }, + { + "epoch": 0.7408450064703612, + "grad_norm": 0.37556517124176025, + "learning_rate": 1.675202601133166e-05, + "loss": 1.7119, + "step": 4079 + }, + { + "epoch": 0.7410266306445388, + "grad_norm": 0.3972294330596924, + "learning_rate": 1.672994606905593e-05, + "loss": 1.6568, + "step": 4080 + }, + { + "epoch": 0.7412082548187163, + "grad_norm": 0.4020039439201355, + "learning_rate": 1.670787776367489e-05, + "loss": 1.823, + "step": 4081 + }, + { + "epoch": 0.741389878992894, + "grad_norm": 0.32826822996139526, + "learning_rate": 1.668582110290742e-05, + "loss": 1.8932, + "step": 4082 + }, + { + "epoch": 0.7415715031670715, + "grad_norm": 0.3530539870262146, + "learning_rate": 1.6663776094468296e-05, + "loss": 1.6307, + "step": 4083 + }, + { + "epoch": 0.7417531273412491, + "grad_norm": 0.6230703592300415, + "learning_rate": 1.664174274606819e-05, + "loss": 1.6929, + "step": 4084 + }, + { + "epoch": 0.7419347515154266, + "grad_norm": 0.38356050848960876, + "learning_rate": 1.6619721065413763e-05, + "loss": 1.6409, + "step": 4085 + }, + { + "epoch": 0.7421163756896043, + "grad_norm": 0.4013217091560364, + "learning_rate": 1.6597711060207538e-05, + "loss": 1.5049, + "step": 4086 + }, + { + "epoch": 0.7422979998637819, + "grad_norm": 0.36131054162979126, + "learning_rate": 1.6575712738147954e-05, + "loss": 1.6503, + "step": 4087 + }, + { + "epoch": 0.7424796240379594, + "grad_norm": 0.40880751609802246, + "learning_rate": 1.65537261069294e-05, + "loss": 1.5838, + "step": 4088 + }, + { + "epoch": 0.7426612482121371, + "grad_norm": 0.3663584589958191, + "learning_rate": 1.653175117424218e-05, + "loss": 1.6321, + "step": 4089 + }, + { + "epoch": 0.7428428723863146, + "grad_norm": 0.5283758640289307, + "learning_rate": 1.650978794777247e-05, + "loss": 1.799, + "step": 4090 + }, + { + "epoch": 0.7430244965604922, + "grad_norm": 0.39176812767982483, + "learning_rate": 1.6487836435202357e-05, + "loss": 1.7538, + "step": 4091 + }, + { + "epoch": 0.7432061207346697, + "grad_norm": 0.7181249260902405, + "learning_rate": 1.6465896644209827e-05, + "loss": 1.5907, + "step": 4092 + }, + { + "epoch": 0.7433877449088474, + "grad_norm": 0.6097697615623474, + "learning_rate": 1.644396858246881e-05, + "loss": 1.6595, + "step": 4093 + }, + { + "epoch": 0.743569369083025, + "grad_norm": 0.37166592478752136, + "learning_rate": 1.6422052257649078e-05, + "loss": 1.9346, + "step": 4094 + }, + { + "epoch": 0.7437509932572025, + "grad_norm": 1.6401299238204956, + "learning_rate": 1.640014767741635e-05, + "loss": 1.9361, + "step": 4095 + }, + { + "epoch": 0.7439326174313802, + "grad_norm": 0.7039845585823059, + "learning_rate": 1.637825484943219e-05, + "loss": 1.7234, + "step": 4096 + }, + { + "epoch": 0.7441142416055577, + "grad_norm": 0.3711150288581848, + "learning_rate": 1.6356373781354058e-05, + "loss": 1.8152, + "step": 4097 + }, + { + "epoch": 0.7442958657797353, + "grad_norm": 0.5686275959014893, + "learning_rate": 1.6334504480835337e-05, + "loss": 1.7349, + "step": 4098 + }, + { + "epoch": 0.7444774899539128, + "grad_norm": 0.4365050494670868, + "learning_rate": 1.6312646955525274e-05, + "loss": 1.8066, + "step": 4099 + }, + { + "epoch": 0.7446591141280905, + "grad_norm": 0.38787028193473816, + "learning_rate": 1.6290801213068962e-05, + "loss": 1.7137, + "step": 4100 + }, + { + "epoch": 0.744840738302268, + "grad_norm": 0.44262462854385376, + "learning_rate": 1.6268967261107426e-05, + "loss": 1.7172, + "step": 4101 + }, + { + "epoch": 0.7450223624764456, + "grad_norm": 0.3364495038986206, + "learning_rate": 1.6247145107277562e-05, + "loss": 1.7197, + "step": 4102 + }, + { + "epoch": 0.7452039866506232, + "grad_norm": 0.7012154459953308, + "learning_rate": 1.622533475921211e-05, + "loss": 1.7643, + "step": 4103 + }, + { + "epoch": 0.7453856108248008, + "grad_norm": 0.48845627903938293, + "learning_rate": 1.6203536224539683e-05, + "loss": 1.6468, + "step": 4104 + }, + { + "epoch": 0.7455672349989784, + "grad_norm": 0.648082971572876, + "learning_rate": 1.6181749510884763e-05, + "loss": 1.9667, + "step": 4105 + }, + { + "epoch": 0.7457488591731559, + "grad_norm": 0.4289121627807617, + "learning_rate": 1.615997462586773e-05, + "loss": 1.7607, + "step": 4106 + }, + { + "epoch": 0.7459304833473336, + "grad_norm": 0.7917602062225342, + "learning_rate": 1.6138211577104812e-05, + "loss": 1.6842, + "step": 4107 + }, + { + "epoch": 0.7461121075215111, + "grad_norm": 0.8497735857963562, + "learning_rate": 1.611646037220807e-05, + "loss": 1.8004, + "step": 4108 + }, + { + "epoch": 0.7462937316956887, + "grad_norm": 0.6006237268447876, + "learning_rate": 1.6094721018785454e-05, + "loss": 1.7804, + "step": 4109 + }, + { + "epoch": 0.7464753558698662, + "grad_norm": 0.3799632489681244, + "learning_rate": 1.607299352444072e-05, + "loss": 1.7597, + "step": 4110 + }, + { + "epoch": 0.7466569800440439, + "grad_norm": 0.5181890726089478, + "learning_rate": 1.6051277896773565e-05, + "loss": 1.8095, + "step": 4111 + }, + { + "epoch": 0.7468386042182215, + "grad_norm": 0.3357810080051422, + "learning_rate": 1.6029574143379437e-05, + "loss": 1.6779, + "step": 4112 + }, + { + "epoch": 0.747020228392399, + "grad_norm": 0.3890918791294098, + "learning_rate": 1.6007882271849716e-05, + "loss": 1.6702, + "step": 4113 + }, + { + "epoch": 0.7472018525665766, + "grad_norm": 1.0919835567474365, + "learning_rate": 1.5986202289771545e-05, + "loss": 1.7756, + "step": 4114 + }, + { + "epoch": 0.7473834767407542, + "grad_norm": 0.31073814630508423, + "learning_rate": 1.5964534204727995e-05, + "loss": 1.6875, + "step": 4115 + }, + { + "epoch": 0.7475651009149318, + "grad_norm": 0.34213170409202576, + "learning_rate": 1.5942878024297898e-05, + "loss": 1.8739, + "step": 4116 + }, + { + "epoch": 0.7477467250891093, + "grad_norm": 0.6594771146774292, + "learning_rate": 1.5921233756055964e-05, + "loss": 1.8602, + "step": 4117 + }, + { + "epoch": 0.747928349263287, + "grad_norm": 0.38413774967193604, + "learning_rate": 1.5899601407572707e-05, + "loss": 1.6121, + "step": 4118 + }, + { + "epoch": 0.7481099734374645, + "grad_norm": 0.45495328307151794, + "learning_rate": 1.5877980986414514e-05, + "loss": 1.7204, + "step": 4119 + }, + { + "epoch": 0.7482915976116421, + "grad_norm": 0.7501469254493713, + "learning_rate": 1.585637250014359e-05, + "loss": 1.721, + "step": 4120 + }, + { + "epoch": 0.7484732217858197, + "grad_norm": 0.40018123388290405, + "learning_rate": 1.583477595631794e-05, + "loss": 1.6597, + "step": 4121 + }, + { + "epoch": 0.7486548459599973, + "grad_norm": 1.172298550605774, + "learning_rate": 1.581319136249139e-05, + "loss": 1.8315, + "step": 4122 + }, + { + "epoch": 0.7488364701341749, + "grad_norm": 0.6361446976661682, + "learning_rate": 1.579161872621361e-05, + "loss": 1.8173, + "step": 4123 + }, + { + "epoch": 0.7490180943083524, + "grad_norm": 0.3031150996685028, + "learning_rate": 1.5770058055030096e-05, + "loss": 1.8149, + "step": 4124 + }, + { + "epoch": 0.74919971848253, + "grad_norm": 0.392094224691391, + "learning_rate": 1.574850935648211e-05, + "loss": 1.6834, + "step": 4125 + }, + { + "epoch": 0.7493813426567076, + "grad_norm": 0.3219311535358429, + "learning_rate": 1.5726972638106796e-05, + "loss": 1.7911, + "step": 4126 + }, + { + "epoch": 0.7495629668308852, + "grad_norm": 0.8192560076713562, + "learning_rate": 1.5705447907437037e-05, + "loss": 1.8681, + "step": 4127 + }, + { + "epoch": 0.7497445910050627, + "grad_norm": 0.676507830619812, + "learning_rate": 1.5683935172001586e-05, + "loss": 1.8379, + "step": 4128 + }, + { + "epoch": 0.7499262151792404, + "grad_norm": 0.43896564841270447, + "learning_rate": 1.566243443932496e-05, + "loss": 1.7465, + "step": 4129 + }, + { + "epoch": 0.750107839353418, + "grad_norm": 0.3637924790382385, + "learning_rate": 1.5640945716927475e-05, + "loss": 1.6533, + "step": 4130 + }, + { + "epoch": 0.7502894635275955, + "grad_norm": 0.38706567883491516, + "learning_rate": 1.5619469012325255e-05, + "loss": 1.6769, + "step": 4131 + }, + { + "epoch": 0.7504710877017731, + "grad_norm": 0.3383162319660187, + "learning_rate": 1.5598004333030237e-05, + "loss": 1.6198, + "step": 4132 + }, + { + "epoch": 0.7506527118759507, + "grad_norm": 0.4421071708202362, + "learning_rate": 1.557655168655016e-05, + "loss": 1.6735, + "step": 4133 + }, + { + "epoch": 0.7508343360501283, + "grad_norm": 0.6577813625335693, + "learning_rate": 1.5555111080388512e-05, + "loss": 1.6201, + "step": 4134 + }, + { + "epoch": 0.7510159602243058, + "grad_norm": 1.1857377290725708, + "learning_rate": 1.55336825220446e-05, + "loss": 1.726, + "step": 4135 + }, + { + "epoch": 0.7511975843984834, + "grad_norm": 1.168179988861084, + "learning_rate": 1.551226601901349e-05, + "loss": 1.7128, + "step": 4136 + }, + { + "epoch": 0.751379208572661, + "grad_norm": 0.31418755650520325, + "learning_rate": 1.5490861578786054e-05, + "loss": 1.7104, + "step": 4137 + }, + { + "epoch": 0.7515608327468386, + "grad_norm": 0.3819555640220642, + "learning_rate": 1.5469469208848973e-05, + "loss": 1.7123, + "step": 4138 + }, + { + "epoch": 0.7517424569210162, + "grad_norm": 0.33084821701049805, + "learning_rate": 1.5448088916684655e-05, + "loss": 1.8019, + "step": 4139 + }, + { + "epoch": 0.7519240810951938, + "grad_norm": 0.375338077545166, + "learning_rate": 1.542672070977128e-05, + "loss": 1.5492, + "step": 4140 + }, + { + "epoch": 0.7521057052693714, + "grad_norm": 0.36095306277275085, + "learning_rate": 1.540536459558286e-05, + "loss": 1.5636, + "step": 4141 + }, + { + "epoch": 0.7522873294435489, + "grad_norm": 0.4708751440048218, + "learning_rate": 1.5384020581589127e-05, + "loss": 1.59, + "step": 4142 + }, + { + "epoch": 0.7524689536177265, + "grad_norm": 0.5904425978660583, + "learning_rate": 1.5362688675255575e-05, + "loss": 1.7221, + "step": 4143 + }, + { + "epoch": 0.7526505777919041, + "grad_norm": 0.4659729599952698, + "learning_rate": 1.5341368884043518e-05, + "loss": 1.6564, + "step": 4144 + }, + { + "epoch": 0.7528322019660817, + "grad_norm": 0.33539512753486633, + "learning_rate": 1.5320061215409958e-05, + "loss": 1.5938, + "step": 4145 + }, + { + "epoch": 0.7530138261402592, + "grad_norm": 0.3387681841850281, + "learning_rate": 1.5298765676807742e-05, + "loss": 1.6379, + "step": 4146 + }, + { + "epoch": 0.7531954503144368, + "grad_norm": 0.39968717098236084, + "learning_rate": 1.52774822756854e-05, + "loss": 1.7234, + "step": 4147 + }, + { + "epoch": 0.7533770744886145, + "grad_norm": 0.32778799533843994, + "learning_rate": 1.5256211019487248e-05, + "loss": 1.8343, + "step": 4148 + }, + { + "epoch": 0.753558698662792, + "grad_norm": 0.3864232897758484, + "learning_rate": 1.523495191565334e-05, + "loss": 1.888, + "step": 4149 + }, + { + "epoch": 0.7537403228369696, + "grad_norm": 0.3323093354701996, + "learning_rate": 1.5213704971619502e-05, + "loss": 1.9033, + "step": 4150 + }, + { + "epoch": 0.7539219470111472, + "grad_norm": 1.1405141353607178, + "learning_rate": 1.519247019481731e-05, + "loss": 1.6093, + "step": 4151 + }, + { + "epoch": 0.7541035711853248, + "grad_norm": 0.3977713882923126, + "learning_rate": 1.5171247592674059e-05, + "loss": 1.7644, + "step": 4152 + }, + { + "epoch": 0.7542851953595023, + "grad_norm": 0.4082343876361847, + "learning_rate": 1.515003717261278e-05, + "loss": 1.7628, + "step": 4153 + }, + { + "epoch": 0.7544668195336799, + "grad_norm": 0.3433813750743866, + "learning_rate": 1.5128838942052282e-05, + "loss": 1.6646, + "step": 4154 + }, + { + "epoch": 0.7546484437078576, + "grad_norm": 0.33034488558769226, + "learning_rate": 1.5107652908407082e-05, + "loss": 1.7591, + "step": 4155 + }, + { + "epoch": 0.7548300678820351, + "grad_norm": 0.3999110758304596, + "learning_rate": 1.5086479079087423e-05, + "loss": 1.8159, + "step": 4156 + }, + { + "epoch": 0.7550116920562127, + "grad_norm": 1.0722752809524536, + "learning_rate": 1.5065317461499312e-05, + "loss": 1.8809, + "step": 4157 + }, + { + "epoch": 0.7551933162303902, + "grad_norm": 0.44335848093032837, + "learning_rate": 1.5044168063044445e-05, + "loss": 1.8349, + "step": 4158 + }, + { + "epoch": 0.7553749404045679, + "grad_norm": 0.5181594491004944, + "learning_rate": 1.5023030891120293e-05, + "loss": 1.8196, + "step": 4159 + }, + { + "epoch": 0.7555565645787454, + "grad_norm": 0.4768300950527191, + "learning_rate": 1.500190595312001e-05, + "loss": 1.6854, + "step": 4160 + }, + { + "epoch": 0.755738188752923, + "grad_norm": 0.8667663335800171, + "learning_rate": 1.4980793256432474e-05, + "loss": 1.6944, + "step": 4161 + }, + { + "epoch": 0.7559198129271006, + "grad_norm": 0.40919166803359985, + "learning_rate": 1.4959692808442289e-05, + "loss": 1.6798, + "step": 4162 + }, + { + "epoch": 0.7561014371012782, + "grad_norm": 0.3469299376010895, + "learning_rate": 1.4938604616529777e-05, + "loss": 1.6604, + "step": 4163 + }, + { + "epoch": 0.7562830612754557, + "grad_norm": 0.38557857275009155, + "learning_rate": 1.4917528688070998e-05, + "loss": 1.651, + "step": 4164 + }, + { + "epoch": 0.7564646854496333, + "grad_norm": 0.36639314889907837, + "learning_rate": 1.4896465030437678e-05, + "loss": 1.6385, + "step": 4165 + }, + { + "epoch": 0.756646309623811, + "grad_norm": 0.46716710925102234, + "learning_rate": 1.4875413650997272e-05, + "loss": 1.8558, + "step": 4166 + }, + { + "epoch": 0.7568279337979885, + "grad_norm": 0.5786119103431702, + "learning_rate": 1.4854374557112926e-05, + "loss": 1.8928, + "step": 4167 + }, + { + "epoch": 0.7570095579721661, + "grad_norm": 0.4797822833061218, + "learning_rate": 1.4833347756143506e-05, + "loss": 1.7161, + "step": 4168 + }, + { + "epoch": 0.7571911821463437, + "grad_norm": 0.5627937912940979, + "learning_rate": 1.4812333255443606e-05, + "loss": 1.7219, + "step": 4169 + }, + { + "epoch": 0.7573728063205213, + "grad_norm": 0.35671618580818176, + "learning_rate": 1.4791331062363451e-05, + "loss": 1.8461, + "step": 4170 + }, + { + "epoch": 0.7575544304946988, + "grad_norm": 0.49763354659080505, + "learning_rate": 1.4770341184248997e-05, + "loss": 1.6951, + "step": 4171 + }, + { + "epoch": 0.7577360546688764, + "grad_norm": 0.6327499747276306, + "learning_rate": 1.4749363628441914e-05, + "loss": 1.7569, + "step": 4172 + }, + { + "epoch": 0.757917678843054, + "grad_norm": 0.3401448726654053, + "learning_rate": 1.4728398402279525e-05, + "loss": 1.8731, + "step": 4173 + }, + { + "epoch": 0.7580993030172316, + "grad_norm": 0.4425918459892273, + "learning_rate": 1.4707445513094837e-05, + "loss": 1.641, + "step": 4174 + }, + { + "epoch": 0.7582809271914092, + "grad_norm": 0.9634883403778076, + "learning_rate": 1.468650496821658e-05, + "loss": 1.7429, + "step": 4175 + }, + { + "epoch": 0.7584625513655867, + "grad_norm": 1.0498324632644653, + "learning_rate": 1.4665576774969158e-05, + "loss": 1.8871, + "step": 4176 + }, + { + "epoch": 0.7586441755397644, + "grad_norm": 0.38177040219306946, + "learning_rate": 1.4644660940672627e-05, + "loss": 1.7049, + "step": 4177 + }, + { + "epoch": 0.7588257997139419, + "grad_norm": 0.3262515664100647, + "learning_rate": 1.462375747264274e-05, + "loss": 1.6788, + "step": 4178 + }, + { + "epoch": 0.7590074238881195, + "grad_norm": 0.3479563891887665, + "learning_rate": 1.4602866378190916e-05, + "loss": 1.8543, + "step": 4179 + }, + { + "epoch": 0.7591890480622971, + "grad_norm": 0.5018351674079895, + "learning_rate": 1.4581987664624236e-05, + "loss": 1.7739, + "step": 4180 + }, + { + "epoch": 0.7593706722364747, + "grad_norm": 0.6247456073760986, + "learning_rate": 1.4561121339245487e-05, + "loss": 1.6143, + "step": 4181 + }, + { + "epoch": 0.7595522964106523, + "grad_norm": 0.5524882674217224, + "learning_rate": 1.4540267409353104e-05, + "loss": 1.5619, + "step": 4182 + }, + { + "epoch": 0.7597339205848298, + "grad_norm": 0.3717946708202362, + "learning_rate": 1.4519425882241183e-05, + "loss": 1.5984, + "step": 4183 + }, + { + "epoch": 0.7599155447590075, + "grad_norm": 0.9055682420730591, + "learning_rate": 1.4498596765199452e-05, + "loss": 1.76, + "step": 4184 + }, + { + "epoch": 0.760097168933185, + "grad_norm": 0.33380892872810364, + "learning_rate": 1.447778006551337e-05, + "loss": 1.713, + "step": 4185 + }, + { + "epoch": 0.7602787931073626, + "grad_norm": 0.5809338688850403, + "learning_rate": 1.4456975790463995e-05, + "loss": 1.6811, + "step": 4186 + }, + { + "epoch": 0.7604604172815401, + "grad_norm": 0.5443786382675171, + "learning_rate": 1.4436183947328036e-05, + "loss": 1.7789, + "step": 4187 + }, + { + "epoch": 0.7606420414557178, + "grad_norm": 0.42955371737480164, + "learning_rate": 1.4415404543377892e-05, + "loss": 1.73, + "step": 4188 + }, + { + "epoch": 0.7608236656298953, + "grad_norm": 0.35628145933151245, + "learning_rate": 1.439463758588161e-05, + "loss": 1.5626, + "step": 4189 + }, + { + "epoch": 0.7610052898040729, + "grad_norm": 0.4249333143234253, + "learning_rate": 1.437388308210284e-05, + "loss": 1.6248, + "step": 4190 + }, + { + "epoch": 0.7611869139782506, + "grad_norm": 0.3871692717075348, + "learning_rate": 1.4353141039300921e-05, + "loss": 1.7438, + "step": 4191 + }, + { + "epoch": 0.7613685381524281, + "grad_norm": 0.5509214997291565, + "learning_rate": 1.4332411464730783e-05, + "loss": 1.6315, + "step": 4192 + }, + { + "epoch": 0.7615501623266057, + "grad_norm": 0.37787920236587524, + "learning_rate": 1.4311694365643047e-05, + "loss": 1.657, + "step": 4193 + }, + { + "epoch": 0.7617317865007832, + "grad_norm": 0.44771137833595276, + "learning_rate": 1.4290989749283967e-05, + "loss": 1.6578, + "step": 4194 + }, + { + "epoch": 0.7619134106749609, + "grad_norm": 0.757748544216156, + "learning_rate": 1.4270297622895395e-05, + "loss": 1.7479, + "step": 4195 + }, + { + "epoch": 0.7620950348491384, + "grad_norm": 0.5786657929420471, + "learning_rate": 1.4249617993714842e-05, + "loss": 1.7694, + "step": 4196 + }, + { + "epoch": 0.762276659023316, + "grad_norm": 0.362109512090683, + "learning_rate": 1.4228950868975416e-05, + "loss": 1.8236, + "step": 4197 + }, + { + "epoch": 0.7624582831974935, + "grad_norm": 0.4445590078830719, + "learning_rate": 1.4208296255905906e-05, + "loss": 1.7541, + "step": 4198 + }, + { + "epoch": 0.7626399073716712, + "grad_norm": 0.37870317697525024, + "learning_rate": 1.4187654161730667e-05, + "loss": 1.7851, + "step": 4199 + }, + { + "epoch": 0.7628215315458488, + "grad_norm": 1.5135464668273926, + "learning_rate": 1.4167024593669725e-05, + "loss": 1.9133, + "step": 4200 + }, + { + "epoch": 0.7630031557200263, + "grad_norm": 0.6860260963439941, + "learning_rate": 1.4146407558938695e-05, + "loss": 1.7671, + "step": 4201 + }, + { + "epoch": 0.763184779894204, + "grad_norm": 0.4201655387878418, + "learning_rate": 1.4125803064748794e-05, + "loss": 1.7229, + "step": 4202 + }, + { + "epoch": 0.7633664040683815, + "grad_norm": 0.3680914044380188, + "learning_rate": 1.4105211118306905e-05, + "loss": 1.6582, + "step": 4203 + }, + { + "epoch": 0.7635480282425591, + "grad_norm": 0.9631714224815369, + "learning_rate": 1.4084631726815472e-05, + "loss": 1.9659, + "step": 4204 + }, + { + "epoch": 0.7637296524167366, + "grad_norm": 0.42780983448028564, + "learning_rate": 1.4064064897472556e-05, + "loss": 1.6064, + "step": 4205 + }, + { + "epoch": 0.7639112765909143, + "grad_norm": 0.558135986328125, + "learning_rate": 1.4043510637471845e-05, + "loss": 1.7392, + "step": 4206 + }, + { + "epoch": 0.7640929007650918, + "grad_norm": 0.3827285170555115, + "learning_rate": 1.4022968954002641e-05, + "loss": 1.8247, + "step": 4207 + }, + { + "epoch": 0.7642745249392694, + "grad_norm": 0.4203011989593506, + "learning_rate": 1.4002439854249805e-05, + "loss": 1.6871, + "step": 4208 + }, + { + "epoch": 0.764456149113447, + "grad_norm": 0.5771546959877014, + "learning_rate": 1.3981923345393815e-05, + "loss": 1.6021, + "step": 4209 + }, + { + "epoch": 0.7646377732876246, + "grad_norm": 0.35695627331733704, + "learning_rate": 1.396141943461074e-05, + "loss": 1.631, + "step": 4210 + }, + { + "epoch": 0.7648193974618022, + "grad_norm": 0.32528796792030334, + "learning_rate": 1.3940928129072279e-05, + "loss": 1.7699, + "step": 4211 + }, + { + "epoch": 0.7650010216359797, + "grad_norm": 0.36780014634132385, + "learning_rate": 1.3920449435945654e-05, + "loss": 1.3558, + "step": 4212 + }, + { + "epoch": 0.7651826458101574, + "grad_norm": 0.37920820713043213, + "learning_rate": 1.3899983362393754e-05, + "loss": 1.7015, + "step": 4213 + }, + { + "epoch": 0.7653642699843349, + "grad_norm": 0.4190102517604828, + "learning_rate": 1.3879529915575002e-05, + "loss": 1.8481, + "step": 4214 + }, + { + "epoch": 0.7655458941585125, + "grad_norm": 0.9391307234764099, + "learning_rate": 1.385908910264339e-05, + "loss": 1.5664, + "step": 4215 + }, + { + "epoch": 0.76572751833269, + "grad_norm": 1.9302830696105957, + "learning_rate": 1.3838660930748565e-05, + "loss": 1.8412, + "step": 4216 + }, + { + "epoch": 0.7659091425068677, + "grad_norm": 0.3428305983543396, + "learning_rate": 1.3818245407035673e-05, + "loss": 1.6864, + "step": 4217 + }, + { + "epoch": 0.7660907666810453, + "grad_norm": 0.4523715078830719, + "learning_rate": 1.3797842538645466e-05, + "loss": 1.8309, + "step": 4218 + }, + { + "epoch": 0.7662723908552228, + "grad_norm": 0.4167747497558594, + "learning_rate": 1.3777452332714286e-05, + "loss": 1.8155, + "step": 4219 + }, + { + "epoch": 0.7664540150294004, + "grad_norm": 0.32244959473609924, + "learning_rate": 1.3757074796374048e-05, + "loss": 1.7545, + "step": 4220 + }, + { + "epoch": 0.766635639203578, + "grad_norm": 0.48793062567710876, + "learning_rate": 1.3736709936752196e-05, + "loss": 1.7273, + "step": 4221 + }, + { + "epoch": 0.7668172633777556, + "grad_norm": 0.4004579484462738, + "learning_rate": 1.3716357760971776e-05, + "loss": 1.652, + "step": 4222 + }, + { + "epoch": 0.7669988875519331, + "grad_norm": 0.5321884155273438, + "learning_rate": 1.3696018276151362e-05, + "loss": 1.6789, + "step": 4223 + }, + { + "epoch": 0.7671805117261108, + "grad_norm": 0.42339563369750977, + "learning_rate": 1.367569148940513e-05, + "loss": 1.7375, + "step": 4224 + }, + { + "epoch": 0.7673621359002883, + "grad_norm": 0.4051699638366699, + "learning_rate": 1.3655377407842812e-05, + "loss": 1.663, + "step": 4225 + }, + { + "epoch": 0.7675437600744659, + "grad_norm": 0.40933680534362793, + "learning_rate": 1.3635076038569673e-05, + "loss": 1.8344, + "step": 4226 + }, + { + "epoch": 0.7677253842486434, + "grad_norm": 0.39174485206604004, + "learning_rate": 1.3614787388686528e-05, + "loss": 1.6875, + "step": 4227 + }, + { + "epoch": 0.7679070084228211, + "grad_norm": 0.3594547212123871, + "learning_rate": 1.359451146528975e-05, + "loss": 1.8279, + "step": 4228 + }, + { + "epoch": 0.7680886325969987, + "grad_norm": 0.5098435878753662, + "learning_rate": 1.3574248275471297e-05, + "loss": 1.8045, + "step": 4229 + }, + { + "epoch": 0.7682702567711762, + "grad_norm": 0.42030927538871765, + "learning_rate": 1.3553997826318598e-05, + "loss": 1.8487, + "step": 4230 + }, + { + "epoch": 0.7684518809453538, + "grad_norm": 0.34279248118400574, + "learning_rate": 1.3533760124914713e-05, + "loss": 1.9313, + "step": 4231 + }, + { + "epoch": 0.7686335051195314, + "grad_norm": 0.4184218645095825, + "learning_rate": 1.3513535178338167e-05, + "loss": 1.7386, + "step": 4232 + }, + { + "epoch": 0.768815129293709, + "grad_norm": 0.3181811571121216, + "learning_rate": 1.3493322993663089e-05, + "loss": 1.6705, + "step": 4233 + }, + { + "epoch": 0.7689967534678865, + "grad_norm": 0.36398419737815857, + "learning_rate": 1.3473123577959085e-05, + "loss": 1.6486, + "step": 4234 + }, + { + "epoch": 0.7691783776420642, + "grad_norm": 0.42261311411857605, + "learning_rate": 1.3452936938291333e-05, + "loss": 1.563, + "step": 4235 + }, + { + "epoch": 0.7693600018162418, + "grad_norm": 0.8296411037445068, + "learning_rate": 1.3432763081720512e-05, + "loss": 1.7877, + "step": 4236 + }, + { + "epoch": 0.7695416259904193, + "grad_norm": 0.6092419028282166, + "learning_rate": 1.3412602015302866e-05, + "loss": 1.8077, + "step": 4237 + }, + { + "epoch": 0.7697232501645969, + "grad_norm": 0.3711092174053192, + "learning_rate": 1.3392453746090161e-05, + "loss": 1.7071, + "step": 4238 + }, + { + "epoch": 0.7699048743387745, + "grad_norm": 0.3661447763442993, + "learning_rate": 1.3372318281129659e-05, + "loss": 1.7217, + "step": 4239 + }, + { + "epoch": 0.7700864985129521, + "grad_norm": 0.3520258963108063, + "learning_rate": 1.3352195627464159e-05, + "loss": 1.6584, + "step": 4240 + }, + { + "epoch": 0.7702681226871296, + "grad_norm": 0.35179486870765686, + "learning_rate": 1.3332085792131966e-05, + "loss": 1.7709, + "step": 4241 + }, + { + "epoch": 0.7704497468613073, + "grad_norm": 0.3778263032436371, + "learning_rate": 1.3311988782166935e-05, + "loss": 1.7226, + "step": 4242 + }, + { + "epoch": 0.7706313710354848, + "grad_norm": 0.46752724051475525, + "learning_rate": 1.3291904604598392e-05, + "loss": 1.62, + "step": 4243 + }, + { + "epoch": 0.7708129952096624, + "grad_norm": 0.47172635793685913, + "learning_rate": 1.3271833266451233e-05, + "loss": 1.7653, + "step": 4244 + }, + { + "epoch": 0.77099461938384, + "grad_norm": 0.34338462352752686, + "learning_rate": 1.3251774774745785e-05, + "loss": 1.766, + "step": 4245 + }, + { + "epoch": 0.7711762435580176, + "grad_norm": 0.3563118875026703, + "learning_rate": 1.3231729136497955e-05, + "loss": 1.5229, + "step": 4246 + }, + { + "epoch": 0.7713578677321952, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.321169635871911e-05, + "loss": 1.7234, + "step": 4247 + }, + { + "epoch": 0.7715394919063727, + "grad_norm": 0.36612674593925476, + "learning_rate": 1.3191676448416134e-05, + "loss": 1.6639, + "step": 4248 + }, + { + "epoch": 0.7717211160805503, + "grad_norm": 0.3302246034145355, + "learning_rate": 1.3171669412591392e-05, + "loss": 1.6655, + "step": 4249 + }, + { + "epoch": 0.7719027402547279, + "grad_norm": 0.5210248231887817, + "learning_rate": 1.3151675258242768e-05, + "loss": 1.7874, + "step": 4250 + }, + { + "epoch": 0.7720843644289055, + "grad_norm": 0.43592268228530884, + "learning_rate": 1.3131693992363664e-05, + "loss": 1.5713, + "step": 4251 + }, + { + "epoch": 0.772265988603083, + "grad_norm": 0.34379762411117554, + "learning_rate": 1.3111725621942922e-05, + "loss": 1.6194, + "step": 4252 + }, + { + "epoch": 0.7724476127772607, + "grad_norm": 0.8084208965301514, + "learning_rate": 1.309177015396489e-05, + "loss": 1.8519, + "step": 4253 + }, + { + "epoch": 0.7726292369514383, + "grad_norm": 0.44669103622436523, + "learning_rate": 1.3071827595409403e-05, + "loss": 1.7715, + "step": 4254 + }, + { + "epoch": 0.7728108611256158, + "grad_norm": 0.5011433959007263, + "learning_rate": 1.3051897953251797e-05, + "loss": 1.7945, + "step": 4255 + }, + { + "epoch": 0.7729924852997934, + "grad_norm": 2.0691335201263428, + "learning_rate": 1.3031981234462892e-05, + "loss": 1.8861, + "step": 4256 + }, + { + "epoch": 0.773174109473971, + "grad_norm": 0.31306329369544983, + "learning_rate": 1.3012077446008968e-05, + "loss": 1.8261, + "step": 4257 + }, + { + "epoch": 0.7733557336481486, + "grad_norm": 1.4421753883361816, + "learning_rate": 1.2992186594851768e-05, + "loss": 1.8557, + "step": 4258 + }, + { + "epoch": 0.7735373578223261, + "grad_norm": 0.38541164994239807, + "learning_rate": 1.2972308687948565e-05, + "loss": 1.5616, + "step": 4259 + }, + { + "epoch": 0.7737189819965037, + "grad_norm": 0.5398344993591309, + "learning_rate": 1.2952443732252057e-05, + "loss": 1.7958, + "step": 4260 + }, + { + "epoch": 0.7739006061706813, + "grad_norm": 0.3978300988674164, + "learning_rate": 1.293259173471041e-05, + "loss": 1.6294, + "step": 4261 + }, + { + "epoch": 0.7740822303448589, + "grad_norm": 0.3187747001647949, + "learning_rate": 1.29127527022673e-05, + "loss": 1.9252, + "step": 4262 + }, + { + "epoch": 0.7742638545190365, + "grad_norm": 0.5095344185829163, + "learning_rate": 1.2892926641861814e-05, + "loss": 1.6712, + "step": 4263 + }, + { + "epoch": 0.7744454786932141, + "grad_norm": 0.4691472053527832, + "learning_rate": 1.2873113560428568e-05, + "loss": 1.7195, + "step": 4264 + }, + { + "epoch": 0.7746271028673917, + "grad_norm": 0.491781085729599, + "learning_rate": 1.285331346489757e-05, + "loss": 1.8219, + "step": 4265 + }, + { + "epoch": 0.7748087270415692, + "grad_norm": 0.370246559381485, + "learning_rate": 1.2833526362194332e-05, + "loss": 1.7141, + "step": 4266 + }, + { + "epoch": 0.7749903512157468, + "grad_norm": 0.711889922618866, + "learning_rate": 1.2813752259239781e-05, + "loss": 1.699, + "step": 4267 + }, + { + "epoch": 0.7751719753899244, + "grad_norm": 0.7655342221260071, + "learning_rate": 1.2793991162950337e-05, + "loss": 1.7805, + "step": 4268 + }, + { + "epoch": 0.775353599564102, + "grad_norm": 0.40114110708236694, + "learning_rate": 1.2774243080237874e-05, + "loss": 1.7917, + "step": 4269 + }, + { + "epoch": 0.7755352237382795, + "grad_norm": 0.2923238277435303, + "learning_rate": 1.2754508018009675e-05, + "loss": 1.6588, + "step": 4270 + }, + { + "epoch": 0.7757168479124571, + "grad_norm": 0.34129759669303894, + "learning_rate": 1.2734785983168485e-05, + "loss": 1.6993, + "step": 4271 + }, + { + "epoch": 0.7758984720866348, + "grad_norm": 0.33769315481185913, + "learning_rate": 1.2715076982612511e-05, + "loss": 1.6588, + "step": 4272 + }, + { + "epoch": 0.7760800962608123, + "grad_norm": 0.8178960084915161, + "learning_rate": 1.2695381023235386e-05, + "loss": 1.7092, + "step": 4273 + }, + { + "epoch": 0.7762617204349899, + "grad_norm": 0.6499577760696411, + "learning_rate": 1.267569811192616e-05, + "loss": 1.6992, + "step": 4274 + }, + { + "epoch": 0.7764433446091675, + "grad_norm": 0.4919712543487549, + "learning_rate": 1.2656028255569375e-05, + "loss": 1.9522, + "step": 4275 + }, + { + "epoch": 0.7766249687833451, + "grad_norm": 0.44945672154426575, + "learning_rate": 1.2636371461044933e-05, + "loss": 1.8821, + "step": 4276 + }, + { + "epoch": 0.7768065929575226, + "grad_norm": 0.36502188444137573, + "learning_rate": 1.261672773522825e-05, + "loss": 1.5964, + "step": 4277 + }, + { + "epoch": 0.7769882171317002, + "grad_norm": 0.40018266439437866, + "learning_rate": 1.2597097084990112e-05, + "loss": 1.7381, + "step": 4278 + }, + { + "epoch": 0.7771698413058779, + "grad_norm": 0.3559877276420593, + "learning_rate": 1.2577479517196727e-05, + "loss": 1.4965, + "step": 4279 + }, + { + "epoch": 0.7773514654800554, + "grad_norm": 0.4960487484931946, + "learning_rate": 1.2557875038709765e-05, + "loss": 1.778, + "step": 4280 + }, + { + "epoch": 0.777533089654233, + "grad_norm": 0.317813515663147, + "learning_rate": 1.2538283656386319e-05, + "loss": 1.8122, + "step": 4281 + }, + { + "epoch": 0.7777147138284105, + "grad_norm": 0.396422415971756, + "learning_rate": 1.2518705377078866e-05, + "loss": 1.8543, + "step": 4282 + }, + { + "epoch": 0.7778963380025882, + "grad_norm": 0.3818439841270447, + "learning_rate": 1.2499140207635319e-05, + "loss": 1.6222, + "step": 4283 + }, + { + "epoch": 0.7780779621767657, + "grad_norm": 0.4326133131980896, + "learning_rate": 1.2479588154899003e-05, + "loss": 1.7868, + "step": 4284 + }, + { + "epoch": 0.7782595863509433, + "grad_norm": 0.3515814542770386, + "learning_rate": 1.2460049225708637e-05, + "loss": 1.731, + "step": 4285 + }, + { + "epoch": 0.7784412105251209, + "grad_norm": 1.0905495882034302, + "learning_rate": 1.2440523426898387e-05, + "loss": 1.7083, + "step": 4286 + }, + { + "epoch": 0.7786228346992985, + "grad_norm": 0.3423493206501007, + "learning_rate": 1.242101076529782e-05, + "loss": 1.7752, + "step": 4287 + }, + { + "epoch": 0.778804458873476, + "grad_norm": 0.431401789188385, + "learning_rate": 1.240151124773189e-05, + "loss": 1.7583, + "step": 4288 + }, + { + "epoch": 0.7789860830476536, + "grad_norm": 0.3722066581249237, + "learning_rate": 1.2382024881020937e-05, + "loss": 1.8214, + "step": 4289 + }, + { + "epoch": 0.7791677072218313, + "grad_norm": 0.5015780925750732, + "learning_rate": 1.2362551671980755e-05, + "loss": 1.6462, + "step": 4290 + }, + { + "epoch": 0.7793493313960088, + "grad_norm": 0.35513433814048767, + "learning_rate": 1.2343091627422487e-05, + "loss": 1.4519, + "step": 4291 + }, + { + "epoch": 0.7795309555701864, + "grad_norm": 0.386691153049469, + "learning_rate": 1.232364475415268e-05, + "loss": 1.7246, + "step": 4292 + }, + { + "epoch": 0.7797125797443639, + "grad_norm": 0.34038490056991577, + "learning_rate": 1.2304211058973297e-05, + "loss": 1.8466, + "step": 4293 + }, + { + "epoch": 0.7798942039185416, + "grad_norm": 0.5250787138938904, + "learning_rate": 1.2284790548681684e-05, + "loss": 1.8184, + "step": 4294 + }, + { + "epoch": 0.7800758280927191, + "grad_norm": 0.40798166394233704, + "learning_rate": 1.226538323007057e-05, + "loss": 1.8223, + "step": 4295 + }, + { + "epoch": 0.7802574522668967, + "grad_norm": 0.37590616941452026, + "learning_rate": 1.2245989109928057e-05, + "loss": 1.9055, + "step": 4296 + }, + { + "epoch": 0.7804390764410744, + "grad_norm": 0.5477168560028076, + "learning_rate": 1.2226608195037648e-05, + "loss": 1.7743, + "step": 4297 + }, + { + "epoch": 0.7806207006152519, + "grad_norm": 0.4393215775489807, + "learning_rate": 1.2207240492178206e-05, + "loss": 1.802, + "step": 4298 + }, + { + "epoch": 0.7808023247894295, + "grad_norm": 0.3563307523727417, + "learning_rate": 1.2187886008124e-05, + "loss": 1.7445, + "step": 4299 + }, + { + "epoch": 0.780983948963607, + "grad_norm": 0.4044618010520935, + "learning_rate": 1.2168544749644683e-05, + "loss": 1.6543, + "step": 4300 + }, + { + "epoch": 0.7811655731377847, + "grad_norm": 0.44848498702049255, + "learning_rate": 1.2149216723505246e-05, + "loss": 1.8761, + "step": 4301 + }, + { + "epoch": 0.7813471973119622, + "grad_norm": 0.37952542304992676, + "learning_rate": 1.2129901936466059e-05, + "loss": 1.7154, + "step": 4302 + }, + { + "epoch": 0.7815288214861398, + "grad_norm": 0.4693027436733246, + "learning_rate": 1.2110600395282896e-05, + "loss": 1.7886, + "step": 4303 + }, + { + "epoch": 0.7817104456603174, + "grad_norm": 0.4422491490840912, + "learning_rate": 1.209131210670686e-05, + "loss": 1.7925, + "step": 4304 + }, + { + "epoch": 0.781892069834495, + "grad_norm": 0.6252224445343018, + "learning_rate": 1.2072037077484416e-05, + "loss": 1.747, + "step": 4305 + }, + { + "epoch": 0.7820736940086725, + "grad_norm": 0.3943629860877991, + "learning_rate": 1.2052775314357423e-05, + "loss": 1.7079, + "step": 4306 + }, + { + "epoch": 0.7822553181828501, + "grad_norm": 0.4714201092720032, + "learning_rate": 1.2033526824063096e-05, + "loss": 1.665, + "step": 4307 + }, + { + "epoch": 0.7824369423570278, + "grad_norm": 0.4220423400402069, + "learning_rate": 1.2014291613333984e-05, + "loss": 1.8069, + "step": 4308 + }, + { + "epoch": 0.7826185665312053, + "grad_norm": 0.4449215531349182, + "learning_rate": 1.1995069688898003e-05, + "loss": 1.8611, + "step": 4309 + }, + { + "epoch": 0.7828001907053829, + "grad_norm": 0.34403201937675476, + "learning_rate": 1.197586105747841e-05, + "loss": 1.7783, + "step": 4310 + }, + { + "epoch": 0.7829818148795604, + "grad_norm": 0.4335022270679474, + "learning_rate": 1.1956665725793831e-05, + "loss": 1.7227, + "step": 4311 + }, + { + "epoch": 0.7831634390537381, + "grad_norm": 0.38837575912475586, + "learning_rate": 1.1937483700558256e-05, + "loss": 1.6256, + "step": 4312 + }, + { + "epoch": 0.7833450632279156, + "grad_norm": 0.40016186237335205, + "learning_rate": 1.1918314988480977e-05, + "loss": 1.6264, + "step": 4313 + }, + { + "epoch": 0.7835266874020932, + "grad_norm": 0.3776865005493164, + "learning_rate": 1.1899159596266652e-05, + "loss": 1.6613, + "step": 4314 + }, + { + "epoch": 0.7837083115762709, + "grad_norm": 0.4662635624408722, + "learning_rate": 1.1880017530615267e-05, + "loss": 1.6111, + "step": 4315 + }, + { + "epoch": 0.7838899357504484, + "grad_norm": 0.6887052059173584, + "learning_rate": 1.1860888798222187e-05, + "loss": 1.783, + "step": 4316 + }, + { + "epoch": 0.784071559924626, + "grad_norm": 1.3682857751846313, + "learning_rate": 1.184177340577805e-05, + "loss": 1.7721, + "step": 4317 + }, + { + "epoch": 0.7842531840988035, + "grad_norm": 0.3863130509853363, + "learning_rate": 1.1822671359968901e-05, + "loss": 1.8216, + "step": 4318 + }, + { + "epoch": 0.7844348082729812, + "grad_norm": 0.37203511595726013, + "learning_rate": 1.1803582667476043e-05, + "loss": 1.6507, + "step": 4319 + }, + { + "epoch": 0.7846164324471587, + "grad_norm": 0.3317359387874603, + "learning_rate": 1.1784507334976175e-05, + "loss": 1.4854, + "step": 4320 + }, + { + "epoch": 0.7847980566213363, + "grad_norm": 0.43319979310035706, + "learning_rate": 1.1765445369141276e-05, + "loss": 1.9536, + "step": 4321 + }, + { + "epoch": 0.7849796807955138, + "grad_norm": 0.40528199076652527, + "learning_rate": 1.1746396776638669e-05, + "loss": 1.5639, + "step": 4322 + }, + { + "epoch": 0.7851613049696915, + "grad_norm": 0.3822980523109436, + "learning_rate": 1.1727361564130979e-05, + "loss": 1.7153, + "step": 4323 + }, + { + "epoch": 0.785342929143869, + "grad_norm": 0.49178346991539, + "learning_rate": 1.1708339738276181e-05, + "loss": 1.504, + "step": 4324 + }, + { + "epoch": 0.7855245533180466, + "grad_norm": 0.42102450132369995, + "learning_rate": 1.1689331305727574e-05, + "loss": 1.919, + "step": 4325 + }, + { + "epoch": 0.7857061774922243, + "grad_norm": 0.41422492265701294, + "learning_rate": 1.1670336273133742e-05, + "loss": 1.6895, + "step": 4326 + }, + { + "epoch": 0.7858878016664018, + "grad_norm": 0.9862390160560608, + "learning_rate": 1.165135464713858e-05, + "loss": 1.7175, + "step": 4327 + }, + { + "epoch": 0.7860694258405794, + "grad_norm": 0.42308226227760315, + "learning_rate": 1.163238643438131e-05, + "loss": 1.7794, + "step": 4328 + }, + { + "epoch": 0.7862510500147569, + "grad_norm": 0.407773494720459, + "learning_rate": 1.1613431641496475e-05, + "loss": 1.5528, + "step": 4329 + }, + { + "epoch": 0.7864326741889346, + "grad_norm": 0.5459104776382446, + "learning_rate": 1.1594490275113884e-05, + "loss": 1.8652, + "step": 4330 + }, + { + "epoch": 0.7866142983631121, + "grad_norm": 0.31533345580101013, + "learning_rate": 1.1575562341858709e-05, + "loss": 1.8543, + "step": 4331 + }, + { + "epoch": 0.7867959225372897, + "grad_norm": 0.33378443121910095, + "learning_rate": 1.1556647848351365e-05, + "loss": 1.6461, + "step": 4332 + }, + { + "epoch": 0.7869775467114672, + "grad_norm": 0.4196130633354187, + "learning_rate": 1.1537746801207583e-05, + "loss": 1.8263, + "step": 4333 + }, + { + "epoch": 0.7871591708856449, + "grad_norm": 0.3888135552406311, + "learning_rate": 1.1518859207038429e-05, + "loss": 1.982, + "step": 4334 + }, + { + "epoch": 0.7873407950598225, + "grad_norm": 0.31101202964782715, + "learning_rate": 1.1499985072450208e-05, + "loss": 1.6314, + "step": 4335 + }, + { + "epoch": 0.787522419234, + "grad_norm": 0.36064112186431885, + "learning_rate": 1.1481124404044535e-05, + "loss": 1.8554, + "step": 4336 + }, + { + "epoch": 0.7877040434081777, + "grad_norm": 0.3885866105556488, + "learning_rate": 1.1462277208418338e-05, + "loss": 1.7755, + "step": 4337 + }, + { + "epoch": 0.7878856675823552, + "grad_norm": 0.6025899052619934, + "learning_rate": 1.144344349216383e-05, + "loss": 1.7034, + "step": 4338 + }, + { + "epoch": 0.7880672917565328, + "grad_norm": 0.3720230758190155, + "learning_rate": 1.1424623261868472e-05, + "loss": 1.6349, + "step": 4339 + }, + { + "epoch": 0.7882489159307103, + "grad_norm": 0.3159719407558441, + "learning_rate": 1.1405816524115044e-05, + "loss": 1.5664, + "step": 4340 + }, + { + "epoch": 0.788430540104888, + "grad_norm": 0.6075266003608704, + "learning_rate": 1.1387023285481575e-05, + "loss": 1.744, + "step": 4341 + }, + { + "epoch": 0.7886121642790656, + "grad_norm": 0.45991504192352295, + "learning_rate": 1.1368243552541403e-05, + "loss": 1.7863, + "step": 4342 + }, + { + "epoch": 0.7887937884532431, + "grad_norm": 0.442395955324173, + "learning_rate": 1.134947733186315e-05, + "loss": 1.7916, + "step": 4343 + }, + { + "epoch": 0.7889754126274207, + "grad_norm": 0.4696204960346222, + "learning_rate": 1.133072463001068e-05, + "loss": 1.7074, + "step": 4344 + }, + { + "epoch": 0.7891570368015983, + "grad_norm": 0.3874971568584442, + "learning_rate": 1.1311985453543134e-05, + "loss": 1.7635, + "step": 4345 + }, + { + "epoch": 0.7893386609757759, + "grad_norm": 0.37201961874961853, + "learning_rate": 1.1293259809014922e-05, + "loss": 1.7668, + "step": 4346 + }, + { + "epoch": 0.7895202851499534, + "grad_norm": 0.3903019428253174, + "learning_rate": 1.1274547702975757e-05, + "loss": 1.6576, + "step": 4347 + }, + { + "epoch": 0.7897019093241311, + "grad_norm": 0.4975596070289612, + "learning_rate": 1.1255849141970554e-05, + "loss": 1.5879, + "step": 4348 + }, + { + "epoch": 0.7898835334983086, + "grad_norm": 0.35393449664115906, + "learning_rate": 1.1237164132539551e-05, + "loss": 1.7143, + "step": 4349 + }, + { + "epoch": 0.7900651576724862, + "grad_norm": 0.4197903871536255, + "learning_rate": 1.1218492681218202e-05, + "loss": 1.6567, + "step": 4350 + }, + { + "epoch": 0.7902467818466637, + "grad_norm": 0.42693057656288147, + "learning_rate": 1.1199834794537263e-05, + "loss": 1.6714, + "step": 4351 + }, + { + "epoch": 0.7904284060208414, + "grad_norm": 0.3511961102485657, + "learning_rate": 1.118119047902269e-05, + "loss": 1.806, + "step": 4352 + }, + { + "epoch": 0.790610030195019, + "grad_norm": 0.3782356381416321, + "learning_rate": 1.1162559741195733e-05, + "loss": 1.7853, + "step": 4353 + }, + { + "epoch": 0.7907916543691965, + "grad_norm": 0.4044421315193176, + "learning_rate": 1.114394258757287e-05, + "loss": 1.759, + "step": 4354 + }, + { + "epoch": 0.7909732785433741, + "grad_norm": 0.5887256860733032, + "learning_rate": 1.1125339024665843e-05, + "loss": 1.7284, + "step": 4355 + }, + { + "epoch": 0.7911549027175517, + "grad_norm": 0.38168877363204956, + "learning_rate": 1.1106749058981653e-05, + "loss": 1.8397, + "step": 4356 + }, + { + "epoch": 0.7913365268917293, + "grad_norm": 0.410632461309433, + "learning_rate": 1.108817269702252e-05, + "loss": 1.6726, + "step": 4357 + }, + { + "epoch": 0.7915181510659068, + "grad_norm": 0.42759329080581665, + "learning_rate": 1.1069609945285902e-05, + "loss": 1.7859, + "step": 4358 + }, + { + "epoch": 0.7916997752400845, + "grad_norm": 0.5910013318061829, + "learning_rate": 1.1051060810264508e-05, + "loss": 1.8166, + "step": 4359 + }, + { + "epoch": 0.791881399414262, + "grad_norm": 0.37470152974128723, + "learning_rate": 1.10325252984463e-05, + "loss": 1.6343, + "step": 4360 + }, + { + "epoch": 0.7920630235884396, + "grad_norm": 0.46716374158859253, + "learning_rate": 1.1014003416314439e-05, + "loss": 1.8285, + "step": 4361 + }, + { + "epoch": 0.7922446477626172, + "grad_norm": 0.5370849967002869, + "learning_rate": 1.0995495170347365e-05, + "loss": 1.663, + "step": 4362 + }, + { + "epoch": 0.7924262719367948, + "grad_norm": 1.662850260734558, + "learning_rate": 1.0977000567018697e-05, + "loss": 1.686, + "step": 4363 + }, + { + "epoch": 0.7926078961109724, + "grad_norm": 0.42218026518821716, + "learning_rate": 1.095851961279733e-05, + "loss": 1.6322, + "step": 4364 + }, + { + "epoch": 0.7927895202851499, + "grad_norm": 0.40685606002807617, + "learning_rate": 1.0940052314147358e-05, + "loss": 1.6172, + "step": 4365 + }, + { + "epoch": 0.7929711444593275, + "grad_norm": 0.39650702476501465, + "learning_rate": 1.0921598677528078e-05, + "loss": 1.7453, + "step": 4366 + }, + { + "epoch": 0.7931527686335051, + "grad_norm": 0.31650277972221375, + "learning_rate": 1.0903158709394074e-05, + "loss": 1.7781, + "step": 4367 + }, + { + "epoch": 0.7933343928076827, + "grad_norm": 0.39536502957344055, + "learning_rate": 1.0884732416195075e-05, + "loss": 1.9665, + "step": 4368 + }, + { + "epoch": 0.7935160169818603, + "grad_norm": 0.3061832785606384, + "learning_rate": 1.0866319804376085e-05, + "loss": 1.8434, + "step": 4369 + }, + { + "epoch": 0.7936976411560379, + "grad_norm": 0.31881165504455566, + "learning_rate": 1.0847920880377293e-05, + "loss": 1.5791, + "step": 4370 + }, + { + "epoch": 0.7938792653302155, + "grad_norm": 0.44311171770095825, + "learning_rate": 1.0829535650634104e-05, + "loss": 1.8169, + "step": 4371 + }, + { + "epoch": 0.794060889504393, + "grad_norm": 0.34514209628105164, + "learning_rate": 1.0811164121577116e-05, + "loss": 1.796, + "step": 4372 + }, + { + "epoch": 0.7942425136785706, + "grad_norm": 0.40144217014312744, + "learning_rate": 1.0792806299632175e-05, + "loss": 1.5431, + "step": 4373 + }, + { + "epoch": 0.7944241378527482, + "grad_norm": 0.3435051441192627, + "learning_rate": 1.077446219122032e-05, + "loss": 1.5503, + "step": 4374 + }, + { + "epoch": 0.7946057620269258, + "grad_norm": 0.4188168942928314, + "learning_rate": 1.0756131802757768e-05, + "loss": 1.9323, + "step": 4375 + }, + { + "epoch": 0.7947873862011033, + "grad_norm": 0.7587621808052063, + "learning_rate": 1.0737815140655955e-05, + "loss": 1.8282, + "step": 4376 + }, + { + "epoch": 0.794969010375281, + "grad_norm": 0.34740912914276123, + "learning_rate": 1.0719512211321531e-05, + "loss": 1.7388, + "step": 4377 + }, + { + "epoch": 0.7951506345494586, + "grad_norm": 0.8639332056045532, + "learning_rate": 1.070122302115632e-05, + "loss": 1.7383, + "step": 4378 + }, + { + "epoch": 0.7953322587236361, + "grad_norm": 0.5078075528144836, + "learning_rate": 1.0682947576557329e-05, + "loss": 1.6998, + "step": 4379 + }, + { + "epoch": 0.7955138828978137, + "grad_norm": 0.37886303663253784, + "learning_rate": 1.0664685883916797e-05, + "loss": 1.69, + "step": 4380 + }, + { + "epoch": 0.7956955070719913, + "grad_norm": 0.36574116349220276, + "learning_rate": 1.0646437949622118e-05, + "loss": 1.8691, + "step": 4381 + }, + { + "epoch": 0.7958771312461689, + "grad_norm": 0.5430355668067932, + "learning_rate": 1.0628203780055907e-05, + "loss": 1.7145, + "step": 4382 + }, + { + "epoch": 0.7960587554203464, + "grad_norm": 0.42663589119911194, + "learning_rate": 1.0609983381595934e-05, + "loss": 1.6141, + "step": 4383 + }, + { + "epoch": 0.796240379594524, + "grad_norm": 0.3236360251903534, + "learning_rate": 1.0591776760615158e-05, + "loss": 1.684, + "step": 4384 + }, + { + "epoch": 0.7964220037687016, + "grad_norm": 0.38191041350364685, + "learning_rate": 1.0573583923481711e-05, + "loss": 1.819, + "step": 4385 + }, + { + "epoch": 0.7966036279428792, + "grad_norm": 0.3629547357559204, + "learning_rate": 1.0555404876558939e-05, + "loss": 1.7753, + "step": 4386 + }, + { + "epoch": 0.7967852521170568, + "grad_norm": 0.3731331527233124, + "learning_rate": 1.0537239626205347e-05, + "loss": 1.6857, + "step": 4387 + }, + { + "epoch": 0.7969668762912344, + "grad_norm": 0.617185652256012, + "learning_rate": 1.05190881787746e-05, + "loss": 1.5497, + "step": 4388 + }, + { + "epoch": 0.797148500465412, + "grad_norm": 0.5770426392555237, + "learning_rate": 1.0500950540615534e-05, + "loss": 1.6317, + "step": 4389 + }, + { + "epoch": 0.7973301246395895, + "grad_norm": 0.3708058297634125, + "learning_rate": 1.0482826718072186e-05, + "loss": 1.7382, + "step": 4390 + }, + { + "epoch": 0.7975117488137671, + "grad_norm": 0.644450306892395, + "learning_rate": 1.0464716717483736e-05, + "loss": 1.7498, + "step": 4391 + }, + { + "epoch": 0.7976933729879447, + "grad_norm": 0.3912017047405243, + "learning_rate": 1.044662054518451e-05, + "loss": 1.671, + "step": 4392 + }, + { + "epoch": 0.7978749971621223, + "grad_norm": 0.35457611083984375, + "learning_rate": 1.0428538207504057e-05, + "loss": 1.8412, + "step": 4393 + }, + { + "epoch": 0.7980566213362998, + "grad_norm": 0.41863059997558594, + "learning_rate": 1.0410469710767023e-05, + "loss": 1.6353, + "step": 4394 + }, + { + "epoch": 0.7982382455104774, + "grad_norm": 1.5347224473953247, + "learning_rate": 1.0392415061293264e-05, + "loss": 1.9343, + "step": 4395 + }, + { + "epoch": 0.7984198696846551, + "grad_norm": 0.3452093303203583, + "learning_rate": 1.0374374265397763e-05, + "loss": 1.5659, + "step": 4396 + }, + { + "epoch": 0.7986014938588326, + "grad_norm": 0.5257523655891418, + "learning_rate": 1.0356347329390647e-05, + "loss": 1.7907, + "step": 4397 + }, + { + "epoch": 0.7987831180330102, + "grad_norm": 0.43351879715919495, + "learning_rate": 1.0338334259577226e-05, + "loss": 1.8591, + "step": 4398 + }, + { + "epoch": 0.7989647422071878, + "grad_norm": 0.3520071506500244, + "learning_rate": 1.0320335062257958e-05, + "loss": 1.9433, + "step": 4399 + }, + { + "epoch": 0.7991463663813654, + "grad_norm": 0.24747538566589355, + "learning_rate": 1.0302349743728423e-05, + "loss": 1.559, + "step": 4400 + }, + { + "epoch": 0.7993279905555429, + "grad_norm": 0.33472487330436707, + "learning_rate": 1.0284378310279369e-05, + "loss": 1.4275, + "step": 4401 + }, + { + "epoch": 0.7995096147297205, + "grad_norm": 0.3638002574443817, + "learning_rate": 1.026642076819666e-05, + "loss": 1.7579, + "step": 4402 + }, + { + "epoch": 0.7996912389038981, + "grad_norm": 1.4795234203338623, + "learning_rate": 1.0248477123761352e-05, + "loss": 1.9663, + "step": 4403 + }, + { + "epoch": 0.7998728630780757, + "grad_norm": 0.4721723198890686, + "learning_rate": 1.0230547383249573e-05, + "loss": 1.7062, + "step": 4404 + }, + { + "epoch": 0.8000544872522533, + "grad_norm": 1.862224817276001, + "learning_rate": 1.0212631552932655e-05, + "loss": 2.0029, + "step": 4405 + }, + { + "epoch": 0.8002361114264308, + "grad_norm": 0.4283364713191986, + "learning_rate": 1.0194729639077021e-05, + "loss": 1.8172, + "step": 4406 + }, + { + "epoch": 0.8004177356006085, + "grad_norm": 0.3574099540710449, + "learning_rate": 1.017684164794423e-05, + "loss": 1.5621, + "step": 4407 + }, + { + "epoch": 0.800599359774786, + "grad_norm": 0.3817092478275299, + "learning_rate": 1.0158967585790997e-05, + "loss": 1.5093, + "step": 4408 + }, + { + "epoch": 0.8007809839489636, + "grad_norm": 0.4326121509075165, + "learning_rate": 1.0141107458869131e-05, + "loss": 1.8302, + "step": 4409 + }, + { + "epoch": 0.8009626081231412, + "grad_norm": 0.38276612758636475, + "learning_rate": 1.0123261273425588e-05, + "loss": 1.6688, + "step": 4410 + }, + { + "epoch": 0.8011442322973188, + "grad_norm": 0.36293771862983704, + "learning_rate": 1.0105429035702441e-05, + "loss": 1.5868, + "step": 4411 + }, + { + "epoch": 0.8013258564714963, + "grad_norm": 0.35320183634757996, + "learning_rate": 1.0087610751936904e-05, + "loss": 1.7529, + "step": 4412 + }, + { + "epoch": 0.8015074806456739, + "grad_norm": 0.4537498950958252, + "learning_rate": 1.0069806428361278e-05, + "loss": 1.8066, + "step": 4413 + }, + { + "epoch": 0.8016891048198516, + "grad_norm": 0.42348748445510864, + "learning_rate": 1.0052016071203002e-05, + "loss": 1.8355, + "step": 4414 + }, + { + "epoch": 0.8018707289940291, + "grad_norm": 0.7976099252700806, + "learning_rate": 1.0034239686684621e-05, + "loss": 1.7557, + "step": 4415 + }, + { + "epoch": 0.8020523531682067, + "grad_norm": 1.0132827758789062, + "learning_rate": 1.0016477281023784e-05, + "loss": 1.8068, + "step": 4416 + }, + { + "epoch": 0.8022339773423842, + "grad_norm": 0.36656931042671204, + "learning_rate": 9.998728860433276e-06, + "loss": 1.6974, + "step": 4417 + }, + { + "epoch": 0.8024156015165619, + "grad_norm": 0.42615750432014465, + "learning_rate": 9.980994431120988e-06, + "loss": 1.7299, + "step": 4418 + }, + { + "epoch": 0.8025972256907394, + "grad_norm": 0.37986376881599426, + "learning_rate": 9.96327399928989e-06, + "loss": 1.5381, + "step": 4419 + }, + { + "epoch": 0.802778849864917, + "grad_norm": 0.39807745814323425, + "learning_rate": 9.945567571138065e-06, + "loss": 1.6857, + "step": 4420 + }, + { + "epoch": 0.8029604740390947, + "grad_norm": 0.42913058400154114, + "learning_rate": 9.927875152858729e-06, + "loss": 1.7933, + "step": 4421 + }, + { + "epoch": 0.8031420982132722, + "grad_norm": 0.4496798813343048, + "learning_rate": 9.91019675064016e-06, + "loss": 1.6258, + "step": 4422 + }, + { + "epoch": 0.8033237223874498, + "grad_norm": 1.142650842666626, + "learning_rate": 9.89253237066574e-06, + "loss": 1.9673, + "step": 4423 + }, + { + "epoch": 0.8035053465616273, + "grad_norm": 0.8243594169616699, + "learning_rate": 9.874882019113957e-06, + "loss": 1.7683, + "step": 4424 + }, + { + "epoch": 0.803686970735805, + "grad_norm": 0.3937850296497345, + "learning_rate": 9.857245702158413e-06, + "loss": 1.5651, + "step": 4425 + }, + { + "epoch": 0.8038685949099825, + "grad_norm": 0.6863056421279907, + "learning_rate": 9.83962342596776e-06, + "loss": 1.7385, + "step": 4426 + }, + { + "epoch": 0.8040502190841601, + "grad_norm": 0.40004217624664307, + "learning_rate": 9.822015196705753e-06, + "loss": 1.8222, + "step": 4427 + }, + { + "epoch": 0.8042318432583376, + "grad_norm": 0.5212815403938293, + "learning_rate": 9.80442102053123e-06, + "loss": 1.7254, + "step": 4428 + }, + { + "epoch": 0.8044134674325153, + "grad_norm": 0.4886102080345154, + "learning_rate": 9.786840903598126e-06, + "loss": 1.8885, + "step": 4429 + }, + { + "epoch": 0.8045950916066928, + "grad_norm": 0.30598706007003784, + "learning_rate": 9.769274852055477e-06, + "loss": 1.6355, + "step": 4430 + }, + { + "epoch": 0.8047767157808704, + "grad_norm": 0.5227814316749573, + "learning_rate": 9.751722872047353e-06, + "loss": 1.8088, + "step": 4431 + }, + { + "epoch": 0.8049583399550481, + "grad_norm": 0.42592695355415344, + "learning_rate": 9.734184969712934e-06, + "loss": 1.7488, + "step": 4432 + }, + { + "epoch": 0.8051399641292256, + "grad_norm": 0.33315160870552063, + "learning_rate": 9.71666115118644e-06, + "loss": 1.6693, + "step": 4433 + }, + { + "epoch": 0.8053215883034032, + "grad_norm": 0.590226411819458, + "learning_rate": 9.69915142259723e-06, + "loss": 1.736, + "step": 4434 + }, + { + "epoch": 0.8055032124775807, + "grad_norm": 0.3399716913700104, + "learning_rate": 9.681655790069666e-06, + "loss": 1.6184, + "step": 4435 + }, + { + "epoch": 0.8056848366517584, + "grad_norm": 1.8442965745925903, + "learning_rate": 9.664174259723241e-06, + "loss": 1.7177, + "step": 4436 + }, + { + "epoch": 0.8058664608259359, + "grad_norm": 0.4820113480091095, + "learning_rate": 9.646706837672447e-06, + "loss": 1.6801, + "step": 4437 + }, + { + "epoch": 0.8060480850001135, + "grad_norm": 0.47913283109664917, + "learning_rate": 9.629253530026915e-06, + "loss": 1.601, + "step": 4438 + }, + { + "epoch": 0.806229709174291, + "grad_norm": 0.4816998839378357, + "learning_rate": 9.61181434289129e-06, + "loss": 1.669, + "step": 4439 + }, + { + "epoch": 0.8064113333484687, + "grad_norm": 0.39520880579948425, + "learning_rate": 9.59438928236529e-06, + "loss": 1.64, + "step": 4440 + }, + { + "epoch": 0.8065929575226463, + "grad_norm": 0.4566781222820282, + "learning_rate": 9.57697835454367e-06, + "loss": 1.8133, + "step": 4441 + }, + { + "epoch": 0.8067745816968238, + "grad_norm": 1.4188544750213623, + "learning_rate": 9.559581565516296e-06, + "loss": 1.6027, + "step": 4442 + }, + { + "epoch": 0.8069562058710015, + "grad_norm": 0.3312291204929352, + "learning_rate": 9.54219892136805e-06, + "loss": 1.8243, + "step": 4443 + }, + { + "epoch": 0.807137830045179, + "grad_norm": 0.5583207607269287, + "learning_rate": 9.524830428178883e-06, + "loss": 1.6753, + "step": 4444 + }, + { + "epoch": 0.8073194542193566, + "grad_norm": 0.42123767733573914, + "learning_rate": 9.507476092023771e-06, + "loss": 1.6777, + "step": 4445 + }, + { + "epoch": 0.8075010783935341, + "grad_norm": 0.49007120728492737, + "learning_rate": 9.490135918972743e-06, + "loss": 1.5762, + "step": 4446 + }, + { + "epoch": 0.8076827025677118, + "grad_norm": 0.3137439787387848, + "learning_rate": 9.47280991509092e-06, + "loss": 1.8879, + "step": 4447 + }, + { + "epoch": 0.8078643267418893, + "grad_norm": 0.41184550523757935, + "learning_rate": 9.455498086438402e-06, + "loss": 1.8462, + "step": 4448 + }, + { + "epoch": 0.8080459509160669, + "grad_norm": 0.8131788969039917, + "learning_rate": 9.438200439070388e-06, + "loss": 1.785, + "step": 4449 + }, + { + "epoch": 0.8082275750902446, + "grad_norm": 0.60750412940979, + "learning_rate": 9.420916979037081e-06, + "loss": 1.8536, + "step": 4450 + }, + { + "epoch": 0.8084091992644221, + "grad_norm": 1.920356035232544, + "learning_rate": 9.403647712383712e-06, + "loss": 1.8773, + "step": 4451 + }, + { + "epoch": 0.8085908234385997, + "grad_norm": 0.4496820271015167, + "learning_rate": 9.3863926451506e-06, + "loss": 1.9638, + "step": 4452 + }, + { + "epoch": 0.8087724476127772, + "grad_norm": 0.43319326639175415, + "learning_rate": 9.369151783373032e-06, + "loss": 1.8234, + "step": 4453 + }, + { + "epoch": 0.8089540717869549, + "grad_norm": 0.4602745771408081, + "learning_rate": 9.351925133081391e-06, + "loss": 1.6582, + "step": 4454 + }, + { + "epoch": 0.8091356959611324, + "grad_norm": 0.4104689061641693, + "learning_rate": 9.334712700301023e-06, + "loss": 1.7961, + "step": 4455 + }, + { + "epoch": 0.80931732013531, + "grad_norm": 0.36866065859794617, + "learning_rate": 9.31751449105237e-06, + "loss": 1.5866, + "step": 4456 + }, + { + "epoch": 0.8094989443094875, + "grad_norm": 0.3532170057296753, + "learning_rate": 9.300330511350841e-06, + "loss": 1.5545, + "step": 4457 + }, + { + "epoch": 0.8096805684836652, + "grad_norm": 0.3832783102989197, + "learning_rate": 9.283160767206906e-06, + "loss": 1.7301, + "step": 4458 + }, + { + "epoch": 0.8098621926578428, + "grad_norm": 0.502297043800354, + "learning_rate": 9.26600526462601e-06, + "loss": 1.9426, + "step": 4459 + }, + { + "epoch": 0.8100438168320203, + "grad_norm": 0.4047532081604004, + "learning_rate": 9.248864009608671e-06, + "loss": 1.7371, + "step": 4460 + }, + { + "epoch": 0.810225441006198, + "grad_norm": 0.4234907627105713, + "learning_rate": 9.231737008150415e-06, + "loss": 1.6497, + "step": 4461 + }, + { + "epoch": 0.8104070651803755, + "grad_norm": 0.38775715231895447, + "learning_rate": 9.214624266241744e-06, + "loss": 1.6976, + "step": 4462 + }, + { + "epoch": 0.8105886893545531, + "grad_norm": 0.34686675667762756, + "learning_rate": 9.197525789868211e-06, + "loss": 1.7712, + "step": 4463 + }, + { + "epoch": 0.8107703135287306, + "grad_norm": 0.4441828727722168, + "learning_rate": 9.18044158501034e-06, + "loss": 1.6533, + "step": 4464 + }, + { + "epoch": 0.8109519377029083, + "grad_norm": 0.4412269592285156, + "learning_rate": 9.163371657643716e-06, + "loss": 1.6714, + "step": 4465 + }, + { + "epoch": 0.8111335618770859, + "grad_norm": 0.7936723828315735, + "learning_rate": 9.146316013738876e-06, + "loss": 1.8763, + "step": 4466 + }, + { + "epoch": 0.8113151860512634, + "grad_norm": 0.6511148810386658, + "learning_rate": 9.129274659261416e-06, + "loss": 1.847, + "step": 4467 + }, + { + "epoch": 0.811496810225441, + "grad_norm": 0.9829946756362915, + "learning_rate": 9.11224760017188e-06, + "loss": 1.717, + "step": 4468 + }, + { + "epoch": 0.8116784343996186, + "grad_norm": 0.3161734342575073, + "learning_rate": 9.095234842425854e-06, + "loss": 1.6431, + "step": 4469 + }, + { + "epoch": 0.8118600585737962, + "grad_norm": 0.39895156025886536, + "learning_rate": 9.078236391973899e-06, + "loss": 1.7324, + "step": 4470 + }, + { + "epoch": 0.8120416827479737, + "grad_norm": 0.39854562282562256, + "learning_rate": 9.061252254761576e-06, + "loss": 1.6501, + "step": 4471 + }, + { + "epoch": 0.8122233069221514, + "grad_norm": 0.5556802749633789, + "learning_rate": 9.044282436729429e-06, + "loss": 1.7953, + "step": 4472 + }, + { + "epoch": 0.8124049310963289, + "grad_norm": 0.3907518982887268, + "learning_rate": 9.027326943813014e-06, + "loss": 1.7312, + "step": 4473 + }, + { + "epoch": 0.8125865552705065, + "grad_norm": 0.6525285243988037, + "learning_rate": 9.010385781942887e-06, + "loss": 1.7417, + "step": 4474 + }, + { + "epoch": 0.812768179444684, + "grad_norm": 0.6970868706703186, + "learning_rate": 8.993458957044554e-06, + "loss": 1.729, + "step": 4475 + }, + { + "epoch": 0.8129498036188617, + "grad_norm": 0.34302520751953125, + "learning_rate": 8.976546475038527e-06, + "loss": 1.6576, + "step": 4476 + }, + { + "epoch": 0.8131314277930393, + "grad_norm": 0.4179999530315399, + "learning_rate": 8.959648341840283e-06, + "loss": 1.7959, + "step": 4477 + }, + { + "epoch": 0.8133130519672168, + "grad_norm": 0.40484970808029175, + "learning_rate": 8.942764563360329e-06, + "loss": 1.6875, + "step": 4478 + }, + { + "epoch": 0.8134946761413944, + "grad_norm": 0.8769966959953308, + "learning_rate": 8.925895145504087e-06, + "loss": 1.7627, + "step": 4479 + }, + { + "epoch": 0.813676300315572, + "grad_norm": 0.3794907331466675, + "learning_rate": 8.909040094172017e-06, + "loss": 1.7799, + "step": 4480 + }, + { + "epoch": 0.8138579244897496, + "grad_norm": 0.8722105026245117, + "learning_rate": 8.8921994152595e-06, + "loss": 1.6157, + "step": 4481 + }, + { + "epoch": 0.8140395486639271, + "grad_norm": 0.9528540968894958, + "learning_rate": 8.875373114656932e-06, + "loss": 1.9013, + "step": 4482 + }, + { + "epoch": 0.8142211728381048, + "grad_norm": 0.41487956047058105, + "learning_rate": 8.858561198249654e-06, + "loss": 1.6944, + "step": 4483 + }, + { + "epoch": 0.8144027970122824, + "grad_norm": 1.4292429685592651, + "learning_rate": 8.841763671917973e-06, + "loss": 1.8216, + "step": 4484 + }, + { + "epoch": 0.8145844211864599, + "grad_norm": 0.3294813632965088, + "learning_rate": 8.824980541537187e-06, + "loss": 1.6859, + "step": 4485 + }, + { + "epoch": 0.8147660453606375, + "grad_norm": 0.40697917342185974, + "learning_rate": 8.808211812977552e-06, + "loss": 1.76, + "step": 4486 + }, + { + "epoch": 0.8149476695348151, + "grad_norm": 0.3332662880420685, + "learning_rate": 8.791457492104277e-06, + "loss": 1.6008, + "step": 4487 + }, + { + "epoch": 0.8151292937089927, + "grad_norm": 0.47830700874328613, + "learning_rate": 8.77471758477753e-06, + "loss": 1.7972, + "step": 4488 + }, + { + "epoch": 0.8153109178831702, + "grad_norm": 0.32696768641471863, + "learning_rate": 8.75799209685244e-06, + "loss": 1.7273, + "step": 4489 + }, + { + "epoch": 0.8154925420573478, + "grad_norm": 0.557127058506012, + "learning_rate": 8.741281034179082e-06, + "loss": 1.8354, + "step": 4490 + }, + { + "epoch": 0.8156741662315254, + "grad_norm": 0.43133386969566345, + "learning_rate": 8.724584402602521e-06, + "loss": 1.6597, + "step": 4491 + }, + { + "epoch": 0.815855790405703, + "grad_norm": 0.7965632677078247, + "learning_rate": 8.707902207962754e-06, + "loss": 1.6835, + "step": 4492 + }, + { + "epoch": 0.8160374145798805, + "grad_norm": 0.42121127247810364, + "learning_rate": 8.691234456094716e-06, + "loss": 1.8131, + "step": 4493 + }, + { + "epoch": 0.8162190387540582, + "grad_norm": 0.44624435901641846, + "learning_rate": 8.674581152828293e-06, + "loss": 1.5947, + "step": 4494 + }, + { + "epoch": 0.8164006629282358, + "grad_norm": 0.40528181195259094, + "learning_rate": 8.657942303988343e-06, + "loss": 1.763, + "step": 4495 + }, + { + "epoch": 0.8165822871024133, + "grad_norm": 0.3385777175426483, + "learning_rate": 8.641317915394637e-06, + "loss": 1.8647, + "step": 4496 + }, + { + "epoch": 0.8167639112765909, + "grad_norm": 0.4479692578315735, + "learning_rate": 8.624707992861897e-06, + "loss": 1.7855, + "step": 4497 + }, + { + "epoch": 0.8169455354507685, + "grad_norm": 0.6150509715080261, + "learning_rate": 8.608112542199809e-06, + "loss": 1.8532, + "step": 4498 + }, + { + "epoch": 0.8171271596249461, + "grad_norm": 0.39260217547416687, + "learning_rate": 8.591531569212958e-06, + "loss": 1.6679, + "step": 4499 + }, + { + "epoch": 0.8173087837991236, + "grad_norm": 0.40654000639915466, + "learning_rate": 8.574965079700897e-06, + "loss": 1.7897, + "step": 4500 + }, + { + "epoch": 0.8174904079733012, + "grad_norm": 0.3651551306247711, + "learning_rate": 8.558413079458106e-06, + "loss": 1.8651, + "step": 4501 + }, + { + "epoch": 0.8176720321474789, + "grad_norm": 0.3985881805419922, + "learning_rate": 8.541875574273978e-06, + "loss": 1.95, + "step": 4502 + }, + { + "epoch": 0.8178536563216564, + "grad_norm": 0.4375286400318146, + "learning_rate": 8.525352569932842e-06, + "loss": 1.6703, + "step": 4503 + }, + { + "epoch": 0.818035280495834, + "grad_norm": 0.42005816102027893, + "learning_rate": 8.508844072213978e-06, + "loss": 1.6617, + "step": 4504 + }, + { + "epoch": 0.8182169046700116, + "grad_norm": 0.357739120721817, + "learning_rate": 8.492350086891588e-06, + "loss": 1.8258, + "step": 4505 + }, + { + "epoch": 0.8183985288441892, + "grad_norm": 0.6357346177101135, + "learning_rate": 8.475870619734777e-06, + "loss": 1.6941, + "step": 4506 + }, + { + "epoch": 0.8185801530183667, + "grad_norm": 0.5118164420127869, + "learning_rate": 8.459405676507559e-06, + "loss": 1.6308, + "step": 4507 + }, + { + "epoch": 0.8187617771925443, + "grad_norm": 1.8529053926467896, + "learning_rate": 8.442955262968933e-06, + "loss": 1.5972, + "step": 4508 + }, + { + "epoch": 0.818943401366722, + "grad_norm": 0.43778860569000244, + "learning_rate": 8.426519384872733e-06, + "loss": 1.719, + "step": 4509 + }, + { + "epoch": 0.8191250255408995, + "grad_norm": 0.5573384761810303, + "learning_rate": 8.410098047967785e-06, + "loss": 1.7067, + "step": 4510 + }, + { + "epoch": 0.819306649715077, + "grad_norm": 0.549683153629303, + "learning_rate": 8.393691257997782e-06, + "loss": 1.8619, + "step": 4511 + }, + { + "epoch": 0.8194882738892547, + "grad_norm": 0.36581897735595703, + "learning_rate": 8.377299020701318e-06, + "loss": 1.5251, + "step": 4512 + }, + { + "epoch": 0.8196698980634323, + "grad_norm": 0.613357663154602, + "learning_rate": 8.360921341811956e-06, + "loss": 1.7129, + "step": 4513 + }, + { + "epoch": 0.8198515222376098, + "grad_norm": 0.40676620602607727, + "learning_rate": 8.344558227058108e-06, + "loss": 1.9957, + "step": 4514 + }, + { + "epoch": 0.8200331464117874, + "grad_norm": 0.3401442766189575, + "learning_rate": 8.328209682163113e-06, + "loss": 1.6199, + "step": 4515 + }, + { + "epoch": 0.820214770585965, + "grad_norm": 0.31169605255126953, + "learning_rate": 8.311875712845218e-06, + "loss": 1.7676, + "step": 4516 + }, + { + "epoch": 0.8203963947601426, + "grad_norm": 1.6254545450210571, + "learning_rate": 8.295556324817588e-06, + "loss": 1.7806, + "step": 4517 + }, + { + "epoch": 0.8205780189343201, + "grad_norm": 0.403364360332489, + "learning_rate": 8.27925152378825e-06, + "loss": 1.7474, + "step": 4518 + }, + { + "epoch": 0.8207596431084977, + "grad_norm": 0.49392643570899963, + "learning_rate": 8.262961315460156e-06, + "loss": 1.6856, + "step": 4519 + }, + { + "epoch": 0.8209412672826754, + "grad_norm": 0.36520007252693176, + "learning_rate": 8.246685705531127e-06, + "loss": 1.5321, + "step": 4520 + }, + { + "epoch": 0.8211228914568529, + "grad_norm": 0.45114800333976746, + "learning_rate": 8.230424699693923e-06, + "loss": 1.4625, + "step": 4521 + }, + { + "epoch": 0.8213045156310305, + "grad_norm": 0.3903229236602783, + "learning_rate": 8.214178303636144e-06, + "loss": 1.8749, + "step": 4522 + }, + { + "epoch": 0.8214861398052081, + "grad_norm": 0.4191426634788513, + "learning_rate": 8.19794652304034e-06, + "loss": 1.6966, + "step": 4523 + }, + { + "epoch": 0.8216677639793857, + "grad_norm": 0.6068341732025146, + "learning_rate": 8.181729363583884e-06, + "loss": 1.6301, + "step": 4524 + }, + { + "epoch": 0.8218493881535632, + "grad_norm": 0.4025677442550659, + "learning_rate": 8.165526830939068e-06, + "loss": 1.6426, + "step": 4525 + }, + { + "epoch": 0.8220310123277408, + "grad_norm": 0.4344487488269806, + "learning_rate": 8.149338930773087e-06, + "loss": 1.5544, + "step": 4526 + }, + { + "epoch": 0.8222126365019184, + "grad_norm": 0.38597023487091064, + "learning_rate": 8.133165668747983e-06, + "loss": 1.5812, + "step": 4527 + }, + { + "epoch": 0.822394260676096, + "grad_norm": 0.3508085608482361, + "learning_rate": 8.11700705052068e-06, + "loss": 1.7431, + "step": 4528 + }, + { + "epoch": 0.8225758848502736, + "grad_norm": 0.3742254376411438, + "learning_rate": 8.100863081742999e-06, + "loss": 1.683, + "step": 4529 + }, + { + "epoch": 0.8227575090244511, + "grad_norm": 0.3079211115837097, + "learning_rate": 8.084733768061647e-06, + "loss": 1.4651, + "step": 4530 + }, + { + "epoch": 0.8229391331986288, + "grad_norm": 0.39941713213920593, + "learning_rate": 8.068619115118176e-06, + "loss": 1.7281, + "step": 4531 + }, + { + "epoch": 0.8231207573728063, + "grad_norm": 0.4476274251937866, + "learning_rate": 8.052519128549013e-06, + "loss": 1.763, + "step": 4532 + }, + { + "epoch": 0.8233023815469839, + "grad_norm": 0.3590467870235443, + "learning_rate": 8.036433813985478e-06, + "loss": 1.7607, + "step": 4533 + }, + { + "epoch": 0.8234840057211615, + "grad_norm": 0.37985730171203613, + "learning_rate": 8.020363177053719e-06, + "loss": 1.9247, + "step": 4534 + }, + { + "epoch": 0.8236656298953391, + "grad_norm": 0.3273206651210785, + "learning_rate": 8.004307223374797e-06, + "loss": 1.4634, + "step": 4535 + }, + { + "epoch": 0.8238472540695166, + "grad_norm": 0.5853471755981445, + "learning_rate": 7.988265958564629e-06, + "loss": 1.8271, + "step": 4536 + }, + { + "epoch": 0.8240288782436942, + "grad_norm": 0.42051878571510315, + "learning_rate": 7.97223938823396e-06, + "loss": 1.9087, + "step": 4537 + }, + { + "epoch": 0.8242105024178719, + "grad_norm": 0.4234544634819031, + "learning_rate": 7.956227517988412e-06, + "loss": 1.8079, + "step": 4538 + }, + { + "epoch": 0.8243921265920494, + "grad_norm": 0.3352257311344147, + "learning_rate": 7.940230353428501e-06, + "loss": 1.6069, + "step": 4539 + }, + { + "epoch": 0.824573750766227, + "grad_norm": 0.40788036584854126, + "learning_rate": 7.924247900149534e-06, + "loss": 1.7011, + "step": 4540 + }, + { + "epoch": 0.8247553749404045, + "grad_norm": 0.3859773874282837, + "learning_rate": 7.908280163741732e-06, + "loss": 1.8087, + "step": 4541 + }, + { + "epoch": 0.8249369991145822, + "grad_norm": 0.5238548517227173, + "learning_rate": 7.892327149790124e-06, + "loss": 1.6406, + "step": 4542 + }, + { + "epoch": 0.8251186232887597, + "grad_norm": 0.5224126577377319, + "learning_rate": 7.876388863874629e-06, + "loss": 1.7267, + "step": 4543 + }, + { + "epoch": 0.8253002474629373, + "grad_norm": 0.28484129905700684, + "learning_rate": 7.860465311569987e-06, + "loss": 1.7578, + "step": 4544 + }, + { + "epoch": 0.825481871637115, + "grad_norm": 0.4434871971607208, + "learning_rate": 7.844556498445788e-06, + "loss": 1.6759, + "step": 4545 + }, + { + "epoch": 0.8256634958112925, + "grad_norm": 0.9725404381752014, + "learning_rate": 7.828662430066464e-06, + "loss": 1.6306, + "step": 4546 + }, + { + "epoch": 0.82584511998547, + "grad_norm": 0.3530537188053131, + "learning_rate": 7.812783111991306e-06, + "loss": 1.619, + "step": 4547 + }, + { + "epoch": 0.8260267441596476, + "grad_norm": 0.5140106081962585, + "learning_rate": 7.796918549774445e-06, + "loss": 1.7248, + "step": 4548 + }, + { + "epoch": 0.8262083683338253, + "grad_norm": 0.7394374012947083, + "learning_rate": 7.781068748964831e-06, + "loss": 1.7684, + "step": 4549 + }, + { + "epoch": 0.8263899925080028, + "grad_norm": 1.5544350147247314, + "learning_rate": 7.765233715106273e-06, + "loss": 1.8209, + "step": 4550 + }, + { + "epoch": 0.8265716166821804, + "grad_norm": 0.483481228351593, + "learning_rate": 7.749413453737375e-06, + "loss": 1.8427, + "step": 4551 + }, + { + "epoch": 0.8267532408563579, + "grad_norm": 1.1530052423477173, + "learning_rate": 7.733607970391643e-06, + "loss": 1.8006, + "step": 4552 + }, + { + "epoch": 0.8269348650305356, + "grad_norm": 0.4193165600299835, + "learning_rate": 7.717817270597339e-06, + "loss": 1.6611, + "step": 4553 + }, + { + "epoch": 0.8271164892047131, + "grad_norm": 0.485752135515213, + "learning_rate": 7.702041359877615e-06, + "loss": 1.6685, + "step": 4554 + }, + { + "epoch": 0.8272981133788907, + "grad_norm": 1.1974376440048218, + "learning_rate": 7.68628024375041e-06, + "loss": 1.9664, + "step": 4555 + }, + { + "epoch": 0.8274797375530684, + "grad_norm": 0.4965997040271759, + "learning_rate": 7.670533927728513e-06, + "loss": 1.768, + "step": 4556 + }, + { + "epoch": 0.8276613617272459, + "grad_norm": 0.38078033924102783, + "learning_rate": 7.654802417319523e-06, + "loss": 1.7619, + "step": 4557 + }, + { + "epoch": 0.8278429859014235, + "grad_norm": 0.42181286215782166, + "learning_rate": 7.639085718025862e-06, + "loss": 1.6563, + "step": 4558 + }, + { + "epoch": 0.828024610075601, + "grad_norm": 0.3510996401309967, + "learning_rate": 7.623383835344761e-06, + "loss": 1.696, + "step": 4559 + }, + { + "epoch": 0.8282062342497787, + "grad_norm": 0.5339398384094238, + "learning_rate": 7.6076967747682915e-06, + "loss": 1.5834, + "step": 4560 + }, + { + "epoch": 0.8283878584239562, + "grad_norm": 0.848004937171936, + "learning_rate": 7.592024541783343e-06, + "loss": 1.6745, + "step": 4561 + }, + { + "epoch": 0.8285694825981338, + "grad_norm": 0.3263623118400574, + "learning_rate": 7.576367141871593e-06, + "loss": 1.7604, + "step": 4562 + }, + { + "epoch": 0.8287511067723113, + "grad_norm": 0.5536161065101624, + "learning_rate": 7.560724580509543e-06, + "loss": 1.8341, + "step": 4563 + }, + { + "epoch": 0.828932730946489, + "grad_norm": 0.46182650327682495, + "learning_rate": 7.545096863168494e-06, + "loss": 1.6378, + "step": 4564 + }, + { + "epoch": 0.8291143551206666, + "grad_norm": 0.34273186326026917, + "learning_rate": 7.529483995314585e-06, + "loss": 1.8972, + "step": 4565 + }, + { + "epoch": 0.8292959792948441, + "grad_norm": 0.5623989105224609, + "learning_rate": 7.513885982408725e-06, + "loss": 1.7228, + "step": 4566 + }, + { + "epoch": 0.8294776034690218, + "grad_norm": 0.3875284790992737, + "learning_rate": 7.498302829906667e-06, + "loss": 1.5788, + "step": 4567 + }, + { + "epoch": 0.8296592276431993, + "grad_norm": 0.45376652479171753, + "learning_rate": 7.482734543258918e-06, + "loss": 1.7221, + "step": 4568 + }, + { + "epoch": 0.8298408518173769, + "grad_norm": 0.46927034854888916, + "learning_rate": 7.467181127910832e-06, + "loss": 1.7749, + "step": 4569 + }, + { + "epoch": 0.8300224759915544, + "grad_norm": 0.394682914018631, + "learning_rate": 7.451642589302532e-06, + "loss": 1.821, + "step": 4570 + }, + { + "epoch": 0.8302041001657321, + "grad_norm": 0.3551117479801178, + "learning_rate": 7.43611893286893e-06, + "loss": 1.7533, + "step": 4571 + }, + { + "epoch": 0.8303857243399096, + "grad_norm": 1.1871610879898071, + "learning_rate": 7.420610164039776e-06, + "loss": 1.4986, + "step": 4572 + }, + { + "epoch": 0.8305673485140872, + "grad_norm": 0.34976187348365784, + "learning_rate": 7.405116288239561e-06, + "loss": 1.9051, + "step": 4573 + }, + { + "epoch": 0.8307489726882648, + "grad_norm": 0.4654274880886078, + "learning_rate": 7.38963731088761e-06, + "loss": 1.6833, + "step": 4574 + }, + { + "epoch": 0.8309305968624424, + "grad_norm": 0.7300633192062378, + "learning_rate": 7.374173237398013e-06, + "loss": 1.4802, + "step": 4575 + }, + { + "epoch": 0.83111222103662, + "grad_norm": 0.5527960062026978, + "learning_rate": 7.3587240731796454e-06, + "loss": 1.7693, + "step": 4576 + }, + { + "epoch": 0.8312938452107975, + "grad_norm": 0.34297844767570496, + "learning_rate": 7.343289823636168e-06, + "loss": 1.7357, + "step": 4577 + }, + { + "epoch": 0.8314754693849752, + "grad_norm": 0.4765230119228363, + "learning_rate": 7.327870494166039e-06, + "loss": 1.8868, + "step": 4578 + }, + { + "epoch": 0.8316570935591527, + "grad_norm": 0.6672186255455017, + "learning_rate": 7.312466090162506e-06, + "loss": 1.7913, + "step": 4579 + }, + { + "epoch": 0.8318387177333303, + "grad_norm": 0.44170916080474854, + "learning_rate": 7.297076617013565e-06, + "loss": 1.8425, + "step": 4580 + }, + { + "epoch": 0.8320203419075078, + "grad_norm": 0.359697163105011, + "learning_rate": 7.281702080102004e-06, + "loss": 1.5121, + "step": 4581 + }, + { + "epoch": 0.8322019660816855, + "grad_norm": 0.39276495575904846, + "learning_rate": 7.266342484805377e-06, + "loss": 1.7938, + "step": 4582 + }, + { + "epoch": 0.8323835902558631, + "grad_norm": 0.35208359360694885, + "learning_rate": 7.250997836496049e-06, + "loss": 1.7592, + "step": 4583 + }, + { + "epoch": 0.8325652144300406, + "grad_norm": 0.34802010655403137, + "learning_rate": 7.235668140541108e-06, + "loss": 1.7716, + "step": 4584 + }, + { + "epoch": 0.8327468386042183, + "grad_norm": 0.6458538174629211, + "learning_rate": 7.220353402302449e-06, + "loss": 1.9398, + "step": 4585 + }, + { + "epoch": 0.8329284627783958, + "grad_norm": 0.37030285596847534, + "learning_rate": 7.20505362713671e-06, + "loss": 1.7781, + "step": 4586 + }, + { + "epoch": 0.8331100869525734, + "grad_norm": 0.4258744716644287, + "learning_rate": 7.189768820395321e-06, + "loss": 1.8914, + "step": 4587 + }, + { + "epoch": 0.8332917111267509, + "grad_norm": 0.414620965719223, + "learning_rate": 7.174498987424449e-06, + "loss": 1.6697, + "step": 4588 + }, + { + "epoch": 0.8334733353009286, + "grad_norm": 0.42834851145744324, + "learning_rate": 7.159244133565046e-06, + "loss": 1.873, + "step": 4589 + }, + { + "epoch": 0.8336549594751061, + "grad_norm": 0.4060649871826172, + "learning_rate": 7.144004264152793e-06, + "loss": 1.6457, + "step": 4590 + }, + { + "epoch": 0.8338365836492837, + "grad_norm": 0.4087388515472412, + "learning_rate": 7.128779384518164e-06, + "loss": 1.9018, + "step": 4591 + }, + { + "epoch": 0.8340182078234613, + "grad_norm": 0.4235808551311493, + "learning_rate": 7.1135694999864e-06, + "loss": 1.6965, + "step": 4592 + }, + { + "epoch": 0.8341998319976389, + "grad_norm": 0.3724633455276489, + "learning_rate": 7.098374615877451e-06, + "loss": 1.8956, + "step": 4593 + }, + { + "epoch": 0.8343814561718165, + "grad_norm": 0.44105735421180725, + "learning_rate": 7.083194737506055e-06, + "loss": 1.7515, + "step": 4594 + }, + { + "epoch": 0.834563080345994, + "grad_norm": 0.3442532420158386, + "learning_rate": 7.068029870181669e-06, + "loss": 1.888, + "step": 4595 + }, + { + "epoch": 0.8347447045201717, + "grad_norm": 0.3498273193836212, + "learning_rate": 7.052880019208541e-06, + "loss": 1.6597, + "step": 4596 + }, + { + "epoch": 0.8349263286943492, + "grad_norm": 0.379044771194458, + "learning_rate": 7.0377451898856525e-06, + "loss": 1.6747, + "step": 4597 + }, + { + "epoch": 0.8351079528685268, + "grad_norm": 0.397169828414917, + "learning_rate": 7.022625387506721e-06, + "loss": 1.7862, + "step": 4598 + }, + { + "epoch": 0.8352895770427043, + "grad_norm": 1.8129520416259766, + "learning_rate": 7.007520617360197e-06, + "loss": 2.085, + "step": 4599 + }, + { + "epoch": 0.835471201216882, + "grad_norm": 0.42858970165252686, + "learning_rate": 6.9924308847293114e-06, + "loss": 1.7951, + "step": 4600 + }, + { + "epoch": 0.8356528253910596, + "grad_norm": 0.3910629153251648, + "learning_rate": 6.977356194891998e-06, + "loss": 1.9031, + "step": 4601 + }, + { + "epoch": 0.8358344495652371, + "grad_norm": 0.3858119547367096, + "learning_rate": 6.962296553120939e-06, + "loss": 1.6954, + "step": 4602 + }, + { + "epoch": 0.8360160737394147, + "grad_norm": 0.5473548769950867, + "learning_rate": 6.947251964683565e-06, + "loss": 1.9679, + "step": 4603 + }, + { + "epoch": 0.8361976979135923, + "grad_norm": 0.395009845495224, + "learning_rate": 6.932222434842051e-06, + "loss": 1.5785, + "step": 4604 + }, + { + "epoch": 0.8363793220877699, + "grad_norm": 0.3763134181499481, + "learning_rate": 6.917207968853268e-06, + "loss": 1.6612, + "step": 4605 + }, + { + "epoch": 0.8365609462619474, + "grad_norm": 0.7432315349578857, + "learning_rate": 6.9022085719688435e-06, + "loss": 1.6255, + "step": 4606 + }, + { + "epoch": 0.8367425704361251, + "grad_norm": 0.33347854018211365, + "learning_rate": 6.887224249435131e-06, + "loss": 1.5904, + "step": 4607 + }, + { + "epoch": 0.8369241946103027, + "grad_norm": 0.5026660561561584, + "learning_rate": 6.8722550064932e-06, + "loss": 1.7964, + "step": 4608 + }, + { + "epoch": 0.8371058187844802, + "grad_norm": 0.39120903611183167, + "learning_rate": 6.857300848378856e-06, + "loss": 1.7182, + "step": 4609 + }, + { + "epoch": 0.8372874429586578, + "grad_norm": 0.34710198640823364, + "learning_rate": 6.842361780322648e-06, + "loss": 1.6439, + "step": 4610 + }, + { + "epoch": 0.8374690671328354, + "grad_norm": 0.5175197720527649, + "learning_rate": 6.827437807549814e-06, + "loss": 1.7006, + "step": 4611 + }, + { + "epoch": 0.837650691307013, + "grad_norm": 0.3168351352214813, + "learning_rate": 6.812528935280304e-06, + "loss": 1.635, + "step": 4612 + }, + { + "epoch": 0.8378323154811905, + "grad_norm": 0.42101845145225525, + "learning_rate": 6.797635168728844e-06, + "loss": 1.7629, + "step": 4613 + }, + { + "epoch": 0.8380139396553681, + "grad_norm": 0.3649003505706787, + "learning_rate": 6.782756513104821e-06, + "loss": 1.7687, + "step": 4614 + }, + { + "epoch": 0.8381955638295457, + "grad_norm": 0.3926709294319153, + "learning_rate": 6.7678929736123445e-06, + "loss": 1.6839, + "step": 4615 + }, + { + "epoch": 0.8383771880037233, + "grad_norm": 0.6546918749809265, + "learning_rate": 6.753044555450266e-06, + "loss": 1.7735, + "step": 4616 + }, + { + "epoch": 0.8385588121779008, + "grad_norm": 0.40754860639572144, + "learning_rate": 6.738211263812111e-06, + "loss": 1.7522, + "step": 4617 + }, + { + "epoch": 0.8387404363520785, + "grad_norm": 0.37151867151260376, + "learning_rate": 6.72339310388615e-06, + "loss": 1.6825, + "step": 4618 + }, + { + "epoch": 0.8389220605262561, + "grad_norm": 0.3118463456630707, + "learning_rate": 6.708590080855337e-06, + "loss": 1.6518, + "step": 4619 + }, + { + "epoch": 0.8391036847004336, + "grad_norm": 0.4389057755470276, + "learning_rate": 6.693802199897337e-06, + "loss": 1.7454, + "step": 4620 + }, + { + "epoch": 0.8392853088746112, + "grad_norm": 0.5745437741279602, + "learning_rate": 6.679029466184506e-06, + "loss": 1.7043, + "step": 4621 + }, + { + "epoch": 0.8394669330487888, + "grad_norm": 0.5264360308647156, + "learning_rate": 6.664271884883932e-06, + "loss": 1.7927, + "step": 4622 + }, + { + "epoch": 0.8396485572229664, + "grad_norm": 0.4042648375034332, + "learning_rate": 6.6495294611573885e-06, + "loss": 1.6518, + "step": 4623 + }, + { + "epoch": 0.8398301813971439, + "grad_norm": 0.48894163966178894, + "learning_rate": 6.6348022001613445e-06, + "loss": 1.7546, + "step": 4624 + }, + { + "epoch": 0.8400118055713215, + "grad_norm": 0.8592407703399658, + "learning_rate": 6.62009010704695e-06, + "loss": 1.7916, + "step": 4625 + }, + { + "epoch": 0.8401934297454992, + "grad_norm": 0.34292635321617126, + "learning_rate": 6.605393186960085e-06, + "loss": 1.7503, + "step": 4626 + }, + { + "epoch": 0.8403750539196767, + "grad_norm": 0.928813636302948, + "learning_rate": 6.590711445041286e-06, + "loss": 1.906, + "step": 4627 + }, + { + "epoch": 0.8405566780938543, + "grad_norm": 0.551295280456543, + "learning_rate": 6.576044886425825e-06, + "loss": 1.7994, + "step": 4628 + }, + { + "epoch": 0.8407383022680319, + "grad_norm": 0.38426095247268677, + "learning_rate": 6.561393516243619e-06, + "loss": 1.5606, + "step": 4629 + }, + { + "epoch": 0.8409199264422095, + "grad_norm": 0.44166868925094604, + "learning_rate": 6.546757339619275e-06, + "loss": 1.7789, + "step": 4630 + }, + { + "epoch": 0.841101550616387, + "grad_norm": 0.3755934238433838, + "learning_rate": 6.5321363616721306e-06, + "loss": 1.7556, + "step": 4631 + }, + { + "epoch": 0.8412831747905646, + "grad_norm": 0.36526620388031006, + "learning_rate": 6.517530587516163e-06, + "loss": 1.8313, + "step": 4632 + }, + { + "epoch": 0.8414647989647422, + "grad_norm": 0.519339382648468, + "learning_rate": 6.50294002226004e-06, + "loss": 1.6142, + "step": 4633 + }, + { + "epoch": 0.8416464231389198, + "grad_norm": 0.45061933994293213, + "learning_rate": 6.488364671007119e-06, + "loss": 1.7728, + "step": 4634 + }, + { + "epoch": 0.8418280473130973, + "grad_norm": 0.7178833484649658, + "learning_rate": 6.473804538855449e-06, + "loss": 1.8872, + "step": 4635 + }, + { + "epoch": 0.8420096714872749, + "grad_norm": 0.4222225844860077, + "learning_rate": 6.459259630897729e-06, + "loss": 1.6051, + "step": 4636 + }, + { + "epoch": 0.8421912956614526, + "grad_norm": 0.3886779248714447, + "learning_rate": 6.44472995222134e-06, + "loss": 1.7687, + "step": 4637 + }, + { + "epoch": 0.8423729198356301, + "grad_norm": 0.5618228912353516, + "learning_rate": 6.430215507908332e-06, + "loss": 1.7551, + "step": 4638 + }, + { + "epoch": 0.8425545440098077, + "grad_norm": 0.4222123622894287, + "learning_rate": 6.4157163030354515e-06, + "loss": 2.0139, + "step": 4639 + }, + { + "epoch": 0.8427361681839853, + "grad_norm": 0.33495426177978516, + "learning_rate": 6.401232342674085e-06, + "loss": 1.7348, + "step": 4640 + }, + { + "epoch": 0.8429177923581629, + "grad_norm": 0.4118056893348694, + "learning_rate": 6.386763631890313e-06, + "loss": 1.715, + "step": 4641 + }, + { + "epoch": 0.8430994165323404, + "grad_norm": 0.4981967806816101, + "learning_rate": 6.372310175744861e-06, + "loss": 1.8468, + "step": 4642 + }, + { + "epoch": 0.843281040706518, + "grad_norm": 0.4465429186820984, + "learning_rate": 6.357871979293117e-06, + "loss": 1.6013, + "step": 4643 + }, + { + "epoch": 0.8434626648806957, + "grad_norm": 0.31328389048576355, + "learning_rate": 6.343449047585159e-06, + "loss": 1.8428, + "step": 4644 + }, + { + "epoch": 0.8436442890548732, + "grad_norm": 0.4178454875946045, + "learning_rate": 6.329041385665696e-06, + "loss": 1.7719, + "step": 4645 + }, + { + "epoch": 0.8438259132290508, + "grad_norm": 1.8917529582977295, + "learning_rate": 6.314648998574108e-06, + "loss": 1.9143, + "step": 4646 + }, + { + "epoch": 0.8440075374032284, + "grad_norm": 0.48078858852386475, + "learning_rate": 6.3002718913444284e-06, + "loss": 1.8361, + "step": 4647 + }, + { + "epoch": 0.844189161577406, + "grad_norm": 0.6122763752937317, + "learning_rate": 6.285910069005369e-06, + "loss": 1.9378, + "step": 4648 + }, + { + "epoch": 0.8443707857515835, + "grad_norm": 0.30317631363868713, + "learning_rate": 6.271563536580266e-06, + "loss": 1.5995, + "step": 4649 + }, + { + "epoch": 0.8445524099257611, + "grad_norm": 0.37600278854370117, + "learning_rate": 6.257232299087118e-06, + "loss": 1.7643, + "step": 4650 + }, + { + "epoch": 0.8447340340999387, + "grad_norm": 0.303162157535553, + "learning_rate": 6.242916361538559e-06, + "loss": 1.7604, + "step": 4651 + }, + { + "epoch": 0.8449156582741163, + "grad_norm": 0.5965837836265564, + "learning_rate": 6.228615728941917e-06, + "loss": 2.1341, + "step": 4652 + }, + { + "epoch": 0.8450972824482939, + "grad_norm": 0.3276098668575287, + "learning_rate": 6.214330406299101e-06, + "loss": 1.7242, + "step": 4653 + }, + { + "epoch": 0.8452789066224714, + "grad_norm": 0.374006062746048, + "learning_rate": 6.200060398606733e-06, + "loss": 1.627, + "step": 4654 + }, + { + "epoch": 0.8454605307966491, + "grad_norm": 0.4520247280597687, + "learning_rate": 6.18580571085603e-06, + "loss": 1.7737, + "step": 4655 + }, + { + "epoch": 0.8456421549708266, + "grad_norm": 0.521427571773529, + "learning_rate": 6.171566348032859e-06, + "loss": 1.8031, + "step": 4656 + }, + { + "epoch": 0.8458237791450042, + "grad_norm": 0.27801281213760376, + "learning_rate": 6.1573423151177534e-06, + "loss": 1.6442, + "step": 4657 + }, + { + "epoch": 0.8460054033191818, + "grad_norm": 0.3354121744632721, + "learning_rate": 6.143133617085839e-06, + "loss": 1.8719, + "step": 4658 + }, + { + "epoch": 0.8461870274933594, + "grad_norm": 0.4458027184009552, + "learning_rate": 6.128940258906934e-06, + "loss": 1.687, + "step": 4659 + }, + { + "epoch": 0.8463686516675369, + "grad_norm": 0.3684740364551544, + "learning_rate": 6.11476224554543e-06, + "loss": 1.6156, + "step": 4660 + }, + { + "epoch": 0.8465502758417145, + "grad_norm": 0.3686135709285736, + "learning_rate": 6.100599581960415e-06, + "loss": 1.7754, + "step": 4661 + }, + { + "epoch": 0.8467319000158922, + "grad_norm": 0.47751253843307495, + "learning_rate": 6.086452273105558e-06, + "loss": 1.8846, + "step": 4662 + }, + { + "epoch": 0.8469135241900697, + "grad_norm": 0.3908393383026123, + "learning_rate": 6.072320323929176e-06, + "loss": 1.5964, + "step": 4663 + }, + { + "epoch": 0.8470951483642473, + "grad_norm": 0.4551343023777008, + "learning_rate": 6.058203739374202e-06, + "loss": 1.9462, + "step": 4664 + }, + { + "epoch": 0.8472767725384248, + "grad_norm": 0.5249073505401611, + "learning_rate": 6.04410252437822e-06, + "loss": 1.8134, + "step": 4665 + }, + { + "epoch": 0.8474583967126025, + "grad_norm": 0.6978768706321716, + "learning_rate": 6.030016683873429e-06, + "loss": 1.7427, + "step": 4666 + }, + { + "epoch": 0.84764002088678, + "grad_norm": 0.42747312784194946, + "learning_rate": 6.015946222786639e-06, + "loss": 1.6283, + "step": 4667 + }, + { + "epoch": 0.8478216450609576, + "grad_norm": 0.39025259017944336, + "learning_rate": 6.0018911460392845e-06, + "loss": 1.8063, + "step": 4668 + }, + { + "epoch": 0.8480032692351352, + "grad_norm": 0.7290012240409851, + "learning_rate": 5.987851458547411e-06, + "loss": 1.6642, + "step": 4669 + }, + { + "epoch": 0.8481848934093128, + "grad_norm": 0.6381733417510986, + "learning_rate": 5.973827165221718e-06, + "loss": 1.7383, + "step": 4670 + }, + { + "epoch": 0.8483665175834904, + "grad_norm": 0.43319493532180786, + "learning_rate": 5.9598182709674655e-06, + "loss": 1.6116, + "step": 4671 + }, + { + "epoch": 0.8485481417576679, + "grad_norm": 0.38457825779914856, + "learning_rate": 5.9458247806845775e-06, + "loss": 1.6598, + "step": 4672 + }, + { + "epoch": 0.8487297659318456, + "grad_norm": 0.32082483172416687, + "learning_rate": 5.931846699267557e-06, + "loss": 1.5727, + "step": 4673 + }, + { + "epoch": 0.8489113901060231, + "grad_norm": 0.4609431028366089, + "learning_rate": 5.917884031605536e-06, + "loss": 1.5498, + "step": 4674 + }, + { + "epoch": 0.8490930142802007, + "grad_norm": 0.42315179109573364, + "learning_rate": 5.903936782582253e-06, + "loss": 1.7364, + "step": 4675 + }, + { + "epoch": 0.8492746384543782, + "grad_norm": 0.3471173942089081, + "learning_rate": 5.8900049570760394e-06, + "loss": 1.5025, + "step": 4676 + }, + { + "epoch": 0.8494562626285559, + "grad_norm": 0.42376708984375, + "learning_rate": 5.876088559959836e-06, + "loss": 1.9111, + "step": 4677 + }, + { + "epoch": 0.8496378868027334, + "grad_norm": 0.3407013416290283, + "learning_rate": 5.862187596101198e-06, + "loss": 1.6217, + "step": 4678 + }, + { + "epoch": 0.849819510976911, + "grad_norm": 0.36414921283721924, + "learning_rate": 5.8483020703622934e-06, + "loss": 1.7871, + "step": 4679 + }, + { + "epoch": 0.8500011351510887, + "grad_norm": 0.5999419689178467, + "learning_rate": 5.834431987599859e-06, + "loss": 1.4052, + "step": 4680 + }, + { + "epoch": 0.8501827593252662, + "grad_norm": 0.3875734210014343, + "learning_rate": 5.820577352665252e-06, + "loss": 1.7629, + "step": 4681 + }, + { + "epoch": 0.8503643834994438, + "grad_norm": 0.41001221537590027, + "learning_rate": 5.806738170404396e-06, + "loss": 1.7495, + "step": 4682 + }, + { + "epoch": 0.8505460076736213, + "grad_norm": 1.5595526695251465, + "learning_rate": 5.79291444565786e-06, + "loss": 1.7264, + "step": 4683 + }, + { + "epoch": 0.850727631847799, + "grad_norm": 1.2196030616760254, + "learning_rate": 5.77910618326078e-06, + "loss": 1.8338, + "step": 4684 + }, + { + "epoch": 0.8509092560219765, + "grad_norm": 0.4035707414150238, + "learning_rate": 5.7653133880428755e-06, + "loss": 1.793, + "step": 4685 + }, + { + "epoch": 0.8510908801961541, + "grad_norm": 0.3707543611526489, + "learning_rate": 5.751536064828455e-06, + "loss": 1.7103, + "step": 4686 + }, + { + "epoch": 0.8512725043703316, + "grad_norm": 0.38721996545791626, + "learning_rate": 5.737774218436443e-06, + "loss": 1.8461, + "step": 4687 + }, + { + "epoch": 0.8514541285445093, + "grad_norm": 0.43750235438346863, + "learning_rate": 5.724027853680325e-06, + "loss": 1.6506, + "step": 4688 + }, + { + "epoch": 0.8516357527186869, + "grad_norm": 0.4024195969104767, + "learning_rate": 5.710296975368162e-06, + "loss": 1.6395, + "step": 4689 + }, + { + "epoch": 0.8518173768928644, + "grad_norm": 0.4064927101135254, + "learning_rate": 5.696581588302641e-06, + "loss": 1.79, + "step": 4690 + }, + { + "epoch": 0.8519990010670421, + "grad_norm": 0.3773770034313202, + "learning_rate": 5.682881697280984e-06, + "loss": 1.6532, + "step": 4691 + }, + { + "epoch": 0.8521806252412196, + "grad_norm": 0.3433830440044403, + "learning_rate": 5.669197307095031e-06, + "loss": 1.784, + "step": 4692 + }, + { + "epoch": 0.8523622494153972, + "grad_norm": 0.39828646183013916, + "learning_rate": 5.6555284225311755e-06, + "loss": 1.6075, + "step": 4693 + }, + { + "epoch": 0.8525438735895747, + "grad_norm": 0.9382124543190002, + "learning_rate": 5.641875048370393e-06, + "loss": 1.8571, + "step": 4694 + }, + { + "epoch": 0.8527254977637524, + "grad_norm": 0.4408373236656189, + "learning_rate": 5.628237189388225e-06, + "loss": 1.788, + "step": 4695 + }, + { + "epoch": 0.85290712193793, + "grad_norm": 0.33402684330940247, + "learning_rate": 5.614614850354805e-06, + "loss": 1.6198, + "step": 4696 + }, + { + "epoch": 0.8530887461121075, + "grad_norm": 0.36351513862609863, + "learning_rate": 5.601008036034844e-06, + "loss": 1.6475, + "step": 4697 + }, + { + "epoch": 0.853270370286285, + "grad_norm": 0.3783824145793915, + "learning_rate": 5.587416751187596e-06, + "loss": 1.5808, + "step": 4698 + }, + { + "epoch": 0.8534519944604627, + "grad_norm": 0.43725448846817017, + "learning_rate": 5.573841000566898e-06, + "loss": 1.781, + "step": 4699 + }, + { + "epoch": 0.8536336186346403, + "grad_norm": 0.6768486499786377, + "learning_rate": 5.560280788921146e-06, + "loss": 1.785, + "step": 4700 + }, + { + "epoch": 0.8538152428088178, + "grad_norm": 0.3926435112953186, + "learning_rate": 5.546736120993318e-06, + "loss": 1.8018, + "step": 4701 + }, + { + "epoch": 0.8539968669829955, + "grad_norm": 0.34480172395706177, + "learning_rate": 5.533207001520924e-06, + "loss": 1.6587, + "step": 4702 + }, + { + "epoch": 0.854178491157173, + "grad_norm": 0.3835197687149048, + "learning_rate": 5.519693435236084e-06, + "loss": 1.6765, + "step": 4703 + }, + { + "epoch": 0.8543601153313506, + "grad_norm": 0.464915931224823, + "learning_rate": 5.506195426865424e-06, + "loss": 1.9253, + "step": 4704 + }, + { + "epoch": 0.8545417395055281, + "grad_norm": 0.5696031451225281, + "learning_rate": 5.4927129811301715e-06, + "loss": 1.6995, + "step": 4705 + }, + { + "epoch": 0.8547233636797058, + "grad_norm": 0.38286638259887695, + "learning_rate": 5.479246102746088e-06, + "loss": 1.503, + "step": 4706 + }, + { + "epoch": 0.8549049878538834, + "grad_norm": 0.4218374192714691, + "learning_rate": 5.46579479642349e-06, + "loss": 1.7919, + "step": 4707 + }, + { + "epoch": 0.8550866120280609, + "grad_norm": 0.5207827687263489, + "learning_rate": 5.452359066867252e-06, + "loss": 1.8763, + "step": 4708 + }, + { + "epoch": 0.8552682362022385, + "grad_norm": 0.9872759580612183, + "learning_rate": 5.438938918776792e-06, + "loss": 1.7208, + "step": 4709 + }, + { + "epoch": 0.8554498603764161, + "grad_norm": 0.4949210584163666, + "learning_rate": 5.425534356846118e-06, + "loss": 1.6879, + "step": 4710 + }, + { + "epoch": 0.8556314845505937, + "grad_norm": 0.3875967860221863, + "learning_rate": 5.412145385763728e-06, + "loss": 1.8285, + "step": 4711 + }, + { + "epoch": 0.8558131087247712, + "grad_norm": 0.36343926191329956, + "learning_rate": 5.398772010212705e-06, + "loss": 1.9412, + "step": 4712 + }, + { + "epoch": 0.8559947328989489, + "grad_norm": 0.7179122567176819, + "learning_rate": 5.385414234870645e-06, + "loss": 1.6626, + "step": 4713 + }, + { + "epoch": 0.8561763570731264, + "grad_norm": 1.1941689252853394, + "learning_rate": 5.372072064409728e-06, + "loss": 1.7897, + "step": 4714 + }, + { + "epoch": 0.856357981247304, + "grad_norm": 1.7377663850784302, + "learning_rate": 5.358745503496665e-06, + "loss": 1.7335, + "step": 4715 + }, + { + "epoch": 0.8565396054214816, + "grad_norm": 0.5499184727668762, + "learning_rate": 5.345434556792683e-06, + "loss": 1.7959, + "step": 4716 + }, + { + "epoch": 0.8567212295956592, + "grad_norm": 1.0417404174804688, + "learning_rate": 5.332139228953553e-06, + "loss": 1.7447, + "step": 4717 + }, + { + "epoch": 0.8569028537698368, + "grad_norm": 0.3872587978839874, + "learning_rate": 5.318859524629621e-06, + "loss": 1.5897, + "step": 4718 + }, + { + "epoch": 0.8570844779440143, + "grad_norm": 0.3485618829727173, + "learning_rate": 5.3055954484657225e-06, + "loss": 1.6989, + "step": 4719 + }, + { + "epoch": 0.857266102118192, + "grad_norm": 1.0305229425430298, + "learning_rate": 5.2923470051012334e-06, + "loss": 1.7588, + "step": 4720 + }, + { + "epoch": 0.8574477262923695, + "grad_norm": 0.3963860273361206, + "learning_rate": 5.279114199170093e-06, + "loss": 1.9125, + "step": 4721 + }, + { + "epoch": 0.8576293504665471, + "grad_norm": 0.7201009392738342, + "learning_rate": 5.2658970353007545e-06, + "loss": 1.511, + "step": 4722 + }, + { + "epoch": 0.8578109746407246, + "grad_norm": 0.5264396071434021, + "learning_rate": 5.25269551811618e-06, + "loss": 1.4937, + "step": 4723 + }, + { + "epoch": 0.8579925988149023, + "grad_norm": 0.4580937623977661, + "learning_rate": 5.239509652233887e-06, + "loss": 1.9143, + "step": 4724 + }, + { + "epoch": 0.8581742229890799, + "grad_norm": 0.5218356847763062, + "learning_rate": 5.226339442265904e-06, + "loss": 1.77, + "step": 4725 + }, + { + "epoch": 0.8583558471632574, + "grad_norm": 0.42671337723731995, + "learning_rate": 5.213184892818768e-06, + "loss": 1.6829, + "step": 4726 + }, + { + "epoch": 0.858537471337435, + "grad_norm": 0.4582129120826721, + "learning_rate": 5.200046008493576e-06, + "loss": 1.3646, + "step": 4727 + }, + { + "epoch": 0.8587190955116126, + "grad_norm": 0.7177714109420776, + "learning_rate": 5.186922793885934e-06, + "loss": 1.5991, + "step": 4728 + }, + { + "epoch": 0.8589007196857902, + "grad_norm": 1.6235089302062988, + "learning_rate": 5.173815253585951e-06, + "loss": 1.6047, + "step": 4729 + }, + { + "epoch": 0.8590823438599677, + "grad_norm": 0.42071160674095154, + "learning_rate": 5.160723392178246e-06, + "loss": 1.7112, + "step": 4730 + }, + { + "epoch": 0.8592639680341454, + "grad_norm": 1.4314664602279663, + "learning_rate": 5.1476472142419965e-06, + "loss": 1.8237, + "step": 4731 + }, + { + "epoch": 0.859445592208323, + "grad_norm": 0.3387848138809204, + "learning_rate": 5.134586724350859e-06, + "loss": 1.8485, + "step": 4732 + }, + { + "epoch": 0.8596272163825005, + "grad_norm": 0.3652893304824829, + "learning_rate": 5.121541927072998e-06, + "loss": 1.7553, + "step": 4733 + }, + { + "epoch": 0.8598088405566781, + "grad_norm": 0.3516179621219635, + "learning_rate": 5.108512826971118e-06, + "loss": 1.5952, + "step": 4734 + }, + { + "epoch": 0.8599904647308557, + "grad_norm": 0.4236083924770355, + "learning_rate": 5.095499428602424e-06, + "loss": 1.6721, + "step": 4735 + }, + { + "epoch": 0.8601720889050333, + "grad_norm": 0.32711848616600037, + "learning_rate": 5.082501736518613e-06, + "loss": 1.6381, + "step": 4736 + }, + { + "epoch": 0.8603537130792108, + "grad_norm": 1.1951066255569458, + "learning_rate": 5.0695197552659e-06, + "loss": 1.7143, + "step": 4737 + }, + { + "epoch": 0.8605353372533884, + "grad_norm": 0.3818596601486206, + "learning_rate": 5.056553489385002e-06, + "loss": 1.6131, + "step": 4738 + }, + { + "epoch": 0.860716961427566, + "grad_norm": 0.3587017357349396, + "learning_rate": 5.043602943411135e-06, + "loss": 1.8446, + "step": 4739 + }, + { + "epoch": 0.8608985856017436, + "grad_norm": 0.42883020639419556, + "learning_rate": 5.030668121874033e-06, + "loss": 1.464, + "step": 4740 + }, + { + "epoch": 0.8610802097759211, + "grad_norm": 0.3849579095840454, + "learning_rate": 5.017749029297919e-06, + "loss": 1.6489, + "step": 4741 + }, + { + "epoch": 0.8612618339500988, + "grad_norm": 0.4418644309043884, + "learning_rate": 5.004845670201519e-06, + "loss": 1.768, + "step": 4742 + }, + { + "epoch": 0.8614434581242764, + "grad_norm": 0.36562371253967285, + "learning_rate": 4.9919580490980275e-06, + "loss": 1.5677, + "step": 4743 + }, + { + "epoch": 0.8616250822984539, + "grad_norm": 0.58484947681427, + "learning_rate": 4.979086170495195e-06, + "loss": 1.8157, + "step": 4744 + }, + { + "epoch": 0.8618067064726315, + "grad_norm": 0.38133031129837036, + "learning_rate": 4.966230038895192e-06, + "loss": 1.6971, + "step": 4745 + }, + { + "epoch": 0.8619883306468091, + "grad_norm": 0.3157115876674652, + "learning_rate": 4.953389658794749e-06, + "loss": 1.9074, + "step": 4746 + }, + { + "epoch": 0.8621699548209867, + "grad_norm": 0.396696537733078, + "learning_rate": 4.940565034685046e-06, + "loss": 1.8298, + "step": 4747 + }, + { + "epoch": 0.8623515789951642, + "grad_norm": 0.35415270924568176, + "learning_rate": 4.92775617105175e-06, + "loss": 1.7168, + "step": 4748 + }, + { + "epoch": 0.8625332031693418, + "grad_norm": 0.4133901298046112, + "learning_rate": 4.91496307237505e-06, + "loss": 1.7518, + "step": 4749 + }, + { + "epoch": 0.8627148273435195, + "grad_norm": 0.9732052087783813, + "learning_rate": 4.902185743129584e-06, + "loss": 1.7133, + "step": 4750 + }, + { + "epoch": 0.862896451517697, + "grad_norm": 0.41692036390304565, + "learning_rate": 4.889424187784486e-06, + "loss": 1.6926, + "step": 4751 + }, + { + "epoch": 0.8630780756918746, + "grad_norm": 0.5932571887969971, + "learning_rate": 4.876678410803382e-06, + "loss": 1.7479, + "step": 4752 + }, + { + "epoch": 0.8632596998660522, + "grad_norm": 0.40991032123565674, + "learning_rate": 4.863948416644382e-06, + "loss": 1.9691, + "step": 4753 + }, + { + "epoch": 0.8634413240402298, + "grad_norm": 0.38049161434173584, + "learning_rate": 4.851234209760058e-06, + "loss": 1.618, + "step": 4754 + }, + { + "epoch": 0.8636229482144073, + "grad_norm": 0.4407075047492981, + "learning_rate": 4.838535794597476e-06, + "loss": 1.7129, + "step": 4755 + }, + { + "epoch": 0.8638045723885849, + "grad_norm": 0.4235159754753113, + "learning_rate": 4.825853175598149e-06, + "loss": 1.768, + "step": 4756 + }, + { + "epoch": 0.8639861965627625, + "grad_norm": 0.36091384291648865, + "learning_rate": 4.813186357198113e-06, + "loss": 1.8811, + "step": 4757 + }, + { + "epoch": 0.8641678207369401, + "grad_norm": 0.42261549830436707, + "learning_rate": 4.800535343827833e-06, + "loss": 1.4945, + "step": 4758 + }, + { + "epoch": 0.8643494449111176, + "grad_norm": 0.6162317395210266, + "learning_rate": 4.7879001399122826e-06, + "loss": 1.5863, + "step": 4759 + }, + { + "epoch": 0.8645310690852952, + "grad_norm": 0.44976353645324707, + "learning_rate": 4.7752807498708754e-06, + "loss": 1.668, + "step": 4760 + }, + { + "epoch": 0.8647126932594729, + "grad_norm": 0.32623860239982605, + "learning_rate": 4.762677178117503e-06, + "loss": 1.63, + "step": 4761 + }, + { + "epoch": 0.8648943174336504, + "grad_norm": 0.3620838522911072, + "learning_rate": 4.750089429060544e-06, + "loss": 1.6925, + "step": 4762 + }, + { + "epoch": 0.865075941607828, + "grad_norm": 0.38984382152557373, + "learning_rate": 4.737517507102812e-06, + "loss": 1.7612, + "step": 4763 + }, + { + "epoch": 0.8652575657820056, + "grad_norm": 0.35165324807167053, + "learning_rate": 4.724961416641593e-06, + "loss": 1.6969, + "step": 4764 + }, + { + "epoch": 0.8654391899561832, + "grad_norm": 0.42474836111068726, + "learning_rate": 4.712421162068653e-06, + "loss": 1.6748, + "step": 4765 + }, + { + "epoch": 0.8656208141303607, + "grad_norm": 0.3810454308986664, + "learning_rate": 4.699896747770216e-06, + "loss": 1.7363, + "step": 4766 + }, + { + "epoch": 0.8658024383045383, + "grad_norm": 0.4404069185256958, + "learning_rate": 4.687388178126939e-06, + "loss": 1.6801, + "step": 4767 + }, + { + "epoch": 0.865984062478716, + "grad_norm": 0.3963400721549988, + "learning_rate": 4.674895457513967e-06, + "loss": 1.7928, + "step": 4768 + }, + { + "epoch": 0.8661656866528935, + "grad_norm": 0.3807142376899719, + "learning_rate": 4.662418590300871e-06, + "loss": 1.7017, + "step": 4769 + }, + { + "epoch": 0.8663473108270711, + "grad_norm": 0.4066357910633087, + "learning_rate": 4.6499575808517105e-06, + "loss": 1.6537, + "step": 4770 + }, + { + "epoch": 0.8665289350012486, + "grad_norm": 0.33538374304771423, + "learning_rate": 4.637512433524987e-06, + "loss": 1.6668, + "step": 4771 + }, + { + "epoch": 0.8667105591754263, + "grad_norm": 0.4804120659828186, + "learning_rate": 4.6250831526736485e-06, + "loss": 1.6617, + "step": 4772 + }, + { + "epoch": 0.8668921833496038, + "grad_norm": 0.5306397080421448, + "learning_rate": 4.612669742645087e-06, + "loss": 1.7911, + "step": 4773 + }, + { + "epoch": 0.8670738075237814, + "grad_norm": 0.3707389235496521, + "learning_rate": 4.6002722077811426e-06, + "loss": 1.641, + "step": 4774 + }, + { + "epoch": 0.867255431697959, + "grad_norm": 0.5169014930725098, + "learning_rate": 4.587890552418139e-06, + "loss": 1.6546, + "step": 4775 + }, + { + "epoch": 0.8674370558721366, + "grad_norm": 0.4237309396266937, + "learning_rate": 4.575524780886792e-06, + "loss": 1.864, + "step": 4776 + }, + { + "epoch": 0.8676186800463142, + "grad_norm": 0.32845550775527954, + "learning_rate": 4.563174897512306e-06, + "loss": 1.7945, + "step": 4777 + }, + { + "epoch": 0.8678003042204917, + "grad_norm": 0.37785694003105164, + "learning_rate": 4.550840906614295e-06, + "loss": 1.8189, + "step": 4778 + }, + { + "epoch": 0.8679819283946694, + "grad_norm": 0.865151584148407, + "learning_rate": 4.538522812506851e-06, + "loss": 1.8256, + "step": 4779 + }, + { + "epoch": 0.8681635525688469, + "grad_norm": 0.3163357675075531, + "learning_rate": 4.5262206194984665e-06, + "loss": 1.7284, + "step": 4780 + }, + { + "epoch": 0.8683451767430245, + "grad_norm": 0.4086284935474396, + "learning_rate": 4.5139343318920945e-06, + "loss": 1.8283, + "step": 4781 + }, + { + "epoch": 0.868526800917202, + "grad_norm": 0.40185749530792236, + "learning_rate": 4.501663953985108e-06, + "loss": 1.5737, + "step": 4782 + }, + { + "epoch": 0.8687084250913797, + "grad_norm": 1.618055820465088, + "learning_rate": 4.489409490069341e-06, + "loss": 1.8387, + "step": 4783 + }, + { + "epoch": 0.8688900492655572, + "grad_norm": 0.38672563433647156, + "learning_rate": 4.477170944431053e-06, + "loss": 1.7766, + "step": 4784 + }, + { + "epoch": 0.8690716734397348, + "grad_norm": 0.32249966263771057, + "learning_rate": 4.464948321350925e-06, + "loss": 1.7389, + "step": 4785 + }, + { + "epoch": 0.8692532976139125, + "grad_norm": 0.44598716497421265, + "learning_rate": 4.4527416251040735e-06, + "loss": 1.8316, + "step": 4786 + }, + { + "epoch": 0.86943492178809, + "grad_norm": 0.5057557225227356, + "learning_rate": 4.44055085996003e-06, + "loss": 1.7066, + "step": 4787 + }, + { + "epoch": 0.8696165459622676, + "grad_norm": 0.7773755192756653, + "learning_rate": 4.428376030182796e-06, + "loss": 1.8223, + "step": 4788 + }, + { + "epoch": 0.8697981701364451, + "grad_norm": 0.44659557938575745, + "learning_rate": 4.416217140030743e-06, + "loss": 1.7054, + "step": 4789 + }, + { + "epoch": 0.8699797943106228, + "grad_norm": 0.42944422364234924, + "learning_rate": 4.404074193756725e-06, + "loss": 1.7623, + "step": 4790 + }, + { + "epoch": 0.8701614184848003, + "grad_norm": 1.535235047340393, + "learning_rate": 4.391947195607965e-06, + "loss": 1.7326, + "step": 4791 + }, + { + "epoch": 0.8703430426589779, + "grad_norm": 0.7215322852134705, + "learning_rate": 4.379836149826155e-06, + "loss": 1.8891, + "step": 4792 + }, + { + "epoch": 0.8705246668331555, + "grad_norm": 0.5390497446060181, + "learning_rate": 4.367741060647379e-06, + "loss": 1.8254, + "step": 4793 + }, + { + "epoch": 0.8707062910073331, + "grad_norm": 0.49187013506889343, + "learning_rate": 4.355661932302141e-06, + "loss": 1.7045, + "step": 4794 + }, + { + "epoch": 0.8708879151815107, + "grad_norm": 0.3264692723751068, + "learning_rate": 4.343598769015361e-06, + "loss": 1.6704, + "step": 4795 + }, + { + "epoch": 0.8710695393556882, + "grad_norm": 0.45486190915107727, + "learning_rate": 4.331551575006387e-06, + "loss": 1.8362, + "step": 4796 + }, + { + "epoch": 0.8712511635298659, + "grad_norm": 0.6055688261985779, + "learning_rate": 4.319520354488993e-06, + "loss": 1.7872, + "step": 4797 + }, + { + "epoch": 0.8714327877040434, + "grad_norm": 1.0458317995071411, + "learning_rate": 4.307505111671339e-06, + "loss": 1.6698, + "step": 4798 + }, + { + "epoch": 0.871614411878221, + "grad_norm": 1.2162200212478638, + "learning_rate": 4.2955058507559985e-06, + "loss": 1.8642, + "step": 4799 + }, + { + "epoch": 0.8717960360523985, + "grad_norm": 0.37028709053993225, + "learning_rate": 4.2835225759399635e-06, + "loss": 1.5911, + "step": 4800 + }, + { + "epoch": 0.8719776602265762, + "grad_norm": 0.4125514030456543, + "learning_rate": 4.271555291414636e-06, + "loss": 1.627, + "step": 4801 + }, + { + "epoch": 0.8721592844007537, + "grad_norm": 0.6718879342079163, + "learning_rate": 4.2596040013658355e-06, + "loss": 1.8319, + "step": 4802 + }, + { + "epoch": 0.8723409085749313, + "grad_norm": 0.6809714436531067, + "learning_rate": 4.2476687099737625e-06, + "loss": 1.8236, + "step": 4803 + }, + { + "epoch": 0.872522532749109, + "grad_norm": 0.3178693354129791, + "learning_rate": 4.235749421413032e-06, + "loss": 1.9456, + "step": 4804 + }, + { + "epoch": 0.8727041569232865, + "grad_norm": 0.4383208453655243, + "learning_rate": 4.2238461398526775e-06, + "loss": 1.9289, + "step": 4805 + }, + { + "epoch": 0.8728857810974641, + "grad_norm": 0.3733803331851959, + "learning_rate": 4.211958869456106e-06, + "loss": 1.7917, + "step": 4806 + }, + { + "epoch": 0.8730674052716416, + "grad_norm": 0.4316546320915222, + "learning_rate": 4.200087614381138e-06, + "loss": 1.8095, + "step": 4807 + }, + { + "epoch": 0.8732490294458193, + "grad_norm": 0.5708910822868347, + "learning_rate": 4.188232378780005e-06, + "loss": 1.6153, + "step": 4808 + }, + { + "epoch": 0.8734306536199968, + "grad_norm": 0.3938082456588745, + "learning_rate": 4.176393166799303e-06, + "loss": 1.7687, + "step": 4809 + }, + { + "epoch": 0.8736122777941744, + "grad_norm": 0.5026504397392273, + "learning_rate": 4.164569982580069e-06, + "loss": 1.7352, + "step": 4810 + }, + { + "epoch": 0.8737939019683519, + "grad_norm": 0.5346285104751587, + "learning_rate": 4.152762830257689e-06, + "loss": 1.7767, + "step": 4811 + }, + { + "epoch": 0.8739755261425296, + "grad_norm": 1.5252220630645752, + "learning_rate": 4.140971713961966e-06, + "loss": 1.6426, + "step": 4812 + }, + { + "epoch": 0.8741571503167072, + "grad_norm": 0.7335418462753296, + "learning_rate": 4.129196637817084e-06, + "loss": 1.7288, + "step": 4813 + }, + { + "epoch": 0.8743387744908847, + "grad_norm": 0.6731449961662292, + "learning_rate": 4.117437605941621e-06, + "loss": 1.6514, + "step": 4814 + }, + { + "epoch": 0.8745203986650624, + "grad_norm": 0.7816442847251892, + "learning_rate": 4.105694622448558e-06, + "loss": 1.9133, + "step": 4815 + }, + { + "epoch": 0.8747020228392399, + "grad_norm": 0.37143710255622864, + "learning_rate": 4.0939676914452385e-06, + "loss": 1.8532, + "step": 4816 + }, + { + "epoch": 0.8748836470134175, + "grad_norm": 0.7392839193344116, + "learning_rate": 4.082256817033392e-06, + "loss": 1.618, + "step": 4817 + }, + { + "epoch": 0.875065271187595, + "grad_norm": 0.44710707664489746, + "learning_rate": 4.0705620033091585e-06, + "loss": 1.5429, + "step": 4818 + }, + { + "epoch": 0.8752468953617727, + "grad_norm": 0.40069931745529175, + "learning_rate": 4.058883254363033e-06, + "loss": 1.626, + "step": 4819 + }, + { + "epoch": 0.8754285195359502, + "grad_norm": 0.3408013880252838, + "learning_rate": 4.047220574279892e-06, + "loss": 1.6742, + "step": 4820 + }, + { + "epoch": 0.8756101437101278, + "grad_norm": 0.33345144987106323, + "learning_rate": 4.035573967139023e-06, + "loss": 1.6679, + "step": 4821 + }, + { + "epoch": 0.8757917678843053, + "grad_norm": 0.3946489989757538, + "learning_rate": 4.023943437014044e-06, + "loss": 1.6653, + "step": 4822 + }, + { + "epoch": 0.875973392058483, + "grad_norm": 0.5449888706207275, + "learning_rate": 4.012328987973002e-06, + "loss": 1.711, + "step": 4823 + }, + { + "epoch": 0.8761550162326606, + "grad_norm": 0.5896198749542236, + "learning_rate": 4.000730624078275e-06, + "loss": 1.6555, + "step": 4824 + }, + { + "epoch": 0.8763366404068381, + "grad_norm": 0.3910166621208191, + "learning_rate": 3.98914834938664e-06, + "loss": 1.7172, + "step": 4825 + }, + { + "epoch": 0.8765182645810158, + "grad_norm": 0.36756375432014465, + "learning_rate": 3.977582167949228e-06, + "loss": 1.6651, + "step": 4826 + }, + { + "epoch": 0.8766998887551933, + "grad_norm": 0.9083905220031738, + "learning_rate": 3.96603208381156e-06, + "loss": 1.6466, + "step": 4827 + }, + { + "epoch": 0.8768815129293709, + "grad_norm": 0.40728238224983215, + "learning_rate": 3.954498101013526e-06, + "loss": 1.8045, + "step": 4828 + }, + { + "epoch": 0.8770631371035484, + "grad_norm": 0.37934595346450806, + "learning_rate": 3.942980223589371e-06, + "loss": 1.71, + "step": 4829 + }, + { + "epoch": 0.8772447612777261, + "grad_norm": 0.3849989175796509, + "learning_rate": 3.931478455567705e-06, + "loss": 1.8982, + "step": 4830 + }, + { + "epoch": 0.8774263854519037, + "grad_norm": 0.845258355140686, + "learning_rate": 3.919992800971517e-06, + "loss": 1.7184, + "step": 4831 + }, + { + "epoch": 0.8776080096260812, + "grad_norm": 0.38689112663269043, + "learning_rate": 3.9085232638181476e-06, + "loss": 1.8121, + "step": 4832 + }, + { + "epoch": 0.8777896338002588, + "grad_norm": 0.5032036900520325, + "learning_rate": 3.897069848119323e-06, + "loss": 1.8445, + "step": 4833 + }, + { + "epoch": 0.8779712579744364, + "grad_norm": 0.4630155861377716, + "learning_rate": 3.885632557881108e-06, + "loss": 1.7979, + "step": 4834 + }, + { + "epoch": 0.878152882148614, + "grad_norm": 0.37546420097351074, + "learning_rate": 3.874211397103916e-06, + "loss": 1.6901, + "step": 4835 + }, + { + "epoch": 0.8783345063227915, + "grad_norm": 0.38750123977661133, + "learning_rate": 3.862806369782557e-06, + "loss": 1.6251, + "step": 4836 + }, + { + "epoch": 0.8785161304969692, + "grad_norm": 0.3760819435119629, + "learning_rate": 3.851417479906172e-06, + "loss": 1.6787, + "step": 4837 + }, + { + "epoch": 0.8786977546711467, + "grad_norm": 0.3537256419658661, + "learning_rate": 3.8400447314582535e-06, + "loss": 1.6889, + "step": 4838 + }, + { + "epoch": 0.8788793788453243, + "grad_norm": 0.5345553159713745, + "learning_rate": 3.8286881284166636e-06, + "loss": 1.7312, + "step": 4839 + }, + { + "epoch": 0.8790610030195019, + "grad_norm": 0.4270947277545929, + "learning_rate": 3.817347674753613e-06, + "loss": 1.8274, + "step": 4840 + }, + { + "epoch": 0.8792426271936795, + "grad_norm": 0.6291266083717346, + "learning_rate": 3.8060233744356633e-06, + "loss": 1.7848, + "step": 4841 + }, + { + "epoch": 0.8794242513678571, + "grad_norm": 0.5639631748199463, + "learning_rate": 3.7947152314237233e-06, + "loss": 1.8254, + "step": 4842 + }, + { + "epoch": 0.8796058755420346, + "grad_norm": 0.42455971240997314, + "learning_rate": 3.783423249673046e-06, + "loss": 1.8145, + "step": 4843 + }, + { + "epoch": 0.8797874997162122, + "grad_norm": 0.3315695524215698, + "learning_rate": 3.772147433133233e-06, + "loss": 1.7941, + "step": 4844 + }, + { + "epoch": 0.8799691238903898, + "grad_norm": 0.4174742102622986, + "learning_rate": 3.7608877857482404e-06, + "loss": 1.8397, + "step": 4845 + }, + { + "epoch": 0.8801507480645674, + "grad_norm": 0.46815672516822815, + "learning_rate": 3.7496443114563796e-06, + "loss": 1.6106, + "step": 4846 + }, + { + "epoch": 0.8803323722387449, + "grad_norm": 1.4082038402557373, + "learning_rate": 3.738417014190282e-06, + "loss": 1.8949, + "step": 4847 + }, + { + "epoch": 0.8805139964129226, + "grad_norm": 0.34680846333503723, + "learning_rate": 3.727205897876912e-06, + "loss": 1.6922, + "step": 4848 + }, + { + "epoch": 0.8806956205871002, + "grad_norm": 0.3281535506248474, + "learning_rate": 3.716010966437611e-06, + "loss": 1.632, + "step": 4849 + }, + { + "epoch": 0.8808772447612777, + "grad_norm": 0.3787660598754883, + "learning_rate": 3.704832223788035e-06, + "loss": 1.7548, + "step": 4850 + }, + { + "epoch": 0.8810588689354553, + "grad_norm": 1.5565160512924194, + "learning_rate": 3.6936696738381737e-06, + "loss": 1.9729, + "step": 4851 + }, + { + "epoch": 0.8812404931096329, + "grad_norm": 0.4436298906803131, + "learning_rate": 3.682523320492365e-06, + "loss": 1.6543, + "step": 4852 + }, + { + "epoch": 0.8814221172838105, + "grad_norm": 0.2897511422634125, + "learning_rate": 3.6713931676492897e-06, + "loss": 1.5825, + "step": 4853 + }, + { + "epoch": 0.881603741457988, + "grad_norm": 0.42980942130088806, + "learning_rate": 3.6602792192019454e-06, + "loss": 1.7579, + "step": 4854 + }, + { + "epoch": 0.8817853656321657, + "grad_norm": 1.3438072204589844, + "learning_rate": 3.649181479037661e-06, + "loss": 1.898, + "step": 4855 + }, + { + "epoch": 0.8819669898063432, + "grad_norm": 0.38720205426216125, + "learning_rate": 3.638099951038093e-06, + "loss": 1.7395, + "step": 4856 + }, + { + "epoch": 0.8821486139805208, + "grad_norm": 0.38516902923583984, + "learning_rate": 3.6270346390792574e-06, + "loss": 1.9352, + "step": 4857 + }, + { + "epoch": 0.8823302381546984, + "grad_norm": 0.3344322144985199, + "learning_rate": 3.6159855470314698e-06, + "loss": 1.4477, + "step": 4858 + }, + { + "epoch": 0.882511862328876, + "grad_norm": 0.33653953671455383, + "learning_rate": 3.6049526787593823e-06, + "loss": 1.6554, + "step": 4859 + }, + { + "epoch": 0.8826934865030536, + "grad_norm": 0.4856278598308563, + "learning_rate": 3.59393603812197e-06, + "loss": 1.6988, + "step": 4860 + }, + { + "epoch": 0.8828751106772311, + "grad_norm": 0.35749682784080505, + "learning_rate": 3.5829356289725223e-06, + "loss": 1.6942, + "step": 4861 + }, + { + "epoch": 0.8830567348514087, + "grad_norm": 0.4094535708427429, + "learning_rate": 3.5719514551586785e-06, + "loss": 1.6073, + "step": 4862 + }, + { + "epoch": 0.8832383590255863, + "grad_norm": 0.4106106162071228, + "learning_rate": 3.560983520522365e-06, + "loss": 1.7728, + "step": 4863 + }, + { + "epoch": 0.8834199831997639, + "grad_norm": 0.3251866400241852, + "learning_rate": 3.5500318288998634e-06, + "loss": 1.6829, + "step": 4864 + }, + { + "epoch": 0.8836016073739414, + "grad_norm": 0.37184229493141174, + "learning_rate": 3.539096384121743e-06, + "loss": 1.7829, + "step": 4865 + }, + { + "epoch": 0.8837832315481191, + "grad_norm": 0.409301221370697, + "learning_rate": 3.5281771900129045e-06, + "loss": 1.6484, + "step": 4866 + }, + { + "epoch": 0.8839648557222967, + "grad_norm": 0.39652809500694275, + "learning_rate": 3.5172742503925716e-06, + "loss": 1.7567, + "step": 4867 + }, + { + "epoch": 0.8841464798964742, + "grad_norm": 0.5470221042633057, + "learning_rate": 3.506387569074271e-06, + "loss": 1.7913, + "step": 4868 + }, + { + "epoch": 0.8843281040706518, + "grad_norm": 0.3969081938266754, + "learning_rate": 3.4955171498658346e-06, + "loss": 1.8598, + "step": 4869 + }, + { + "epoch": 0.8845097282448294, + "grad_norm": 0.3852759003639221, + "learning_rate": 3.4846629965694267e-06, + "loss": 1.5899, + "step": 4870 + }, + { + "epoch": 0.884691352419007, + "grad_norm": 0.3390767574310303, + "learning_rate": 3.473825112981527e-06, + "loss": 1.7658, + "step": 4871 + }, + { + "epoch": 0.8848729765931845, + "grad_norm": 0.4276787042617798, + "learning_rate": 3.463003502892903e-06, + "loss": 1.5841, + "step": 4872 + }, + { + "epoch": 0.8850546007673621, + "grad_norm": 0.44183242321014404, + "learning_rate": 3.4521981700886273e-06, + "loss": 1.7634, + "step": 4873 + }, + { + "epoch": 0.8852362249415398, + "grad_norm": 0.34280019998550415, + "learning_rate": 3.4414091183480933e-06, + "loss": 1.6023, + "step": 4874 + }, + { + "epoch": 0.8854178491157173, + "grad_norm": 0.35648423433303833, + "learning_rate": 3.43063635144501e-06, + "loss": 1.7603, + "step": 4875 + }, + { + "epoch": 0.8855994732898949, + "grad_norm": 0.4279175102710724, + "learning_rate": 3.419879873147358e-06, + "loss": 1.8383, + "step": 4876 + }, + { + "epoch": 0.8857810974640725, + "grad_norm": 0.45809945464134216, + "learning_rate": 3.4091396872174606e-06, + "loss": 1.7194, + "step": 4877 + }, + { + "epoch": 0.8859627216382501, + "grad_norm": 0.42889755964279175, + "learning_rate": 3.398415797411908e-06, + "loss": 1.7463, + "step": 4878 + }, + { + "epoch": 0.8861443458124276, + "grad_norm": 0.3490757346153259, + "learning_rate": 3.3877082074815992e-06, + "loss": 1.6214, + "step": 4879 + }, + { + "epoch": 0.8863259699866052, + "grad_norm": 0.4410065710544586, + "learning_rate": 3.377016921171755e-06, + "loss": 1.6349, + "step": 4880 + }, + { + "epoch": 0.8865075941607828, + "grad_norm": 1.3013991117477417, + "learning_rate": 3.3663419422218677e-06, + "loss": 1.944, + "step": 4881 + }, + { + "epoch": 0.8866892183349604, + "grad_norm": 0.395610511302948, + "learning_rate": 3.355683274365723e-06, + "loss": 1.6218, + "step": 4882 + }, + { + "epoch": 0.886870842509138, + "grad_norm": 0.3933349847793579, + "learning_rate": 3.345040921331416e-06, + "loss": 1.6542, + "step": 4883 + }, + { + "epoch": 0.8870524666833155, + "grad_norm": 0.4133455157279968, + "learning_rate": 3.334414886841347e-06, + "loss": 1.6865, + "step": 4884 + }, + { + "epoch": 0.8872340908574932, + "grad_norm": 0.4337834119796753, + "learning_rate": 3.3238051746121822e-06, + "loss": 1.7013, + "step": 4885 + }, + { + "epoch": 0.8874157150316707, + "grad_norm": 0.4055754244327545, + "learning_rate": 3.3132117883548864e-06, + "loss": 1.5534, + "step": 4886 + }, + { + "epoch": 0.8875973392058483, + "grad_norm": 0.690500020980835, + "learning_rate": 3.3026347317747173e-06, + "loss": 1.7611, + "step": 4887 + }, + { + "epoch": 0.8877789633800259, + "grad_norm": 0.6729840636253357, + "learning_rate": 3.292074008571222e-06, + "loss": 1.7316, + "step": 4888 + }, + { + "epoch": 0.8879605875542035, + "grad_norm": 0.40274226665496826, + "learning_rate": 3.2815296224382395e-06, + "loss": 1.6895, + "step": 4889 + }, + { + "epoch": 0.888142211728381, + "grad_norm": 0.5526754856109619, + "learning_rate": 3.271001577063887e-06, + "loss": 1.7472, + "step": 4890 + }, + { + "epoch": 0.8883238359025586, + "grad_norm": 0.46629366278648376, + "learning_rate": 3.260489876130568e-06, + "loss": 1.8018, + "step": 4891 + }, + { + "epoch": 0.8885054600767363, + "grad_norm": 0.5316206216812134, + "learning_rate": 3.249994523314953e-06, + "loss": 1.6375, + "step": 4892 + }, + { + "epoch": 0.8886870842509138, + "grad_norm": 0.3918965458869934, + "learning_rate": 3.2395155222880334e-06, + "loss": 1.7969, + "step": 4893 + }, + { + "epoch": 0.8888687084250914, + "grad_norm": 0.35109129548072815, + "learning_rate": 3.229052876715044e-06, + "loss": 1.7066, + "step": 4894 + }, + { + "epoch": 0.8890503325992689, + "grad_norm": 0.3469581604003906, + "learning_rate": 3.218606590255524e-06, + "loss": 1.7509, + "step": 4895 + }, + { + "epoch": 0.8892319567734466, + "grad_norm": 0.430176317691803, + "learning_rate": 3.2081766665632616e-06, + "loss": 1.7441, + "step": 4896 + }, + { + "epoch": 0.8894135809476241, + "grad_norm": 0.5207772254943848, + "learning_rate": 3.1977631092863615e-06, + "loss": 1.8532, + "step": 4897 + }, + { + "epoch": 0.8895952051218017, + "grad_norm": 0.6833198666572571, + "learning_rate": 3.187365922067176e-06, + "loss": 1.5778, + "step": 4898 + }, + { + "epoch": 0.8897768292959793, + "grad_norm": 0.35329797863960266, + "learning_rate": 3.17698510854233e-06, + "loss": 1.6279, + "step": 4899 + }, + { + "epoch": 0.8899584534701569, + "grad_norm": 0.36422693729400635, + "learning_rate": 3.1666206723427293e-06, + "loss": 1.6127, + "step": 4900 + }, + { + "epoch": 0.8901400776443344, + "grad_norm": 0.36708614230155945, + "learning_rate": 3.156272617093553e-06, + "loss": 1.8204, + "step": 4901 + }, + { + "epoch": 0.890321701818512, + "grad_norm": 0.4385983645915985, + "learning_rate": 3.145940946414261e-06, + "loss": 1.8401, + "step": 4902 + }, + { + "epoch": 0.8905033259926897, + "grad_norm": 0.42256179451942444, + "learning_rate": 3.135625663918562e-06, + "loss": 1.7256, + "step": 4903 + }, + { + "epoch": 0.8906849501668672, + "grad_norm": 0.9379191994667053, + "learning_rate": 3.1253267732144374e-06, + "loss": 1.8517, + "step": 4904 + }, + { + "epoch": 0.8908665743410448, + "grad_norm": 0.5877417922019958, + "learning_rate": 3.115044277904139e-06, + "loss": 1.7425, + "step": 4905 + }, + { + "epoch": 0.8910481985152223, + "grad_norm": 0.5052782893180847, + "learning_rate": 3.104778181584189e-06, + "loss": 1.756, + "step": 4906 + }, + { + "epoch": 0.8912298226894, + "grad_norm": 0.3445552885532379, + "learning_rate": 3.0945284878453663e-06, + "loss": 1.7749, + "step": 4907 + }, + { + "epoch": 0.8914114468635775, + "grad_norm": 1.0857478380203247, + "learning_rate": 3.084295200272719e-06, + "loss": 1.889, + "step": 4908 + }, + { + "epoch": 0.8915930710377551, + "grad_norm": 0.5826210379600525, + "learning_rate": 3.0740783224455396e-06, + "loss": 1.6645, + "step": 4909 + }, + { + "epoch": 0.8917746952119328, + "grad_norm": 0.3628610074520111, + "learning_rate": 3.0638778579374084e-06, + "loss": 1.7538, + "step": 4910 + }, + { + "epoch": 0.8919563193861103, + "grad_norm": 0.4078531861305237, + "learning_rate": 3.0536938103161494e-06, + "loss": 1.7627, + "step": 4911 + }, + { + "epoch": 0.8921379435602879, + "grad_norm": 0.4627036452293396, + "learning_rate": 3.043526183143841e-06, + "loss": 1.7641, + "step": 4912 + }, + { + "epoch": 0.8923195677344654, + "grad_norm": 0.4015800356864929, + "learning_rate": 3.0333749799768107e-06, + "loss": 1.7579, + "step": 4913 + }, + { + "epoch": 0.8925011919086431, + "grad_norm": 0.557856559753418, + "learning_rate": 3.0232402043656737e-06, + "loss": 1.7829, + "step": 4914 + }, + { + "epoch": 0.8926828160828206, + "grad_norm": 0.433573842048645, + "learning_rate": 3.0131218598552722e-06, + "loss": 1.7771, + "step": 4915 + }, + { + "epoch": 0.8928644402569982, + "grad_norm": 0.38200899958610535, + "learning_rate": 3.003019949984709e-06, + "loss": 1.6235, + "step": 4916 + }, + { + "epoch": 0.8930460644311757, + "grad_norm": 0.5362004637718201, + "learning_rate": 2.992934478287335e-06, + "loss": 1.6613, + "step": 4917 + }, + { + "epoch": 0.8932276886053534, + "grad_norm": 0.5254557728767395, + "learning_rate": 2.9828654482907448e-06, + "loss": 1.7141, + "step": 4918 + }, + { + "epoch": 0.893409312779531, + "grad_norm": 0.36158159375190735, + "learning_rate": 2.972812863516805e-06, + "loss": 1.5449, + "step": 4919 + }, + { + "epoch": 0.8935909369537085, + "grad_norm": 0.38946881890296936, + "learning_rate": 2.9627767274816142e-06, + "loss": 1.6342, + "step": 4920 + }, + { + "epoch": 0.8937725611278862, + "grad_norm": 0.3454825282096863, + "learning_rate": 2.9527570436955255e-06, + "loss": 1.6179, + "step": 4921 + }, + { + "epoch": 0.8939541853020637, + "grad_norm": 0.4651676118373871, + "learning_rate": 2.9427538156631127e-06, + "loss": 1.6606, + "step": 4922 + }, + { + "epoch": 0.8941358094762413, + "grad_norm": 0.35290220379829407, + "learning_rate": 2.932767046883239e-06, + "loss": 1.6763, + "step": 4923 + }, + { + "epoch": 0.8943174336504188, + "grad_norm": 0.40354567766189575, + "learning_rate": 2.9227967408489653e-06, + "loss": 1.8884, + "step": 4924 + }, + { + "epoch": 0.8944990578245965, + "grad_norm": 0.4312422573566437, + "learning_rate": 2.912842901047619e-06, + "loss": 1.6992, + "step": 4925 + }, + { + "epoch": 0.894680681998774, + "grad_norm": 0.2876504957675934, + "learning_rate": 2.9029055309607646e-06, + "loss": 1.5527, + "step": 4926 + }, + { + "epoch": 0.8948623061729516, + "grad_norm": 0.3094663918018341, + "learning_rate": 2.8929846340641996e-06, + "loss": 1.698, + "step": 4927 + }, + { + "epoch": 0.8950439303471293, + "grad_norm": 0.30408158898353577, + "learning_rate": 2.8830802138279824e-06, + "loss": 1.7812, + "step": 4928 + }, + { + "epoch": 0.8952255545213068, + "grad_norm": 0.43729129433631897, + "learning_rate": 2.8731922737163685e-06, + "loss": 1.5909, + "step": 4929 + }, + { + "epoch": 0.8954071786954844, + "grad_norm": 0.41448718309402466, + "learning_rate": 2.863320817187881e-06, + "loss": 1.8378, + "step": 4930 + }, + { + "epoch": 0.8955888028696619, + "grad_norm": 0.614971935749054, + "learning_rate": 2.8534658476952635e-06, + "loss": 1.6735, + "step": 4931 + }, + { + "epoch": 0.8957704270438396, + "grad_norm": 0.47587335109710693, + "learning_rate": 2.8436273686854973e-06, + "loss": 1.6476, + "step": 4932 + }, + { + "epoch": 0.8959520512180171, + "grad_norm": 0.36614811420440674, + "learning_rate": 2.8338053835998023e-06, + "loss": 1.7047, + "step": 4933 + }, + { + "epoch": 0.8961336753921947, + "grad_norm": 0.345451682806015, + "learning_rate": 2.8239998958736193e-06, + "loss": 1.8022, + "step": 4934 + }, + { + "epoch": 0.8963152995663722, + "grad_norm": 0.3949457108974457, + "learning_rate": 2.8142109089366154e-06, + "loss": 1.7749, + "step": 4935 + }, + { + "epoch": 0.8964969237405499, + "grad_norm": 0.47801119089126587, + "learning_rate": 2.8044384262127023e-06, + "loss": 1.8814, + "step": 4936 + }, + { + "epoch": 0.8966785479147275, + "grad_norm": 0.5465490818023682, + "learning_rate": 2.7946824511200064e-06, + "loss": 1.5625, + "step": 4937 + }, + { + "epoch": 0.896860172088905, + "grad_norm": 1.2134408950805664, + "learning_rate": 2.7849429870708767e-06, + "loss": 1.8705, + "step": 4938 + }, + { + "epoch": 0.8970417962630827, + "grad_norm": 0.37429898977279663, + "learning_rate": 2.775220037471904e-06, + "loss": 1.8318, + "step": 4939 + }, + { + "epoch": 0.8972234204372602, + "grad_norm": 0.44062694907188416, + "learning_rate": 2.7655136057238796e-06, + "loss": 1.7951, + "step": 4940 + }, + { + "epoch": 0.8974050446114378, + "grad_norm": 0.4546429216861725, + "learning_rate": 2.7558236952218485e-06, + "loss": 1.6298, + "step": 4941 + }, + { + "epoch": 0.8975866687856153, + "grad_norm": 0.3776446282863617, + "learning_rate": 2.7461503093550446e-06, + "loss": 1.7869, + "step": 4942 + }, + { + "epoch": 0.897768292959793, + "grad_norm": 0.5528941750526428, + "learning_rate": 2.7364934515069327e-06, + "loss": 1.5798, + "step": 4943 + }, + { + "epoch": 0.8979499171339705, + "grad_norm": 1.6426584720611572, + "learning_rate": 2.726853125055212e-06, + "loss": 1.6974, + "step": 4944 + }, + { + "epoch": 0.8981315413081481, + "grad_norm": 0.3716231882572174, + "learning_rate": 2.7172293333717848e-06, + "loss": 1.768, + "step": 4945 + }, + { + "epoch": 0.8983131654823256, + "grad_norm": 0.37134313583374023, + "learning_rate": 2.7076220798227746e-06, + "loss": 1.6515, + "step": 4946 + }, + { + "epoch": 0.8984947896565033, + "grad_norm": 0.37827959656715393, + "learning_rate": 2.6980313677685166e-06, + "loss": 1.8193, + "step": 4947 + }, + { + "epoch": 0.8986764138306809, + "grad_norm": 0.813024640083313, + "learning_rate": 2.68845720056356e-06, + "loss": 1.8179, + "step": 4948 + }, + { + "epoch": 0.8988580380048584, + "grad_norm": 0.47203049063682556, + "learning_rate": 2.67889958155666e-06, + "loss": 1.7503, + "step": 4949 + }, + { + "epoch": 0.8990396621790361, + "grad_norm": 0.39956989884376526, + "learning_rate": 2.6693585140908027e-06, + "loss": 1.5794, + "step": 4950 + }, + { + "epoch": 0.8992212863532136, + "grad_norm": 0.7087671160697937, + "learning_rate": 2.659834001503186e-06, + "loss": 1.8297, + "step": 4951 + }, + { + "epoch": 0.8994029105273912, + "grad_norm": 0.7077451348304749, + "learning_rate": 2.6503260471251957e-06, + "loss": 1.8217, + "step": 4952 + }, + { + "epoch": 0.8995845347015687, + "grad_norm": 0.34020093083381653, + "learning_rate": 2.640834654282431e-06, + "loss": 1.8177, + "step": 4953 + }, + { + "epoch": 0.8997661588757464, + "grad_norm": 0.42147213220596313, + "learning_rate": 2.631359826294716e-06, + "loss": 1.7985, + "step": 4954 + }, + { + "epoch": 0.899947783049924, + "grad_norm": 0.6461643576622009, + "learning_rate": 2.621901566476065e-06, + "loss": 1.6814, + "step": 4955 + }, + { + "epoch": 0.9001294072241015, + "grad_norm": 0.9043461680412292, + "learning_rate": 2.612459878134693e-06, + "loss": 1.7268, + "step": 4956 + }, + { + "epoch": 0.9003110313982791, + "grad_norm": 0.34412574768066406, + "learning_rate": 2.603034764573037e-06, + "loss": 1.6008, + "step": 4957 + }, + { + "epoch": 0.9004926555724567, + "grad_norm": 0.40242719650268555, + "learning_rate": 2.5936262290877312e-06, + "loss": 1.7961, + "step": 4958 + }, + { + "epoch": 0.9006742797466343, + "grad_norm": 0.8859853744506836, + "learning_rate": 2.5842342749695992e-06, + "loss": 1.8598, + "step": 4959 + }, + { + "epoch": 0.9008559039208118, + "grad_norm": 0.370846688747406, + "learning_rate": 2.574858905503674e-06, + "loss": 1.8599, + "step": 4960 + }, + { + "epoch": 0.9010375280949895, + "grad_norm": 0.39616814255714417, + "learning_rate": 2.5655001239691835e-06, + "loss": 1.7715, + "step": 4961 + }, + { + "epoch": 0.901219152269167, + "grad_norm": 0.3948545753955841, + "learning_rate": 2.556157933639558e-06, + "loss": 1.6414, + "step": 4962 + }, + { + "epoch": 0.9014007764433446, + "grad_norm": 0.47337475419044495, + "learning_rate": 2.546832337782423e-06, + "loss": 1.6324, + "step": 4963 + }, + { + "epoch": 0.9015824006175222, + "grad_norm": 0.5724080204963684, + "learning_rate": 2.5375233396596086e-06, + "loss": 2.0041, + "step": 4964 + }, + { + "epoch": 0.9017640247916998, + "grad_norm": 0.46016326546669006, + "learning_rate": 2.5282309425271213e-06, + "loss": 1.6862, + "step": 4965 + }, + { + "epoch": 0.9019456489658774, + "grad_norm": 0.35614195466041565, + "learning_rate": 2.5189551496351716e-06, + "loss": 1.6945, + "step": 4966 + }, + { + "epoch": 0.9021272731400549, + "grad_norm": 0.3615615963935852, + "learning_rate": 2.5096959642281704e-06, + "loss": 1.7375, + "step": 4967 + }, + { + "epoch": 0.9023088973142325, + "grad_norm": 0.37611088156700134, + "learning_rate": 2.500453389544699e-06, + "loss": 1.6941, + "step": 4968 + }, + { + "epoch": 0.9024905214884101, + "grad_norm": 0.40894389152526855, + "learning_rate": 2.4912274288175388e-06, + "loss": 1.8415, + "step": 4969 + }, + { + "epoch": 0.9026721456625877, + "grad_norm": 0.36269310116767883, + "learning_rate": 2.4820180852736687e-06, + "loss": 1.7816, + "step": 4970 + }, + { + "epoch": 0.9028537698367652, + "grad_norm": 0.3888978362083435, + "learning_rate": 2.4728253621342566e-06, + "loss": 1.5018, + "step": 4971 + }, + { + "epoch": 0.9030353940109429, + "grad_norm": 0.3639528751373291, + "learning_rate": 2.4636492626146425e-06, + "loss": 1.6744, + "step": 4972 + }, + { + "epoch": 0.9032170181851205, + "grad_norm": 0.4272617697715759, + "learning_rate": 2.4544897899243523e-06, + "loss": 1.7797, + "step": 4973 + }, + { + "epoch": 0.903398642359298, + "grad_norm": 0.3925301432609558, + "learning_rate": 2.445346947267102e-06, + "loss": 1.8598, + "step": 4974 + }, + { + "epoch": 0.9035802665334756, + "grad_norm": 1.3145991563796997, + "learning_rate": 2.4362207378407944e-06, + "loss": 1.8845, + "step": 4975 + }, + { + "epoch": 0.9037618907076532, + "grad_norm": 0.45497187972068787, + "learning_rate": 2.4271111648375143e-06, + "loss": 1.7921, + "step": 4976 + }, + { + "epoch": 0.9039435148818308, + "grad_norm": 0.3711586892604828, + "learning_rate": 2.4180182314435307e-06, + "loss": 1.7123, + "step": 4977 + }, + { + "epoch": 0.9041251390560083, + "grad_norm": 1.4351342916488647, + "learning_rate": 2.4089419408392767e-06, + "loss": 1.6234, + "step": 4978 + }, + { + "epoch": 0.9043067632301859, + "grad_norm": 0.7801569104194641, + "learning_rate": 2.3998822961993685e-06, + "loss": 1.6757, + "step": 4979 + }, + { + "epoch": 0.9044883874043635, + "grad_norm": 0.5062945485115051, + "learning_rate": 2.3908393006926268e-06, + "loss": 1.8073, + "step": 4980 + }, + { + "epoch": 0.9046700115785411, + "grad_norm": 0.41659626364707947, + "learning_rate": 2.381812957481999e-06, + "loss": 1.7175, + "step": 4981 + }, + { + "epoch": 0.9048516357527187, + "grad_norm": 0.3958587646484375, + "learning_rate": 2.372803269724666e-06, + "loss": 1.7117, + "step": 4982 + }, + { + "epoch": 0.9050332599268963, + "grad_norm": 0.3627260625362396, + "learning_rate": 2.3638102405719285e-06, + "loss": 1.5551, + "step": 4983 + }, + { + "epoch": 0.9052148841010739, + "grad_norm": 0.500877320766449, + "learning_rate": 2.3548338731693044e-06, + "loss": 1.5005, + "step": 4984 + }, + { + "epoch": 0.9053965082752514, + "grad_norm": 1.0702635049819946, + "learning_rate": 2.345874170656459e-06, + "loss": 2.0246, + "step": 4985 + }, + { + "epoch": 0.905578132449429, + "grad_norm": 0.3434082269668579, + "learning_rate": 2.3369311361672364e-06, + "loss": 1.7872, + "step": 4986 + }, + { + "epoch": 0.9057597566236066, + "grad_norm": 0.355937659740448, + "learning_rate": 2.3280047728296395e-06, + "loss": 1.8252, + "step": 4987 + }, + { + "epoch": 0.9059413807977842, + "grad_norm": 0.3664059340953827, + "learning_rate": 2.3190950837658597e-06, + "loss": 1.6954, + "step": 4988 + }, + { + "epoch": 0.9061230049719617, + "grad_norm": 0.40024811029434204, + "learning_rate": 2.310202072092249e-06, + "loss": 1.879, + "step": 4989 + }, + { + "epoch": 0.9063046291461394, + "grad_norm": 0.5333344340324402, + "learning_rate": 2.301325740919319e-06, + "loss": 1.5724, + "step": 4990 + }, + { + "epoch": 0.906486253320317, + "grad_norm": 0.4673263430595398, + "learning_rate": 2.292466093351747e-06, + "loss": 1.8493, + "step": 4991 + }, + { + "epoch": 0.9066678774944945, + "grad_norm": 0.5278956890106201, + "learning_rate": 2.2836231324883828e-06, + "loss": 1.7742, + "step": 4992 + }, + { + "epoch": 0.9068495016686721, + "grad_norm": 0.31442099809646606, + "learning_rate": 2.274796861422246e-06, + "loss": 1.7204, + "step": 4993 + }, + { + "epoch": 0.9070311258428497, + "grad_norm": 0.42055878043174744, + "learning_rate": 2.2659872832404893e-06, + "loss": 1.6927, + "step": 4994 + }, + { + "epoch": 0.9072127500170273, + "grad_norm": 0.5749602913856506, + "learning_rate": 2.257194401024465e-06, + "loss": 1.8119, + "step": 4995 + }, + { + "epoch": 0.9073943741912048, + "grad_norm": 0.37285536527633667, + "learning_rate": 2.2484182178496573e-06, + "loss": 1.8606, + "step": 4996 + }, + { + "epoch": 0.9075759983653824, + "grad_norm": 0.4246571958065033, + "learning_rate": 2.239658736785716e-06, + "loss": 1.55, + "step": 4997 + }, + { + "epoch": 0.90775762253956, + "grad_norm": 0.3660171926021576, + "learning_rate": 2.2309159608964624e-06, + "loss": 1.8657, + "step": 4998 + }, + { + "epoch": 0.9079392467137376, + "grad_norm": 0.5159090161323547, + "learning_rate": 2.2221898932398566e-06, + "loss": 1.7507, + "step": 4999 + }, + { + "epoch": 0.9081208708879152, + "grad_norm": 0.4199765920639038, + "learning_rate": 2.2134805368680235e-06, + "loss": 1.8526, + "step": 5000 + }, + { + "epoch": 0.9083024950620928, + "grad_norm": 0.3455963730812073, + "learning_rate": 2.2047878948272373e-06, + "loss": 1.7372, + "step": 5001 + }, + { + "epoch": 0.9084841192362704, + "grad_norm": 0.4540290832519531, + "learning_rate": 2.1961119701579492e-06, + "loss": 1.7528, + "step": 5002 + }, + { + "epoch": 0.9086657434104479, + "grad_norm": 0.41386234760284424, + "learning_rate": 2.187452765894732e-06, + "loss": 1.8806, + "step": 5003 + }, + { + "epoch": 0.9088473675846255, + "grad_norm": 0.4991159737110138, + "learning_rate": 2.178810285066324e-06, + "loss": 1.6569, + "step": 5004 + }, + { + "epoch": 0.9090289917588031, + "grad_norm": 1.0720034837722778, + "learning_rate": 2.1701845306956017e-06, + "loss": 1.712, + "step": 5005 + }, + { + "epoch": 0.9092106159329807, + "grad_norm": 0.461214542388916, + "learning_rate": 2.161575505799618e-06, + "loss": 1.8099, + "step": 5006 + }, + { + "epoch": 0.9093922401071582, + "grad_norm": 0.3804715871810913, + "learning_rate": 2.152983213389559e-06, + "loss": 1.7065, + "step": 5007 + }, + { + "epoch": 0.9095738642813358, + "grad_norm": 0.4284220039844513, + "learning_rate": 2.1444076564707483e-06, + "loss": 1.7399, + "step": 5008 + }, + { + "epoch": 0.9097554884555135, + "grad_norm": 0.64887535572052, + "learning_rate": 2.1358488380426755e-06, + "loss": 1.7228, + "step": 5009 + }, + { + "epoch": 0.909937112629691, + "grad_norm": 0.39479130506515503, + "learning_rate": 2.1273067610989515e-06, + "loss": 1.7634, + "step": 5010 + }, + { + "epoch": 0.9101187368038686, + "grad_norm": 0.4316254258155823, + "learning_rate": 2.1187814286273646e-06, + "loss": 1.7661, + "step": 5011 + }, + { + "epoch": 0.9103003609780462, + "grad_norm": 0.42746224999427795, + "learning_rate": 2.1102728436098063e-06, + "loss": 1.865, + "step": 5012 + }, + { + "epoch": 0.9104819851522238, + "grad_norm": 0.3470882475376129, + "learning_rate": 2.1017810090223523e-06, + "loss": 1.589, + "step": 5013 + }, + { + "epoch": 0.9106636093264013, + "grad_norm": 1.5288110971450806, + "learning_rate": 2.093305927835182e-06, + "loss": 1.8849, + "step": 5014 + }, + { + "epoch": 0.9108452335005789, + "grad_norm": 0.43052351474761963, + "learning_rate": 2.084847603012646e-06, + "loss": 1.848, + "step": 5015 + }, + { + "epoch": 0.9110268576747566, + "grad_norm": 0.43378910422325134, + "learning_rate": 2.076406037513212e-06, + "loss": 1.5782, + "step": 5016 + }, + { + "epoch": 0.9112084818489341, + "grad_norm": 0.3732862174510956, + "learning_rate": 2.06798123428949e-06, + "loss": 1.7198, + "step": 5017 + }, + { + "epoch": 0.9113901060231117, + "grad_norm": 0.3563593626022339, + "learning_rate": 2.0595731962882338e-06, + "loss": 1.762, + "step": 5018 + }, + { + "epoch": 0.9115717301972892, + "grad_norm": 0.42713767290115356, + "learning_rate": 2.0511819264503295e-06, + "loss": 1.6542, + "step": 5019 + }, + { + "epoch": 0.9117533543714669, + "grad_norm": 0.30379483103752136, + "learning_rate": 2.042807427710802e-06, + "loss": 1.5848, + "step": 5020 + }, + { + "epoch": 0.9119349785456444, + "grad_norm": 0.39307528734207153, + "learning_rate": 2.0344497029988086e-06, + "loss": 1.6565, + "step": 5021 + }, + { + "epoch": 0.912116602719822, + "grad_norm": 0.8538019061088562, + "learning_rate": 2.026108755237632e-06, + "loss": 1.828, + "step": 5022 + }, + { + "epoch": 0.9122982268939996, + "grad_norm": 0.45637422800064087, + "learning_rate": 2.01778458734469e-06, + "loss": 1.8251, + "step": 5023 + }, + { + "epoch": 0.9124798510681772, + "grad_norm": 0.31902262568473816, + "learning_rate": 2.0094772022315467e-06, + "loss": 1.6815, + "step": 5024 + }, + { + "epoch": 0.9126614752423547, + "grad_norm": 0.5378137230873108, + "learning_rate": 2.0011866028038617e-06, + "loss": 1.647, + "step": 5025 + }, + { + "epoch": 0.9128430994165323, + "grad_norm": 0.3386855721473694, + "learning_rate": 1.9929127919614653e-06, + "loss": 1.8568, + "step": 5026 + }, + { + "epoch": 0.91302472359071, + "grad_norm": 0.3820885717868805, + "learning_rate": 1.9846557725982817e-06, + "loss": 1.76, + "step": 5027 + }, + { + "epoch": 0.9132063477648875, + "grad_norm": 0.3061985969543457, + "learning_rate": 1.9764155476023895e-06, + "loss": 1.6351, + "step": 5028 + }, + { + "epoch": 0.9133879719390651, + "grad_norm": 0.34956687688827515, + "learning_rate": 1.9681921198559716e-06, + "loss": 1.7478, + "step": 5029 + }, + { + "epoch": 0.9135695961132426, + "grad_norm": 0.33033329248428345, + "learning_rate": 1.9599854922353335e-06, + "loss": 1.7244, + "step": 5030 + }, + { + "epoch": 0.9137512202874203, + "grad_norm": 0.3710954785346985, + "learning_rate": 1.951795667610928e-06, + "loss": 1.7867, + "step": 5031 + }, + { + "epoch": 0.9139328444615978, + "grad_norm": 0.3441013693809509, + "learning_rate": 1.94362264884731e-06, + "loss": 1.572, + "step": 5032 + }, + { + "epoch": 0.9141144686357754, + "grad_norm": 0.408074289560318, + "learning_rate": 1.9354664388031684e-06, + "loss": 1.7389, + "step": 5033 + }, + { + "epoch": 0.914296092809953, + "grad_norm": 0.4142604172229767, + "learning_rate": 1.927327040331306e-06, + "loss": 1.6726, + "step": 5034 + }, + { + "epoch": 0.9144777169841306, + "grad_norm": 0.39636921882629395, + "learning_rate": 1.9192044562786504e-06, + "loss": 1.6058, + "step": 5035 + }, + { + "epoch": 0.9146593411583082, + "grad_norm": 0.41392698884010315, + "learning_rate": 1.911098689486229e-06, + "loss": 1.5285, + "step": 5036 + }, + { + "epoch": 0.9148409653324857, + "grad_norm": 0.36246582865715027, + "learning_rate": 1.9030097427892134e-06, + "loss": 1.6311, + "step": 5037 + }, + { + "epoch": 0.9150225895066634, + "grad_norm": 0.39901238679885864, + "learning_rate": 1.8949376190168899e-06, + "loss": 1.6023, + "step": 5038 + }, + { + "epoch": 0.9152042136808409, + "grad_norm": 0.3883909583091736, + "learning_rate": 1.8868823209926389e-06, + "loss": 1.5927, + "step": 5039 + }, + { + "epoch": 0.9153858378550185, + "grad_norm": 0.41448143124580383, + "learning_rate": 1.8788438515339734e-06, + "loss": 1.9497, + "step": 5040 + }, + { + "epoch": 0.915567462029196, + "grad_norm": 0.36838796734809875, + "learning_rate": 1.8708222134525167e-06, + "loss": 1.6344, + "step": 5041 + }, + { + "epoch": 0.9157490862033737, + "grad_norm": 0.6344767808914185, + "learning_rate": 1.8628174095540073e-06, + "loss": 1.5686, + "step": 5042 + }, + { + "epoch": 0.9159307103775512, + "grad_norm": 0.3496866524219513, + "learning_rate": 1.8548294426382784e-06, + "loss": 1.4468, + "step": 5043 + }, + { + "epoch": 0.9161123345517288, + "grad_norm": 0.39987504482269287, + "learning_rate": 1.8468583154993002e-06, + "loss": 1.569, + "step": 5044 + }, + { + "epoch": 0.9162939587259065, + "grad_norm": 0.44675663113594055, + "learning_rate": 1.838904030925137e-06, + "loss": 1.7791, + "step": 5045 + }, + { + "epoch": 0.916475582900084, + "grad_norm": 0.3922137916088104, + "learning_rate": 1.830966591697969e-06, + "loss": 1.7219, + "step": 5046 + }, + { + "epoch": 0.9166572070742616, + "grad_norm": 0.5028101801872253, + "learning_rate": 1.823046000594081e-06, + "loss": 1.5952, + "step": 5047 + }, + { + "epoch": 0.9168388312484391, + "grad_norm": 0.3847370147705078, + "learning_rate": 1.815142260383862e-06, + "loss": 1.631, + "step": 5048 + }, + { + "epoch": 0.9170204554226168, + "grad_norm": 0.3260517120361328, + "learning_rate": 1.807255373831801e-06, + "loss": 1.7767, + "step": 5049 + }, + { + "epoch": 0.9172020795967943, + "grad_norm": 0.4284895956516266, + "learning_rate": 1.7993853436965137e-06, + "loss": 1.8988, + "step": 5050 + }, + { + "epoch": 0.9173837037709719, + "grad_norm": 0.4890294075012207, + "learning_rate": 1.7915321727307088e-06, + "loss": 1.6704, + "step": 5051 + }, + { + "epoch": 0.9175653279451494, + "grad_norm": 0.6487362384796143, + "learning_rate": 1.783695863681184e-06, + "loss": 1.6244, + "step": 5052 + }, + { + "epoch": 0.9177469521193271, + "grad_norm": 0.3841184675693512, + "learning_rate": 1.7758764192888576e-06, + "loss": 1.83, + "step": 5053 + }, + { + "epoch": 0.9179285762935047, + "grad_norm": 0.45139244198799133, + "learning_rate": 1.7680738422887534e-06, + "loss": 1.7824, + "step": 5054 + }, + { + "epoch": 0.9181102004676822, + "grad_norm": 0.35119393467903137, + "learning_rate": 1.7602881354099709e-06, + "loss": 1.7441, + "step": 5055 + }, + { + "epoch": 0.9182918246418599, + "grad_norm": 0.37626707553863525, + "learning_rate": 1.7525193013757269e-06, + "loss": 1.684, + "step": 5056 + }, + { + "epoch": 0.9184734488160374, + "grad_norm": 0.38829880952835083, + "learning_rate": 1.7447673429033362e-06, + "loss": 1.8967, + "step": 5057 + }, + { + "epoch": 0.918655072990215, + "grad_norm": 1.444040298461914, + "learning_rate": 1.7370322627042024e-06, + "loss": 1.886, + "step": 5058 + }, + { + "epoch": 0.9188366971643925, + "grad_norm": 0.4088570475578308, + "learning_rate": 1.7293140634838445e-06, + "loss": 1.6757, + "step": 5059 + }, + { + "epoch": 0.9190183213385702, + "grad_norm": 0.4276229739189148, + "learning_rate": 1.7216127479418476e-06, + "loss": 1.7792, + "step": 5060 + }, + { + "epoch": 0.9191999455127478, + "grad_norm": 0.42666271328926086, + "learning_rate": 1.7139283187719124e-06, + "loss": 1.5521, + "step": 5061 + }, + { + "epoch": 0.9193815696869253, + "grad_norm": 0.39894258975982666, + "learning_rate": 1.7062607786618278e-06, + "loss": 1.7968, + "step": 5062 + }, + { + "epoch": 0.919563193861103, + "grad_norm": 0.34544146060943604, + "learning_rate": 1.6986101302934821e-06, + "loss": 1.6275, + "step": 5063 + }, + { + "epoch": 0.9197448180352805, + "grad_norm": 0.32896414399147034, + "learning_rate": 1.6909763763428455e-06, + "loss": 1.6398, + "step": 5064 + }, + { + "epoch": 0.9199264422094581, + "grad_norm": 1.7664613723754883, + "learning_rate": 1.6833595194799768e-06, + "loss": 1.8057, + "step": 5065 + }, + { + "epoch": 0.9201080663836356, + "grad_norm": 0.3464670181274414, + "learning_rate": 1.6757595623690336e-06, + "loss": 1.8674, + "step": 5066 + }, + { + "epoch": 0.9202896905578133, + "grad_norm": 0.379447877407074, + "learning_rate": 1.6681765076682677e-06, + "loss": 1.7382, + "step": 5067 + }, + { + "epoch": 0.9204713147319908, + "grad_norm": 0.3463670015335083, + "learning_rate": 1.6606103580299959e-06, + "loss": 1.7946, + "step": 5068 + }, + { + "epoch": 0.9206529389061684, + "grad_norm": 0.38972151279449463, + "learning_rate": 1.6530611161006515e-06, + "loss": 1.7633, + "step": 5069 + }, + { + "epoch": 0.920834563080346, + "grad_norm": 0.3269205093383789, + "learning_rate": 1.6455287845207279e-06, + "loss": 1.6619, + "step": 5070 + }, + { + "epoch": 0.9210161872545236, + "grad_norm": 0.37785500288009644, + "learning_rate": 1.6380133659248176e-06, + "loss": 1.7929, + "step": 5071 + }, + { + "epoch": 0.9211978114287012, + "grad_norm": 0.6982583999633789, + "learning_rate": 1.6305148629416012e-06, + "loss": 1.8984, + "step": 5072 + }, + { + "epoch": 0.9213794356028787, + "grad_norm": 0.42645296454429626, + "learning_rate": 1.623033278193825e-06, + "loss": 1.8412, + "step": 5073 + }, + { + "epoch": 0.9215610597770564, + "grad_norm": 0.7901811003684998, + "learning_rate": 1.6155686142983406e-06, + "loss": 1.8272, + "step": 5074 + }, + { + "epoch": 0.9217426839512339, + "grad_norm": 0.3433598279953003, + "learning_rate": 1.6081208738660591e-06, + "loss": 1.6596, + "step": 5075 + }, + { + "epoch": 0.9219243081254115, + "grad_norm": 0.29812490940093994, + "learning_rate": 1.6006900595019913e-06, + "loss": 1.5754, + "step": 5076 + }, + { + "epoch": 0.922105932299589, + "grad_norm": 0.3293648660182953, + "learning_rate": 1.5932761738052193e-06, + "loss": 1.6017, + "step": 5077 + }, + { + "epoch": 0.9222875564737667, + "grad_norm": 0.4386688470840454, + "learning_rate": 1.5858792193689077e-06, + "loss": 1.8148, + "step": 5078 + }, + { + "epoch": 0.9224691806479443, + "grad_norm": 0.3885430693626404, + "learning_rate": 1.578499198780281e-06, + "loss": 1.6346, + "step": 5079 + }, + { + "epoch": 0.9226508048221218, + "grad_norm": 0.3024516701698303, + "learning_rate": 1.5711361146206638e-06, + "loss": 1.5511, + "step": 5080 + }, + { + "epoch": 0.9228324289962994, + "grad_norm": 0.36012083292007446, + "learning_rate": 1.5637899694654456e-06, + "loss": 1.6043, + "step": 5081 + }, + { + "epoch": 0.923014053170477, + "grad_norm": 0.32003259658813477, + "learning_rate": 1.556460765884099e-06, + "loss": 1.5944, + "step": 5082 + }, + { + "epoch": 0.9231956773446546, + "grad_norm": 0.36757737398147583, + "learning_rate": 1.5491485064401623e-06, + "loss": 1.6506, + "step": 5083 + }, + { + "epoch": 0.9233773015188321, + "grad_norm": 0.398034930229187, + "learning_rate": 1.5418531936912506e-06, + "loss": 1.6302, + "step": 5084 + }, + { + "epoch": 0.9235589256930098, + "grad_norm": 0.6276927590370178, + "learning_rate": 1.5345748301890561e-06, + "loss": 1.6715, + "step": 5085 + }, + { + "epoch": 0.9237405498671873, + "grad_norm": 0.4226553738117218, + "learning_rate": 1.5273134184793314e-06, + "loss": 1.7377, + "step": 5086 + }, + { + "epoch": 0.9239221740413649, + "grad_norm": 0.7353553175926208, + "learning_rate": 1.520068961101906e-06, + "loss": 1.8636, + "step": 5087 + }, + { + "epoch": 0.9241037982155424, + "grad_norm": 0.48388567566871643, + "learning_rate": 1.512841460590686e-06, + "loss": 1.7406, + "step": 5088 + }, + { + "epoch": 0.9242854223897201, + "grad_norm": 0.5194151997566223, + "learning_rate": 1.5056309194736384e-06, + "loss": 1.7654, + "step": 5089 + }, + { + "epoch": 0.9244670465638977, + "grad_norm": 0.4186556041240692, + "learning_rate": 1.4984373402728014e-06, + "loss": 1.8679, + "step": 5090 + }, + { + "epoch": 0.9246486707380752, + "grad_norm": 0.42442432045936584, + "learning_rate": 1.4912607255042787e-06, + "loss": 1.6905, + "step": 5091 + }, + { + "epoch": 0.9248302949122528, + "grad_norm": 0.44191110134124756, + "learning_rate": 1.4841010776782293e-06, + "loss": 1.7312, + "step": 5092 + }, + { + "epoch": 0.9250119190864304, + "grad_norm": 0.534186601638794, + "learning_rate": 1.4769583992989056e-06, + "loss": 1.6869, + "step": 5093 + }, + { + "epoch": 0.925193543260608, + "grad_norm": 0.5317513346672058, + "learning_rate": 1.4698326928646033e-06, + "loss": 1.8049, + "step": 5094 + }, + { + "epoch": 0.9253751674347855, + "grad_norm": 0.6315459609031677, + "learning_rate": 1.4627239608676845e-06, + "loss": 1.8046, + "step": 5095 + }, + { + "epoch": 0.9255567916089632, + "grad_norm": 0.3731791377067566, + "learning_rate": 1.455632205794577e-06, + "loss": 1.7212, + "step": 5096 + }, + { + "epoch": 0.9257384157831408, + "grad_norm": 0.3260796070098877, + "learning_rate": 1.4485574301257687e-06, + "loss": 1.8066, + "step": 5097 + }, + { + "epoch": 0.9259200399573183, + "grad_norm": 0.35420361161231995, + "learning_rate": 1.4414996363358135e-06, + "loss": 1.9643, + "step": 5098 + }, + { + "epoch": 0.9261016641314959, + "grad_norm": 0.5375301241874695, + "learning_rate": 1.4344588268933145e-06, + "loss": 1.6463, + "step": 5099 + }, + { + "epoch": 0.9262832883056735, + "grad_norm": 0.4622683525085449, + "learning_rate": 1.4274350042609518e-06, + "loss": 1.5348, + "step": 5100 + }, + { + "epoch": 0.9264649124798511, + "grad_norm": 0.5144912004470825, + "learning_rate": 1.4204281708954437e-06, + "loss": 1.6262, + "step": 5101 + }, + { + "epoch": 0.9266465366540286, + "grad_norm": 0.3530748188495636, + "learning_rate": 1.413438329247585e-06, + "loss": 1.584, + "step": 5102 + }, + { + "epoch": 0.9268281608282062, + "grad_norm": 0.39008134603500366, + "learning_rate": 1.4064654817622147e-06, + "loss": 1.8104, + "step": 5103 + }, + { + "epoch": 0.9270097850023838, + "grad_norm": 0.33599650859832764, + "learning_rate": 1.3995096308782318e-06, + "loss": 1.7373, + "step": 5104 + }, + { + "epoch": 0.9271914091765614, + "grad_norm": 0.41612884402275085, + "learning_rate": 1.3925707790285846e-06, + "loss": 1.5744, + "step": 5105 + }, + { + "epoch": 0.927373033350739, + "grad_norm": 0.32163843512535095, + "learning_rate": 1.3856489286402874e-06, + "loss": 1.784, + "step": 5106 + }, + { + "epoch": 0.9275546575249166, + "grad_norm": 0.3438540995121002, + "learning_rate": 1.3787440821344032e-06, + "loss": 1.7562, + "step": 5107 + }, + { + "epoch": 0.9277362816990942, + "grad_norm": 0.4504026174545288, + "learning_rate": 1.371856241926045e-06, + "loss": 1.7484, + "step": 5108 + }, + { + "epoch": 0.9279179058732717, + "grad_norm": 0.38208577036857605, + "learning_rate": 1.3649854104243797e-06, + "loss": 1.5469, + "step": 5109 + }, + { + "epoch": 0.9280995300474493, + "grad_norm": 0.41364380717277527, + "learning_rate": 1.358131590032613e-06, + "loss": 1.7952, + "step": 5110 + }, + { + "epoch": 0.9282811542216269, + "grad_norm": 0.39007261395454407, + "learning_rate": 1.3512947831480217e-06, + "loss": 1.6057, + "step": 5111 + }, + { + "epoch": 0.9284627783958045, + "grad_norm": 0.5210568308830261, + "learning_rate": 1.344474992161915e-06, + "loss": 1.5632, + "step": 5112 + }, + { + "epoch": 0.928644402569982, + "grad_norm": 0.42912763357162476, + "learning_rate": 1.337672219459668e-06, + "loss": 1.8277, + "step": 5113 + }, + { + "epoch": 0.9288260267441596, + "grad_norm": 0.3096025586128235, + "learning_rate": 1.3308864674206833e-06, + "loss": 1.7555, + "step": 5114 + }, + { + "epoch": 0.9290076509183373, + "grad_norm": 1.07957923412323, + "learning_rate": 1.3241177384184179e-06, + "loss": 1.8254, + "step": 5115 + }, + { + "epoch": 0.9291892750925148, + "grad_norm": 0.42560696601867676, + "learning_rate": 1.317366034820383e-06, + "loss": 1.769, + "step": 5116 + }, + { + "epoch": 0.9293708992666924, + "grad_norm": 0.5368350148200989, + "learning_rate": 1.310631358988118e-06, + "loss": 1.8696, + "step": 5117 + }, + { + "epoch": 0.92955252344087, + "grad_norm": 0.3580041229724884, + "learning_rate": 1.3039137132772216e-06, + "loss": 1.6915, + "step": 5118 + }, + { + "epoch": 0.9297341476150476, + "grad_norm": 0.4295887351036072, + "learning_rate": 1.297213100037331e-06, + "loss": 1.7224, + "step": 5119 + }, + { + "epoch": 0.9299157717892251, + "grad_norm": 0.32962727546691895, + "learning_rate": 1.2905295216121327e-06, + "loss": 1.7303, + "step": 5120 + }, + { + "epoch": 0.9300973959634027, + "grad_norm": 0.30523011088371277, + "learning_rate": 1.2838629803393342e-06, + "loss": 1.6154, + "step": 5121 + }, + { + "epoch": 0.9302790201375803, + "grad_norm": 1.1493028402328491, + "learning_rate": 1.277213478550704e-06, + "loss": 1.8971, + "step": 5122 + }, + { + "epoch": 0.9304606443117579, + "grad_norm": 0.3185190260410309, + "learning_rate": 1.2705810185720368e-06, + "loss": 1.6626, + "step": 5123 + }, + { + "epoch": 0.9306422684859355, + "grad_norm": 0.3485655188560486, + "learning_rate": 1.2639656027231773e-06, + "loss": 1.7515, + "step": 5124 + }, + { + "epoch": 0.930823892660113, + "grad_norm": 0.5146001577377319, + "learning_rate": 1.2573672333180186e-06, + "loss": 1.7841, + "step": 5125 + }, + { + "epoch": 0.9310055168342907, + "grad_norm": 0.43456152081489563, + "learning_rate": 1.2507859126644595e-06, + "loss": 1.8351, + "step": 5126 + }, + { + "epoch": 0.9311871410084682, + "grad_norm": 0.3352729380130768, + "learning_rate": 1.2442216430644638e-06, + "loss": 1.9018, + "step": 5127 + }, + { + "epoch": 0.9313687651826458, + "grad_norm": 0.41262805461883545, + "learning_rate": 1.2376744268140173e-06, + "loss": 1.5677, + "step": 5128 + }, + { + "epoch": 0.9315503893568234, + "grad_norm": 0.3422653377056122, + "learning_rate": 1.2311442662031492e-06, + "loss": 1.6733, + "step": 5129 + }, + { + "epoch": 0.931732013531001, + "grad_norm": 0.4678698480129242, + "learning_rate": 1.2246311635159102e-06, + "loss": 1.5394, + "step": 5130 + }, + { + "epoch": 0.9319136377051785, + "grad_norm": 0.8478350043296814, + "learning_rate": 1.218135121030406e-06, + "loss": 1.7116, + "step": 5131 + }, + { + "epoch": 0.9320952618793561, + "grad_norm": 0.3607379198074341, + "learning_rate": 1.211656141018752e-06, + "loss": 1.5954, + "step": 5132 + }, + { + "epoch": 0.9322768860535338, + "grad_norm": 0.40608781576156616, + "learning_rate": 1.2051942257471193e-06, + "loss": 1.8035, + "step": 5133 + }, + { + "epoch": 0.9324585102277113, + "grad_norm": 0.36697450280189514, + "learning_rate": 1.1987493774756885e-06, + "loss": 1.9248, + "step": 5134 + }, + { + "epoch": 0.9326401344018889, + "grad_norm": 0.31130561232566833, + "learning_rate": 1.192321598458679e-06, + "loss": 1.7466, + "step": 5135 + }, + { + "epoch": 0.9328217585760665, + "grad_norm": 0.7542583346366882, + "learning_rate": 1.1859108909443416e-06, + "loss": 1.4695, + "step": 5136 + }, + { + "epoch": 0.9330033827502441, + "grad_norm": 0.7104910612106323, + "learning_rate": 1.1795172571749501e-06, + "loss": 1.8229, + "step": 5137 + }, + { + "epoch": 0.9331850069244216, + "grad_norm": 0.3681411147117615, + "learning_rate": 1.1731406993868266e-06, + "loss": 1.6857, + "step": 5138 + }, + { + "epoch": 0.9333666310985992, + "grad_norm": 0.3336416184902191, + "learning_rate": 1.1667812198102924e-06, + "loss": 1.5839, + "step": 5139 + }, + { + "epoch": 0.9335482552727769, + "grad_norm": 0.3645172119140625, + "learning_rate": 1.1604388206697125e-06, + "loss": 1.634, + "step": 5140 + }, + { + "epoch": 0.9337298794469544, + "grad_norm": 0.6536900997161865, + "learning_rate": 1.1541135041834628e-06, + "loss": 1.5055, + "step": 5141 + }, + { + "epoch": 0.933911503621132, + "grad_norm": 1.0193183422088623, + "learning_rate": 1.1478052725639733e-06, + "loss": 1.8019, + "step": 5142 + }, + { + "epoch": 0.9340931277953095, + "grad_norm": 0.5205622911453247, + "learning_rate": 1.1415141280176621e-06, + "loss": 1.6432, + "step": 5143 + }, + { + "epoch": 0.9342747519694872, + "grad_norm": 1.0251789093017578, + "learning_rate": 1.1352400727449975e-06, + "loss": 1.8151, + "step": 5144 + }, + { + "epoch": 0.9344563761436647, + "grad_norm": 0.45335787534713745, + "learning_rate": 1.1289831089404567e-06, + "loss": 1.5156, + "step": 5145 + }, + { + "epoch": 0.9346380003178423, + "grad_norm": 0.355452299118042, + "learning_rate": 1.1227432387925507e-06, + "loss": 1.7643, + "step": 5146 + }, + { + "epoch": 0.9348196244920199, + "grad_norm": 0.47115209698677063, + "learning_rate": 1.1165204644838002e-06, + "loss": 1.733, + "step": 5147 + }, + { + "epoch": 0.9350012486661975, + "grad_norm": 0.7538847327232361, + "learning_rate": 1.1103147881907417e-06, + "loss": 1.7387, + "step": 5148 + }, + { + "epoch": 0.935182872840375, + "grad_norm": 0.48022669553756714, + "learning_rate": 1.1041262120839502e-06, + "loss": 1.5159, + "step": 5149 + }, + { + "epoch": 0.9353644970145526, + "grad_norm": 0.5254223942756653, + "learning_rate": 1.0979547383280053e-06, + "loss": 1.9946, + "step": 5150 + }, + { + "epoch": 0.9355461211887303, + "grad_norm": 0.3413410186767578, + "learning_rate": 1.0918003690815138e-06, + "loss": 1.7089, + "step": 5151 + }, + { + "epoch": 0.9357277453629078, + "grad_norm": 0.39498233795166016, + "learning_rate": 1.0856631064970868e-06, + "loss": 1.565, + "step": 5152 + }, + { + "epoch": 0.9359093695370854, + "grad_norm": 0.6022232174873352, + "learning_rate": 1.0795429527213686e-06, + "loss": 1.6918, + "step": 5153 + }, + { + "epoch": 0.9360909937112629, + "grad_norm": 0.42635849118232727, + "learning_rate": 1.0734399098949966e-06, + "loss": 1.8511, + "step": 5154 + }, + { + "epoch": 0.9362726178854406, + "grad_norm": 2.068349599838257, + "learning_rate": 1.0673539801526466e-06, + "loss": 1.8466, + "step": 5155 + }, + { + "epoch": 0.9364542420596181, + "grad_norm": 0.38816702365875244, + "learning_rate": 1.0612851656229995e-06, + "loss": 1.6954, + "step": 5156 + }, + { + "epoch": 0.9366358662337957, + "grad_norm": 0.36314499378204346, + "learning_rate": 1.0552334684287513e-06, + "loss": 1.7549, + "step": 5157 + }, + { + "epoch": 0.9368174904079734, + "grad_norm": 0.42347100377082825, + "learning_rate": 1.0491988906866035e-06, + "loss": 1.7158, + "step": 5158 + }, + { + "epoch": 0.9369991145821509, + "grad_norm": 0.42483803629875183, + "learning_rate": 1.0431814345072787e-06, + "loss": 1.7739, + "step": 5159 + }, + { + "epoch": 0.9371807387563285, + "grad_norm": 0.39330440759658813, + "learning_rate": 1.0371811019955101e-06, + "loss": 1.8319, + "step": 5160 + }, + { + "epoch": 0.937362362930506, + "grad_norm": 0.371390700340271, + "learning_rate": 1.03119789525003e-06, + "loss": 1.5569, + "step": 5161 + }, + { + "epoch": 0.9375439871046837, + "grad_norm": 0.3158995807170868, + "learning_rate": 1.0252318163636032e-06, + "loss": 1.5063, + "step": 5162 + }, + { + "epoch": 0.9377256112788612, + "grad_norm": 0.32861456274986267, + "learning_rate": 1.0192828674229715e-06, + "loss": 1.8465, + "step": 5163 + }, + { + "epoch": 0.9379072354530388, + "grad_norm": 0.5962280631065369, + "learning_rate": 1.0133510505089262e-06, + "loss": 1.7243, + "step": 5164 + }, + { + "epoch": 0.9380888596272163, + "grad_norm": 0.8792483806610107, + "learning_rate": 1.0074363676962295e-06, + "loss": 1.5312, + "step": 5165 + }, + { + "epoch": 0.938270483801394, + "grad_norm": 0.43340224027633667, + "learning_rate": 1.0015388210536714e-06, + "loss": 1.6863, + "step": 5166 + }, + { + "epoch": 0.9384521079755715, + "grad_norm": 0.6920676827430725, + "learning_rate": 9.956584126440294e-07, + "loss": 1.6355, + "step": 5167 + }, + { + "epoch": 0.9386337321497491, + "grad_norm": 0.3976867198944092, + "learning_rate": 9.89795144524114e-07, + "loss": 1.5856, + "step": 5168 + }, + { + "epoch": 0.9388153563239268, + "grad_norm": 0.521247148513794, + "learning_rate": 9.839490187447177e-07, + "loss": 1.6457, + "step": 5169 + }, + { + "epoch": 0.9389969804981043, + "grad_norm": 0.44754958152770996, + "learning_rate": 9.781200373506494e-07, + "loss": 1.7867, + "step": 5170 + }, + { + "epoch": 0.9391786046722819, + "grad_norm": 0.38678672909736633, + "learning_rate": 9.723082023807118e-07, + "loss": 1.9046, + "step": 5171 + }, + { + "epoch": 0.9393602288464594, + "grad_norm": 0.7708633542060852, + "learning_rate": 9.665135158677175e-07, + "loss": 1.6215, + "step": 5172 + }, + { + "epoch": 0.9395418530206371, + "grad_norm": 0.4604766368865967, + "learning_rate": 9.607359798384785e-07, + "loss": 1.6061, + "step": 5173 + }, + { + "epoch": 0.9397234771948146, + "grad_norm": 0.39303871989250183, + "learning_rate": 9.549755963138064e-07, + "loss": 1.7439, + "step": 5174 + }, + { + "epoch": 0.9399051013689922, + "grad_norm": 0.6259109973907471, + "learning_rate": 9.492323673085224e-07, + "loss": 1.7649, + "step": 5175 + }, + { + "epoch": 0.9400867255431697, + "grad_norm": 0.4126527011394501, + "learning_rate": 9.435062948314366e-07, + "loss": 1.6859, + "step": 5176 + }, + { + "epoch": 0.9402683497173474, + "grad_norm": 0.4331003725528717, + "learning_rate": 9.37797380885358e-07, + "loss": 1.6458, + "step": 5177 + }, + { + "epoch": 0.940449973891525, + "grad_norm": 0.41992610692977905, + "learning_rate": 9.321056274671059e-07, + "loss": 1.7807, + "step": 5178 + }, + { + "epoch": 0.9406315980657025, + "grad_norm": 0.3423101305961609, + "learning_rate": 9.264310365674767e-07, + "loss": 1.6031, + "step": 5179 + }, + { + "epoch": 0.9408132222398802, + "grad_norm": 0.9915091395378113, + "learning_rate": 9.207736101712883e-07, + "loss": 1.5597, + "step": 5180 + }, + { + "epoch": 0.9409948464140577, + "grad_norm": 0.3492004871368408, + "learning_rate": 9.151333502573467e-07, + "loss": 1.5387, + "step": 5181 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.3248469829559326, + "learning_rate": 9.095102587984406e-07, + "loss": 1.817, + "step": 5182 + }, + { + "epoch": 0.9413580947624128, + "grad_norm": 0.4107282757759094, + "learning_rate": 9.039043377613743e-07, + "loss": 1.8359, + "step": 5183 + }, + { + "epoch": 0.9415397189365905, + "grad_norm": 0.3478221893310547, + "learning_rate": 8.983155891069184e-07, + "loss": 1.6159, + "step": 5184 + }, + { + "epoch": 0.941721343110768, + "grad_norm": 0.5238373875617981, + "learning_rate": 8.927440147898702e-07, + "loss": 1.658, + "step": 5185 + }, + { + "epoch": 0.9419029672849456, + "grad_norm": 0.6713945269584656, + "learning_rate": 8.871896167589933e-07, + "loss": 1.7771, + "step": 5186 + }, + { + "epoch": 0.9420845914591232, + "grad_norm": 0.41585108637809753, + "learning_rate": 8.816523969570611e-07, + "loss": 1.7192, + "step": 5187 + }, + { + "epoch": 0.9422662156333008, + "grad_norm": 0.3650790750980377, + "learning_rate": 8.761323573208302e-07, + "loss": 1.5565, + "step": 5188 + }, + { + "epoch": 0.9424478398074784, + "grad_norm": 0.5291646718978882, + "learning_rate": 8.706294997810449e-07, + "loss": 1.6121, + "step": 5189 + }, + { + "epoch": 0.9426294639816559, + "grad_norm": 0.7705767154693604, + "learning_rate": 8.651438262624545e-07, + "loss": 1.6147, + "step": 5190 + }, + { + "epoch": 0.9428110881558336, + "grad_norm": 0.4039137065410614, + "learning_rate": 8.596753386837797e-07, + "loss": 1.6484, + "step": 5191 + }, + { + "epoch": 0.9429927123300111, + "grad_norm": 1.2596489191055298, + "learning_rate": 8.542240389577349e-07, + "loss": 1.8745, + "step": 5192 + }, + { + "epoch": 0.9431743365041887, + "grad_norm": 0.5373148918151855, + "learning_rate": 8.487899289910284e-07, + "loss": 1.6958, + "step": 5193 + }, + { + "epoch": 0.9433559606783662, + "grad_norm": 0.30480682849884033, + "learning_rate": 8.433730106843618e-07, + "loss": 1.7625, + "step": 5194 + }, + { + "epoch": 0.9435375848525439, + "grad_norm": 0.47384917736053467, + "learning_rate": 8.379732859324085e-07, + "loss": 1.891, + "step": 5195 + }, + { + "epoch": 0.9437192090267215, + "grad_norm": 0.3940463066101074, + "learning_rate": 8.325907566238355e-07, + "loss": 1.5855, + "step": 5196 + }, + { + "epoch": 0.943900833200899, + "grad_norm": 1.406032681465149, + "learning_rate": 8.272254246412925e-07, + "loss": 1.7228, + "step": 5197 + }, + { + "epoch": 0.9440824573750767, + "grad_norm": 0.3305497169494629, + "learning_rate": 8.218772918614171e-07, + "loss": 1.8367, + "step": 5198 + }, + { + "epoch": 0.9442640815492542, + "grad_norm": 0.4536740183830261, + "learning_rate": 8.165463601548295e-07, + "loss": 1.668, + "step": 5199 + }, + { + "epoch": 0.9444457057234318, + "grad_norm": 0.3494042456150055, + "learning_rate": 8.112326313861385e-07, + "loss": 1.7607, + "step": 5200 + }, + { + "epoch": 0.9446273298976093, + "grad_norm": 0.4304828345775604, + "learning_rate": 8.059361074139293e-07, + "loss": 1.7145, + "step": 5201 + }, + { + "epoch": 0.944808954071787, + "grad_norm": 0.7016861438751221, + "learning_rate": 8.006567900907646e-07, + "loss": 1.8628, + "step": 5202 + }, + { + "epoch": 0.9449905782459646, + "grad_norm": 0.32828289270401, + "learning_rate": 7.953946812632063e-07, + "loss": 1.5143, + "step": 5203 + }, + { + "epoch": 0.9451722024201421, + "grad_norm": 0.4091377556324005, + "learning_rate": 7.901497827717818e-07, + "loss": 1.8158, + "step": 5204 + }, + { + "epoch": 0.9453538265943197, + "grad_norm": 1.4855895042419434, + "learning_rate": 7.849220964510073e-07, + "loss": 1.6762, + "step": 5205 + }, + { + "epoch": 0.9455354507684973, + "grad_norm": 0.37777024507522583, + "learning_rate": 7.797116241293645e-07, + "loss": 1.7564, + "step": 5206 + }, + { + "epoch": 0.9457170749426749, + "grad_norm": 0.5404534339904785, + "learning_rate": 7.745183676293343e-07, + "loss": 1.6597, + "step": 5207 + }, + { + "epoch": 0.9458986991168524, + "grad_norm": 1.4411555528640747, + "learning_rate": 7.693423287673695e-07, + "loss": 1.7045, + "step": 5208 + }, + { + "epoch": 0.9460803232910301, + "grad_norm": 0.42405563592910767, + "learning_rate": 7.641835093538885e-07, + "loss": 1.6185, + "step": 5209 + }, + { + "epoch": 0.9462619474652076, + "grad_norm": 0.3075677156448364, + "learning_rate": 7.590419111932978e-07, + "loss": 1.7318, + "step": 5210 + }, + { + "epoch": 0.9464435716393852, + "grad_norm": 0.36867958307266235, + "learning_rate": 7.539175360839812e-07, + "loss": 1.8353, + "step": 5211 + }, + { + "epoch": 0.9466251958135627, + "grad_norm": 0.3750384747982025, + "learning_rate": 7.488103858182938e-07, + "loss": 1.7057, + "step": 5212 + }, + { + "epoch": 0.9468068199877404, + "grad_norm": 0.4907868802547455, + "learning_rate": 7.437204621825733e-07, + "loss": 1.871, + "step": 5213 + }, + { + "epoch": 0.946988444161918, + "grad_norm": 0.3916128873825073, + "learning_rate": 7.386477669571179e-07, + "loss": 1.6633, + "step": 5214 + }, + { + "epoch": 0.9471700683360955, + "grad_norm": 0.37795743346214294, + "learning_rate": 7.335923019162139e-07, + "loss": 1.5381, + "step": 5215 + }, + { + "epoch": 0.9473516925102731, + "grad_norm": 0.29362359642982483, + "learning_rate": 7.285540688281133e-07, + "loss": 1.6502, + "step": 5216 + }, + { + "epoch": 0.9475333166844507, + "grad_norm": 0.8097910284996033, + "learning_rate": 7.235330694550402e-07, + "loss": 1.84, + "step": 5217 + }, + { + "epoch": 0.9477149408586283, + "grad_norm": 0.6675203442573547, + "learning_rate": 7.185293055532061e-07, + "loss": 1.8609, + "step": 5218 + }, + { + "epoch": 0.9478965650328058, + "grad_norm": 0.41805580258369446, + "learning_rate": 7.135427788727666e-07, + "loss": 1.7543, + "step": 5219 + }, + { + "epoch": 0.9480781892069835, + "grad_norm": 0.5496622920036316, + "learning_rate": 7.085734911578712e-07, + "loss": 1.8559, + "step": 5220 + }, + { + "epoch": 0.948259813381161, + "grad_norm": 0.3521918058395386, + "learning_rate": 7.036214441466348e-07, + "loss": 1.668, + "step": 5221 + }, + { + "epoch": 0.9484414375553386, + "grad_norm": 0.39750340580940247, + "learning_rate": 6.986866395711277e-07, + "loss": 1.7177, + "step": 5222 + }, + { + "epoch": 0.9486230617295162, + "grad_norm": 0.4626479148864746, + "learning_rate": 6.937690791574137e-07, + "loss": 1.7921, + "step": 5223 + }, + { + "epoch": 0.9488046859036938, + "grad_norm": 0.3825419545173645, + "learning_rate": 6.888687646254999e-07, + "loss": 1.4844, + "step": 5224 + }, + { + "epoch": 0.9489863100778714, + "grad_norm": 0.33263248205184937, + "learning_rate": 6.839856976893821e-07, + "loss": 1.6808, + "step": 5225 + }, + { + "epoch": 0.9491679342520489, + "grad_norm": 0.4623580873012543, + "learning_rate": 6.791198800570164e-07, + "loss": 1.87, + "step": 5226 + }, + { + "epoch": 0.9493495584262265, + "grad_norm": 0.34576231241226196, + "learning_rate": 6.742713134303192e-07, + "loss": 1.5047, + "step": 5227 + }, + { + "epoch": 0.9495311826004041, + "grad_norm": 1.0233416557312012, + "learning_rate": 6.694399995051725e-07, + "loss": 2.0211, + "step": 5228 + }, + { + "epoch": 0.9497128067745817, + "grad_norm": 0.3373667299747467, + "learning_rate": 6.646259399714416e-07, + "loss": 1.6392, + "step": 5229 + }, + { + "epoch": 0.9498944309487592, + "grad_norm": 0.3759530782699585, + "learning_rate": 6.598291365129294e-07, + "loss": 1.5381, + "step": 5230 + }, + { + "epoch": 0.9500760551229369, + "grad_norm": 0.5586311221122742, + "learning_rate": 6.550495908074328e-07, + "loss": 1.7847, + "step": 5231 + }, + { + "epoch": 0.9502576792971145, + "grad_norm": 0.381545752286911, + "learning_rate": 6.502873045266811e-07, + "loss": 1.7942, + "step": 5232 + }, + { + "epoch": 0.950439303471292, + "grad_norm": 0.3630439341068268, + "learning_rate": 6.45542279336403e-07, + "loss": 1.8357, + "step": 5233 + }, + { + "epoch": 0.9506209276454696, + "grad_norm": 0.3788076341152191, + "learning_rate": 6.408145168962599e-07, + "loss": 1.6646, + "step": 5234 + }, + { + "epoch": 0.9508025518196472, + "grad_norm": 0.34641286730766296, + "learning_rate": 6.361040188598788e-07, + "loss": 1.4563, + "step": 5235 + }, + { + "epoch": 0.9509841759938248, + "grad_norm": 0.5504201054573059, + "learning_rate": 6.314107868748642e-07, + "loss": 1.8587, + "step": 5236 + }, + { + "epoch": 0.9511658001680023, + "grad_norm": 0.40719959139823914, + "learning_rate": 6.26734822582764e-07, + "loss": 1.7523, + "step": 5237 + }, + { + "epoch": 0.9513474243421799, + "grad_norm": 0.6769658327102661, + "learning_rate": 6.220761276190978e-07, + "loss": 1.7834, + "step": 5238 + }, + { + "epoch": 0.9515290485163576, + "grad_norm": 0.9376266598701477, + "learning_rate": 6.17434703613351e-07, + "loss": 1.8689, + "step": 5239 + }, + { + "epoch": 0.9517106726905351, + "grad_norm": 0.7769054770469666, + "learning_rate": 6.128105521889415e-07, + "loss": 1.8039, + "step": 5240 + }, + { + "epoch": 0.9518922968647127, + "grad_norm": 0.3541914224624634, + "learning_rate": 6.082036749632703e-07, + "loss": 1.7293, + "step": 5241 + }, + { + "epoch": 0.9520739210388903, + "grad_norm": 0.3367586135864258, + "learning_rate": 6.036140735476925e-07, + "loss": 1.7651, + "step": 5242 + }, + { + "epoch": 0.9522555452130679, + "grad_norm": 0.7574877738952637, + "learning_rate": 5.990417495475076e-07, + "loss": 1.6349, + "step": 5243 + }, + { + "epoch": 0.9524371693872454, + "grad_norm": 0.4226863384246826, + "learning_rate": 5.944867045619918e-07, + "loss": 1.6217, + "step": 5244 + }, + { + "epoch": 0.952618793561423, + "grad_norm": 0.4054461419582367, + "learning_rate": 5.89948940184365e-07, + "loss": 1.8612, + "step": 5245 + }, + { + "epoch": 0.9528004177356006, + "grad_norm": 0.4267039895057678, + "learning_rate": 5.854284580017966e-07, + "loss": 1.6758, + "step": 5246 + }, + { + "epoch": 0.9529820419097782, + "grad_norm": 1.3343762159347534, + "learning_rate": 5.809252595954218e-07, + "loss": 1.7711, + "step": 5247 + }, + { + "epoch": 0.9531636660839558, + "grad_norm": 0.3359193801879883, + "learning_rate": 5.764393465403362e-07, + "loss": 1.5772, + "step": 5248 + }, + { + "epoch": 0.9533452902581333, + "grad_norm": 0.4183390140533447, + "learning_rate": 5.719707204055735e-07, + "loss": 1.5016, + "step": 5249 + }, + { + "epoch": 0.953526914432311, + "grad_norm": 1.0923821926116943, + "learning_rate": 5.675193827541281e-07, + "loss": 1.7678, + "step": 5250 + }, + { + "epoch": 0.9537085386064885, + "grad_norm": 0.33379724621772766, + "learning_rate": 5.630853351429599e-07, + "loss": 1.7376, + "step": 5251 + }, + { + "epoch": 0.9538901627806661, + "grad_norm": 0.32526740431785583, + "learning_rate": 5.586685791229562e-07, + "loss": 1.585, + "step": 5252 + }, + { + "epoch": 0.9540717869548437, + "grad_norm": 1.3424384593963623, + "learning_rate": 5.542691162389758e-07, + "loss": 1.7429, + "step": 5253 + }, + { + "epoch": 0.9542534111290213, + "grad_norm": 1.3600283861160278, + "learning_rate": 5.498869480298208e-07, + "loss": 1.7881, + "step": 5254 + }, + { + "epoch": 0.9544350353031988, + "grad_norm": 0.40846261382102966, + "learning_rate": 5.455220760282431e-07, + "loss": 1.8905, + "step": 5255 + }, + { + "epoch": 0.9546166594773764, + "grad_norm": 0.42951828241348267, + "learning_rate": 5.411745017609493e-07, + "loss": 1.7198, + "step": 5256 + }, + { + "epoch": 0.9547982836515541, + "grad_norm": 0.3955833613872528, + "learning_rate": 5.368442267486006e-07, + "loss": 1.738, + "step": 5257 + }, + { + "epoch": 0.9549799078257316, + "grad_norm": 0.3006090521812439, + "learning_rate": 5.325312525057968e-07, + "loss": 1.8208, + "step": 5258 + }, + { + "epoch": 0.9551615319999092, + "grad_norm": 0.3509175777435303, + "learning_rate": 5.282355805410865e-07, + "loss": 1.6176, + "step": 5259 + }, + { + "epoch": 0.9553431561740867, + "grad_norm": 0.3925764262676239, + "learning_rate": 5.23957212356968e-07, + "loss": 1.58, + "step": 5260 + }, + { + "epoch": 0.9555247803482644, + "grad_norm": 0.3906477093696594, + "learning_rate": 5.196961494498997e-07, + "loss": 1.8233, + "step": 5261 + }, + { + "epoch": 0.9557064045224419, + "grad_norm": 0.34929555654525757, + "learning_rate": 5.154523933102784e-07, + "loss": 1.8139, + "step": 5262 + }, + { + "epoch": 0.9558880286966195, + "grad_norm": 0.3421228528022766, + "learning_rate": 5.112259454224333e-07, + "loss": 1.5426, + "step": 5263 + }, + { + "epoch": 0.9560696528707971, + "grad_norm": 0.3160025477409363, + "learning_rate": 5.070168072646597e-07, + "loss": 1.8065, + "step": 5264 + }, + { + "epoch": 0.9562512770449747, + "grad_norm": 0.2987014949321747, + "learning_rate": 5.028249803091966e-07, + "loss": 1.5516, + "step": 5265 + }, + { + "epoch": 0.9564329012191523, + "grad_norm": 0.3478109538555145, + "learning_rate": 4.986504660222102e-07, + "loss": 1.6979, + "step": 5266 + }, + { + "epoch": 0.9566145253933298, + "grad_norm": 0.5237562656402588, + "learning_rate": 4.944932658638379e-07, + "loss": 1.6161, + "step": 5267 + }, + { + "epoch": 0.9567961495675075, + "grad_norm": 0.739628255367279, + "learning_rate": 4.903533812881389e-07, + "loss": 1.8371, + "step": 5268 + }, + { + "epoch": 0.956977773741685, + "grad_norm": 0.39181745052337646, + "learning_rate": 4.862308137431271e-07, + "loss": 1.585, + "step": 5269 + }, + { + "epoch": 0.9571593979158626, + "grad_norm": 0.34616217017173767, + "learning_rate": 4.821255646707546e-07, + "loss": 1.7386, + "step": 5270 + }, + { + "epoch": 0.9573410220900402, + "grad_norm": 0.32961681485176086, + "learning_rate": 4.780376355069172e-07, + "loss": 1.5388, + "step": 5271 + }, + { + "epoch": 0.9575226462642178, + "grad_norm": 0.4514267146587372, + "learning_rate": 4.739670276814545e-07, + "loss": 1.6928, + "step": 5272 + }, + { + "epoch": 0.9577042704383953, + "grad_norm": 0.3911161720752716, + "learning_rate": 4.6991374261814434e-07, + "loss": 1.6724, + "step": 5273 + }, + { + "epoch": 0.9578858946125729, + "grad_norm": 0.8706610798835754, + "learning_rate": 4.658777817347082e-07, + "loss": 1.7696, + "step": 5274 + }, + { + "epoch": 0.9580675187867506, + "grad_norm": 0.4559879004955292, + "learning_rate": 4.618591464428168e-07, + "loss": 1.9505, + "step": 5275 + }, + { + "epoch": 0.9582491429609281, + "grad_norm": 0.41262421011924744, + "learning_rate": 4.5785783814805696e-07, + "loss": 1.6894, + "step": 5276 + }, + { + "epoch": 0.9584307671351057, + "grad_norm": 0.6636069416999817, + "learning_rate": 4.538738582499758e-07, + "loss": 1.8096, + "step": 5277 + }, + { + "epoch": 0.9586123913092832, + "grad_norm": 0.6329394578933716, + "learning_rate": 4.4990720814205856e-07, + "loss": 1.7443, + "step": 5278 + }, + { + "epoch": 0.9587940154834609, + "grad_norm": 0.49127840995788574, + "learning_rate": 4.4595788921171776e-07, + "loss": 1.6894, + "step": 5279 + }, + { + "epoch": 0.9589756396576384, + "grad_norm": 1.0187429189682007, + "learning_rate": 4.420259028403095e-07, + "loss": 1.6658, + "step": 5280 + }, + { + "epoch": 0.959157263831816, + "grad_norm": 0.5905535817146301, + "learning_rate": 4.381112504031337e-07, + "loss": 1.617, + "step": 5281 + }, + { + "epoch": 0.9593388880059937, + "grad_norm": 0.6531595587730408, + "learning_rate": 4.3421393326941174e-07, + "loss": 1.8467, + "step": 5282 + }, + { + "epoch": 0.9595205121801712, + "grad_norm": 0.3270147144794464, + "learning_rate": 4.3033395280232534e-07, + "loss": 1.6076, + "step": 5283 + }, + { + "epoch": 0.9597021363543488, + "grad_norm": 0.39814135432243347, + "learning_rate": 4.264713103589668e-07, + "loss": 1.5338, + "step": 5284 + }, + { + "epoch": 0.9598837605285263, + "grad_norm": 0.6392297148704529, + "learning_rate": 4.226260072903776e-07, + "loss": 1.5712, + "step": 5285 + }, + { + "epoch": 0.960065384702704, + "grad_norm": 0.3346140682697296, + "learning_rate": 4.187980449415319e-07, + "loss": 1.5469, + "step": 5286 + }, + { + "epoch": 0.9602470088768815, + "grad_norm": 0.37436920404434204, + "learning_rate": 4.14987424651353e-07, + "loss": 1.6614, + "step": 5287 + }, + { + "epoch": 0.9604286330510591, + "grad_norm": 0.38734614849090576, + "learning_rate": 4.1119414775266376e-07, + "loss": 1.6148, + "step": 5288 + }, + { + "epoch": 0.9606102572252366, + "grad_norm": 0.44534391164779663, + "learning_rate": 4.0741821557225833e-07, + "loss": 1.7645, + "step": 5289 + }, + { + "epoch": 0.9607918813994143, + "grad_norm": 0.4103158116340637, + "learning_rate": 4.0365962943083593e-07, + "loss": 1.7657, + "step": 5290 + }, + { + "epoch": 0.9609735055735918, + "grad_norm": 0.4359341561794281, + "learning_rate": 3.9991839064305035e-07, + "loss": 1.5387, + "step": 5291 + }, + { + "epoch": 0.9611551297477694, + "grad_norm": 0.47850295901298523, + "learning_rate": 3.9619450051747167e-07, + "loss": 1.7688, + "step": 5292 + }, + { + "epoch": 0.9613367539219471, + "grad_norm": 0.39134302735328674, + "learning_rate": 3.924879603566134e-07, + "loss": 1.8219, + "step": 5293 + }, + { + "epoch": 0.9615183780961246, + "grad_norm": 0.5431784987449646, + "learning_rate": 3.887987714569052e-07, + "loss": 1.8052, + "step": 5294 + }, + { + "epoch": 0.9617000022703022, + "grad_norm": 0.36093005537986755, + "learning_rate": 3.851269351087261e-07, + "loss": 1.6359, + "step": 5295 + }, + { + "epoch": 0.9618816264444797, + "grad_norm": 0.28981098532676697, + "learning_rate": 3.814724525963764e-07, + "loss": 1.5975, + "step": 5296 + }, + { + "epoch": 0.9620632506186574, + "grad_norm": 1.1194252967834473, + "learning_rate": 3.778353251980837e-07, + "loss": 1.6913, + "step": 5297 + }, + { + "epoch": 0.9622448747928349, + "grad_norm": 0.4478866457939148, + "learning_rate": 3.742155541860137e-07, + "loss": 1.7485, + "step": 5298 + }, + { + "epoch": 0.9624264989670125, + "grad_norm": 0.5392176508903503, + "learning_rate": 3.706131408262592e-07, + "loss": 1.7684, + "step": 5299 + }, + { + "epoch": 0.96260812314119, + "grad_norm": 0.46591007709503174, + "learning_rate": 3.670280863788289e-07, + "loss": 1.7217, + "step": 5300 + }, + { + "epoch": 0.9627897473153677, + "grad_norm": 0.42098182439804077, + "learning_rate": 3.634603920976809e-07, + "loss": 1.5322, + "step": 5301 + }, + { + "epoch": 0.9629713714895453, + "grad_norm": 0.3784842789173126, + "learning_rate": 3.5991005923068365e-07, + "loss": 1.6713, + "step": 5302 + }, + { + "epoch": 0.9631529956637228, + "grad_norm": 0.42898741364479065, + "learning_rate": 3.5637708901964385e-07, + "loss": 1.7326, + "step": 5303 + }, + { + "epoch": 0.9633346198379005, + "grad_norm": 0.48599281907081604, + "learning_rate": 3.5286148270028965e-07, + "loss": 1.7729, + "step": 5304 + }, + { + "epoch": 0.963516244012078, + "grad_norm": 0.422601580619812, + "learning_rate": 3.493632415022763e-07, + "loss": 1.7695, + "step": 5305 + }, + { + "epoch": 0.9636978681862556, + "grad_norm": 0.40007370710372925, + "learning_rate": 3.458823666491917e-07, + "loss": 1.5415, + "step": 5306 + }, + { + "epoch": 0.9638794923604331, + "grad_norm": 0.4512110948562622, + "learning_rate": 3.424188593585398e-07, + "loss": 1.8239, + "step": 5307 + }, + { + "epoch": 0.9640611165346108, + "grad_norm": 0.33778223395347595, + "learning_rate": 3.3897272084175703e-07, + "loss": 1.7344, + "step": 5308 + }, + { + "epoch": 0.9642427407087883, + "grad_norm": 0.3809599280357361, + "learning_rate": 3.355439523041959e-07, + "loss": 1.4676, + "step": 5309 + }, + { + "epoch": 0.9644243648829659, + "grad_norm": 0.3540728986263275, + "learning_rate": 3.3213255494514705e-07, + "loss": 1.7004, + "step": 5310 + }, + { + "epoch": 0.9646059890571435, + "grad_norm": 0.5061451196670532, + "learning_rate": 3.2873852995781716e-07, + "loss": 1.8391, + "step": 5311 + }, + { + "epoch": 0.9647876132313211, + "grad_norm": 0.4414951503276825, + "learning_rate": 3.2536187852933443e-07, + "loss": 1.6462, + "step": 5312 + }, + { + "epoch": 0.9649692374054987, + "grad_norm": 0.4015233516693115, + "learning_rate": 3.2200260184075406e-07, + "loss": 1.6739, + "step": 5313 + }, + { + "epoch": 0.9651508615796762, + "grad_norm": 0.40317121148109436, + "learning_rate": 3.1866070106705835e-07, + "loss": 1.7606, + "step": 5314 + }, + { + "epoch": 0.9653324857538539, + "grad_norm": 0.3804510831832886, + "learning_rate": 3.153361773771346e-07, + "loss": 1.6512, + "step": 5315 + }, + { + "epoch": 0.9655141099280314, + "grad_norm": 0.33188965916633606, + "learning_rate": 3.120290319338137e-07, + "loss": 1.7194, + "step": 5316 + }, + { + "epoch": 0.965695734102209, + "grad_norm": 0.3496490716934204, + "learning_rate": 3.087392658938315e-07, + "loss": 1.7686, + "step": 5317 + }, + { + "epoch": 0.9658773582763865, + "grad_norm": 0.48698413372039795, + "learning_rate": 3.05466880407862e-07, + "loss": 1.6611, + "step": 5318 + }, + { + "epoch": 0.9660589824505642, + "grad_norm": 0.3986532390117645, + "learning_rate": 3.02211876620484e-07, + "loss": 1.7229, + "step": 5319 + }, + { + "epoch": 0.9662406066247418, + "grad_norm": 0.49678850173950195, + "learning_rate": 2.9897425567020356e-07, + "loss": 1.6835, + "step": 5320 + }, + { + "epoch": 0.9664222307989193, + "grad_norm": 0.44303980469703674, + "learning_rate": 2.957540186894481e-07, + "loss": 1.8384, + "step": 5321 + }, + { + "epoch": 0.9666038549730969, + "grad_norm": 0.429462194442749, + "learning_rate": 2.925511668045611e-07, + "loss": 1.7039, + "step": 5322 + }, + { + "epoch": 0.9667854791472745, + "grad_norm": 0.37803712487220764, + "learning_rate": 2.8936570113580196e-07, + "loss": 1.6039, + "step": 5323 + }, + { + "epoch": 0.9669671033214521, + "grad_norm": 0.37328898906707764, + "learning_rate": 2.8619762279736284e-07, + "loss": 1.8029, + "step": 5324 + }, + { + "epoch": 0.9671487274956296, + "grad_norm": 0.5981305837631226, + "learning_rate": 2.8304693289734064e-07, + "loss": 1.5992, + "step": 5325 + }, + { + "epoch": 0.9673303516698073, + "grad_norm": 0.3784109055995941, + "learning_rate": 2.799136325377538e-07, + "loss": 1.6323, + "step": 5326 + }, + { + "epoch": 0.9675119758439849, + "grad_norm": 0.47535955905914307, + "learning_rate": 2.767977228145424e-07, + "loss": 1.7474, + "step": 5327 + }, + { + "epoch": 0.9676936000181624, + "grad_norm": 0.4182107448577881, + "learning_rate": 2.7369920481755683e-07, + "loss": 1.7422, + "step": 5328 + }, + { + "epoch": 0.96787522419234, + "grad_norm": 0.4474884569644928, + "learning_rate": 2.7061807963056906e-07, + "loss": 1.925, + "step": 5329 + }, + { + "epoch": 0.9680568483665176, + "grad_norm": 0.45505595207214355, + "learning_rate": 2.6755434833127255e-07, + "loss": 1.8708, + "step": 5330 + }, + { + "epoch": 0.9682384725406952, + "grad_norm": 0.39341872930526733, + "learning_rate": 2.6450801199126573e-07, + "loss": 1.6894, + "step": 5331 + }, + { + "epoch": 0.9684200967148727, + "grad_norm": 0.3379420042037964, + "learning_rate": 2.6147907167606845e-07, + "loss": 1.7139, + "step": 5332 + }, + { + "epoch": 0.9686017208890503, + "grad_norm": 0.36236733198165894, + "learning_rate": 2.5846752844511104e-07, + "loss": 1.538, + "step": 5333 + }, + { + "epoch": 0.9687833450632279, + "grad_norm": 0.3824905753135681, + "learning_rate": 2.554733833517564e-07, + "loss": 1.805, + "step": 5334 + }, + { + "epoch": 0.9689649692374055, + "grad_norm": 0.3307805359363556, + "learning_rate": 2.524966374432558e-07, + "loss": 1.5972, + "step": 5335 + }, + { + "epoch": 0.969146593411583, + "grad_norm": 0.3232310116291046, + "learning_rate": 2.4953729176079855e-07, + "loss": 1.66, + "step": 5336 + }, + { + "epoch": 0.9693282175857607, + "grad_norm": 0.4221342206001282, + "learning_rate": 2.465953473394733e-07, + "loss": 1.8122, + "step": 5337 + }, + { + "epoch": 0.9695098417599383, + "grad_norm": 0.410552442073822, + "learning_rate": 2.436708052082848e-07, + "loss": 1.8367, + "step": 5338 + }, + { + "epoch": 0.9696914659341158, + "grad_norm": 0.3768724203109741, + "learning_rate": 2.407636663901591e-07, + "loss": 1.6602, + "step": 5339 + }, + { + "epoch": 0.9698730901082934, + "grad_norm": 0.579439640045166, + "learning_rate": 2.378739319019163e-07, + "loss": 1.7252, + "step": 5340 + }, + { + "epoch": 0.970054714282471, + "grad_norm": 0.27594906091690063, + "learning_rate": 2.3500160275430893e-07, + "loss": 1.7831, + "step": 5341 + }, + { + "epoch": 0.9702363384566486, + "grad_norm": 0.34563732147216797, + "learning_rate": 2.3214667995199446e-07, + "loss": 1.7177, + "step": 5342 + }, + { + "epoch": 0.9704179626308261, + "grad_norm": 0.4464113414287567, + "learning_rate": 2.2930916449354634e-07, + "loss": 1.7853, + "step": 5343 + }, + { + "epoch": 0.9705995868050038, + "grad_norm": 0.3537169098854065, + "learning_rate": 2.264890573714318e-07, + "loss": 1.8022, + "step": 5344 + }, + { + "epoch": 0.9707812109791814, + "grad_norm": 0.3811842203140259, + "learning_rate": 2.2368635957205618e-07, + "loss": 1.7255, + "step": 5345 + }, + { + "epoch": 0.9709628351533589, + "grad_norm": 0.37563809752464294, + "learning_rate": 2.2090107207570764e-07, + "loss": 1.5578, + "step": 5346 + }, + { + "epoch": 0.9711444593275365, + "grad_norm": 0.38791391253471375, + "learning_rate": 2.1813319585660686e-07, + "loss": 1.5525, + "step": 5347 + }, + { + "epoch": 0.9713260835017141, + "grad_norm": 0.36624258756637573, + "learning_rate": 2.1538273188287938e-07, + "loss": 1.7191, + "step": 5348 + }, + { + "epoch": 0.9715077076758917, + "grad_norm": 0.41203778982162476, + "learning_rate": 2.1264968111655014e-07, + "loss": 1.607, + "step": 5349 + }, + { + "epoch": 0.9716893318500692, + "grad_norm": 0.39490246772766113, + "learning_rate": 2.0993404451356003e-07, + "loss": 1.73, + "step": 5350 + }, + { + "epoch": 0.9718709560242468, + "grad_norm": 0.3751300275325775, + "learning_rate": 2.0723582302376588e-07, + "loss": 1.877, + "step": 5351 + }, + { + "epoch": 0.9720525801984244, + "grad_norm": 0.44275274872779846, + "learning_rate": 2.0455501759092388e-07, + "loss": 1.7054, + "step": 5352 + }, + { + "epoch": 0.972234204372602, + "grad_norm": 0.4012008011341095, + "learning_rate": 2.0189162915270066e-07, + "loss": 1.6964, + "step": 5353 + }, + { + "epoch": 0.9724158285467795, + "grad_norm": 0.3209567964076996, + "learning_rate": 1.9924565864067323e-07, + "loss": 1.5983, + "step": 5354 + }, + { + "epoch": 0.9725974527209572, + "grad_norm": 0.4415270984172821, + "learning_rate": 1.9661710698032354e-07, + "loss": 1.7505, + "step": 5355 + }, + { + "epoch": 0.9727790768951348, + "grad_norm": 0.40494048595428467, + "learning_rate": 1.940059750910439e-07, + "loss": 1.6654, + "step": 5356 + }, + { + "epoch": 0.9729607010693123, + "grad_norm": 0.42396819591522217, + "learning_rate": 1.9141226388613153e-07, + "loss": 1.5662, + "step": 5357 + }, + { + "epoch": 0.9731423252434899, + "grad_norm": 0.48366713523864746, + "learning_rate": 1.888359742727941e-07, + "loss": 1.8783, + "step": 5358 + }, + { + "epoch": 0.9733239494176675, + "grad_norm": 0.3335143029689789, + "learning_rate": 1.8627710715213298e-07, + "loss": 1.6444, + "step": 5359 + }, + { + "epoch": 0.9735055735918451, + "grad_norm": 0.4022449553012848, + "learning_rate": 1.8373566341917114e-07, + "loss": 1.5922, + "step": 5360 + }, + { + "epoch": 0.9736871977660226, + "grad_norm": 0.3165479302406311, + "learning_rate": 1.812116439628364e-07, + "loss": 1.8745, + "step": 5361 + }, + { + "epoch": 0.9738688219402002, + "grad_norm": 0.35455435514450073, + "learning_rate": 1.7870504966595591e-07, + "loss": 1.8057, + "step": 5362 + }, + { + "epoch": 0.9740504461143779, + "grad_norm": 0.36724480986595154, + "learning_rate": 1.7621588140525612e-07, + "loss": 1.5958, + "step": 5363 + }, + { + "epoch": 0.9742320702885554, + "grad_norm": 1.206443190574646, + "learning_rate": 1.7374414005137395e-07, + "loss": 1.8964, + "step": 5364 + }, + { + "epoch": 0.974413694462733, + "grad_norm": 0.44103050231933594, + "learning_rate": 1.7128982646886227e-07, + "loss": 1.7079, + "step": 5365 + }, + { + "epoch": 0.9745953186369106, + "grad_norm": 0.3833788335323334, + "learning_rate": 1.688529415161677e-07, + "loss": 1.8482, + "step": 5366 + }, + { + "epoch": 0.9747769428110882, + "grad_norm": 0.4549916088581085, + "learning_rate": 1.664334860456307e-07, + "loss": 1.8483, + "step": 5367 + }, + { + "epoch": 0.9749585669852657, + "grad_norm": 0.3155622184276581, + "learning_rate": 1.6403146090351874e-07, + "loss": 1.5987, + "step": 5368 + }, + { + "epoch": 0.9751401911594433, + "grad_norm": 0.5604477524757385, + "learning_rate": 1.61646866929982e-07, + "loss": 1.6689, + "step": 5369 + }, + { + "epoch": 0.975321815333621, + "grad_norm": 0.3492743968963623, + "learning_rate": 1.592797049590866e-07, + "loss": 1.8243, + "step": 5370 + }, + { + "epoch": 0.9755034395077985, + "grad_norm": 0.3376096189022064, + "learning_rate": 1.569299758187981e-07, + "loss": 1.6022, + "step": 5371 + }, + { + "epoch": 0.975685063681976, + "grad_norm": 0.6834940314292908, + "learning_rate": 1.5459768033097566e-07, + "loss": 1.9141, + "step": 5372 + }, + { + "epoch": 0.9758666878561536, + "grad_norm": 0.5116184949874878, + "learning_rate": 1.52282819311389e-07, + "loss": 1.7606, + "step": 5373 + }, + { + "epoch": 0.9760483120303313, + "grad_norm": 0.36430150270462036, + "learning_rate": 1.499853935697182e-07, + "loss": 1.6179, + "step": 5374 + }, + { + "epoch": 0.9762299362045088, + "grad_norm": 0.6272649765014648, + "learning_rate": 1.4770540390953158e-07, + "loss": 1.7949, + "step": 5375 + }, + { + "epoch": 0.9764115603786864, + "grad_norm": 0.2522508203983307, + "learning_rate": 1.4544285112830236e-07, + "loss": 1.802, + "step": 5376 + }, + { + "epoch": 0.976593184552864, + "grad_norm": 0.3981025516986847, + "learning_rate": 1.431977360173975e-07, + "loss": 1.7186, + "step": 5377 + }, + { + "epoch": 0.9767748087270416, + "grad_norm": 0.391965389251709, + "learning_rate": 1.409700593621055e-07, + "loss": 1.5596, + "step": 5378 + }, + { + "epoch": 0.9769564329012191, + "grad_norm": 0.4873986840248108, + "learning_rate": 1.3875982194159752e-07, + "loss": 1.7072, + "step": 5379 + }, + { + "epoch": 0.9771380570753967, + "grad_norm": 0.550014853477478, + "learning_rate": 1.3656702452894964e-07, + "loss": 1.6416, + "step": 5380 + }, + { + "epoch": 0.9773196812495744, + "grad_norm": 0.4228378236293793, + "learning_rate": 1.3439166789113722e-07, + "loss": 1.6556, + "step": 5381 + }, + { + "epoch": 0.9775013054237519, + "grad_norm": 0.40300053358078003, + "learning_rate": 1.3223375278904048e-07, + "loss": 1.6485, + "step": 5382 + }, + { + "epoch": 0.9776829295979295, + "grad_norm": 0.36735302209854126, + "learning_rate": 1.3009327997742793e-07, + "loss": 1.5726, + "step": 5383 + }, + { + "epoch": 0.977864553772107, + "grad_norm": 0.37163373827934265, + "learning_rate": 1.27970250204984e-07, + "loss": 1.7623, + "step": 5384 + }, + { + "epoch": 0.9780461779462847, + "grad_norm": 0.4317956566810608, + "learning_rate": 1.2586466421427577e-07, + "loss": 1.6192, + "step": 5385 + }, + { + "epoch": 0.9782278021204622, + "grad_norm": 0.4703550338745117, + "learning_rate": 1.2377652274178087e-07, + "loss": 1.8407, + "step": 5386 + }, + { + "epoch": 0.9784094262946398, + "grad_norm": 0.3679129481315613, + "learning_rate": 1.21705826517865e-07, + "loss": 1.5142, + "step": 5387 + }, + { + "epoch": 0.9785910504688174, + "grad_norm": 0.4597603976726532, + "learning_rate": 1.1965257626680438e-07, + "loss": 1.73, + "step": 5388 + }, + { + "epoch": 0.978772674642995, + "grad_norm": 0.3706151247024536, + "learning_rate": 1.1761677270675787e-07, + "loss": 1.7071, + "step": 5389 + }, + { + "epoch": 0.9789542988171726, + "grad_norm": 0.5278778672218323, + "learning_rate": 1.1559841654979475e-07, + "loss": 1.7472, + "step": 5390 + }, + { + "epoch": 0.9791359229913501, + "grad_norm": 0.3713902235031128, + "learning_rate": 1.1359750850187256e-07, + "loss": 1.7996, + "step": 5391 + }, + { + "epoch": 0.9793175471655278, + "grad_norm": 0.31585484743118286, + "learning_rate": 1.1161404926285923e-07, + "loss": 1.6115, + "step": 5392 + }, + { + "epoch": 0.9794991713397053, + "grad_norm": 0.2802826166152954, + "learning_rate": 1.0964803952650537e-07, + "loss": 1.7743, + "step": 5393 + }, + { + "epoch": 0.9796807955138829, + "grad_norm": 0.5533532500267029, + "learning_rate": 1.0769947998046093e-07, + "loss": 1.7203, + "step": 5394 + }, + { + "epoch": 0.9798624196880604, + "grad_norm": 0.47953319549560547, + "learning_rate": 1.057683713062807e-07, + "loss": 1.6716, + "step": 5395 + }, + { + "epoch": 0.9800440438622381, + "grad_norm": 0.6124926209449768, + "learning_rate": 1.0385471417941329e-07, + "loss": 1.8817, + "step": 5396 + }, + { + "epoch": 0.9802256680364156, + "grad_norm": 0.48378533124923706, + "learning_rate": 1.0195850926918993e-07, + "loss": 1.7709, + "step": 5397 + }, + { + "epoch": 0.9804072922105932, + "grad_norm": 0.4406229555606842, + "learning_rate": 1.0007975723885232e-07, + "loss": 1.615, + "step": 5398 + }, + { + "epoch": 0.9805889163847709, + "grad_norm": 1.0051679611206055, + "learning_rate": 9.821845874553592e-08, + "loss": 1.6812, + "step": 5399 + }, + { + "epoch": 0.9807705405589484, + "grad_norm": 0.3825260102748871, + "learning_rate": 9.637461444026441e-08, + "loss": 1.8048, + "step": 5400 + }, + { + "epoch": 0.980952164733126, + "grad_norm": 0.47166478633880615, + "learning_rate": 9.454822496796634e-08, + "loss": 1.8006, + "step": 5401 + }, + { + "epoch": 0.9811337889073035, + "grad_norm": 0.7538245916366577, + "learning_rate": 9.273929096745848e-08, + "loss": 1.8029, + "step": 5402 + }, + { + "epoch": 0.9813154130814812, + "grad_norm": 0.736508309841156, + "learning_rate": 9.094781307144584e-08, + "loss": 1.6879, + "step": 5403 + }, + { + "epoch": 0.9814970372556587, + "grad_norm": 0.5864923596382141, + "learning_rate": 8.91737919065383e-08, + "loss": 1.9378, + "step": 5404 + }, + { + "epoch": 0.9816786614298363, + "grad_norm": 0.36118578910827637, + "learning_rate": 8.741722809324504e-08, + "loss": 1.7305, + "step": 5405 + }, + { + "epoch": 0.981860285604014, + "grad_norm": 0.7139500975608826, + "learning_rate": 8.56781222459524e-08, + "loss": 1.7001, + "step": 5406 + }, + { + "epoch": 0.9820419097781915, + "grad_norm": 0.4221162796020508, + "learning_rate": 8.395647497294601e-08, + "loss": 1.6938, + "step": 5407 + }, + { + "epoch": 0.982223533952369, + "grad_norm": 0.3668670356273651, + "learning_rate": 8.22522868764164e-08, + "loss": 1.7517, + "step": 5408 + }, + { + "epoch": 0.9824051581265466, + "grad_norm": 0.3448127210140228, + "learning_rate": 8.056555855243675e-08, + "loss": 1.9128, + "step": 5409 + }, + { + "epoch": 0.9825867823007243, + "grad_norm": 0.40857744216918945, + "learning_rate": 7.889629059097403e-08, + "loss": 1.686, + "step": 5410 + }, + { + "epoch": 0.9827684064749018, + "grad_norm": 0.41402795910835266, + "learning_rate": 7.724448357588898e-08, + "loss": 1.745, + "step": 5411 + }, + { + "epoch": 0.9829500306490794, + "grad_norm": 0.7114197611808777, + "learning_rate": 7.561013808493056e-08, + "loss": 1.7672, + "step": 5412 + }, + { + "epoch": 0.9831316548232569, + "grad_norm": 0.4087766110897064, + "learning_rate": 7.399325468975815e-08, + "loss": 1.7939, + "step": 5413 + }, + { + "epoch": 0.9833132789974346, + "grad_norm": 0.35377955436706543, + "learning_rate": 7.239383395590271e-08, + "loss": 1.5881, + "step": 5414 + }, + { + "epoch": 0.9834949031716121, + "grad_norm": 0.33516407012939453, + "learning_rate": 7.081187644278897e-08, + "loss": 1.7207, + "step": 5415 + }, + { + "epoch": 0.9836765273457897, + "grad_norm": 0.4488436281681061, + "learning_rate": 6.924738270374653e-08, + "loss": 1.5514, + "step": 5416 + }, + { + "epoch": 0.9838581515199674, + "grad_norm": 0.32274365425109863, + "learning_rate": 6.770035328599322e-08, + "loss": 1.7422, + "step": 5417 + }, + { + "epoch": 0.9840397756941449, + "grad_norm": 0.3501582741737366, + "learning_rate": 6.617078873062954e-08, + "loss": 1.5331, + "step": 5418 + }, + { + "epoch": 0.9842213998683225, + "grad_norm": 0.35481882095336914, + "learning_rate": 6.465868957264976e-08, + "loss": 1.7979, + "step": 5419 + }, + { + "epoch": 0.9844030240425, + "grad_norm": 0.48570749163627625, + "learning_rate": 6.316405634094746e-08, + "loss": 1.707, + "step": 5420 + }, + { + "epoch": 0.9845846482166777, + "grad_norm": 0.4307800531387329, + "learning_rate": 6.168688955830448e-08, + "loss": 1.7014, + "step": 5421 + }, + { + "epoch": 0.9847662723908552, + "grad_norm": 0.6614157557487488, + "learning_rate": 6.022718974137975e-08, + "loss": 1.699, + "step": 5422 + }, + { + "epoch": 0.9849478965650328, + "grad_norm": 0.43090325593948364, + "learning_rate": 5.878495740074263e-08, + "loss": 1.5258, + "step": 5423 + }, + { + "epoch": 0.9851295207392103, + "grad_norm": 0.4273134469985962, + "learning_rate": 5.736019304084517e-08, + "loss": 1.7479, + "step": 5424 + }, + { + "epoch": 0.985311144913388, + "grad_norm": 0.4596972167491913, + "learning_rate": 5.59528971600165e-08, + "loss": 1.6939, + "step": 5425 + }, + { + "epoch": 0.9854927690875656, + "grad_norm": 0.5013171434402466, + "learning_rate": 5.456307025050178e-08, + "loss": 1.7559, + "step": 5426 + }, + { + "epoch": 0.9856743932617431, + "grad_norm": 0.3634844720363617, + "learning_rate": 5.3190712798417694e-08, + "loss": 1.5983, + "step": 5427 + }, + { + "epoch": 0.9858560174359208, + "grad_norm": 0.6565014719963074, + "learning_rate": 5.183582528376918e-08, + "loss": 1.8296, + "step": 5428 + }, + { + "epoch": 0.9860376416100983, + "grad_norm": 0.38392844796180725, + "learning_rate": 5.049840818046048e-08, + "loss": 1.6137, + "step": 5429 + }, + { + "epoch": 0.9862192657842759, + "grad_norm": 0.3735438883304596, + "learning_rate": 4.917846195628406e-08, + "loss": 1.7125, + "step": 5430 + }, + { + "epoch": 0.9864008899584534, + "grad_norm": 0.4851929545402527, + "learning_rate": 4.7875987072915075e-08, + "loss": 1.7207, + "step": 5431 + }, + { + "epoch": 0.9865825141326311, + "grad_norm": 0.6449366807937622, + "learning_rate": 4.659098398592243e-08, + "loss": 1.7002, + "step": 5432 + }, + { + "epoch": 0.9867641383068086, + "grad_norm": 0.46304985880851746, + "learning_rate": 4.532345314475772e-08, + "loss": 1.5502, + "step": 5433 + }, + { + "epoch": 0.9869457624809862, + "grad_norm": 0.4238336682319641, + "learning_rate": 4.4073394992771855e-08, + "loss": 1.7748, + "step": 5434 + }, + { + "epoch": 0.9871273866551638, + "grad_norm": 0.4924546480178833, + "learning_rate": 4.284080996720397e-08, + "loss": 1.8308, + "step": 5435 + }, + { + "epoch": 0.9873090108293414, + "grad_norm": 0.3247637450695038, + "learning_rate": 4.1625698499164756e-08, + "loss": 1.6949, + "step": 5436 + }, + { + "epoch": 0.987490635003519, + "grad_norm": 0.3514692187309265, + "learning_rate": 4.042806101366981e-08, + "loss": 1.6973, + "step": 5437 + }, + { + "epoch": 0.9876722591776965, + "grad_norm": 0.37926194071769714, + "learning_rate": 3.924789792961736e-08, + "loss": 1.8139, + "step": 5438 + }, + { + "epoch": 0.9878538833518742, + "grad_norm": 0.6178009510040283, + "learning_rate": 3.808520965979945e-08, + "loss": 1.6866, + "step": 5439 + }, + { + "epoch": 0.9880355075260517, + "grad_norm": 0.35131189227104187, + "learning_rate": 3.6939996610879656e-08, + "loss": 1.8102, + "step": 5440 + }, + { + "epoch": 0.9882171317002293, + "grad_norm": 0.3739866316318512, + "learning_rate": 3.581225918342646e-08, + "loss": 1.8421, + "step": 5441 + }, + { + "epoch": 0.9883987558744068, + "grad_norm": 0.30177053809165955, + "learning_rate": 3.4701997771890985e-08, + "loss": 1.5827, + "step": 5442 + }, + { + "epoch": 0.9885803800485845, + "grad_norm": 0.540540874004364, + "learning_rate": 3.36092127646126e-08, + "loss": 1.6443, + "step": 5443 + }, + { + "epoch": 0.9887620042227621, + "grad_norm": 0.32871073484420776, + "learning_rate": 3.253390454380778e-08, + "loss": 1.7645, + "step": 5444 + }, + { + "epoch": 0.9889436283969396, + "grad_norm": 0.3875550925731659, + "learning_rate": 3.147607348559234e-08, + "loss": 1.752, + "step": 5445 + }, + { + "epoch": 0.9891252525711172, + "grad_norm": 0.4069419801235199, + "learning_rate": 3.0435719959959196e-08, + "loss": 1.7647, + "step": 5446 + }, + { + "epoch": 0.9893068767452948, + "grad_norm": 0.31407126784324646, + "learning_rate": 2.9412844330806157e-08, + "loss": 1.9231, + "step": 5447 + }, + { + "epoch": 0.9894885009194724, + "grad_norm": 0.38621148467063904, + "learning_rate": 2.840744695589148e-08, + "loss": 1.7243, + "step": 5448 + }, + { + "epoch": 0.9896701250936499, + "grad_norm": 0.6861346960067749, + "learning_rate": 2.741952818688387e-08, + "loss": 1.6707, + "step": 5449 + }, + { + "epoch": 0.9898517492678276, + "grad_norm": 0.5720432996749878, + "learning_rate": 2.6449088369329135e-08, + "loss": 1.852, + "step": 5450 + }, + { + "epoch": 0.9900333734420051, + "grad_norm": 0.3936172127723694, + "learning_rate": 2.5496127842644658e-08, + "loss": 1.8368, + "step": 5451 + }, + { + "epoch": 0.9902149976161827, + "grad_norm": 0.3254601061344147, + "learning_rate": 2.45606469401638e-08, + "loss": 1.6476, + "step": 5452 + }, + { + "epoch": 0.9903966217903603, + "grad_norm": 0.5839923620223999, + "learning_rate": 2.3642645989085942e-08, + "loss": 1.7086, + "step": 5453 + }, + { + "epoch": 0.9905782459645379, + "grad_norm": 0.42509493231773376, + "learning_rate": 2.2742125310498687e-08, + "loss": 1.8392, + "step": 5454 + }, + { + "epoch": 0.9907598701387155, + "grad_norm": 0.40520089864730835, + "learning_rate": 2.1859085219377852e-08, + "loss": 1.8809, + "step": 5455 + }, + { + "epoch": 0.990941494312893, + "grad_norm": 0.4090883731842041, + "learning_rate": 2.0993526024587484e-08, + "loss": 1.8698, + "step": 5456 + }, + { + "epoch": 0.9911231184870706, + "grad_norm": 1.3702970743179321, + "learning_rate": 2.0145448028874304e-08, + "loss": 1.8427, + "step": 5457 + }, + { + "epoch": 0.9913047426612482, + "grad_norm": 0.39946943521499634, + "learning_rate": 1.9314851528867695e-08, + "loss": 1.755, + "step": 5458 + }, + { + "epoch": 0.9914863668354258, + "grad_norm": 0.9972736239433289, + "learning_rate": 1.850173681509082e-08, + "loss": 1.7735, + "step": 5459 + }, + { + "epoch": 0.9916679910096033, + "grad_norm": 0.9865061044692993, + "learning_rate": 1.770610417194396e-08, + "loss": 1.8286, + "step": 5460 + }, + { + "epoch": 0.991849615183781, + "grad_norm": 0.3783729672431946, + "learning_rate": 1.692795387772117e-08, + "loss": 1.5995, + "step": 5461 + }, + { + "epoch": 0.9920312393579586, + "grad_norm": 0.4268130362033844, + "learning_rate": 1.6167286204593622e-08, + "loss": 1.6805, + "step": 5462 + }, + { + "epoch": 0.9922128635321361, + "grad_norm": 0.548910915851593, + "learning_rate": 1.5424101418620718e-08, + "loss": 1.7336, + "step": 5463 + }, + { + "epoch": 0.9923944877063137, + "grad_norm": 0.3645090162754059, + "learning_rate": 1.4698399779744521e-08, + "loss": 1.8324, + "step": 5464 + }, + { + "epoch": 0.9925761118804913, + "grad_norm": 0.361579954624176, + "learning_rate": 1.3990181541800872e-08, + "loss": 1.723, + "step": 5465 + }, + { + "epoch": 0.9927577360546689, + "grad_norm": 0.44438889622688293, + "learning_rate": 1.3299446952497185e-08, + "loss": 1.6442, + "step": 5466 + }, + { + "epoch": 0.9929393602288464, + "grad_norm": 0.4585612416267395, + "learning_rate": 1.2626196253434641e-08, + "loss": 1.7724, + "step": 5467 + }, + { + "epoch": 0.993120984403024, + "grad_norm": 0.6795984506607056, + "learning_rate": 1.1970429680097095e-08, + "loss": 1.8811, + "step": 5468 + }, + { + "epoch": 0.9933026085772017, + "grad_norm": 0.5341749787330627, + "learning_rate": 1.1332147461851073e-08, + "loss": 1.7002, + "step": 5469 + }, + { + "epoch": 0.9934842327513792, + "grad_norm": 0.31943997740745544, + "learning_rate": 1.0711349821951322e-08, + "loss": 1.77, + "step": 5470 + }, + { + "epoch": 0.9936658569255568, + "grad_norm": 1.7289103269577026, + "learning_rate": 1.0108036977535262e-08, + "loss": 1.9593, + "step": 5471 + }, + { + "epoch": 0.9938474810997344, + "grad_norm": 0.37584108114242554, + "learning_rate": 9.522209139617433e-09, + "loss": 1.8445, + "step": 5472 + }, + { + "epoch": 0.994029105273912, + "grad_norm": 0.37804168462753296, + "learning_rate": 8.953866513111697e-09, + "loss": 1.7776, + "step": 5473 + }, + { + "epoch": 0.9942107294480895, + "grad_norm": 0.4176671504974365, + "learning_rate": 8.403009296803488e-09, + "loss": 1.7212, + "step": 5474 + }, + { + "epoch": 0.9943923536222671, + "grad_norm": 0.3946729004383087, + "learning_rate": 7.869637683372011e-09, + "loss": 1.9811, + "step": 5475 + }, + { + "epoch": 0.9945739777964447, + "grad_norm": 0.7879343628883362, + "learning_rate": 7.353751859368041e-09, + "loss": 1.721, + "step": 5476 + }, + { + "epoch": 0.9947556019706223, + "grad_norm": 0.3320007622241974, + "learning_rate": 6.855352005230575e-09, + "loss": 1.6775, + "step": 5477 + }, + { + "epoch": 0.9949372261447998, + "grad_norm": 0.5085834860801697, + "learning_rate": 6.374438295297935e-09, + "loss": 1.9954, + "step": 5478 + }, + { + "epoch": 0.9951188503189775, + "grad_norm": 0.4545444846153259, + "learning_rate": 5.9110108977689095e-09, + "loss": 1.7341, + "step": 5479 + }, + { + "epoch": 0.9953004744931551, + "grad_norm": 0.5225945711135864, + "learning_rate": 5.46506997474161e-09, + "loss": 1.6087, + "step": 5480 + }, + { + "epoch": 0.9954820986673326, + "grad_norm": 0.34234505891799927, + "learning_rate": 5.03661568219127e-09, + "loss": 1.755, + "step": 5481 + }, + { + "epoch": 0.9956637228415102, + "grad_norm": 0.7375771403312683, + "learning_rate": 4.625648169981345e-09, + "loss": 1.8042, + "step": 5482 + }, + { + "epoch": 0.9958453470156878, + "grad_norm": 0.3588846027851105, + "learning_rate": 4.2321675818579596e-09, + "loss": 1.7312, + "step": 5483 + }, + { + "epoch": 0.9960269711898654, + "grad_norm": 0.5759877562522888, + "learning_rate": 3.85617405543881e-09, + "loss": 1.8581, + "step": 5484 + }, + { + "epoch": 0.9962085953640429, + "grad_norm": 0.32680460810661316, + "learning_rate": 3.497667722246467e-09, + "loss": 1.6499, + "step": 5485 + }, + { + "epoch": 0.9963902195382205, + "grad_norm": 0.5176241993904114, + "learning_rate": 3.1566487076695184e-09, + "loss": 1.734, + "step": 5486 + }, + { + "epoch": 0.9965718437123982, + "grad_norm": 0.45304831862449646, + "learning_rate": 2.833117130990326e-09, + "loss": 1.8594, + "step": 5487 + }, + { + "epoch": 0.9967534678865757, + "grad_norm": 0.8320753574371338, + "learning_rate": 2.5270731053683718e-09, + "loss": 1.7396, + "step": 5488 + }, + { + "epoch": 0.9969350920607533, + "grad_norm": 0.4414713680744171, + "learning_rate": 2.2385167378513593e-09, + "loss": 1.6766, + "step": 5489 + }, + { + "epoch": 0.9971167162349309, + "grad_norm": 0.568382978439331, + "learning_rate": 1.967448129364113e-09, + "loss": 1.4823, + "step": 5490 + }, + { + "epoch": 0.9972983404091085, + "grad_norm": 1.5656307935714722, + "learning_rate": 1.7138673747196799e-09, + "loss": 1.7272, + "step": 5491 + }, + { + "epoch": 0.997479964583286, + "grad_norm": 0.3923093378543854, + "learning_rate": 1.4777745626193274e-09, + "loss": 1.8415, + "step": 5492 + }, + { + "epoch": 0.9976615887574636, + "grad_norm": 0.4043920636177063, + "learning_rate": 1.2591697756358933e-09, + "loss": 1.6422, + "step": 5493 + }, + { + "epoch": 0.9978432129316412, + "grad_norm": 0.3446798026561737, + "learning_rate": 1.0580530902248864e-09, + "loss": 1.7863, + "step": 5494 + }, + { + "epoch": 0.9980248371058188, + "grad_norm": 0.953068196773529, + "learning_rate": 8.744245767466908e-10, + "loss": 1.5797, + "step": 5495 + }, + { + "epoch": 0.9982064612799963, + "grad_norm": 0.30453014373779297, + "learning_rate": 7.08284299411055e-10, + "loss": 1.6762, + "step": 5496 + }, + { + "epoch": 0.9983880854541739, + "grad_norm": 0.3815153241157532, + "learning_rate": 5.596323163437056e-10, + "loss": 1.6625, + "step": 5497 + }, + { + "epoch": 0.9985697096283516, + "grad_norm": 0.32265108823776245, + "learning_rate": 4.284686795363868e-10, + "loss": 1.5824, + "step": 5498 + }, + { + "epoch": 0.9987513338025291, + "grad_norm": 0.5218546986579895, + "learning_rate": 3.1479343485796286e-10, + "loss": 1.7781, + "step": 5499 + }, + { + "epoch": 0.9989329579767067, + "grad_norm": 0.34498491883277893, + "learning_rate": 2.1860662207662253e-10, + "loss": 1.7375, + "step": 5500 + }, + { + "epoch": 0.9991145821508843, + "grad_norm": 0.44958606362342834, + "learning_rate": 1.3990827483212344e-10, + "loss": 1.5089, + "step": 5501 + }, + { + "epoch": 0.9992962063250619, + "grad_norm": 0.42425668239593506, + "learning_rate": 7.869842064689436e-11, + "loss": 1.587, + "step": 5502 + }, + { + "epoch": 0.9994778304992394, + "grad_norm": 0.43472081422805786, + "learning_rate": 3.497708094268859e-11, + "loss": 1.6331, + "step": 5503 + }, + { + "epoch": 0.999659454673417, + "grad_norm": 0.3096645474433899, + "learning_rate": 8.744271001726034e-12, + "loss": 1.755, + "step": 5504 + }, + { + "epoch": 0.9998410788475947, + "grad_norm": 0.4398561716079712, + "learning_rate": 0.0, + "loss": 1.8241, + "step": 5505 + }, + { + "epoch": 0.9998410788475947, + "step": 5505, + "total_flos": 4.169463150439039e+18, + "train_loss": 1.7876772542090766, + "train_runtime": 32508.2013, + "train_samples_per_second": 10.84, + "train_steps_per_second": 0.169 + } + ], + "logging_steps": 1, + "max_steps": 5505, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.169463150439039e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}