{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5698614524343769, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.123268155429712e-05, "grad_norm": 3.241770029067993, "learning_rate": 1e-05, "loss": 0.4814, "step": 1 }, { "epoch": 0.00014246536310859423, "grad_norm": 2.822866201400757, "learning_rate": 2e-05, "loss": 0.5653, "step": 2 }, { "epoch": 0.00021369804466289135, "grad_norm": 3.4582061767578125, "learning_rate": 1.999999922855149e-05, "loss": 0.3878, "step": 3 }, { "epoch": 0.00028493072621718846, "grad_norm": 2.7456774711608887, "learning_rate": 1.9999996914206083e-05, "loss": 0.5697, "step": 4 }, { "epoch": 0.0003561634077714856, "grad_norm": 2.401881694793701, "learning_rate": 1.9999993056964127e-05, "loss": 0.409, "step": 5 }, { "epoch": 0.0004273960893257827, "grad_norm": 7.982686996459961, "learning_rate": 1.9999987656826223e-05, "loss": 0.5139, "step": 6 }, { "epoch": 0.0004986287708800798, "grad_norm": 4.779965877532959, "learning_rate": 1.9999980713793205e-05, "loss": 0.7088, "step": 7 }, { "epoch": 0.0005698614524343769, "grad_norm": 3.5229039192199707, "learning_rate": 1.9999972227866142e-05, "loss": 0.5015, "step": 8 }, { "epoch": 0.000641094133988674, "grad_norm": 3.009692668914795, "learning_rate": 1.9999962199046343e-05, "loss": 0.5786, "step": 9 }, { "epoch": 0.0007123268155429712, "grad_norm": 5.74769401550293, "learning_rate": 1.9999950627335357e-05, "loss": 0.9989, "step": 10 }, { "epoch": 0.0007835594970972683, "grad_norm": 3.102475881576538, "learning_rate": 1.9999937512734968e-05, "loss": 0.4964, "step": 11 }, { "epoch": 0.0008547921786515654, "grad_norm": 3.950805902481079, "learning_rate": 1.9999922855247203e-05, "loss": 0.8578, "step": 12 }, { "epoch": 0.0009260248602058625, "grad_norm": 4.091545104980469, "learning_rate": 1.999990665487432e-05, "loss": 0.7056, "step": 13 }, { "epoch": 0.0009972575417601596, "grad_norm": 3.533233404159546, "learning_rate": 1.9999888911618815e-05, "loss": 0.819, "step": 14 }, { "epoch": 0.0010684902233144566, "grad_norm": 3.580665111541748, "learning_rate": 1.9999869625483433e-05, "loss": 0.6149, "step": 15 }, { "epoch": 0.0011397229048687538, "grad_norm": 5.071815013885498, "learning_rate": 1.9999848796471148e-05, "loss": 0.6862, "step": 16 }, { "epoch": 0.0012109555864230508, "grad_norm": 3.200089693069458, "learning_rate": 1.999982642458517e-05, "loss": 0.5016, "step": 17 }, { "epoch": 0.001282188267977348, "grad_norm": 3.0960240364074707, "learning_rate": 1.9999802509828955e-05, "loss": 0.6249, "step": 18 }, { "epoch": 0.001353420949531645, "grad_norm": 4.258779048919678, "learning_rate": 1.999977705220619e-05, "loss": 0.4007, "step": 19 }, { "epoch": 0.0014246536310859423, "grad_norm": 3.104407548904419, "learning_rate": 1.9999750051720802e-05, "loss": 0.5187, "step": 20 }, { "epoch": 0.0014958863126402393, "grad_norm": 2.47090482711792, "learning_rate": 1.9999721508376962e-05, "loss": 0.2384, "step": 21 }, { "epoch": 0.0015671189941945365, "grad_norm": 5.087483882904053, "learning_rate": 1.9999691422179066e-05, "loss": 0.3782, "step": 22 }, { "epoch": 0.0016383516757488335, "grad_norm": 2.178847074508667, "learning_rate": 1.9999659793131764e-05, "loss": 0.4351, "step": 23 }, { "epoch": 0.0017095843573031308, "grad_norm": 2.5588443279266357, "learning_rate": 1.9999626621239932e-05, "loss": 0.3396, "step": 24 }, { "epoch": 0.0017808170388574278, "grad_norm": 5.967891216278076, "learning_rate": 1.9999591906508686e-05, "loss": 0.7443, "step": 25 }, { "epoch": 0.001852049720411725, "grad_norm": 3.670088768005371, "learning_rate": 1.9999555648943387e-05, "loss": 0.6114, "step": 26 }, { "epoch": 0.001923282401966022, "grad_norm": 3.085096597671509, "learning_rate": 1.9999517848549628e-05, "loss": 0.4394, "step": 27 }, { "epoch": 0.0019945150835203192, "grad_norm": 6.807377338409424, "learning_rate": 1.9999478505333236e-05, "loss": 0.6268, "step": 28 }, { "epoch": 0.0020657477650746162, "grad_norm": 1.923145055770874, "learning_rate": 1.999943761930029e-05, "loss": 0.1604, "step": 29 }, { "epoch": 0.0021369804466289132, "grad_norm": 4.013949871063232, "learning_rate": 1.9999395190457093e-05, "loss": 0.78, "step": 30 }, { "epoch": 0.0022082131281832102, "grad_norm": 4.801888465881348, "learning_rate": 1.999935121881019e-05, "loss": 0.7075, "step": 31 }, { "epoch": 0.0022794458097375077, "grad_norm": 3.4631011486053467, "learning_rate": 1.999930570436637e-05, "loss": 0.714, "step": 32 }, { "epoch": 0.0023506784912918047, "grad_norm": 4.385468482971191, "learning_rate": 1.9999258647132645e-05, "loss": 0.6437, "step": 33 }, { "epoch": 0.0024219111728461017, "grad_norm": 4.913711071014404, "learning_rate": 1.999921004711629e-05, "loss": 0.2878, "step": 34 }, { "epoch": 0.0024931438544003987, "grad_norm": 3.7503769397735596, "learning_rate": 1.9999159904324793e-05, "loss": 0.7344, "step": 35 }, { "epoch": 0.002564376535954696, "grad_norm": 3.115455150604248, "learning_rate": 1.9999108218765898e-05, "loss": 0.7168, "step": 36 }, { "epoch": 0.002635609217508993, "grad_norm": 4.400928020477295, "learning_rate": 1.9999054990447576e-05, "loss": 0.8159, "step": 37 }, { "epoch": 0.00270684189906329, "grad_norm": 3.5946638584136963, "learning_rate": 1.9999000219378036e-05, "loss": 0.2652, "step": 38 }, { "epoch": 0.002778074580617587, "grad_norm": 5.205734729766846, "learning_rate": 1.9998943905565733e-05, "loss": 0.4576, "step": 39 }, { "epoch": 0.0028493072621718846, "grad_norm": 2.7111988067626953, "learning_rate": 1.9998886049019356e-05, "loss": 0.2135, "step": 40 }, { "epoch": 0.0029205399437261816, "grad_norm": 3.921225070953369, "learning_rate": 1.999882664974783e-05, "loss": 0.6962, "step": 41 }, { "epoch": 0.0029917726252804786, "grad_norm": 4.11433219909668, "learning_rate": 1.999876570776032e-05, "loss": 0.8392, "step": 42 }, { "epoch": 0.0030630053068347756, "grad_norm": 5.053853511810303, "learning_rate": 1.999870322306623e-05, "loss": 0.5696, "step": 43 }, { "epoch": 0.003134237988389073, "grad_norm": 4.543092250823975, "learning_rate": 1.9998639195675197e-05, "loss": 0.5621, "step": 44 }, { "epoch": 0.00320547066994337, "grad_norm": 4.138581275939941, "learning_rate": 1.99985736255971e-05, "loss": 0.3599, "step": 45 }, { "epoch": 0.003276703351497667, "grad_norm": 4.79445743560791, "learning_rate": 1.9998506512842063e-05, "loss": 0.9721, "step": 46 }, { "epoch": 0.003347936033051964, "grad_norm": 2.946519374847412, "learning_rate": 1.999843785742043e-05, "loss": 0.1689, "step": 47 }, { "epoch": 0.0034191687146062615, "grad_norm": 5.69881534576416, "learning_rate": 1.9998367659342804e-05, "loss": 0.4692, "step": 48 }, { "epoch": 0.0034904013961605585, "grad_norm": 5.293825149536133, "learning_rate": 1.999829591862001e-05, "loss": 0.4616, "step": 49 }, { "epoch": 0.0035616340777148555, "grad_norm": 2.842916250228882, "learning_rate": 1.9998222635263118e-05, "loss": 0.6245, "step": 50 }, { "epoch": 0.0036328667592691525, "grad_norm": 5.804721355438232, "learning_rate": 1.9998147809283436e-05, "loss": 0.7259, "step": 51 }, { "epoch": 0.00370409944082345, "grad_norm": 12.716594696044922, "learning_rate": 1.9998071440692508e-05, "loss": 0.431, "step": 52 }, { "epoch": 0.003775332122377747, "grad_norm": 5.454065799713135, "learning_rate": 1.9997993529502116e-05, "loss": 0.5346, "step": 53 }, { "epoch": 0.003846564803932044, "grad_norm": 7.468129634857178, "learning_rate": 1.9997914075724283e-05, "loss": 0.6601, "step": 54 }, { "epoch": 0.003917797485486341, "grad_norm": 3.656949043273926, "learning_rate": 1.9997833079371263e-05, "loss": 0.5982, "step": 55 }, { "epoch": 0.0039890301670406384, "grad_norm": 3.8655014038085938, "learning_rate": 1.9997750540455562e-05, "loss": 0.5819, "step": 56 }, { "epoch": 0.004060262848594935, "grad_norm": 2.246042490005493, "learning_rate": 1.999766645898991e-05, "loss": 0.3784, "step": 57 }, { "epoch": 0.0041314955301492325, "grad_norm": 2.2186176776885986, "learning_rate": 1.9997580834987277e-05, "loss": 0.3974, "step": 58 }, { "epoch": 0.00420272821170353, "grad_norm": 3.2396554946899414, "learning_rate": 1.9997493668460876e-05, "loss": 0.3056, "step": 59 }, { "epoch": 0.0042739608932578265, "grad_norm": 4.59601354598999, "learning_rate": 1.9997404959424153e-05, "loss": 0.733, "step": 60 }, { "epoch": 0.004345193574812124, "grad_norm": 3.6259570121765137, "learning_rate": 1.9997314707890802e-05, "loss": 0.4039, "step": 61 }, { "epoch": 0.0044164262563664205, "grad_norm": 3.5043556690216064, "learning_rate": 1.9997222913874745e-05, "loss": 0.9337, "step": 62 }, { "epoch": 0.004487658937920718, "grad_norm": 2.886662006378174, "learning_rate": 1.999712957739014e-05, "loss": 0.3556, "step": 63 }, { "epoch": 0.004558891619475015, "grad_norm": 4.945755958557129, "learning_rate": 1.9997034698451396e-05, "loss": 0.4439, "step": 64 }, { "epoch": 0.004630124301029312, "grad_norm": 4.1916399002075195, "learning_rate": 1.9996938277073146e-05, "loss": 0.914, "step": 65 }, { "epoch": 0.004701356982583609, "grad_norm": 3.7400479316711426, "learning_rate": 1.9996840313270268e-05, "loss": 0.3031, "step": 66 }, { "epoch": 0.004772589664137907, "grad_norm": 3.3715837001800537, "learning_rate": 1.999674080705788e-05, "loss": 0.4319, "step": 67 }, { "epoch": 0.004843822345692203, "grad_norm": 15.411069869995117, "learning_rate": 1.9996639758451323e-05, "loss": 0.4542, "step": 68 }, { "epoch": 0.004915055027246501, "grad_norm": 6.053090572357178, "learning_rate": 1.9996537167466205e-05, "loss": 0.7174, "step": 69 }, { "epoch": 0.004986287708800797, "grad_norm": 4.839987277984619, "learning_rate": 1.9996433034118342e-05, "loss": 0.2808, "step": 70 }, { "epoch": 0.005057520390355095, "grad_norm": 4.20911169052124, "learning_rate": 1.9996327358423812e-05, "loss": 0.7505, "step": 71 }, { "epoch": 0.005128753071909392, "grad_norm": 10.347644805908203, "learning_rate": 1.9996220140398907e-05, "loss": 0.4591, "step": 72 }, { "epoch": 0.005199985753463689, "grad_norm": 3.2806763648986816, "learning_rate": 1.9996111380060177e-05, "loss": 0.6981, "step": 73 }, { "epoch": 0.005271218435017986, "grad_norm": 3.0479278564453125, "learning_rate": 1.99960010774244e-05, "loss": 0.5131, "step": 74 }, { "epoch": 0.005342451116572284, "grad_norm": 4.1378607749938965, "learning_rate": 1.9995889232508595e-05, "loss": 0.4019, "step": 75 }, { "epoch": 0.00541368379812658, "grad_norm": 2.6758971214294434, "learning_rate": 1.9995775845330022e-05, "loss": 0.35, "step": 76 }, { "epoch": 0.005484916479680878, "grad_norm": 2.0090811252593994, "learning_rate": 1.999566091590617e-05, "loss": 0.3933, "step": 77 }, { "epoch": 0.005556149161235174, "grad_norm": 2.9117624759674072, "learning_rate": 1.9995544444254777e-05, "loss": 0.612, "step": 78 }, { "epoch": 0.005627381842789472, "grad_norm": 8.442682266235352, "learning_rate": 1.9995426430393808e-05, "loss": 0.3692, "step": 79 }, { "epoch": 0.005698614524343769, "grad_norm": 5.502487659454346, "learning_rate": 1.9995306874341477e-05, "loss": 0.6022, "step": 80 }, { "epoch": 0.005769847205898066, "grad_norm": 3.6163394451141357, "learning_rate": 1.9995185776116225e-05, "loss": 0.6671, "step": 81 }, { "epoch": 0.005841079887452363, "grad_norm": 5.902405261993408, "learning_rate": 1.9995063135736735e-05, "loss": 0.5535, "step": 82 }, { "epoch": 0.005912312569006661, "grad_norm": 3.5600876808166504, "learning_rate": 1.999493895322194e-05, "loss": 0.8438, "step": 83 }, { "epoch": 0.005983545250560957, "grad_norm": 3.09224796295166, "learning_rate": 1.9994813228590986e-05, "loss": 0.2243, "step": 84 }, { "epoch": 0.006054777932115255, "grad_norm": 3.3218393325805664, "learning_rate": 1.999468596186328e-05, "loss": 0.5945, "step": 85 }, { "epoch": 0.006126010613669551, "grad_norm": 4.142436981201172, "learning_rate": 1.9994557153058456e-05, "loss": 0.6539, "step": 86 }, { "epoch": 0.006197243295223849, "grad_norm": 3.3007771968841553, "learning_rate": 1.9994426802196384e-05, "loss": 0.6304, "step": 87 }, { "epoch": 0.006268475976778146, "grad_norm": 6.157745838165283, "learning_rate": 1.999429490929718e-05, "loss": 0.4685, "step": 88 }, { "epoch": 0.006339708658332443, "grad_norm": 3.9546329975128174, "learning_rate": 1.9994161474381198e-05, "loss": 0.4339, "step": 89 }, { "epoch": 0.00641094133988674, "grad_norm": 3.1059582233428955, "learning_rate": 1.9994026497469016e-05, "loss": 0.3078, "step": 90 }, { "epoch": 0.006482174021441037, "grad_norm": 4.720130443572998, "learning_rate": 1.9993889978581462e-05, "loss": 0.8041, "step": 91 }, { "epoch": 0.006553406702995334, "grad_norm": 2.687554121017456, "learning_rate": 1.9993751917739606e-05, "loss": 0.2588, "step": 92 }, { "epoch": 0.006624639384549632, "grad_norm": 3.4422361850738525, "learning_rate": 1.999361231496474e-05, "loss": 0.2784, "step": 93 }, { "epoch": 0.006695872066103928, "grad_norm": 3.087240219116211, "learning_rate": 1.9993471170278415e-05, "loss": 0.4372, "step": 94 }, { "epoch": 0.006767104747658226, "grad_norm": 3.697134256362915, "learning_rate": 1.9993328483702393e-05, "loss": 0.6927, "step": 95 }, { "epoch": 0.006838337429212523, "grad_norm": 2.5784127712249756, "learning_rate": 1.9993184255258705e-05, "loss": 0.2528, "step": 96 }, { "epoch": 0.00690957011076682, "grad_norm": 4.583463668823242, "learning_rate": 1.9993038484969592e-05, "loss": 0.5169, "step": 97 }, { "epoch": 0.006980802792321117, "grad_norm": 3.3828322887420654, "learning_rate": 1.9992891172857552e-05, "loss": 0.7655, "step": 98 }, { "epoch": 0.007052035473875414, "grad_norm": 3.1128482818603516, "learning_rate": 1.9992742318945307e-05, "loss": 0.712, "step": 99 }, { "epoch": 0.007123268155429711, "grad_norm": 4.8935065269470215, "learning_rate": 1.999259192325583e-05, "loss": 0.626, "step": 100 }, { "epoch": 0.0071945008369840085, "grad_norm": 2.432919979095459, "learning_rate": 1.999243998581232e-05, "loss": 0.4349, "step": 101 }, { "epoch": 0.007265733518538305, "grad_norm": 3.797100305557251, "learning_rate": 1.9992286506638226e-05, "loss": 0.6165, "step": 102 }, { "epoch": 0.0073369662000926025, "grad_norm": 2.691992998123169, "learning_rate": 1.9992131485757223e-05, "loss": 0.4649, "step": 103 }, { "epoch": 0.0074081988816469, "grad_norm": 2.762441635131836, "learning_rate": 1.9991974923193234e-05, "loss": 0.7836, "step": 104 }, { "epoch": 0.0074794315632011965, "grad_norm": 5.420194625854492, "learning_rate": 1.9991816818970408e-05, "loss": 0.438, "step": 105 }, { "epoch": 0.007550664244755494, "grad_norm": 3.0196404457092285, "learning_rate": 1.9991657173113144e-05, "loss": 0.305, "step": 106 }, { "epoch": 0.0076218969263097906, "grad_norm": 3.2360281944274902, "learning_rate": 1.999149598564607e-05, "loss": 0.3937, "step": 107 }, { "epoch": 0.007693129607864088, "grad_norm": 4.05167818069458, "learning_rate": 1.9991333256594062e-05, "loss": 0.5802, "step": 108 }, { "epoch": 0.0077643622894183854, "grad_norm": 4.107483863830566, "learning_rate": 1.9991168985982223e-05, "loss": 0.8575, "step": 109 }, { "epoch": 0.007835594970972682, "grad_norm": 2.9179444313049316, "learning_rate": 1.9991003173835898e-05, "loss": 0.4135, "step": 110 }, { "epoch": 0.007906827652526979, "grad_norm": 2.568345069885254, "learning_rate": 1.9990835820180665e-05, "loss": 0.4531, "step": 111 }, { "epoch": 0.007978060334081277, "grad_norm": 6.864298343658447, "learning_rate": 1.9990666925042356e-05, "loss": 0.8308, "step": 112 }, { "epoch": 0.008049293015635573, "grad_norm": 2.565925359725952, "learning_rate": 1.9990496488447024e-05, "loss": 0.4122, "step": 113 }, { "epoch": 0.00812052569718987, "grad_norm": 6.644340991973877, "learning_rate": 1.9990324510420966e-05, "loss": 0.7607, "step": 114 }, { "epoch": 0.008191758378744168, "grad_norm": 3.0580263137817383, "learning_rate": 1.9990150990990717e-05, "loss": 0.3786, "step": 115 }, { "epoch": 0.008262991060298465, "grad_norm": 3.059570550918579, "learning_rate": 1.998997593018305e-05, "loss": 0.7382, "step": 116 }, { "epoch": 0.008334223741852761, "grad_norm": 3.072932720184326, "learning_rate": 1.998979932802497e-05, "loss": 0.4858, "step": 117 }, { "epoch": 0.00840545642340706, "grad_norm": 3.3676867485046387, "learning_rate": 1.998962118454373e-05, "loss": 0.6202, "step": 118 }, { "epoch": 0.008476689104961356, "grad_norm": 2.543193817138672, "learning_rate": 1.9989441499766814e-05, "loss": 0.3953, "step": 119 }, { "epoch": 0.008547921786515653, "grad_norm": 12.877114295959473, "learning_rate": 1.998926027372195e-05, "loss": 0.4739, "step": 120 }, { "epoch": 0.008619154468069951, "grad_norm": 3.607471466064453, "learning_rate": 1.998907750643709e-05, "loss": 0.5236, "step": 121 }, { "epoch": 0.008690387149624248, "grad_norm": 3.5259222984313965, "learning_rate": 1.998889319794044e-05, "loss": 0.7894, "step": 122 }, { "epoch": 0.008761619831178544, "grad_norm": 2.9799933433532715, "learning_rate": 1.998870734826044e-05, "loss": 0.5314, "step": 123 }, { "epoch": 0.008832852512732841, "grad_norm": 1.8997468948364258, "learning_rate": 1.9988519957425754e-05, "loss": 0.1503, "step": 124 }, { "epoch": 0.00890408519428714, "grad_norm": 3.016674041748047, "learning_rate": 1.9988331025465298e-05, "loss": 0.3479, "step": 125 }, { "epoch": 0.008975317875841436, "grad_norm": 3.8689661026000977, "learning_rate": 1.998814055240823e-05, "loss": 0.276, "step": 126 }, { "epoch": 0.009046550557395732, "grad_norm": 3.0144858360290527, "learning_rate": 1.9987948538283932e-05, "loss": 0.6518, "step": 127 }, { "epoch": 0.00911778323895003, "grad_norm": 3.783109188079834, "learning_rate": 1.998775498312203e-05, "loss": 0.3909, "step": 128 }, { "epoch": 0.009189015920504327, "grad_norm": 3.588620901107788, "learning_rate": 1.998755988695239e-05, "loss": 0.3313, "step": 129 }, { "epoch": 0.009260248602058624, "grad_norm": 2.070775032043457, "learning_rate": 1.998736324980511e-05, "loss": 0.3195, "step": 130 }, { "epoch": 0.009331481283612922, "grad_norm": 2.109273672103882, "learning_rate": 1.998716507171053e-05, "loss": 0.1233, "step": 131 }, { "epoch": 0.009402713965167219, "grad_norm": 10.237347602844238, "learning_rate": 1.9986965352699225e-05, "loss": 0.1951, "step": 132 }, { "epoch": 0.009473946646721515, "grad_norm": 3.4003682136535645, "learning_rate": 1.9986764092802015e-05, "loss": 0.7005, "step": 133 }, { "epoch": 0.009545179328275814, "grad_norm": 3.381950855255127, "learning_rate": 1.998656129204995e-05, "loss": 0.5797, "step": 134 }, { "epoch": 0.00961641200983011, "grad_norm": 4.222867012023926, "learning_rate": 1.998635695047432e-05, "loss": 0.636, "step": 135 }, { "epoch": 0.009687644691384407, "grad_norm": 9.144466400146484, "learning_rate": 1.998615106810665e-05, "loss": 0.2701, "step": 136 }, { "epoch": 0.009758877372938705, "grad_norm": 3.7926902770996094, "learning_rate": 1.9985943644978705e-05, "loss": 0.7506, "step": 137 }, { "epoch": 0.009830110054493002, "grad_norm": 4.455250263214111, "learning_rate": 1.9985734681122494e-05, "loss": 0.7775, "step": 138 }, { "epoch": 0.009901342736047298, "grad_norm": 3.103604793548584, "learning_rate": 1.9985524176570255e-05, "loss": 0.59, "step": 139 }, { "epoch": 0.009972575417601595, "grad_norm": 2.847961664199829, "learning_rate": 1.9985312131354467e-05, "loss": 0.6216, "step": 140 }, { "epoch": 0.010043808099155893, "grad_norm": 3.8381190299987793, "learning_rate": 1.9985098545507843e-05, "loss": 0.2531, "step": 141 }, { "epoch": 0.01011504078071019, "grad_norm": 1.448052167892456, "learning_rate": 1.9984883419063343e-05, "loss": 0.1631, "step": 142 }, { "epoch": 0.010186273462264486, "grad_norm": 2.797065496444702, "learning_rate": 1.9984666752054152e-05, "loss": 0.3673, "step": 143 }, { "epoch": 0.010257506143818785, "grad_norm": 4.262566566467285, "learning_rate": 1.998444854451371e-05, "loss": 0.3816, "step": 144 }, { "epoch": 0.010328738825373081, "grad_norm": 3.6811625957489014, "learning_rate": 1.9984228796475672e-05, "loss": 0.2111, "step": 145 }, { "epoch": 0.010399971506927378, "grad_norm": 3.1426055431365967, "learning_rate": 1.9984007507973952e-05, "loss": 0.2815, "step": 146 }, { "epoch": 0.010471204188481676, "grad_norm": 2.340144395828247, "learning_rate": 1.9983784679042685e-05, "loss": 0.2413, "step": 147 }, { "epoch": 0.010542436870035973, "grad_norm": 3.61550235748291, "learning_rate": 1.998356030971626e-05, "loss": 0.6716, "step": 148 }, { "epoch": 0.01061366955159027, "grad_norm": 3.017151355743408, "learning_rate": 1.9983334400029285e-05, "loss": 0.133, "step": 149 }, { "epoch": 0.010684902233144567, "grad_norm": 4.367491722106934, "learning_rate": 1.998310695001662e-05, "loss": 0.5794, "step": 150 }, { "epoch": 0.010756134914698864, "grad_norm": 2.459001064300537, "learning_rate": 1.9982877959713366e-05, "loss": 0.5198, "step": 151 }, { "epoch": 0.01082736759625316, "grad_norm": 3.3977019786834717, "learning_rate": 1.9982647429154843e-05, "loss": 0.4678, "step": 152 }, { "epoch": 0.010898600277807457, "grad_norm": 3.322810649871826, "learning_rate": 1.9982415358376623e-05, "loss": 0.5499, "step": 153 }, { "epoch": 0.010969832959361756, "grad_norm": 2.961925745010376, "learning_rate": 1.9982181747414508e-05, "loss": 0.5121, "step": 154 }, { "epoch": 0.011041065640916052, "grad_norm": 3.4209818840026855, "learning_rate": 1.998194659630455e-05, "loss": 0.4575, "step": 155 }, { "epoch": 0.011112298322470349, "grad_norm": 3.0197482109069824, "learning_rate": 1.9981709905083026e-05, "loss": 0.4688, "step": 156 }, { "epoch": 0.011183531004024647, "grad_norm": 2.363948345184326, "learning_rate": 1.998147167378645e-05, "loss": 0.2601, "step": 157 }, { "epoch": 0.011254763685578944, "grad_norm": 2.626372814178467, "learning_rate": 1.9981231902451595e-05, "loss": 0.3493, "step": 158 }, { "epoch": 0.01132599636713324, "grad_norm": 3.4593353271484375, "learning_rate": 1.9980990591115437e-05, "loss": 0.415, "step": 159 }, { "epoch": 0.011397229048687538, "grad_norm": 3.0266027450561523, "learning_rate": 1.9980747739815217e-05, "loss": 0.5212, "step": 160 }, { "epoch": 0.011468461730241835, "grad_norm": 2.364204168319702, "learning_rate": 1.99805033485884e-05, "loss": 0.2966, "step": 161 }, { "epoch": 0.011539694411796132, "grad_norm": 2.1384124755859375, "learning_rate": 1.99802574174727e-05, "loss": 0.0679, "step": 162 }, { "epoch": 0.01161092709335043, "grad_norm": 2.142422914505005, "learning_rate": 1.9980009946506053e-05, "loss": 0.5892, "step": 163 }, { "epoch": 0.011682159774904726, "grad_norm": 3.496680974960327, "learning_rate": 1.9979760935726647e-05, "loss": 0.5956, "step": 164 }, { "epoch": 0.011753392456459023, "grad_norm": 3.9778552055358887, "learning_rate": 1.99795103851729e-05, "loss": 0.8177, "step": 165 }, { "epoch": 0.011824625138013321, "grad_norm": 5.169002056121826, "learning_rate": 1.997925829488347e-05, "loss": 0.7233, "step": 166 }, { "epoch": 0.011895857819567618, "grad_norm": 4.267916202545166, "learning_rate": 1.9979004664897252e-05, "loss": 0.6047, "step": 167 }, { "epoch": 0.011967090501121914, "grad_norm": 7.034934043884277, "learning_rate": 1.9978749495253378e-05, "loss": 0.6528, "step": 168 }, { "epoch": 0.012038323182676211, "grad_norm": 2.823610782623291, "learning_rate": 1.9978492785991216e-05, "loss": 0.5336, "step": 169 }, { "epoch": 0.01210955586423051, "grad_norm": 4.046404838562012, "learning_rate": 1.997823453715038e-05, "loss": 0.6645, "step": 170 }, { "epoch": 0.012180788545784806, "grad_norm": 2.528013229370117, "learning_rate": 1.9977974748770708e-05, "loss": 0.4077, "step": 171 }, { "epoch": 0.012252021227339102, "grad_norm": 3.8361382484436035, "learning_rate": 1.9977713420892287e-05, "loss": 0.5486, "step": 172 }, { "epoch": 0.0123232539088934, "grad_norm": 2.7005319595336914, "learning_rate": 1.9977450553555434e-05, "loss": 0.4868, "step": 173 }, { "epoch": 0.012394486590447697, "grad_norm": 4.683351039886475, "learning_rate": 1.9977186146800707e-05, "loss": 0.5587, "step": 174 }, { "epoch": 0.012465719272001994, "grad_norm": 2.860874891281128, "learning_rate": 1.997692020066891e-05, "loss": 0.5051, "step": 175 }, { "epoch": 0.012536951953556292, "grad_norm": 2.046689510345459, "learning_rate": 1.997665271520106e-05, "loss": 0.1578, "step": 176 }, { "epoch": 0.012608184635110589, "grad_norm": 2.085561513900757, "learning_rate": 1.997638369043844e-05, "loss": 0.4687, "step": 177 }, { "epoch": 0.012679417316664885, "grad_norm": 3.1333861351013184, "learning_rate": 1.9976113126422553e-05, "loss": 0.7068, "step": 178 }, { "epoch": 0.012750649998219184, "grad_norm": 4.0296430587768555, "learning_rate": 1.997584102319514e-05, "loss": 0.6563, "step": 179 }, { "epoch": 0.01282188267977348, "grad_norm": 4.484042167663574, "learning_rate": 1.9975567380798195e-05, "loss": 0.6994, "step": 180 }, { "epoch": 0.012893115361327777, "grad_norm": 3.053692579269409, "learning_rate": 1.997529219927393e-05, "loss": 0.7804, "step": 181 }, { "epoch": 0.012964348042882073, "grad_norm": 4.7079176902771, "learning_rate": 1.9975015478664802e-05, "loss": 0.3524, "step": 182 }, { "epoch": 0.013035580724436372, "grad_norm": 4.899001598358154, "learning_rate": 1.9974737219013513e-05, "loss": 0.6012, "step": 183 }, { "epoch": 0.013106813405990668, "grad_norm": 2.5683019161224365, "learning_rate": 1.9974457420362986e-05, "loss": 0.4968, "step": 184 }, { "epoch": 0.013178046087544965, "grad_norm": 3.1313059329986572, "learning_rate": 1.9974176082756397e-05, "loss": 0.5335, "step": 185 }, { "epoch": 0.013249278769099263, "grad_norm": 2.9540343284606934, "learning_rate": 1.9973893206237154e-05, "loss": 0.4855, "step": 186 }, { "epoch": 0.01332051145065356, "grad_norm": 9.442750930786133, "learning_rate": 1.99736087908489e-05, "loss": 0.4468, "step": 187 }, { "epoch": 0.013391744132207856, "grad_norm": 1.851338505744934, "learning_rate": 1.9973322836635517e-05, "loss": 0.1753, "step": 188 }, { "epoch": 0.013462976813762155, "grad_norm": 3.7628555297851562, "learning_rate": 1.9973035343641127e-05, "loss": 0.8189, "step": 189 }, { "epoch": 0.013534209495316451, "grad_norm": 5.735669136047363, "learning_rate": 1.9972746311910086e-05, "loss": 0.6077, "step": 190 }, { "epoch": 0.013605442176870748, "grad_norm": 2.7641937732696533, "learning_rate": 1.997245574148699e-05, "loss": 0.64, "step": 191 }, { "epoch": 0.013676674858425046, "grad_norm": 2.5914268493652344, "learning_rate": 1.9972163632416666e-05, "loss": 0.5483, "step": 192 }, { "epoch": 0.013747907539979343, "grad_norm": 3.0151500701904297, "learning_rate": 1.997186998474419e-05, "loss": 0.4246, "step": 193 }, { "epoch": 0.01381914022153364, "grad_norm": 4.690418720245361, "learning_rate": 1.9971574798514862e-05, "loss": 0.3163, "step": 194 }, { "epoch": 0.013890372903087938, "grad_norm": 3.6574878692626953, "learning_rate": 1.997127807377423e-05, "loss": 0.5816, "step": 195 }, { "epoch": 0.013961605584642234, "grad_norm": 3.441723108291626, "learning_rate": 1.9970979810568082e-05, "loss": 0.8093, "step": 196 }, { "epoch": 0.01403283826619653, "grad_norm": 2.4268226623535156, "learning_rate": 1.9970680008942425e-05, "loss": 0.3376, "step": 197 }, { "epoch": 0.014104070947750827, "grad_norm": 11.938136100769043, "learning_rate": 1.9970378668943522e-05, "loss": 0.4333, "step": 198 }, { "epoch": 0.014175303629305126, "grad_norm": 2.5628435611724854, "learning_rate": 1.9970075790617865e-05, "loss": 0.2481, "step": 199 }, { "epoch": 0.014246536310859422, "grad_norm": 3.9423046112060547, "learning_rate": 1.9969771374012186e-05, "loss": 0.1729, "step": 200 }, { "epoch": 0.014317768992413719, "grad_norm": 2.8135504722595215, "learning_rate": 1.996946541917345e-05, "loss": 0.3506, "step": 201 }, { "epoch": 0.014389001673968017, "grad_norm": 2.382378339767456, "learning_rate": 1.996915792614887e-05, "loss": 0.3523, "step": 202 }, { "epoch": 0.014460234355522314, "grad_norm": 1.9002189636230469, "learning_rate": 1.9968848894985884e-05, "loss": 0.2107, "step": 203 }, { "epoch": 0.01453146703707661, "grad_norm": 4.309718608856201, "learning_rate": 1.996853832573217e-05, "loss": 0.5936, "step": 204 }, { "epoch": 0.014602699718630908, "grad_norm": 2.8296396732330322, "learning_rate": 1.996822621843565e-05, "loss": 0.7109, "step": 205 }, { "epoch": 0.014673932400185205, "grad_norm": 2.397279739379883, "learning_rate": 1.9967912573144476e-05, "loss": 0.3835, "step": 206 }, { "epoch": 0.014745165081739502, "grad_norm": 6.063565254211426, "learning_rate": 1.9967597389907043e-05, "loss": 0.81, "step": 207 }, { "epoch": 0.0148163977632938, "grad_norm": 2.834203004837036, "learning_rate": 1.9967280668771977e-05, "loss": 0.3727, "step": 208 }, { "epoch": 0.014887630444848097, "grad_norm": 3.483571767807007, "learning_rate": 1.996696240978815e-05, "loss": 0.4553, "step": 209 }, { "epoch": 0.014958863126402393, "grad_norm": 2.407409906387329, "learning_rate": 1.9966642613004664e-05, "loss": 0.318, "step": 210 }, { "epoch": 0.015030095807956691, "grad_norm": 3.507251501083374, "learning_rate": 1.9966321278470856e-05, "loss": 0.633, "step": 211 }, { "epoch": 0.015101328489510988, "grad_norm": 4.002109050750732, "learning_rate": 1.9965998406236306e-05, "loss": 0.2085, "step": 212 }, { "epoch": 0.015172561171065285, "grad_norm": 2.691777229309082, "learning_rate": 1.9965673996350836e-05, "loss": 0.5832, "step": 213 }, { "epoch": 0.015243793852619581, "grad_norm": 2.2662160396575928, "learning_rate": 1.9965348048864495e-05, "loss": 0.3932, "step": 214 }, { "epoch": 0.01531502653417388, "grad_norm": 2.292246103286743, "learning_rate": 1.9965020563827574e-05, "loss": 0.6018, "step": 215 }, { "epoch": 0.015386259215728176, "grad_norm": 1.5478793382644653, "learning_rate": 1.99646915412906e-05, "loss": 0.1498, "step": 216 }, { "epoch": 0.015457491897282473, "grad_norm": 2.828451156616211, "learning_rate": 1.996436098130433e-05, "loss": 0.6063, "step": 217 }, { "epoch": 0.015528724578836771, "grad_norm": 3.577572822570801, "learning_rate": 1.9964028883919783e-05, "loss": 0.3565, "step": 218 }, { "epoch": 0.015599957260391067, "grad_norm": 5.46316385269165, "learning_rate": 1.9963695249188185e-05, "loss": 0.4272, "step": 219 }, { "epoch": 0.015671189941945364, "grad_norm": 4.562551498413086, "learning_rate": 1.9963360077161015e-05, "loss": 0.7234, "step": 220 }, { "epoch": 0.01574242262349966, "grad_norm": 1.2947126626968384, "learning_rate": 1.996302336788999e-05, "loss": 0.1611, "step": 221 }, { "epoch": 0.015813655305053957, "grad_norm": 5.177380561828613, "learning_rate": 1.9962685121427055e-05, "loss": 0.5179, "step": 222 }, { "epoch": 0.015884887986608257, "grad_norm": 16.194833755493164, "learning_rate": 1.9962345337824404e-05, "loss": 0.3716, "step": 223 }, { "epoch": 0.015956120668162554, "grad_norm": 4.624609470367432, "learning_rate": 1.996200401713446e-05, "loss": 0.5248, "step": 224 }, { "epoch": 0.01602735334971685, "grad_norm": 4.038144588470459, "learning_rate": 1.9961661159409885e-05, "loss": 0.3842, "step": 225 }, { "epoch": 0.016098586031271147, "grad_norm": 3.391400098800659, "learning_rate": 1.9961316764703583e-05, "loss": 0.4298, "step": 226 }, { "epoch": 0.016169818712825444, "grad_norm": 3.2097573280334473, "learning_rate": 1.996097083306868e-05, "loss": 0.549, "step": 227 }, { "epoch": 0.01624105139437974, "grad_norm": 2.477729558944702, "learning_rate": 1.9960623364558555e-05, "loss": 0.4347, "step": 228 }, { "epoch": 0.01631228407593404, "grad_norm": 6.840750694274902, "learning_rate": 1.9960274359226824e-05, "loss": 0.1686, "step": 229 }, { "epoch": 0.016383516757488337, "grad_norm": 3.1947145462036133, "learning_rate": 1.9959923817127326e-05, "loss": 0.4376, "step": 230 }, { "epoch": 0.016454749439042633, "grad_norm": 3.9973793029785156, "learning_rate": 1.9959571738314153e-05, "loss": 0.1284, "step": 231 }, { "epoch": 0.01652598212059693, "grad_norm": 4.1577467918396, "learning_rate": 1.9959218122841624e-05, "loss": 0.6727, "step": 232 }, { "epoch": 0.016597214802151226, "grad_norm": 5.289069175720215, "learning_rate": 1.99588629707643e-05, "loss": 0.57, "step": 233 }, { "epoch": 0.016668447483705523, "grad_norm": 2.324817419052124, "learning_rate": 1.995850628213697e-05, "loss": 0.0917, "step": 234 }, { "epoch": 0.01673968016525982, "grad_norm": 5.161026477813721, "learning_rate": 1.995814805701468e-05, "loss": 0.2336, "step": 235 }, { "epoch": 0.01681091284681412, "grad_norm": 3.0585944652557373, "learning_rate": 1.9957788295452693e-05, "loss": 0.1709, "step": 236 }, { "epoch": 0.016882145528368416, "grad_norm": 2.7663941383361816, "learning_rate": 1.9957426997506518e-05, "loss": 0.2282, "step": 237 }, { "epoch": 0.016953378209922713, "grad_norm": 4.388126850128174, "learning_rate": 1.9957064163231896e-05, "loss": 0.76, "step": 238 }, { "epoch": 0.01702461089147701, "grad_norm": 4.663323402404785, "learning_rate": 1.9956699792684812e-05, "loss": 0.6776, "step": 239 }, { "epoch": 0.017095843573031306, "grad_norm": 4.506319046020508, "learning_rate": 1.9956333885921488e-05, "loss": 0.6275, "step": 240 }, { "epoch": 0.017167076254585602, "grad_norm": 3.204096555709839, "learning_rate": 1.995596644299837e-05, "loss": 0.2953, "step": 241 }, { "epoch": 0.017238308936139903, "grad_norm": 3.9930503368377686, "learning_rate": 1.9955597463972157e-05, "loss": 0.1772, "step": 242 }, { "epoch": 0.0173095416176942, "grad_norm": 3.1534619331359863, "learning_rate": 1.9955226948899782e-05, "loss": 0.5467, "step": 243 }, { "epoch": 0.017380774299248496, "grad_norm": 2.5726420879364014, "learning_rate": 1.995485489783841e-05, "loss": 0.3991, "step": 244 }, { "epoch": 0.017452006980802792, "grad_norm": 2.412804365158081, "learning_rate": 1.9954481310845437e-05, "loss": 0.5003, "step": 245 }, { "epoch": 0.01752323966235709, "grad_norm": 9.232999801635742, "learning_rate": 1.9954106187978507e-05, "loss": 0.7441, "step": 246 }, { "epoch": 0.017594472343911385, "grad_norm": 3.987621545791626, "learning_rate": 1.9953729529295504e-05, "loss": 0.2536, "step": 247 }, { "epoch": 0.017665705025465682, "grad_norm": 3.260282039642334, "learning_rate": 1.9953351334854537e-05, "loss": 0.6587, "step": 248 }, { "epoch": 0.017736937707019982, "grad_norm": 3.2461953163146973, "learning_rate": 1.9952971604713963e-05, "loss": 0.2529, "step": 249 }, { "epoch": 0.01780817038857428, "grad_norm": 3.155229330062866, "learning_rate": 1.995259033893236e-05, "loss": 0.3536, "step": 250 }, { "epoch": 0.017879403070128575, "grad_norm": 5.562077045440674, "learning_rate": 1.9952207537568563e-05, "loss": 0.2997, "step": 251 }, { "epoch": 0.01795063575168287, "grad_norm": 3.6538283824920654, "learning_rate": 1.9951823200681628e-05, "loss": 0.2044, "step": 252 }, { "epoch": 0.01802186843323717, "grad_norm": 3.403193950653076, "learning_rate": 1.995143732833086e-05, "loss": 0.1903, "step": 253 }, { "epoch": 0.018093101114791465, "grad_norm": 3.040759325027466, "learning_rate": 1.995104992057579e-05, "loss": 0.5624, "step": 254 }, { "epoch": 0.018164333796345765, "grad_norm": 3.8880608081817627, "learning_rate": 1.9950660977476196e-05, "loss": 0.9028, "step": 255 }, { "epoch": 0.01823556647790006, "grad_norm": 3.370450973510742, "learning_rate": 1.9950270499092083e-05, "loss": 0.768, "step": 256 }, { "epoch": 0.018306799159454358, "grad_norm": 4.288878440856934, "learning_rate": 1.99498784854837e-05, "loss": 0.7244, "step": 257 }, { "epoch": 0.018378031841008655, "grad_norm": 6.8655524253845215, "learning_rate": 1.994948493671153e-05, "loss": 0.4358, "step": 258 }, { "epoch": 0.01844926452256295, "grad_norm": 2.423650026321411, "learning_rate": 1.9949089852836297e-05, "loss": 0.4363, "step": 259 }, { "epoch": 0.018520497204117248, "grad_norm": 3.7407803535461426, "learning_rate": 1.994869323391895e-05, "loss": 0.5721, "step": 260 }, { "epoch": 0.018591729885671548, "grad_norm": 6.390771865844727, "learning_rate": 1.9948295080020696e-05, "loss": 0.1977, "step": 261 }, { "epoch": 0.018662962567225844, "grad_norm": 3.715406656265259, "learning_rate": 1.9947895391202955e-05, "loss": 0.6475, "step": 262 }, { "epoch": 0.01873419524878014, "grad_norm": 3.824331521987915, "learning_rate": 1.9947494167527398e-05, "loss": 0.6944, "step": 263 }, { "epoch": 0.018805427930334438, "grad_norm": 2.4876017570495605, "learning_rate": 1.9947091409055933e-05, "loss": 0.5717, "step": 264 }, { "epoch": 0.018876660611888734, "grad_norm": 1.9492506980895996, "learning_rate": 1.9946687115850696e-05, "loss": 0.2419, "step": 265 }, { "epoch": 0.01894789329344303, "grad_norm": 4.194497585296631, "learning_rate": 1.994628128797407e-05, "loss": 0.1283, "step": 266 }, { "epoch": 0.019019125974997327, "grad_norm": 3.9074301719665527, "learning_rate": 1.9945873925488667e-05, "loss": 0.839, "step": 267 }, { "epoch": 0.019090358656551627, "grad_norm": 8.3717622756958, "learning_rate": 1.9945465028457337e-05, "loss": 0.872, "step": 268 }, { "epoch": 0.019161591338105924, "grad_norm": 3.44714617729187, "learning_rate": 1.9945054596943177e-05, "loss": 0.6192, "step": 269 }, { "epoch": 0.01923282401966022, "grad_norm": 2.4443359375, "learning_rate": 1.9944642631009507e-05, "loss": 0.1195, "step": 270 }, { "epoch": 0.019304056701214517, "grad_norm": 3.4766972064971924, "learning_rate": 1.9944229130719885e-05, "loss": 0.6818, "step": 271 }, { "epoch": 0.019375289382768814, "grad_norm": 3.0689163208007812, "learning_rate": 1.9943814096138116e-05, "loss": 0.7114, "step": 272 }, { "epoch": 0.01944652206432311, "grad_norm": 4.211381435394287, "learning_rate": 1.9943397527328233e-05, "loss": 0.6779, "step": 273 }, { "epoch": 0.01951775474587741, "grad_norm": 5.433151721954346, "learning_rate": 1.9942979424354506e-05, "loss": 0.833, "step": 274 }, { "epoch": 0.019588987427431707, "grad_norm": 4.105435371398926, "learning_rate": 1.9942559787281453e-05, "loss": 0.2866, "step": 275 }, { "epoch": 0.019660220108986003, "grad_norm": 2.9731662273406982, "learning_rate": 1.994213861617381e-05, "loss": 0.6215, "step": 276 }, { "epoch": 0.0197314527905403, "grad_norm": 5.589776992797852, "learning_rate": 1.9941715911096563e-05, "loss": 0.6516, "step": 277 }, { "epoch": 0.019802685472094596, "grad_norm": 1.9879305362701416, "learning_rate": 1.9941291672114928e-05, "loss": 0.1755, "step": 278 }, { "epoch": 0.019873918153648893, "grad_norm": 2.5514347553253174, "learning_rate": 1.9940865899294367e-05, "loss": 0.3968, "step": 279 }, { "epoch": 0.01994515083520319, "grad_norm": 4.328810214996338, "learning_rate": 1.9940438592700568e-05, "loss": 0.7842, "step": 280 }, { "epoch": 0.02001638351675749, "grad_norm": 3.6462225914001465, "learning_rate": 1.9940009752399462e-05, "loss": 0.4746, "step": 281 }, { "epoch": 0.020087616198311786, "grad_norm": 3.3935914039611816, "learning_rate": 1.993957937845721e-05, "loss": 0.4229, "step": 282 }, { "epoch": 0.020158848879866083, "grad_norm": 3.1074788570404053, "learning_rate": 1.993914747094022e-05, "loss": 0.3774, "step": 283 }, { "epoch": 0.02023008156142038, "grad_norm": 3.889533042907715, "learning_rate": 1.9938714029915128e-05, "loss": 0.6921, "step": 284 }, { "epoch": 0.020301314242974676, "grad_norm": 5.8477067947387695, "learning_rate": 1.9938279055448814e-05, "loss": 0.3378, "step": 285 }, { "epoch": 0.020372546924528973, "grad_norm": 3.313352108001709, "learning_rate": 1.993784254760838e-05, "loss": 0.6154, "step": 286 }, { "epoch": 0.020443779606083273, "grad_norm": 1.8884928226470947, "learning_rate": 1.9937404506461187e-05, "loss": 0.2238, "step": 287 }, { "epoch": 0.02051501228763757, "grad_norm": 2.968752145767212, "learning_rate": 1.993696493207481e-05, "loss": 0.3922, "step": 288 }, { "epoch": 0.020586244969191866, "grad_norm": 3.4692282676696777, "learning_rate": 1.9936523824517074e-05, "loss": 0.7704, "step": 289 }, { "epoch": 0.020657477650746162, "grad_norm": 3.0259530544281006, "learning_rate": 1.993608118385604e-05, "loss": 0.3657, "step": 290 }, { "epoch": 0.02072871033230046, "grad_norm": 3.6837944984436035, "learning_rate": 1.993563701016e-05, "loss": 0.6612, "step": 291 }, { "epoch": 0.020799943013854755, "grad_norm": 2.9335479736328125, "learning_rate": 1.993519130349749e-05, "loss": 0.6863, "step": 292 }, { "epoch": 0.020871175695409052, "grad_norm": 1.9513373374938965, "learning_rate": 1.9934744063937273e-05, "loss": 0.2353, "step": 293 }, { "epoch": 0.020942408376963352, "grad_norm": 3.080646276473999, "learning_rate": 1.9934295291548357e-05, "loss": 0.4507, "step": 294 }, { "epoch": 0.02101364105851765, "grad_norm": 3.0105276107788086, "learning_rate": 1.9933844986399977e-05, "loss": 0.4654, "step": 295 }, { "epoch": 0.021084873740071945, "grad_norm": 2.7973146438598633, "learning_rate": 1.9933393148561616e-05, "loss": 0.625, "step": 296 }, { "epoch": 0.021156106421626242, "grad_norm": 2.129157543182373, "learning_rate": 1.9932939778102985e-05, "loss": 0.2614, "step": 297 }, { "epoch": 0.02122733910318054, "grad_norm": 2.2891945838928223, "learning_rate": 1.9932484875094036e-05, "loss": 0.4582, "step": 298 }, { "epoch": 0.021298571784734835, "grad_norm": 3.821343183517456, "learning_rate": 1.9932028439604958e-05, "loss": 0.3361, "step": 299 }, { "epoch": 0.021369804466289135, "grad_norm": 3.1815178394317627, "learning_rate": 1.993157047170617e-05, "loss": 0.5018, "step": 300 }, { "epoch": 0.02144103714784343, "grad_norm": 3.672743320465088, "learning_rate": 1.9931110971468332e-05, "loss": 0.4511, "step": 301 }, { "epoch": 0.021512269829397728, "grad_norm": 4.108164310455322, "learning_rate": 1.9930649938962344e-05, "loss": 0.7323, "step": 302 }, { "epoch": 0.021583502510952025, "grad_norm": 2.035090446472168, "learning_rate": 1.9930187374259338e-05, "loss": 0.2107, "step": 303 }, { "epoch": 0.02165473519250632, "grad_norm": 2.2435388565063477, "learning_rate": 1.992972327743068e-05, "loss": 0.3322, "step": 304 }, { "epoch": 0.021725967874060618, "grad_norm": 8.435978889465332, "learning_rate": 1.9929257648547976e-05, "loss": 0.3341, "step": 305 }, { "epoch": 0.021797200555614914, "grad_norm": 2.9391729831695557, "learning_rate": 1.992879048768307e-05, "loss": 0.597, "step": 306 }, { "epoch": 0.021868433237169214, "grad_norm": 5.239787578582764, "learning_rate": 1.9928321794908035e-05, "loss": 0.4413, "step": 307 }, { "epoch": 0.02193966591872351, "grad_norm": 2.2809853553771973, "learning_rate": 1.992785157029519e-05, "loss": 0.2285, "step": 308 }, { "epoch": 0.022010898600277808, "grad_norm": 4.418404579162598, "learning_rate": 1.9927379813917087e-05, "loss": 0.6709, "step": 309 }, { "epoch": 0.022082131281832104, "grad_norm": 2.7674572467803955, "learning_rate": 1.992690652584651e-05, "loss": 0.508, "step": 310 }, { "epoch": 0.0221533639633864, "grad_norm": 5.631307601928711, "learning_rate": 1.992643170615648e-05, "loss": 0.1668, "step": 311 }, { "epoch": 0.022224596644940697, "grad_norm": 3.439270496368408, "learning_rate": 1.9925955354920265e-05, "loss": 0.6604, "step": 312 }, { "epoch": 0.022295829326494997, "grad_norm": 4.659964084625244, "learning_rate": 1.9925477472211356e-05, "loss": 0.6043, "step": 313 }, { "epoch": 0.022367062008049294, "grad_norm": 4.633533954620361, "learning_rate": 1.9924998058103483e-05, "loss": 0.4942, "step": 314 }, { "epoch": 0.02243829468960359, "grad_norm": 2.009563684463501, "learning_rate": 1.9924517112670617e-05, "loss": 0.2789, "step": 315 }, { "epoch": 0.022509527371157887, "grad_norm": 3.7584874629974365, "learning_rate": 1.9924034635986968e-05, "loss": 0.8411, "step": 316 }, { "epoch": 0.022580760052712184, "grad_norm": 2.826160430908203, "learning_rate": 1.992355062812697e-05, "loss": 0.5776, "step": 317 }, { "epoch": 0.02265199273426648, "grad_norm": 2.4606239795684814, "learning_rate": 1.99230650891653e-05, "loss": 0.4872, "step": 318 }, { "epoch": 0.02272322541582078, "grad_norm": 1.8814976215362549, "learning_rate": 1.9922578019176878e-05, "loss": 0.2644, "step": 319 }, { "epoch": 0.022794458097375077, "grad_norm": 2.760129451751709, "learning_rate": 1.992208941823685e-05, "loss": 0.4219, "step": 320 }, { "epoch": 0.022865690778929373, "grad_norm": 2.220008373260498, "learning_rate": 1.99215992864206e-05, "loss": 0.2795, "step": 321 }, { "epoch": 0.02293692346048367, "grad_norm": 1.4184520244598389, "learning_rate": 1.9921107623803757e-05, "loss": 0.0809, "step": 322 }, { "epoch": 0.023008156142037967, "grad_norm": 8.697464942932129, "learning_rate": 1.9920614430462173e-05, "loss": 0.5667, "step": 323 }, { "epoch": 0.023079388823592263, "grad_norm": 2.463754177093506, "learning_rate": 1.9920119706471944e-05, "loss": 0.4717, "step": 324 }, { "epoch": 0.02315062150514656, "grad_norm": 3.290628671646118, "learning_rate": 1.9919623451909402e-05, "loss": 0.5066, "step": 325 }, { "epoch": 0.02322185418670086, "grad_norm": 2.3060688972473145, "learning_rate": 1.9919125666851115e-05, "loss": 0.3064, "step": 326 }, { "epoch": 0.023293086868255156, "grad_norm": 4.83120059967041, "learning_rate": 1.9918626351373885e-05, "loss": 0.6122, "step": 327 }, { "epoch": 0.023364319549809453, "grad_norm": 4.266474723815918, "learning_rate": 1.991812550555475e-05, "loss": 0.3777, "step": 328 }, { "epoch": 0.02343555223136375, "grad_norm": 2.851112127304077, "learning_rate": 1.9917623129470985e-05, "loss": 0.72, "step": 329 }, { "epoch": 0.023506784912918046, "grad_norm": 1.6400808095932007, "learning_rate": 1.99171192232001e-05, "loss": 0.1356, "step": 330 }, { "epoch": 0.023578017594472343, "grad_norm": 1.8228585720062256, "learning_rate": 1.9916613786819856e-05, "loss": 0.1328, "step": 331 }, { "epoch": 0.023649250276026643, "grad_norm": 3.637011766433716, "learning_rate": 1.991610682040822e-05, "loss": 0.6838, "step": 332 }, { "epoch": 0.02372048295758094, "grad_norm": 2.101360321044922, "learning_rate": 1.9915598324043415e-05, "loss": 0.2831, "step": 333 }, { "epoch": 0.023791715639135236, "grad_norm": 3.1839003562927246, "learning_rate": 1.9915088297803905e-05, "loss": 0.5276, "step": 334 }, { "epoch": 0.023862948320689532, "grad_norm": 2.7043795585632324, "learning_rate": 1.9914576741768373e-05, "loss": 0.5107, "step": 335 }, { "epoch": 0.02393418100224383, "grad_norm": 3.033754825592041, "learning_rate": 1.991406365601575e-05, "loss": 0.5584, "step": 336 }, { "epoch": 0.024005413683798126, "grad_norm": 2.9945266246795654, "learning_rate": 1.99135490406252e-05, "loss": 0.3827, "step": 337 }, { "epoch": 0.024076646365352422, "grad_norm": 4.037656784057617, "learning_rate": 1.9913032895676126e-05, "loss": 0.4174, "step": 338 }, { "epoch": 0.024147879046906722, "grad_norm": 2.721132516860962, "learning_rate": 1.9912515221248157e-05, "loss": 0.5932, "step": 339 }, { "epoch": 0.02421911172846102, "grad_norm": 3.981628179550171, "learning_rate": 1.9911996017421168e-05, "loss": 0.4655, "step": 340 }, { "epoch": 0.024290344410015315, "grad_norm": 2.3138608932495117, "learning_rate": 1.991147528427527e-05, "loss": 0.2708, "step": 341 }, { "epoch": 0.024361577091569612, "grad_norm": 5.457282066345215, "learning_rate": 1.9910953021890802e-05, "loss": 0.4119, "step": 342 }, { "epoch": 0.02443280977312391, "grad_norm": 2.1577696800231934, "learning_rate": 1.9910429230348348e-05, "loss": 0.2956, "step": 343 }, { "epoch": 0.024504042454678205, "grad_norm": 2.6752164363861084, "learning_rate": 1.9909903909728722e-05, "loss": 0.601, "step": 344 }, { "epoch": 0.024575275136232505, "grad_norm": 2.392815113067627, "learning_rate": 1.9909377060112973e-05, "loss": 0.3593, "step": 345 }, { "epoch": 0.0246465078177868, "grad_norm": 3.2140135765075684, "learning_rate": 1.990884868158239e-05, "loss": 0.7233, "step": 346 }, { "epoch": 0.024717740499341098, "grad_norm": 3.7890055179595947, "learning_rate": 1.9908318774218498e-05, "loss": 0.5034, "step": 347 }, { "epoch": 0.024788973180895395, "grad_norm": 3.2403042316436768, "learning_rate": 1.9907787338103054e-05, "loss": 0.5745, "step": 348 }, { "epoch": 0.02486020586244969, "grad_norm": 5.495974540710449, "learning_rate": 1.9907254373318054e-05, "loss": 0.5602, "step": 349 }, { "epoch": 0.024931438544003988, "grad_norm": 3.1602189540863037, "learning_rate": 1.9906719879945733e-05, "loss": 0.4357, "step": 350 }, { "epoch": 0.025002671225558284, "grad_norm": 4.674003601074219, "learning_rate": 1.990618385806855e-05, "loss": 0.4336, "step": 351 }, { "epoch": 0.025073903907112585, "grad_norm": 3.797294855117798, "learning_rate": 1.9905646307769212e-05, "loss": 0.4363, "step": 352 }, { "epoch": 0.02514513658866688, "grad_norm": 8.287883758544922, "learning_rate": 1.990510722913066e-05, "loss": 0.3708, "step": 353 }, { "epoch": 0.025216369270221178, "grad_norm": 3.861119031906128, "learning_rate": 1.9904566622236064e-05, "loss": 0.668, "step": 354 }, { "epoch": 0.025287601951775474, "grad_norm": 3.2397239208221436, "learning_rate": 1.9904024487168835e-05, "loss": 0.5818, "step": 355 }, { "epoch": 0.02535883463332977, "grad_norm": 3.0673437118530273, "learning_rate": 1.9903480824012617e-05, "loss": 0.6269, "step": 356 }, { "epoch": 0.025430067314884067, "grad_norm": 2.810466766357422, "learning_rate": 1.9902935632851296e-05, "loss": 0.4296, "step": 357 }, { "epoch": 0.025501299996438367, "grad_norm": 2.518979787826538, "learning_rate": 1.9902388913768987e-05, "loss": 0.694, "step": 358 }, { "epoch": 0.025572532677992664, "grad_norm": 1.5447441339492798, "learning_rate": 1.9901840666850045e-05, "loss": 0.1533, "step": 359 }, { "epoch": 0.02564376535954696, "grad_norm": 2.9155945777893066, "learning_rate": 1.9901290892179056e-05, "loss": 0.532, "step": 360 }, { "epoch": 0.025714998041101257, "grad_norm": 5.897381782531738, "learning_rate": 1.9900739589840846e-05, "loss": 0.4274, "step": 361 }, { "epoch": 0.025786230722655554, "grad_norm": 2.4018092155456543, "learning_rate": 1.9900186759920475e-05, "loss": 0.4731, "step": 362 }, { "epoch": 0.02585746340420985, "grad_norm": 2.5617763996124268, "learning_rate": 1.9899632402503242e-05, "loss": 0.4517, "step": 363 }, { "epoch": 0.025928696085764147, "grad_norm": 1.746895670890808, "learning_rate": 1.9899076517674674e-05, "loss": 0.1913, "step": 364 }, { "epoch": 0.025999928767318447, "grad_norm": 2.6804137229919434, "learning_rate": 1.9898519105520537e-05, "loss": 0.412, "step": 365 }, { "epoch": 0.026071161448872743, "grad_norm": 2.785515546798706, "learning_rate": 1.989796016612684e-05, "loss": 0.6519, "step": 366 }, { "epoch": 0.02614239413042704, "grad_norm": 2.0644407272338867, "learning_rate": 1.989739969957982e-05, "loss": 0.2525, "step": 367 }, { "epoch": 0.026213626811981337, "grad_norm": 3.6696462631225586, "learning_rate": 1.9896837705965946e-05, "loss": 0.4856, "step": 368 }, { "epoch": 0.026284859493535633, "grad_norm": 3.4241416454315186, "learning_rate": 1.9896274185371934e-05, "loss": 0.5238, "step": 369 }, { "epoch": 0.02635609217508993, "grad_norm": 2.380523681640625, "learning_rate": 1.9895709137884727e-05, "loss": 0.355, "step": 370 }, { "epoch": 0.02642732485664423, "grad_norm": 3.2456114292144775, "learning_rate": 1.989514256359151e-05, "loss": 0.3644, "step": 371 }, { "epoch": 0.026498557538198526, "grad_norm": 3.5596134662628174, "learning_rate": 1.9894574462579688e-05, "loss": 0.1417, "step": 372 }, { "epoch": 0.026569790219752823, "grad_norm": 4.966527938842773, "learning_rate": 1.9894004834936924e-05, "loss": 0.7113, "step": 373 }, { "epoch": 0.02664102290130712, "grad_norm": 1.9390217065811157, "learning_rate": 1.9893433680751105e-05, "loss": 0.2767, "step": 374 }, { "epoch": 0.026712255582861416, "grad_norm": 2.4989407062530518, "learning_rate": 1.989286100011035e-05, "loss": 0.299, "step": 375 }, { "epoch": 0.026783488264415713, "grad_norm": 2.683946132659912, "learning_rate": 1.9892286793103018e-05, "loss": 0.5704, "step": 376 }, { "epoch": 0.026854720945970013, "grad_norm": 11.826340675354004, "learning_rate": 1.9891711059817705e-05, "loss": 1.1996, "step": 377 }, { "epoch": 0.02692595362752431, "grad_norm": 4.417357921600342, "learning_rate": 1.9891133800343245e-05, "loss": 0.782, "step": 378 }, { "epoch": 0.026997186309078606, "grad_norm": 3.663700819015503, "learning_rate": 1.989055501476869e-05, "loss": 0.5405, "step": 379 }, { "epoch": 0.027068418990632902, "grad_norm": 3.7081875801086426, "learning_rate": 1.9889974703183354e-05, "loss": 0.6235, "step": 380 }, { "epoch": 0.0271396516721872, "grad_norm": 3.61299204826355, "learning_rate": 1.988939286567677e-05, "loss": 0.7523, "step": 381 }, { "epoch": 0.027210884353741496, "grad_norm": 4.286790370941162, "learning_rate": 1.9888809502338706e-05, "loss": 0.3109, "step": 382 }, { "epoch": 0.027282117035295792, "grad_norm": 5.685676574707031, "learning_rate": 1.988822461325917e-05, "loss": 0.9177, "step": 383 }, { "epoch": 0.027353349716850092, "grad_norm": 3.342818021774292, "learning_rate": 1.988763819852841e-05, "loss": 0.6392, "step": 384 }, { "epoch": 0.02742458239840439, "grad_norm": 1.5610567331314087, "learning_rate": 1.9887050258236894e-05, "loss": 0.2345, "step": 385 }, { "epoch": 0.027495815079958685, "grad_norm": 2.9108164310455322, "learning_rate": 1.988646079247534e-05, "loss": 0.5857, "step": 386 }, { "epoch": 0.027567047761512982, "grad_norm": 2.519801139831543, "learning_rate": 1.9885869801334697e-05, "loss": 0.2971, "step": 387 }, { "epoch": 0.02763828044306728, "grad_norm": 2.8632805347442627, "learning_rate": 1.988527728490615e-05, "loss": 0.1872, "step": 388 }, { "epoch": 0.027709513124621575, "grad_norm": 5.323709487915039, "learning_rate": 1.9884683243281117e-05, "loss": 0.8751, "step": 389 }, { "epoch": 0.027780745806175875, "grad_norm": 4.197568416595459, "learning_rate": 1.988408767655125e-05, "loss": 0.428, "step": 390 }, { "epoch": 0.02785197848773017, "grad_norm": 2.3973400592803955, "learning_rate": 1.9883490584808443e-05, "loss": 0.6006, "step": 391 }, { "epoch": 0.027923211169284468, "grad_norm": 4.030494213104248, "learning_rate": 1.9882891968144816e-05, "loss": 0.5688, "step": 392 }, { "epoch": 0.027994443850838765, "grad_norm": 5.639304161071777, "learning_rate": 1.9882291826652735e-05, "loss": 0.586, "step": 393 }, { "epoch": 0.02806567653239306, "grad_norm": 2.8041231632232666, "learning_rate": 1.988169016042479e-05, "loss": 0.091, "step": 394 }, { "epoch": 0.028136909213947358, "grad_norm": 3.7328743934631348, "learning_rate": 1.988108696955382e-05, "loss": 0.524, "step": 395 }, { "epoch": 0.028208141895501655, "grad_norm": 5.2784318923950195, "learning_rate": 1.988048225413288e-05, "loss": 0.2382, "step": 396 }, { "epoch": 0.028279374577055955, "grad_norm": 2.2437314987182617, "learning_rate": 1.9879876014255283e-05, "loss": 0.3112, "step": 397 }, { "epoch": 0.02835060725861025, "grad_norm": 9.111502647399902, "learning_rate": 1.9879268250014558e-05, "loss": 0.2343, "step": 398 }, { "epoch": 0.028421839940164548, "grad_norm": 3.522883653640747, "learning_rate": 1.987865896150448e-05, "loss": 0.4367, "step": 399 }, { "epoch": 0.028493072621718844, "grad_norm": 4.153606414794922, "learning_rate": 1.9878048148819054e-05, "loss": 0.4825, "step": 400 }, { "epoch": 0.02856430530327314, "grad_norm": 3.8764166831970215, "learning_rate": 1.9877435812052522e-05, "loss": 0.6552, "step": 401 }, { "epoch": 0.028635537984827437, "grad_norm": 2.51763916015625, "learning_rate": 1.9876821951299362e-05, "loss": 0.5576, "step": 402 }, { "epoch": 0.028706770666381737, "grad_norm": 4.438761234283447, "learning_rate": 1.9876206566654285e-05, "loss": 0.5222, "step": 403 }, { "epoch": 0.028778003347936034, "grad_norm": 2.3087432384490967, "learning_rate": 1.9875589658212244e-05, "loss": 0.2928, "step": 404 }, { "epoch": 0.02884923602949033, "grad_norm": 3.323972463607788, "learning_rate": 1.9874971226068417e-05, "loss": 0.5531, "step": 405 }, { "epoch": 0.028920468711044627, "grad_norm": 2.0313777923583984, "learning_rate": 1.987435127031822e-05, "loss": 0.1824, "step": 406 }, { "epoch": 0.028991701392598924, "grad_norm": 2.1502997875213623, "learning_rate": 1.987372979105731e-05, "loss": 0.3506, "step": 407 }, { "epoch": 0.02906293407415322, "grad_norm": 3.3114686012268066, "learning_rate": 1.987310678838157e-05, "loss": 0.7221, "step": 408 }, { "epoch": 0.029134166755707517, "grad_norm": 2.3991193771362305, "learning_rate": 1.9872482262387128e-05, "loss": 0.2765, "step": 409 }, { "epoch": 0.029205399437261817, "grad_norm": 3.928464889526367, "learning_rate": 1.987185621317034e-05, "loss": 0.7624, "step": 410 }, { "epoch": 0.029276632118816114, "grad_norm": 2.1687374114990234, "learning_rate": 1.98712286408278e-05, "loss": 0.2781, "step": 411 }, { "epoch": 0.02934786480037041, "grad_norm": 5.5175461769104, "learning_rate": 1.9870599545456333e-05, "loss": 0.3872, "step": 412 }, { "epoch": 0.029419097481924707, "grad_norm": 3.9385826587677, "learning_rate": 1.9869968927153005e-05, "loss": 0.6785, "step": 413 }, { "epoch": 0.029490330163479003, "grad_norm": 3.941413402557373, "learning_rate": 1.986933678601511e-05, "loss": 0.3963, "step": 414 }, { "epoch": 0.0295615628450333, "grad_norm": 3.5355467796325684, "learning_rate": 1.9868703122140186e-05, "loss": 0.7217, "step": 415 }, { "epoch": 0.0296327955265876, "grad_norm": 3.0641679763793945, "learning_rate": 1.9868067935625997e-05, "loss": 0.4679, "step": 416 }, { "epoch": 0.029704028208141896, "grad_norm": 3.6041886806488037, "learning_rate": 1.9867431226570546e-05, "loss": 0.3975, "step": 417 }, { "epoch": 0.029775260889696193, "grad_norm": 2.970160484313965, "learning_rate": 1.9866792995072073e-05, "loss": 0.4456, "step": 418 }, { "epoch": 0.02984649357125049, "grad_norm": 3.517085552215576, "learning_rate": 1.986615324122905e-05, "loss": 0.4509, "step": 419 }, { "epoch": 0.029917726252804786, "grad_norm": 2.7299153804779053, "learning_rate": 1.986551196514018e-05, "loss": 0.1667, "step": 420 }, { "epoch": 0.029988958934359083, "grad_norm": 2.693978786468506, "learning_rate": 1.9864869166904412e-05, "loss": 0.1425, "step": 421 }, { "epoch": 0.030060191615913383, "grad_norm": 1.7205466032028198, "learning_rate": 1.986422484662092e-05, "loss": 0.1671, "step": 422 }, { "epoch": 0.03013142429746768, "grad_norm": 4.00972843170166, "learning_rate": 1.9863579004389115e-05, "loss": 0.5295, "step": 423 }, { "epoch": 0.030202656979021976, "grad_norm": 2.7980525493621826, "learning_rate": 1.9862931640308648e-05, "loss": 0.6735, "step": 424 }, { "epoch": 0.030273889660576273, "grad_norm": 3.5531563758850098, "learning_rate": 1.9862282754479394e-05, "loss": 0.4479, "step": 425 }, { "epoch": 0.03034512234213057, "grad_norm": 2.928947925567627, "learning_rate": 1.9861632347001474e-05, "loss": 0.6876, "step": 426 }, { "epoch": 0.030416355023684866, "grad_norm": 6.069326877593994, "learning_rate": 1.986098041797524e-05, "loss": 0.7775, "step": 427 }, { "epoch": 0.030487587705239162, "grad_norm": 3.96733021736145, "learning_rate": 1.986032696750127e-05, "loss": 0.4101, "step": 428 }, { "epoch": 0.030558820386793462, "grad_norm": 3.677042007446289, "learning_rate": 1.9859671995680395e-05, "loss": 0.7264, "step": 429 }, { "epoch": 0.03063005306834776, "grad_norm": 4.4930806159973145, "learning_rate": 1.9859015502613666e-05, "loss": 0.5214, "step": 430 }, { "epoch": 0.030701285749902055, "grad_norm": 3.615664482116699, "learning_rate": 1.9858357488402374e-05, "loss": 0.5326, "step": 431 }, { "epoch": 0.030772518431456352, "grad_norm": 5.401984691619873, "learning_rate": 1.985769795314804e-05, "loss": 0.7462, "step": 432 }, { "epoch": 0.03084375111301065, "grad_norm": 2.731973648071289, "learning_rate": 1.985703689695243e-05, "loss": 0.2896, "step": 433 }, { "epoch": 0.030914983794564945, "grad_norm": 2.688290596008301, "learning_rate": 1.9856374319917528e-05, "loss": 0.1249, "step": 434 }, { "epoch": 0.030986216476119245, "grad_norm": 3.5047125816345215, "learning_rate": 1.9855710222145576e-05, "loss": 0.5545, "step": 435 }, { "epoch": 0.031057449157673542, "grad_norm": 4.093603134155273, "learning_rate": 1.985504460373903e-05, "loss": 0.5501, "step": 436 }, { "epoch": 0.03112868183922784, "grad_norm": 3.544187307357788, "learning_rate": 1.9854377464800586e-05, "loss": 0.4204, "step": 437 }, { "epoch": 0.031199914520782135, "grad_norm": 1.999008297920227, "learning_rate": 1.9853708805433182e-05, "loss": 0.1753, "step": 438 }, { "epoch": 0.03127114720233643, "grad_norm": 2.739551305770874, "learning_rate": 1.985303862573998e-05, "loss": 0.3076, "step": 439 }, { "epoch": 0.03134237988389073, "grad_norm": 2.281320333480835, "learning_rate": 1.9852366925824393e-05, "loss": 0.2099, "step": 440 }, { "epoch": 0.031413612565445025, "grad_norm": 1.648186445236206, "learning_rate": 1.985169370579004e-05, "loss": 0.1352, "step": 441 }, { "epoch": 0.03148484524699932, "grad_norm": 4.600591659545898, "learning_rate": 1.9851018965740806e-05, "loss": 0.9799, "step": 442 }, { "epoch": 0.03155607792855362, "grad_norm": 3.5631096363067627, "learning_rate": 1.9850342705780788e-05, "loss": 0.3896, "step": 443 }, { "epoch": 0.031627310610107914, "grad_norm": 4.889103889465332, "learning_rate": 1.984966492601433e-05, "loss": 0.749, "step": 444 }, { "epoch": 0.03169854329166222, "grad_norm": 4.687511920928955, "learning_rate": 1.984898562654601e-05, "loss": 0.4429, "step": 445 }, { "epoch": 0.031769775973216514, "grad_norm": 7.479091644287109, "learning_rate": 1.984830480748063e-05, "loss": 0.4455, "step": 446 }, { "epoch": 0.03184100865477081, "grad_norm": 2.9047815799713135, "learning_rate": 1.9847622468923236e-05, "loss": 0.6796, "step": 447 }, { "epoch": 0.03191224133632511, "grad_norm": 4.178807258605957, "learning_rate": 1.9846938610979104e-05, "loss": 0.7549, "step": 448 }, { "epoch": 0.031983474017879404, "grad_norm": 2.2553889751434326, "learning_rate": 1.984625323375375e-05, "loss": 0.3021, "step": 449 }, { "epoch": 0.0320547066994337, "grad_norm": 3.3558671474456787, "learning_rate": 1.984556633735292e-05, "loss": 0.2952, "step": 450 }, { "epoch": 0.032125939380988, "grad_norm": 1.9594029188156128, "learning_rate": 1.9844877921882593e-05, "loss": 0.2546, "step": 451 }, { "epoch": 0.032197172062542294, "grad_norm": 2.9695496559143066, "learning_rate": 1.9844187987448984e-05, "loss": 0.5052, "step": 452 }, { "epoch": 0.03226840474409659, "grad_norm": 1.8268671035766602, "learning_rate": 1.9843496534158543e-05, "loss": 0.1553, "step": 453 }, { "epoch": 0.03233963742565089, "grad_norm": 2.928473949432373, "learning_rate": 1.984280356211796e-05, "loss": 0.4309, "step": 454 }, { "epoch": 0.032410870107205184, "grad_norm": 2.545924425125122, "learning_rate": 1.9842109071434143e-05, "loss": 0.2777, "step": 455 }, { "epoch": 0.03248210278875948, "grad_norm": 2.765592336654663, "learning_rate": 1.9841413062214253e-05, "loss": 0.3464, "step": 456 }, { "epoch": 0.03255333547031378, "grad_norm": 4.263272285461426, "learning_rate": 1.9840715534565677e-05, "loss": 0.5828, "step": 457 }, { "epoch": 0.03262456815186808, "grad_norm": 2.826770067214966, "learning_rate": 1.984001648859603e-05, "loss": 0.7046, "step": 458 }, { "epoch": 0.03269580083342238, "grad_norm": 3.1558821201324463, "learning_rate": 1.9839315924413174e-05, "loss": 0.3712, "step": 459 }, { "epoch": 0.03276703351497667, "grad_norm": 3.294163942337036, "learning_rate": 1.9838613842125193e-05, "loss": 0.2644, "step": 460 }, { "epoch": 0.03283826619653097, "grad_norm": 2.643167018890381, "learning_rate": 1.9837910241840418e-05, "loss": 0.2987, "step": 461 }, { "epoch": 0.032909498878085267, "grad_norm": 5.479509353637695, "learning_rate": 1.9837205123667404e-05, "loss": 0.4441, "step": 462 }, { "epoch": 0.03298073155963956, "grad_norm": 2.3433303833007812, "learning_rate": 1.983649848771494e-05, "loss": 0.4423, "step": 463 }, { "epoch": 0.03305196424119386, "grad_norm": 2.0404880046844482, "learning_rate": 1.9835790334092054e-05, "loss": 0.0975, "step": 464 }, { "epoch": 0.033123196922748156, "grad_norm": 3.087324619293213, "learning_rate": 1.9835080662908013e-05, "loss": 0.2178, "step": 465 }, { "epoch": 0.03319442960430245, "grad_norm": 2.445744514465332, "learning_rate": 1.9834369474272307e-05, "loss": 0.4179, "step": 466 }, { "epoch": 0.03326566228585675, "grad_norm": 2.799377918243408, "learning_rate": 1.983365676829466e-05, "loss": 0.5952, "step": 467 }, { "epoch": 0.033336894967411046, "grad_norm": 1.9265437126159668, "learning_rate": 1.9832942545085047e-05, "loss": 0.3361, "step": 468 }, { "epoch": 0.03340812764896534, "grad_norm": 2.3368327617645264, "learning_rate": 1.9832226804753658e-05, "loss": 0.2449, "step": 469 }, { "epoch": 0.03347936033051964, "grad_norm": 2.2329750061035156, "learning_rate": 1.9831509547410922e-05, "loss": 0.2992, "step": 470 }, { "epoch": 0.03355059301207394, "grad_norm": 6.898249626159668, "learning_rate": 1.9830790773167513e-05, "loss": 0.5584, "step": 471 }, { "epoch": 0.03362182569362824, "grad_norm": 2.356281280517578, "learning_rate": 1.983007048213432e-05, "loss": 0.1332, "step": 472 }, { "epoch": 0.033693058375182536, "grad_norm": 3.375041961669922, "learning_rate": 1.9829348674422488e-05, "loss": 0.2103, "step": 473 }, { "epoch": 0.03376429105673683, "grad_norm": 2.7144367694854736, "learning_rate": 1.982862535014337e-05, "loss": 0.2324, "step": 474 }, { "epoch": 0.03383552373829113, "grad_norm": 2.8595993518829346, "learning_rate": 1.9827900509408583e-05, "loss": 0.3015, "step": 475 }, { "epoch": 0.033906756419845425, "grad_norm": 4.890361785888672, "learning_rate": 1.9827174152329952e-05, "loss": 0.6136, "step": 476 }, { "epoch": 0.03397798910139972, "grad_norm": 4.054783821105957, "learning_rate": 1.9826446279019547e-05, "loss": 0.2187, "step": 477 }, { "epoch": 0.03404922178295402, "grad_norm": 3.750102996826172, "learning_rate": 1.9825716889589678e-05, "loss": 0.5219, "step": 478 }, { "epoch": 0.034120454464508315, "grad_norm": 2.7119219303131104, "learning_rate": 1.9824985984152877e-05, "loss": 0.1744, "step": 479 }, { "epoch": 0.03419168714606261, "grad_norm": 3.6870715618133545, "learning_rate": 1.9824253562821915e-05, "loss": 0.8384, "step": 480 }, { "epoch": 0.03426291982761691, "grad_norm": 2.2053945064544678, "learning_rate": 1.98235196257098e-05, "loss": 0.3671, "step": 481 }, { "epoch": 0.034334152509171205, "grad_norm": 4.388424873352051, "learning_rate": 1.982278417292977e-05, "loss": 0.7734, "step": 482 }, { "epoch": 0.0344053851907255, "grad_norm": 3.043001413345337, "learning_rate": 1.98220472045953e-05, "loss": 0.2659, "step": 483 }, { "epoch": 0.034476617872279805, "grad_norm": 4.794840335845947, "learning_rate": 1.9821308720820086e-05, "loss": 0.4878, "step": 484 }, { "epoch": 0.0345478505538341, "grad_norm": 2.6886534690856934, "learning_rate": 1.9820568721718082e-05, "loss": 0.4664, "step": 485 }, { "epoch": 0.0346190832353884, "grad_norm": 2.071166753768921, "learning_rate": 1.9819827207403458e-05, "loss": 0.1905, "step": 486 }, { "epoch": 0.034690315916942695, "grad_norm": 3.821279525756836, "learning_rate": 1.9819084177990615e-05, "loss": 0.6089, "step": 487 }, { "epoch": 0.03476154859849699, "grad_norm": 3.9558138847351074, "learning_rate": 1.9818339633594203e-05, "loss": 0.6162, "step": 488 }, { "epoch": 0.03483278128005129, "grad_norm": 2.9398770332336426, "learning_rate": 1.9817593574329096e-05, "loss": 0.6193, "step": 489 }, { "epoch": 0.034904013961605584, "grad_norm": 4.456028938293457, "learning_rate": 1.9816846000310403e-05, "loss": 0.2438, "step": 490 }, { "epoch": 0.03497524664315988, "grad_norm": 6.016989231109619, "learning_rate": 1.981609691165346e-05, "loss": 0.7123, "step": 491 }, { "epoch": 0.03504647932471418, "grad_norm": 2.4266092777252197, "learning_rate": 1.9815346308473857e-05, "loss": 0.2771, "step": 492 }, { "epoch": 0.035117712006268474, "grad_norm": 4.437952518463135, "learning_rate": 1.9814594190887394e-05, "loss": 0.767, "step": 493 }, { "epoch": 0.03518894468782277, "grad_norm": 3.53505539894104, "learning_rate": 1.9813840559010116e-05, "loss": 0.5234, "step": 494 }, { "epoch": 0.03526017736937707, "grad_norm": 1.7348042726516724, "learning_rate": 1.9813085412958307e-05, "loss": 0.2698, "step": 495 }, { "epoch": 0.035331410050931364, "grad_norm": 4.0975518226623535, "learning_rate": 1.9812328752848474e-05, "loss": 0.3849, "step": 496 }, { "epoch": 0.03540264273248567, "grad_norm": 5.793941974639893, "learning_rate": 1.981157057879736e-05, "loss": 0.8259, "step": 497 }, { "epoch": 0.035473875414039964, "grad_norm": 4.684053897857666, "learning_rate": 1.9810810890921943e-05, "loss": 0.4955, "step": 498 }, { "epoch": 0.03554510809559426, "grad_norm": 2.797945737838745, "learning_rate": 1.981004968933944e-05, "loss": 0.4531, "step": 499 }, { "epoch": 0.03561634077714856, "grad_norm": 2.842824697494507, "learning_rate": 1.9809286974167296e-05, "loss": 0.8338, "step": 500 }, { "epoch": 0.035687573458702854, "grad_norm": 3.4375417232513428, "learning_rate": 1.9808522745523186e-05, "loss": 0.7382, "step": 501 }, { "epoch": 0.03575880614025715, "grad_norm": 3.2492804527282715, "learning_rate": 1.9807757003525022e-05, "loss": 0.5886, "step": 502 }, { "epoch": 0.03583003882181145, "grad_norm": 4.535238265991211, "learning_rate": 1.9806989748290954e-05, "loss": 0.5273, "step": 503 }, { "epoch": 0.03590127150336574, "grad_norm": 2.942216396331787, "learning_rate": 1.980622097993936e-05, "loss": 0.5912, "step": 504 }, { "epoch": 0.03597250418492004, "grad_norm": 3.3251519203186035, "learning_rate": 1.9805450698588856e-05, "loss": 0.4281, "step": 505 }, { "epoch": 0.03604373686647434, "grad_norm": 3.2008259296417236, "learning_rate": 1.9804678904358284e-05, "loss": 0.9166, "step": 506 }, { "epoch": 0.03611496954802863, "grad_norm": 5.771389961242676, "learning_rate": 1.9803905597366726e-05, "loss": 0.6366, "step": 507 }, { "epoch": 0.03618620222958293, "grad_norm": 3.8380589485168457, "learning_rate": 1.9803130777733494e-05, "loss": 0.4569, "step": 508 }, { "epoch": 0.03625743491113723, "grad_norm": 2.5490469932556152, "learning_rate": 1.9802354445578137e-05, "loss": 0.4982, "step": 509 }, { "epoch": 0.03632866759269153, "grad_norm": 3.894367218017578, "learning_rate": 1.9801576601020435e-05, "loss": 0.5642, "step": 510 }, { "epoch": 0.036399900274245826, "grad_norm": 2.7723565101623535, "learning_rate": 1.98007972441804e-05, "loss": 0.2161, "step": 511 }, { "epoch": 0.03647113295580012, "grad_norm": 1.9746577739715576, "learning_rate": 1.9800016375178276e-05, "loss": 0.2489, "step": 512 }, { "epoch": 0.03654236563735442, "grad_norm": 4.234973430633545, "learning_rate": 1.979923399413455e-05, "loss": 0.7916, "step": 513 }, { "epoch": 0.036613598318908716, "grad_norm": 2.4094748497009277, "learning_rate": 1.9798450101169927e-05, "loss": 0.3179, "step": 514 }, { "epoch": 0.03668483100046301, "grad_norm": 2.0431346893310547, "learning_rate": 1.979766469640536e-05, "loss": 0.344, "step": 515 }, { "epoch": 0.03675606368201731, "grad_norm": 3.0487828254699707, "learning_rate": 1.9796877779962026e-05, "loss": 0.222, "step": 516 }, { "epoch": 0.036827296363571606, "grad_norm": 2.880671739578247, "learning_rate": 1.9796089351961338e-05, "loss": 0.4392, "step": 517 }, { "epoch": 0.0368985290451259, "grad_norm": 2.8772623538970947, "learning_rate": 1.9795299412524948e-05, "loss": 0.4138, "step": 518 }, { "epoch": 0.0369697617266802, "grad_norm": 3.2752318382263184, "learning_rate": 1.9794507961774725e-05, "loss": 0.671, "step": 519 }, { "epoch": 0.037040994408234496, "grad_norm": 3.5203163623809814, "learning_rate": 1.979371499983279e-05, "loss": 0.5094, "step": 520 }, { "epoch": 0.03711222708978879, "grad_norm": 2.9965898990631104, "learning_rate": 1.9792920526821486e-05, "loss": 0.5047, "step": 521 }, { "epoch": 0.037183459771343096, "grad_norm": 2.692641496658325, "learning_rate": 1.9792124542863394e-05, "loss": 0.276, "step": 522 }, { "epoch": 0.03725469245289739, "grad_norm": 2.987027883529663, "learning_rate": 1.9791327048081322e-05, "loss": 0.7071, "step": 523 }, { "epoch": 0.03732592513445169, "grad_norm": 3.1831185817718506, "learning_rate": 1.9790528042598316e-05, "loss": 0.458, "step": 524 }, { "epoch": 0.037397157816005985, "grad_norm": 4.440424919128418, "learning_rate": 1.978972752653766e-05, "loss": 0.4679, "step": 525 }, { "epoch": 0.03746839049756028, "grad_norm": 3.0778136253356934, "learning_rate": 1.978892550002286e-05, "loss": 0.3532, "step": 526 }, { "epoch": 0.03753962317911458, "grad_norm": 3.235177516937256, "learning_rate": 1.9788121963177663e-05, "loss": 0.7119, "step": 527 }, { "epoch": 0.037610855860668875, "grad_norm": 3.2151880264282227, "learning_rate": 1.978731691612604e-05, "loss": 0.4708, "step": 528 }, { "epoch": 0.03768208854222317, "grad_norm": 2.434161901473999, "learning_rate": 1.9786510358992213e-05, "loss": 0.4478, "step": 529 }, { "epoch": 0.03775332122377747, "grad_norm": 4.251272201538086, "learning_rate": 1.9785702291900616e-05, "loss": 0.1019, "step": 530 }, { "epoch": 0.037824553905331765, "grad_norm": 2.8141674995422363, "learning_rate": 1.978489271497593e-05, "loss": 0.1937, "step": 531 }, { "epoch": 0.03789578658688606, "grad_norm": 3.0674660205841064, "learning_rate": 1.978408162834306e-05, "loss": 0.4534, "step": 532 }, { "epoch": 0.03796701926844036, "grad_norm": 2.0037434101104736, "learning_rate": 1.9783269032127156e-05, "loss": 0.1696, "step": 533 }, { "epoch": 0.038038251949994654, "grad_norm": 3.690661668777466, "learning_rate": 1.9782454926453585e-05, "loss": 0.1415, "step": 534 }, { "epoch": 0.03810948463154896, "grad_norm": 5.418653964996338, "learning_rate": 1.978163931144796e-05, "loss": 0.6483, "step": 535 }, { "epoch": 0.038180717313103255, "grad_norm": 7.357758522033691, "learning_rate": 1.978082218723612e-05, "loss": 0.9729, "step": 536 }, { "epoch": 0.03825194999465755, "grad_norm": 2.54341983795166, "learning_rate": 1.978000355394414e-05, "loss": 0.071, "step": 537 }, { "epoch": 0.03832318267621185, "grad_norm": 4.216346263885498, "learning_rate": 1.9779183411698327e-05, "loss": 0.6702, "step": 538 }, { "epoch": 0.038394415357766144, "grad_norm": 3.249361753463745, "learning_rate": 1.977836176062522e-05, "loss": 0.5922, "step": 539 }, { "epoch": 0.03846564803932044, "grad_norm": 2.692652463912964, "learning_rate": 1.977753860085159e-05, "loss": 0.1265, "step": 540 }, { "epoch": 0.03853688072087474, "grad_norm": 3.1855549812316895, "learning_rate": 1.977671393250444e-05, "loss": 0.6981, "step": 541 }, { "epoch": 0.038608113402429034, "grad_norm": 6.880410194396973, "learning_rate": 1.977588775571101e-05, "loss": 0.3264, "step": 542 }, { "epoch": 0.03867934608398333, "grad_norm": 3.1824851036071777, "learning_rate": 1.9775060070598777e-05, "loss": 0.1714, "step": 543 }, { "epoch": 0.03875057876553763, "grad_norm": 3.7968270778656006, "learning_rate": 1.977423087729544e-05, "loss": 0.6827, "step": 544 }, { "epoch": 0.038821811447091924, "grad_norm": 4.0150885581970215, "learning_rate": 1.977340017592893e-05, "loss": 0.805, "step": 545 }, { "epoch": 0.03889304412864622, "grad_norm": 3.601682662963867, "learning_rate": 1.9772567966627417e-05, "loss": 0.3501, "step": 546 }, { "epoch": 0.03896427681020052, "grad_norm": 2.8812432289123535, "learning_rate": 1.9771734249519307e-05, "loss": 0.3541, "step": 547 }, { "epoch": 0.03903550949175482, "grad_norm": 2.9044511318206787, "learning_rate": 1.9770899024733235e-05, "loss": 0.3378, "step": 548 }, { "epoch": 0.03910674217330912, "grad_norm": 3.9551167488098145, "learning_rate": 1.9770062292398062e-05, "loss": 0.7422, "step": 549 }, { "epoch": 0.039177974854863414, "grad_norm": 2.4672815799713135, "learning_rate": 1.9769224052642887e-05, "loss": 0.5462, "step": 550 }, { "epoch": 0.03924920753641771, "grad_norm": 2.6043331623077393, "learning_rate": 1.9768384305597048e-05, "loss": 0.2815, "step": 551 }, { "epoch": 0.03932044021797201, "grad_norm": 2.9873156547546387, "learning_rate": 1.9767543051390103e-05, "loss": 0.61, "step": 552 }, { "epoch": 0.0393916728995263, "grad_norm": 3.025320053100586, "learning_rate": 1.9766700290151853e-05, "loss": 0.6424, "step": 553 }, { "epoch": 0.0394629055810806, "grad_norm": 3.0427794456481934, "learning_rate": 1.9765856022012326e-05, "loss": 0.6614, "step": 554 }, { "epoch": 0.039534138262634896, "grad_norm": 3.903101921081543, "learning_rate": 1.9765010247101783e-05, "loss": 0.2075, "step": 555 }, { "epoch": 0.03960537094418919, "grad_norm": 2.8419432640075684, "learning_rate": 1.9764162965550718e-05, "loss": 0.4122, "step": 556 }, { "epoch": 0.03967660362574349, "grad_norm": 3.7559120655059814, "learning_rate": 1.9763314177489858e-05, "loss": 0.5601, "step": 557 }, { "epoch": 0.039747836307297786, "grad_norm": 1.9838306903839111, "learning_rate": 1.9762463883050165e-05, "loss": 0.335, "step": 558 }, { "epoch": 0.03981906898885208, "grad_norm": 3.811890125274658, "learning_rate": 1.9761612082362828e-05, "loss": 0.2896, "step": 559 }, { "epoch": 0.03989030167040638, "grad_norm": 7.4740495681762695, "learning_rate": 1.9760758775559275e-05, "loss": 0.5578, "step": 560 }, { "epoch": 0.03996153435196068, "grad_norm": 4.532093524932861, "learning_rate": 1.9759903962771155e-05, "loss": 0.5283, "step": 561 }, { "epoch": 0.04003276703351498, "grad_norm": 2.34786057472229, "learning_rate": 1.9759047644130362e-05, "loss": 0.3725, "step": 562 }, { "epoch": 0.040103999715069276, "grad_norm": 2.9880237579345703, "learning_rate": 1.9758189819769017e-05, "loss": 0.2826, "step": 563 }, { "epoch": 0.04017523239662357, "grad_norm": 2.4334168434143066, "learning_rate": 1.9757330489819472e-05, "loss": 0.5072, "step": 564 }, { "epoch": 0.04024646507817787, "grad_norm": 4.638722896575928, "learning_rate": 1.9756469654414316e-05, "loss": 0.695, "step": 565 }, { "epoch": 0.040317697759732166, "grad_norm": 3.784250020980835, "learning_rate": 1.9755607313686363e-05, "loss": 0.3902, "step": 566 }, { "epoch": 0.04038893044128646, "grad_norm": 3.7105889320373535, "learning_rate": 1.9754743467768663e-05, "loss": 0.7297, "step": 567 }, { "epoch": 0.04046016312284076, "grad_norm": 5.255430221557617, "learning_rate": 1.9753878116794504e-05, "loss": 0.6333, "step": 568 }, { "epoch": 0.040531395804395055, "grad_norm": 9.862104415893555, "learning_rate": 1.9753011260897392e-05, "loss": 0.4964, "step": 569 }, { "epoch": 0.04060262848594935, "grad_norm": 2.701237678527832, "learning_rate": 1.9752142900211084e-05, "loss": 0.2994, "step": 570 }, { "epoch": 0.04067386116750365, "grad_norm": 3.6564035415649414, "learning_rate": 1.9751273034869552e-05, "loss": 0.8359, "step": 571 }, { "epoch": 0.040745093849057945, "grad_norm": 3.7808890342712402, "learning_rate": 1.975040166500701e-05, "loss": 0.2096, "step": 572 }, { "epoch": 0.04081632653061224, "grad_norm": 3.7421276569366455, "learning_rate": 1.97495287907579e-05, "loss": 0.6764, "step": 573 }, { "epoch": 0.040887559212166545, "grad_norm": 1.4586986303329468, "learning_rate": 1.97486544122569e-05, "loss": 0.259, "step": 574 }, { "epoch": 0.04095879189372084, "grad_norm": 4.744945049285889, "learning_rate": 1.974777852963891e-05, "loss": 0.9023, "step": 575 }, { "epoch": 0.04103002457527514, "grad_norm": 3.35522198677063, "learning_rate": 1.9746901143039082e-05, "loss": 0.7199, "step": 576 }, { "epoch": 0.041101257256829435, "grad_norm": 4.3001484870910645, "learning_rate": 1.974602225259278e-05, "loss": 0.5278, "step": 577 }, { "epoch": 0.04117248993838373, "grad_norm": 3.188480854034424, "learning_rate": 1.9745141858435607e-05, "loss": 0.3164, "step": 578 }, { "epoch": 0.04124372261993803, "grad_norm": 6.762843132019043, "learning_rate": 1.9744259960703405e-05, "loss": 0.3899, "step": 579 }, { "epoch": 0.041314955301492325, "grad_norm": 2.6982197761535645, "learning_rate": 1.9743376559532234e-05, "loss": 0.1936, "step": 580 }, { "epoch": 0.04138618798304662, "grad_norm": 3.9284732341766357, "learning_rate": 1.9742491655058396e-05, "loss": 0.395, "step": 581 }, { "epoch": 0.04145742066460092, "grad_norm": 3.914907693862915, "learning_rate": 1.974160524741843e-05, "loss": 0.4399, "step": 582 }, { "epoch": 0.041528653346155214, "grad_norm": 3.5649991035461426, "learning_rate": 1.974071733674909e-05, "loss": 0.4275, "step": 583 }, { "epoch": 0.04159988602770951, "grad_norm": 2.470170497894287, "learning_rate": 1.973982792318737e-05, "loss": 0.4447, "step": 584 }, { "epoch": 0.04167111870926381, "grad_norm": 4.412296295166016, "learning_rate": 1.9738937006870507e-05, "loss": 0.4149, "step": 585 }, { "epoch": 0.041742351390818104, "grad_norm": 3.7653095722198486, "learning_rate": 1.9738044587935957e-05, "loss": 0.7723, "step": 586 }, { "epoch": 0.04181358407237241, "grad_norm": 11.34985637664795, "learning_rate": 1.9737150666521408e-05, "loss": 0.4874, "step": 587 }, { "epoch": 0.041884816753926704, "grad_norm": 2.528203010559082, "learning_rate": 1.9736255242764782e-05, "loss": 0.5502, "step": 588 }, { "epoch": 0.041956049435481, "grad_norm": 2.11552095413208, "learning_rate": 1.973535831680424e-05, "loss": 0.3568, "step": 589 }, { "epoch": 0.0420272821170353, "grad_norm": 3.562774181365967, "learning_rate": 1.973445988877816e-05, "loss": 0.6064, "step": 590 }, { "epoch": 0.042098514798589594, "grad_norm": 3.055729389190674, "learning_rate": 1.9733559958825167e-05, "loss": 0.4306, "step": 591 }, { "epoch": 0.04216974748014389, "grad_norm": 6.165656566619873, "learning_rate": 1.973265852708411e-05, "loss": 0.735, "step": 592 }, { "epoch": 0.04224098016169819, "grad_norm": 3.4377732276916504, "learning_rate": 1.973175559369407e-05, "loss": 0.4218, "step": 593 }, { "epoch": 0.042312212843252484, "grad_norm": 2.571392059326172, "learning_rate": 1.9730851158794358e-05, "loss": 0.6182, "step": 594 }, { "epoch": 0.04238344552480678, "grad_norm": 3.241692304611206, "learning_rate": 1.972994522252452e-05, "loss": 0.6793, "step": 595 }, { "epoch": 0.04245467820636108, "grad_norm": 3.828352928161621, "learning_rate": 1.9729037785024333e-05, "loss": 0.7466, "step": 596 }, { "epoch": 0.04252591088791537, "grad_norm": 3.170468330383301, "learning_rate": 1.972812884643381e-05, "loss": 0.4454, "step": 597 }, { "epoch": 0.04259714356946967, "grad_norm": 2.5584192276000977, "learning_rate": 1.9727218406893177e-05, "loss": 0.3697, "step": 598 }, { "epoch": 0.042668376251023966, "grad_norm": 4.208374500274658, "learning_rate": 1.9726306466542923e-05, "loss": 0.6066, "step": 599 }, { "epoch": 0.04273960893257827, "grad_norm": 6.846272945404053, "learning_rate": 1.972539302552374e-05, "loss": 0.5069, "step": 600 }, { "epoch": 0.042810841614132567, "grad_norm": 2.5662922859191895, "learning_rate": 1.9724478083976565e-05, "loss": 0.5047, "step": 601 }, { "epoch": 0.04288207429568686, "grad_norm": 5.7531352043151855, "learning_rate": 1.9723561642042563e-05, "loss": 0.4935, "step": 602 }, { "epoch": 0.04295330697724116, "grad_norm": 3.844449281692505, "learning_rate": 1.9722643699863135e-05, "loss": 0.5006, "step": 603 }, { "epoch": 0.043024539658795456, "grad_norm": 3.518206834793091, "learning_rate": 1.9721724257579907e-05, "loss": 0.677, "step": 604 }, { "epoch": 0.04309577234034975, "grad_norm": 3.3871872425079346, "learning_rate": 1.972080331533474e-05, "loss": 0.2438, "step": 605 }, { "epoch": 0.04316700502190405, "grad_norm": 2.9402289390563965, "learning_rate": 1.971988087326973e-05, "loss": 0.6094, "step": 606 }, { "epoch": 0.043238237703458346, "grad_norm": 4.085641860961914, "learning_rate": 1.9718956931527193e-05, "loss": 0.4028, "step": 607 }, { "epoch": 0.04330947038501264, "grad_norm": 2.953064203262329, "learning_rate": 1.9718031490249688e-05, "loss": 0.386, "step": 608 }, { "epoch": 0.04338070306656694, "grad_norm": 3.07285737991333, "learning_rate": 1.9717104549580003e-05, "loss": 0.1221, "step": 609 }, { "epoch": 0.043451935748121236, "grad_norm": 2.894601821899414, "learning_rate": 1.9716176109661148e-05, "loss": 0.2196, "step": 610 }, { "epoch": 0.04352316842967553, "grad_norm": 8.048035621643066, "learning_rate": 1.9715246170636383e-05, "loss": 0.3574, "step": 611 }, { "epoch": 0.04359440111122983, "grad_norm": 3.5610713958740234, "learning_rate": 1.9714314732649174e-05, "loss": 0.683, "step": 612 }, { "epoch": 0.04366563379278413, "grad_norm": 4.722747325897217, "learning_rate": 1.9713381795843244e-05, "loss": 0.6357, "step": 613 }, { "epoch": 0.04373686647433843, "grad_norm": 5.193978786468506, "learning_rate": 1.9712447360362534e-05, "loss": 0.7244, "step": 614 }, { "epoch": 0.043808099155892725, "grad_norm": 2.624063491821289, "learning_rate": 1.971151142635121e-05, "loss": 0.5218, "step": 615 }, { "epoch": 0.04387933183744702, "grad_norm": 2.4951136112213135, "learning_rate": 1.9710573993953685e-05, "loss": 0.3093, "step": 616 }, { "epoch": 0.04395056451900132, "grad_norm": 1.8164963722229004, "learning_rate": 1.9709635063314592e-05, "loss": 0.1859, "step": 617 }, { "epoch": 0.044021797200555615, "grad_norm": 3.6309525966644287, "learning_rate": 1.97086946345788e-05, "loss": 0.5027, "step": 618 }, { "epoch": 0.04409302988210991, "grad_norm": 4.412733554840088, "learning_rate": 1.9707752707891404e-05, "loss": 0.1126, "step": 619 }, { "epoch": 0.04416426256366421, "grad_norm": 2.800950288772583, "learning_rate": 1.9706809283397733e-05, "loss": 0.2611, "step": 620 }, { "epoch": 0.044235495245218505, "grad_norm": 3.6768131256103516, "learning_rate": 1.9705864361243355e-05, "loss": 0.3747, "step": 621 }, { "epoch": 0.0443067279267728, "grad_norm": 2.151582956314087, "learning_rate": 1.9704917941574053e-05, "loss": 0.1509, "step": 622 }, { "epoch": 0.0443779606083271, "grad_norm": 2.4861643314361572, "learning_rate": 1.9703970024535855e-05, "loss": 0.2655, "step": 623 }, { "epoch": 0.044449193289881395, "grad_norm": 2.9016404151916504, "learning_rate": 1.970302061027502e-05, "loss": 0.3317, "step": 624 }, { "epoch": 0.0445204259714357, "grad_norm": 4.547147750854492, "learning_rate": 1.970206969893802e-05, "loss": 0.3484, "step": 625 }, { "epoch": 0.044591658652989995, "grad_norm": 3.466859817504883, "learning_rate": 1.970111729067158e-05, "loss": 0.4488, "step": 626 }, { "epoch": 0.04466289133454429, "grad_norm": 2.418379545211792, "learning_rate": 1.9700163385622642e-05, "loss": 0.4227, "step": 627 }, { "epoch": 0.04473412401609859, "grad_norm": 3.1922802925109863, "learning_rate": 1.969920798393839e-05, "loss": 0.6461, "step": 628 }, { "epoch": 0.044805356697652884, "grad_norm": 3.266716957092285, "learning_rate": 1.9698251085766226e-05, "loss": 0.6351, "step": 629 }, { "epoch": 0.04487658937920718, "grad_norm": 3.863145112991333, "learning_rate": 1.969729269125379e-05, "loss": 0.6182, "step": 630 }, { "epoch": 0.04494782206076148, "grad_norm": 2.543768882751465, "learning_rate": 1.969633280054896e-05, "loss": 0.3958, "step": 631 }, { "epoch": 0.045019054742315774, "grad_norm": 3.4180502891540527, "learning_rate": 1.9695371413799825e-05, "loss": 0.4671, "step": 632 }, { "epoch": 0.04509028742387007, "grad_norm": 3.895514965057373, "learning_rate": 1.9694408531154728e-05, "loss": 0.3241, "step": 633 }, { "epoch": 0.04516152010542437, "grad_norm": 2.4915931224823, "learning_rate": 1.969344415276223e-05, "loss": 0.318, "step": 634 }, { "epoch": 0.045232752786978664, "grad_norm": 2.476111888885498, "learning_rate": 1.9692478278771118e-05, "loss": 0.1224, "step": 635 }, { "epoch": 0.04530398546853296, "grad_norm": 3.023810863494873, "learning_rate": 1.969151090933042e-05, "loss": 0.469, "step": 636 }, { "epoch": 0.04537521815008726, "grad_norm": 2.1595218181610107, "learning_rate": 1.9690542044589395e-05, "loss": 0.3035, "step": 637 }, { "epoch": 0.04544645083164156, "grad_norm": 3.5039381980895996, "learning_rate": 1.9689571684697527e-05, "loss": 0.6026, "step": 638 }, { "epoch": 0.04551768351319586, "grad_norm": 3.2699499130249023, "learning_rate": 1.9688599829804528e-05, "loss": 0.7294, "step": 639 }, { "epoch": 0.045588916194750154, "grad_norm": 3.0831241607666016, "learning_rate": 1.968762648006035e-05, "loss": 0.5121, "step": 640 }, { "epoch": 0.04566014887630445, "grad_norm": 4.127689361572266, "learning_rate": 1.9686651635615172e-05, "loss": 0.6973, "step": 641 }, { "epoch": 0.04573138155785875, "grad_norm": 2.711080551147461, "learning_rate": 1.9685675296619397e-05, "loss": 0.4911, "step": 642 }, { "epoch": 0.04580261423941304, "grad_norm": 2.264641284942627, "learning_rate": 1.9684697463223664e-05, "loss": 0.2368, "step": 643 }, { "epoch": 0.04587384692096734, "grad_norm": 2.8379459381103516, "learning_rate": 1.968371813557885e-05, "loss": 0.5638, "step": 644 }, { "epoch": 0.045945079602521637, "grad_norm": 5.159214496612549, "learning_rate": 1.968273731383605e-05, "loss": 0.591, "step": 645 }, { "epoch": 0.04601631228407593, "grad_norm": 5.153579235076904, "learning_rate": 1.9681754998146592e-05, "loss": 0.4388, "step": 646 }, { "epoch": 0.04608754496563023, "grad_norm": 2.9897189140319824, "learning_rate": 1.9680771188662044e-05, "loss": 0.6744, "step": 647 }, { "epoch": 0.046158777647184526, "grad_norm": 2.2696375846862793, "learning_rate": 1.9679785885534196e-05, "loss": 0.3616, "step": 648 }, { "epoch": 0.04623001032873882, "grad_norm": 2.2969424724578857, "learning_rate": 1.9678799088915064e-05, "loss": 0.3884, "step": 649 }, { "epoch": 0.04630124301029312, "grad_norm": 2.416337728500366, "learning_rate": 1.9677810798956906e-05, "loss": 0.4618, "step": 650 }, { "epoch": 0.04637247569184742, "grad_norm": 3.7221078872680664, "learning_rate": 1.9676821015812203e-05, "loss": 0.4945, "step": 651 }, { "epoch": 0.04644370837340172, "grad_norm": 3.5222132205963135, "learning_rate": 1.967582973963367e-05, "loss": 0.5586, "step": 652 }, { "epoch": 0.046514941054956016, "grad_norm": 4.262016773223877, "learning_rate": 1.9674836970574253e-05, "loss": 0.7344, "step": 653 }, { "epoch": 0.04658617373651031, "grad_norm": 2.7827541828155518, "learning_rate": 1.967384270878712e-05, "loss": 0.3116, "step": 654 }, { "epoch": 0.04665740641806461, "grad_norm": 1.883518099784851, "learning_rate": 1.967284695442568e-05, "loss": 0.3448, "step": 655 }, { "epoch": 0.046728639099618906, "grad_norm": 5.83323335647583, "learning_rate": 1.9671849707643567e-05, "loss": 0.4592, "step": 656 }, { "epoch": 0.0467998717811732, "grad_norm": 4.215867042541504, "learning_rate": 1.9670850968594642e-05, "loss": 0.5053, "step": 657 }, { "epoch": 0.0468711044627275, "grad_norm": 5.599697113037109, "learning_rate": 1.9669850737433002e-05, "loss": 0.493, "step": 658 }, { "epoch": 0.046942337144281795, "grad_norm": 5.557825088500977, "learning_rate": 1.9668849014312978e-05, "loss": 0.8698, "step": 659 }, { "epoch": 0.04701356982583609, "grad_norm": 3.8092095851898193, "learning_rate": 1.9667845799389117e-05, "loss": 0.7449, "step": 660 }, { "epoch": 0.04708480250739039, "grad_norm": 2.448709011077881, "learning_rate": 1.9666841092816212e-05, "loss": 0.4192, "step": 661 }, { "epoch": 0.047156035188944685, "grad_norm": 4.7531657218933105, "learning_rate": 1.9665834894749275e-05, "loss": 0.3395, "step": 662 }, { "epoch": 0.04722726787049898, "grad_norm": 1.7277717590332031, "learning_rate": 1.966482720534355e-05, "loss": 0.1847, "step": 663 }, { "epoch": 0.047298500552053285, "grad_norm": 3.3622448444366455, "learning_rate": 1.9663818024754516e-05, "loss": 0.7535, "step": 664 }, { "epoch": 0.04736973323360758, "grad_norm": 8.348213195800781, "learning_rate": 1.966280735313788e-05, "loss": 0.3164, "step": 665 }, { "epoch": 0.04744096591516188, "grad_norm": 2.6332805156707764, "learning_rate": 1.9661795190649578e-05, "loss": 0.1625, "step": 666 }, { "epoch": 0.047512198596716175, "grad_norm": 3.576711893081665, "learning_rate": 1.9660781537445774e-05, "loss": 0.2097, "step": 667 }, { "epoch": 0.04758343127827047, "grad_norm": 3.42527437210083, "learning_rate": 1.9659766393682867e-05, "loss": 0.5658, "step": 668 }, { "epoch": 0.04765466395982477, "grad_norm": 3.6504030227661133, "learning_rate": 1.965874975951748e-05, "loss": 0.7923, "step": 669 }, { "epoch": 0.047725896641379065, "grad_norm": 2.625375747680664, "learning_rate": 1.965773163510647e-05, "loss": 0.5583, "step": 670 }, { "epoch": 0.04779712932293336, "grad_norm": 4.029907703399658, "learning_rate": 1.9656712020606926e-05, "loss": 0.5618, "step": 671 }, { "epoch": 0.04786836200448766, "grad_norm": 3.8212485313415527, "learning_rate": 1.9655690916176164e-05, "loss": 0.5957, "step": 672 }, { "epoch": 0.047939594686041954, "grad_norm": 3.9882125854492188, "learning_rate": 1.9654668321971724e-05, "loss": 0.7506, "step": 673 }, { "epoch": 0.04801082736759625, "grad_norm": 2.1525790691375732, "learning_rate": 1.965364423815139e-05, "loss": 0.3668, "step": 674 }, { "epoch": 0.04808206004915055, "grad_norm": 3.360203742980957, "learning_rate": 1.965261866487316e-05, "loss": 0.9293, "step": 675 }, { "epoch": 0.048153292730704844, "grad_norm": 2.9344890117645264, "learning_rate": 1.9651591602295275e-05, "loss": 0.8029, "step": 676 }, { "epoch": 0.04822452541225915, "grad_norm": 2.689601182937622, "learning_rate": 1.9650563050576195e-05, "loss": 0.5679, "step": 677 }, { "epoch": 0.048295758093813444, "grad_norm": 2.74796199798584, "learning_rate": 1.964953300987462e-05, "loss": 0.5058, "step": 678 }, { "epoch": 0.04836699077536774, "grad_norm": 2.499582052230835, "learning_rate": 1.9648501480349473e-05, "loss": 0.3366, "step": 679 }, { "epoch": 0.04843822345692204, "grad_norm": 4.441071033477783, "learning_rate": 1.9647468462159906e-05, "loss": 0.8649, "step": 680 }, { "epoch": 0.048509456138476334, "grad_norm": 2.951131582260132, "learning_rate": 1.9646433955465307e-05, "loss": 0.3679, "step": 681 }, { "epoch": 0.04858068882003063, "grad_norm": 3.3845865726470947, "learning_rate": 1.9645397960425287e-05, "loss": 0.5774, "step": 682 }, { "epoch": 0.04865192150158493, "grad_norm": 2.5345308780670166, "learning_rate": 1.964436047719969e-05, "loss": 0.232, "step": 683 }, { "epoch": 0.048723154183139224, "grad_norm": 3.160355567932129, "learning_rate": 1.9643321505948588e-05, "loss": 0.6637, "step": 684 }, { "epoch": 0.04879438686469352, "grad_norm": 2.5212619304656982, "learning_rate": 1.9642281046832287e-05, "loss": 0.5233, "step": 685 }, { "epoch": 0.04886561954624782, "grad_norm": 2.9158847332000732, "learning_rate": 1.9641239100011312e-05, "loss": 0.4121, "step": 686 }, { "epoch": 0.04893685222780211, "grad_norm": 3.702652931213379, "learning_rate": 1.9640195665646434e-05, "loss": 0.9233, "step": 687 }, { "epoch": 0.04900808490935641, "grad_norm": 9.71773624420166, "learning_rate": 1.963915074389864e-05, "loss": 0.4396, "step": 688 }, { "epoch": 0.04907931759091071, "grad_norm": 2.7109053134918213, "learning_rate": 1.9638104334929145e-05, "loss": 0.5363, "step": 689 }, { "epoch": 0.04915055027246501, "grad_norm": 3.2948431968688965, "learning_rate": 1.963705643889941e-05, "loss": 0.6611, "step": 690 }, { "epoch": 0.04922178295401931, "grad_norm": 5.145808219909668, "learning_rate": 1.9636007055971106e-05, "loss": 1.0605, "step": 691 }, { "epoch": 0.0492930156355736, "grad_norm": 2.4350037574768066, "learning_rate": 1.9634956186306147e-05, "loss": 0.3858, "step": 692 }, { "epoch": 0.0493642483171279, "grad_norm": 3.258962869644165, "learning_rate": 1.963390383006667e-05, "loss": 0.7877, "step": 693 }, { "epoch": 0.049435480998682196, "grad_norm": 3.736661911010742, "learning_rate": 1.9632849987415038e-05, "loss": 0.9852, "step": 694 }, { "epoch": 0.04950671368023649, "grad_norm": 3.351581573486328, "learning_rate": 1.9631794658513853e-05, "loss": 0.7553, "step": 695 }, { "epoch": 0.04957794636179079, "grad_norm": 3.1564276218414307, "learning_rate": 1.9630737843525946e-05, "loss": 0.6975, "step": 696 }, { "epoch": 0.049649179043345086, "grad_norm": 3.04974627494812, "learning_rate": 1.9629679542614363e-05, "loss": 0.7639, "step": 697 }, { "epoch": 0.04972041172489938, "grad_norm": 2.0711300373077393, "learning_rate": 1.962861975594239e-05, "loss": 0.3942, "step": 698 }, { "epoch": 0.04979164440645368, "grad_norm": 3.2386202812194824, "learning_rate": 1.9627558483673546e-05, "loss": 0.4048, "step": 699 }, { "epoch": 0.049862877088007976, "grad_norm": 3.083845376968384, "learning_rate": 1.962649572597158e-05, "loss": 0.6106, "step": 700 }, { "epoch": 0.04993410976956227, "grad_norm": 3.2503888607025146, "learning_rate": 1.9625431483000448e-05, "loss": 0.2317, "step": 701 }, { "epoch": 0.05000534245111657, "grad_norm": 5.153375148773193, "learning_rate": 1.9624365754924364e-05, "loss": 0.7647, "step": 702 }, { "epoch": 0.05007657513267087, "grad_norm": 2.0833144187927246, "learning_rate": 1.9623298541907756e-05, "loss": 0.3433, "step": 703 }, { "epoch": 0.05014780781422517, "grad_norm": 2.143014669418335, "learning_rate": 1.9622229844115284e-05, "loss": 0.4132, "step": 704 }, { "epoch": 0.050219040495779466, "grad_norm": 3.628122091293335, "learning_rate": 1.9621159661711834e-05, "loss": 0.3829, "step": 705 }, { "epoch": 0.05029027317733376, "grad_norm": 2.7936296463012695, "learning_rate": 1.9620087994862534e-05, "loss": 0.7021, "step": 706 }, { "epoch": 0.05036150585888806, "grad_norm": 3.0783021450042725, "learning_rate": 1.961901484373272e-05, "loss": 0.4454, "step": 707 }, { "epoch": 0.050432738540442355, "grad_norm": 4.674768447875977, "learning_rate": 1.9617940208487968e-05, "loss": 0.7702, "step": 708 }, { "epoch": 0.05050397122199665, "grad_norm": 2.284507989883423, "learning_rate": 1.9616864089294095e-05, "loss": 0.367, "step": 709 }, { "epoch": 0.05057520390355095, "grad_norm": 3.9994325637817383, "learning_rate": 1.9615786486317124e-05, "loss": 0.5782, "step": 710 }, { "epoch": 0.050646436585105245, "grad_norm": 4.076273441314697, "learning_rate": 1.9614707399723318e-05, "loss": 0.4708, "step": 711 }, { "epoch": 0.05071766926665954, "grad_norm": 2.9620423316955566, "learning_rate": 1.9613626829679176e-05, "loss": 0.8369, "step": 712 }, { "epoch": 0.05078890194821384, "grad_norm": 2.6654770374298096, "learning_rate": 1.9612544776351415e-05, "loss": 0.4369, "step": 713 }, { "epoch": 0.050860134629768135, "grad_norm": 4.5585036277771, "learning_rate": 1.961146123990699e-05, "loss": 0.8028, "step": 714 }, { "epoch": 0.05093136731132243, "grad_norm": 3.9226858615875244, "learning_rate": 1.9610376220513067e-05, "loss": 0.8421, "step": 715 }, { "epoch": 0.051002599992876735, "grad_norm": 3.668316602706909, "learning_rate": 1.9609289718337067e-05, "loss": 0.728, "step": 716 }, { "epoch": 0.05107383267443103, "grad_norm": 3.022129774093628, "learning_rate": 1.9608201733546615e-05, "loss": 0.4658, "step": 717 }, { "epoch": 0.05114506535598533, "grad_norm": 2.7143025398254395, "learning_rate": 1.9607112266309585e-05, "loss": 0.5181, "step": 718 }, { "epoch": 0.051216298037539625, "grad_norm": 3.4784815311431885, "learning_rate": 1.9606021316794065e-05, "loss": 0.7922, "step": 719 }, { "epoch": 0.05128753071909392, "grad_norm": 3.781162977218628, "learning_rate": 1.9604928885168376e-05, "loss": 0.5373, "step": 720 }, { "epoch": 0.05135876340064822, "grad_norm": 3.5188803672790527, "learning_rate": 1.9603834971601075e-05, "loss": 0.677, "step": 721 }, { "epoch": 0.051429996082202514, "grad_norm": 5.372128486633301, "learning_rate": 1.9602739576260937e-05, "loss": 0.4958, "step": 722 }, { "epoch": 0.05150122876375681, "grad_norm": 1.9344408512115479, "learning_rate": 1.9601642699316968e-05, "loss": 0.2623, "step": 723 }, { "epoch": 0.05157246144531111, "grad_norm": 4.017388820648193, "learning_rate": 1.9600544340938415e-05, "loss": 0.467, "step": 724 }, { "epoch": 0.051643694126865404, "grad_norm": 2.525991439819336, "learning_rate": 1.9599444501294733e-05, "loss": 0.294, "step": 725 }, { "epoch": 0.0517149268084197, "grad_norm": 3.35978102684021, "learning_rate": 1.959834318055562e-05, "loss": 0.7382, "step": 726 }, { "epoch": 0.051786159489974, "grad_norm": 4.229371070861816, "learning_rate": 1.9597240378891e-05, "loss": 0.6591, "step": 727 }, { "epoch": 0.051857392171528294, "grad_norm": 2.824657440185547, "learning_rate": 1.959613609647102e-05, "loss": 0.323, "step": 728 }, { "epoch": 0.0519286248530826, "grad_norm": 2.903749465942383, "learning_rate": 1.959503033346606e-05, "loss": 0.5353, "step": 729 }, { "epoch": 0.051999857534636894, "grad_norm": 4.323469638824463, "learning_rate": 1.959392309004673e-05, "loss": 0.8269, "step": 730 }, { "epoch": 0.05207109021619119, "grad_norm": 4.274149417877197, "learning_rate": 1.959281436638387e-05, "loss": 0.7935, "step": 731 }, { "epoch": 0.05214232289774549, "grad_norm": 2.3957486152648926, "learning_rate": 1.9591704162648532e-05, "loss": 0.1772, "step": 732 }, { "epoch": 0.052213555579299784, "grad_norm": 2.1578848361968994, "learning_rate": 1.9590592479012022e-05, "loss": 0.3078, "step": 733 }, { "epoch": 0.05228478826085408, "grad_norm": 3.58215069770813, "learning_rate": 1.9589479315645857e-05, "loss": 0.8095, "step": 734 }, { "epoch": 0.05235602094240838, "grad_norm": 3.0139880180358887, "learning_rate": 1.9588364672721785e-05, "loss": 0.3513, "step": 735 }, { "epoch": 0.05242725362396267, "grad_norm": 3.780531883239746, "learning_rate": 1.9587248550411786e-05, "loss": 1.0567, "step": 736 }, { "epoch": 0.05249848630551697, "grad_norm": 3.637742280960083, "learning_rate": 1.9586130948888064e-05, "loss": 0.5477, "step": 737 }, { "epoch": 0.052569718987071266, "grad_norm": 3.083749771118164, "learning_rate": 1.9585011868323052e-05, "loss": 0.4829, "step": 738 }, { "epoch": 0.05264095166862556, "grad_norm": 4.86381721496582, "learning_rate": 1.958389130888942e-05, "loss": 0.5367, "step": 739 }, { "epoch": 0.05271218435017986, "grad_norm": 3.5406880378723145, "learning_rate": 1.9582769270760055e-05, "loss": 0.5167, "step": 740 }, { "epoch": 0.05278341703173416, "grad_norm": 5.6799092292785645, "learning_rate": 1.958164575410807e-05, "loss": 0.4018, "step": 741 }, { "epoch": 0.05285464971328846, "grad_norm": 4.969606876373291, "learning_rate": 1.958052075910682e-05, "loss": 0.6634, "step": 742 }, { "epoch": 0.052925882394842756, "grad_norm": 3.3145556449890137, "learning_rate": 1.9579394285929877e-05, "loss": 0.2247, "step": 743 }, { "epoch": 0.05299711507639705, "grad_norm": 3.0792431831359863, "learning_rate": 1.9578266334751045e-05, "loss": 0.5254, "step": 744 }, { "epoch": 0.05306834775795135, "grad_norm": 6.718730449676514, "learning_rate": 1.9577136905744353e-05, "loss": 0.7391, "step": 745 }, { "epoch": 0.053139580439505646, "grad_norm": 3.171712875366211, "learning_rate": 1.957600599908406e-05, "loss": 0.6224, "step": 746 }, { "epoch": 0.05321081312105994, "grad_norm": 4.810603618621826, "learning_rate": 1.9574873614944657e-05, "loss": 0.8168, "step": 747 }, { "epoch": 0.05328204580261424, "grad_norm": 3.955308437347412, "learning_rate": 1.9573739753500857e-05, "loss": 0.4634, "step": 748 }, { "epoch": 0.053353278484168536, "grad_norm": 3.1820271015167236, "learning_rate": 1.9572604414927604e-05, "loss": 0.6705, "step": 749 }, { "epoch": 0.05342451116572283, "grad_norm": 1.2336657047271729, "learning_rate": 1.957146759940007e-05, "loss": 0.0866, "step": 750 }, { "epoch": 0.05349574384727713, "grad_norm": 3.709996223449707, "learning_rate": 1.9570329307093652e-05, "loss": 0.4619, "step": 751 }, { "epoch": 0.053566976528831425, "grad_norm": 3.073500871658325, "learning_rate": 1.9569189538183978e-05, "loss": 0.4764, "step": 752 }, { "epoch": 0.05363820921038572, "grad_norm": 4.220072269439697, "learning_rate": 1.95680482928469e-05, "loss": 0.5384, "step": 753 }, { "epoch": 0.053709441891940025, "grad_norm": 4.168227672576904, "learning_rate": 1.9566905571258502e-05, "loss": 0.7417, "step": 754 }, { "epoch": 0.05378067457349432, "grad_norm": 2.3591794967651367, "learning_rate": 1.9565761373595094e-05, "loss": 0.4194, "step": 755 }, { "epoch": 0.05385190725504862, "grad_norm": 2.116896152496338, "learning_rate": 1.9564615700033215e-05, "loss": 0.1693, "step": 756 }, { "epoch": 0.053923139936602915, "grad_norm": 2.608844518661499, "learning_rate": 1.956346855074963e-05, "loss": 0.326, "step": 757 }, { "epoch": 0.05399437261815721, "grad_norm": 3.023883581161499, "learning_rate": 1.9562319925921333e-05, "loss": 0.4524, "step": 758 }, { "epoch": 0.05406560529971151, "grad_norm": 3.046668529510498, "learning_rate": 1.9561169825725546e-05, "loss": 0.4076, "step": 759 }, { "epoch": 0.054136837981265805, "grad_norm": 2.4938464164733887, "learning_rate": 1.9560018250339712e-05, "loss": 0.4319, "step": 760 }, { "epoch": 0.0542080706628201, "grad_norm": 3.705254316329956, "learning_rate": 1.9558865199941515e-05, "loss": 0.7095, "step": 761 }, { "epoch": 0.0542793033443744, "grad_norm": 2.2004010677337646, "learning_rate": 1.9557710674708853e-05, "loss": 0.2903, "step": 762 }, { "epoch": 0.054350536025928695, "grad_norm": 3.786945343017578, "learning_rate": 1.955655467481986e-05, "loss": 0.5928, "step": 763 }, { "epoch": 0.05442176870748299, "grad_norm": 2.072543144226074, "learning_rate": 1.9555397200452892e-05, "loss": 0.2172, "step": 764 }, { "epoch": 0.05449300138903729, "grad_norm": 2.4661448001861572, "learning_rate": 1.9554238251786538e-05, "loss": 0.5988, "step": 765 }, { "epoch": 0.054564234070591584, "grad_norm": 2.7987358570098877, "learning_rate": 1.9553077828999614e-05, "loss": 0.1899, "step": 766 }, { "epoch": 0.05463546675214589, "grad_norm": 1.7960389852523804, "learning_rate": 1.9551915932271156e-05, "loss": 0.2318, "step": 767 }, { "epoch": 0.054706699433700184, "grad_norm": 2.539961338043213, "learning_rate": 1.9550752561780434e-05, "loss": 0.4645, "step": 768 }, { "epoch": 0.05477793211525448, "grad_norm": 3.718641996383667, "learning_rate": 1.9549587717706952e-05, "loss": 0.7717, "step": 769 }, { "epoch": 0.05484916479680878, "grad_norm": 2.6663475036621094, "learning_rate": 1.9548421400230418e-05, "loss": 0.3833, "step": 770 }, { "epoch": 0.054920397478363074, "grad_norm": 2.125058889389038, "learning_rate": 1.9547253609530797e-05, "loss": 0.202, "step": 771 }, { "epoch": 0.05499163015991737, "grad_norm": 2.203411102294922, "learning_rate": 1.954608434578826e-05, "loss": 0.4122, "step": 772 }, { "epoch": 0.05506286284147167, "grad_norm": 2.3641469478607178, "learning_rate": 1.9544913609183214e-05, "loss": 0.2824, "step": 773 }, { "epoch": 0.055134095523025964, "grad_norm": 3.355301856994629, "learning_rate": 1.9543741399896295e-05, "loss": 0.7828, "step": 774 }, { "epoch": 0.05520532820458026, "grad_norm": 3.029536008834839, "learning_rate": 1.9542567718108357e-05, "loss": 0.6011, "step": 775 }, { "epoch": 0.05527656088613456, "grad_norm": 3.8782973289489746, "learning_rate": 1.954139256400049e-05, "loss": 0.9555, "step": 776 }, { "epoch": 0.055347793567688854, "grad_norm": 6.109956741333008, "learning_rate": 1.954021593775401e-05, "loss": 0.6109, "step": 777 }, { "epoch": 0.05541902624924315, "grad_norm": 2.6631014347076416, "learning_rate": 1.953903783955045e-05, "loss": 0.5739, "step": 778 }, { "epoch": 0.05549025893079745, "grad_norm": 4.075719356536865, "learning_rate": 1.953785826957159e-05, "loss": 0.4995, "step": 779 }, { "epoch": 0.05556149161235175, "grad_norm": 2.696099042892456, "learning_rate": 1.9536677227999415e-05, "loss": 0.5803, "step": 780 }, { "epoch": 0.05563272429390605, "grad_norm": 2.2666208744049072, "learning_rate": 1.953549471501616e-05, "loss": 0.6199, "step": 781 }, { "epoch": 0.05570395697546034, "grad_norm": 2.892554759979248, "learning_rate": 1.953431073080426e-05, "loss": 0.5553, "step": 782 }, { "epoch": 0.05577518965701464, "grad_norm": 3.8156349658966064, "learning_rate": 1.95331252755464e-05, "loss": 0.6822, "step": 783 }, { "epoch": 0.055846422338568937, "grad_norm": 4.0235915184021, "learning_rate": 1.9531938349425484e-05, "loss": 0.5421, "step": 784 }, { "epoch": 0.05591765502012323, "grad_norm": 3.347797155380249, "learning_rate": 1.953074995262464e-05, "loss": 0.6595, "step": 785 }, { "epoch": 0.05598888770167753, "grad_norm": 3.4325382709503174, "learning_rate": 1.9529560085327227e-05, "loss": 0.7366, "step": 786 }, { "epoch": 0.056060120383231826, "grad_norm": 2.6278388500213623, "learning_rate": 1.9528368747716827e-05, "loss": 0.3918, "step": 787 }, { "epoch": 0.05613135306478612, "grad_norm": 2.8977432250976562, "learning_rate": 1.9527175939977252e-05, "loss": 0.4562, "step": 788 }, { "epoch": 0.05620258574634042, "grad_norm": 4.216031074523926, "learning_rate": 1.952598166229254e-05, "loss": 0.2843, "step": 789 }, { "epoch": 0.056273818427894716, "grad_norm": 2.4589107036590576, "learning_rate": 1.9524785914846956e-05, "loss": 0.3812, "step": 790 }, { "epoch": 0.05634505110944901, "grad_norm": 1.9703328609466553, "learning_rate": 1.9523588697824995e-05, "loss": 0.2097, "step": 791 }, { "epoch": 0.05641628379100331, "grad_norm": 3.9046812057495117, "learning_rate": 1.952239001141137e-05, "loss": 0.4377, "step": 792 }, { "epoch": 0.05648751647255761, "grad_norm": 4.004209995269775, "learning_rate": 1.9521189855791026e-05, "loss": 0.7446, "step": 793 }, { "epoch": 0.05655874915411191, "grad_norm": 4.179111957550049, "learning_rate": 1.9519988231149142e-05, "loss": 0.4088, "step": 794 }, { "epoch": 0.056629981835666206, "grad_norm": 3.555159330368042, "learning_rate": 1.9518785137671107e-05, "loss": 0.5192, "step": 795 }, { "epoch": 0.0567012145172205, "grad_norm": 3.542478084564209, "learning_rate": 1.9517580575542546e-05, "loss": 0.4865, "step": 796 }, { "epoch": 0.0567724471987748, "grad_norm": 4.112839698791504, "learning_rate": 1.951637454494932e-05, "loss": 0.4529, "step": 797 }, { "epoch": 0.056843679880329095, "grad_norm": 3.5209529399871826, "learning_rate": 1.95151670460775e-05, "loss": 0.58, "step": 798 }, { "epoch": 0.05691491256188339, "grad_norm": 3.339796543121338, "learning_rate": 1.951395807911339e-05, "loss": 0.2818, "step": 799 }, { "epoch": 0.05698614524343769, "grad_norm": 2.8356692790985107, "learning_rate": 1.9512747644243525e-05, "loss": 0.64, "step": 800 }, { "epoch": 0.057057377924991985, "grad_norm": 2.503645896911621, "learning_rate": 1.9511535741654663e-05, "loss": 0.1503, "step": 801 }, { "epoch": 0.05712861060654628, "grad_norm": 4.216592311859131, "learning_rate": 1.9510322371533783e-05, "loss": 0.527, "step": 802 }, { "epoch": 0.05719984328810058, "grad_norm": 2.365290880203247, "learning_rate": 1.95091075340681e-05, "loss": 0.5433, "step": 803 }, { "epoch": 0.057271075969654875, "grad_norm": 5.226537704467773, "learning_rate": 1.950789122944505e-05, "loss": 0.5658, "step": 804 }, { "epoch": 0.05734230865120917, "grad_norm": 2.127570390701294, "learning_rate": 1.9506673457852293e-05, "loss": 0.4476, "step": 805 }, { "epoch": 0.057413541332763475, "grad_norm": 2.787292718887329, "learning_rate": 1.9505454219477718e-05, "loss": 0.432, "step": 806 }, { "epoch": 0.05748477401431777, "grad_norm": 2.593315362930298, "learning_rate": 1.950423351450945e-05, "loss": 0.3502, "step": 807 }, { "epoch": 0.05755600669587207, "grad_norm": 2.0728485584259033, "learning_rate": 1.9503011343135828e-05, "loss": 0.4157, "step": 808 }, { "epoch": 0.057627239377426365, "grad_norm": 6.8972344398498535, "learning_rate": 1.9501787705545412e-05, "loss": 1.5766, "step": 809 }, { "epoch": 0.05769847205898066, "grad_norm": 5.084022521972656, "learning_rate": 1.9500562601927003e-05, "loss": 0.956, "step": 810 }, { "epoch": 0.05776970474053496, "grad_norm": 2.634355306625366, "learning_rate": 1.9499336032469626e-05, "loss": 0.6266, "step": 811 }, { "epoch": 0.057840937422089254, "grad_norm": 4.010735511779785, "learning_rate": 1.949810799736252e-05, "loss": 0.7027, "step": 812 }, { "epoch": 0.05791217010364355, "grad_norm": 3.0546672344207764, "learning_rate": 1.949687849679516e-05, "loss": 0.6948, "step": 813 }, { "epoch": 0.05798340278519785, "grad_norm": 3.3616859912872314, "learning_rate": 1.949564753095725e-05, "loss": 0.5158, "step": 814 }, { "epoch": 0.058054635466752144, "grad_norm": 2.906944513320923, "learning_rate": 1.949441510003871e-05, "loss": 0.5923, "step": 815 }, { "epoch": 0.05812586814830644, "grad_norm": 4.515849590301514, "learning_rate": 1.9493181204229696e-05, "loss": 0.3543, "step": 816 }, { "epoch": 0.05819710082986074, "grad_norm": 2.4637768268585205, "learning_rate": 1.949194584372058e-05, "loss": 0.1541, "step": 817 }, { "epoch": 0.058268333511415034, "grad_norm": 22.94904136657715, "learning_rate": 1.9490709018701967e-05, "loss": 1.1454, "step": 818 }, { "epoch": 0.05833956619296934, "grad_norm": 3.26338267326355, "learning_rate": 1.9489470729364694e-05, "loss": 0.7482, "step": 819 }, { "epoch": 0.058410798874523634, "grad_norm": 2.6025607585906982, "learning_rate": 1.9488230975899804e-05, "loss": 0.5523, "step": 820 }, { "epoch": 0.05848203155607793, "grad_norm": 3.530848979949951, "learning_rate": 1.948698975849859e-05, "loss": 0.7754, "step": 821 }, { "epoch": 0.05855326423763223, "grad_norm": 3.7359566688537598, "learning_rate": 1.9485747077352547e-05, "loss": 0.8734, "step": 822 }, { "epoch": 0.058624496919186524, "grad_norm": 4.102601528167725, "learning_rate": 1.948450293265342e-05, "loss": 0.4961, "step": 823 }, { "epoch": 0.05869572960074082, "grad_norm": 2.6384918689727783, "learning_rate": 1.948325732459316e-05, "loss": 0.5418, "step": 824 }, { "epoch": 0.05876696228229512, "grad_norm": 4.302755355834961, "learning_rate": 1.948201025336395e-05, "loss": 0.6147, "step": 825 }, { "epoch": 0.05883819496384941, "grad_norm": 3.8845713138580322, "learning_rate": 1.9480761719158208e-05, "loss": 0.7375, "step": 826 }, { "epoch": 0.05890942764540371, "grad_norm": 3.7475664615631104, "learning_rate": 1.9479511722168567e-05, "loss": 0.571, "step": 827 }, { "epoch": 0.05898066032695801, "grad_norm": 2.825944423675537, "learning_rate": 1.947826026258788e-05, "loss": 0.4081, "step": 828 }, { "epoch": 0.0590518930085123, "grad_norm": 2.6456046104431152, "learning_rate": 1.947700734060925e-05, "loss": 0.4112, "step": 829 }, { "epoch": 0.0591231256900666, "grad_norm": 3.650974750518799, "learning_rate": 1.9475752956425978e-05, "loss": 0.6199, "step": 830 }, { "epoch": 0.059194358371620896, "grad_norm": 5.17279052734375, "learning_rate": 1.9474497110231607e-05, "loss": 0.8163, "step": 831 }, { "epoch": 0.0592655910531752, "grad_norm": 5.337392807006836, "learning_rate": 1.94732398022199e-05, "loss": 0.8988, "step": 832 }, { "epoch": 0.059336823734729496, "grad_norm": 2.2086658477783203, "learning_rate": 1.9471981032584846e-05, "loss": 0.399, "step": 833 }, { "epoch": 0.05940805641628379, "grad_norm": 2.250504970550537, "learning_rate": 1.9470720801520665e-05, "loss": 0.3521, "step": 834 }, { "epoch": 0.05947928909783809, "grad_norm": 3.8848578929901123, "learning_rate": 1.946945910922179e-05, "loss": 0.6538, "step": 835 }, { "epoch": 0.059550521779392386, "grad_norm": 3.382078170776367, "learning_rate": 1.9468195955882892e-05, "loss": 0.8204, "step": 836 }, { "epoch": 0.05962175446094668, "grad_norm": 3.5166375637054443, "learning_rate": 1.946693134169886e-05, "loss": 0.7068, "step": 837 }, { "epoch": 0.05969298714250098, "grad_norm": 4.093095302581787, "learning_rate": 1.9465665266864815e-05, "loss": 0.7692, "step": 838 }, { "epoch": 0.059764219824055276, "grad_norm": 4.3034467697143555, "learning_rate": 1.9464397731576093e-05, "loss": 0.3535, "step": 839 }, { "epoch": 0.05983545250560957, "grad_norm": 3.19888973236084, "learning_rate": 1.946312873602827e-05, "loss": 0.5291, "step": 840 }, { "epoch": 0.05990668518716387, "grad_norm": 3.8091659545898438, "learning_rate": 1.9461858280417134e-05, "loss": 0.1358, "step": 841 }, { "epoch": 0.059977917868718165, "grad_norm": 3.646986961364746, "learning_rate": 1.94605863649387e-05, "loss": 0.6094, "step": 842 }, { "epoch": 0.06004915055027246, "grad_norm": 3.616074562072754, "learning_rate": 1.945931298978922e-05, "loss": 0.9265, "step": 843 }, { "epoch": 0.060120383231826766, "grad_norm": 3.8485612869262695, "learning_rate": 1.9458038155165157e-05, "loss": 0.3896, "step": 844 }, { "epoch": 0.06019161591338106, "grad_norm": 4.9456353187561035, "learning_rate": 1.94567618612632e-05, "loss": 0.2582, "step": 845 }, { "epoch": 0.06026284859493536, "grad_norm": 2.491948127746582, "learning_rate": 1.9455484108280277e-05, "loss": 0.2141, "step": 846 }, { "epoch": 0.060334081276489655, "grad_norm": 3.322152853012085, "learning_rate": 1.945420489641353e-05, "loss": 0.6974, "step": 847 }, { "epoch": 0.06040531395804395, "grad_norm": 2.3740427494049072, "learning_rate": 1.945292422586033e-05, "loss": 0.527, "step": 848 }, { "epoch": 0.06047654663959825, "grad_norm": 2.826033115386963, "learning_rate": 1.9451642096818258e-05, "loss": 0.4932, "step": 849 }, { "epoch": 0.060547779321152545, "grad_norm": 3.245887517929077, "learning_rate": 1.9450358509485152e-05, "loss": 0.5961, "step": 850 }, { "epoch": 0.06061901200270684, "grad_norm": 2.7112619876861572, "learning_rate": 1.9449073464059048e-05, "loss": 0.6417, "step": 851 }, { "epoch": 0.06069024468426114, "grad_norm": 1.8960798978805542, "learning_rate": 1.9447786960738212e-05, "loss": 0.149, "step": 852 }, { "epoch": 0.060761477365815435, "grad_norm": 2.810213327407837, "learning_rate": 1.944649899972114e-05, "loss": 0.5052, "step": 853 }, { "epoch": 0.06083271004736973, "grad_norm": 2.6662979125976562, "learning_rate": 1.9445209581206557e-05, "loss": 0.4923, "step": 854 }, { "epoch": 0.06090394272892403, "grad_norm": 2.8522610664367676, "learning_rate": 1.94439187053934e-05, "loss": 0.5229, "step": 855 }, { "epoch": 0.060975175410478324, "grad_norm": 3.399766445159912, "learning_rate": 1.9442626372480838e-05, "loss": 0.3113, "step": 856 }, { "epoch": 0.06104640809203263, "grad_norm": 3.0479283332824707, "learning_rate": 1.944133258266827e-05, "loss": 0.6789, "step": 857 }, { "epoch": 0.061117640773586925, "grad_norm": 4.266031742095947, "learning_rate": 1.944003733615531e-05, "loss": 0.3837, "step": 858 }, { "epoch": 0.06118887345514122, "grad_norm": 3.2331321239471436, "learning_rate": 1.9438740633141804e-05, "loss": 0.3949, "step": 859 }, { "epoch": 0.06126010613669552, "grad_norm": 3.16984224319458, "learning_rate": 1.9437442473827818e-05, "loss": 0.6359, "step": 860 }, { "epoch": 0.061331338818249814, "grad_norm": 2.332456588745117, "learning_rate": 1.9436142858413648e-05, "loss": 0.346, "step": 861 }, { "epoch": 0.06140257149980411, "grad_norm": 3.701124668121338, "learning_rate": 1.9434841787099804e-05, "loss": 0.2317, "step": 862 }, { "epoch": 0.06147380418135841, "grad_norm": 3.7259418964385986, "learning_rate": 1.9433539260087033e-05, "loss": 0.3378, "step": 863 }, { "epoch": 0.061545036862912704, "grad_norm": 2.596930503845215, "learning_rate": 1.9432235277576304e-05, "loss": 0.4233, "step": 864 }, { "epoch": 0.061616269544467, "grad_norm": 3.58185076713562, "learning_rate": 1.9430929839768803e-05, "loss": 0.3833, "step": 865 }, { "epoch": 0.0616875022260213, "grad_norm": 2.7534780502319336, "learning_rate": 1.9429622946865946e-05, "loss": 0.5732, "step": 866 }, { "epoch": 0.061758734907575594, "grad_norm": 3.189164638519287, "learning_rate": 1.9428314599069375e-05, "loss": 0.5984, "step": 867 }, { "epoch": 0.06182996758912989, "grad_norm": 2.6091508865356445, "learning_rate": 1.9427004796580954e-05, "loss": 0.4871, "step": 868 }, { "epoch": 0.06190120027068419, "grad_norm": 3.764057159423828, "learning_rate": 1.9425693539602773e-05, "loss": 0.4501, "step": 869 }, { "epoch": 0.06197243295223849, "grad_norm": 3.2327442169189453, "learning_rate": 1.9424380828337146e-05, "loss": 0.3434, "step": 870 }, { "epoch": 0.06204366563379279, "grad_norm": 2.3774890899658203, "learning_rate": 1.9423066662986607e-05, "loss": 0.3519, "step": 871 }, { "epoch": 0.062114898315347084, "grad_norm": 2.3284530639648438, "learning_rate": 1.942175104375392e-05, "loss": 0.2575, "step": 872 }, { "epoch": 0.06218613099690138, "grad_norm": 3.2120361328125, "learning_rate": 1.9420433970842078e-05, "loss": 0.5885, "step": 873 }, { "epoch": 0.06225736367845568, "grad_norm": 6.192887306213379, "learning_rate": 1.941911544445428e-05, "loss": 0.4458, "step": 874 }, { "epoch": 0.06232859636000997, "grad_norm": 5.333869457244873, "learning_rate": 1.941779546479397e-05, "loss": 0.6876, "step": 875 }, { "epoch": 0.06239982904156427, "grad_norm": 3.337245464324951, "learning_rate": 1.9416474032064803e-05, "loss": 0.7839, "step": 876 }, { "epoch": 0.062471061723118566, "grad_norm": 2.3776698112487793, "learning_rate": 1.9415151146470665e-05, "loss": 0.2316, "step": 877 }, { "epoch": 0.06254229440467286, "grad_norm": 2.574424982070923, "learning_rate": 1.9413826808215665e-05, "loss": 0.5303, "step": 878 }, { "epoch": 0.06261352708622717, "grad_norm": 6.3156609535217285, "learning_rate": 1.941250101750413e-05, "loss": 0.747, "step": 879 }, { "epoch": 0.06268475976778146, "grad_norm": 4.135359764099121, "learning_rate": 1.9411173774540616e-05, "loss": 0.2452, "step": 880 }, { "epoch": 0.06275599244933576, "grad_norm": 2.0866429805755615, "learning_rate": 1.9409845079529907e-05, "loss": 0.2407, "step": 881 }, { "epoch": 0.06282722513089005, "grad_norm": 6.200789451599121, "learning_rate": 1.9408514932677e-05, "loss": 0.8461, "step": 882 }, { "epoch": 0.06289845781244435, "grad_norm": 2.589054822921753, "learning_rate": 1.9407183334187132e-05, "loss": 0.2769, "step": 883 }, { "epoch": 0.06296969049399864, "grad_norm": 4.0996527671813965, "learning_rate": 1.940585028426575e-05, "loss": 0.4383, "step": 884 }, { "epoch": 0.06304092317555295, "grad_norm": 2.7165510654449463, "learning_rate": 1.9404515783118533e-05, "loss": 0.1997, "step": 885 }, { "epoch": 0.06311215585710724, "grad_norm": 6.477215766906738, "learning_rate": 1.9403179830951376e-05, "loss": 0.4275, "step": 886 }, { "epoch": 0.06318338853866154, "grad_norm": 2.261871337890625, "learning_rate": 1.9401842427970406e-05, "loss": 0.414, "step": 887 }, { "epoch": 0.06325462122021583, "grad_norm": 2.615431785583496, "learning_rate": 1.940050357438197e-05, "loss": 0.3441, "step": 888 }, { "epoch": 0.06332585390177013, "grad_norm": 2.7517430782318115, "learning_rate": 1.9399163270392637e-05, "loss": 0.4765, "step": 889 }, { "epoch": 0.06339708658332444, "grad_norm": 3.2903659343719482, "learning_rate": 1.9397821516209207e-05, "loss": 0.5392, "step": 890 }, { "epoch": 0.06346831926487873, "grad_norm": 3.0049238204956055, "learning_rate": 1.9396478312038694e-05, "loss": 0.1626, "step": 891 }, { "epoch": 0.06353955194643303, "grad_norm": 3.399286985397339, "learning_rate": 1.9395133658088344e-05, "loss": 0.2347, "step": 892 }, { "epoch": 0.06361078462798732, "grad_norm": 4.080726146697998, "learning_rate": 1.9393787554565618e-05, "loss": 0.151, "step": 893 }, { "epoch": 0.06368201730954162, "grad_norm": 2.5577878952026367, "learning_rate": 1.9392440001678213e-05, "loss": 0.3897, "step": 894 }, { "epoch": 0.06375324999109591, "grad_norm": 3.473944902420044, "learning_rate": 1.9391090999634038e-05, "loss": 0.126, "step": 895 }, { "epoch": 0.06382448267265022, "grad_norm": 3.932528018951416, "learning_rate": 1.9389740548641232e-05, "loss": 0.1745, "step": 896 }, { "epoch": 0.0638957153542045, "grad_norm": 3.8648149967193604, "learning_rate": 1.9388388648908156e-05, "loss": 0.7113, "step": 897 }, { "epoch": 0.06396694803575881, "grad_norm": 1.9359571933746338, "learning_rate": 1.9387035300643392e-05, "loss": 0.0424, "step": 898 }, { "epoch": 0.0640381807173131, "grad_norm": 14.584874153137207, "learning_rate": 1.9385680504055746e-05, "loss": 0.5734, "step": 899 }, { "epoch": 0.0641094133988674, "grad_norm": 1.8073800802230835, "learning_rate": 1.9384324259354254e-05, "loss": 0.1821, "step": 900 }, { "epoch": 0.06418064608042169, "grad_norm": 5.801602363586426, "learning_rate": 1.938296656674817e-05, "loss": 0.8645, "step": 901 }, { "epoch": 0.064251878761976, "grad_norm": 3.052730083465576, "learning_rate": 1.938160742644697e-05, "loss": 0.5649, "step": 902 }, { "epoch": 0.0643231114435303, "grad_norm": 3.033768653869629, "learning_rate": 1.9380246838660356e-05, "loss": 0.5848, "step": 903 }, { "epoch": 0.06439434412508459, "grad_norm": 2.2813687324523926, "learning_rate": 1.937888480359825e-05, "loss": 0.3877, "step": 904 }, { "epoch": 0.06446557680663889, "grad_norm": 8.392961502075195, "learning_rate": 1.9377521321470806e-05, "loss": 0.8, "step": 905 }, { "epoch": 0.06453680948819318, "grad_norm": 2.810939073562622, "learning_rate": 1.937615639248839e-05, "loss": 0.3293, "step": 906 }, { "epoch": 0.06460804216974748, "grad_norm": 5.484803199768066, "learning_rate": 1.93747900168616e-05, "loss": 0.6504, "step": 907 }, { "epoch": 0.06467927485130177, "grad_norm": 10.583064079284668, "learning_rate": 1.937342219480125e-05, "loss": 0.8917, "step": 908 }, { "epoch": 0.06475050753285608, "grad_norm": 2.3465123176574707, "learning_rate": 1.9372052926518386e-05, "loss": 0.509, "step": 909 }, { "epoch": 0.06482174021441037, "grad_norm": 3.4561471939086914, "learning_rate": 1.937068221222427e-05, "loss": 0.3009, "step": 910 }, { "epoch": 0.06489297289596467, "grad_norm": 2.958239793777466, "learning_rate": 1.936931005213038e-05, "loss": 0.4379, "step": 911 }, { "epoch": 0.06496420557751896, "grad_norm": 3.922301769256592, "learning_rate": 1.936793644644844e-05, "loss": 0.8968, "step": 912 }, { "epoch": 0.06503543825907326, "grad_norm": 3.046469211578369, "learning_rate": 1.936656139539038e-05, "loss": 0.4177, "step": 913 }, { "epoch": 0.06510667094062755, "grad_norm": 2.2607474327087402, "learning_rate": 1.936518489916835e-05, "loss": 0.2898, "step": 914 }, { "epoch": 0.06517790362218186, "grad_norm": 3.6036360263824463, "learning_rate": 1.936380695799473e-05, "loss": 0.8006, "step": 915 }, { "epoch": 0.06524913630373616, "grad_norm": 4.473855972290039, "learning_rate": 1.936242757208213e-05, "loss": 0.8719, "step": 916 }, { "epoch": 0.06532036898529045, "grad_norm": 1.8606525659561157, "learning_rate": 1.936104674164337e-05, "loss": 0.2333, "step": 917 }, { "epoch": 0.06539160166684475, "grad_norm": 8.314801216125488, "learning_rate": 1.9359664466891495e-05, "loss": 0.1589, "step": 918 }, { "epoch": 0.06546283434839904, "grad_norm": 3.123159646987915, "learning_rate": 1.9358280748039776e-05, "loss": 0.7615, "step": 919 }, { "epoch": 0.06553406702995335, "grad_norm": 2.856058359146118, "learning_rate": 1.9356895585301715e-05, "loss": 0.5052, "step": 920 }, { "epoch": 0.06560529971150764, "grad_norm": 3.41524338722229, "learning_rate": 1.935550897889102e-05, "loss": 0.5706, "step": 921 }, { "epoch": 0.06567653239306194, "grad_norm": 3.2552788257598877, "learning_rate": 1.9354120929021633e-05, "loss": 0.4292, "step": 922 }, { "epoch": 0.06574776507461623, "grad_norm": 2.7574760913848877, "learning_rate": 1.9352731435907715e-05, "loss": 0.5082, "step": 923 }, { "epoch": 0.06581899775617053, "grad_norm": 2.762284517288208, "learning_rate": 1.9351340499763654e-05, "loss": 0.7242, "step": 924 }, { "epoch": 0.06589023043772482, "grad_norm": 4.941155910491943, "learning_rate": 1.934994812080405e-05, "loss": 0.5658, "step": 925 }, { "epoch": 0.06596146311927913, "grad_norm": 3.0973663330078125, "learning_rate": 1.9348554299243737e-05, "loss": 0.3725, "step": 926 }, { "epoch": 0.06603269580083342, "grad_norm": 2.5201385021209717, "learning_rate": 1.934715903529777e-05, "loss": 0.3398, "step": 927 }, { "epoch": 0.06610392848238772, "grad_norm": 4.552981376647949, "learning_rate": 1.934576232918142e-05, "loss": 0.7825, "step": 928 }, { "epoch": 0.06617516116394202, "grad_norm": 5.206096172332764, "learning_rate": 1.9344364181110185e-05, "loss": 0.4194, "step": 929 }, { "epoch": 0.06624639384549631, "grad_norm": 5.273828506469727, "learning_rate": 1.9342964591299785e-05, "loss": 0.2834, "step": 930 }, { "epoch": 0.06631762652705062, "grad_norm": 2.7735073566436768, "learning_rate": 1.934156355996616e-05, "loss": 0.6117, "step": 931 }, { "epoch": 0.0663888592086049, "grad_norm": 2.4389848709106445, "learning_rate": 1.9340161087325483e-05, "loss": 0.4159, "step": 932 }, { "epoch": 0.06646009189015921, "grad_norm": 2.8052902221679688, "learning_rate": 1.9338757173594128e-05, "loss": 0.3224, "step": 933 }, { "epoch": 0.0665313245717135, "grad_norm": 4.618145942687988, "learning_rate": 1.9337351818988718e-05, "loss": 0.5371, "step": 934 }, { "epoch": 0.0666025572532678, "grad_norm": 2.445091962814331, "learning_rate": 1.9335945023726076e-05, "loss": 0.6165, "step": 935 }, { "epoch": 0.06667378993482209, "grad_norm": 3.4490745067596436, "learning_rate": 1.933453678802326e-05, "loss": 0.5035, "step": 936 }, { "epoch": 0.0667450226163764, "grad_norm": 3.0421078205108643, "learning_rate": 1.9333127112097543e-05, "loss": 0.5804, "step": 937 }, { "epoch": 0.06681625529793069, "grad_norm": 3.4441535472869873, "learning_rate": 1.9331715996166424e-05, "loss": 0.6835, "step": 938 }, { "epoch": 0.06688748797948499, "grad_norm": 6.69256591796875, "learning_rate": 1.9330303440447627e-05, "loss": 0.3876, "step": 939 }, { "epoch": 0.06695872066103928, "grad_norm": 3.873143434524536, "learning_rate": 1.9328889445159094e-05, "loss": 1.0144, "step": 940 }, { "epoch": 0.06702995334259358, "grad_norm": 3.1113686561584473, "learning_rate": 1.9327474010518983e-05, "loss": 0.3775, "step": 941 }, { "epoch": 0.06710118602414789, "grad_norm": 2.0928592681884766, "learning_rate": 1.932605713674569e-05, "loss": 0.2315, "step": 942 }, { "epoch": 0.06717241870570217, "grad_norm": 3.0236921310424805, "learning_rate": 1.932463882405782e-05, "loss": 0.7392, "step": 943 }, { "epoch": 0.06724365138725648, "grad_norm": 2.611851215362549, "learning_rate": 1.9323219072674207e-05, "loss": 0.173, "step": 944 }, { "epoch": 0.06731488406881077, "grad_norm": 3.3585939407348633, "learning_rate": 1.9321797882813903e-05, "loss": 0.5513, "step": 945 }, { "epoch": 0.06738611675036507, "grad_norm": 4.050519943237305, "learning_rate": 1.9320375254696177e-05, "loss": 0.7031, "step": 946 }, { "epoch": 0.06745734943191936, "grad_norm": 2.4902873039245605, "learning_rate": 1.9318951188540534e-05, "loss": 0.3769, "step": 947 }, { "epoch": 0.06752858211347366, "grad_norm": 3.246683120727539, "learning_rate": 1.9317525684566686e-05, "loss": 0.2699, "step": 948 }, { "epoch": 0.06759981479502795, "grad_norm": 2.9486937522888184, "learning_rate": 1.9316098742994578e-05, "loss": 0.3326, "step": 949 }, { "epoch": 0.06767104747658226, "grad_norm": 3.313854694366455, "learning_rate": 1.9314670364044374e-05, "loss": 0.3547, "step": 950 }, { "epoch": 0.06774228015813655, "grad_norm": 2.8337888717651367, "learning_rate": 1.931324054793645e-05, "loss": 0.5985, "step": 951 }, { "epoch": 0.06781351283969085, "grad_norm": 4.335596561431885, "learning_rate": 1.9311809294891422e-05, "loss": 0.6434, "step": 952 }, { "epoch": 0.06788474552124514, "grad_norm": 3.5517160892486572, "learning_rate": 1.931037660513011e-05, "loss": 0.7308, "step": 953 }, { "epoch": 0.06795597820279944, "grad_norm": 3.9118449687957764, "learning_rate": 1.930894247887357e-05, "loss": 0.5303, "step": 954 }, { "epoch": 0.06802721088435375, "grad_norm": 2.9481678009033203, "learning_rate": 1.9307506916343066e-05, "loss": 0.4671, "step": 955 }, { "epoch": 0.06809844356590804, "grad_norm": 5.823635578155518, "learning_rate": 1.930606991776009e-05, "loss": 0.3643, "step": 956 }, { "epoch": 0.06816967624746234, "grad_norm": 2.343358278274536, "learning_rate": 1.9304631483346364e-05, "loss": 0.2292, "step": 957 }, { "epoch": 0.06824090892901663, "grad_norm": 5.304490089416504, "learning_rate": 1.930319161332382e-05, "loss": 0.3285, "step": 958 }, { "epoch": 0.06831214161057093, "grad_norm": 6.307834625244141, "learning_rate": 1.930175030791461e-05, "loss": 0.5632, "step": 959 }, { "epoch": 0.06838337429212522, "grad_norm": 2.4588329792022705, "learning_rate": 1.9300307567341124e-05, "loss": 0.3017, "step": 960 }, { "epoch": 0.06845460697367953, "grad_norm": 6.0710129737854, "learning_rate": 1.9298863391825954e-05, "loss": 0.5708, "step": 961 }, { "epoch": 0.06852583965523382, "grad_norm": 3.1560897827148438, "learning_rate": 1.929741778159192e-05, "loss": 0.7626, "step": 962 }, { "epoch": 0.06859707233678812, "grad_norm": 1.7443403005599976, "learning_rate": 1.9295970736862063e-05, "loss": 0.2312, "step": 963 }, { "epoch": 0.06866830501834241, "grad_norm": 6.654495716094971, "learning_rate": 1.9294522257859655e-05, "loss": 0.8885, "step": 964 }, { "epoch": 0.06873953769989671, "grad_norm": 2.0893943309783936, "learning_rate": 1.929307234480818e-05, "loss": 0.1583, "step": 965 }, { "epoch": 0.068810770381451, "grad_norm": 2.4175493717193604, "learning_rate": 1.929162099793134e-05, "loss": 0.5112, "step": 966 }, { "epoch": 0.0688820030630053, "grad_norm": 2.4944376945495605, "learning_rate": 1.9290168217453066e-05, "loss": 0.3561, "step": 967 }, { "epoch": 0.06895323574455961, "grad_norm": 4.480559825897217, "learning_rate": 1.9288714003597504e-05, "loss": 0.5123, "step": 968 }, { "epoch": 0.0690244684261139, "grad_norm": 6.911674499511719, "learning_rate": 1.928725835658903e-05, "loss": 0.2553, "step": 969 }, { "epoch": 0.0690957011076682, "grad_norm": 5.306718826293945, "learning_rate": 1.9285801276652226e-05, "loss": 0.6172, "step": 970 }, { "epoch": 0.06916693378922249, "grad_norm": 4.070590972900391, "learning_rate": 1.9284342764011917e-05, "loss": 0.5723, "step": 971 }, { "epoch": 0.0692381664707768, "grad_norm": 2.7862961292266846, "learning_rate": 1.9282882818893126e-05, "loss": 0.4308, "step": 972 }, { "epoch": 0.06930939915233109, "grad_norm": 2.659595012664795, "learning_rate": 1.9281421441521113e-05, "loss": 0.5775, "step": 973 }, { "epoch": 0.06938063183388539, "grad_norm": 3.451085329055786, "learning_rate": 1.927995863212135e-05, "loss": 0.2925, "step": 974 }, { "epoch": 0.06945186451543968, "grad_norm": 3.0328056812286377, "learning_rate": 1.9278494390919538e-05, "loss": 0.4232, "step": 975 }, { "epoch": 0.06952309719699398, "grad_norm": 3.3224422931671143, "learning_rate": 1.927702871814159e-05, "loss": 0.9173, "step": 976 }, { "epoch": 0.06959432987854827, "grad_norm": 3.2974257469177246, "learning_rate": 1.9275561614013644e-05, "loss": 0.4882, "step": 977 }, { "epoch": 0.06966556256010258, "grad_norm": 2.5703392028808594, "learning_rate": 1.9274093078762063e-05, "loss": 0.3549, "step": 978 }, { "epoch": 0.06973679524165687, "grad_norm": 3.106201171875, "learning_rate": 1.9272623112613425e-05, "loss": 0.487, "step": 979 }, { "epoch": 0.06980802792321117, "grad_norm": 1.597071647644043, "learning_rate": 1.927115171579453e-05, "loss": 0.0891, "step": 980 }, { "epoch": 0.06987926060476547, "grad_norm": 3.1412830352783203, "learning_rate": 1.9269678888532394e-05, "loss": 0.3482, "step": 981 }, { "epoch": 0.06995049328631976, "grad_norm": 3.7975499629974365, "learning_rate": 1.926820463105427e-05, "loss": 0.5771, "step": 982 }, { "epoch": 0.07002172596787407, "grad_norm": 3.892638683319092, "learning_rate": 1.9266728943587615e-05, "loss": 0.4175, "step": 983 }, { "epoch": 0.07009295864942836, "grad_norm": 3.4698612689971924, "learning_rate": 1.926525182636011e-05, "loss": 0.309, "step": 984 }, { "epoch": 0.07016419133098266, "grad_norm": 2.8998594284057617, "learning_rate": 1.926377327959967e-05, "loss": 0.4618, "step": 985 }, { "epoch": 0.07023542401253695, "grad_norm": 4.5070719718933105, "learning_rate": 1.9262293303534403e-05, "loss": 0.6728, "step": 986 }, { "epoch": 0.07030665669409125, "grad_norm": 4.250385761260986, "learning_rate": 1.9260811898392665e-05, "loss": 0.6599, "step": 987 }, { "epoch": 0.07037788937564554, "grad_norm": 3.675543785095215, "learning_rate": 1.925932906440302e-05, "loss": 0.7386, "step": 988 }, { "epoch": 0.07044912205719984, "grad_norm": 3.864457845687866, "learning_rate": 1.9257844801794253e-05, "loss": 0.3606, "step": 989 }, { "epoch": 0.07052035473875413, "grad_norm": 2.580138683319092, "learning_rate": 1.925635911079537e-05, "loss": 0.4641, "step": 990 }, { "epoch": 0.07059158742030844, "grad_norm": 2.6831305027008057, "learning_rate": 1.9254871991635598e-05, "loss": 0.3221, "step": 991 }, { "epoch": 0.07066282010186273, "grad_norm": 2.556180000305176, "learning_rate": 1.9253383444544386e-05, "loss": 0.3985, "step": 992 }, { "epoch": 0.07073405278341703, "grad_norm": 2.4665956497192383, "learning_rate": 1.9251893469751396e-05, "loss": 0.3625, "step": 993 }, { "epoch": 0.07080528546497133, "grad_norm": 3.6774861812591553, "learning_rate": 1.9250402067486523e-05, "loss": 0.5886, "step": 994 }, { "epoch": 0.07087651814652562, "grad_norm": 5.746670246124268, "learning_rate": 1.924890923797987e-05, "loss": 0.6958, "step": 995 }, { "epoch": 0.07094775082807993, "grad_norm": 3.0317447185516357, "learning_rate": 1.9247414981461768e-05, "loss": 0.4822, "step": 996 }, { "epoch": 0.07101898350963422, "grad_norm": 2.796196699142456, "learning_rate": 1.9245919298162763e-05, "loss": 0.5786, "step": 997 }, { "epoch": 0.07109021619118852, "grad_norm": 3.497126579284668, "learning_rate": 1.9244422188313624e-05, "loss": 0.6183, "step": 998 }, { "epoch": 0.07116144887274281, "grad_norm": 2.1518757343292236, "learning_rate": 1.9242923652145345e-05, "loss": 0.4102, "step": 999 }, { "epoch": 0.07123268155429711, "grad_norm": 2.6611673831939697, "learning_rate": 1.9241423689889126e-05, "loss": 0.6952, "step": 1000 }, { "epoch": 0.0713039142358514, "grad_norm": 3.1373541355133057, "learning_rate": 1.9239922301776404e-05, "loss": 0.3614, "step": 1001 }, { "epoch": 0.07137514691740571, "grad_norm": 3.665011167526245, "learning_rate": 1.923841948803882e-05, "loss": 0.8215, "step": 1002 }, { "epoch": 0.07144637959896, "grad_norm": 3.347308874130249, "learning_rate": 1.9236915248908244e-05, "loss": 0.3497, "step": 1003 }, { "epoch": 0.0715176122805143, "grad_norm": 3.4668562412261963, "learning_rate": 1.9235409584616774e-05, "loss": 0.6933, "step": 1004 }, { "epoch": 0.07158884496206859, "grad_norm": 3.219271659851074, "learning_rate": 1.9233902495396707e-05, "loss": 0.7712, "step": 1005 }, { "epoch": 0.0716600776436229, "grad_norm": 2.371466636657715, "learning_rate": 1.9232393981480576e-05, "loss": 0.4934, "step": 1006 }, { "epoch": 0.0717313103251772, "grad_norm": 5.40619421005249, "learning_rate": 1.923088404310113e-05, "loss": 0.8732, "step": 1007 }, { "epoch": 0.07180254300673149, "grad_norm": 3.6770644187927246, "learning_rate": 1.9229372680491334e-05, "loss": 0.4797, "step": 1008 }, { "epoch": 0.07187377568828579, "grad_norm": 3.3191933631896973, "learning_rate": 1.922785989388438e-05, "loss": 0.8777, "step": 1009 }, { "epoch": 0.07194500836984008, "grad_norm": 3.066992998123169, "learning_rate": 1.922634568351367e-05, "loss": 0.4806, "step": 1010 }, { "epoch": 0.07201624105139438, "grad_norm": 3.433650016784668, "learning_rate": 1.922483004961284e-05, "loss": 0.5103, "step": 1011 }, { "epoch": 0.07208747373294867, "grad_norm": 6.266208648681641, "learning_rate": 1.9223312992415723e-05, "loss": 0.6417, "step": 1012 }, { "epoch": 0.07215870641450298, "grad_norm": 2.4837069511413574, "learning_rate": 1.9221794512156394e-05, "loss": 0.4199, "step": 1013 }, { "epoch": 0.07222993909605727, "grad_norm": 2.2320425510406494, "learning_rate": 1.9220274609069143e-05, "loss": 0.4303, "step": 1014 }, { "epoch": 0.07230117177761157, "grad_norm": 2.6277027130126953, "learning_rate": 1.921875328338847e-05, "loss": 0.5292, "step": 1015 }, { "epoch": 0.07237240445916586, "grad_norm": 3.659153938293457, "learning_rate": 1.9217230535349097e-05, "loss": 0.5257, "step": 1016 }, { "epoch": 0.07244363714072016, "grad_norm": 2.4081618785858154, "learning_rate": 1.9215706365185973e-05, "loss": 0.3407, "step": 1017 }, { "epoch": 0.07251486982227447, "grad_norm": 4.274810314178467, "learning_rate": 1.9214180773134257e-05, "loss": 0.4593, "step": 1018 }, { "epoch": 0.07258610250382876, "grad_norm": 8.175437927246094, "learning_rate": 1.921265375942934e-05, "loss": 0.4019, "step": 1019 }, { "epoch": 0.07265733518538306, "grad_norm": 6.455588340759277, "learning_rate": 1.9211125324306816e-05, "loss": 0.7002, "step": 1020 }, { "epoch": 0.07272856786693735, "grad_norm": 3.429478168487549, "learning_rate": 1.9209595468002515e-05, "loss": 0.7091, "step": 1021 }, { "epoch": 0.07279980054849165, "grad_norm": 3.0515952110290527, "learning_rate": 1.920806419075247e-05, "loss": 0.4965, "step": 1022 }, { "epoch": 0.07287103323004594, "grad_norm": 5.194477081298828, "learning_rate": 1.9206531492792945e-05, "loss": 0.3795, "step": 1023 }, { "epoch": 0.07294226591160025, "grad_norm": 3.020181179046631, "learning_rate": 1.9204997374360423e-05, "loss": 0.8128, "step": 1024 }, { "epoch": 0.07301349859315454, "grad_norm": 4.738003730773926, "learning_rate": 1.9203461835691596e-05, "loss": 0.4578, "step": 1025 }, { "epoch": 0.07308473127470884, "grad_norm": 3.27852201461792, "learning_rate": 1.9201924877023388e-05, "loss": 0.4468, "step": 1026 }, { "epoch": 0.07315596395626313, "grad_norm": 4.257307052612305, "learning_rate": 1.9200386498592932e-05, "loss": 0.6934, "step": 1027 }, { "epoch": 0.07322719663781743, "grad_norm": 2.380798578262329, "learning_rate": 1.9198846700637582e-05, "loss": 0.2254, "step": 1028 }, { "epoch": 0.07329842931937172, "grad_norm": 4.779901027679443, "learning_rate": 1.9197305483394917e-05, "loss": 0.6494, "step": 1029 }, { "epoch": 0.07336966200092603, "grad_norm": 4.874053955078125, "learning_rate": 1.9195762847102732e-05, "loss": 0.31, "step": 1030 }, { "epoch": 0.07344089468248033, "grad_norm": 2.9153478145599365, "learning_rate": 1.9194218791999037e-05, "loss": 0.4757, "step": 1031 }, { "epoch": 0.07351212736403462, "grad_norm": 1.996578335762024, "learning_rate": 1.9192673318322062e-05, "loss": 0.2465, "step": 1032 }, { "epoch": 0.07358336004558892, "grad_norm": 3.63970685005188, "learning_rate": 1.9191126426310264e-05, "loss": 0.3826, "step": 1033 }, { "epoch": 0.07365459272714321, "grad_norm": 2.14697527885437, "learning_rate": 1.918957811620231e-05, "loss": 0.392, "step": 1034 }, { "epoch": 0.07372582540869752, "grad_norm": 2.631237745285034, "learning_rate": 1.9188028388237084e-05, "loss": 0.2881, "step": 1035 }, { "epoch": 0.0737970580902518, "grad_norm": 3.8839914798736572, "learning_rate": 1.9186477242653693e-05, "loss": 0.5242, "step": 1036 }, { "epoch": 0.07386829077180611, "grad_norm": 4.135127067565918, "learning_rate": 1.9184924679691474e-05, "loss": 0.8156, "step": 1037 }, { "epoch": 0.0739395234533604, "grad_norm": 3.3383543491363525, "learning_rate": 1.9183370699589954e-05, "loss": 0.6394, "step": 1038 }, { "epoch": 0.0740107561349147, "grad_norm": 2.001521348953247, "learning_rate": 1.918181530258891e-05, "loss": 0.2869, "step": 1039 }, { "epoch": 0.07408198881646899, "grad_norm": 3.817751407623291, "learning_rate": 1.918025848892832e-05, "loss": 0.5005, "step": 1040 }, { "epoch": 0.0741532214980233, "grad_norm": 2.786716938018799, "learning_rate": 1.9178700258848383e-05, "loss": 0.6011, "step": 1041 }, { "epoch": 0.07422445417957758, "grad_norm": 2.821485757827759, "learning_rate": 1.9177140612589517e-05, "loss": 0.3483, "step": 1042 }, { "epoch": 0.07429568686113189, "grad_norm": 1.9917645454406738, "learning_rate": 1.9175579550392362e-05, "loss": 0.2927, "step": 1043 }, { "epoch": 0.07436691954268619, "grad_norm": 3.777744770050049, "learning_rate": 1.9174017072497773e-05, "loss": 0.5297, "step": 1044 }, { "epoch": 0.07443815222424048, "grad_norm": 3.425264596939087, "learning_rate": 1.9172453179146822e-05, "loss": 0.5447, "step": 1045 }, { "epoch": 0.07450938490579478, "grad_norm": 5.876537799835205, "learning_rate": 1.9170887870580806e-05, "loss": 0.6511, "step": 1046 }, { "epoch": 0.07458061758734907, "grad_norm": 2.5663487911224365, "learning_rate": 1.9169321147041234e-05, "loss": 0.4484, "step": 1047 }, { "epoch": 0.07465185026890338, "grad_norm": 2.9270100593566895, "learning_rate": 1.916775300876983e-05, "loss": 0.6907, "step": 1048 }, { "epoch": 0.07472308295045767, "grad_norm": 3.4039342403411865, "learning_rate": 1.916618345600855e-05, "loss": 0.4457, "step": 1049 }, { "epoch": 0.07479431563201197, "grad_norm": 2.1377956867218018, "learning_rate": 1.9164612488999556e-05, "loss": 0.5034, "step": 1050 }, { "epoch": 0.07486554831356626, "grad_norm": 2.834691047668457, "learning_rate": 1.916304010798523e-05, "loss": 0.4636, "step": 1051 }, { "epoch": 0.07493678099512056, "grad_norm": 2.271336555480957, "learning_rate": 1.916146631320818e-05, "loss": 0.2112, "step": 1052 }, { "epoch": 0.07500801367667485, "grad_norm": 2.3811984062194824, "learning_rate": 1.915989110491122e-05, "loss": 0.5748, "step": 1053 }, { "epoch": 0.07507924635822916, "grad_norm": 3.6429431438446045, "learning_rate": 1.9158314483337394e-05, "loss": 0.6008, "step": 1054 }, { "epoch": 0.07515047903978345, "grad_norm": 3.3966774940490723, "learning_rate": 1.9156736448729952e-05, "loss": 0.6067, "step": 1055 }, { "epoch": 0.07522171172133775, "grad_norm": 4.117657661437988, "learning_rate": 1.9155157001332374e-05, "loss": 0.4612, "step": 1056 }, { "epoch": 0.07529294440289205, "grad_norm": 2.73118257522583, "learning_rate": 1.915357614138835e-05, "loss": 0.5499, "step": 1057 }, { "epoch": 0.07536417708444634, "grad_norm": 2.591031789779663, "learning_rate": 1.915199386914179e-05, "loss": 0.5646, "step": 1058 }, { "epoch": 0.07543540976600065, "grad_norm": 4.222745418548584, "learning_rate": 1.9150410184836826e-05, "loss": 0.5541, "step": 1059 }, { "epoch": 0.07550664244755494, "grad_norm": 1.97188138961792, "learning_rate": 1.91488250887178e-05, "loss": 0.1848, "step": 1060 }, { "epoch": 0.07557787512910924, "grad_norm": 3.9484033584594727, "learning_rate": 1.9147238581029276e-05, "loss": 0.7725, "step": 1061 }, { "epoch": 0.07564910781066353, "grad_norm": 2.2619802951812744, "learning_rate": 1.914565066201604e-05, "loss": 0.2517, "step": 1062 }, { "epoch": 0.07572034049221783, "grad_norm": 2.538317918777466, "learning_rate": 1.9144061331923086e-05, "loss": 0.6815, "step": 1063 }, { "epoch": 0.07579157317377212, "grad_norm": 3.0275983810424805, "learning_rate": 1.9142470590995636e-05, "loss": 0.6359, "step": 1064 }, { "epoch": 0.07586280585532643, "grad_norm": 1.1580815315246582, "learning_rate": 1.9140878439479123e-05, "loss": 0.1504, "step": 1065 }, { "epoch": 0.07593403853688072, "grad_norm": 3.901352882385254, "learning_rate": 1.9139284877619196e-05, "loss": 0.6656, "step": 1066 }, { "epoch": 0.07600527121843502, "grad_norm": 3.0247433185577393, "learning_rate": 1.9137689905661733e-05, "loss": 0.5701, "step": 1067 }, { "epoch": 0.07607650389998931, "grad_norm": 2.999526262283325, "learning_rate": 1.9136093523852817e-05, "loss": 0.7776, "step": 1068 }, { "epoch": 0.07614773658154361, "grad_norm": 2.9940085411071777, "learning_rate": 1.9134495732438755e-05, "loss": 0.8105, "step": 1069 }, { "epoch": 0.07621896926309792, "grad_norm": 7.391124725341797, "learning_rate": 1.9132896531666067e-05, "loss": 0.5404, "step": 1070 }, { "epoch": 0.0762902019446522, "grad_norm": 2.4210569858551025, "learning_rate": 1.9131295921781495e-05, "loss": 0.2867, "step": 1071 }, { "epoch": 0.07636143462620651, "grad_norm": 2.238307476043701, "learning_rate": 1.9129693903031995e-05, "loss": 0.4169, "step": 1072 }, { "epoch": 0.0764326673077608, "grad_norm": 3.4303410053253174, "learning_rate": 1.9128090475664748e-05, "loss": 0.4206, "step": 1073 }, { "epoch": 0.0765038999893151, "grad_norm": 4.41590690612793, "learning_rate": 1.9126485639927137e-05, "loss": 0.4879, "step": 1074 }, { "epoch": 0.07657513267086939, "grad_norm": 5.135560512542725, "learning_rate": 1.9124879396066778e-05, "loss": 0.3732, "step": 1075 }, { "epoch": 0.0766463653524237, "grad_norm": 3.853181838989258, "learning_rate": 1.9123271744331494e-05, "loss": 0.6471, "step": 1076 }, { "epoch": 0.07671759803397799, "grad_norm": 1.5850954055786133, "learning_rate": 1.9121662684969337e-05, "loss": 0.2303, "step": 1077 }, { "epoch": 0.07678883071553229, "grad_norm": 4.402900218963623, "learning_rate": 1.9120052218228558e-05, "loss": 0.5629, "step": 1078 }, { "epoch": 0.07686006339708658, "grad_norm": 2.941265106201172, "learning_rate": 1.911844034435764e-05, "loss": 0.5734, "step": 1079 }, { "epoch": 0.07693129607864088, "grad_norm": 3.1970558166503906, "learning_rate": 1.911682706360528e-05, "loss": 0.2509, "step": 1080 }, { "epoch": 0.07700252876019517, "grad_norm": 2.6382691860198975, "learning_rate": 1.9115212376220392e-05, "loss": 0.7305, "step": 1081 }, { "epoch": 0.07707376144174947, "grad_norm": 3.2347311973571777, "learning_rate": 1.91135962824521e-05, "loss": 0.4568, "step": 1082 }, { "epoch": 0.07714499412330378, "grad_norm": 4.5650153160095215, "learning_rate": 1.911197878254975e-05, "loss": 0.8494, "step": 1083 }, { "epoch": 0.07721622680485807, "grad_norm": 4.5968098640441895, "learning_rate": 1.9110359876762913e-05, "loss": 0.5061, "step": 1084 }, { "epoch": 0.07728745948641237, "grad_norm": 2.6800906658172607, "learning_rate": 1.9108739565341365e-05, "loss": 0.0886, "step": 1085 }, { "epoch": 0.07735869216796666, "grad_norm": 3.185925006866455, "learning_rate": 1.9107117848535105e-05, "loss": 0.7617, "step": 1086 }, { "epoch": 0.07742992484952096, "grad_norm": 3.2358810901641846, "learning_rate": 1.9105494726594344e-05, "loss": 0.8263, "step": 1087 }, { "epoch": 0.07750115753107525, "grad_norm": 2.2871272563934326, "learning_rate": 1.910387019976952e-05, "loss": 0.374, "step": 1088 }, { "epoch": 0.07757239021262956, "grad_norm": 2.076991558074951, "learning_rate": 1.910224426831127e-05, "loss": 0.217, "step": 1089 }, { "epoch": 0.07764362289418385, "grad_norm": 2.2119734287261963, "learning_rate": 1.910061693247047e-05, "loss": 0.3282, "step": 1090 }, { "epoch": 0.07771485557573815, "grad_norm": 2.861825466156006, "learning_rate": 1.909898819249819e-05, "loss": 0.5239, "step": 1091 }, { "epoch": 0.07778608825729244, "grad_norm": 2.5045886039733887, "learning_rate": 1.9097358048645732e-05, "loss": 0.2283, "step": 1092 }, { "epoch": 0.07785732093884674, "grad_norm": 3.904294729232788, "learning_rate": 1.9095726501164616e-05, "loss": 0.4468, "step": 1093 }, { "epoch": 0.07792855362040103, "grad_norm": 3.162916898727417, "learning_rate": 1.909409355030657e-05, "loss": 0.6767, "step": 1094 }, { "epoch": 0.07799978630195534, "grad_norm": 3.3371212482452393, "learning_rate": 1.909245919632354e-05, "loss": 0.7788, "step": 1095 }, { "epoch": 0.07807101898350964, "grad_norm": 2.8538920879364014, "learning_rate": 1.9090823439467686e-05, "loss": 0.5847, "step": 1096 }, { "epoch": 0.07814225166506393, "grad_norm": 4.150547027587891, "learning_rate": 1.9089186279991398e-05, "loss": 0.3363, "step": 1097 }, { "epoch": 0.07821348434661823, "grad_norm": 4.892727375030518, "learning_rate": 1.908754771814726e-05, "loss": 0.7339, "step": 1098 }, { "epoch": 0.07828471702817252, "grad_norm": 2.806790828704834, "learning_rate": 1.90859077541881e-05, "loss": 0.2197, "step": 1099 }, { "epoch": 0.07835594970972683, "grad_norm": 3.13319993019104, "learning_rate": 1.9084266388366937e-05, "loss": 0.4825, "step": 1100 }, { "epoch": 0.07842718239128112, "grad_norm": 2.3038508892059326, "learning_rate": 1.9082623620937023e-05, "loss": 0.2042, "step": 1101 }, { "epoch": 0.07849841507283542, "grad_norm": 11.431265830993652, "learning_rate": 1.9080979452151813e-05, "loss": 0.2419, "step": 1102 }, { "epoch": 0.07856964775438971, "grad_norm": 2.3126673698425293, "learning_rate": 1.9079333882264994e-05, "loss": 0.3908, "step": 1103 }, { "epoch": 0.07864088043594401, "grad_norm": 1.1036229133605957, "learning_rate": 1.907768691153045e-05, "loss": 0.1198, "step": 1104 }, { "epoch": 0.0787121131174983, "grad_norm": 3.593777656555176, "learning_rate": 1.90760385402023e-05, "loss": 0.131, "step": 1105 }, { "epoch": 0.0787833457990526, "grad_norm": 5.454516887664795, "learning_rate": 1.9074388768534872e-05, "loss": 0.6927, "step": 1106 }, { "epoch": 0.0788545784806069, "grad_norm": 2.1224257946014404, "learning_rate": 1.9072737596782703e-05, "loss": 0.2603, "step": 1107 }, { "epoch": 0.0789258111621612, "grad_norm": 2.9937987327575684, "learning_rate": 1.9071085025200555e-05, "loss": 0.6574, "step": 1108 }, { "epoch": 0.0789970438437155, "grad_norm": 2.5190248489379883, "learning_rate": 1.9069431054043398e-05, "loss": 0.4686, "step": 1109 }, { "epoch": 0.07906827652526979, "grad_norm": 2.2233970165252686, "learning_rate": 1.9067775683566433e-05, "loss": 0.2929, "step": 1110 }, { "epoch": 0.0791395092068241, "grad_norm": 8.826275825500488, "learning_rate": 1.9066118914025054e-05, "loss": 0.9163, "step": 1111 }, { "epoch": 0.07921074188837839, "grad_norm": 3.789903402328491, "learning_rate": 1.906446074567489e-05, "loss": 0.7338, "step": 1112 }, { "epoch": 0.07928197456993269, "grad_norm": 2.5349574089050293, "learning_rate": 1.906280117877178e-05, "loss": 0.5931, "step": 1113 }, { "epoch": 0.07935320725148698, "grad_norm": 3.8266725540161133, "learning_rate": 1.9061140213571777e-05, "loss": 0.7492, "step": 1114 }, { "epoch": 0.07942443993304128, "grad_norm": 2.295623779296875, "learning_rate": 1.905947785033115e-05, "loss": 0.1396, "step": 1115 }, { "epoch": 0.07949567261459557, "grad_norm": 3.121149778366089, "learning_rate": 1.9057814089306388e-05, "loss": 0.5283, "step": 1116 }, { "epoch": 0.07956690529614988, "grad_norm": 3.348057746887207, "learning_rate": 1.905614893075419e-05, "loss": 0.4602, "step": 1117 }, { "epoch": 0.07963813797770417, "grad_norm": 4.026923179626465, "learning_rate": 1.905448237493147e-05, "loss": 0.6089, "step": 1118 }, { "epoch": 0.07970937065925847, "grad_norm": 3.7878692150115967, "learning_rate": 1.905281442209536e-05, "loss": 0.5954, "step": 1119 }, { "epoch": 0.07978060334081276, "grad_norm": 2.528989315032959, "learning_rate": 1.9051145072503216e-05, "loss": 0.253, "step": 1120 }, { "epoch": 0.07985183602236706, "grad_norm": 3.5902223587036133, "learning_rate": 1.9049474326412593e-05, "loss": 0.5904, "step": 1121 }, { "epoch": 0.07992306870392137, "grad_norm": 2.9288365840911865, "learning_rate": 1.904780218408127e-05, "loss": 0.2707, "step": 1122 }, { "epoch": 0.07999430138547566, "grad_norm": 3.2119977474212646, "learning_rate": 1.9046128645767247e-05, "loss": 0.7213, "step": 1123 }, { "epoch": 0.08006553406702996, "grad_norm": 3.1106433868408203, "learning_rate": 1.9044453711728733e-05, "loss": 0.5712, "step": 1124 }, { "epoch": 0.08013676674858425, "grad_norm": 2.9914629459381104, "learning_rate": 1.904277738222415e-05, "loss": 0.607, "step": 1125 }, { "epoch": 0.08020799943013855, "grad_norm": 3.0236387252807617, "learning_rate": 1.9041099657512138e-05, "loss": 0.4567, "step": 1126 }, { "epoch": 0.08027923211169284, "grad_norm": 3.1905834674835205, "learning_rate": 1.903942053785156e-05, "loss": 0.7013, "step": 1127 }, { "epoch": 0.08035046479324714, "grad_norm": 2.9885287284851074, "learning_rate": 1.9037740023501473e-05, "loss": 0.641, "step": 1128 }, { "epoch": 0.08042169747480143, "grad_norm": 3.435246229171753, "learning_rate": 1.9036058114721174e-05, "loss": 0.7159, "step": 1129 }, { "epoch": 0.08049293015635574, "grad_norm": 1.1890342235565186, "learning_rate": 1.9034374811770163e-05, "loss": 0.0397, "step": 1130 }, { "epoch": 0.08056416283791003, "grad_norm": 3.0385210514068604, "learning_rate": 1.9032690114908155e-05, "loss": 0.5689, "step": 1131 }, { "epoch": 0.08063539551946433, "grad_norm": 2.4226856231689453, "learning_rate": 1.903100402439508e-05, "loss": 0.565, "step": 1132 }, { "epoch": 0.08070662820101862, "grad_norm": 3.271113634109497, "learning_rate": 1.902931654049108e-05, "loss": 0.491, "step": 1133 }, { "epoch": 0.08077786088257292, "grad_norm": 2.7045912742614746, "learning_rate": 1.9027627663456528e-05, "loss": 0.2879, "step": 1134 }, { "epoch": 0.08084909356412723, "grad_norm": 3.477649688720703, "learning_rate": 1.9025937393551993e-05, "loss": 0.7974, "step": 1135 }, { "epoch": 0.08092032624568152, "grad_norm": 4.423633575439453, "learning_rate": 1.902424573103827e-05, "loss": 0.6168, "step": 1136 }, { "epoch": 0.08099155892723582, "grad_norm": 2.3800225257873535, "learning_rate": 1.9022552676176358e-05, "loss": 0.3965, "step": 1137 }, { "epoch": 0.08106279160879011, "grad_norm": 5.605489730834961, "learning_rate": 1.9020858229227483e-05, "loss": 0.7233, "step": 1138 }, { "epoch": 0.08113402429034441, "grad_norm": 3.025442123413086, "learning_rate": 1.901916239045308e-05, "loss": 0.5697, "step": 1139 }, { "epoch": 0.0812052569718987, "grad_norm": 2.0553054809570312, "learning_rate": 1.9017465160114804e-05, "loss": 0.2114, "step": 1140 }, { "epoch": 0.08127648965345301, "grad_norm": 2.875681161880493, "learning_rate": 1.901576653847451e-05, "loss": 0.7907, "step": 1141 }, { "epoch": 0.0813477223350073, "grad_norm": 4.55006217956543, "learning_rate": 1.9014066525794284e-05, "loss": 0.7804, "step": 1142 }, { "epoch": 0.0814189550165616, "grad_norm": 3.5956201553344727, "learning_rate": 1.9012365122336425e-05, "loss": 0.3935, "step": 1143 }, { "epoch": 0.08149018769811589, "grad_norm": 3.9524645805358887, "learning_rate": 1.9010662328363435e-05, "loss": 0.6079, "step": 1144 }, { "epoch": 0.0815614203796702, "grad_norm": 2.8235981464385986, "learning_rate": 1.900895814413804e-05, "loss": 0.1739, "step": 1145 }, { "epoch": 0.08163265306122448, "grad_norm": 3.512258291244507, "learning_rate": 1.9007252569923173e-05, "loss": 0.6243, "step": 1146 }, { "epoch": 0.08170388574277879, "grad_norm": 4.197023868560791, "learning_rate": 1.9005545605981996e-05, "loss": 0.5443, "step": 1147 }, { "epoch": 0.08177511842433309, "grad_norm": 4.520979404449463, "learning_rate": 1.900383725257787e-05, "loss": 0.3076, "step": 1148 }, { "epoch": 0.08184635110588738, "grad_norm": 3.3862149715423584, "learning_rate": 1.9002127509974376e-05, "loss": 0.5742, "step": 1149 }, { "epoch": 0.08191758378744168, "grad_norm": 4.831696510314941, "learning_rate": 1.9000416378435312e-05, "loss": 0.3281, "step": 1150 }, { "epoch": 0.08198881646899597, "grad_norm": 2.7291791439056396, "learning_rate": 1.899870385822469e-05, "loss": 0.5591, "step": 1151 }, { "epoch": 0.08206004915055028, "grad_norm": 2.1096365451812744, "learning_rate": 1.8996989949606724e-05, "loss": 0.2068, "step": 1152 }, { "epoch": 0.08213128183210457, "grad_norm": 3.5661513805389404, "learning_rate": 1.8995274652845867e-05, "loss": 0.408, "step": 1153 }, { "epoch": 0.08220251451365887, "grad_norm": 3.3362226486206055, "learning_rate": 1.8993557968206763e-05, "loss": 0.6003, "step": 1154 }, { "epoch": 0.08227374719521316, "grad_norm": 1.8148953914642334, "learning_rate": 1.8991839895954277e-05, "loss": 0.2874, "step": 1155 }, { "epoch": 0.08234497987676746, "grad_norm": 2.31791090965271, "learning_rate": 1.8990120436353496e-05, "loss": 0.5596, "step": 1156 }, { "epoch": 0.08241621255832175, "grad_norm": 17.566858291625977, "learning_rate": 1.898839958966971e-05, "loss": 0.7299, "step": 1157 }, { "epoch": 0.08248744523987606, "grad_norm": 3.56974196434021, "learning_rate": 1.8986677356168433e-05, "loss": 0.6299, "step": 1158 }, { "epoch": 0.08255867792143035, "grad_norm": 2.951828718185425, "learning_rate": 1.8984953736115382e-05, "loss": 0.5079, "step": 1159 }, { "epoch": 0.08262991060298465, "grad_norm": 2.3427681922912598, "learning_rate": 1.89832287297765e-05, "loss": 0.3982, "step": 1160 }, { "epoch": 0.08270114328453895, "grad_norm": 4.116645336151123, "learning_rate": 1.8981502337417933e-05, "loss": 0.3141, "step": 1161 }, { "epoch": 0.08277237596609324, "grad_norm": 2.6049489974975586, "learning_rate": 1.8979774559306046e-05, "loss": 0.3396, "step": 1162 }, { "epoch": 0.08284360864764755, "grad_norm": 4.262461185455322, "learning_rate": 1.897804539570742e-05, "loss": 0.7828, "step": 1163 }, { "epoch": 0.08291484132920184, "grad_norm": 3.203399896621704, "learning_rate": 1.8976314846888845e-05, "loss": 0.817, "step": 1164 }, { "epoch": 0.08298607401075614, "grad_norm": 7.994067668914795, "learning_rate": 1.8974582913117323e-05, "loss": 0.1586, "step": 1165 }, { "epoch": 0.08305730669231043, "grad_norm": 2.180044651031494, "learning_rate": 1.897284959466008e-05, "loss": 0.2441, "step": 1166 }, { "epoch": 0.08312853937386473, "grad_norm": 2.5884642601013184, "learning_rate": 1.897111489178455e-05, "loss": 0.2599, "step": 1167 }, { "epoch": 0.08319977205541902, "grad_norm": 2.5363454818725586, "learning_rate": 1.8969378804758375e-05, "loss": 0.4004, "step": 1168 }, { "epoch": 0.08327100473697333, "grad_norm": 2.38020920753479, "learning_rate": 1.8967641333849417e-05, "loss": 0.375, "step": 1169 }, { "epoch": 0.08334223741852761, "grad_norm": 2.4055802822113037, "learning_rate": 1.896590247932575e-05, "loss": 0.3817, "step": 1170 }, { "epoch": 0.08341347010008192, "grad_norm": 2.2278079986572266, "learning_rate": 1.8964162241455662e-05, "loss": 0.3762, "step": 1171 }, { "epoch": 0.08348470278163621, "grad_norm": 5.333711624145508, "learning_rate": 1.896242062050765e-05, "loss": 0.5521, "step": 1172 }, { "epoch": 0.08355593546319051, "grad_norm": 3.2224271297454834, "learning_rate": 1.8960677616750435e-05, "loss": 0.5525, "step": 1173 }, { "epoch": 0.08362716814474482, "grad_norm": 2.1929237842559814, "learning_rate": 1.8958933230452938e-05, "loss": 0.6326, "step": 1174 }, { "epoch": 0.0836984008262991, "grad_norm": 2.6454601287841797, "learning_rate": 1.8957187461884308e-05, "loss": 0.5623, "step": 1175 }, { "epoch": 0.08376963350785341, "grad_norm": 6.21392297744751, "learning_rate": 1.895544031131389e-05, "loss": 0.6474, "step": 1176 }, { "epoch": 0.0838408661894077, "grad_norm": 3.0276241302490234, "learning_rate": 1.8953691779011255e-05, "loss": 0.9168, "step": 1177 }, { "epoch": 0.083912098870962, "grad_norm": 5.576284408569336, "learning_rate": 1.895194186524618e-05, "loss": 0.6853, "step": 1178 }, { "epoch": 0.08398333155251629, "grad_norm": 2.554047107696533, "learning_rate": 1.895019057028867e-05, "loss": 0.5874, "step": 1179 }, { "epoch": 0.0840545642340706, "grad_norm": 2.7928574085235596, "learning_rate": 1.894843789440892e-05, "loss": 0.3788, "step": 1180 }, { "epoch": 0.08412579691562488, "grad_norm": 1.737177848815918, "learning_rate": 1.8946683837877354e-05, "loss": 0.1799, "step": 1181 }, { "epoch": 0.08419702959717919, "grad_norm": 4.5933098793029785, "learning_rate": 1.8944928400964606e-05, "loss": 0.7763, "step": 1182 }, { "epoch": 0.08426826227873348, "grad_norm": 2.972719192504883, "learning_rate": 1.894317158394152e-05, "loss": 0.8102, "step": 1183 }, { "epoch": 0.08433949496028778, "grad_norm": 3.528303384780884, "learning_rate": 1.8941413387079156e-05, "loss": 0.6818, "step": 1184 }, { "epoch": 0.08441072764184207, "grad_norm": 3.3144428730010986, "learning_rate": 1.8939653810648785e-05, "loss": 0.3379, "step": 1185 }, { "epoch": 0.08448196032339637, "grad_norm": 1.971819519996643, "learning_rate": 1.8937892854921892e-05, "loss": 0.334, "step": 1186 }, { "epoch": 0.08455319300495068, "grad_norm": 3.6054365634918213, "learning_rate": 1.8936130520170172e-05, "loss": 0.3554, "step": 1187 }, { "epoch": 0.08462442568650497, "grad_norm": 2.585344076156616, "learning_rate": 1.893436680666554e-05, "loss": 0.2273, "step": 1188 }, { "epoch": 0.08469565836805927, "grad_norm": 3.3488073348999023, "learning_rate": 1.893260171468011e-05, "loss": 0.4844, "step": 1189 }, { "epoch": 0.08476689104961356, "grad_norm": 2.335827350616455, "learning_rate": 1.8930835244486232e-05, "loss": 0.2852, "step": 1190 }, { "epoch": 0.08483812373116786, "grad_norm": 4.813215732574463, "learning_rate": 1.892906739635644e-05, "loss": 0.468, "step": 1191 }, { "epoch": 0.08490935641272215, "grad_norm": 4.5968523025512695, "learning_rate": 1.8927298170563503e-05, "loss": 0.6021, "step": 1192 }, { "epoch": 0.08498058909427646, "grad_norm": 6.206570625305176, "learning_rate": 1.892552756738039e-05, "loss": 0.7893, "step": 1193 }, { "epoch": 0.08505182177583075, "grad_norm": 1.877178430557251, "learning_rate": 1.8923755587080288e-05, "loss": 0.2067, "step": 1194 }, { "epoch": 0.08512305445738505, "grad_norm": 3.011931896209717, "learning_rate": 1.8921982229936597e-05, "loss": 0.4586, "step": 1195 }, { "epoch": 0.08519428713893934, "grad_norm": 3.2133657932281494, "learning_rate": 1.8920207496222924e-05, "loss": 0.3618, "step": 1196 }, { "epoch": 0.08526551982049364, "grad_norm": 3.252030372619629, "learning_rate": 1.89184313862131e-05, "loss": 0.8518, "step": 1197 }, { "epoch": 0.08533675250204793, "grad_norm": 2.900805950164795, "learning_rate": 1.891665390018115e-05, "loss": 0.5238, "step": 1198 }, { "epoch": 0.08540798518360224, "grad_norm": 3.249209403991699, "learning_rate": 1.891487503840133e-05, "loss": 0.4749, "step": 1199 }, { "epoch": 0.08547921786515654, "grad_norm": 4.552114009857178, "learning_rate": 1.8913094801148096e-05, "loss": 0.966, "step": 1200 }, { "epoch": 0.08555045054671083, "grad_norm": 2.803631544113159, "learning_rate": 1.891131318869612e-05, "loss": 0.5755, "step": 1201 }, { "epoch": 0.08562168322826513, "grad_norm": 3.3868417739868164, "learning_rate": 1.8909530201320288e-05, "loss": 0.5369, "step": 1202 }, { "epoch": 0.08569291590981942, "grad_norm": 2.566066026687622, "learning_rate": 1.89077458392957e-05, "loss": 0.6002, "step": 1203 }, { "epoch": 0.08576414859137373, "grad_norm": 3.9399662017822266, "learning_rate": 1.890596010289766e-05, "loss": 0.6241, "step": 1204 }, { "epoch": 0.08583538127292802, "grad_norm": 2.322877883911133, "learning_rate": 1.8904172992401685e-05, "loss": 0.3901, "step": 1205 }, { "epoch": 0.08590661395448232, "grad_norm": 4.802760124206543, "learning_rate": 1.8902384508083518e-05, "loss": 0.3012, "step": 1206 }, { "epoch": 0.08597784663603661, "grad_norm": 1.748192310333252, "learning_rate": 1.8900594650219096e-05, "loss": 0.1784, "step": 1207 }, { "epoch": 0.08604907931759091, "grad_norm": 3.183027744293213, "learning_rate": 1.8898803419084578e-05, "loss": 0.4441, "step": 1208 }, { "epoch": 0.0861203119991452, "grad_norm": 4.438425540924072, "learning_rate": 1.889701081495633e-05, "loss": 0.793, "step": 1209 }, { "epoch": 0.0861915446806995, "grad_norm": 4.705627918243408, "learning_rate": 1.8895216838110938e-05, "loss": 0.5387, "step": 1210 }, { "epoch": 0.0862627773622538, "grad_norm": 1.7422181367874146, "learning_rate": 1.889342148882519e-05, "loss": 0.1054, "step": 1211 }, { "epoch": 0.0863340100438081, "grad_norm": 3.0388965606689453, "learning_rate": 1.889162476737609e-05, "loss": 0.1679, "step": 1212 }, { "epoch": 0.0864052427253624, "grad_norm": 1.7832591533660889, "learning_rate": 1.8889826674040855e-05, "loss": 0.202, "step": 1213 }, { "epoch": 0.08647647540691669, "grad_norm": 2.1219141483306885, "learning_rate": 1.8888027209096913e-05, "loss": 0.2868, "step": 1214 }, { "epoch": 0.086547708088471, "grad_norm": 3.050948143005371, "learning_rate": 1.88862263728219e-05, "loss": 0.4902, "step": 1215 }, { "epoch": 0.08661894077002529, "grad_norm": 2.3974015712738037, "learning_rate": 1.888442416549367e-05, "loss": 0.331, "step": 1216 }, { "epoch": 0.08669017345157959, "grad_norm": 3.948610782623291, "learning_rate": 1.888262058739028e-05, "loss": 0.8222, "step": 1217 }, { "epoch": 0.08676140613313388, "grad_norm": 1.609980821609497, "learning_rate": 1.888081563879001e-05, "loss": 0.2263, "step": 1218 }, { "epoch": 0.08683263881468818, "grad_norm": 3.623095989227295, "learning_rate": 1.887900931997134e-05, "loss": 0.736, "step": 1219 }, { "epoch": 0.08690387149624247, "grad_norm": 4.0532097816467285, "learning_rate": 1.8877201631212966e-05, "loss": 0.7307, "step": 1220 }, { "epoch": 0.08697510417779677, "grad_norm": 2.8707895278930664, "learning_rate": 1.88753925727938e-05, "loss": 0.5064, "step": 1221 }, { "epoch": 0.08704633685935106, "grad_norm": 3.581732749938965, "learning_rate": 1.887358214499296e-05, "loss": 0.6369, "step": 1222 }, { "epoch": 0.08711756954090537, "grad_norm": 3.7812867164611816, "learning_rate": 1.8871770348089774e-05, "loss": 0.6164, "step": 1223 }, { "epoch": 0.08718880222245966, "grad_norm": 4.290986061096191, "learning_rate": 1.8869957182363784e-05, "loss": 0.3909, "step": 1224 }, { "epoch": 0.08726003490401396, "grad_norm": 1.977197527885437, "learning_rate": 1.8868142648094745e-05, "loss": 0.2936, "step": 1225 }, { "epoch": 0.08733126758556826, "grad_norm": 1.4167723655700684, "learning_rate": 1.886632674556262e-05, "loss": 0.1523, "step": 1226 }, { "epoch": 0.08740250026712255, "grad_norm": 3.4144527912139893, "learning_rate": 1.8864509475047583e-05, "loss": 0.5122, "step": 1227 }, { "epoch": 0.08747373294867686, "grad_norm": 3.257235288619995, "learning_rate": 1.886269083683002e-05, "loss": 0.5205, "step": 1228 }, { "epoch": 0.08754496563023115, "grad_norm": 5.89054012298584, "learning_rate": 1.886087083119053e-05, "loss": 0.7454, "step": 1229 }, { "epoch": 0.08761619831178545, "grad_norm": 2.221205949783325, "learning_rate": 1.885904945840992e-05, "loss": 0.2779, "step": 1230 }, { "epoch": 0.08768743099333974, "grad_norm": 3.3186254501342773, "learning_rate": 1.885722671876921e-05, "loss": 0.6054, "step": 1231 }, { "epoch": 0.08775866367489404, "grad_norm": 2.041476249694824, "learning_rate": 1.8855402612549624e-05, "loss": 0.2309, "step": 1232 }, { "epoch": 0.08782989635644833, "grad_norm": 2.374070405960083, "learning_rate": 1.8853577140032614e-05, "loss": 0.4163, "step": 1233 }, { "epoch": 0.08790112903800264, "grad_norm": 4.219454288482666, "learning_rate": 1.885175030149982e-05, "loss": 0.7834, "step": 1234 }, { "epoch": 0.08797236171955693, "grad_norm": 5.646050930023193, "learning_rate": 1.8849922097233115e-05, "loss": 0.7268, "step": 1235 }, { "epoch": 0.08804359440111123, "grad_norm": 2.6913323402404785, "learning_rate": 1.8848092527514564e-05, "loss": 0.5171, "step": 1236 }, { "epoch": 0.08811482708266553, "grad_norm": 3.3355329036712646, "learning_rate": 1.8846261592626455e-05, "loss": 0.6964, "step": 1237 }, { "epoch": 0.08818605976421982, "grad_norm": 1.9102216958999634, "learning_rate": 1.8844429292851282e-05, "loss": 0.0672, "step": 1238 }, { "epoch": 0.08825729244577413, "grad_norm": 4.206172466278076, "learning_rate": 1.8842595628471746e-05, "loss": 0.7775, "step": 1239 }, { "epoch": 0.08832852512732842, "grad_norm": 3.104734182357788, "learning_rate": 1.884076059977077e-05, "loss": 0.7994, "step": 1240 }, { "epoch": 0.08839975780888272, "grad_norm": 2.8220067024230957, "learning_rate": 1.8838924207031474e-05, "loss": 0.5807, "step": 1241 }, { "epoch": 0.08847099049043701, "grad_norm": 3.46606707572937, "learning_rate": 1.8837086450537195e-05, "loss": 0.3478, "step": 1242 }, { "epoch": 0.08854222317199131, "grad_norm": 5.062743186950684, "learning_rate": 1.883524733057148e-05, "loss": 0.9071, "step": 1243 }, { "epoch": 0.0886134558535456, "grad_norm": 4.16996955871582, "learning_rate": 1.8833406847418088e-05, "loss": 0.6094, "step": 1244 }, { "epoch": 0.0886846885350999, "grad_norm": 4.167663097381592, "learning_rate": 1.8831565001360987e-05, "loss": 0.7129, "step": 1245 }, { "epoch": 0.0887559212166542, "grad_norm": 3.016483783721924, "learning_rate": 1.8829721792684353e-05, "loss": 0.7603, "step": 1246 }, { "epoch": 0.0888271538982085, "grad_norm": 4.777698040008545, "learning_rate": 1.8827877221672578e-05, "loss": 0.4361, "step": 1247 }, { "epoch": 0.08889838657976279, "grad_norm": 3.6659209728240967, "learning_rate": 1.8826031288610255e-05, "loss": 0.4505, "step": 1248 }, { "epoch": 0.08896961926131709, "grad_norm": 4.132365703582764, "learning_rate": 1.8824183993782193e-05, "loss": 0.3378, "step": 1249 }, { "epoch": 0.0890408519428714, "grad_norm": 2.9635379314422607, "learning_rate": 1.8822335337473413e-05, "loss": 0.4261, "step": 1250 }, { "epoch": 0.08911208462442569, "grad_norm": 3.9187963008880615, "learning_rate": 1.8820485319969145e-05, "loss": 0.7004, "step": 1251 }, { "epoch": 0.08918331730597999, "grad_norm": 3.48785138130188, "learning_rate": 1.881863394155482e-05, "loss": 0.7077, "step": 1252 }, { "epoch": 0.08925454998753428, "grad_norm": 2.470024824142456, "learning_rate": 1.88167812025161e-05, "loss": 0.2817, "step": 1253 }, { "epoch": 0.08932578266908858, "grad_norm": 2.517160177230835, "learning_rate": 1.881492710313883e-05, "loss": 0.3191, "step": 1254 }, { "epoch": 0.08939701535064287, "grad_norm": 4.7456374168396, "learning_rate": 1.8813071643709087e-05, "loss": 0.248, "step": 1255 }, { "epoch": 0.08946824803219718, "grad_norm": 3.623358726501465, "learning_rate": 1.8811214824513145e-05, "loss": 0.2873, "step": 1256 }, { "epoch": 0.08953948071375147, "grad_norm": 2.066709280014038, "learning_rate": 1.8809356645837495e-05, "loss": 0.296, "step": 1257 }, { "epoch": 0.08961071339530577, "grad_norm": 3.290742874145508, "learning_rate": 1.8807497107968834e-05, "loss": 0.5808, "step": 1258 }, { "epoch": 0.08968194607686006, "grad_norm": 3.667315721511841, "learning_rate": 1.8805636211194066e-05, "loss": 0.5665, "step": 1259 }, { "epoch": 0.08975317875841436, "grad_norm": 2.121943235397339, "learning_rate": 1.8803773955800313e-05, "loss": 0.2112, "step": 1260 }, { "epoch": 0.08982441143996865, "grad_norm": 3.6101951599121094, "learning_rate": 1.88019103420749e-05, "loss": 0.3305, "step": 1261 }, { "epoch": 0.08989564412152296, "grad_norm": 3.3880512714385986, "learning_rate": 1.8800045370305365e-05, "loss": 0.6426, "step": 1262 }, { "epoch": 0.08996687680307726, "grad_norm": 4.919668197631836, "learning_rate": 1.879817904077945e-05, "loss": 0.6815, "step": 1263 }, { "epoch": 0.09003810948463155, "grad_norm": 4.391907215118408, "learning_rate": 1.879631135378511e-05, "loss": 0.5457, "step": 1264 }, { "epoch": 0.09010934216618585, "grad_norm": 3.7320609092712402, "learning_rate": 1.8794442309610518e-05, "loss": 0.4657, "step": 1265 }, { "epoch": 0.09018057484774014, "grad_norm": 2.699648857116699, "learning_rate": 1.879257190854404e-05, "loss": 0.3662, "step": 1266 }, { "epoch": 0.09025180752929444, "grad_norm": 4.507177352905273, "learning_rate": 1.879070015087426e-05, "loss": 0.7228, "step": 1267 }, { "epoch": 0.09032304021084873, "grad_norm": 4.651413440704346, "learning_rate": 1.8788827036889978e-05, "loss": 0.6412, "step": 1268 }, { "epoch": 0.09039427289240304, "grad_norm": 3.228273630142212, "learning_rate": 1.8786952566880192e-05, "loss": 0.5671, "step": 1269 }, { "epoch": 0.09046550557395733, "grad_norm": 4.102708339691162, "learning_rate": 1.878507674113411e-05, "loss": 0.6181, "step": 1270 }, { "epoch": 0.09053673825551163, "grad_norm": 3.732553243637085, "learning_rate": 1.878319955994116e-05, "loss": 0.3909, "step": 1271 }, { "epoch": 0.09060797093706592, "grad_norm": 3.583171844482422, "learning_rate": 1.8781321023590962e-05, "loss": 0.6118, "step": 1272 }, { "epoch": 0.09067920361862022, "grad_norm": 9.243890762329102, "learning_rate": 1.877944113237336e-05, "loss": 0.2511, "step": 1273 }, { "epoch": 0.09075043630017451, "grad_norm": 2.879972457885742, "learning_rate": 1.8777559886578407e-05, "loss": 0.5517, "step": 1274 }, { "epoch": 0.09082166898172882, "grad_norm": 3.5413975715637207, "learning_rate": 1.877567728649635e-05, "loss": 0.8936, "step": 1275 }, { "epoch": 0.09089290166328312, "grad_norm": 3.3792591094970703, "learning_rate": 1.8773793332417664e-05, "loss": 0.4607, "step": 1276 }, { "epoch": 0.09096413434483741, "grad_norm": 3.026505708694458, "learning_rate": 1.8771908024633017e-05, "loss": 0.5551, "step": 1277 }, { "epoch": 0.09103536702639171, "grad_norm": 2.767277479171753, "learning_rate": 1.8770021363433295e-05, "loss": 0.6754, "step": 1278 }, { "epoch": 0.091106599707946, "grad_norm": 3.1222352981567383, "learning_rate": 1.876813334910959e-05, "loss": 0.6049, "step": 1279 }, { "epoch": 0.09117783238950031, "grad_norm": 3.2395377159118652, "learning_rate": 1.8766243981953204e-05, "loss": 0.5029, "step": 1280 }, { "epoch": 0.0912490650710546, "grad_norm": 2.1201157569885254, "learning_rate": 1.876435326225565e-05, "loss": 0.3338, "step": 1281 }, { "epoch": 0.0913202977526089, "grad_norm": 3.432853937149048, "learning_rate": 1.8762461190308637e-05, "loss": 0.7293, "step": 1282 }, { "epoch": 0.09139153043416319, "grad_norm": 1.7185752391815186, "learning_rate": 1.8760567766404102e-05, "loss": 0.1766, "step": 1283 }, { "epoch": 0.0914627631157175, "grad_norm": 2.7943053245544434, "learning_rate": 1.8758672990834172e-05, "loss": 0.324, "step": 1284 }, { "epoch": 0.09153399579727178, "grad_norm": 3.4485106468200684, "learning_rate": 1.87567768638912e-05, "loss": 0.7663, "step": 1285 }, { "epoch": 0.09160522847882609, "grad_norm": 3.344630718231201, "learning_rate": 1.8754879385867738e-05, "loss": 0.3642, "step": 1286 }, { "epoch": 0.09167646116038038, "grad_norm": 3.018841028213501, "learning_rate": 1.875298055705654e-05, "loss": 0.8537, "step": 1287 }, { "epoch": 0.09174769384193468, "grad_norm": 2.417069673538208, "learning_rate": 1.8751080377750585e-05, "loss": 0.1751, "step": 1288 }, { "epoch": 0.09181892652348898, "grad_norm": 2.891385316848755, "learning_rate": 1.8749178848243042e-05, "loss": 0.8851, "step": 1289 }, { "epoch": 0.09189015920504327, "grad_norm": 3.068493604660034, "learning_rate": 1.8747275968827304e-05, "loss": 0.4006, "step": 1290 }, { "epoch": 0.09196139188659758, "grad_norm": 2.6412277221679688, "learning_rate": 1.8745371739796962e-05, "loss": 0.5957, "step": 1291 }, { "epoch": 0.09203262456815187, "grad_norm": 4.112875938415527, "learning_rate": 1.8743466161445823e-05, "loss": 0.9725, "step": 1292 }, { "epoch": 0.09210385724970617, "grad_norm": 2.9205336570739746, "learning_rate": 1.8741559234067893e-05, "loss": 0.1135, "step": 1293 }, { "epoch": 0.09217508993126046, "grad_norm": 2.966789722442627, "learning_rate": 1.8739650957957396e-05, "loss": 0.4845, "step": 1294 }, { "epoch": 0.09224632261281476, "grad_norm": 3.231937885284424, "learning_rate": 1.8737741333408757e-05, "loss": 0.4369, "step": 1295 }, { "epoch": 0.09231755529436905, "grad_norm": 3.823906898498535, "learning_rate": 1.873583036071661e-05, "loss": 0.73, "step": 1296 }, { "epoch": 0.09238878797592336, "grad_norm": 3.872420072555542, "learning_rate": 1.87339180401758e-05, "loss": 0.7311, "step": 1297 }, { "epoch": 0.09246002065747765, "grad_norm": 7.204023361206055, "learning_rate": 1.873200437208138e-05, "loss": 0.6314, "step": 1298 }, { "epoch": 0.09253125333903195, "grad_norm": 3.6863696575164795, "learning_rate": 1.8730089356728605e-05, "loss": 0.7498, "step": 1299 }, { "epoch": 0.09260248602058624, "grad_norm": 4.516905784606934, "learning_rate": 1.8728172994412948e-05, "loss": 0.7127, "step": 1300 }, { "epoch": 0.09267371870214054, "grad_norm": 3.2726423740386963, "learning_rate": 1.872625528543008e-05, "loss": 0.508, "step": 1301 }, { "epoch": 0.09274495138369485, "grad_norm": 3.945563793182373, "learning_rate": 1.8724336230075885e-05, "loss": 0.5945, "step": 1302 }, { "epoch": 0.09281618406524914, "grad_norm": 6.09106969833374, "learning_rate": 1.872241582864645e-05, "loss": 0.5706, "step": 1303 }, { "epoch": 0.09288741674680344, "grad_norm": 8.709092140197754, "learning_rate": 1.872049408143808e-05, "loss": 0.2209, "step": 1304 }, { "epoch": 0.09295864942835773, "grad_norm": 2.001002311706543, "learning_rate": 1.871857098874727e-05, "loss": 0.1587, "step": 1305 }, { "epoch": 0.09302988210991203, "grad_norm": 3.7652411460876465, "learning_rate": 1.8716646550870746e-05, "loss": 0.4762, "step": 1306 }, { "epoch": 0.09310111479146632, "grad_norm": 3.5171914100646973, "learning_rate": 1.8714720768105425e-05, "loss": 0.2036, "step": 1307 }, { "epoch": 0.09317234747302063, "grad_norm": 2.0084123611450195, "learning_rate": 1.8712793640748433e-05, "loss": 0.2955, "step": 1308 }, { "epoch": 0.09324358015457491, "grad_norm": 2.4678943157196045, "learning_rate": 1.8710865169097102e-05, "loss": 0.4527, "step": 1309 }, { "epoch": 0.09331481283612922, "grad_norm": 3.3499104976654053, "learning_rate": 1.8708935353448982e-05, "loss": 0.6275, "step": 1310 }, { "epoch": 0.09338604551768351, "grad_norm": 1.440740704536438, "learning_rate": 1.8707004194101825e-05, "loss": 0.1387, "step": 1311 }, { "epoch": 0.09345727819923781, "grad_norm": 0.9849145412445068, "learning_rate": 1.8705071691353583e-05, "loss": 0.0298, "step": 1312 }, { "epoch": 0.0935285108807921, "grad_norm": 4.248468399047852, "learning_rate": 1.870313784550242e-05, "loss": 0.6288, "step": 1313 }, { "epoch": 0.0935997435623464, "grad_norm": 4.436267375946045, "learning_rate": 1.8701202656846717e-05, "loss": 0.8155, "step": 1314 }, { "epoch": 0.09367097624390071, "grad_norm": 4.171763896942139, "learning_rate": 1.8699266125685052e-05, "loss": 0.7333, "step": 1315 }, { "epoch": 0.093742208925455, "grad_norm": 3.7198495864868164, "learning_rate": 1.8697328252316205e-05, "loss": 0.6074, "step": 1316 }, { "epoch": 0.0938134416070093, "grad_norm": 4.168213844299316, "learning_rate": 1.8695389037039172e-05, "loss": 0.7414, "step": 1317 }, { "epoch": 0.09388467428856359, "grad_norm": 3.588937520980835, "learning_rate": 1.869344848015316e-05, "loss": 0.8556, "step": 1318 }, { "epoch": 0.0939559069701179, "grad_norm": 3.791379928588867, "learning_rate": 1.869150658195757e-05, "loss": 0.5198, "step": 1319 }, { "epoch": 0.09402713965167218, "grad_norm": 1.8446608781814575, "learning_rate": 1.868956334275202e-05, "loss": 0.191, "step": 1320 }, { "epoch": 0.09409837233322649, "grad_norm": 3.6945624351501465, "learning_rate": 1.8687618762836334e-05, "loss": 0.6773, "step": 1321 }, { "epoch": 0.09416960501478078, "grad_norm": 2.066972494125366, "learning_rate": 1.8685672842510536e-05, "loss": 0.2142, "step": 1322 }, { "epoch": 0.09424083769633508, "grad_norm": 2.4198994636535645, "learning_rate": 1.8683725582074862e-05, "loss": 0.2013, "step": 1323 }, { "epoch": 0.09431207037788937, "grad_norm": 4.794493675231934, "learning_rate": 1.868177698182976e-05, "loss": 0.5902, "step": 1324 }, { "epoch": 0.09438330305944367, "grad_norm": 1.9296724796295166, "learning_rate": 1.867982704207587e-05, "loss": 0.1591, "step": 1325 }, { "epoch": 0.09445453574099796, "grad_norm": 2.3474555015563965, "learning_rate": 1.8677875763114054e-05, "loss": 0.393, "step": 1326 }, { "epoch": 0.09452576842255227, "grad_norm": 3.922429084777832, "learning_rate": 1.8675923145245373e-05, "loss": 0.5658, "step": 1327 }, { "epoch": 0.09459700110410657, "grad_norm": 3.7784571647644043, "learning_rate": 1.8673969188771094e-05, "loss": 0.6183, "step": 1328 }, { "epoch": 0.09466823378566086, "grad_norm": 2.6175239086151123, "learning_rate": 1.8672013893992697e-05, "loss": 0.2783, "step": 1329 }, { "epoch": 0.09473946646721516, "grad_norm": 2.840385675430298, "learning_rate": 1.8670057261211857e-05, "loss": 0.7182, "step": 1330 }, { "epoch": 0.09481069914876945, "grad_norm": 2.6728367805480957, "learning_rate": 1.8668099290730468e-05, "loss": 0.4719, "step": 1331 }, { "epoch": 0.09488193183032376, "grad_norm": 2.996279001235962, "learning_rate": 1.8666139982850626e-05, "loss": 0.7547, "step": 1332 }, { "epoch": 0.09495316451187805, "grad_norm": 5.368165493011475, "learning_rate": 1.8664179337874618e-05, "loss": 0.6006, "step": 1333 }, { "epoch": 0.09502439719343235, "grad_norm": 3.5279183387756348, "learning_rate": 1.866221735610497e-05, "loss": 0.5682, "step": 1334 }, { "epoch": 0.09509562987498664, "grad_norm": 1.9174907207489014, "learning_rate": 1.866025403784439e-05, "loss": 0.1277, "step": 1335 }, { "epoch": 0.09516686255654094, "grad_norm": 4.527996063232422, "learning_rate": 1.865828938339579e-05, "loss": 0.3665, "step": 1336 }, { "epoch": 0.09523809523809523, "grad_norm": 3.690699577331543, "learning_rate": 1.86563233930623e-05, "loss": 0.6167, "step": 1337 }, { "epoch": 0.09530932791964954, "grad_norm": 2.4651196002960205, "learning_rate": 1.8654356067147258e-05, "loss": 0.4583, "step": 1338 }, { "epoch": 0.09538056060120383, "grad_norm": 1.966208815574646, "learning_rate": 1.8652387405954196e-05, "loss": 0.2182, "step": 1339 }, { "epoch": 0.09545179328275813, "grad_norm": 2.8265726566314697, "learning_rate": 1.865041740978686e-05, "loss": 0.7286, "step": 1340 }, { "epoch": 0.09552302596431243, "grad_norm": 2.8864247798919678, "learning_rate": 1.86484460789492e-05, "loss": 0.6768, "step": 1341 }, { "epoch": 0.09559425864586672, "grad_norm": 2.4593405723571777, "learning_rate": 1.864647341374537e-05, "loss": 0.5385, "step": 1342 }, { "epoch": 0.09566549132742103, "grad_norm": 3.745260715484619, "learning_rate": 1.8644499414479735e-05, "loss": 0.8768, "step": 1343 }, { "epoch": 0.09573672400897532, "grad_norm": 2.902894973754883, "learning_rate": 1.864252408145686e-05, "loss": 0.1994, "step": 1344 }, { "epoch": 0.09580795669052962, "grad_norm": 3.060603141784668, "learning_rate": 1.8640547414981523e-05, "loss": 0.3098, "step": 1345 }, { "epoch": 0.09587918937208391, "grad_norm": 1.8912601470947266, "learning_rate": 1.8638569415358696e-05, "loss": 0.2003, "step": 1346 }, { "epoch": 0.09595042205363821, "grad_norm": 4.674674987792969, "learning_rate": 1.863659008289357e-05, "loss": 0.4261, "step": 1347 }, { "epoch": 0.0960216547351925, "grad_norm": 2.9351983070373535, "learning_rate": 1.8634609417891535e-05, "loss": 0.6848, "step": 1348 }, { "epoch": 0.0960928874167468, "grad_norm": 5.15578556060791, "learning_rate": 1.8632627420658184e-05, "loss": 0.6913, "step": 1349 }, { "epoch": 0.0961641200983011, "grad_norm": 2.5313096046447754, "learning_rate": 1.8630644091499322e-05, "loss": 0.426, "step": 1350 }, { "epoch": 0.0962353527798554, "grad_norm": 3.2554404735565186, "learning_rate": 1.8628659430720958e-05, "loss": 0.6784, "step": 1351 }, { "epoch": 0.09630658546140969, "grad_norm": 5.930413722991943, "learning_rate": 1.86266734386293e-05, "loss": 0.493, "step": 1352 }, { "epoch": 0.09637781814296399, "grad_norm": 2.1458587646484375, "learning_rate": 1.8624686115530767e-05, "loss": 0.0977, "step": 1353 }, { "epoch": 0.0964490508245183, "grad_norm": 3.054422616958618, "learning_rate": 1.8622697461731983e-05, "loss": 0.4721, "step": 1354 }, { "epoch": 0.09652028350607259, "grad_norm": 2.195075273513794, "learning_rate": 1.8620707477539776e-05, "loss": 0.3682, "step": 1355 }, { "epoch": 0.09659151618762689, "grad_norm": 3.596714973449707, "learning_rate": 1.8618716163261185e-05, "loss": 0.6028, "step": 1356 }, { "epoch": 0.09666274886918118, "grad_norm": 2.614881753921509, "learning_rate": 1.8616723519203445e-05, "loss": 0.3709, "step": 1357 }, { "epoch": 0.09673398155073548, "grad_norm": 2.3709182739257812, "learning_rate": 1.8614729545674e-05, "loss": 0.5059, "step": 1358 }, { "epoch": 0.09680521423228977, "grad_norm": 2.328643560409546, "learning_rate": 1.86127342429805e-05, "loss": 0.4069, "step": 1359 }, { "epoch": 0.09687644691384407, "grad_norm": 3.7947659492492676, "learning_rate": 1.86107376114308e-05, "loss": 0.5889, "step": 1360 }, { "epoch": 0.09694767959539836, "grad_norm": 3.2569050788879395, "learning_rate": 1.8608739651332965e-05, "loss": 0.436, "step": 1361 }, { "epoch": 0.09701891227695267, "grad_norm": 4.097048759460449, "learning_rate": 1.8606740362995247e-05, "loss": 0.8535, "step": 1362 }, { "epoch": 0.09709014495850696, "grad_norm": 4.102331638336182, "learning_rate": 1.8604739746726128e-05, "loss": 0.4739, "step": 1363 }, { "epoch": 0.09716137764006126, "grad_norm": 2.897353410720825, "learning_rate": 1.8602737802834275e-05, "loss": 0.3676, "step": 1364 }, { "epoch": 0.09723261032161555, "grad_norm": 2.451538562774658, "learning_rate": 1.8600734531628573e-05, "loss": 0.3479, "step": 1365 }, { "epoch": 0.09730384300316985, "grad_norm": 2.3254690170288086, "learning_rate": 1.8598729933418102e-05, "loss": 0.3924, "step": 1366 }, { "epoch": 0.09737507568472416, "grad_norm": 5.067768096923828, "learning_rate": 1.8596724008512153e-05, "loss": 0.4867, "step": 1367 }, { "epoch": 0.09744630836627845, "grad_norm": 2.858320474624634, "learning_rate": 1.8594716757220218e-05, "loss": 0.4429, "step": 1368 }, { "epoch": 0.09751754104783275, "grad_norm": 3.2118642330169678, "learning_rate": 1.8592708179851994e-05, "loss": 0.7107, "step": 1369 }, { "epoch": 0.09758877372938704, "grad_norm": 3.5770740509033203, "learning_rate": 1.8590698276717386e-05, "loss": 0.6588, "step": 1370 }, { "epoch": 0.09766000641094134, "grad_norm": 3.213167190551758, "learning_rate": 1.8588687048126503e-05, "loss": 0.3322, "step": 1371 }, { "epoch": 0.09773123909249563, "grad_norm": 2.5814361572265625, "learning_rate": 1.8586674494389653e-05, "loss": 0.4229, "step": 1372 }, { "epoch": 0.09780247177404994, "grad_norm": 4.6123576164245605, "learning_rate": 1.858466061581736e-05, "loss": 0.6201, "step": 1373 }, { "epoch": 0.09787370445560423, "grad_norm": 6.1385416984558105, "learning_rate": 1.858264541272033e-05, "loss": 0.3283, "step": 1374 }, { "epoch": 0.09794493713715853, "grad_norm": 2.977623462677002, "learning_rate": 1.8580628885409502e-05, "loss": 0.5115, "step": 1375 }, { "epoch": 0.09801616981871282, "grad_norm": 5.7976975440979, "learning_rate": 1.8578611034196e-05, "loss": 0.9019, "step": 1376 }, { "epoch": 0.09808740250026712, "grad_norm": 2.520535945892334, "learning_rate": 1.8576591859391158e-05, "loss": 0.4467, "step": 1377 }, { "epoch": 0.09815863518182141, "grad_norm": 7.008199214935303, "learning_rate": 1.857457136130651e-05, "loss": 0.4716, "step": 1378 }, { "epoch": 0.09822986786337572, "grad_norm": 8.794989585876465, "learning_rate": 1.857254954025381e-05, "loss": 0.1044, "step": 1379 }, { "epoch": 0.09830110054493002, "grad_norm": 2.6683149337768555, "learning_rate": 1.857052639654499e-05, "loss": 0.4308, "step": 1380 }, { "epoch": 0.09837233322648431, "grad_norm": 3.2887816429138184, "learning_rate": 1.8568501930492204e-05, "loss": 0.5448, "step": 1381 }, { "epoch": 0.09844356590803861, "grad_norm": 3.8534648418426514, "learning_rate": 1.8566476142407814e-05, "loss": 0.5486, "step": 1382 }, { "epoch": 0.0985147985895929, "grad_norm": 2.4694948196411133, "learning_rate": 1.856444903260437e-05, "loss": 0.523, "step": 1383 }, { "epoch": 0.0985860312711472, "grad_norm": 2.5473790168762207, "learning_rate": 1.856242060139464e-05, "loss": 0.4636, "step": 1384 }, { "epoch": 0.0986572639527015, "grad_norm": 2.4368362426757812, "learning_rate": 1.8560390849091585e-05, "loss": 0.3108, "step": 1385 }, { "epoch": 0.0987284966342558, "grad_norm": 3.0573065280914307, "learning_rate": 1.8558359776008377e-05, "loss": 0.4586, "step": 1386 }, { "epoch": 0.09879972931581009, "grad_norm": 5.392340183258057, "learning_rate": 1.855632738245839e-05, "loss": 0.2951, "step": 1387 }, { "epoch": 0.09887096199736439, "grad_norm": 1.8787267208099365, "learning_rate": 1.8554293668755203e-05, "loss": 0.2586, "step": 1388 }, { "epoch": 0.09894219467891868, "grad_norm": 2.89555025100708, "learning_rate": 1.855225863521259e-05, "loss": 0.3366, "step": 1389 }, { "epoch": 0.09901342736047299, "grad_norm": 3.1434195041656494, "learning_rate": 1.8550222282144544e-05, "loss": 0.6729, "step": 1390 }, { "epoch": 0.09908466004202728, "grad_norm": 5.7656989097595215, "learning_rate": 1.854818460986525e-05, "loss": 0.5473, "step": 1391 }, { "epoch": 0.09915589272358158, "grad_norm": 4.378120422363281, "learning_rate": 1.85461456186891e-05, "loss": 0.0853, "step": 1392 }, { "epoch": 0.09922712540513588, "grad_norm": 0.7779933214187622, "learning_rate": 1.8544105308930688e-05, "loss": 0.0306, "step": 1393 }, { "epoch": 0.09929835808669017, "grad_norm": 2.880258083343506, "learning_rate": 1.8542063680904818e-05, "loss": 0.3409, "step": 1394 }, { "epoch": 0.09936959076824448, "grad_norm": 2.4548428058624268, "learning_rate": 1.8540020734926483e-05, "loss": 0.4705, "step": 1395 }, { "epoch": 0.09944082344979877, "grad_norm": 5.782365322113037, "learning_rate": 1.85379764713109e-05, "loss": 0.6213, "step": 1396 }, { "epoch": 0.09951205613135307, "grad_norm": 3.8529131412506104, "learning_rate": 1.8535930890373467e-05, "loss": 0.2804, "step": 1397 }, { "epoch": 0.09958328881290736, "grad_norm": 2.5643415451049805, "learning_rate": 1.85338839924298e-05, "loss": 0.4689, "step": 1398 }, { "epoch": 0.09965452149446166, "grad_norm": 2.432931423187256, "learning_rate": 1.853183577779572e-05, "loss": 0.4635, "step": 1399 }, { "epoch": 0.09972575417601595, "grad_norm": 3.464775562286377, "learning_rate": 1.8529786246787235e-05, "loss": 0.5861, "step": 1400 }, { "epoch": 0.09979698685757026, "grad_norm": 5.507517337799072, "learning_rate": 1.8527735399720575e-05, "loss": 0.2944, "step": 1401 }, { "epoch": 0.09986821953912454, "grad_norm": 2.546436071395874, "learning_rate": 1.852568323691216e-05, "loss": 0.2378, "step": 1402 }, { "epoch": 0.09993945222067885, "grad_norm": 4.090466499328613, "learning_rate": 1.8523629758678618e-05, "loss": 0.5263, "step": 1403 }, { "epoch": 0.10001068490223314, "grad_norm": 2.6235930919647217, "learning_rate": 1.8521574965336783e-05, "loss": 0.8099, "step": 1404 }, { "epoch": 0.10008191758378744, "grad_norm": 6.423054218292236, "learning_rate": 1.8519518857203686e-05, "loss": 0.8722, "step": 1405 }, { "epoch": 0.10015315026534174, "grad_norm": 5.25217866897583, "learning_rate": 1.8517461434596563e-05, "loss": 0.8113, "step": 1406 }, { "epoch": 0.10022438294689603, "grad_norm": 3.57761812210083, "learning_rate": 1.851540269783285e-05, "loss": 0.4032, "step": 1407 }, { "epoch": 0.10029561562845034, "grad_norm": 1.3502016067504883, "learning_rate": 1.8513342647230197e-05, "loss": 0.1175, "step": 1408 }, { "epoch": 0.10036684831000463, "grad_norm": 8.48971176147461, "learning_rate": 1.8511281283106442e-05, "loss": 0.878, "step": 1409 }, { "epoch": 0.10043808099155893, "grad_norm": 2.8034520149230957, "learning_rate": 1.850921860577964e-05, "loss": 0.6515, "step": 1410 }, { "epoch": 0.10050931367311322, "grad_norm": 3.4831035137176514, "learning_rate": 1.8507154615568027e-05, "loss": 0.2744, "step": 1411 }, { "epoch": 0.10058054635466752, "grad_norm": 5.1472554206848145, "learning_rate": 1.8505089312790067e-05, "loss": 0.8195, "step": 1412 }, { "epoch": 0.10065177903622181, "grad_norm": 3.1493747234344482, "learning_rate": 1.850302269776441e-05, "loss": 0.7112, "step": 1413 }, { "epoch": 0.10072301171777612, "grad_norm": 3.6091184616088867, "learning_rate": 1.8500954770809915e-05, "loss": 0.7442, "step": 1414 }, { "epoch": 0.10079424439933041, "grad_norm": 2.361109972000122, "learning_rate": 1.8498885532245643e-05, "loss": 0.455, "step": 1415 }, { "epoch": 0.10086547708088471, "grad_norm": 2.8218488693237305, "learning_rate": 1.8496814982390856e-05, "loss": 0.1192, "step": 1416 }, { "epoch": 0.100936709762439, "grad_norm": 2.678875684738159, "learning_rate": 1.8494743121565015e-05, "loss": 0.5647, "step": 1417 }, { "epoch": 0.1010079424439933, "grad_norm": 3.930060863494873, "learning_rate": 1.8492669950087792e-05, "loss": 0.4774, "step": 1418 }, { "epoch": 0.10107917512554761, "grad_norm": 2.287843942642212, "learning_rate": 1.849059546827905e-05, "loss": 0.3236, "step": 1419 }, { "epoch": 0.1011504078071019, "grad_norm": 2.7285056114196777, "learning_rate": 1.8488519676458868e-05, "loss": 0.5288, "step": 1420 }, { "epoch": 0.1012216404886562, "grad_norm": 3.8738842010498047, "learning_rate": 1.848644257494751e-05, "loss": 0.1238, "step": 1421 }, { "epoch": 0.10129287317021049, "grad_norm": 4.581371784210205, "learning_rate": 1.8484364164065457e-05, "loss": 0.795, "step": 1422 }, { "epoch": 0.1013641058517648, "grad_norm": 3.595460891723633, "learning_rate": 1.8482284444133388e-05, "loss": 0.401, "step": 1423 }, { "epoch": 0.10143533853331908, "grad_norm": 3.2866227626800537, "learning_rate": 1.848020341547218e-05, "loss": 0.6475, "step": 1424 }, { "epoch": 0.10150657121487339, "grad_norm": 2.9653756618499756, "learning_rate": 1.8478121078402914e-05, "loss": 0.5046, "step": 1425 }, { "epoch": 0.10157780389642768, "grad_norm": 3.3130593299865723, "learning_rate": 1.847603743324687e-05, "loss": 0.4533, "step": 1426 }, { "epoch": 0.10164903657798198, "grad_norm": 2.439343214035034, "learning_rate": 1.847395248032554e-05, "loss": 0.283, "step": 1427 }, { "epoch": 0.10172026925953627, "grad_norm": 5.452528476715088, "learning_rate": 1.8471866219960604e-05, "loss": 0.7837, "step": 1428 }, { "epoch": 0.10179150194109057, "grad_norm": 5.953334331512451, "learning_rate": 1.8469778652473955e-05, "loss": 0.5233, "step": 1429 }, { "epoch": 0.10186273462264486, "grad_norm": 2.4642395973205566, "learning_rate": 1.8467689778187684e-05, "loss": 0.3562, "step": 1430 }, { "epoch": 0.10193396730419917, "grad_norm": 3.4748167991638184, "learning_rate": 1.8465599597424076e-05, "loss": 0.6939, "step": 1431 }, { "epoch": 0.10200519998575347, "grad_norm": 2.4553396701812744, "learning_rate": 1.8463508110505635e-05, "loss": 0.2818, "step": 1432 }, { "epoch": 0.10207643266730776, "grad_norm": 4.128581523895264, "learning_rate": 1.8461415317755046e-05, "loss": 0.5788, "step": 1433 }, { "epoch": 0.10214766534886206, "grad_norm": 3.1052703857421875, "learning_rate": 1.8459321219495207e-05, "loss": 0.3534, "step": 1434 }, { "epoch": 0.10221889803041635, "grad_norm": 2.9588615894317627, "learning_rate": 1.845722581604922e-05, "loss": 0.6679, "step": 1435 }, { "epoch": 0.10229013071197066, "grad_norm": 5.368716239929199, "learning_rate": 1.8455129107740383e-05, "loss": 0.5956, "step": 1436 }, { "epoch": 0.10236136339352495, "grad_norm": 4.254218101501465, "learning_rate": 1.8453031094892196e-05, "loss": 0.4028, "step": 1437 }, { "epoch": 0.10243259607507925, "grad_norm": 5.193363189697266, "learning_rate": 1.845093177782836e-05, "loss": 0.8294, "step": 1438 }, { "epoch": 0.10250382875663354, "grad_norm": 1.7985156774520874, "learning_rate": 1.844883115687278e-05, "loss": 0.3567, "step": 1439 }, { "epoch": 0.10257506143818784, "grad_norm": 3.957906484603882, "learning_rate": 1.8446729232349557e-05, "loss": 0.7277, "step": 1440 }, { "epoch": 0.10264629411974213, "grad_norm": 3.476916790008545, "learning_rate": 1.8444626004582998e-05, "loss": 0.3242, "step": 1441 }, { "epoch": 0.10271752680129644, "grad_norm": 3.268742084503174, "learning_rate": 1.8442521473897606e-05, "loss": 0.7261, "step": 1442 }, { "epoch": 0.10278875948285073, "grad_norm": 2.6532630920410156, "learning_rate": 1.8440415640618097e-05, "loss": 0.3419, "step": 1443 }, { "epoch": 0.10285999216440503, "grad_norm": 3.776421308517456, "learning_rate": 1.843830850506937e-05, "loss": 0.8115, "step": 1444 }, { "epoch": 0.10293122484595933, "grad_norm": 3.1364190578460693, "learning_rate": 1.843620006757654e-05, "loss": 0.4773, "step": 1445 }, { "epoch": 0.10300245752751362, "grad_norm": 1.9481444358825684, "learning_rate": 1.8434090328464916e-05, "loss": 0.2768, "step": 1446 }, { "epoch": 0.10307369020906793, "grad_norm": 2.4708094596862793, "learning_rate": 1.843197928806001e-05, "loss": 0.4392, "step": 1447 }, { "epoch": 0.10314492289062221, "grad_norm": 1.2828149795532227, "learning_rate": 1.842986694668753e-05, "loss": 0.1188, "step": 1448 }, { "epoch": 0.10321615557217652, "grad_norm": 11.096900939941406, "learning_rate": 1.8427753304673395e-05, "loss": 0.2442, "step": 1449 }, { "epoch": 0.10328738825373081, "grad_norm": 1.5540435314178467, "learning_rate": 1.842563836234371e-05, "loss": 0.0982, "step": 1450 }, { "epoch": 0.10335862093528511, "grad_norm": 2.5053248405456543, "learning_rate": 1.8423522120024793e-05, "loss": 0.2956, "step": 1451 }, { "epoch": 0.1034298536168394, "grad_norm": 2.9730169773101807, "learning_rate": 1.842140457804316e-05, "loss": 0.5996, "step": 1452 }, { "epoch": 0.1035010862983937, "grad_norm": 2.902134418487549, "learning_rate": 1.8419285736725524e-05, "loss": 0.4725, "step": 1453 }, { "epoch": 0.103572318979948, "grad_norm": 2.478632926940918, "learning_rate": 1.8417165596398803e-05, "loss": 0.0811, "step": 1454 }, { "epoch": 0.1036435516615023, "grad_norm": 3.112293243408203, "learning_rate": 1.8415044157390105e-05, "loss": 0.3918, "step": 1455 }, { "epoch": 0.10371478434305659, "grad_norm": 3.268944263458252, "learning_rate": 1.8412921420026757e-05, "loss": 0.3402, "step": 1456 }, { "epoch": 0.10378601702461089, "grad_norm": 2.8977468013763428, "learning_rate": 1.8410797384636267e-05, "loss": 0.2587, "step": 1457 }, { "epoch": 0.1038572497061652, "grad_norm": 2.7757365703582764, "learning_rate": 1.8408672051546355e-05, "loss": 0.5225, "step": 1458 }, { "epoch": 0.10392848238771948, "grad_norm": 2.64327335357666, "learning_rate": 1.840654542108494e-05, "loss": 0.4195, "step": 1459 }, { "epoch": 0.10399971506927379, "grad_norm": 2.663177967071533, "learning_rate": 1.8404417493580138e-05, "loss": 0.6953, "step": 1460 }, { "epoch": 0.10407094775082808, "grad_norm": 2.3424434661865234, "learning_rate": 1.840228826936026e-05, "loss": 0.4796, "step": 1461 }, { "epoch": 0.10414218043238238, "grad_norm": 39.079612731933594, "learning_rate": 1.8400157748753835e-05, "loss": 0.594, "step": 1462 }, { "epoch": 0.10421341311393667, "grad_norm": 2.5604755878448486, "learning_rate": 1.839802593208957e-05, "loss": 0.4859, "step": 1463 }, { "epoch": 0.10428464579549097, "grad_norm": 3.482391357421875, "learning_rate": 1.839589281969639e-05, "loss": 0.8069, "step": 1464 }, { "epoch": 0.10435587847704526, "grad_norm": 4.722891330718994, "learning_rate": 1.8393758411903406e-05, "loss": 0.5996, "step": 1465 }, { "epoch": 0.10442711115859957, "grad_norm": 1.8752844333648682, "learning_rate": 1.839162270903994e-05, "loss": 0.1187, "step": 1466 }, { "epoch": 0.10449834384015386, "grad_norm": 3.4651243686676025, "learning_rate": 1.8389485711435505e-05, "loss": 0.7729, "step": 1467 }, { "epoch": 0.10456957652170816, "grad_norm": 4.943392753601074, "learning_rate": 1.8387347419419824e-05, "loss": 0.7929, "step": 1468 }, { "epoch": 0.10464080920326246, "grad_norm": 3.3845608234405518, "learning_rate": 1.8385207833322805e-05, "loss": 0.3286, "step": 1469 }, { "epoch": 0.10471204188481675, "grad_norm": 4.551529407501221, "learning_rate": 1.838306695347457e-05, "loss": 0.5948, "step": 1470 }, { "epoch": 0.10478327456637106, "grad_norm": 4.132630348205566, "learning_rate": 1.8380924780205434e-05, "loss": 0.7537, "step": 1471 }, { "epoch": 0.10485450724792535, "grad_norm": 4.111681938171387, "learning_rate": 1.837878131384591e-05, "loss": 0.7429, "step": 1472 }, { "epoch": 0.10492573992947965, "grad_norm": 10.041197776794434, "learning_rate": 1.8376636554726713e-05, "loss": 0.4244, "step": 1473 }, { "epoch": 0.10499697261103394, "grad_norm": 2.420588254928589, "learning_rate": 1.8374490503178758e-05, "loss": 0.4673, "step": 1474 }, { "epoch": 0.10506820529258824, "grad_norm": 4.335046768188477, "learning_rate": 1.837234315953316e-05, "loss": 0.7385, "step": 1475 }, { "epoch": 0.10513943797414253, "grad_norm": 2.3528146743774414, "learning_rate": 1.8370194524121232e-05, "loss": 0.3101, "step": 1476 }, { "epoch": 0.10521067065569684, "grad_norm": 3.135192632675171, "learning_rate": 1.8368044597274483e-05, "loss": 0.4294, "step": 1477 }, { "epoch": 0.10528190333725113, "grad_norm": 4.988473415374756, "learning_rate": 1.8365893379324628e-05, "loss": 0.9027, "step": 1478 }, { "epoch": 0.10535313601880543, "grad_norm": 3.866879463195801, "learning_rate": 1.8363740870603578e-05, "loss": 0.5725, "step": 1479 }, { "epoch": 0.10542436870035972, "grad_norm": 2.676614761352539, "learning_rate": 1.836158707144344e-05, "loss": 0.3708, "step": 1480 }, { "epoch": 0.10549560138191402, "grad_norm": 8.485868453979492, "learning_rate": 1.8359431982176526e-05, "loss": 0.9007, "step": 1481 }, { "epoch": 0.10556683406346833, "grad_norm": 3.3244242668151855, "learning_rate": 1.835727560313534e-05, "loss": 0.2736, "step": 1482 }, { "epoch": 0.10563806674502262, "grad_norm": 8.19057846069336, "learning_rate": 1.8355117934652593e-05, "loss": 0.2611, "step": 1483 }, { "epoch": 0.10570929942657692, "grad_norm": 8.770209312438965, "learning_rate": 1.835295897706119e-05, "loss": 0.3837, "step": 1484 }, { "epoch": 0.10578053210813121, "grad_norm": 3.725705623626709, "learning_rate": 1.8350798730694234e-05, "loss": 0.3313, "step": 1485 }, { "epoch": 0.10585176478968551, "grad_norm": 2.5951907634735107, "learning_rate": 1.8348637195885033e-05, "loss": 0.5962, "step": 1486 }, { "epoch": 0.1059229974712398, "grad_norm": 2.8150272369384766, "learning_rate": 1.8346474372967086e-05, "loss": 0.464, "step": 1487 }, { "epoch": 0.1059942301527941, "grad_norm": 2.608823776245117, "learning_rate": 1.8344310262274093e-05, "loss": 0.318, "step": 1488 }, { "epoch": 0.1060654628343484, "grad_norm": 2.323730707168579, "learning_rate": 1.8342144864139962e-05, "loss": 0.2816, "step": 1489 }, { "epoch": 0.1061366955159027, "grad_norm": 3.238983154296875, "learning_rate": 1.833997817889878e-05, "loss": 0.6473, "step": 1490 }, { "epoch": 0.10620792819745699, "grad_norm": 2.6696674823760986, "learning_rate": 1.8337810206884853e-05, "loss": 0.5516, "step": 1491 }, { "epoch": 0.10627916087901129, "grad_norm": 3.747286558151245, "learning_rate": 1.8335640948432675e-05, "loss": 0.4743, "step": 1492 }, { "epoch": 0.10635039356056558, "grad_norm": 3.8718836307525635, "learning_rate": 1.8333470403876935e-05, "loss": 0.5756, "step": 1493 }, { "epoch": 0.10642162624211988, "grad_norm": 3.0988166332244873, "learning_rate": 1.8331298573552534e-05, "loss": 0.6344, "step": 1494 }, { "epoch": 0.10649285892367419, "grad_norm": 2.887407064437866, "learning_rate": 1.8329125457794557e-05, "loss": 0.4008, "step": 1495 }, { "epoch": 0.10656409160522848, "grad_norm": 3.145056962966919, "learning_rate": 1.8326951056938295e-05, "loss": 0.5078, "step": 1496 }, { "epoch": 0.10663532428678278, "grad_norm": 4.217214584350586, "learning_rate": 1.832477537131924e-05, "loss": 0.6858, "step": 1497 }, { "epoch": 0.10670655696833707, "grad_norm": 3.4741570949554443, "learning_rate": 1.8322598401273067e-05, "loss": 0.6119, "step": 1498 }, { "epoch": 0.10677778964989137, "grad_norm": 2.4080545902252197, "learning_rate": 1.8320420147135674e-05, "loss": 0.3556, "step": 1499 }, { "epoch": 0.10684902233144566, "grad_norm": 5.046425819396973, "learning_rate": 1.831824060924313e-05, "loss": 0.6769, "step": 1500 }, { "epoch": 0.10692025501299997, "grad_norm": 1.6142845153808594, "learning_rate": 1.8316059787931725e-05, "loss": 0.2351, "step": 1501 }, { "epoch": 0.10699148769455426, "grad_norm": 3.3941686153411865, "learning_rate": 1.831387768353793e-05, "loss": 0.7886, "step": 1502 }, { "epoch": 0.10706272037610856, "grad_norm": 3.390230417251587, "learning_rate": 1.831169429639843e-05, "loss": 0.446, "step": 1503 }, { "epoch": 0.10713395305766285, "grad_norm": 2.358793258666992, "learning_rate": 1.830950962685009e-05, "loss": 0.3669, "step": 1504 }, { "epoch": 0.10720518573921715, "grad_norm": 3.001603126525879, "learning_rate": 1.8307323675229986e-05, "loss": 0.1807, "step": 1505 }, { "epoch": 0.10727641842077144, "grad_norm": 5.869502544403076, "learning_rate": 1.8305136441875388e-05, "loss": 1.0502, "step": 1506 }, { "epoch": 0.10734765110232575, "grad_norm": 2.375246286392212, "learning_rate": 1.8302947927123767e-05, "loss": 0.3577, "step": 1507 }, { "epoch": 0.10741888378388005, "grad_norm": 3.502385377883911, "learning_rate": 1.8300758131312778e-05, "loss": 0.4333, "step": 1508 }, { "epoch": 0.10749011646543434, "grad_norm": 3.432023525238037, "learning_rate": 1.8298567054780295e-05, "loss": 0.7505, "step": 1509 }, { "epoch": 0.10756134914698864, "grad_norm": 3.296227216720581, "learning_rate": 1.8296374697864376e-05, "loss": 0.4071, "step": 1510 }, { "epoch": 0.10763258182854293, "grad_norm": 3.4562792778015137, "learning_rate": 1.8294181060903275e-05, "loss": 0.4968, "step": 1511 }, { "epoch": 0.10770381451009724, "grad_norm": 3.406933546066284, "learning_rate": 1.829198614423545e-05, "loss": 0.5876, "step": 1512 }, { "epoch": 0.10777504719165153, "grad_norm": 5.318166732788086, "learning_rate": 1.8289789948199553e-05, "loss": 1.0607, "step": 1513 }, { "epoch": 0.10784627987320583, "grad_norm": 2.769782543182373, "learning_rate": 1.8287592473134436e-05, "loss": 0.4264, "step": 1514 }, { "epoch": 0.10791751255476012, "grad_norm": 4.821406841278076, "learning_rate": 1.8285393719379146e-05, "loss": 0.4811, "step": 1515 }, { "epoch": 0.10798874523631442, "grad_norm": 2.2026498317718506, "learning_rate": 1.8283193687272927e-05, "loss": 0.3796, "step": 1516 }, { "epoch": 0.10805997791786871, "grad_norm": 3.194061756134033, "learning_rate": 1.8280992377155224e-05, "loss": 0.6975, "step": 1517 }, { "epoch": 0.10813121059942302, "grad_norm": 1.9764872789382935, "learning_rate": 1.8278789789365675e-05, "loss": 0.1849, "step": 1518 }, { "epoch": 0.1082024432809773, "grad_norm": 1.9262995719909668, "learning_rate": 1.8276585924244113e-05, "loss": 0.1571, "step": 1519 }, { "epoch": 0.10827367596253161, "grad_norm": 2.7954676151275635, "learning_rate": 1.827438078213058e-05, "loss": 0.3409, "step": 1520 }, { "epoch": 0.10834490864408591, "grad_norm": 3.630375385284424, "learning_rate": 1.82721743633653e-05, "loss": 0.4026, "step": 1521 }, { "epoch": 0.1084161413256402, "grad_norm": 3.711257219314575, "learning_rate": 1.8269966668288704e-05, "loss": 0.35, "step": 1522 }, { "epoch": 0.1084873740071945, "grad_norm": 3.106813907623291, "learning_rate": 1.8267757697241415e-05, "loss": 0.5182, "step": 1523 }, { "epoch": 0.1085586066887488, "grad_norm": 3.777282476425171, "learning_rate": 1.826554745056425e-05, "loss": 0.6265, "step": 1524 }, { "epoch": 0.1086298393703031, "grad_norm": 3.4298007488250732, "learning_rate": 1.8263335928598237e-05, "loss": 0.696, "step": 1525 }, { "epoch": 0.10870107205185739, "grad_norm": 4.956480979919434, "learning_rate": 1.8261123131684587e-05, "loss": 0.4357, "step": 1526 }, { "epoch": 0.10877230473341169, "grad_norm": 2.9299042224884033, "learning_rate": 1.8258909060164706e-05, "loss": 0.4477, "step": 1527 }, { "epoch": 0.10884353741496598, "grad_norm": 3.842271566390991, "learning_rate": 1.8256693714380214e-05, "loss": 0.7388, "step": 1528 }, { "epoch": 0.10891477009652029, "grad_norm": 2.634791135787964, "learning_rate": 1.8254477094672903e-05, "loss": 0.5771, "step": 1529 }, { "epoch": 0.10898600277807458, "grad_norm": 4.50253438949585, "learning_rate": 1.8252259201384786e-05, "loss": 0.6138, "step": 1530 }, { "epoch": 0.10905723545962888, "grad_norm": 3.912045478820801, "learning_rate": 1.825004003485805e-05, "loss": 0.7164, "step": 1531 }, { "epoch": 0.10912846814118317, "grad_norm": 2.827897548675537, "learning_rate": 1.8247819595435102e-05, "loss": 0.4539, "step": 1532 }, { "epoch": 0.10919970082273747, "grad_norm": 3.0472395420074463, "learning_rate": 1.8245597883458524e-05, "loss": 0.6297, "step": 1533 }, { "epoch": 0.10927093350429178, "grad_norm": 1.93716561794281, "learning_rate": 1.8243374899271103e-05, "loss": 0.3447, "step": 1534 }, { "epoch": 0.10934216618584607, "grad_norm": 2.5568783283233643, "learning_rate": 1.8241150643215828e-05, "loss": 0.2495, "step": 1535 }, { "epoch": 0.10941339886740037, "grad_norm": 5.119356155395508, "learning_rate": 1.823892511563588e-05, "loss": 0.4635, "step": 1536 }, { "epoch": 0.10948463154895466, "grad_norm": 2.592417001724243, "learning_rate": 1.8236698316874625e-05, "loss": 0.6354, "step": 1537 }, { "epoch": 0.10955586423050896, "grad_norm": 4.767209529876709, "learning_rate": 1.8234470247275644e-05, "loss": 0.8339, "step": 1538 }, { "epoch": 0.10962709691206325, "grad_norm": 4.290926933288574, "learning_rate": 1.8232240907182702e-05, "loss": 0.6997, "step": 1539 }, { "epoch": 0.10969832959361756, "grad_norm": 2.513141632080078, "learning_rate": 1.8230010296939764e-05, "loss": 0.4286, "step": 1540 }, { "epoch": 0.10976956227517184, "grad_norm": 2.775249481201172, "learning_rate": 1.822777841689099e-05, "loss": 0.5801, "step": 1541 }, { "epoch": 0.10984079495672615, "grad_norm": 3.2870917320251465, "learning_rate": 1.8225545267380736e-05, "loss": 0.1836, "step": 1542 }, { "epoch": 0.10991202763828044, "grad_norm": 2.5889499187469482, "learning_rate": 1.8223310848753552e-05, "loss": 0.4994, "step": 1543 }, { "epoch": 0.10998326031983474, "grad_norm": 4.653005123138428, "learning_rate": 1.822107516135419e-05, "loss": 0.2812, "step": 1544 }, { "epoch": 0.11005449300138903, "grad_norm": 4.377760887145996, "learning_rate": 1.821883820552759e-05, "loss": 0.394, "step": 1545 }, { "epoch": 0.11012572568294333, "grad_norm": 3.900918960571289, "learning_rate": 1.8216599981618895e-05, "loss": 0.3303, "step": 1546 }, { "epoch": 0.11019695836449764, "grad_norm": 3.242931604385376, "learning_rate": 1.8214360489973435e-05, "loss": 0.6335, "step": 1547 }, { "epoch": 0.11026819104605193, "grad_norm": 3.7367048263549805, "learning_rate": 1.8212119730936745e-05, "loss": 0.1906, "step": 1548 }, { "epoch": 0.11033942372760623, "grad_norm": 3.2181153297424316, "learning_rate": 1.8209877704854547e-05, "loss": 0.6364, "step": 1549 }, { "epoch": 0.11041065640916052, "grad_norm": 5.39577054977417, "learning_rate": 1.8207634412072765e-05, "loss": 0.4699, "step": 1550 }, { "epoch": 0.11048188909071482, "grad_norm": 4.1963396072387695, "learning_rate": 1.8205389852937516e-05, "loss": 0.5303, "step": 1551 }, { "epoch": 0.11055312177226911, "grad_norm": 3.0388083457946777, "learning_rate": 1.820314402779511e-05, "loss": 0.6519, "step": 1552 }, { "epoch": 0.11062435445382342, "grad_norm": 5.5630974769592285, "learning_rate": 1.820089693699206e-05, "loss": 0.7326, "step": 1553 }, { "epoch": 0.11069558713537771, "grad_norm": 4.372586727142334, "learning_rate": 1.8198648580875063e-05, "loss": 0.9321, "step": 1554 }, { "epoch": 0.11076681981693201, "grad_norm": 3.5760059356689453, "learning_rate": 1.8196398959791022e-05, "loss": 0.6116, "step": 1555 }, { "epoch": 0.1108380524984863, "grad_norm": 2.7976179122924805, "learning_rate": 1.8194148074087025e-05, "loss": 0.1743, "step": 1556 }, { "epoch": 0.1109092851800406, "grad_norm": 3.5373754501342773, "learning_rate": 1.8191895924110364e-05, "loss": 0.6141, "step": 1557 }, { "epoch": 0.1109805178615949, "grad_norm": 3.044980049133301, "learning_rate": 1.8189642510208525e-05, "loss": 0.3036, "step": 1558 }, { "epoch": 0.1110517505431492, "grad_norm": 4.1984639167785645, "learning_rate": 1.818738783272918e-05, "loss": 0.527, "step": 1559 }, { "epoch": 0.1111229832247035, "grad_norm": 3.356112480163574, "learning_rate": 1.818513189202021e-05, "loss": 0.787, "step": 1560 }, { "epoch": 0.11119421590625779, "grad_norm": 7.121969699859619, "learning_rate": 1.8182874688429674e-05, "loss": 0.5005, "step": 1561 }, { "epoch": 0.1112654485878121, "grad_norm": 3.177220344543457, "learning_rate": 1.8180616222305847e-05, "loss": 0.3104, "step": 1562 }, { "epoch": 0.11133668126936638, "grad_norm": 2.916890859603882, "learning_rate": 1.817835649399718e-05, "loss": 0.3343, "step": 1563 }, { "epoch": 0.11140791395092069, "grad_norm": 2.12516713142395, "learning_rate": 1.817609550385232e-05, "loss": 0.254, "step": 1564 }, { "epoch": 0.11147914663247498, "grad_norm": 2.2931456565856934, "learning_rate": 1.817383325222013e-05, "loss": 0.2995, "step": 1565 }, { "epoch": 0.11155037931402928, "grad_norm": 4.362823009490967, "learning_rate": 1.8171569739449642e-05, "loss": 0.4618, "step": 1566 }, { "epoch": 0.11162161199558357, "grad_norm": 3.7937769889831543, "learning_rate": 1.8169304965890088e-05, "loss": 0.5034, "step": 1567 }, { "epoch": 0.11169284467713787, "grad_norm": 7.148209571838379, "learning_rate": 1.816703893189091e-05, "loss": 0.8176, "step": 1568 }, { "epoch": 0.11176407735869216, "grad_norm": 2.7721385955810547, "learning_rate": 1.816477163780173e-05, "loss": 0.3138, "step": 1569 }, { "epoch": 0.11183531004024647, "grad_norm": 2.2968533039093018, "learning_rate": 1.8162503083972365e-05, "loss": 0.5508, "step": 1570 }, { "epoch": 0.11190654272180076, "grad_norm": 3.5449023246765137, "learning_rate": 1.816023327075283e-05, "loss": 0.4999, "step": 1571 }, { "epoch": 0.11197777540335506, "grad_norm": 2.1483683586120605, "learning_rate": 1.815796219849334e-05, "loss": 0.3962, "step": 1572 }, { "epoch": 0.11204900808490936, "grad_norm": 2.4321889877319336, "learning_rate": 1.815568986754429e-05, "loss": 0.5294, "step": 1573 }, { "epoch": 0.11212024076646365, "grad_norm": 2.104543924331665, "learning_rate": 1.815341627825628e-05, "loss": 0.5365, "step": 1574 }, { "epoch": 0.11219147344801796, "grad_norm": 3.687098741531372, "learning_rate": 1.8151141430980106e-05, "loss": 0.369, "step": 1575 }, { "epoch": 0.11226270612957225, "grad_norm": 2.4909229278564453, "learning_rate": 1.814886532606675e-05, "loss": 0.5282, "step": 1576 }, { "epoch": 0.11233393881112655, "grad_norm": 4.069705486297607, "learning_rate": 1.8146587963867388e-05, "loss": 0.5738, "step": 1577 }, { "epoch": 0.11240517149268084, "grad_norm": 3.2739691734313965, "learning_rate": 1.8144309344733397e-05, "loss": 0.5726, "step": 1578 }, { "epoch": 0.11247640417423514, "grad_norm": 3.7428600788116455, "learning_rate": 1.8142029469016345e-05, "loss": 0.9252, "step": 1579 }, { "epoch": 0.11254763685578943, "grad_norm": 4.322535037994385, "learning_rate": 1.8139748337067993e-05, "loss": 0.5595, "step": 1580 }, { "epoch": 0.11261886953734374, "grad_norm": 1.8438702821731567, "learning_rate": 1.8137465949240294e-05, "loss": 0.125, "step": 1581 }, { "epoch": 0.11269010221889803, "grad_norm": 4.167746543884277, "learning_rate": 1.8135182305885403e-05, "loss": 0.4191, "step": 1582 }, { "epoch": 0.11276133490045233, "grad_norm": 2.3527421951293945, "learning_rate": 1.8132897407355657e-05, "loss": 0.4669, "step": 1583 }, { "epoch": 0.11283256758200662, "grad_norm": 4.2916975021362305, "learning_rate": 1.813061125400359e-05, "loss": 0.4149, "step": 1584 }, { "epoch": 0.11290380026356092, "grad_norm": 3.426138162612915, "learning_rate": 1.812832384618194e-05, "loss": 0.7898, "step": 1585 }, { "epoch": 0.11297503294511523, "grad_norm": 4.755450248718262, "learning_rate": 1.8126035184243623e-05, "loss": 0.4365, "step": 1586 }, { "epoch": 0.11304626562666951, "grad_norm": 2.5578432083129883, "learning_rate": 1.812374526854176e-05, "loss": 0.6335, "step": 1587 }, { "epoch": 0.11311749830822382, "grad_norm": 2.8531601428985596, "learning_rate": 1.812145409942966e-05, "loss": 0.4207, "step": 1588 }, { "epoch": 0.11318873098977811, "grad_norm": 2.9627456665039062, "learning_rate": 1.8119161677260827e-05, "loss": 0.7156, "step": 1589 }, { "epoch": 0.11325996367133241, "grad_norm": 2.4673361778259277, "learning_rate": 1.811686800238896e-05, "loss": 0.3747, "step": 1590 }, { "epoch": 0.1133311963528867, "grad_norm": 2.0657618045806885, "learning_rate": 1.8114573075167947e-05, "loss": 0.3583, "step": 1591 }, { "epoch": 0.113402429034441, "grad_norm": 2.656261444091797, "learning_rate": 1.8112276895951872e-05, "loss": 0.4665, "step": 1592 }, { "epoch": 0.1134736617159953, "grad_norm": 3.2002170085906982, "learning_rate": 1.8109979465095014e-05, "loss": 0.7777, "step": 1593 }, { "epoch": 0.1135448943975496, "grad_norm": 5.174727916717529, "learning_rate": 1.810768078295184e-05, "loss": 0.465, "step": 1594 }, { "epoch": 0.11361612707910389, "grad_norm": 3.4704017639160156, "learning_rate": 1.8105380849877013e-05, "loss": 0.2928, "step": 1595 }, { "epoch": 0.11368735976065819, "grad_norm": 2.3354029655456543, "learning_rate": 1.810307966622539e-05, "loss": 0.3379, "step": 1596 }, { "epoch": 0.11375859244221248, "grad_norm": 3.997392416000366, "learning_rate": 1.8100777232352022e-05, "loss": 0.5513, "step": 1597 }, { "epoch": 0.11382982512376678, "grad_norm": 3.3698647022247314, "learning_rate": 1.8098473548612146e-05, "loss": 0.4777, "step": 1598 }, { "epoch": 0.11390105780532109, "grad_norm": 4.334118366241455, "learning_rate": 1.8096168615361203e-05, "loss": 0.8749, "step": 1599 }, { "epoch": 0.11397229048687538, "grad_norm": 3.050067901611328, "learning_rate": 1.8093862432954815e-05, "loss": 0.659, "step": 1600 }, { "epoch": 0.11404352316842968, "grad_norm": 2.5288875102996826, "learning_rate": 1.809155500174881e-05, "loss": 0.285, "step": 1601 }, { "epoch": 0.11411475584998397, "grad_norm": 3.278409242630005, "learning_rate": 1.8089246322099188e-05, "loss": 0.8154, "step": 1602 }, { "epoch": 0.11418598853153827, "grad_norm": 2.3090291023254395, "learning_rate": 1.8086936394362165e-05, "loss": 0.0557, "step": 1603 }, { "epoch": 0.11425722121309256, "grad_norm": 4.357360363006592, "learning_rate": 1.808462521889413e-05, "loss": 0.6117, "step": 1604 }, { "epoch": 0.11432845389464687, "grad_norm": 4.38557243347168, "learning_rate": 1.8082312796051685e-05, "loss": 0.8092, "step": 1605 }, { "epoch": 0.11439968657620116, "grad_norm": 3.589102268218994, "learning_rate": 1.807999912619161e-05, "loss": 0.8788, "step": 1606 }, { "epoch": 0.11447091925775546, "grad_norm": 2.8890295028686523, "learning_rate": 1.807768420967087e-05, "loss": 0.3254, "step": 1607 }, { "epoch": 0.11454215193930975, "grad_norm": 3.422760248184204, "learning_rate": 1.8075368046846647e-05, "loss": 0.7364, "step": 1608 }, { "epoch": 0.11461338462086405, "grad_norm": 4.407214641571045, "learning_rate": 1.807305063807629e-05, "loss": 0.7009, "step": 1609 }, { "epoch": 0.11468461730241834, "grad_norm": 3.7371673583984375, "learning_rate": 1.8070731983717357e-05, "loss": 0.1461, "step": 1610 }, { "epoch": 0.11475584998397265, "grad_norm": 2.1817824840545654, "learning_rate": 1.8068412084127594e-05, "loss": 0.3121, "step": 1611 }, { "epoch": 0.11482708266552695, "grad_norm": 2.4425740242004395, "learning_rate": 1.8066090939664934e-05, "loss": 0.4367, "step": 1612 }, { "epoch": 0.11489831534708124, "grad_norm": 2.9885013103485107, "learning_rate": 1.8063768550687504e-05, "loss": 0.5313, "step": 1613 }, { "epoch": 0.11496954802863554, "grad_norm": 3.4931108951568604, "learning_rate": 1.806144491755363e-05, "loss": 0.6662, "step": 1614 }, { "epoch": 0.11504078071018983, "grad_norm": 3.3060755729675293, "learning_rate": 1.805912004062182e-05, "loss": 0.7017, "step": 1615 }, { "epoch": 0.11511201339174414, "grad_norm": 3.1128289699554443, "learning_rate": 1.8056793920250784e-05, "loss": 0.772, "step": 1616 }, { "epoch": 0.11518324607329843, "grad_norm": 2.72200870513916, "learning_rate": 1.805446655679941e-05, "loss": 0.5738, "step": 1617 }, { "epoch": 0.11525447875485273, "grad_norm": 2.303638458251953, "learning_rate": 1.8052137950626795e-05, "loss": 0.2509, "step": 1618 }, { "epoch": 0.11532571143640702, "grad_norm": 3.8613204956054688, "learning_rate": 1.8049808102092213e-05, "loss": 0.8147, "step": 1619 }, { "epoch": 0.11539694411796132, "grad_norm": 2.640094041824341, "learning_rate": 1.8047477011555142e-05, "loss": 0.5029, "step": 1620 }, { "epoch": 0.11546817679951561, "grad_norm": 4.122694492340088, "learning_rate": 1.804514467937524e-05, "loss": 0.6338, "step": 1621 }, { "epoch": 0.11553940948106992, "grad_norm": 2.9702484607696533, "learning_rate": 1.804281110591236e-05, "loss": 0.3955, "step": 1622 }, { "epoch": 0.1156106421626242, "grad_norm": 3.4977169036865234, "learning_rate": 1.804047629152655e-05, "loss": 0.4635, "step": 1623 }, { "epoch": 0.11568187484417851, "grad_norm": 3.628450393676758, "learning_rate": 1.8038140236578053e-05, "loss": 0.4371, "step": 1624 }, { "epoch": 0.11575310752573281, "grad_norm": 3.5462934970855713, "learning_rate": 1.803580294142729e-05, "loss": 0.8053, "step": 1625 }, { "epoch": 0.1158243402072871, "grad_norm": 1.9954164028167725, "learning_rate": 1.803346440643489e-05, "loss": 0.2672, "step": 1626 }, { "epoch": 0.1158955728888414, "grad_norm": 2.880156993865967, "learning_rate": 1.803112463196166e-05, "loss": 0.3467, "step": 1627 }, { "epoch": 0.1159668055703957, "grad_norm": 4.157865524291992, "learning_rate": 1.8028783618368603e-05, "loss": 0.7502, "step": 1628 }, { "epoch": 0.11603803825195, "grad_norm": 4.2515716552734375, "learning_rate": 1.8026441366016915e-05, "loss": 0.2943, "step": 1629 }, { "epoch": 0.11610927093350429, "grad_norm": 3.9217331409454346, "learning_rate": 1.8024097875267982e-05, "loss": 0.714, "step": 1630 }, { "epoch": 0.11618050361505859, "grad_norm": 4.641177177429199, "learning_rate": 1.8021753146483373e-05, "loss": 0.4465, "step": 1631 }, { "epoch": 0.11625173629661288, "grad_norm": 2.612713575363159, "learning_rate": 1.8019407180024867e-05, "loss": 0.2492, "step": 1632 }, { "epoch": 0.11632296897816718, "grad_norm": 1.6677166223526, "learning_rate": 1.8017059976254415e-05, "loss": 0.275, "step": 1633 }, { "epoch": 0.11639420165972147, "grad_norm": 1.6663864850997925, "learning_rate": 1.801471153553417e-05, "loss": 0.3989, "step": 1634 }, { "epoch": 0.11646543434127578, "grad_norm": 5.309479713439941, "learning_rate": 1.801236185822647e-05, "loss": 0.6116, "step": 1635 }, { "epoch": 0.11653666702283007, "grad_norm": 11.101628303527832, "learning_rate": 1.8010010944693846e-05, "loss": 0.1483, "step": 1636 }, { "epoch": 0.11660789970438437, "grad_norm": 1.990329384803772, "learning_rate": 1.8007658795299023e-05, "loss": 0.315, "step": 1637 }, { "epoch": 0.11667913238593867, "grad_norm": 2.4669413566589355, "learning_rate": 1.800530541040491e-05, "loss": 0.4503, "step": 1638 }, { "epoch": 0.11675036506749296, "grad_norm": 7.872953414916992, "learning_rate": 1.800295079037461e-05, "loss": 0.6454, "step": 1639 }, { "epoch": 0.11682159774904727, "grad_norm": 2.939554452896118, "learning_rate": 1.8000594935571416e-05, "loss": 0.5387, "step": 1640 }, { "epoch": 0.11689283043060156, "grad_norm": 3.7435591220855713, "learning_rate": 1.7998237846358812e-05, "loss": 0.5285, "step": 1641 }, { "epoch": 0.11696406311215586, "grad_norm": 3.6142797470092773, "learning_rate": 1.7995879523100478e-05, "loss": 0.58, "step": 1642 }, { "epoch": 0.11703529579371015, "grad_norm": 3.2767527103424072, "learning_rate": 1.7993519966160276e-05, "loss": 0.6059, "step": 1643 }, { "epoch": 0.11710652847526445, "grad_norm": 2.2991833686828613, "learning_rate": 1.7991159175902257e-05, "loss": 0.4866, "step": 1644 }, { "epoch": 0.11717776115681874, "grad_norm": 2.67952299118042, "learning_rate": 1.798879715269067e-05, "loss": 0.7029, "step": 1645 }, { "epoch": 0.11724899383837305, "grad_norm": 2.6602590084075928, "learning_rate": 1.7986433896889955e-05, "loss": 0.3798, "step": 1646 }, { "epoch": 0.11732022651992734, "grad_norm": 3.908963918685913, "learning_rate": 1.7984069408864733e-05, "loss": 0.869, "step": 1647 }, { "epoch": 0.11739145920148164, "grad_norm": 3.1878132820129395, "learning_rate": 1.798170368897982e-05, "loss": 0.5261, "step": 1648 }, { "epoch": 0.11746269188303593, "grad_norm": 3.389021158218384, "learning_rate": 1.7979336737600225e-05, "loss": 0.4979, "step": 1649 }, { "epoch": 0.11753392456459023, "grad_norm": 2.789870262145996, "learning_rate": 1.797696855509114e-05, "loss": 0.3213, "step": 1650 }, { "epoch": 0.11760515724614454, "grad_norm": 1.7882893085479736, "learning_rate": 1.7974599141817953e-05, "loss": 0.1127, "step": 1651 }, { "epoch": 0.11767638992769883, "grad_norm": 3.374636650085449, "learning_rate": 1.7972228498146243e-05, "loss": 0.1097, "step": 1652 }, { "epoch": 0.11774762260925313, "grad_norm": 3.1983988285064697, "learning_rate": 1.7969856624441778e-05, "loss": 0.8029, "step": 1653 }, { "epoch": 0.11781885529080742, "grad_norm": 3.4238312244415283, "learning_rate": 1.7967483521070502e-05, "loss": 0.5504, "step": 1654 }, { "epoch": 0.11789008797236172, "grad_norm": 6.634302616119385, "learning_rate": 1.7965109188398572e-05, "loss": 0.6148, "step": 1655 }, { "epoch": 0.11796132065391601, "grad_norm": 3.9488649368286133, "learning_rate": 1.796273362679232e-05, "loss": 0.9465, "step": 1656 }, { "epoch": 0.11803255333547032, "grad_norm": 2.9691410064697266, "learning_rate": 1.7960356836618265e-05, "loss": 0.4681, "step": 1657 }, { "epoch": 0.1181037860170246, "grad_norm": 4.032883167266846, "learning_rate": 1.795797881824313e-05, "loss": 0.658, "step": 1658 }, { "epoch": 0.11817501869857891, "grad_norm": 5.101877689361572, "learning_rate": 1.7955599572033816e-05, "loss": 0.6481, "step": 1659 }, { "epoch": 0.1182462513801332, "grad_norm": 3.0714988708496094, "learning_rate": 1.795321909835741e-05, "loss": 0.6907, "step": 1660 }, { "epoch": 0.1183174840616875, "grad_norm": 2.457704782485962, "learning_rate": 1.79508373975812e-05, "loss": 0.6207, "step": 1661 }, { "epoch": 0.11838871674324179, "grad_norm": 2.748354434967041, "learning_rate": 1.794845447007266e-05, "loss": 0.624, "step": 1662 }, { "epoch": 0.1184599494247961, "grad_norm": 2.6425585746765137, "learning_rate": 1.7946070316199448e-05, "loss": 0.5098, "step": 1663 }, { "epoch": 0.1185311821063504, "grad_norm": 2.6164584159851074, "learning_rate": 1.794368493632942e-05, "loss": 0.5549, "step": 1664 }, { "epoch": 0.11860241478790469, "grad_norm": 2.83842134475708, "learning_rate": 1.79412983308306e-05, "loss": 0.509, "step": 1665 }, { "epoch": 0.11867364746945899, "grad_norm": 3.3722493648529053, "learning_rate": 1.7938910500071233e-05, "loss": 0.4889, "step": 1666 }, { "epoch": 0.11874488015101328, "grad_norm": 5.088579177856445, "learning_rate": 1.793652144441973e-05, "loss": 0.6451, "step": 1667 }, { "epoch": 0.11881611283256759, "grad_norm": 3.5733373165130615, "learning_rate": 1.79341311642447e-05, "loss": 0.7347, "step": 1668 }, { "epoch": 0.11888734551412188, "grad_norm": 3.58616304397583, "learning_rate": 1.7931739659914936e-05, "loss": 0.3272, "step": 1669 }, { "epoch": 0.11895857819567618, "grad_norm": 4.5262250900268555, "learning_rate": 1.792934693179942e-05, "loss": 0.6137, "step": 1670 }, { "epoch": 0.11902981087723047, "grad_norm": 4.553283214569092, "learning_rate": 1.7926952980267335e-05, "loss": 0.4081, "step": 1671 }, { "epoch": 0.11910104355878477, "grad_norm": 3.211430311203003, "learning_rate": 1.7924557805688033e-05, "loss": 0.475, "step": 1672 }, { "epoch": 0.11917227624033906, "grad_norm": 3.379725694656372, "learning_rate": 1.792216140843107e-05, "loss": 0.1973, "step": 1673 }, { "epoch": 0.11924350892189337, "grad_norm": 3.2287068367004395, "learning_rate": 1.791976378886618e-05, "loss": 0.4261, "step": 1674 }, { "epoch": 0.11931474160344765, "grad_norm": 3.494589328765869, "learning_rate": 1.79173649473633e-05, "loss": 0.6566, "step": 1675 }, { "epoch": 0.11938597428500196, "grad_norm": 3.8319175243377686, "learning_rate": 1.7914964884292543e-05, "loss": 0.5264, "step": 1676 }, { "epoch": 0.11945720696655626, "grad_norm": 1.9083422422409058, "learning_rate": 1.7912563600024212e-05, "loss": 0.3557, "step": 1677 }, { "epoch": 0.11952843964811055, "grad_norm": 5.013131618499756, "learning_rate": 1.79101610949288e-05, "loss": 0.4054, "step": 1678 }, { "epoch": 0.11959967232966486, "grad_norm": 3.874098777770996, "learning_rate": 1.7907757369376984e-05, "loss": 0.3339, "step": 1679 }, { "epoch": 0.11967090501121914, "grad_norm": 2.183333396911621, "learning_rate": 1.7905352423739648e-05, "loss": 0.2873, "step": 1680 }, { "epoch": 0.11974213769277345, "grad_norm": 3.021385431289673, "learning_rate": 1.790294625838784e-05, "loss": 0.6735, "step": 1681 }, { "epoch": 0.11981337037432774, "grad_norm": 3.591909170150757, "learning_rate": 1.790053887369281e-05, "loss": 0.4856, "step": 1682 }, { "epoch": 0.11988460305588204, "grad_norm": 3.626708507537842, "learning_rate": 1.7898130270025992e-05, "loss": 0.2675, "step": 1683 }, { "epoch": 0.11995583573743633, "grad_norm": 3.222175359725952, "learning_rate": 1.7895720447759007e-05, "loss": 0.8761, "step": 1684 }, { "epoch": 0.12002706841899063, "grad_norm": 3.5079987049102783, "learning_rate": 1.7893309407263665e-05, "loss": 0.0806, "step": 1685 }, { "epoch": 0.12009830110054492, "grad_norm": 2.5676722526550293, "learning_rate": 1.789089714891197e-05, "loss": 0.4747, "step": 1686 }, { "epoch": 0.12016953378209923, "grad_norm": 4.076123237609863, "learning_rate": 1.7888483673076104e-05, "loss": 0.574, "step": 1687 }, { "epoch": 0.12024076646365353, "grad_norm": 3.4983279705047607, "learning_rate": 1.7886068980128444e-05, "loss": 0.4641, "step": 1688 }, { "epoch": 0.12031199914520782, "grad_norm": 2.0639636516571045, "learning_rate": 1.7883653070441548e-05, "loss": 0.4111, "step": 1689 }, { "epoch": 0.12038323182676212, "grad_norm": 2.816234827041626, "learning_rate": 1.7881235944388173e-05, "loss": 0.2178, "step": 1690 }, { "epoch": 0.12045446450831641, "grad_norm": 3.8975164890289307, "learning_rate": 1.7878817602341252e-05, "loss": 0.0798, "step": 1691 }, { "epoch": 0.12052569718987072, "grad_norm": 3.8797082901000977, "learning_rate": 1.7876398044673912e-05, "loss": 0.6505, "step": 1692 }, { "epoch": 0.12059692987142501, "grad_norm": 3.6337645053863525, "learning_rate": 1.787397727175946e-05, "loss": 0.5351, "step": 1693 }, { "epoch": 0.12066816255297931, "grad_norm": 2.9863851070404053, "learning_rate": 1.7871555283971408e-05, "loss": 0.6315, "step": 1694 }, { "epoch": 0.1207393952345336, "grad_norm": 1.9860076904296875, "learning_rate": 1.786913208168343e-05, "loss": 0.448, "step": 1695 }, { "epoch": 0.1208106279160879, "grad_norm": 5.128785610198975, "learning_rate": 1.7866707665269413e-05, "loss": 0.5544, "step": 1696 }, { "epoch": 0.1208818605976422, "grad_norm": 2.5832602977752686, "learning_rate": 1.7864282035103415e-05, "loss": 0.3396, "step": 1697 }, { "epoch": 0.1209530932791965, "grad_norm": 4.435151100158691, "learning_rate": 1.7861855191559682e-05, "loss": 0.7044, "step": 1698 }, { "epoch": 0.12102432596075079, "grad_norm": 3.169374942779541, "learning_rate": 1.785942713501266e-05, "loss": 0.3865, "step": 1699 }, { "epoch": 0.12109555864230509, "grad_norm": 3.0819308757781982, "learning_rate": 1.785699786583696e-05, "loss": 0.6607, "step": 1700 }, { "epoch": 0.1211667913238594, "grad_norm": 2.0713765621185303, "learning_rate": 1.7854567384407407e-05, "loss": 0.3677, "step": 1701 }, { "epoch": 0.12123802400541368, "grad_norm": 2.997398614883423, "learning_rate": 1.785213569109899e-05, "loss": 0.7354, "step": 1702 }, { "epoch": 0.12130925668696799, "grad_norm": 2.7464470863342285, "learning_rate": 1.7849702786286897e-05, "loss": 0.4086, "step": 1703 }, { "epoch": 0.12138048936852228, "grad_norm": 5.986326694488525, "learning_rate": 1.78472686703465e-05, "loss": 0.3812, "step": 1704 }, { "epoch": 0.12145172205007658, "grad_norm": 3.525017499923706, "learning_rate": 1.784483334365336e-05, "loss": 0.6301, "step": 1705 }, { "epoch": 0.12152295473163087, "grad_norm": 1.9690991640090942, "learning_rate": 1.784239680658322e-05, "loss": 0.1404, "step": 1706 }, { "epoch": 0.12159418741318517, "grad_norm": 3.6827588081359863, "learning_rate": 1.7839959059512016e-05, "loss": 0.2904, "step": 1707 }, { "epoch": 0.12166542009473946, "grad_norm": 16.30459213256836, "learning_rate": 1.7837520102815862e-05, "loss": 0.8478, "step": 1708 }, { "epoch": 0.12173665277629377, "grad_norm": 4.103888511657715, "learning_rate": 1.7835079936871068e-05, "loss": 0.5811, "step": 1709 }, { "epoch": 0.12180788545784806, "grad_norm": 2.6418638229370117, "learning_rate": 1.7832638562054126e-05, "loss": 0.3538, "step": 1710 }, { "epoch": 0.12187911813940236, "grad_norm": 2.581397294998169, "learning_rate": 1.7830195978741716e-05, "loss": 0.3231, "step": 1711 }, { "epoch": 0.12195035082095665, "grad_norm": 2.5586700439453125, "learning_rate": 1.7827752187310702e-05, "loss": 0.513, "step": 1712 }, { "epoch": 0.12202158350251095, "grad_norm": 3.6870226860046387, "learning_rate": 1.7825307188138133e-05, "loss": 0.4898, "step": 1713 }, { "epoch": 0.12209281618406526, "grad_norm": 3.7272732257843018, "learning_rate": 1.782286098160125e-05, "loss": 0.3987, "step": 1714 }, { "epoch": 0.12216404886561955, "grad_norm": 4.608972549438477, "learning_rate": 1.7820413568077478e-05, "loss": 0.5045, "step": 1715 }, { "epoch": 0.12223528154717385, "grad_norm": 4.996685981750488, "learning_rate": 1.7817964947944427e-05, "loss": 0.2368, "step": 1716 }, { "epoch": 0.12230651422872814, "grad_norm": 1.8631631135940552, "learning_rate": 1.7815515121579897e-05, "loss": 0.1856, "step": 1717 }, { "epoch": 0.12237774691028244, "grad_norm": 5.371891021728516, "learning_rate": 1.7813064089361866e-05, "loss": 0.8414, "step": 1718 }, { "epoch": 0.12244897959183673, "grad_norm": 3.3530876636505127, "learning_rate": 1.7810611851668503e-05, "loss": 0.3831, "step": 1719 }, { "epoch": 0.12252021227339104, "grad_norm": 5.993406295776367, "learning_rate": 1.7808158408878167e-05, "loss": 0.7373, "step": 1720 }, { "epoch": 0.12259144495494533, "grad_norm": 1.9810377359390259, "learning_rate": 1.7805703761369398e-05, "loss": 0.2208, "step": 1721 }, { "epoch": 0.12266267763649963, "grad_norm": 2.6130316257476807, "learning_rate": 1.780324790952092e-05, "loss": 0.1607, "step": 1722 }, { "epoch": 0.12273391031805392, "grad_norm": 3.012326717376709, "learning_rate": 1.7800790853711646e-05, "loss": 0.4577, "step": 1723 }, { "epoch": 0.12280514299960822, "grad_norm": 4.045257568359375, "learning_rate": 1.779833259432068e-05, "loss": 0.5522, "step": 1724 }, { "epoch": 0.12287637568116251, "grad_norm": 5.068005561828613, "learning_rate": 1.77958731317273e-05, "loss": 0.6886, "step": 1725 }, { "epoch": 0.12294760836271681, "grad_norm": 3.865185260772705, "learning_rate": 1.7793412466310974e-05, "loss": 0.4781, "step": 1726 }, { "epoch": 0.12301884104427112, "grad_norm": 3.3819491863250732, "learning_rate": 1.779095059845137e-05, "loss": 0.6502, "step": 1727 }, { "epoch": 0.12309007372582541, "grad_norm": 3.3966357707977295, "learning_rate": 1.7788487528528314e-05, "loss": 0.3354, "step": 1728 }, { "epoch": 0.12316130640737971, "grad_norm": 2.359276056289673, "learning_rate": 1.7786023256921835e-05, "loss": 0.3787, "step": 1729 }, { "epoch": 0.123232539088934, "grad_norm": 3.7778232097625732, "learning_rate": 1.7783557784012154e-05, "loss": 0.4674, "step": 1730 }, { "epoch": 0.1233037717704883, "grad_norm": 3.0081992149353027, "learning_rate": 1.7781091110179657e-05, "loss": 0.624, "step": 1731 }, { "epoch": 0.1233750044520426, "grad_norm": 2.6515846252441406, "learning_rate": 1.7778623235804935e-05, "loss": 0.4705, "step": 1732 }, { "epoch": 0.1234462371335969, "grad_norm": 11.08813762664795, "learning_rate": 1.7776154161268753e-05, "loss": 0.0876, "step": 1733 }, { "epoch": 0.12351746981515119, "grad_norm": 4.094571113586426, "learning_rate": 1.777368388695206e-05, "loss": 0.5518, "step": 1734 }, { "epoch": 0.12358870249670549, "grad_norm": 3.096072196960449, "learning_rate": 1.7771212413235997e-05, "loss": 0.6865, "step": 1735 }, { "epoch": 0.12365993517825978, "grad_norm": 2.3496694564819336, "learning_rate": 1.776873974050189e-05, "loss": 0.2329, "step": 1736 }, { "epoch": 0.12373116785981408, "grad_norm": 2.6515605449676514, "learning_rate": 1.776626586913124e-05, "loss": 0.5489, "step": 1737 }, { "epoch": 0.12380240054136837, "grad_norm": 2.5751380920410156, "learning_rate": 1.7763790799505746e-05, "loss": 0.7148, "step": 1738 }, { "epoch": 0.12387363322292268, "grad_norm": 2.4597649574279785, "learning_rate": 1.776131453200728e-05, "loss": 0.1853, "step": 1739 }, { "epoch": 0.12394486590447698, "grad_norm": 2.3268790245056152, "learning_rate": 1.775883706701791e-05, "loss": 0.1891, "step": 1740 }, { "epoch": 0.12401609858603127, "grad_norm": 2.7085773944854736, "learning_rate": 1.775635840491988e-05, "loss": 0.8413, "step": 1741 }, { "epoch": 0.12408733126758557, "grad_norm": 2.4471330642700195, "learning_rate": 1.7753878546095625e-05, "loss": 0.3067, "step": 1742 }, { "epoch": 0.12415856394913986, "grad_norm": 3.8551690578460693, "learning_rate": 1.7751397490927756e-05, "loss": 0.0541, "step": 1743 }, { "epoch": 0.12422979663069417, "grad_norm": 2.7604823112487793, "learning_rate": 1.7748915239799083e-05, "loss": 0.6936, "step": 1744 }, { "epoch": 0.12430102931224846, "grad_norm": 5.1555495262146, "learning_rate": 1.7746431793092583e-05, "loss": 0.6883, "step": 1745 }, { "epoch": 0.12437226199380276, "grad_norm": 5.027878284454346, "learning_rate": 1.774394715119143e-05, "loss": 0.5775, "step": 1746 }, { "epoch": 0.12444349467535705, "grad_norm": 5.884731769561768, "learning_rate": 1.7741461314478986e-05, "loss": 0.8141, "step": 1747 }, { "epoch": 0.12451472735691135, "grad_norm": 5.155446529388428, "learning_rate": 1.773897428333878e-05, "loss": 0.5129, "step": 1748 }, { "epoch": 0.12458596003846564, "grad_norm": 3.0769622325897217, "learning_rate": 1.773648605815453e-05, "loss": 0.2257, "step": 1749 }, { "epoch": 0.12465719272001995, "grad_norm": 3.229334831237793, "learning_rate": 1.7733996639310157e-05, "loss": 0.5406, "step": 1750 }, { "epoch": 0.12472842540157424, "grad_norm": 2.62799072265625, "learning_rate": 1.773150602718975e-05, "loss": 0.3505, "step": 1751 }, { "epoch": 0.12479965808312854, "grad_norm": 3.7537314891815186, "learning_rate": 1.772901422217758e-05, "loss": 1.1301, "step": 1752 }, { "epoch": 0.12487089076468284, "grad_norm": 2.2296030521392822, "learning_rate": 1.7726521224658106e-05, "loss": 0.3264, "step": 1753 }, { "epoch": 0.12494212344623713, "grad_norm": 3.7098562717437744, "learning_rate": 1.772402703501598e-05, "loss": 0.2906, "step": 1754 }, { "epoch": 0.12501335612779144, "grad_norm": 3.6714723110198975, "learning_rate": 1.772153165363602e-05, "loss": 0.5956, "step": 1755 }, { "epoch": 0.12508458880934573, "grad_norm": 3.3428609371185303, "learning_rate": 1.771903508090324e-05, "loss": 0.5071, "step": 1756 }, { "epoch": 0.12515582149090002, "grad_norm": 3.0351181030273438, "learning_rate": 1.7716537317202848e-05, "loss": 0.3229, "step": 1757 }, { "epoch": 0.12522705417245433, "grad_norm": 2.2114200592041016, "learning_rate": 1.7714038362920205e-05, "loss": 0.4136, "step": 1758 }, { "epoch": 0.12529828685400862, "grad_norm": 3.7363059520721436, "learning_rate": 1.771153821844088e-05, "loss": 0.5869, "step": 1759 }, { "epoch": 0.1253695195355629, "grad_norm": 5.415356159210205, "learning_rate": 1.7709036884150627e-05, "loss": 0.5527, "step": 1760 }, { "epoch": 0.1254407522171172, "grad_norm": 2.8156096935272217, "learning_rate": 1.770653436043537e-05, "loss": 0.4302, "step": 1761 }, { "epoch": 0.12551198489867152, "grad_norm": 2.806408405303955, "learning_rate": 1.770403064768122e-05, "loss": 0.4776, "step": 1762 }, { "epoch": 0.1255832175802258, "grad_norm": 3.8516154289245605, "learning_rate": 1.770152574627448e-05, "loss": 0.6866, "step": 1763 }, { "epoch": 0.1256544502617801, "grad_norm": 4.1592607498168945, "learning_rate": 1.7699019656601624e-05, "loss": 1.5339, "step": 1764 }, { "epoch": 0.1257256829433344, "grad_norm": 2.9893863201141357, "learning_rate": 1.7696512379049323e-05, "loss": 0.3862, "step": 1765 }, { "epoch": 0.1257969156248887, "grad_norm": 4.367090702056885, "learning_rate": 1.7694003914004422e-05, "loss": 0.7366, "step": 1766 }, { "epoch": 0.125868148306443, "grad_norm": 2.4954075813293457, "learning_rate": 1.769149426185395e-05, "loss": 0.1884, "step": 1767 }, { "epoch": 0.12593938098799728, "grad_norm": 2.388719081878662, "learning_rate": 1.7688983422985116e-05, "loss": 0.4109, "step": 1768 }, { "epoch": 0.1260106136695516, "grad_norm": 4.846286296844482, "learning_rate": 1.7686471397785322e-05, "loss": 0.7241, "step": 1769 }, { "epoch": 0.1260818463511059, "grad_norm": 3.3240323066711426, "learning_rate": 1.768395818664215e-05, "loss": 0.6719, "step": 1770 }, { "epoch": 0.12615307903266018, "grad_norm": 3.5087409019470215, "learning_rate": 1.7681443789943354e-05, "loss": 0.5006, "step": 1771 }, { "epoch": 0.12622431171421447, "grad_norm": 4.380054950714111, "learning_rate": 1.767892820807689e-05, "loss": 0.2207, "step": 1772 }, { "epoch": 0.1262955443957688, "grad_norm": 3.206188678741455, "learning_rate": 1.7676411441430877e-05, "loss": 0.7342, "step": 1773 }, { "epoch": 0.12636677707732308, "grad_norm": 2.8212356567382812, "learning_rate": 1.7673893490393636e-05, "loss": 0.3621, "step": 1774 }, { "epoch": 0.12643800975887737, "grad_norm": 3.730874538421631, "learning_rate": 1.767137435535365e-05, "loss": 0.6647, "step": 1775 }, { "epoch": 0.12650924244043166, "grad_norm": 2.5377213954925537, "learning_rate": 1.76688540366996e-05, "loss": 0.238, "step": 1776 }, { "epoch": 0.12658047512198597, "grad_norm": 3.393831491470337, "learning_rate": 1.766633253482035e-05, "loss": 0.6487, "step": 1777 }, { "epoch": 0.12665170780354026, "grad_norm": 3.4753034114837646, "learning_rate": 1.7663809850104936e-05, "loss": 0.6562, "step": 1778 }, { "epoch": 0.12672294048509455, "grad_norm": 2.07151198387146, "learning_rate": 1.7661285982942588e-05, "loss": 0.3217, "step": 1779 }, { "epoch": 0.12679417316664887, "grad_norm": 2.9560322761535645, "learning_rate": 1.7658760933722702e-05, "loss": 0.3383, "step": 1780 }, { "epoch": 0.12686540584820316, "grad_norm": 4.283071994781494, "learning_rate": 1.7656234702834877e-05, "loss": 0.2231, "step": 1781 }, { "epoch": 0.12693663852975745, "grad_norm": 3.526287794113159, "learning_rate": 1.7653707290668882e-05, "loss": 0.5459, "step": 1782 }, { "epoch": 0.12700787121131174, "grad_norm": 3.040297508239746, "learning_rate": 1.765117869761467e-05, "loss": 0.5549, "step": 1783 }, { "epoch": 0.12707910389286606, "grad_norm": 2.743323802947998, "learning_rate": 1.7648648924062378e-05, "loss": 0.5584, "step": 1784 }, { "epoch": 0.12715033657442035, "grad_norm": 3.3200247287750244, "learning_rate": 1.764611797040232e-05, "loss": 0.618, "step": 1785 }, { "epoch": 0.12722156925597464, "grad_norm": 3.1681504249572754, "learning_rate": 1.7643585837025e-05, "loss": 0.6231, "step": 1786 }, { "epoch": 0.12729280193752893, "grad_norm": 3.2539429664611816, "learning_rate": 1.76410525243211e-05, "loss": 0.7074, "step": 1787 }, { "epoch": 0.12736403461908324, "grad_norm": 5.554616451263428, "learning_rate": 1.7638518032681482e-05, "loss": 1.2272, "step": 1788 }, { "epoch": 0.12743526730063753, "grad_norm": 4.996348857879639, "learning_rate": 1.7635982362497195e-05, "loss": 0.5093, "step": 1789 }, { "epoch": 0.12750649998219182, "grad_norm": 4.4844746589660645, "learning_rate": 1.763344551415946e-05, "loss": 0.902, "step": 1790 }, { "epoch": 0.1275777326637461, "grad_norm": 3.002272367477417, "learning_rate": 1.76309074880597e-05, "loss": 0.309, "step": 1791 }, { "epoch": 0.12764896534530043, "grad_norm": 3.0305728912353516, "learning_rate": 1.762836828458949e-05, "loss": 0.3452, "step": 1792 }, { "epoch": 0.12772019802685472, "grad_norm": 3.175258159637451, "learning_rate": 1.762582790414061e-05, "loss": 0.6602, "step": 1793 }, { "epoch": 0.127791430708409, "grad_norm": 2.6165075302124023, "learning_rate": 1.762328634710502e-05, "loss": 0.3011, "step": 1794 }, { "epoch": 0.12786266338996333, "grad_norm": 1.4037420749664307, "learning_rate": 1.762074361387485e-05, "loss": 0.0512, "step": 1795 }, { "epoch": 0.12793389607151762, "grad_norm": 6.374377727508545, "learning_rate": 1.761819970484242e-05, "loss": 0.4352, "step": 1796 }, { "epoch": 0.1280051287530719, "grad_norm": 4.395524024963379, "learning_rate": 1.7615654620400225e-05, "loss": 0.4658, "step": 1797 }, { "epoch": 0.1280763614346262, "grad_norm": 2.8027491569519043, "learning_rate": 1.761310836094095e-05, "loss": 0.7646, "step": 1798 }, { "epoch": 0.1281475941161805, "grad_norm": 5.397621154785156, "learning_rate": 1.7610560926857455e-05, "loss": 0.726, "step": 1799 }, { "epoch": 0.1282188267977348, "grad_norm": 3.5140764713287354, "learning_rate": 1.760801231854278e-05, "loss": 0.3431, "step": 1800 }, { "epoch": 0.1282900594792891, "grad_norm": 2.817654609680176, "learning_rate": 1.7605462536390155e-05, "loss": 0.5874, "step": 1801 }, { "epoch": 0.12836129216084338, "grad_norm": 1.9842945337295532, "learning_rate": 1.760291158079298e-05, "loss": 0.3081, "step": 1802 }, { "epoch": 0.1284325248423977, "grad_norm": 3.9331889152526855, "learning_rate": 1.7600359452144845e-05, "loss": 0.8466, "step": 1803 }, { "epoch": 0.128503757523952, "grad_norm": 1.4997584819793701, "learning_rate": 1.759780615083951e-05, "loss": 0.177, "step": 1804 }, { "epoch": 0.12857499020550628, "grad_norm": 3.053304433822632, "learning_rate": 1.7595251677270933e-05, "loss": 0.5679, "step": 1805 }, { "epoch": 0.1286462228870606, "grad_norm": 2.197131395339966, "learning_rate": 1.7592696031833237e-05, "loss": 0.4026, "step": 1806 }, { "epoch": 0.12871745556861489, "grad_norm": 2.03897762298584, "learning_rate": 1.7590139214920732e-05, "loss": 0.4211, "step": 1807 }, { "epoch": 0.12878868825016918, "grad_norm": 2.42685604095459, "learning_rate": 1.758758122692791e-05, "loss": 0.5515, "step": 1808 }, { "epoch": 0.12885992093172347, "grad_norm": 3.5230801105499268, "learning_rate": 1.758502206824944e-05, "loss": 0.6761, "step": 1809 }, { "epoch": 0.12893115361327778, "grad_norm": 3.0943949222564697, "learning_rate": 1.7582461739280178e-05, "loss": 0.5861, "step": 1810 }, { "epoch": 0.12900238629483207, "grad_norm": 3.3609201908111572, "learning_rate": 1.7579900240415155e-05, "loss": 0.9577, "step": 1811 }, { "epoch": 0.12907361897638636, "grad_norm": 3.0026280879974365, "learning_rate": 1.757733757204958e-05, "loss": 0.663, "step": 1812 }, { "epoch": 0.12914485165794065, "grad_norm": 3.650784730911255, "learning_rate": 1.757477373457885e-05, "loss": 0.3324, "step": 1813 }, { "epoch": 0.12921608433949497, "grad_norm": 2.9294776916503906, "learning_rate": 1.757220872839854e-05, "loss": 0.5081, "step": 1814 }, { "epoch": 0.12928731702104926, "grad_norm": 3.030892848968506, "learning_rate": 1.75696425539044e-05, "loss": 0.5548, "step": 1815 }, { "epoch": 0.12935854970260355, "grad_norm": 5.905982971191406, "learning_rate": 1.7567075211492365e-05, "loss": 0.6177, "step": 1816 }, { "epoch": 0.12942978238415787, "grad_norm": 3.931818723678589, "learning_rate": 1.756450670155855e-05, "loss": 0.7835, "step": 1817 }, { "epoch": 0.12950101506571216, "grad_norm": 1.7834268808364868, "learning_rate": 1.7561937024499252e-05, "loss": 0.2889, "step": 1818 }, { "epoch": 0.12957224774726644, "grad_norm": 3.5795228481292725, "learning_rate": 1.7559366180710942e-05, "loss": 0.5571, "step": 1819 }, { "epoch": 0.12964348042882073, "grad_norm": 2.7331416606903076, "learning_rate": 1.7556794170590282e-05, "loss": 0.3006, "step": 1820 }, { "epoch": 0.12971471311037505, "grad_norm": 2.5610995292663574, "learning_rate": 1.7554220994534096e-05, "loss": 0.257, "step": 1821 }, { "epoch": 0.12978594579192934, "grad_norm": 3.193053960800171, "learning_rate": 1.7551646652939405e-05, "loss": 0.6081, "step": 1822 }, { "epoch": 0.12985717847348363, "grad_norm": 4.4382004737854, "learning_rate": 1.7549071146203404e-05, "loss": 0.4532, "step": 1823 }, { "epoch": 0.12992841115503792, "grad_norm": 3.995068311691284, "learning_rate": 1.7546494474723467e-05, "loss": 0.466, "step": 1824 }, { "epoch": 0.12999964383659224, "grad_norm": 3.0497517585754395, "learning_rate": 1.7543916638897142e-05, "loss": 0.5663, "step": 1825 }, { "epoch": 0.13007087651814653, "grad_norm": 2.37418532371521, "learning_rate": 1.754133763912217e-05, "loss": 0.5007, "step": 1826 }, { "epoch": 0.13014210919970082, "grad_norm": 3.584066390991211, "learning_rate": 1.753875747579646e-05, "loss": 0.4681, "step": 1827 }, { "epoch": 0.1302133418812551, "grad_norm": 5.344079494476318, "learning_rate": 1.7536176149318106e-05, "loss": 0.3253, "step": 1828 }, { "epoch": 0.13028457456280942, "grad_norm": 3.277454137802124, "learning_rate": 1.7533593660085378e-05, "loss": 0.1747, "step": 1829 }, { "epoch": 0.13035580724436371, "grad_norm": 4.964325904846191, "learning_rate": 1.7531010008496733e-05, "loss": 0.9321, "step": 1830 }, { "epoch": 0.130427039925918, "grad_norm": 3.104020833969116, "learning_rate": 1.7528425194950794e-05, "loss": 0.417, "step": 1831 }, { "epoch": 0.13049827260747232, "grad_norm": 3.8737077713012695, "learning_rate": 1.752583921984638e-05, "loss": 0.5512, "step": 1832 }, { "epoch": 0.1305695052890266, "grad_norm": 4.295087814331055, "learning_rate": 1.752325208358247e-05, "loss": 0.7069, "step": 1833 }, { "epoch": 0.1306407379705809, "grad_norm": 3.1642720699310303, "learning_rate": 1.7520663786558243e-05, "loss": 0.5344, "step": 1834 }, { "epoch": 0.1307119706521352, "grad_norm": 2.916987895965576, "learning_rate": 1.751807432917304e-05, "loss": 0.5657, "step": 1835 }, { "epoch": 0.1307832033336895, "grad_norm": 2.7608213424682617, "learning_rate": 1.7515483711826386e-05, "loss": 0.277, "step": 1836 }, { "epoch": 0.1308544360152438, "grad_norm": 3.668001651763916, "learning_rate": 1.7512891934917994e-05, "loss": 0.8253, "step": 1837 }, { "epoch": 0.1309256686967981, "grad_norm": 3.3581862449645996, "learning_rate": 1.7510298998847742e-05, "loss": 0.5472, "step": 1838 }, { "epoch": 0.13099690137835238, "grad_norm": 3.341787576675415, "learning_rate": 1.7507704904015696e-05, "loss": 0.7946, "step": 1839 }, { "epoch": 0.1310681340599067, "grad_norm": 4.342875003814697, "learning_rate": 1.7505109650822096e-05, "loss": 0.5573, "step": 1840 }, { "epoch": 0.13113936674146098, "grad_norm": 3.336479663848877, "learning_rate": 1.7502513239667365e-05, "loss": 0.4928, "step": 1841 }, { "epoch": 0.13121059942301527, "grad_norm": 2.4533095359802246, "learning_rate": 1.7499915670952107e-05, "loss": 0.5084, "step": 1842 }, { "epoch": 0.1312818321045696, "grad_norm": 4.1495866775512695, "learning_rate": 1.749731694507709e-05, "loss": 0.5966, "step": 1843 }, { "epoch": 0.13135306478612388, "grad_norm": 3.517976999282837, "learning_rate": 1.749471706244328e-05, "loss": 0.5026, "step": 1844 }, { "epoch": 0.13142429746767817, "grad_norm": 3.2329907417297363, "learning_rate": 1.7492116023451803e-05, "loss": 0.7289, "step": 1845 }, { "epoch": 0.13149553014923246, "grad_norm": 3.724641799926758, "learning_rate": 1.748951382850398e-05, "loss": 0.7654, "step": 1846 }, { "epoch": 0.13156676283078678, "grad_norm": 3.6807427406311035, "learning_rate": 1.7486910478001303e-05, "loss": 0.3916, "step": 1847 }, { "epoch": 0.13163799551234107, "grad_norm": 3.987013816833496, "learning_rate": 1.7484305972345436e-05, "loss": 0.3761, "step": 1848 }, { "epoch": 0.13170922819389536, "grad_norm": 5.163904190063477, "learning_rate": 1.748170031193823e-05, "loss": 0.5992, "step": 1849 }, { "epoch": 0.13178046087544965, "grad_norm": 3.2250564098358154, "learning_rate": 1.7479093497181714e-05, "loss": 0.1098, "step": 1850 }, { "epoch": 0.13185169355700396, "grad_norm": 2.387108325958252, "learning_rate": 1.7476485528478093e-05, "loss": 0.4964, "step": 1851 }, { "epoch": 0.13192292623855825, "grad_norm": 4.174975872039795, "learning_rate": 1.7473876406229744e-05, "loss": 0.6178, "step": 1852 }, { "epoch": 0.13199415892011254, "grad_norm": 3.0520575046539307, "learning_rate": 1.7471266130839235e-05, "loss": 0.5549, "step": 1853 }, { "epoch": 0.13206539160166683, "grad_norm": 2.361309051513672, "learning_rate": 1.74686547027093e-05, "loss": 0.2562, "step": 1854 }, { "epoch": 0.13213662428322115, "grad_norm": 6.16984224319458, "learning_rate": 1.7466042122242853e-05, "loss": 0.4524, "step": 1855 }, { "epoch": 0.13220785696477544, "grad_norm": 1.9731487035751343, "learning_rate": 1.7463428389842997e-05, "loss": 0.1725, "step": 1856 }, { "epoch": 0.13227908964632973, "grad_norm": 2.9563865661621094, "learning_rate": 1.7460813505912996e-05, "loss": 0.5906, "step": 1857 }, { "epoch": 0.13235032232788405, "grad_norm": 3.819587469100952, "learning_rate": 1.7458197470856305e-05, "loss": 0.7121, "step": 1858 }, { "epoch": 0.13242155500943834, "grad_norm": 2.326592445373535, "learning_rate": 1.7455580285076546e-05, "loss": 0.2966, "step": 1859 }, { "epoch": 0.13249278769099262, "grad_norm": 3.29347562789917, "learning_rate": 1.745296194897753e-05, "loss": 0.3857, "step": 1860 }, { "epoch": 0.13256402037254691, "grad_norm": 1.3216028213500977, "learning_rate": 1.7450342462963235e-05, "loss": 0.0889, "step": 1861 }, { "epoch": 0.13263525305410123, "grad_norm": 5.163313865661621, "learning_rate": 1.744772182743782e-05, "loss": 0.5538, "step": 1862 }, { "epoch": 0.13270648573565552, "grad_norm": 3.280334234237671, "learning_rate": 1.7445100042805627e-05, "loss": 0.5787, "step": 1863 }, { "epoch": 0.1327777184172098, "grad_norm": 2.8918659687042236, "learning_rate": 1.744247710947116e-05, "loss": 0.7778, "step": 1864 }, { "epoch": 0.1328489510987641, "grad_norm": 3.035244941711426, "learning_rate": 1.7439853027839124e-05, "loss": 0.7444, "step": 1865 }, { "epoch": 0.13292018378031842, "grad_norm": 4.3076605796813965, "learning_rate": 1.743722779831438e-05, "loss": 0.2848, "step": 1866 }, { "epoch": 0.1329914164618727, "grad_norm": 4.959719181060791, "learning_rate": 1.7434601421301974e-05, "loss": 0.4421, "step": 1867 }, { "epoch": 0.133062649143427, "grad_norm": 3.3864002227783203, "learning_rate": 1.743197389720713e-05, "loss": 0.7583, "step": 1868 }, { "epoch": 0.13313388182498131, "grad_norm": 3.965118169784546, "learning_rate": 1.7429345226435253e-05, "loss": 0.2625, "step": 1869 }, { "epoch": 0.1332051145065356, "grad_norm": 2.491135358810425, "learning_rate": 1.742671540939191e-05, "loss": 0.5425, "step": 1870 }, { "epoch": 0.1332763471880899, "grad_norm": 3.9292490482330322, "learning_rate": 1.742408444648286e-05, "loss": 0.6649, "step": 1871 }, { "epoch": 0.13334757986964418, "grad_norm": 4.9949727058410645, "learning_rate": 1.7421452338114036e-05, "loss": 0.536, "step": 1872 }, { "epoch": 0.1334188125511985, "grad_norm": 5.8765153884887695, "learning_rate": 1.741881908469154e-05, "loss": 0.5798, "step": 1873 }, { "epoch": 0.1334900452327528, "grad_norm": 4.237856388092041, "learning_rate": 1.741618468662166e-05, "loss": 0.6075, "step": 1874 }, { "epoch": 0.13356127791430708, "grad_norm": 2.794881582260132, "learning_rate": 1.7413549144310856e-05, "loss": 0.8519, "step": 1875 }, { "epoch": 0.13363251059586137, "grad_norm": 3.6972200870513916, "learning_rate": 1.741091245816576e-05, "loss": 0.7055, "step": 1876 }, { "epoch": 0.1337037432774157, "grad_norm": 4.839714050292969, "learning_rate": 1.7408274628593192e-05, "loss": 0.6439, "step": 1877 }, { "epoch": 0.13377497595896998, "grad_norm": 3.9981210231781006, "learning_rate": 1.740563565600014e-05, "loss": 0.3148, "step": 1878 }, { "epoch": 0.13384620864052427, "grad_norm": 7.841110706329346, "learning_rate": 1.7402995540793764e-05, "loss": 0.1967, "step": 1879 }, { "epoch": 0.13391744132207856, "grad_norm": 2.822190523147583, "learning_rate": 1.7400354283381416e-05, "loss": 0.413, "step": 1880 }, { "epoch": 0.13398867400363287, "grad_norm": 3.709867000579834, "learning_rate": 1.7397711884170613e-05, "loss": 0.0731, "step": 1881 }, { "epoch": 0.13405990668518716, "grad_norm": 5.753561973571777, "learning_rate": 1.7395068343569047e-05, "loss": 0.7164, "step": 1882 }, { "epoch": 0.13413113936674145, "grad_norm": 3.675844192504883, "learning_rate": 1.739242366198459e-05, "loss": 0.6013, "step": 1883 }, { "epoch": 0.13420237204829577, "grad_norm": 8.295356750488281, "learning_rate": 1.7389777839825284e-05, "loss": 0.1497, "step": 1884 }, { "epoch": 0.13427360472985006, "grad_norm": 4.432013511657715, "learning_rate": 1.7387130877499364e-05, "loss": 0.4326, "step": 1885 }, { "epoch": 0.13434483741140435, "grad_norm": 3.6607210636138916, "learning_rate": 1.738448277541522e-05, "loss": 0.726, "step": 1886 }, { "epoch": 0.13441607009295864, "grad_norm": 2.577439546585083, "learning_rate": 1.738183353398143e-05, "loss": 0.3432, "step": 1887 }, { "epoch": 0.13448730277451296, "grad_norm": 3.1471431255340576, "learning_rate": 1.7379183153606743e-05, "loss": 0.6874, "step": 1888 }, { "epoch": 0.13455853545606725, "grad_norm": 3.5909461975097656, "learning_rate": 1.7376531634700087e-05, "loss": 0.6181, "step": 1889 }, { "epoch": 0.13462976813762154, "grad_norm": 2.4454972743988037, "learning_rate": 1.737387897767056e-05, "loss": 0.181, "step": 1890 }, { "epoch": 0.13470100081917583, "grad_norm": 3.7749886512756348, "learning_rate": 1.7371225182927447e-05, "loss": 0.2947, "step": 1891 }, { "epoch": 0.13477223350073014, "grad_norm": 1.3567813634872437, "learning_rate": 1.7368570250880198e-05, "loss": 0.0915, "step": 1892 }, { "epoch": 0.13484346618228443, "grad_norm": 2.574010133743286, "learning_rate": 1.736591418193844e-05, "loss": 0.3423, "step": 1893 }, { "epoch": 0.13491469886383872, "grad_norm": 3.354480504989624, "learning_rate": 1.7363256976511972e-05, "loss": 0.6756, "step": 1894 }, { "epoch": 0.13498593154539304, "grad_norm": 2.827120304107666, "learning_rate": 1.7360598635010787e-05, "loss": 0.2984, "step": 1895 }, { "epoch": 0.13505716422694733, "grad_norm": 3.939589738845825, "learning_rate": 1.735793915784503e-05, "loss": 0.563, "step": 1896 }, { "epoch": 0.13512839690850162, "grad_norm": 3.2574145793914795, "learning_rate": 1.7355278545425033e-05, "loss": 0.4899, "step": 1897 }, { "epoch": 0.1351996295900559, "grad_norm": 3.6873297691345215, "learning_rate": 1.73526167981613e-05, "loss": 0.5565, "step": 1898 }, { "epoch": 0.13527086227161023, "grad_norm": 3.325939893722534, "learning_rate": 1.7349953916464512e-05, "loss": 0.5017, "step": 1899 }, { "epoch": 0.13534209495316452, "grad_norm": 3.5972492694854736, "learning_rate": 1.7347289900745525e-05, "loss": 0.7075, "step": 1900 }, { "epoch": 0.1354133276347188, "grad_norm": 6.4874701499938965, "learning_rate": 1.734462475141537e-05, "loss": 0.4358, "step": 1901 }, { "epoch": 0.1354845603162731, "grad_norm": 3.445476770401001, "learning_rate": 1.734195846888525e-05, "loss": 0.5954, "step": 1902 }, { "epoch": 0.1355557929978274, "grad_norm": 2.6124074459075928, "learning_rate": 1.7339291053566544e-05, "loss": 0.46, "step": 1903 }, { "epoch": 0.1356270256793817, "grad_norm": 7.612179279327393, "learning_rate": 1.7336622505870813e-05, "loss": 0.5828, "step": 1904 }, { "epoch": 0.135698258360936, "grad_norm": 3.6200666427612305, "learning_rate": 1.733395282620978e-05, "loss": 0.2907, "step": 1905 }, { "epoch": 0.13576949104249028, "grad_norm": 5.022115230560303, "learning_rate": 1.7331282014995348e-05, "loss": 0.2686, "step": 1906 }, { "epoch": 0.1358407237240446, "grad_norm": 3.0262339115142822, "learning_rate": 1.7328610072639604e-05, "loss": 0.4318, "step": 1907 }, { "epoch": 0.1359119564055989, "grad_norm": 2.5021002292633057, "learning_rate": 1.732593699955479e-05, "loss": 0.1825, "step": 1908 }, { "epoch": 0.13598318908715318, "grad_norm": 2.1856765747070312, "learning_rate": 1.7323262796153342e-05, "loss": 0.5737, "step": 1909 }, { "epoch": 0.1360544217687075, "grad_norm": 3.2805604934692383, "learning_rate": 1.7320587462847858e-05, "loss": 0.2046, "step": 1910 }, { "epoch": 0.13612565445026178, "grad_norm": 1.5693178176879883, "learning_rate": 1.7317911000051123e-05, "loss": 0.1648, "step": 1911 }, { "epoch": 0.13619688713181607, "grad_norm": 3.448387861251831, "learning_rate": 1.7315233408176073e-05, "loss": 0.5271, "step": 1912 }, { "epoch": 0.13626811981337036, "grad_norm": 2.956303119659424, "learning_rate": 1.7312554687635843e-05, "loss": 0.3954, "step": 1913 }, { "epoch": 0.13633935249492468, "grad_norm": 5.657852649688721, "learning_rate": 1.730987483884373e-05, "loss": 0.6176, "step": 1914 }, { "epoch": 0.13641058517647897, "grad_norm": 2.7609658241271973, "learning_rate": 1.7307193862213204e-05, "loss": 0.5755, "step": 1915 }, { "epoch": 0.13648181785803326, "grad_norm": 2.3245913982391357, "learning_rate": 1.7304511758157917e-05, "loss": 0.6794, "step": 1916 }, { "epoch": 0.13655305053958755, "grad_norm": 2.333289384841919, "learning_rate": 1.7301828527091687e-05, "loss": 0.3365, "step": 1917 }, { "epoch": 0.13662428322114187, "grad_norm": 2.963080883026123, "learning_rate": 1.7299144169428513e-05, "loss": 0.5735, "step": 1918 }, { "epoch": 0.13669551590269616, "grad_norm": 1.8795262575149536, "learning_rate": 1.7296458685582557e-05, "loss": 0.2103, "step": 1919 }, { "epoch": 0.13676674858425045, "grad_norm": 2.7049756050109863, "learning_rate": 1.7293772075968163e-05, "loss": 0.6375, "step": 1920 }, { "epoch": 0.13683798126580476, "grad_norm": 9.923035621643066, "learning_rate": 1.729108434099985e-05, "loss": 0.8158, "step": 1921 }, { "epoch": 0.13690921394735905, "grad_norm": 1.888973355293274, "learning_rate": 1.7288395481092307e-05, "loss": 0.1821, "step": 1922 }, { "epoch": 0.13698044662891334, "grad_norm": 1.9465503692626953, "learning_rate": 1.7285705496660398e-05, "loss": 0.2681, "step": 1923 }, { "epoch": 0.13705167931046763, "grad_norm": 3.37148380279541, "learning_rate": 1.728301438811916e-05, "loss": 0.8201, "step": 1924 }, { "epoch": 0.13712291199202195, "grad_norm": 5.524354934692383, "learning_rate": 1.7280322155883805e-05, "loss": 0.981, "step": 1925 }, { "epoch": 0.13719414467357624, "grad_norm": 2.7267067432403564, "learning_rate": 1.7277628800369708e-05, "loss": 0.2882, "step": 1926 }, { "epoch": 0.13726537735513053, "grad_norm": 3.2571043968200684, "learning_rate": 1.7274934321992435e-05, "loss": 0.3148, "step": 1927 }, { "epoch": 0.13733661003668482, "grad_norm": 3.430840492248535, "learning_rate": 1.7272238721167715e-05, "loss": 0.505, "step": 1928 }, { "epoch": 0.13740784271823914, "grad_norm": 3.288778305053711, "learning_rate": 1.7269541998311446e-05, "loss": 0.2696, "step": 1929 }, { "epoch": 0.13747907539979343, "grad_norm": 3.7281858921051025, "learning_rate": 1.726684415383971e-05, "loss": 0.4804, "step": 1930 }, { "epoch": 0.13755030808134772, "grad_norm": 2.5943186283111572, "learning_rate": 1.7264145188168755e-05, "loss": 0.524, "step": 1931 }, { "epoch": 0.137621540762902, "grad_norm": 4.169601917266846, "learning_rate": 1.7261445101715006e-05, "loss": 0.0777, "step": 1932 }, { "epoch": 0.13769277344445632, "grad_norm": 4.337432384490967, "learning_rate": 1.7258743894895054e-05, "loss": 0.6961, "step": 1933 }, { "epoch": 0.1377640061260106, "grad_norm": 2.753322124481201, "learning_rate": 1.7256041568125673e-05, "loss": 0.2434, "step": 1934 }, { "epoch": 0.1378352388075649, "grad_norm": 3.1092562675476074, "learning_rate": 1.7253338121823796e-05, "loss": 0.3553, "step": 1935 }, { "epoch": 0.13790647148911922, "grad_norm": 3.020981550216675, "learning_rate": 1.7250633556406545e-05, "loss": 0.6681, "step": 1936 }, { "epoch": 0.1379777041706735, "grad_norm": 2.548369884490967, "learning_rate": 1.72479278722912e-05, "loss": 0.484, "step": 1937 }, { "epoch": 0.1380489368522278, "grad_norm": 2.8158771991729736, "learning_rate": 1.7245221069895227e-05, "loss": 0.5907, "step": 1938 }, { "epoch": 0.1381201695337821, "grad_norm": 3.8767621517181396, "learning_rate": 1.7242513149636253e-05, "loss": 0.5101, "step": 1939 }, { "epoch": 0.1381914022153364, "grad_norm": 3.49729061126709, "learning_rate": 1.7239804111932085e-05, "loss": 0.472, "step": 1940 }, { "epoch": 0.1382626348968907, "grad_norm": 3.3562183380126953, "learning_rate": 1.7237093957200694e-05, "loss": 0.5606, "step": 1941 }, { "epoch": 0.13833386757844499, "grad_norm": 5.149975299835205, "learning_rate": 1.7234382685860236e-05, "loss": 0.6651, "step": 1942 }, { "epoch": 0.13840510025999928, "grad_norm": 2.659721612930298, "learning_rate": 1.723167029832903e-05, "loss": 0.4155, "step": 1943 }, { "epoch": 0.1384763329415536, "grad_norm": 3.684318780899048, "learning_rate": 1.7228956795025565e-05, "loss": 0.6326, "step": 1944 }, { "epoch": 0.13854756562310788, "grad_norm": 2.2151386737823486, "learning_rate": 1.7226242176368515e-05, "loss": 0.2871, "step": 1945 }, { "epoch": 0.13861879830466217, "grad_norm": 3.6864078044891357, "learning_rate": 1.7223526442776712e-05, "loss": 0.4018, "step": 1946 }, { "epoch": 0.1386900309862165, "grad_norm": 5.0912299156188965, "learning_rate": 1.7220809594669165e-05, "loss": 0.7884, "step": 1947 }, { "epoch": 0.13876126366777078, "grad_norm": 4.213191032409668, "learning_rate": 1.7218091632465057e-05, "loss": 0.5802, "step": 1948 }, { "epoch": 0.13883249634932507, "grad_norm": 5.393504619598389, "learning_rate": 1.7215372556583745e-05, "loss": 0.8544, "step": 1949 }, { "epoch": 0.13890372903087936, "grad_norm": 3.361772060394287, "learning_rate": 1.721265236744475e-05, "loss": 0.6714, "step": 1950 }, { "epoch": 0.13897496171243368, "grad_norm": 3.005452871322632, "learning_rate": 1.720993106546777e-05, "loss": 0.3153, "step": 1951 }, { "epoch": 0.13904619439398797, "grad_norm": 4.9447922706604, "learning_rate": 1.7207208651072677e-05, "loss": 0.5214, "step": 1952 }, { "epoch": 0.13911742707554225, "grad_norm": 3.9854884147644043, "learning_rate": 1.7204485124679506e-05, "loss": 0.5815, "step": 1953 }, { "epoch": 0.13918865975709654, "grad_norm": 2.7530438899993896, "learning_rate": 1.720176048670847e-05, "loss": 0.4906, "step": 1954 }, { "epoch": 0.13925989243865086, "grad_norm": 7.807960510253906, "learning_rate": 1.7199034737579962e-05, "loss": 0.3869, "step": 1955 }, { "epoch": 0.13933112512020515, "grad_norm": 3.506089925765991, "learning_rate": 1.7196307877714523e-05, "loss": 0.374, "step": 1956 }, { "epoch": 0.13940235780175944, "grad_norm": 5.249359607696533, "learning_rate": 1.719357990753289e-05, "loss": 0.5427, "step": 1957 }, { "epoch": 0.13947359048331373, "grad_norm": 3.600304365158081, "learning_rate": 1.7190850827455957e-05, "loss": 0.8782, "step": 1958 }, { "epoch": 0.13954482316486805, "grad_norm": 3.036264181137085, "learning_rate": 1.7188120637904792e-05, "loss": 0.212, "step": 1959 }, { "epoch": 0.13961605584642234, "grad_norm": 4.014216423034668, "learning_rate": 1.7185389339300633e-05, "loss": 0.8658, "step": 1960 }, { "epoch": 0.13968728852797663, "grad_norm": 3.3706586360931396, "learning_rate": 1.7182656932064894e-05, "loss": 0.3183, "step": 1961 }, { "epoch": 0.13975852120953094, "grad_norm": 4.339809417724609, "learning_rate": 1.7179923416619163e-05, "loss": 0.7165, "step": 1962 }, { "epoch": 0.13982975389108523, "grad_norm": 3.0857458114624023, "learning_rate": 1.7177188793385183e-05, "loss": 0.4919, "step": 1963 }, { "epoch": 0.13990098657263952, "grad_norm": 4.468426704406738, "learning_rate": 1.7174453062784885e-05, "loss": 0.7171, "step": 1964 }, { "epoch": 0.1399722192541938, "grad_norm": 2.1137099266052246, "learning_rate": 1.717171622524036e-05, "loss": 0.3682, "step": 1965 }, { "epoch": 0.14004345193574813, "grad_norm": 3.103158473968506, "learning_rate": 1.716897828117388e-05, "loss": 0.2549, "step": 1966 }, { "epoch": 0.14011468461730242, "grad_norm": 8.23100471496582, "learning_rate": 1.7166239231007872e-05, "loss": 0.6842, "step": 1967 }, { "epoch": 0.1401859172988567, "grad_norm": 1.8909817934036255, "learning_rate": 1.716349907516495e-05, "loss": 0.2915, "step": 1968 }, { "epoch": 0.140257149980411, "grad_norm": 3.542397975921631, "learning_rate": 1.7160757814067895e-05, "loss": 0.6235, "step": 1969 }, { "epoch": 0.14032838266196532, "grad_norm": 3.316131114959717, "learning_rate": 1.7158015448139645e-05, "loss": 0.6726, "step": 1970 }, { "epoch": 0.1403996153435196, "grad_norm": 5.679234981536865, "learning_rate": 1.715527197780333e-05, "loss": 0.5494, "step": 1971 }, { "epoch": 0.1404708480250739, "grad_norm": 3.34303879737854, "learning_rate": 1.715252740348223e-05, "loss": 0.3733, "step": 1972 }, { "epoch": 0.14054208070662821, "grad_norm": 7.866354465484619, "learning_rate": 1.714978172559981e-05, "loss": 0.6557, "step": 1973 }, { "epoch": 0.1406133133881825, "grad_norm": 3.0132594108581543, "learning_rate": 1.7147034944579698e-05, "loss": 0.6311, "step": 1974 }, { "epoch": 0.1406845460697368, "grad_norm": 2.8606150150299072, "learning_rate": 1.7144287060845696e-05, "loss": 0.913, "step": 1975 }, { "epoch": 0.14075577875129108, "grad_norm": 3.375913381576538, "learning_rate": 1.714153807482177e-05, "loss": 0.4282, "step": 1976 }, { "epoch": 0.1408270114328454, "grad_norm": 3.8561487197875977, "learning_rate": 1.713878798693206e-05, "loss": 0.5917, "step": 1977 }, { "epoch": 0.1408982441143997, "grad_norm": 7.984810829162598, "learning_rate": 1.7136036797600882e-05, "loss": 0.4061, "step": 1978 }, { "epoch": 0.14096947679595398, "grad_norm": 1.5625873804092407, "learning_rate": 1.7133284507252715e-05, "loss": 0.1753, "step": 1979 }, { "epoch": 0.14104070947750827, "grad_norm": 4.677729606628418, "learning_rate": 1.7130531116312202e-05, "loss": 0.2714, "step": 1980 }, { "epoch": 0.1411119421590626, "grad_norm": 2.2527997493743896, "learning_rate": 1.7127776625204173e-05, "loss": 0.2402, "step": 1981 }, { "epoch": 0.14118317484061688, "grad_norm": 3.3141849040985107, "learning_rate": 1.7125021034353614e-05, "loss": 0.2366, "step": 1982 }, { "epoch": 0.14125440752217117, "grad_norm": 3.640965461730957, "learning_rate": 1.7122264344185677e-05, "loss": 0.4955, "step": 1983 }, { "epoch": 0.14132564020372546, "grad_norm": 1.7325704097747803, "learning_rate": 1.71195065551257e-05, "loss": 0.2585, "step": 1984 }, { "epoch": 0.14139687288527977, "grad_norm": 3.755324363708496, "learning_rate": 1.711674766759918e-05, "loss": 0.1821, "step": 1985 }, { "epoch": 0.14146810556683406, "grad_norm": 4.018867015838623, "learning_rate": 1.711398768203178e-05, "loss": 0.3885, "step": 1986 }, { "epoch": 0.14153933824838835, "grad_norm": 5.42822265625, "learning_rate": 1.7111226598849344e-05, "loss": 0.2028, "step": 1987 }, { "epoch": 0.14161057092994267, "grad_norm": 5.436866760253906, "learning_rate": 1.710846441847787e-05, "loss": 0.5035, "step": 1988 }, { "epoch": 0.14168180361149696, "grad_norm": 3.7188634872436523, "learning_rate": 1.710570114134354e-05, "loss": 0.4877, "step": 1989 }, { "epoch": 0.14175303629305125, "grad_norm": 1.9824947118759155, "learning_rate": 1.7102936767872704e-05, "loss": 0.1157, "step": 1990 }, { "epoch": 0.14182426897460554, "grad_norm": 2.0863587856292725, "learning_rate": 1.7100171298491866e-05, "loss": 0.4498, "step": 1991 }, { "epoch": 0.14189550165615986, "grad_norm": 2.4758565425872803, "learning_rate": 1.709740473362772e-05, "loss": 0.405, "step": 1992 }, { "epoch": 0.14196673433771415, "grad_norm": 3.192715644836426, "learning_rate": 1.7094637073707105e-05, "loss": 0.5831, "step": 1993 }, { "epoch": 0.14203796701926844, "grad_norm": 3.3109283447265625, "learning_rate": 1.7091868319157055e-05, "loss": 0.56, "step": 1994 }, { "epoch": 0.14210919970082272, "grad_norm": 2.4023661613464355, "learning_rate": 1.7089098470404755e-05, "loss": 0.5469, "step": 1995 }, { "epoch": 0.14218043238237704, "grad_norm": 4.109723091125488, "learning_rate": 1.7086327527877563e-05, "loss": 0.5561, "step": 1996 }, { "epoch": 0.14225166506393133, "grad_norm": 3.3557851314544678, "learning_rate": 1.708355549200301e-05, "loss": 0.3816, "step": 1997 }, { "epoch": 0.14232289774548562, "grad_norm": 1.9844425916671753, "learning_rate": 1.708078236320879e-05, "loss": 0.3463, "step": 1998 }, { "epoch": 0.14239413042703994, "grad_norm": 6.129020690917969, "learning_rate": 1.707800814192277e-05, "loss": 0.4519, "step": 1999 }, { "epoch": 0.14246536310859423, "grad_norm": 3.3401949405670166, "learning_rate": 1.7075232828572982e-05, "loss": 0.6163, "step": 2000 }, { "epoch": 0.14253659579014852, "grad_norm": 2.6898443698883057, "learning_rate": 1.707245642358763e-05, "loss": 0.3019, "step": 2001 }, { "epoch": 0.1426078284717028, "grad_norm": 5.420042514801025, "learning_rate": 1.7069678927395083e-05, "loss": 0.6979, "step": 2002 }, { "epoch": 0.14267906115325713, "grad_norm": 2.169562816619873, "learning_rate": 1.706690034042388e-05, "loss": 0.3834, "step": 2003 }, { "epoch": 0.14275029383481141, "grad_norm": 3.3596177101135254, "learning_rate": 1.7064120663102737e-05, "loss": 0.5196, "step": 2004 }, { "epoch": 0.1428215265163657, "grad_norm": 2.5264735221862793, "learning_rate": 1.7061339895860513e-05, "loss": 0.3589, "step": 2005 }, { "epoch": 0.14289275919792, "grad_norm": 2.502256155014038, "learning_rate": 1.7058558039126266e-05, "loss": 0.5757, "step": 2006 }, { "epoch": 0.1429639918794743, "grad_norm": 2.9634592533111572, "learning_rate": 1.7055775093329202e-05, "loss": 0.7499, "step": 2007 }, { "epoch": 0.1430352245610286, "grad_norm": 3.1640119552612305, "learning_rate": 1.70529910588987e-05, "loss": 0.8935, "step": 2008 }, { "epoch": 0.1431064572425829, "grad_norm": 4.702127456665039, "learning_rate": 1.705020593626431e-05, "loss": 0.4752, "step": 2009 }, { "epoch": 0.14317768992413718, "grad_norm": 1.9386948347091675, "learning_rate": 1.704741972585575e-05, "loss": 0.2028, "step": 2010 }, { "epoch": 0.1432489226056915, "grad_norm": 4.175467491149902, "learning_rate": 1.7044632428102896e-05, "loss": 0.5561, "step": 2011 }, { "epoch": 0.1433201552872458, "grad_norm": 4.242292881011963, "learning_rate": 1.7041844043435806e-05, "loss": 0.7524, "step": 2012 }, { "epoch": 0.14339138796880008, "grad_norm": 4.2634358406066895, "learning_rate": 1.7039054572284697e-05, "loss": 0.5005, "step": 2013 }, { "epoch": 0.1434626206503544, "grad_norm": 3.715280294418335, "learning_rate": 1.7036264015079958e-05, "loss": 0.2807, "step": 2014 }, { "epoch": 0.14353385333190868, "grad_norm": 3.4895243644714355, "learning_rate": 1.7033472372252138e-05, "loss": 0.5971, "step": 2015 }, { "epoch": 0.14360508601346297, "grad_norm": 2.6333467960357666, "learning_rate": 1.703067964423196e-05, "loss": 0.4327, "step": 2016 }, { "epoch": 0.14367631869501726, "grad_norm": 3.0495030879974365, "learning_rate": 1.7027885831450318e-05, "loss": 0.3619, "step": 2017 }, { "epoch": 0.14374755137657158, "grad_norm": 2.41998553276062, "learning_rate": 1.7025090934338266e-05, "loss": 0.3325, "step": 2018 }, { "epoch": 0.14381878405812587, "grad_norm": 4.463460922241211, "learning_rate": 1.7022294953327025e-05, "loss": 0.1974, "step": 2019 }, { "epoch": 0.14389001673968016, "grad_norm": 3.5130460262298584, "learning_rate": 1.701949788884799e-05, "loss": 0.4558, "step": 2020 }, { "epoch": 0.14396124942123445, "grad_norm": 2.5298478603363037, "learning_rate": 1.701669974133272e-05, "loss": 0.627, "step": 2021 }, { "epoch": 0.14403248210278877, "grad_norm": 3.393969774246216, "learning_rate": 1.7013900511212932e-05, "loss": 0.3524, "step": 2022 }, { "epoch": 0.14410371478434306, "grad_norm": 3.759288787841797, "learning_rate": 1.7011100198920528e-05, "loss": 0.4236, "step": 2023 }, { "epoch": 0.14417494746589735, "grad_norm": 3.6242880821228027, "learning_rate": 1.7008298804887565e-05, "loss": 0.367, "step": 2024 }, { "epoch": 0.14424618014745166, "grad_norm": 4.757544994354248, "learning_rate": 1.7005496329546263e-05, "loss": 0.5168, "step": 2025 }, { "epoch": 0.14431741282900595, "grad_norm": 2.6955020427703857, "learning_rate": 1.7002692773329026e-05, "loss": 0.6788, "step": 2026 }, { "epoch": 0.14438864551056024, "grad_norm": 2.4819343090057373, "learning_rate": 1.6999888136668404e-05, "loss": 0.1164, "step": 2027 }, { "epoch": 0.14445987819211453, "grad_norm": 4.188404560089111, "learning_rate": 1.6997082419997127e-05, "loss": 0.9055, "step": 2028 }, { "epoch": 0.14453111087366885, "grad_norm": 3.5800540447235107, "learning_rate": 1.6994275623748092e-05, "loss": 0.3825, "step": 2029 }, { "epoch": 0.14460234355522314, "grad_norm": 2.8178861141204834, "learning_rate": 1.6991467748354352e-05, "loss": 0.6259, "step": 2030 }, { "epoch": 0.14467357623677743, "grad_norm": 4.544328212738037, "learning_rate": 1.6988658794249134e-05, "loss": 0.4649, "step": 2031 }, { "epoch": 0.14474480891833172, "grad_norm": 2.9185361862182617, "learning_rate": 1.6985848761865838e-05, "loss": 0.3858, "step": 2032 }, { "epoch": 0.14481604159988604, "grad_norm": 6.338019371032715, "learning_rate": 1.698303765163802e-05, "loss": 0.5329, "step": 2033 }, { "epoch": 0.14488727428144033, "grad_norm": 2.9565961360931396, "learning_rate": 1.69802254639994e-05, "loss": 0.5013, "step": 2034 }, { "epoch": 0.14495850696299462, "grad_norm": 2.4691176414489746, "learning_rate": 1.6977412199383872e-05, "loss": 0.4464, "step": 2035 }, { "epoch": 0.14502973964454893, "grad_norm": 2.982527256011963, "learning_rate": 1.6974597858225502e-05, "loss": 0.4747, "step": 2036 }, { "epoch": 0.14510097232610322, "grad_norm": 4.310842037200928, "learning_rate": 1.69717824409585e-05, "loss": 0.6378, "step": 2037 }, { "epoch": 0.1451722050076575, "grad_norm": 2.6649820804595947, "learning_rate": 1.6968965948017266e-05, "loss": 0.3354, "step": 2038 }, { "epoch": 0.1452434376892118, "grad_norm": 5.058302402496338, "learning_rate": 1.696614837983635e-05, "loss": 0.4774, "step": 2039 }, { "epoch": 0.14531467037076612, "grad_norm": 5.673666954040527, "learning_rate": 1.696332973685048e-05, "loss": 0.8624, "step": 2040 }, { "epoch": 0.1453859030523204, "grad_norm": 3.3904972076416016, "learning_rate": 1.696051001949454e-05, "loss": 0.5406, "step": 2041 }, { "epoch": 0.1454571357338747, "grad_norm": 4.975129127502441, "learning_rate": 1.6957689228203583e-05, "loss": 0.2966, "step": 2042 }, { "epoch": 0.145528368415429, "grad_norm": 2.885040283203125, "learning_rate": 1.6954867363412827e-05, "loss": 0.3613, "step": 2043 }, { "epoch": 0.1455996010969833, "grad_norm": 3.044543743133545, "learning_rate": 1.695204442555766e-05, "loss": 0.6042, "step": 2044 }, { "epoch": 0.1456708337785376, "grad_norm": 5.8502583503723145, "learning_rate": 1.6949220415073627e-05, "loss": 0.4049, "step": 2045 }, { "epoch": 0.14574206646009188, "grad_norm": 1.29160737991333, "learning_rate": 1.6946395332396447e-05, "loss": 0.1024, "step": 2046 }, { "epoch": 0.14581329914164617, "grad_norm": 2.1559348106384277, "learning_rate": 1.6943569177962005e-05, "loss": 0.1206, "step": 2047 }, { "epoch": 0.1458845318232005, "grad_norm": 2.517138957977295, "learning_rate": 1.6940741952206342e-05, "loss": 0.3919, "step": 2048 }, { "epoch": 0.14595576450475478, "grad_norm": 3.425997734069824, "learning_rate": 1.693791365556567e-05, "loss": 0.5894, "step": 2049 }, { "epoch": 0.14602699718630907, "grad_norm": 2.6628000736236572, "learning_rate": 1.6935084288476365e-05, "loss": 0.4625, "step": 2050 }, { "epoch": 0.1460982298678634, "grad_norm": 2.513779401779175, "learning_rate": 1.693225385137498e-05, "loss": 0.3697, "step": 2051 }, { "epoch": 0.14616946254941768, "grad_norm": 2.9653079509735107, "learning_rate": 1.692942234469821e-05, "loss": 0.2033, "step": 2052 }, { "epoch": 0.14624069523097197, "grad_norm": 3.0075292587280273, "learning_rate": 1.692658976888293e-05, "loss": 0.3312, "step": 2053 }, { "epoch": 0.14631192791252626, "grad_norm": 4.403154373168945, "learning_rate": 1.6923756124366184e-05, "loss": 0.5217, "step": 2054 }, { "epoch": 0.14638316059408057, "grad_norm": 1.8982322216033936, "learning_rate": 1.6920921411585164e-05, "loss": 0.2455, "step": 2055 }, { "epoch": 0.14645439327563486, "grad_norm": 4.5139007568359375, "learning_rate": 1.691808563097724e-05, "loss": 0.5205, "step": 2056 }, { "epoch": 0.14652562595718915, "grad_norm": 2.903446912765503, "learning_rate": 1.691524878297995e-05, "loss": 0.3918, "step": 2057 }, { "epoch": 0.14659685863874344, "grad_norm": 3.673393726348877, "learning_rate": 1.6912410868030987e-05, "loss": 0.6233, "step": 2058 }, { "epoch": 0.14666809132029776, "grad_norm": 2.8743202686309814, "learning_rate": 1.6909571886568206e-05, "loss": 0.6679, "step": 2059 }, { "epoch": 0.14673932400185205, "grad_norm": 4.624180793762207, "learning_rate": 1.690673183902964e-05, "loss": 0.3917, "step": 2060 }, { "epoch": 0.14681055668340634, "grad_norm": 4.283020973205566, "learning_rate": 1.690389072585348e-05, "loss": 0.3788, "step": 2061 }, { "epoch": 0.14688178936496066, "grad_norm": 3.5189497470855713, "learning_rate": 1.6901048547478073e-05, "loss": 0.6698, "step": 2062 }, { "epoch": 0.14695302204651495, "grad_norm": 3.5502662658691406, "learning_rate": 1.6898205304341947e-05, "loss": 0.6666, "step": 2063 }, { "epoch": 0.14702425472806924, "grad_norm": 2.5162346363067627, "learning_rate": 1.6895360996883777e-05, "loss": 0.4281, "step": 2064 }, { "epoch": 0.14709548740962353, "grad_norm": 3.2200543880462646, "learning_rate": 1.6892515625542413e-05, "loss": 0.7073, "step": 2065 }, { "epoch": 0.14716672009117784, "grad_norm": 2.2520430088043213, "learning_rate": 1.688966919075687e-05, "loss": 0.3845, "step": 2066 }, { "epoch": 0.14723795277273213, "grad_norm": 2.3306643962860107, "learning_rate": 1.6886821692966314e-05, "loss": 0.3594, "step": 2067 }, { "epoch": 0.14730918545428642, "grad_norm": 4.055081844329834, "learning_rate": 1.68839731326101e-05, "loss": 0.5085, "step": 2068 }, { "epoch": 0.1473804181358407, "grad_norm": 2.237621545791626, "learning_rate": 1.6881123510127716e-05, "loss": 0.3603, "step": 2069 }, { "epoch": 0.14745165081739503, "grad_norm": 2.081141233444214, "learning_rate": 1.687827282595884e-05, "loss": 0.3547, "step": 2070 }, { "epoch": 0.14752288349894932, "grad_norm": 3.1830759048461914, "learning_rate": 1.68754210805433e-05, "loss": 0.8607, "step": 2071 }, { "epoch": 0.1475941161805036, "grad_norm": 2.8665854930877686, "learning_rate": 1.6872568274321087e-05, "loss": 0.6723, "step": 2072 }, { "epoch": 0.1476653488620579, "grad_norm": 3.51324462890625, "learning_rate": 1.6869714407732364e-05, "loss": 0.4475, "step": 2073 }, { "epoch": 0.14773658154361222, "grad_norm": 8.886722564697266, "learning_rate": 1.6866859481217453e-05, "loss": 0.547, "step": 2074 }, { "epoch": 0.1478078142251665, "grad_norm": 4.32809591293335, "learning_rate": 1.686400349521684e-05, "loss": 0.5144, "step": 2075 }, { "epoch": 0.1478790469067208, "grad_norm": 2.3460402488708496, "learning_rate": 1.6861146450171177e-05, "loss": 0.3912, "step": 2076 }, { "epoch": 0.1479502795882751, "grad_norm": 6.7859039306640625, "learning_rate": 1.6858288346521265e-05, "loss": 0.7182, "step": 2077 }, { "epoch": 0.1480215122698294, "grad_norm": 3.8864612579345703, "learning_rate": 1.685542918470809e-05, "loss": 0.3358, "step": 2078 }, { "epoch": 0.1480927449513837, "grad_norm": 2.245877742767334, "learning_rate": 1.6852568965172794e-05, "loss": 0.3333, "step": 2079 }, { "epoch": 0.14816397763293798, "grad_norm": 1.7778654098510742, "learning_rate": 1.684970768835667e-05, "loss": 0.1792, "step": 2080 }, { "epoch": 0.1482352103144923, "grad_norm": 4.892746925354004, "learning_rate": 1.684684535470119e-05, "loss": 0.7051, "step": 2081 }, { "epoch": 0.1483064429960466, "grad_norm": 2.8635191917419434, "learning_rate": 1.6843981964647976e-05, "loss": 0.5164, "step": 2082 }, { "epoch": 0.14837767567760088, "grad_norm": 3.011653423309326, "learning_rate": 1.684111751863883e-05, "loss": 0.6966, "step": 2083 }, { "epoch": 0.14844890835915517, "grad_norm": 3.1896209716796875, "learning_rate": 1.68382520171157e-05, "loss": 0.5251, "step": 2084 }, { "epoch": 0.14852014104070949, "grad_norm": 3.380427598953247, "learning_rate": 1.68353854605207e-05, "loss": 0.3842, "step": 2085 }, { "epoch": 0.14859137372226378, "grad_norm": 3.4290244579315186, "learning_rate": 1.683251784929612e-05, "loss": 0.4873, "step": 2086 }, { "epoch": 0.14866260640381807, "grad_norm": 4.074676990509033, "learning_rate": 1.6829649183884395e-05, "loss": 0.6039, "step": 2087 }, { "epoch": 0.14873383908537238, "grad_norm": 2.8978638648986816, "learning_rate": 1.6826779464728132e-05, "loss": 0.3855, "step": 2088 }, { "epoch": 0.14880507176692667, "grad_norm": 4.659038543701172, "learning_rate": 1.68239086922701e-05, "loss": 0.7198, "step": 2089 }, { "epoch": 0.14887630444848096, "grad_norm": 2.6789658069610596, "learning_rate": 1.6821036866953226e-05, "loss": 0.3666, "step": 2090 }, { "epoch": 0.14894753713003525, "grad_norm": 6.0987420082092285, "learning_rate": 1.681816398922061e-05, "loss": 0.7262, "step": 2091 }, { "epoch": 0.14901876981158957, "grad_norm": 2.601454734802246, "learning_rate": 1.6815290059515504e-05, "loss": 0.156, "step": 2092 }, { "epoch": 0.14909000249314386, "grad_norm": 6.624037742614746, "learning_rate": 1.6812415078281324e-05, "loss": 0.7369, "step": 2093 }, { "epoch": 0.14916123517469815, "grad_norm": 1.9372104406356812, "learning_rate": 1.6809539045961653e-05, "loss": 0.2335, "step": 2094 }, { "epoch": 0.14923246785625244, "grad_norm": 2.3116068840026855, "learning_rate": 1.6806661963000234e-05, "loss": 0.3512, "step": 2095 }, { "epoch": 0.14930370053780675, "grad_norm": 2.0516445636749268, "learning_rate": 1.6803783829840967e-05, "loss": 0.3821, "step": 2096 }, { "epoch": 0.14937493321936104, "grad_norm": 3.6478195190429688, "learning_rate": 1.6800904646927923e-05, "loss": 0.519, "step": 2097 }, { "epoch": 0.14944616590091533, "grad_norm": 2.6818864345550537, "learning_rate": 1.679802441470532e-05, "loss": 0.252, "step": 2098 }, { "epoch": 0.14951739858246962, "grad_norm": 2.4929521083831787, "learning_rate": 1.6795143133617562e-05, "loss": 0.5462, "step": 2099 }, { "epoch": 0.14958863126402394, "grad_norm": 2.686737060546875, "learning_rate": 1.6792260804109196e-05, "loss": 0.2982, "step": 2100 }, { "epoch": 0.14965986394557823, "grad_norm": 3.269791603088379, "learning_rate": 1.6789377426624935e-05, "loss": 0.8269, "step": 2101 }, { "epoch": 0.14973109662713252, "grad_norm": 3.2408156394958496, "learning_rate": 1.678649300160965e-05, "loss": 0.5224, "step": 2102 }, { "epoch": 0.14980232930868684, "grad_norm": 3.4789984226226807, "learning_rate": 1.6783607529508382e-05, "loss": 0.9083, "step": 2103 }, { "epoch": 0.14987356199024113, "grad_norm": 3.023468017578125, "learning_rate": 1.6780721010766335e-05, "loss": 0.6337, "step": 2104 }, { "epoch": 0.14994479467179542, "grad_norm": 3.098576068878174, "learning_rate": 1.677783344582886e-05, "loss": 0.4006, "step": 2105 }, { "epoch": 0.1500160273533497, "grad_norm": 2.7042248249053955, "learning_rate": 1.6774944835141484e-05, "loss": 0.4863, "step": 2106 }, { "epoch": 0.15008726003490402, "grad_norm": 2.6174755096435547, "learning_rate": 1.6772055179149886e-05, "loss": 0.3261, "step": 2107 }, { "epoch": 0.1501584927164583, "grad_norm": 5.293312072753906, "learning_rate": 1.676916447829992e-05, "loss": 0.4058, "step": 2108 }, { "epoch": 0.1502297253980126, "grad_norm": 4.665131568908691, "learning_rate": 1.6766272733037575e-05, "loss": 0.673, "step": 2109 }, { "epoch": 0.1503009580795669, "grad_norm": 7.169065475463867, "learning_rate": 1.676337994380903e-05, "loss": 0.3599, "step": 2110 }, { "epoch": 0.1503721907611212, "grad_norm": 4.231070041656494, "learning_rate": 1.6760486111060607e-05, "loss": 0.8225, "step": 2111 }, { "epoch": 0.1504434234426755, "grad_norm": 2.1446943283081055, "learning_rate": 1.67575912352388e-05, "loss": 0.4099, "step": 2112 }, { "epoch": 0.1505146561242298, "grad_norm": 2.769963026046753, "learning_rate": 1.6754695316790255e-05, "loss": 0.2952, "step": 2113 }, { "epoch": 0.1505858888057841, "grad_norm": 2.474128007888794, "learning_rate": 1.675179835616178e-05, "loss": 0.477, "step": 2114 }, { "epoch": 0.1506571214873384, "grad_norm": 3.862168788909912, "learning_rate": 1.674890035380035e-05, "loss": 0.6372, "step": 2115 }, { "epoch": 0.1507283541688927, "grad_norm": 3.5942392349243164, "learning_rate": 1.6746001310153095e-05, "loss": 0.6819, "step": 2116 }, { "epoch": 0.15079958685044698, "grad_norm": 2.140770673751831, "learning_rate": 1.674310122566731e-05, "loss": 0.2304, "step": 2117 }, { "epoch": 0.1508708195320013, "grad_norm": 2.289363384246826, "learning_rate": 1.6740200100790445e-05, "loss": 0.3755, "step": 2118 }, { "epoch": 0.15094205221355558, "grad_norm": 3.58697509765625, "learning_rate": 1.673729793597011e-05, "loss": 0.4697, "step": 2119 }, { "epoch": 0.15101328489510987, "grad_norm": 4.349618434906006, "learning_rate": 1.6734394731654094e-05, "loss": 0.3947, "step": 2120 }, { "epoch": 0.15108451757666416, "grad_norm": 2.2796108722686768, "learning_rate": 1.6731490488290316e-05, "loss": 0.6082, "step": 2121 }, { "epoch": 0.15115575025821848, "grad_norm": 2.9987988471984863, "learning_rate": 1.672858520632688e-05, "loss": 0.3249, "step": 2122 }, { "epoch": 0.15122698293977277, "grad_norm": 2.931288719177246, "learning_rate": 1.6725678886212034e-05, "loss": 0.5154, "step": 2123 }, { "epoch": 0.15129821562132706, "grad_norm": 2.7967004776000977, "learning_rate": 1.67227715283942e-05, "loss": 0.3334, "step": 2124 }, { "epoch": 0.15136944830288135, "grad_norm": 1.8390012979507446, "learning_rate": 1.6719863133321947e-05, "loss": 0.0896, "step": 2125 }, { "epoch": 0.15144068098443567, "grad_norm": 2.6344869136810303, "learning_rate": 1.6716953701444014e-05, "loss": 0.5015, "step": 2126 }, { "epoch": 0.15151191366598996, "grad_norm": 3.085902690887451, "learning_rate": 1.6714043233209296e-05, "loss": 0.8928, "step": 2127 }, { "epoch": 0.15158314634754425, "grad_norm": 1.98793363571167, "learning_rate": 1.6711131729066853e-05, "loss": 0.2926, "step": 2128 }, { "epoch": 0.15165437902909856, "grad_norm": 3.4855527877807617, "learning_rate": 1.6708219189465894e-05, "loss": 0.578, "step": 2129 }, { "epoch": 0.15172561171065285, "grad_norm": 2.922657012939453, "learning_rate": 1.670530561485579e-05, "loss": 0.566, "step": 2130 }, { "epoch": 0.15179684439220714, "grad_norm": 3.192566394805908, "learning_rate": 1.6702391005686088e-05, "loss": 0.7319, "step": 2131 }, { "epoch": 0.15186807707376143, "grad_norm": 4.291374683380127, "learning_rate": 1.669947536240647e-05, "loss": 0.5798, "step": 2132 }, { "epoch": 0.15193930975531575, "grad_norm": 3.536851406097412, "learning_rate": 1.6696558685466793e-05, "loss": 0.5976, "step": 2133 }, { "epoch": 0.15201054243687004, "grad_norm": 3.681243658065796, "learning_rate": 1.6693640975317078e-05, "loss": 0.4555, "step": 2134 }, { "epoch": 0.15208177511842433, "grad_norm": 2.377490997314453, "learning_rate": 1.669072223240749e-05, "loss": 0.574, "step": 2135 }, { "epoch": 0.15215300779997862, "grad_norm": 3.612712860107422, "learning_rate": 1.668780245718836e-05, "loss": 0.6028, "step": 2136 }, { "epoch": 0.15222424048153294, "grad_norm": 3.4517719745635986, "learning_rate": 1.6684881650110186e-05, "loss": 0.6376, "step": 2137 }, { "epoch": 0.15229547316308722, "grad_norm": 3.724398374557495, "learning_rate": 1.668195981162361e-05, "loss": 0.7373, "step": 2138 }, { "epoch": 0.15236670584464151, "grad_norm": 2.404324769973755, "learning_rate": 1.667903694217945e-05, "loss": 0.2808, "step": 2139 }, { "epoch": 0.15243793852619583, "grad_norm": 3.0359280109405518, "learning_rate": 1.667611304222867e-05, "loss": 0.3398, "step": 2140 }, { "epoch": 0.15250917120775012, "grad_norm": 3.2669100761413574, "learning_rate": 1.6673188112222394e-05, "loss": 0.7533, "step": 2141 }, { "epoch": 0.1525804038893044, "grad_norm": 1.6558657884597778, "learning_rate": 1.6670262152611916e-05, "loss": 0.1679, "step": 2142 }, { "epoch": 0.1526516365708587, "grad_norm": 7.498032569885254, "learning_rate": 1.6667335163848682e-05, "loss": 0.8167, "step": 2143 }, { "epoch": 0.15272286925241302, "grad_norm": 2.2483837604522705, "learning_rate": 1.6664407146384287e-05, "loss": 0.3216, "step": 2144 }, { "epoch": 0.1527941019339673, "grad_norm": 2.6502134799957275, "learning_rate": 1.6661478100670502e-05, "loss": 0.4794, "step": 2145 }, { "epoch": 0.1528653346155216, "grad_norm": 2.564066171646118, "learning_rate": 1.6658548027159245e-05, "loss": 0.7577, "step": 2146 }, { "epoch": 0.1529365672970759, "grad_norm": 2.8195364475250244, "learning_rate": 1.6655616926302594e-05, "loss": 0.3324, "step": 2147 }, { "epoch": 0.1530077999786302, "grad_norm": 3.1765453815460205, "learning_rate": 1.6652684798552793e-05, "loss": 0.0907, "step": 2148 }, { "epoch": 0.1530790326601845, "grad_norm": 2.2883105278015137, "learning_rate": 1.664975164436224e-05, "loss": 0.3659, "step": 2149 }, { "epoch": 0.15315026534173878, "grad_norm": 6.458082675933838, "learning_rate": 1.6646817464183485e-05, "loss": 0.7908, "step": 2150 }, { "epoch": 0.15322149802329307, "grad_norm": 3.1367592811584473, "learning_rate": 1.6643882258469247e-05, "loss": 0.3343, "step": 2151 }, { "epoch": 0.1532927307048474, "grad_norm": 3.8866491317749023, "learning_rate": 1.6640946027672395e-05, "loss": 0.4495, "step": 2152 }, { "epoch": 0.15336396338640168, "grad_norm": 3.946992874145508, "learning_rate": 1.6638008772245956e-05, "loss": 0.7396, "step": 2153 }, { "epoch": 0.15343519606795597, "grad_norm": 3.341935396194458, "learning_rate": 1.663507049264312e-05, "loss": 0.3057, "step": 2154 }, { "epoch": 0.1535064287495103, "grad_norm": 2.1967687606811523, "learning_rate": 1.663213118931724e-05, "loss": 0.3416, "step": 2155 }, { "epoch": 0.15357766143106458, "grad_norm": 3.5308306217193604, "learning_rate": 1.6629190862721813e-05, "loss": 0.5972, "step": 2156 }, { "epoch": 0.15364889411261887, "grad_norm": 3.5517752170562744, "learning_rate": 1.6626249513310505e-05, "loss": 0.028, "step": 2157 }, { "epoch": 0.15372012679417316, "grad_norm": 3.0510969161987305, "learning_rate": 1.662330714153713e-05, "loss": 0.3694, "step": 2158 }, { "epoch": 0.15379135947572747, "grad_norm": 5.1095428466796875, "learning_rate": 1.6620363747855675e-05, "loss": 0.5533, "step": 2159 }, { "epoch": 0.15386259215728176, "grad_norm": 2.6577963829040527, "learning_rate": 1.6617419332720267e-05, "loss": 0.466, "step": 2160 }, { "epoch": 0.15393382483883605, "grad_norm": 2.002375841140747, "learning_rate": 1.6614473896585206e-05, "loss": 0.2229, "step": 2161 }, { "epoch": 0.15400505752039034, "grad_norm": 4.511390209197998, "learning_rate": 1.6611527439904934e-05, "loss": 0.6006, "step": 2162 }, { "epoch": 0.15407629020194466, "grad_norm": 3.0297770500183105, "learning_rate": 1.6608579963134067e-05, "loss": 0.5255, "step": 2163 }, { "epoch": 0.15414752288349895, "grad_norm": 3.449495792388916, "learning_rate": 1.6605631466727365e-05, "loss": 0.4456, "step": 2164 }, { "epoch": 0.15421875556505324, "grad_norm": 3.2772979736328125, "learning_rate": 1.6602681951139752e-05, "loss": 0.7563, "step": 2165 }, { "epoch": 0.15428998824660756, "grad_norm": 4.095283508300781, "learning_rate": 1.659973141682631e-05, "loss": 0.4776, "step": 2166 }, { "epoch": 0.15436122092816185, "grad_norm": 2.7068302631378174, "learning_rate": 1.6596779864242274e-05, "loss": 0.5676, "step": 2167 }, { "epoch": 0.15443245360971614, "grad_norm": 3.8394272327423096, "learning_rate": 1.659382729384304e-05, "loss": 0.511, "step": 2168 }, { "epoch": 0.15450368629127043, "grad_norm": 3.297064781188965, "learning_rate": 1.6590873706084158e-05, "loss": 0.4003, "step": 2169 }, { "epoch": 0.15457491897282474, "grad_norm": 3.408616781234741, "learning_rate": 1.6587919101421333e-05, "loss": 0.5499, "step": 2170 }, { "epoch": 0.15464615165437903, "grad_norm": 3.206162929534912, "learning_rate": 1.6584963480310433e-05, "loss": 0.5663, "step": 2171 }, { "epoch": 0.15471738433593332, "grad_norm": 2.2374284267425537, "learning_rate": 1.658200684320748e-05, "loss": 0.3604, "step": 2172 }, { "epoch": 0.1547886170174876, "grad_norm": 2.7870981693267822, "learning_rate": 1.6579049190568656e-05, "loss": 0.4915, "step": 2173 }, { "epoch": 0.15485984969904193, "grad_norm": 2.9755587577819824, "learning_rate": 1.6576090522850292e-05, "loss": 0.6387, "step": 2174 }, { "epoch": 0.15493108238059622, "grad_norm": 2.0574352741241455, "learning_rate": 1.657313084050888e-05, "loss": 0.1231, "step": 2175 }, { "epoch": 0.1550023150621505, "grad_norm": 2.1793410778045654, "learning_rate": 1.6570170144001067e-05, "loss": 0.1799, "step": 2176 }, { "epoch": 0.1550735477437048, "grad_norm": 4.628292083740234, "learning_rate": 1.6567208433783666e-05, "loss": 0.8443, "step": 2177 }, { "epoch": 0.15514478042525912, "grad_norm": 3.2543203830718994, "learning_rate": 1.656424571031363e-05, "loss": 0.4428, "step": 2178 }, { "epoch": 0.1552160131068134, "grad_norm": 2.9102799892425537, "learning_rate": 1.656128197404808e-05, "loss": 0.7111, "step": 2179 }, { "epoch": 0.1552872457883677, "grad_norm": 2.591869831085205, "learning_rate": 1.655831722544429e-05, "loss": 0.2772, "step": 2180 }, { "epoch": 0.155358478469922, "grad_norm": 6.135251522064209, "learning_rate": 1.655535146495969e-05, "loss": 0.1006, "step": 2181 }, { "epoch": 0.1554297111514763, "grad_norm": 4.5213541984558105, "learning_rate": 1.655238469305186e-05, "loss": 0.3801, "step": 2182 }, { "epoch": 0.1555009438330306, "grad_norm": 4.533535003662109, "learning_rate": 1.6549416910178554e-05, "loss": 0.3779, "step": 2183 }, { "epoch": 0.15557217651458488, "grad_norm": 6.215088844299316, "learning_rate": 1.6546448116797664e-05, "loss": 0.4421, "step": 2184 }, { "epoch": 0.1556434091961392, "grad_norm": 2.790968656539917, "learning_rate": 1.6543478313367244e-05, "loss": 0.5802, "step": 2185 }, { "epoch": 0.1557146418776935, "grad_norm": 4.220832347869873, "learning_rate": 1.6540507500345507e-05, "loss": 0.5106, "step": 2186 }, { "epoch": 0.15578587455924778, "grad_norm": 3.9066812992095947, "learning_rate": 1.6537535678190815e-05, "loss": 0.7516, "step": 2187 }, { "epoch": 0.15585710724080207, "grad_norm": 3.549044609069824, "learning_rate": 1.6534562847361693e-05, "loss": 0.7376, "step": 2188 }, { "epoch": 0.15592833992235638, "grad_norm": 2.076472282409668, "learning_rate": 1.6531589008316816e-05, "loss": 0.2056, "step": 2189 }, { "epoch": 0.15599957260391067, "grad_norm": 4.000905513763428, "learning_rate": 1.6528614161515015e-05, "loss": 0.5402, "step": 2190 }, { "epoch": 0.15607080528546496, "grad_norm": 3.042762279510498, "learning_rate": 1.6525638307415284e-05, "loss": 0.4633, "step": 2191 }, { "epoch": 0.15614203796701928, "grad_norm": 4.0870208740234375, "learning_rate": 1.6522661446476762e-05, "loss": 0.5247, "step": 2192 }, { "epoch": 0.15621327064857357, "grad_norm": 3.8520047664642334, "learning_rate": 1.651968357915875e-05, "loss": 0.251, "step": 2193 }, { "epoch": 0.15628450333012786, "grad_norm": 2.2324278354644775, "learning_rate": 1.6516704705920702e-05, "loss": 0.2636, "step": 2194 }, { "epoch": 0.15635573601168215, "grad_norm": 3.427581787109375, "learning_rate": 1.6513724827222225e-05, "loss": 0.6492, "step": 2195 }, { "epoch": 0.15642696869323647, "grad_norm": 3.0478599071502686, "learning_rate": 1.6510743943523084e-05, "loss": 0.4681, "step": 2196 }, { "epoch": 0.15649820137479076, "grad_norm": 4.2298455238342285, "learning_rate": 1.6507762055283202e-05, "loss": 0.6836, "step": 2197 }, { "epoch": 0.15656943405634505, "grad_norm": 2.7147583961486816, "learning_rate": 1.6504779162962655e-05, "loss": 0.3804, "step": 2198 }, { "epoch": 0.15664066673789934, "grad_norm": 3.457833766937256, "learning_rate": 1.6501795267021666e-05, "loss": 0.7746, "step": 2199 }, { "epoch": 0.15671189941945365, "grad_norm": 4.625402450561523, "learning_rate": 1.6498810367920622e-05, "loss": 0.5574, "step": 2200 }, { "epoch": 0.15678313210100794, "grad_norm": 3.8126280307769775, "learning_rate": 1.6495824466120067e-05, "loss": 0.6855, "step": 2201 }, { "epoch": 0.15685436478256223, "grad_norm": 3.7745237350463867, "learning_rate": 1.649283756208069e-05, "loss": 0.7197, "step": 2202 }, { "epoch": 0.15692559746411652, "grad_norm": 3.1364023685455322, "learning_rate": 1.6489849656263336e-05, "loss": 0.5904, "step": 2203 }, { "epoch": 0.15699683014567084, "grad_norm": 2.5418217182159424, "learning_rate": 1.6486860749129014e-05, "loss": 0.6319, "step": 2204 }, { "epoch": 0.15706806282722513, "grad_norm": 3.5167572498321533, "learning_rate": 1.6483870841138883e-05, "loss": 0.5309, "step": 2205 }, { "epoch": 0.15713929550877942, "grad_norm": 9.44259262084961, "learning_rate": 1.648087993275425e-05, "loss": 0.6637, "step": 2206 }, { "epoch": 0.15721052819033374, "grad_norm": 3.0112509727478027, "learning_rate": 1.6477888024436586e-05, "loss": 0.6823, "step": 2207 }, { "epoch": 0.15728176087188803, "grad_norm": 3.0174224376678467, "learning_rate": 1.6474895116647506e-05, "loss": 0.5454, "step": 2208 }, { "epoch": 0.15735299355344232, "grad_norm": 3.620011329650879, "learning_rate": 1.647190120984879e-05, "loss": 0.7686, "step": 2209 }, { "epoch": 0.1574242262349966, "grad_norm": 2.5304646492004395, "learning_rate": 1.6468906304502365e-05, "loss": 0.6026, "step": 2210 }, { "epoch": 0.15749545891655092, "grad_norm": 3.940138339996338, "learning_rate": 1.6465910401070312e-05, "loss": 0.6793, "step": 2211 }, { "epoch": 0.1575666915981052, "grad_norm": 2.946047067642212, "learning_rate": 1.6462913500014872e-05, "loss": 0.5269, "step": 2212 }, { "epoch": 0.1576379242796595, "grad_norm": 3.6002628803253174, "learning_rate": 1.6459915601798436e-05, "loss": 0.728, "step": 2213 }, { "epoch": 0.1577091569612138, "grad_norm": 3.896296977996826, "learning_rate": 1.6456916706883542e-05, "loss": 0.1106, "step": 2214 }, { "epoch": 0.1577803896427681, "grad_norm": 3.0729291439056396, "learning_rate": 1.64539168157329e-05, "loss": 0.5009, "step": 2215 }, { "epoch": 0.1578516223243224, "grad_norm": 5.609020233154297, "learning_rate": 1.645091592880935e-05, "loss": 0.5474, "step": 2216 }, { "epoch": 0.1579228550058767, "grad_norm": 2.535051107406616, "learning_rate": 1.6447914046575906e-05, "loss": 0.4806, "step": 2217 }, { "epoch": 0.157994087687431, "grad_norm": 2.6236772537231445, "learning_rate": 1.6444911169495727e-05, "loss": 0.4278, "step": 2218 }, { "epoch": 0.1580653203689853, "grad_norm": 2.5267598628997803, "learning_rate": 1.644190729803212e-05, "loss": 0.5591, "step": 2219 }, { "epoch": 0.15813655305053959, "grad_norm": 2.7284586429595947, "learning_rate": 1.6438902432648558e-05, "loss": 0.732, "step": 2220 }, { "epoch": 0.15820778573209388, "grad_norm": 4.032872676849365, "learning_rate": 1.643589657380866e-05, "loss": 0.5916, "step": 2221 }, { "epoch": 0.1582790184136482, "grad_norm": 5.524267196655273, "learning_rate": 1.6432889721976196e-05, "loss": 0.5046, "step": 2222 }, { "epoch": 0.15835025109520248, "grad_norm": 2.47912859916687, "learning_rate": 1.6429881877615094e-05, "loss": 0.4117, "step": 2223 }, { "epoch": 0.15842148377675677, "grad_norm": 2.2781307697296143, "learning_rate": 1.642687304118943e-05, "loss": 0.6056, "step": 2224 }, { "epoch": 0.15849271645831106, "grad_norm": 4.91823148727417, "learning_rate": 1.6423863213163443e-05, "loss": 0.8537, "step": 2225 }, { "epoch": 0.15856394913986538, "grad_norm": 3.0923054218292236, "learning_rate": 1.642085239400152e-05, "loss": 0.336, "step": 2226 }, { "epoch": 0.15863518182141967, "grad_norm": 2.2640459537506104, "learning_rate": 1.6417840584168185e-05, "loss": 0.2842, "step": 2227 }, { "epoch": 0.15870641450297396, "grad_norm": 3.037214756011963, "learning_rate": 1.6414827784128145e-05, "loss": 0.6539, "step": 2228 }, { "epoch": 0.15877764718452825, "grad_norm": 3.513546943664551, "learning_rate": 1.6411813994346237e-05, "loss": 0.492, "step": 2229 }, { "epoch": 0.15884887986608257, "grad_norm": 3.583009958267212, "learning_rate": 1.640879921528746e-05, "loss": 0.5899, "step": 2230 }, { "epoch": 0.15892011254763685, "grad_norm": 2.321423053741455, "learning_rate": 1.640578344741696e-05, "loss": 0.5233, "step": 2231 }, { "epoch": 0.15899134522919114, "grad_norm": 3.1781458854675293, "learning_rate": 1.640276669120004e-05, "loss": 0.5305, "step": 2232 }, { "epoch": 0.15906257791074546, "grad_norm": 3.7390356063842773, "learning_rate": 1.6399748947102154e-05, "loss": 0.8009, "step": 2233 }, { "epoch": 0.15913381059229975, "grad_norm": 3.386319398880005, "learning_rate": 1.6396730215588913e-05, "loss": 0.5496, "step": 2234 }, { "epoch": 0.15920504327385404, "grad_norm": 2.1616525650024414, "learning_rate": 1.6393710497126075e-05, "loss": 0.2174, "step": 2235 }, { "epoch": 0.15927627595540833, "grad_norm": 2.5637824535369873, "learning_rate": 1.6390689792179546e-05, "loss": 0.297, "step": 2236 }, { "epoch": 0.15934750863696265, "grad_norm": 3.3369834423065186, "learning_rate": 1.6387668101215397e-05, "loss": 0.6835, "step": 2237 }, { "epoch": 0.15941874131851694, "grad_norm": 3.5377488136291504, "learning_rate": 1.6384645424699835e-05, "loss": 0.8034, "step": 2238 }, { "epoch": 0.15948997400007123, "grad_norm": 3.9509847164154053, "learning_rate": 1.638162176309924e-05, "loss": 0.2442, "step": 2239 }, { "epoch": 0.15956120668162552, "grad_norm": 1.9483323097229004, "learning_rate": 1.637859711688012e-05, "loss": 0.3733, "step": 2240 }, { "epoch": 0.15963243936317983, "grad_norm": 3.390495538711548, "learning_rate": 1.637557148650915e-05, "loss": 0.4763, "step": 2241 }, { "epoch": 0.15970367204473412, "grad_norm": 2.525059700012207, "learning_rate": 1.637254487245316e-05, "loss": 0.5926, "step": 2242 }, { "epoch": 0.1597749047262884, "grad_norm": 2.5816521644592285, "learning_rate": 1.636951727517912e-05, "loss": 0.4447, "step": 2243 }, { "epoch": 0.15984613740784273, "grad_norm": 2.742842435836792, "learning_rate": 1.6366488695154153e-05, "loss": 0.3791, "step": 2244 }, { "epoch": 0.15991737008939702, "grad_norm": 1.8328619003295898, "learning_rate": 1.636345913284555e-05, "loss": 0.2444, "step": 2245 }, { "epoch": 0.1599886027709513, "grad_norm": 3.023804187774658, "learning_rate": 1.636042858872073e-05, "loss": 0.4685, "step": 2246 }, { "epoch": 0.1600598354525056, "grad_norm": 3.3211669921875, "learning_rate": 1.6357397063247278e-05, "loss": 0.1119, "step": 2247 }, { "epoch": 0.16013106813405992, "grad_norm": 3.515697956085205, "learning_rate": 1.6354364556892926e-05, "loss": 0.749, "step": 2248 }, { "epoch": 0.1602023008156142, "grad_norm": 4.325363636016846, "learning_rate": 1.6351331070125565e-05, "loss": 0.775, "step": 2249 }, { "epoch": 0.1602735334971685, "grad_norm": 3.1608924865722656, "learning_rate": 1.634829660341322e-05, "loss": 0.4568, "step": 2250 }, { "epoch": 0.1603447661787228, "grad_norm": 3.783487319946289, "learning_rate": 1.6345261157224088e-05, "loss": 0.5434, "step": 2251 }, { "epoch": 0.1604159988602771, "grad_norm": 5.263415813446045, "learning_rate": 1.6342224732026503e-05, "loss": 0.5348, "step": 2252 }, { "epoch": 0.1604872315418314, "grad_norm": 3.514448404312134, "learning_rate": 1.6339187328288953e-05, "loss": 0.2762, "step": 2253 }, { "epoch": 0.16055846422338568, "grad_norm": 4.080444812774658, "learning_rate": 1.633614894648008e-05, "loss": 0.8157, "step": 2254 }, { "epoch": 0.16062969690494, "grad_norm": 2.9644935131073, "learning_rate": 1.6333109587068675e-05, "loss": 0.6697, "step": 2255 }, { "epoch": 0.1607009295864943, "grad_norm": 2.4699490070343018, "learning_rate": 1.6330069250523675e-05, "loss": 0.4158, "step": 2256 }, { "epoch": 0.16077216226804858, "grad_norm": 1.6674588918685913, "learning_rate": 1.6327027937314183e-05, "loss": 0.1259, "step": 2257 }, { "epoch": 0.16084339494960287, "grad_norm": 4.185678482055664, "learning_rate": 1.632398564790943e-05, "loss": 0.4087, "step": 2258 }, { "epoch": 0.1609146276311572, "grad_norm": 5.784289360046387, "learning_rate": 1.632094238277882e-05, "loss": 0.6124, "step": 2259 }, { "epoch": 0.16098586031271148, "grad_norm": 4.591983795166016, "learning_rate": 1.631789814239189e-05, "loss": 0.8387, "step": 2260 }, { "epoch": 0.16105709299426577, "grad_norm": 3.955702304840088, "learning_rate": 1.631485292721834e-05, "loss": 0.4201, "step": 2261 }, { "epoch": 0.16112832567582006, "grad_norm": 3.6439266204833984, "learning_rate": 1.6311806737728016e-05, "loss": 0.5807, "step": 2262 }, { "epoch": 0.16119955835737437, "grad_norm": 3.7127487659454346, "learning_rate": 1.630875957439091e-05, "loss": 0.5422, "step": 2263 }, { "epoch": 0.16127079103892866, "grad_norm": 7.912548065185547, "learning_rate": 1.6305711437677166e-05, "loss": 0.1646, "step": 2264 }, { "epoch": 0.16134202372048295, "grad_norm": 2.4339797496795654, "learning_rate": 1.630266232805709e-05, "loss": 0.3731, "step": 2265 }, { "epoch": 0.16141325640203724, "grad_norm": 2.9527835845947266, "learning_rate": 1.6299612246001118e-05, "loss": 0.1236, "step": 2266 }, { "epoch": 0.16148448908359156, "grad_norm": 3.214245557785034, "learning_rate": 1.6296561191979847e-05, "loss": 0.5735, "step": 2267 }, { "epoch": 0.16155572176514585, "grad_norm": 4.132042407989502, "learning_rate": 1.629350916646403e-05, "loss": 0.4058, "step": 2268 }, { "epoch": 0.16162695444670014, "grad_norm": 3.146714448928833, "learning_rate": 1.629045616992456e-05, "loss": 0.5515, "step": 2269 }, { "epoch": 0.16169818712825446, "grad_norm": 3.502908706665039, "learning_rate": 1.628740220283248e-05, "loss": 0.3879, "step": 2270 }, { "epoch": 0.16176941980980875, "grad_norm": 1.8348361253738403, "learning_rate": 1.6284347265658986e-05, "loss": 0.1423, "step": 2271 }, { "epoch": 0.16184065249136304, "grad_norm": 2.6412014961242676, "learning_rate": 1.6281291358875427e-05, "loss": 0.5049, "step": 2272 }, { "epoch": 0.16191188517291732, "grad_norm": 6.513573169708252, "learning_rate": 1.6278234482953296e-05, "loss": 0.6322, "step": 2273 }, { "epoch": 0.16198311785447164, "grad_norm": 3.310540199279785, "learning_rate": 1.627517663836424e-05, "loss": 0.5217, "step": 2274 }, { "epoch": 0.16205435053602593, "grad_norm": 1.308848261833191, "learning_rate": 1.627211782558005e-05, "loss": 0.1469, "step": 2275 }, { "epoch": 0.16212558321758022, "grad_norm": 3.3199336528778076, "learning_rate": 1.6269058045072664e-05, "loss": 0.6778, "step": 2276 }, { "epoch": 0.1621968158991345, "grad_norm": 2.7188217639923096, "learning_rate": 1.626599729731419e-05, "loss": 0.6053, "step": 2277 }, { "epoch": 0.16226804858068883, "grad_norm": 3.8557350635528564, "learning_rate": 1.626293558277685e-05, "loss": 0.6886, "step": 2278 }, { "epoch": 0.16233928126224312, "grad_norm": 3.6800763607025146, "learning_rate": 1.6259872901933052e-05, "loss": 0.5364, "step": 2279 }, { "epoch": 0.1624105139437974, "grad_norm": 5.174066066741943, "learning_rate": 1.6256809255255328e-05, "loss": 0.8163, "step": 2280 }, { "epoch": 0.16248174662535173, "grad_norm": 4.39872932434082, "learning_rate": 1.625374464321637e-05, "loss": 0.3936, "step": 2281 }, { "epoch": 0.16255297930690601, "grad_norm": 3.253690004348755, "learning_rate": 1.6250679066289015e-05, "loss": 0.4997, "step": 2282 }, { "epoch": 0.1626242119884603, "grad_norm": 3.0169413089752197, "learning_rate": 1.624761252494625e-05, "loss": 0.156, "step": 2283 }, { "epoch": 0.1626954446700146, "grad_norm": 3.118818998336792, "learning_rate": 1.6244545019661203e-05, "loss": 0.5617, "step": 2284 }, { "epoch": 0.1627666773515689, "grad_norm": 2.197984218597412, "learning_rate": 1.624147655090717e-05, "loss": 0.3503, "step": 2285 }, { "epoch": 0.1628379100331232, "grad_norm": 3.344980001449585, "learning_rate": 1.6238407119157586e-05, "loss": 0.5155, "step": 2286 }, { "epoch": 0.1629091427146775, "grad_norm": 4.289247512817383, "learning_rate": 1.623533672488602e-05, "loss": 0.6227, "step": 2287 }, { "epoch": 0.16298037539623178, "grad_norm": 3.436316967010498, "learning_rate": 1.623226536856621e-05, "loss": 0.3511, "step": 2288 }, { "epoch": 0.1630516080777861, "grad_norm": 7.1041693687438965, "learning_rate": 1.6229193050672036e-05, "loss": 0.1444, "step": 2289 }, { "epoch": 0.1631228407593404, "grad_norm": 4.82064962387085, "learning_rate": 1.6226119771677517e-05, "loss": 0.5377, "step": 2290 }, { "epoch": 0.16319407344089468, "grad_norm": 3.239898681640625, "learning_rate": 1.6223045532056838e-05, "loss": 0.4971, "step": 2291 }, { "epoch": 0.16326530612244897, "grad_norm": 3.261049509048462, "learning_rate": 1.6219970332284322e-05, "loss": 0.5447, "step": 2292 }, { "epoch": 0.16333653880400328, "grad_norm": 4.070466995239258, "learning_rate": 1.621689417283443e-05, "loss": 0.52, "step": 2293 }, { "epoch": 0.16340777148555757, "grad_norm": 5.887040615081787, "learning_rate": 1.621381705418179e-05, "loss": 0.4819, "step": 2294 }, { "epoch": 0.16347900416711186, "grad_norm": 3.79129958152771, "learning_rate": 1.6210738976801174e-05, "loss": 0.7492, "step": 2295 }, { "epoch": 0.16355023684866618, "grad_norm": 3.5290513038635254, "learning_rate": 1.6207659941167485e-05, "loss": 0.5836, "step": 2296 }, { "epoch": 0.16362146953022047, "grad_norm": 4.3891825675964355, "learning_rate": 1.62045799477558e-05, "loss": 0.3076, "step": 2297 }, { "epoch": 0.16369270221177476, "grad_norm": 2.5199294090270996, "learning_rate": 1.620149899704132e-05, "loss": 0.2558, "step": 2298 }, { "epoch": 0.16376393489332905, "grad_norm": 2.63261342048645, "learning_rate": 1.619841708949941e-05, "loss": 0.5044, "step": 2299 }, { "epoch": 0.16383516757488337, "grad_norm": 2.2541208267211914, "learning_rate": 1.619533422560557e-05, "loss": 0.3445, "step": 2300 }, { "epoch": 0.16390640025643766, "grad_norm": 2.20526123046875, "learning_rate": 1.619225040583546e-05, "loss": 0.1532, "step": 2301 }, { "epoch": 0.16397763293799195, "grad_norm": 2.4851648807525635, "learning_rate": 1.618916563066488e-05, "loss": 0.1844, "step": 2302 }, { "epoch": 0.16404886561954624, "grad_norm": 2.772275924682617, "learning_rate": 1.6186079900569787e-05, "loss": 0.4283, "step": 2303 }, { "epoch": 0.16412009830110055, "grad_norm": 3.310789108276367, "learning_rate": 1.618299321602626e-05, "loss": 0.4787, "step": 2304 }, { "epoch": 0.16419133098265484, "grad_norm": 3.516406774520874, "learning_rate": 1.617990557751056e-05, "loss": 0.9038, "step": 2305 }, { "epoch": 0.16426256366420913, "grad_norm": 3.8913826942443848, "learning_rate": 1.6176816985499068e-05, "loss": 0.8491, "step": 2306 }, { "epoch": 0.16433379634576345, "grad_norm": 2.86397123336792, "learning_rate": 1.6173727440468318e-05, "loss": 0.5771, "step": 2307 }, { "epoch": 0.16440502902731774, "grad_norm": 5.622960090637207, "learning_rate": 1.6170636942895006e-05, "loss": 0.5802, "step": 2308 }, { "epoch": 0.16447626170887203, "grad_norm": 2.8592216968536377, "learning_rate": 1.616754549325596e-05, "loss": 0.5137, "step": 2309 }, { "epoch": 0.16454749439042632, "grad_norm": 2.774587392807007, "learning_rate": 1.6164453092028157e-05, "loss": 0.3403, "step": 2310 }, { "epoch": 0.16461872707198064, "grad_norm": 3.0607378482818604, "learning_rate": 1.616135973968872e-05, "loss": 0.7971, "step": 2311 }, { "epoch": 0.16468995975353493, "grad_norm": 2.8912527561187744, "learning_rate": 1.615826543671493e-05, "loss": 0.5302, "step": 2312 }, { "epoch": 0.16476119243508922, "grad_norm": 2.946227788925171, "learning_rate": 1.6155170183584195e-05, "loss": 0.1635, "step": 2313 }, { "epoch": 0.1648324251166435, "grad_norm": 3.5270121097564697, "learning_rate": 1.6152073980774093e-05, "loss": 0.6096, "step": 2314 }, { "epoch": 0.16490365779819782, "grad_norm": 2.8414652347564697, "learning_rate": 1.6148976828762326e-05, "loss": 0.4288, "step": 2315 }, { "epoch": 0.1649748904797521, "grad_norm": 4.224698543548584, "learning_rate": 1.6145878728026757e-05, "loss": 0.4158, "step": 2316 }, { "epoch": 0.1650461231613064, "grad_norm": 3.8345460891723633, "learning_rate": 1.6142779679045392e-05, "loss": 0.7412, "step": 2317 }, { "epoch": 0.1651173558428607, "grad_norm": 2.6474907398223877, "learning_rate": 1.613967968229638e-05, "loss": 0.5766, "step": 2318 }, { "epoch": 0.165188588524415, "grad_norm": 2.478878974914551, "learning_rate": 1.613657873825802e-05, "loss": 0.2693, "step": 2319 }, { "epoch": 0.1652598212059693, "grad_norm": 3.4527628421783447, "learning_rate": 1.6133476847408754e-05, "loss": 0.6366, "step": 2320 }, { "epoch": 0.1653310538875236, "grad_norm": 3.1149981021881104, "learning_rate": 1.6130374010227174e-05, "loss": 0.1648, "step": 2321 }, { "epoch": 0.1654022865690779, "grad_norm": 8.024595260620117, "learning_rate": 1.6127270227192012e-05, "loss": 0.5404, "step": 2322 }, { "epoch": 0.1654735192506322, "grad_norm": 3.013279438018799, "learning_rate": 1.6124165498782156e-05, "loss": 0.4463, "step": 2323 }, { "epoch": 0.16554475193218648, "grad_norm": 3.045224189758301, "learning_rate": 1.612105982547663e-05, "loss": 0.5461, "step": 2324 }, { "epoch": 0.16561598461374077, "grad_norm": 2.0537188053131104, "learning_rate": 1.6117953207754605e-05, "loss": 0.3128, "step": 2325 }, { "epoch": 0.1656872172952951, "grad_norm": 5.990727424621582, "learning_rate": 1.611484564609541e-05, "loss": 0.7757, "step": 2326 }, { "epoch": 0.16575844997684938, "grad_norm": 2.1801986694335938, "learning_rate": 1.6111737140978495e-05, "loss": 0.5651, "step": 2327 }, { "epoch": 0.16582968265840367, "grad_norm": 2.7376906871795654, "learning_rate": 1.610862769288348e-05, "loss": 0.3237, "step": 2328 }, { "epoch": 0.16590091533995796, "grad_norm": 4.7759575843811035, "learning_rate": 1.6105517302290118e-05, "loss": 0.6447, "step": 2329 }, { "epoch": 0.16597214802151228, "grad_norm": 2.670071840286255, "learning_rate": 1.6102405969678314e-05, "loss": 0.5259, "step": 2330 }, { "epoch": 0.16604338070306657, "grad_norm": 8.530394554138184, "learning_rate": 1.609929369552811e-05, "loss": 0.7537, "step": 2331 }, { "epoch": 0.16611461338462086, "grad_norm": 3.8778464794158936, "learning_rate": 1.6096180480319698e-05, "loss": 0.689, "step": 2332 }, { "epoch": 0.16618584606617517, "grad_norm": 3.620866060256958, "learning_rate": 1.6093066324533413e-05, "loss": 0.5258, "step": 2333 }, { "epoch": 0.16625707874772946, "grad_norm": 3.0608983039855957, "learning_rate": 1.608995122864975e-05, "loss": 0.4989, "step": 2334 }, { "epoch": 0.16632831142928375, "grad_norm": 3.3945629596710205, "learning_rate": 1.6086835193149318e-05, "loss": 0.4517, "step": 2335 }, { "epoch": 0.16639954411083804, "grad_norm": 2.77042293548584, "learning_rate": 1.6083718218512904e-05, "loss": 0.8367, "step": 2336 }, { "epoch": 0.16647077679239236, "grad_norm": 4.649474620819092, "learning_rate": 1.6080600305221417e-05, "loss": 0.4247, "step": 2337 }, { "epoch": 0.16654200947394665, "grad_norm": 2.722146987915039, "learning_rate": 1.607748145375592e-05, "loss": 0.3279, "step": 2338 }, { "epoch": 0.16661324215550094, "grad_norm": 3.1799473762512207, "learning_rate": 1.607436166459762e-05, "loss": 0.4365, "step": 2339 }, { "epoch": 0.16668447483705523, "grad_norm": 9.902547836303711, "learning_rate": 1.607124093822787e-05, "loss": 0.2878, "step": 2340 }, { "epoch": 0.16675570751860955, "grad_norm": 4.096104621887207, "learning_rate": 1.6068119275128165e-05, "loss": 0.7848, "step": 2341 }, { "epoch": 0.16682694020016384, "grad_norm": 5.326472282409668, "learning_rate": 1.6064996675780146e-05, "loss": 0.8684, "step": 2342 }, { "epoch": 0.16689817288171813, "grad_norm": 1.7886748313903809, "learning_rate": 1.60618731406656e-05, "loss": 0.2277, "step": 2343 }, { "epoch": 0.16696940556327242, "grad_norm": 2.7573623657226562, "learning_rate": 1.6058748670266445e-05, "loss": 0.3847, "step": 2344 }, { "epoch": 0.16704063824482673, "grad_norm": 3.835264205932617, "learning_rate": 1.605562326506477e-05, "loss": 0.5962, "step": 2345 }, { "epoch": 0.16711187092638102, "grad_norm": 4.60938835144043, "learning_rate": 1.6052496925542786e-05, "loss": 0.3879, "step": 2346 }, { "epoch": 0.1671831036079353, "grad_norm": 3.9318830966949463, "learning_rate": 1.6049369652182855e-05, "loss": 0.4367, "step": 2347 }, { "epoch": 0.16725433628948963, "grad_norm": 2.0089447498321533, "learning_rate": 1.604624144546748e-05, "loss": 0.3356, "step": 2348 }, { "epoch": 0.16732556897104392, "grad_norm": 2.8252832889556885, "learning_rate": 1.6043112305879317e-05, "loss": 0.6825, "step": 2349 }, { "epoch": 0.1673968016525982, "grad_norm": 3.3057377338409424, "learning_rate": 1.6039982233901155e-05, "loss": 0.4718, "step": 2350 }, { "epoch": 0.1674680343341525, "grad_norm": 1.9275517463684082, "learning_rate": 1.6036851230015935e-05, "loss": 0.3834, "step": 2351 }, { "epoch": 0.16753926701570682, "grad_norm": 2.7420997619628906, "learning_rate": 1.603371929470674e-05, "loss": 0.3649, "step": 2352 }, { "epoch": 0.1676104996972611, "grad_norm": 2.0865318775177, "learning_rate": 1.603058642845679e-05, "loss": 0.3637, "step": 2353 }, { "epoch": 0.1676817323788154, "grad_norm": 3.331354856491089, "learning_rate": 1.6027452631749458e-05, "loss": 0.4661, "step": 2354 }, { "epoch": 0.16775296506036969, "grad_norm": 3.9987635612487793, "learning_rate": 1.6024317905068255e-05, "loss": 0.4008, "step": 2355 }, { "epoch": 0.167824197741924, "grad_norm": 2.8840689659118652, "learning_rate": 1.602118224889684e-05, "loss": 0.6047, "step": 2356 }, { "epoch": 0.1678954304234783, "grad_norm": 4.5788397789001465, "learning_rate": 1.601804566371901e-05, "loss": 0.6815, "step": 2357 }, { "epoch": 0.16796666310503258, "grad_norm": 3.1403539180755615, "learning_rate": 1.6014908150018703e-05, "loss": 0.2408, "step": 2358 }, { "epoch": 0.1680378957865869, "grad_norm": 2.692378044128418, "learning_rate": 1.601176970828002e-05, "loss": 0.3263, "step": 2359 }, { "epoch": 0.1681091284681412, "grad_norm": 4.956105709075928, "learning_rate": 1.6008630338987173e-05, "loss": 0.1842, "step": 2360 }, { "epoch": 0.16818036114969548, "grad_norm": 5.534692287445068, "learning_rate": 1.600549004262454e-05, "loss": 0.494, "step": 2361 }, { "epoch": 0.16825159383124977, "grad_norm": 3.215830087661743, "learning_rate": 1.600234881967664e-05, "loss": 0.7025, "step": 2362 }, { "epoch": 0.16832282651280409, "grad_norm": 4.682371616363525, "learning_rate": 1.599920667062813e-05, "loss": 0.6317, "step": 2363 }, { "epoch": 0.16839405919435838, "grad_norm": 6.931369781494141, "learning_rate": 1.5996063595963813e-05, "loss": 0.735, "step": 2364 }, { "epoch": 0.16846529187591266, "grad_norm": 4.543053150177002, "learning_rate": 1.599291959616863e-05, "loss": 0.7021, "step": 2365 }, { "epoch": 0.16853652455746695, "grad_norm": 3.6910719871520996, "learning_rate": 1.5989774671727664e-05, "loss": 0.6513, "step": 2366 }, { "epoch": 0.16860775723902127, "grad_norm": 3.6904873847961426, "learning_rate": 1.598662882312615e-05, "loss": 0.3636, "step": 2367 }, { "epoch": 0.16867898992057556, "grad_norm": 4.5044779777526855, "learning_rate": 1.5983482050849462e-05, "loss": 0.5445, "step": 2368 }, { "epoch": 0.16875022260212985, "grad_norm": 2.6553239822387695, "learning_rate": 1.598033435538311e-05, "loss": 0.2595, "step": 2369 }, { "epoch": 0.16882145528368414, "grad_norm": 3.682990550994873, "learning_rate": 1.5977185737212756e-05, "loss": 0.6577, "step": 2370 }, { "epoch": 0.16889268796523846, "grad_norm": 2.136679172515869, "learning_rate": 1.597403619682419e-05, "loss": 0.2569, "step": 2371 }, { "epoch": 0.16896392064679275, "grad_norm": 2.3634512424468994, "learning_rate": 1.5970885734703363e-05, "loss": 0.3554, "step": 2372 }, { "epoch": 0.16903515332834704, "grad_norm": 2.9858040809631348, "learning_rate": 1.5967734351336354e-05, "loss": 0.5387, "step": 2373 }, { "epoch": 0.16910638600990135, "grad_norm": 2.919058084487915, "learning_rate": 1.5964582047209392e-05, "loss": 0.4792, "step": 2374 }, { "epoch": 0.16917761869145564, "grad_norm": 2.6900477409362793, "learning_rate": 1.596142882280884e-05, "loss": 0.5856, "step": 2375 }, { "epoch": 0.16924885137300993, "grad_norm": 2.3560800552368164, "learning_rate": 1.5958274678621217e-05, "loss": 0.455, "step": 2376 }, { "epoch": 0.16932008405456422, "grad_norm": 5.895632266998291, "learning_rate": 1.5955119615133163e-05, "loss": 0.4548, "step": 2377 }, { "epoch": 0.16939131673611854, "grad_norm": 4.634343147277832, "learning_rate": 1.5951963632831482e-05, "loss": 0.7069, "step": 2378 }, { "epoch": 0.16946254941767283, "grad_norm": 3.6215527057647705, "learning_rate": 1.5948806732203105e-05, "loss": 0.2173, "step": 2379 }, { "epoch": 0.16953378209922712, "grad_norm": 3.927238702774048, "learning_rate": 1.594564891373511e-05, "loss": 0.9145, "step": 2380 }, { "epoch": 0.1696050147807814, "grad_norm": 2.562329053878784, "learning_rate": 1.5942490177914715e-05, "loss": 0.1742, "step": 2381 }, { "epoch": 0.16967624746233573, "grad_norm": 8.974620819091797, "learning_rate": 1.5939330525229285e-05, "loss": 0.7379, "step": 2382 }, { "epoch": 0.16974748014389002, "grad_norm": 2.118201732635498, "learning_rate": 1.5936169956166316e-05, "loss": 0.5172, "step": 2383 }, { "epoch": 0.1698187128254443, "grad_norm": 3.455167531967163, "learning_rate": 1.593300847121345e-05, "loss": 0.6006, "step": 2384 }, { "epoch": 0.16988994550699862, "grad_norm": 7.837595462799072, "learning_rate": 1.592984607085848e-05, "loss": 0.7533, "step": 2385 }, { "epoch": 0.1699611781885529, "grad_norm": 5.265071392059326, "learning_rate": 1.5926682755589325e-05, "loss": 0.5734, "step": 2386 }, { "epoch": 0.1700324108701072, "grad_norm": 4.5972514152526855, "learning_rate": 1.5923518525894053e-05, "loss": 0.4862, "step": 2387 }, { "epoch": 0.1701036435516615, "grad_norm": 3.401005983352661, "learning_rate": 1.5920353382260876e-05, "loss": 0.6591, "step": 2388 }, { "epoch": 0.1701748762332158, "grad_norm": 2.7915098667144775, "learning_rate": 1.591718732517814e-05, "loss": 0.3748, "step": 2389 }, { "epoch": 0.1702461089147701, "grad_norm": 3.8822498321533203, "learning_rate": 1.5914020355134333e-05, "loss": 0.5183, "step": 2390 }, { "epoch": 0.1703173415963244, "grad_norm": 2.7676475048065186, "learning_rate": 1.5910852472618085e-05, "loss": 0.3511, "step": 2391 }, { "epoch": 0.17038857427787868, "grad_norm": 5.33272647857666, "learning_rate": 1.5907683678118173e-05, "loss": 0.8045, "step": 2392 }, { "epoch": 0.170459806959433, "grad_norm": 3.460752487182617, "learning_rate": 1.5904513972123507e-05, "loss": 0.3988, "step": 2393 }, { "epoch": 0.1705310396409873, "grad_norm": 2.387155532836914, "learning_rate": 1.590134335512314e-05, "loss": 0.1769, "step": 2394 }, { "epoch": 0.17060227232254158, "grad_norm": 3.6221415996551514, "learning_rate": 1.5898171827606264e-05, "loss": 0.3711, "step": 2395 }, { "epoch": 0.17067350500409587, "grad_norm": 7.504072189331055, "learning_rate": 1.5894999390062216e-05, "loss": 0.8144, "step": 2396 }, { "epoch": 0.17074473768565018, "grad_norm": 2.9467527866363525, "learning_rate": 1.5891826042980468e-05, "loss": 0.6144, "step": 2397 }, { "epoch": 0.17081597036720447, "grad_norm": 2.342944383621216, "learning_rate": 1.5888651786850638e-05, "loss": 0.4562, "step": 2398 }, { "epoch": 0.17088720304875876, "grad_norm": 3.7866294384002686, "learning_rate": 1.5885476622162478e-05, "loss": 0.3215, "step": 2399 }, { "epoch": 0.17095843573031308, "grad_norm": 3.2369332313537598, "learning_rate": 1.588230054940588e-05, "loss": 0.4617, "step": 2400 }, { "epoch": 0.17102966841186737, "grad_norm": 3.483668565750122, "learning_rate": 1.5879123569070888e-05, "loss": 0.4318, "step": 2401 }, { "epoch": 0.17110090109342166, "grad_norm": 1.7083375453948975, "learning_rate": 1.5875945681647672e-05, "loss": 0.1628, "step": 2402 }, { "epoch": 0.17117213377497595, "grad_norm": 2.3182475566864014, "learning_rate": 1.5872766887626546e-05, "loss": 0.5038, "step": 2403 }, { "epoch": 0.17124336645653027, "grad_norm": 1.873441219329834, "learning_rate": 1.5869587187497965e-05, "loss": 0.1079, "step": 2404 }, { "epoch": 0.17131459913808456, "grad_norm": 2.4465250968933105, "learning_rate": 1.586640658175253e-05, "loss": 0.3096, "step": 2405 }, { "epoch": 0.17138583181963885, "grad_norm": 2.6407663822174072, "learning_rate": 1.586322507088097e-05, "loss": 0.4519, "step": 2406 }, { "epoch": 0.17145706450119313, "grad_norm": 3.540566921234131, "learning_rate": 1.586004265537416e-05, "loss": 0.5458, "step": 2407 }, { "epoch": 0.17152829718274745, "grad_norm": 2.671980381011963, "learning_rate": 1.585685933572312e-05, "loss": 0.2258, "step": 2408 }, { "epoch": 0.17159952986430174, "grad_norm": 2.561422348022461, "learning_rate": 1.5853675112418994e-05, "loss": 0.481, "step": 2409 }, { "epoch": 0.17167076254585603, "grad_norm": 3.5996015071868896, "learning_rate": 1.5850489985953076e-05, "loss": 0.5618, "step": 2410 }, { "epoch": 0.17174199522741035, "grad_norm": 3.3198368549346924, "learning_rate": 1.5847303956816808e-05, "loss": 0.7675, "step": 2411 }, { "epoch": 0.17181322790896464, "grad_norm": 2.669048309326172, "learning_rate": 1.5844117025501753e-05, "loss": 0.521, "step": 2412 }, { "epoch": 0.17188446059051893, "grad_norm": 3.5588250160217285, "learning_rate": 1.584092919249962e-05, "loss": 0.47, "step": 2413 }, { "epoch": 0.17195569327207322, "grad_norm": 3.2618963718414307, "learning_rate": 1.583774045830227e-05, "loss": 0.7879, "step": 2414 }, { "epoch": 0.17202692595362754, "grad_norm": 5.581077575683594, "learning_rate": 1.583455082340168e-05, "loss": 0.3871, "step": 2415 }, { "epoch": 0.17209815863518182, "grad_norm": 4.561549186706543, "learning_rate": 1.583136028828998e-05, "loss": 0.3106, "step": 2416 }, { "epoch": 0.17216939131673611, "grad_norm": 4.0107645988464355, "learning_rate": 1.5828168853459445e-05, "loss": 0.5078, "step": 2417 }, { "epoch": 0.1722406239982904, "grad_norm": 2.61594820022583, "learning_rate": 1.582497651940247e-05, "loss": 0.5819, "step": 2418 }, { "epoch": 0.17231185667984472, "grad_norm": 3.5442986488342285, "learning_rate": 1.5821783286611604e-05, "loss": 0.3137, "step": 2419 }, { "epoch": 0.172383089361399, "grad_norm": 4.27548360824585, "learning_rate": 1.581858915557953e-05, "loss": 0.7619, "step": 2420 }, { "epoch": 0.1724543220429533, "grad_norm": 2.1951887607574463, "learning_rate": 1.581539412679907e-05, "loss": 0.4797, "step": 2421 }, { "epoch": 0.1725255547245076, "grad_norm": 2.654306650161743, "learning_rate": 1.581219820076318e-05, "loss": 0.4085, "step": 2422 }, { "epoch": 0.1725967874060619, "grad_norm": 3.6790456771850586, "learning_rate": 1.5809001377964966e-05, "loss": 0.9421, "step": 2423 }, { "epoch": 0.1726680200876162, "grad_norm": 3.721282482147217, "learning_rate": 1.580580365889766e-05, "loss": 0.4997, "step": 2424 }, { "epoch": 0.1727392527691705, "grad_norm": 2.9201910495758057, "learning_rate": 1.5802605044054638e-05, "loss": 0.6093, "step": 2425 }, { "epoch": 0.1728104854507248, "grad_norm": 6.8688483238220215, "learning_rate": 1.579940553392941e-05, "loss": 0.6794, "step": 2426 }, { "epoch": 0.1728817181322791, "grad_norm": 5.285643100738525, "learning_rate": 1.579620512901563e-05, "loss": 0.7141, "step": 2427 }, { "epoch": 0.17295295081383338, "grad_norm": 2.687561511993408, "learning_rate": 1.579300382980709e-05, "loss": 0.2801, "step": 2428 }, { "epoch": 0.17302418349538767, "grad_norm": 3.5451536178588867, "learning_rate": 1.5789801636797718e-05, "loss": 0.7166, "step": 2429 }, { "epoch": 0.173095416176942, "grad_norm": 3.8835396766662598, "learning_rate": 1.5786598550481573e-05, "loss": 0.4572, "step": 2430 }, { "epoch": 0.17316664885849628, "grad_norm": 4.080260276794434, "learning_rate": 1.5783394571352863e-05, "loss": 0.3154, "step": 2431 }, { "epoch": 0.17323788154005057, "grad_norm": 2.347665309906006, "learning_rate": 1.5780189699905928e-05, "loss": 0.2198, "step": 2432 }, { "epoch": 0.17330911422160486, "grad_norm": 3.024641513824463, "learning_rate": 1.577698393663525e-05, "loss": 0.5365, "step": 2433 }, { "epoch": 0.17338034690315918, "grad_norm": 7.803039073944092, "learning_rate": 1.5773777282035437e-05, "loss": 0.3094, "step": 2434 }, { "epoch": 0.17345157958471347, "grad_norm": 2.813838481903076, "learning_rate": 1.577056973660125e-05, "loss": 0.6425, "step": 2435 }, { "epoch": 0.17352281226626776, "grad_norm": 1.896451711654663, "learning_rate": 1.5767361300827577e-05, "loss": 0.3358, "step": 2436 }, { "epoch": 0.17359404494782207, "grad_norm": 3.69079852104187, "learning_rate": 1.576415197520945e-05, "loss": 0.7012, "step": 2437 }, { "epoch": 0.17366527762937636, "grad_norm": 2.3665874004364014, "learning_rate": 1.576094176024203e-05, "loss": 0.4704, "step": 2438 }, { "epoch": 0.17373651031093065, "grad_norm": 2.5908310413360596, "learning_rate": 1.5757730656420626e-05, "loss": 0.4441, "step": 2439 }, { "epoch": 0.17380774299248494, "grad_norm": 3.7593061923980713, "learning_rate": 1.575451866424067e-05, "loss": 0.5186, "step": 2440 }, { "epoch": 0.17387897567403926, "grad_norm": 4.703148365020752, "learning_rate": 1.5751305784197746e-05, "loss": 0.7007, "step": 2441 }, { "epoch": 0.17395020835559355, "grad_norm": 5.91521692276001, "learning_rate": 1.5748092016787567e-05, "loss": 0.6993, "step": 2442 }, { "epoch": 0.17402144103714784, "grad_norm": 3.4506609439849854, "learning_rate": 1.5744877362505987e-05, "loss": 0.6616, "step": 2443 }, { "epoch": 0.17409267371870213, "grad_norm": 2.583871603012085, "learning_rate": 1.5741661821848983e-05, "loss": 0.29, "step": 2444 }, { "epoch": 0.17416390640025645, "grad_norm": 1.9994337558746338, "learning_rate": 1.5738445395312694e-05, "loss": 0.3266, "step": 2445 }, { "epoch": 0.17423513908181074, "grad_norm": 6.188634395599365, "learning_rate": 1.5735228083393373e-05, "loss": 0.4637, "step": 2446 }, { "epoch": 0.17430637176336503, "grad_norm": 3.6282401084899902, "learning_rate": 1.573200988658742e-05, "loss": 0.5333, "step": 2447 }, { "epoch": 0.17437760444491932, "grad_norm": 5.21647310256958, "learning_rate": 1.572879080539137e-05, "loss": 0.4704, "step": 2448 }, { "epoch": 0.17444883712647363, "grad_norm": 2.588482618331909, "learning_rate": 1.5725570840301897e-05, "loss": 0.5531, "step": 2449 }, { "epoch": 0.17452006980802792, "grad_norm": 4.173840522766113, "learning_rate": 1.5722349991815802e-05, "loss": 0.5473, "step": 2450 }, { "epoch": 0.1745913024895822, "grad_norm": 1.7349262237548828, "learning_rate": 1.571912826043003e-05, "loss": 0.2288, "step": 2451 }, { "epoch": 0.17466253517113653, "grad_norm": 2.514021158218384, "learning_rate": 1.5715905646641666e-05, "loss": 0.4781, "step": 2452 }, { "epoch": 0.17473376785269082, "grad_norm": 2.6140239238739014, "learning_rate": 1.5712682150947926e-05, "loss": 0.3187, "step": 2453 }, { "epoch": 0.1748050005342451, "grad_norm": 4.791279315948486, "learning_rate": 1.5709457773846155e-05, "loss": 0.6874, "step": 2454 }, { "epoch": 0.1748762332157994, "grad_norm": 7.593138694763184, "learning_rate": 1.5706232515833842e-05, "loss": 0.5369, "step": 2455 }, { "epoch": 0.17494746589735372, "grad_norm": 2.66121768951416, "learning_rate": 1.5703006377408623e-05, "loss": 0.2913, "step": 2456 }, { "epoch": 0.175018698578908, "grad_norm": 2.3350276947021484, "learning_rate": 1.5699779359068248e-05, "loss": 0.2847, "step": 2457 }, { "epoch": 0.1750899312604623, "grad_norm": 1.5795358419418335, "learning_rate": 1.569655146131061e-05, "loss": 0.1805, "step": 2458 }, { "epoch": 0.17516116394201658, "grad_norm": 3.5663366317749023, "learning_rate": 1.5693322684633747e-05, "loss": 0.8214, "step": 2459 }, { "epoch": 0.1752323966235709, "grad_norm": 3.1721482276916504, "learning_rate": 1.5690093029535824e-05, "loss": 0.5409, "step": 2460 }, { "epoch": 0.1753036293051252, "grad_norm": 4.637707710266113, "learning_rate": 1.5686862496515142e-05, "loss": 0.4838, "step": 2461 }, { "epoch": 0.17537486198667948, "grad_norm": 3.9935646057128906, "learning_rate": 1.568363108607014e-05, "loss": 0.248, "step": 2462 }, { "epoch": 0.1754460946682338, "grad_norm": 2.413029909133911, "learning_rate": 1.5680398798699395e-05, "loss": 0.6546, "step": 2463 }, { "epoch": 0.1755173273497881, "grad_norm": 4.736993312835693, "learning_rate": 1.5677165634901607e-05, "loss": 0.6595, "step": 2464 }, { "epoch": 0.17558856003134238, "grad_norm": 3.130391836166382, "learning_rate": 1.567393159517563e-05, "loss": 0.1427, "step": 2465 }, { "epoch": 0.17565979271289667, "grad_norm": 1.8534352779388428, "learning_rate": 1.5670696680020433e-05, "loss": 0.3876, "step": 2466 }, { "epoch": 0.17573102539445098, "grad_norm": 2.2536799907684326, "learning_rate": 1.5667460889935138e-05, "loss": 0.5088, "step": 2467 }, { "epoch": 0.17580225807600527, "grad_norm": 3.7204370498657227, "learning_rate": 1.566422422541899e-05, "loss": 0.6814, "step": 2468 }, { "epoch": 0.17587349075755956, "grad_norm": 14.514363288879395, "learning_rate": 1.5660986686971377e-05, "loss": 0.8178, "step": 2469 }, { "epoch": 0.17594472343911385, "grad_norm": 1.9795832633972168, "learning_rate": 1.565774827509181e-05, "loss": 0.3544, "step": 2470 }, { "epoch": 0.17601595612066817, "grad_norm": 3.151874303817749, "learning_rate": 1.565450899027995e-05, "loss": 0.6506, "step": 2471 }, { "epoch": 0.17608718880222246, "grad_norm": 2.689246416091919, "learning_rate": 1.5651268833035585e-05, "loss": 0.1688, "step": 2472 }, { "epoch": 0.17615842148377675, "grad_norm": 3.0066752433776855, "learning_rate": 1.5648027803858635e-05, "loss": 0.7411, "step": 2473 }, { "epoch": 0.17622965416533107, "grad_norm": 1.9885495901107788, "learning_rate": 1.564478590324916e-05, "loss": 0.259, "step": 2474 }, { "epoch": 0.17630088684688536, "grad_norm": 2.791862964630127, "learning_rate": 1.5641543131707345e-05, "loss": 0.7515, "step": 2475 }, { "epoch": 0.17637211952843965, "grad_norm": 3.1264142990112305, "learning_rate": 1.5638299489733525e-05, "loss": 0.1213, "step": 2476 }, { "epoch": 0.17644335220999394, "grad_norm": 1.7875373363494873, "learning_rate": 1.5635054977828156e-05, "loss": 0.2955, "step": 2477 }, { "epoch": 0.17651458489154825, "grad_norm": 3.564626455307007, "learning_rate": 1.5631809596491833e-05, "loss": 0.4393, "step": 2478 }, { "epoch": 0.17658581757310254, "grad_norm": 3.013395071029663, "learning_rate": 1.562856334622529e-05, "loss": 0.4471, "step": 2479 }, { "epoch": 0.17665705025465683, "grad_norm": 3.8542320728302, "learning_rate": 1.5625316227529382e-05, "loss": 0.5653, "step": 2480 }, { "epoch": 0.17672828293621112, "grad_norm": 5.535815238952637, "learning_rate": 1.562206824090511e-05, "loss": 0.3761, "step": 2481 }, { "epoch": 0.17679951561776544, "grad_norm": 2.685360908508301, "learning_rate": 1.5618819386853607e-05, "loss": 0.5234, "step": 2482 }, { "epoch": 0.17687074829931973, "grad_norm": 3.276151418685913, "learning_rate": 1.5615569665876132e-05, "loss": 0.4837, "step": 2483 }, { "epoch": 0.17694198098087402, "grad_norm": 2.2547552585601807, "learning_rate": 1.5612319078474087e-05, "loss": 0.2785, "step": 2484 }, { "epoch": 0.1770132136624283, "grad_norm": 3.620140314102173, "learning_rate": 1.5609067625149007e-05, "loss": 0.6582, "step": 2485 }, { "epoch": 0.17708444634398263, "grad_norm": 6.382880210876465, "learning_rate": 1.560581530640255e-05, "loss": 0.1432, "step": 2486 }, { "epoch": 0.17715567902553692, "grad_norm": 3.40442156791687, "learning_rate": 1.5602562122736526e-05, "loss": 0.3441, "step": 2487 }, { "epoch": 0.1772269117070912, "grad_norm": 3.448045253753662, "learning_rate": 1.5599308074652856e-05, "loss": 0.6718, "step": 2488 }, { "epoch": 0.17729814438864552, "grad_norm": 4.526960849761963, "learning_rate": 1.5596053162653612e-05, "loss": 0.4678, "step": 2489 }, { "epoch": 0.1773693770701998, "grad_norm": 3.3941867351531982, "learning_rate": 1.5592797387240996e-05, "loss": 0.5138, "step": 2490 }, { "epoch": 0.1774406097517541, "grad_norm": 2.6672208309173584, "learning_rate": 1.5589540748917336e-05, "loss": 0.3044, "step": 2491 }, { "epoch": 0.1775118424333084, "grad_norm": 3.255941152572632, "learning_rate": 1.5586283248185102e-05, "loss": 0.7838, "step": 2492 }, { "epoch": 0.1775830751148627, "grad_norm": 4.233285903930664, "learning_rate": 1.5583024885546887e-05, "loss": 0.4578, "step": 2493 }, { "epoch": 0.177654307796417, "grad_norm": 3.591371536254883, "learning_rate": 1.557976566150543e-05, "loss": 0.3675, "step": 2494 }, { "epoch": 0.1777255404779713, "grad_norm": 3.5480504035949707, "learning_rate": 1.5576505576563587e-05, "loss": 0.4764, "step": 2495 }, { "epoch": 0.17779677315952558, "grad_norm": 3.1476776599884033, "learning_rate": 1.5573244631224364e-05, "loss": 0.4565, "step": 2496 }, { "epoch": 0.1778680058410799, "grad_norm": 4.737380027770996, "learning_rate": 1.556998282599089e-05, "loss": 0.6076, "step": 2497 }, { "epoch": 0.17793923852263419, "grad_norm": 2.7602360248565674, "learning_rate": 1.5566720161366423e-05, "loss": 0.4494, "step": 2498 }, { "epoch": 0.17801047120418848, "grad_norm": 2.564480781555176, "learning_rate": 1.556345663785436e-05, "loss": 0.455, "step": 2499 }, { "epoch": 0.1780817038857428, "grad_norm": 3.406715154647827, "learning_rate": 1.556019225595823e-05, "loss": 0.1018, "step": 2500 }, { "epoch": 0.17815293656729708, "grad_norm": 2.2425191402435303, "learning_rate": 1.5556927016181694e-05, "loss": 0.1788, "step": 2501 }, { "epoch": 0.17822416924885137, "grad_norm": 3.4872915744781494, "learning_rate": 1.555366091902855e-05, "loss": 0.6133, "step": 2502 }, { "epoch": 0.17829540193040566, "grad_norm": 3.602289915084839, "learning_rate": 1.5550393965002712e-05, "loss": 0.2744, "step": 2503 }, { "epoch": 0.17836663461195998, "grad_norm": 5.0966644287109375, "learning_rate": 1.5547126154608246e-05, "loss": 0.744, "step": 2504 }, { "epoch": 0.17843786729351427, "grad_norm": 2.920994520187378, "learning_rate": 1.5543857488349335e-05, "loss": 0.5079, "step": 2505 }, { "epoch": 0.17850909997506856, "grad_norm": 3.945953369140625, "learning_rate": 1.5540587966730306e-05, "loss": 0.615, "step": 2506 }, { "epoch": 0.17858033265662285, "grad_norm": 6.010052680969238, "learning_rate": 1.553731759025561e-05, "loss": 0.5228, "step": 2507 }, { "epoch": 0.17865156533817717, "grad_norm": 9.765052795410156, "learning_rate": 1.553404635942984e-05, "loss": 0.5966, "step": 2508 }, { "epoch": 0.17872279801973145, "grad_norm": 2.847676992416382, "learning_rate": 1.5530774274757697e-05, "loss": 0.5355, "step": 2509 }, { "epoch": 0.17879403070128574, "grad_norm": 2.995786190032959, "learning_rate": 1.5527501336744046e-05, "loss": 0.4108, "step": 2510 }, { "epoch": 0.17886526338284003, "grad_norm": 2.0783956050872803, "learning_rate": 1.5524227545893856e-05, "loss": 0.449, "step": 2511 }, { "epoch": 0.17893649606439435, "grad_norm": 2.619899272918701, "learning_rate": 1.5520952902712246e-05, "loss": 0.2019, "step": 2512 }, { "epoch": 0.17900772874594864, "grad_norm": 2.711325168609619, "learning_rate": 1.551767740770446e-05, "loss": 0.3903, "step": 2513 }, { "epoch": 0.17907896142750293, "grad_norm": 4.046687126159668, "learning_rate": 1.5514401061375873e-05, "loss": 0.6211, "step": 2514 }, { "epoch": 0.17915019410905725, "grad_norm": 1.9864157438278198, "learning_rate": 1.5511123864231983e-05, "loss": 0.3695, "step": 2515 }, { "epoch": 0.17922142679061154, "grad_norm": 2.886979341506958, "learning_rate": 1.550784581677844e-05, "loss": 0.2467, "step": 2516 }, { "epoch": 0.17929265947216583, "grad_norm": 1.6478612422943115, "learning_rate": 1.5504566919521e-05, "loss": 0.2744, "step": 2517 }, { "epoch": 0.17936389215372012, "grad_norm": 3.0282516479492188, "learning_rate": 1.550128717296558e-05, "loss": 0.29, "step": 2518 }, { "epoch": 0.17943512483527443, "grad_norm": 2.571427822113037, "learning_rate": 1.5498006577618194e-05, "loss": 0.3662, "step": 2519 }, { "epoch": 0.17950635751682872, "grad_norm": 4.803003311157227, "learning_rate": 1.5494725133985014e-05, "loss": 0.2362, "step": 2520 }, { "epoch": 0.179577590198383, "grad_norm": 2.98887038230896, "learning_rate": 1.549144284257233e-05, "loss": 0.3585, "step": 2521 }, { "epoch": 0.1796488228799373, "grad_norm": 3.028607130050659, "learning_rate": 1.548815970388657e-05, "loss": 0.4639, "step": 2522 }, { "epoch": 0.17972005556149162, "grad_norm": 2.7414045333862305, "learning_rate": 1.5484875718434284e-05, "loss": 0.4052, "step": 2523 }, { "epoch": 0.1797912882430459, "grad_norm": 3.286766290664673, "learning_rate": 1.5481590886722154e-05, "loss": 0.7455, "step": 2524 }, { "epoch": 0.1798625209246002, "grad_norm": 2.2267818450927734, "learning_rate": 1.5478305209257004e-05, "loss": 0.3975, "step": 2525 }, { "epoch": 0.17993375360615452, "grad_norm": 2.7903523445129395, "learning_rate": 1.547501868654577e-05, "loss": 0.5996, "step": 2526 }, { "epoch": 0.1800049862877088, "grad_norm": 4.565357685089111, "learning_rate": 1.5471731319095537e-05, "loss": 0.4336, "step": 2527 }, { "epoch": 0.1800762189692631, "grad_norm": 3.793740749359131, "learning_rate": 1.5468443107413512e-05, "loss": 0.6501, "step": 2528 }, { "epoch": 0.1801474516508174, "grad_norm": 2.5186116695404053, "learning_rate": 1.5465154052007027e-05, "loss": 0.6308, "step": 2529 }, { "epoch": 0.1802186843323717, "grad_norm": 2.8003480434417725, "learning_rate": 1.5461864153383555e-05, "loss": 0.5891, "step": 2530 }, { "epoch": 0.180289917013926, "grad_norm": 3.1359188556671143, "learning_rate": 1.5458573412050688e-05, "loss": 0.4877, "step": 2531 }, { "epoch": 0.18036114969548028, "grad_norm": 2.8837742805480957, "learning_rate": 1.5455281828516152e-05, "loss": 0.754, "step": 2532 }, { "epoch": 0.18043238237703457, "grad_norm": 3.6486780643463135, "learning_rate": 1.5451989403287816e-05, "loss": 0.4307, "step": 2533 }, { "epoch": 0.1805036150585889, "grad_norm": 3.986577033996582, "learning_rate": 1.544869613687366e-05, "loss": 0.5181, "step": 2534 }, { "epoch": 0.18057484774014318, "grad_norm": 2.1216061115264893, "learning_rate": 1.5445402029781792e-05, "loss": 0.0963, "step": 2535 }, { "epoch": 0.18064608042169747, "grad_norm": 3.9502956867218018, "learning_rate": 1.5442107082520475e-05, "loss": 0.8166, "step": 2536 }, { "epoch": 0.18071731310325176, "grad_norm": 6.7223310470581055, "learning_rate": 1.5438811295598075e-05, "loss": 0.2196, "step": 2537 }, { "epoch": 0.18078854578480608, "grad_norm": 2.9376156330108643, "learning_rate": 1.5435514669523102e-05, "loss": 0.2445, "step": 2538 }, { "epoch": 0.18085977846636037, "grad_norm": 4.2125325202941895, "learning_rate": 1.543221720480419e-05, "loss": 0.5299, "step": 2539 }, { "epoch": 0.18093101114791466, "grad_norm": 7.89931058883667, "learning_rate": 1.5428918901950105e-05, "loss": 0.4494, "step": 2540 }, { "epoch": 0.18100224382946897, "grad_norm": 2.832980155944824, "learning_rate": 1.542561976146974e-05, "loss": 0.6816, "step": 2541 }, { "epoch": 0.18107347651102326, "grad_norm": 3.2466089725494385, "learning_rate": 1.5422319783872118e-05, "loss": 0.4132, "step": 2542 }, { "epoch": 0.18114470919257755, "grad_norm": 2.3242430686950684, "learning_rate": 1.5419018969666396e-05, "loss": 0.1875, "step": 2543 }, { "epoch": 0.18121594187413184, "grad_norm": 3.0035808086395264, "learning_rate": 1.541571731936185e-05, "loss": 0.2069, "step": 2544 }, { "epoch": 0.18128717455568616, "grad_norm": 3.886323928833008, "learning_rate": 1.5412414833467887e-05, "loss": 0.838, "step": 2545 }, { "epoch": 0.18135840723724045, "grad_norm": 4.752068519592285, "learning_rate": 1.540911151249406e-05, "loss": 0.301, "step": 2546 }, { "epoch": 0.18142963991879474, "grad_norm": 4.43424654006958, "learning_rate": 1.5405807356950028e-05, "loss": 0.721, "step": 2547 }, { "epoch": 0.18150087260034903, "grad_norm": 3.994656801223755, "learning_rate": 1.5402502367345588e-05, "loss": 0.7142, "step": 2548 }, { "epoch": 0.18157210528190335, "grad_norm": 2.3546128273010254, "learning_rate": 1.5399196544190668e-05, "loss": 0.6994, "step": 2549 }, { "epoch": 0.18164333796345764, "grad_norm": 3.682981014251709, "learning_rate": 1.5395889887995324e-05, "loss": 0.3311, "step": 2550 }, { "epoch": 0.18171457064501192, "grad_norm": 5.304713726043701, "learning_rate": 1.5392582399269735e-05, "loss": 0.1433, "step": 2551 }, { "epoch": 0.18178580332656624, "grad_norm": 3.18168306350708, "learning_rate": 1.5389274078524217e-05, "loss": 0.4846, "step": 2552 }, { "epoch": 0.18185703600812053, "grad_norm": 8.405804634094238, "learning_rate": 1.5385964926269206e-05, "loss": 1.3897, "step": 2553 }, { "epoch": 0.18192826868967482, "grad_norm": 3.3578474521636963, "learning_rate": 1.5382654943015274e-05, "loss": 0.4816, "step": 2554 }, { "epoch": 0.1819995013712291, "grad_norm": 4.590666770935059, "learning_rate": 1.5379344129273112e-05, "loss": 0.3697, "step": 2555 }, { "epoch": 0.18207073405278343, "grad_norm": 2.85115385055542, "learning_rate": 1.5376032485553543e-05, "loss": 0.4787, "step": 2556 }, { "epoch": 0.18214196673433772, "grad_norm": 3.179366111755371, "learning_rate": 1.5372720012367532e-05, "loss": 0.4855, "step": 2557 }, { "epoch": 0.182213199415892, "grad_norm": 5.712611198425293, "learning_rate": 1.5369406710226147e-05, "loss": 0.3681, "step": 2558 }, { "epoch": 0.1822844320974463, "grad_norm": 4.758913516998291, "learning_rate": 1.5366092579640604e-05, "loss": 0.7424, "step": 2559 }, { "epoch": 0.18235566477900061, "grad_norm": 4.194564342498779, "learning_rate": 1.5362777621122235e-05, "loss": 0.6804, "step": 2560 }, { "epoch": 0.1824268974605549, "grad_norm": 5.380641937255859, "learning_rate": 1.5359461835182507e-05, "loss": 0.3887, "step": 2561 }, { "epoch": 0.1824981301421092, "grad_norm": 3.1900291442871094, "learning_rate": 1.5356145222333006e-05, "loss": 0.5461, "step": 2562 }, { "epoch": 0.18256936282366348, "grad_norm": 2.3338446617126465, "learning_rate": 1.5352827783085453e-05, "loss": 0.4221, "step": 2563 }, { "epoch": 0.1826405955052178, "grad_norm": 3.062649965286255, "learning_rate": 1.53495095179517e-05, "loss": 0.4911, "step": 2564 }, { "epoch": 0.1827118281867721, "grad_norm": 4.261214256286621, "learning_rate": 1.5346190427443716e-05, "loss": 0.5689, "step": 2565 }, { "epoch": 0.18278306086832638, "grad_norm": 4.663865566253662, "learning_rate": 1.5342870512073605e-05, "loss": 0.6233, "step": 2566 }, { "epoch": 0.1828542935498807, "grad_norm": 4.492799282073975, "learning_rate": 1.5339549772353595e-05, "loss": 0.7722, "step": 2567 }, { "epoch": 0.182925526231435, "grad_norm": 2.9577205181121826, "learning_rate": 1.533622820879604e-05, "loss": 0.3913, "step": 2568 }, { "epoch": 0.18299675891298928, "grad_norm": 2.7524032592773438, "learning_rate": 1.533290582191343e-05, "loss": 0.6874, "step": 2569 }, { "epoch": 0.18306799159454357, "grad_norm": 3.5169312953948975, "learning_rate": 1.5329582612218366e-05, "loss": 0.6676, "step": 2570 }, { "epoch": 0.18313922427609788, "grad_norm": 2.131711721420288, "learning_rate": 1.532625858022359e-05, "loss": 0.1566, "step": 2571 }, { "epoch": 0.18321045695765217, "grad_norm": 2.954590082168579, "learning_rate": 1.5322933726441963e-05, "loss": 0.3514, "step": 2572 }, { "epoch": 0.18328168963920646, "grad_norm": 4.045517921447754, "learning_rate": 1.531960805138648e-05, "loss": 0.3317, "step": 2573 }, { "epoch": 0.18335292232076075, "grad_norm": 3.291036367416382, "learning_rate": 1.5316281555570258e-05, "loss": 0.5714, "step": 2574 }, { "epoch": 0.18342415500231507, "grad_norm": 4.643711090087891, "learning_rate": 1.5312954239506536e-05, "loss": 0.5001, "step": 2575 }, { "epoch": 0.18349538768386936, "grad_norm": 2.4603564739227295, "learning_rate": 1.530962610370869e-05, "loss": 0.322, "step": 2576 }, { "epoch": 0.18356662036542365, "grad_norm": 2.8414273262023926, "learning_rate": 1.530629714869021e-05, "loss": 0.4027, "step": 2577 }, { "epoch": 0.18363785304697797, "grad_norm": 2.520615577697754, "learning_rate": 1.5302967374964727e-05, "loss": 0.4528, "step": 2578 }, { "epoch": 0.18370908572853226, "grad_norm": 2.459307909011841, "learning_rate": 1.5299636783045988e-05, "loss": 0.4276, "step": 2579 }, { "epoch": 0.18378031841008655, "grad_norm": 3.7928946018218994, "learning_rate": 1.529630537344787e-05, "loss": 0.4221, "step": 2580 }, { "epoch": 0.18385155109164084, "grad_norm": 2.490184783935547, "learning_rate": 1.5292973146684372e-05, "loss": 0.28, "step": 2581 }, { "epoch": 0.18392278377319515, "grad_norm": 3.846090793609619, "learning_rate": 1.5289640103269626e-05, "loss": 0.4598, "step": 2582 }, { "epoch": 0.18399401645474944, "grad_norm": 7.0528364181518555, "learning_rate": 1.5286306243717884e-05, "loss": 0.4082, "step": 2583 }, { "epoch": 0.18406524913630373, "grad_norm": 2.5398037433624268, "learning_rate": 1.528297156854353e-05, "loss": 0.408, "step": 2584 }, { "epoch": 0.18413648181785802, "grad_norm": 3.0121021270751953, "learning_rate": 1.5279636078261064e-05, "loss": 0.368, "step": 2585 }, { "epoch": 0.18420771449941234, "grad_norm": 4.2436699867248535, "learning_rate": 1.5276299773385122e-05, "loss": 0.4448, "step": 2586 }, { "epoch": 0.18427894718096663, "grad_norm": 2.571016788482666, "learning_rate": 1.527296265443046e-05, "loss": 0.5323, "step": 2587 }, { "epoch": 0.18435017986252092, "grad_norm": 3.1025068759918213, "learning_rate": 1.5269624721911964e-05, "loss": 0.4433, "step": 2588 }, { "epoch": 0.1844214125440752, "grad_norm": 3.1208856105804443, "learning_rate": 1.5266285976344635e-05, "loss": 0.358, "step": 2589 }, { "epoch": 0.18449264522562953, "grad_norm": 3.237855911254883, "learning_rate": 1.5262946418243617e-05, "loss": 0.5562, "step": 2590 }, { "epoch": 0.18456387790718382, "grad_norm": 2.7191035747528076, "learning_rate": 1.5259606048124162e-05, "loss": 0.471, "step": 2591 }, { "epoch": 0.1846351105887381, "grad_norm": 3.5035347938537598, "learning_rate": 1.5256264866501655e-05, "loss": 0.3288, "step": 2592 }, { "epoch": 0.18470634327029242, "grad_norm": 3.829512357711792, "learning_rate": 1.5252922873891611e-05, "loss": 0.692, "step": 2593 }, { "epoch": 0.1847775759518467, "grad_norm": 2.158475399017334, "learning_rate": 1.5249580070809661e-05, "loss": 0.4493, "step": 2594 }, { "epoch": 0.184848808633401, "grad_norm": 2.6653923988342285, "learning_rate": 1.5246236457771568e-05, "loss": 0.598, "step": 2595 }, { "epoch": 0.1849200413149553, "grad_norm": 2.581913948059082, "learning_rate": 1.5242892035293216e-05, "loss": 0.2179, "step": 2596 }, { "epoch": 0.1849912739965096, "grad_norm": 3.7720558643341064, "learning_rate": 1.523954680389061e-05, "loss": 0.3625, "step": 2597 }, { "epoch": 0.1850625066780639, "grad_norm": 2.3597137928009033, "learning_rate": 1.5236200764079894e-05, "loss": 0.3979, "step": 2598 }, { "epoch": 0.1851337393596182, "grad_norm": 1.3528653383255005, "learning_rate": 1.5232853916377321e-05, "loss": 0.0593, "step": 2599 }, { "epoch": 0.18520497204117248, "grad_norm": 2.5275096893310547, "learning_rate": 1.5229506261299276e-05, "loss": 0.3843, "step": 2600 }, { "epoch": 0.1852762047227268, "grad_norm": 4.647790908813477, "learning_rate": 1.5226157799362267e-05, "loss": 0.6878, "step": 2601 }, { "epoch": 0.18534743740428108, "grad_norm": 3.4278879165649414, "learning_rate": 1.5222808531082929e-05, "loss": 0.628, "step": 2602 }, { "epoch": 0.18541867008583537, "grad_norm": 4.476102352142334, "learning_rate": 1.521945845697802e-05, "loss": 0.3564, "step": 2603 }, { "epoch": 0.1854899027673897, "grad_norm": 2.4279167652130127, "learning_rate": 1.521610757756442e-05, "loss": 0.3519, "step": 2604 }, { "epoch": 0.18556113544894398, "grad_norm": 2.746847152709961, "learning_rate": 1.521275589335914e-05, "loss": 0.5168, "step": 2605 }, { "epoch": 0.18563236813049827, "grad_norm": 3.9490277767181396, "learning_rate": 1.5209403404879305e-05, "loss": 0.4593, "step": 2606 }, { "epoch": 0.18570360081205256, "grad_norm": 2.37511944770813, "learning_rate": 1.520605011264217e-05, "loss": 0.4941, "step": 2607 }, { "epoch": 0.18577483349360688, "grad_norm": 4.488595962524414, "learning_rate": 1.5202696017165114e-05, "loss": 0.4159, "step": 2608 }, { "epoch": 0.18584606617516117, "grad_norm": 4.398386001586914, "learning_rate": 1.5199341118965641e-05, "loss": 0.5481, "step": 2609 }, { "epoch": 0.18591729885671546, "grad_norm": 4.604757785797119, "learning_rate": 1.5195985418561377e-05, "loss": 0.4702, "step": 2610 }, { "epoch": 0.18598853153826975, "grad_norm": 2.7033791542053223, "learning_rate": 1.519262891647007e-05, "loss": 0.095, "step": 2611 }, { "epoch": 0.18605976421982406, "grad_norm": 3.046166181564331, "learning_rate": 1.5189271613209595e-05, "loss": 0.4937, "step": 2612 }, { "epoch": 0.18613099690137835, "grad_norm": 5.548402786254883, "learning_rate": 1.518591350929795e-05, "loss": 0.6122, "step": 2613 }, { "epoch": 0.18620222958293264, "grad_norm": 3.3958828449249268, "learning_rate": 1.5182554605253254e-05, "loss": 0.0895, "step": 2614 }, { "epoch": 0.18627346226448693, "grad_norm": 2.941892147064209, "learning_rate": 1.5179194901593752e-05, "loss": 0.47, "step": 2615 }, { "epoch": 0.18634469494604125, "grad_norm": 2.65197491645813, "learning_rate": 1.5175834398837814e-05, "loss": 0.0818, "step": 2616 }, { "epoch": 0.18641592762759554, "grad_norm": 3.135103464126587, "learning_rate": 1.5172473097503928e-05, "loss": 0.4693, "step": 2617 }, { "epoch": 0.18648716030914983, "grad_norm": 5.703603744506836, "learning_rate": 1.516911099811071e-05, "loss": 0.7378, "step": 2618 }, { "epoch": 0.18655839299070415, "grad_norm": 2.4556756019592285, "learning_rate": 1.5165748101176894e-05, "loss": 0.4796, "step": 2619 }, { "epoch": 0.18662962567225844, "grad_norm": 2.0720667839050293, "learning_rate": 1.5162384407221344e-05, "loss": 0.1175, "step": 2620 }, { "epoch": 0.18670085835381273, "grad_norm": 3.4270167350769043, "learning_rate": 1.5159019916763044e-05, "loss": 0.5948, "step": 2621 }, { "epoch": 0.18677209103536702, "grad_norm": 3.8779094219207764, "learning_rate": 1.51556546303211e-05, "loss": 0.761, "step": 2622 }, { "epoch": 0.18684332371692133, "grad_norm": 4.288640975952148, "learning_rate": 1.5152288548414734e-05, "loss": 0.965, "step": 2623 }, { "epoch": 0.18691455639847562, "grad_norm": 2.263329029083252, "learning_rate": 1.5148921671563309e-05, "loss": 0.1509, "step": 2624 }, { "epoch": 0.1869857890800299, "grad_norm": 9.227435111999512, "learning_rate": 1.514555400028629e-05, "loss": 0.5168, "step": 2625 }, { "epoch": 0.1870570217615842, "grad_norm": 3.0913567543029785, "learning_rate": 1.5142185535103276e-05, "loss": 0.7029, "step": 2626 }, { "epoch": 0.18712825444313852, "grad_norm": 2.5722010135650635, "learning_rate": 1.5138816276533994e-05, "loss": 0.6015, "step": 2627 }, { "epoch": 0.1871994871246928, "grad_norm": 3.5307769775390625, "learning_rate": 1.5135446225098279e-05, "loss": 0.6329, "step": 2628 }, { "epoch": 0.1872707198062471, "grad_norm": 4.4696221351623535, "learning_rate": 1.5132075381316091e-05, "loss": 0.4919, "step": 2629 }, { "epoch": 0.18734195248780142, "grad_norm": 1.920982003211975, "learning_rate": 1.5128703745707527e-05, "loss": 0.2986, "step": 2630 }, { "epoch": 0.1874131851693557, "grad_norm": 2.4218122959136963, "learning_rate": 1.5125331318792787e-05, "loss": 0.3285, "step": 2631 }, { "epoch": 0.18748441785091, "grad_norm": 2.174961566925049, "learning_rate": 1.5121958101092205e-05, "loss": 0.2581, "step": 2632 }, { "epoch": 0.18755565053246429, "grad_norm": 3.599374294281006, "learning_rate": 1.5118584093126237e-05, "loss": 0.4914, "step": 2633 }, { "epoch": 0.1876268832140186, "grad_norm": 3.863976240158081, "learning_rate": 1.5115209295415454e-05, "loss": 0.5101, "step": 2634 }, { "epoch": 0.1876981158955729, "grad_norm": 2.5237932205200195, "learning_rate": 1.5111833708480555e-05, "loss": 0.6615, "step": 2635 }, { "epoch": 0.18776934857712718, "grad_norm": 4.414559841156006, "learning_rate": 1.5108457332842352e-05, "loss": 0.2334, "step": 2636 }, { "epoch": 0.18784058125868147, "grad_norm": 3.5367918014526367, "learning_rate": 1.5105080169021792e-05, "loss": 0.8129, "step": 2637 }, { "epoch": 0.1879118139402358, "grad_norm": 2.805579900741577, "learning_rate": 1.5101702217539933e-05, "loss": 0.5903, "step": 2638 }, { "epoch": 0.18798304662179008, "grad_norm": 4.173245429992676, "learning_rate": 1.509832347891796e-05, "loss": 0.4857, "step": 2639 }, { "epoch": 0.18805427930334437, "grad_norm": 3.9835619926452637, "learning_rate": 1.5094943953677175e-05, "loss": 0.6373, "step": 2640 }, { "epoch": 0.18812551198489866, "grad_norm": 2.44608736038208, "learning_rate": 1.509156364233901e-05, "loss": 0.3242, "step": 2641 }, { "epoch": 0.18819674466645298, "grad_norm": 3.3736071586608887, "learning_rate": 1.5088182545425003e-05, "loss": 0.768, "step": 2642 }, { "epoch": 0.18826797734800726, "grad_norm": 3.202894449234009, "learning_rate": 1.5084800663456828e-05, "loss": 0.624, "step": 2643 }, { "epoch": 0.18833921002956155, "grad_norm": 4.04539155960083, "learning_rate": 1.5081417996956277e-05, "loss": 0.3993, "step": 2644 }, { "epoch": 0.18841044271111587, "grad_norm": 3.1914405822753906, "learning_rate": 1.5078034546445257e-05, "loss": 0.1824, "step": 2645 }, { "epoch": 0.18848167539267016, "grad_norm": 3.05484676361084, "learning_rate": 1.5074650312445797e-05, "loss": 0.6777, "step": 2646 }, { "epoch": 0.18855290807422445, "grad_norm": 3.852975368499756, "learning_rate": 1.5071265295480058e-05, "loss": 0.2057, "step": 2647 }, { "epoch": 0.18862414075577874, "grad_norm": 3.058046340942383, "learning_rate": 1.5067879496070305e-05, "loss": 0.9836, "step": 2648 }, { "epoch": 0.18869537343733306, "grad_norm": 2.6176373958587646, "learning_rate": 1.5064492914738934e-05, "loss": 0.477, "step": 2649 }, { "epoch": 0.18876660611888735, "grad_norm": 3.244668960571289, "learning_rate": 1.5061105552008462e-05, "loss": 0.5237, "step": 2650 }, { "epoch": 0.18883783880044164, "grad_norm": 1.792302131652832, "learning_rate": 1.5057717408401523e-05, "loss": 0.1269, "step": 2651 }, { "epoch": 0.18890907148199593, "grad_norm": 3.914513111114502, "learning_rate": 1.5054328484440868e-05, "loss": 0.3976, "step": 2652 }, { "epoch": 0.18898030416355024, "grad_norm": 2.5126051902770996, "learning_rate": 1.5050938780649382e-05, "loss": 0.4963, "step": 2653 }, { "epoch": 0.18905153684510453, "grad_norm": 2.513664484024048, "learning_rate": 1.5047548297550054e-05, "loss": 0.7325, "step": 2654 }, { "epoch": 0.18912276952665882, "grad_norm": 2.9330687522888184, "learning_rate": 1.5044157035666003e-05, "loss": 0.5726, "step": 2655 }, { "epoch": 0.18919400220821314, "grad_norm": 1.741971492767334, "learning_rate": 1.5040764995520469e-05, "loss": 0.1363, "step": 2656 }, { "epoch": 0.18926523488976743, "grad_norm": 3.6424756050109863, "learning_rate": 1.5037372177636805e-05, "loss": 0.7346, "step": 2657 }, { "epoch": 0.18933646757132172, "grad_norm": 3.091566324234009, "learning_rate": 1.5033978582538487e-05, "loss": 0.5506, "step": 2658 }, { "epoch": 0.189407700252876, "grad_norm": 2.8181705474853516, "learning_rate": 1.5030584210749117e-05, "loss": 0.5636, "step": 2659 }, { "epoch": 0.18947893293443033, "grad_norm": 3.5611062049865723, "learning_rate": 1.5027189062792405e-05, "loss": 0.332, "step": 2660 }, { "epoch": 0.18955016561598462, "grad_norm": 2.775186538696289, "learning_rate": 1.5023793139192192e-05, "loss": 0.6851, "step": 2661 }, { "epoch": 0.1896213982975389, "grad_norm": 3.5565497875213623, "learning_rate": 1.5020396440472433e-05, "loss": 0.154, "step": 2662 }, { "epoch": 0.1896926309790932, "grad_norm": 5.230798244476318, "learning_rate": 1.5016998967157201e-05, "loss": 0.4758, "step": 2663 }, { "epoch": 0.1897638636606475, "grad_norm": 1.9074218273162842, "learning_rate": 1.5013600719770699e-05, "loss": 0.2363, "step": 2664 }, { "epoch": 0.1898350963422018, "grad_norm": 3.90585994720459, "learning_rate": 1.5010201698837232e-05, "loss": 0.4164, "step": 2665 }, { "epoch": 0.1899063290237561, "grad_norm": 3.2291688919067383, "learning_rate": 1.5006801904881236e-05, "loss": 0.6128, "step": 2666 }, { "epoch": 0.18997756170531038, "grad_norm": 2.4561731815338135, "learning_rate": 1.5003401338427271e-05, "loss": 0.3586, "step": 2667 }, { "epoch": 0.1900487943868647, "grad_norm": 4.5114240646362305, "learning_rate": 1.5000000000000002e-05, "loss": 0.4413, "step": 2668 }, { "epoch": 0.190120027068419, "grad_norm": 2.380441665649414, "learning_rate": 1.4996597890124222e-05, "loss": 0.5464, "step": 2669 }, { "epoch": 0.19019125974997328, "grad_norm": 3.025801420211792, "learning_rate": 1.4993195009324844e-05, "loss": 0.4232, "step": 2670 }, { "epoch": 0.1902624924315276, "grad_norm": 3.480611801147461, "learning_rate": 1.4989791358126898e-05, "loss": 0.8618, "step": 2671 }, { "epoch": 0.1903337251130819, "grad_norm": 2.782806158065796, "learning_rate": 1.4986386937055529e-05, "loss": 0.3289, "step": 2672 }, { "epoch": 0.19040495779463618, "grad_norm": 2.129403829574585, "learning_rate": 1.4982981746636002e-05, "loss": 0.2751, "step": 2673 }, { "epoch": 0.19047619047619047, "grad_norm": 3.350802421569824, "learning_rate": 1.4979575787393713e-05, "loss": 0.7432, "step": 2674 }, { "epoch": 0.19054742315774478, "grad_norm": 3.5194544792175293, "learning_rate": 1.4976169059854151e-05, "loss": 0.6147, "step": 2675 }, { "epoch": 0.19061865583929907, "grad_norm": 2.4347426891326904, "learning_rate": 1.4972761564542953e-05, "loss": 0.166, "step": 2676 }, { "epoch": 0.19068988852085336, "grad_norm": 4.222224235534668, "learning_rate": 1.4969353301985856e-05, "loss": 0.5731, "step": 2677 }, { "epoch": 0.19076112120240765, "grad_norm": 2.3050410747528076, "learning_rate": 1.4965944272708717e-05, "loss": 0.152, "step": 2678 }, { "epoch": 0.19083235388396197, "grad_norm": 3.750385284423828, "learning_rate": 1.4962534477237516e-05, "loss": 0.4817, "step": 2679 }, { "epoch": 0.19090358656551626, "grad_norm": 4.339419841766357, "learning_rate": 1.495912391609835e-05, "loss": 0.5731, "step": 2680 }, { "epoch": 0.19097481924707055, "grad_norm": 3.5442118644714355, "learning_rate": 1.4955712589817433e-05, "loss": 0.6942, "step": 2681 }, { "epoch": 0.19104605192862487, "grad_norm": 3.1688711643218994, "learning_rate": 1.4952300498921097e-05, "loss": 0.2694, "step": 2682 }, { "epoch": 0.19111728461017916, "grad_norm": 3.551852226257324, "learning_rate": 1.4948887643935793e-05, "loss": 0.1594, "step": 2683 }, { "epoch": 0.19118851729173345, "grad_norm": 2.7523725032806396, "learning_rate": 1.494547402538809e-05, "loss": 0.4002, "step": 2684 }, { "epoch": 0.19125974997328773, "grad_norm": 6.5679030418396, "learning_rate": 1.4942059643804671e-05, "loss": 0.7443, "step": 2685 }, { "epoch": 0.19133098265484205, "grad_norm": 2.674117088317871, "learning_rate": 1.4938644499712342e-05, "loss": 0.3791, "step": 2686 }, { "epoch": 0.19140221533639634, "grad_norm": 3.3846256732940674, "learning_rate": 1.4935228593638029e-05, "loss": 0.2518, "step": 2687 }, { "epoch": 0.19147344801795063, "grad_norm": 3.3218021392822266, "learning_rate": 1.4931811926108765e-05, "loss": 0.6879, "step": 2688 }, { "epoch": 0.19154468069950492, "grad_norm": 2.8775055408477783, "learning_rate": 1.4928394497651709e-05, "loss": 0.5628, "step": 2689 }, { "epoch": 0.19161591338105924, "grad_norm": 4.309749126434326, "learning_rate": 1.4924976308794134e-05, "loss": 0.3645, "step": 2690 }, { "epoch": 0.19168714606261353, "grad_norm": 3.728616952896118, "learning_rate": 1.4921557360063432e-05, "loss": 0.431, "step": 2691 }, { "epoch": 0.19175837874416782, "grad_norm": 3.4805526733398438, "learning_rate": 1.4918137651987111e-05, "loss": 0.7387, "step": 2692 }, { "epoch": 0.1918296114257221, "grad_norm": 2.923534631729126, "learning_rate": 1.4914717185092797e-05, "loss": 0.3866, "step": 2693 }, { "epoch": 0.19190084410727642, "grad_norm": 3.9958457946777344, "learning_rate": 1.4911295959908235e-05, "loss": 0.4437, "step": 2694 }, { "epoch": 0.19197207678883071, "grad_norm": 2.9788851737976074, "learning_rate": 1.4907873976961282e-05, "loss": 0.543, "step": 2695 }, { "epoch": 0.192043309470385, "grad_norm": 4.846106052398682, "learning_rate": 1.4904451236779917e-05, "loss": 0.3028, "step": 2696 }, { "epoch": 0.19211454215193932, "grad_norm": 3.1704509258270264, "learning_rate": 1.4901027739892228e-05, "loss": 0.6754, "step": 2697 }, { "epoch": 0.1921857748334936, "grad_norm": 3.083327293395996, "learning_rate": 1.4897603486826433e-05, "loss": 0.5066, "step": 2698 }, { "epoch": 0.1922570075150479, "grad_norm": 3.2370247840881348, "learning_rate": 1.4894178478110856e-05, "loss": 0.5767, "step": 2699 }, { "epoch": 0.1923282401966022, "grad_norm": 3.319366455078125, "learning_rate": 1.4890752714273936e-05, "loss": 0.7215, "step": 2700 }, { "epoch": 0.1923994728781565, "grad_norm": 2.3806045055389404, "learning_rate": 1.4887326195844243e-05, "loss": 0.2154, "step": 2701 }, { "epoch": 0.1924707055597108, "grad_norm": 2.6410093307495117, "learning_rate": 1.4883898923350446e-05, "loss": 0.5728, "step": 2702 }, { "epoch": 0.1925419382412651, "grad_norm": 2.293452501296997, "learning_rate": 1.488047089732134e-05, "loss": 0.3341, "step": 2703 }, { "epoch": 0.19261317092281938, "grad_norm": 1.739888310432434, "learning_rate": 1.4877042118285832e-05, "loss": 0.2305, "step": 2704 }, { "epoch": 0.1926844036043737, "grad_norm": 2.606576919555664, "learning_rate": 1.487361258677295e-05, "loss": 0.4486, "step": 2705 }, { "epoch": 0.19275563628592798, "grad_norm": 7.079329013824463, "learning_rate": 1.487018230331183e-05, "loss": 0.3397, "step": 2706 }, { "epoch": 0.19282686896748227, "grad_norm": 5.897678852081299, "learning_rate": 1.4866751268431738e-05, "loss": 0.5012, "step": 2707 }, { "epoch": 0.1928981016490366, "grad_norm": 3.51143217086792, "learning_rate": 1.4863319482662044e-05, "loss": 0.419, "step": 2708 }, { "epoch": 0.19296933433059088, "grad_norm": 2.7349634170532227, "learning_rate": 1.4859886946532235e-05, "loss": 0.4757, "step": 2709 }, { "epoch": 0.19304056701214517, "grad_norm": 2.187361240386963, "learning_rate": 1.485645366057192e-05, "loss": 0.2434, "step": 2710 }, { "epoch": 0.19311179969369946, "grad_norm": 5.061954021453857, "learning_rate": 1.4853019625310813e-05, "loss": 0.3869, "step": 2711 }, { "epoch": 0.19318303237525378, "grad_norm": 3.054757833480835, "learning_rate": 1.4849584841278755e-05, "loss": 0.4241, "step": 2712 }, { "epoch": 0.19325426505680807, "grad_norm": 3.9929590225219727, "learning_rate": 1.4846149309005697e-05, "loss": 0.6245, "step": 2713 }, { "epoch": 0.19332549773836236, "grad_norm": 2.94531512260437, "learning_rate": 1.4842713029021707e-05, "loss": 0.3709, "step": 2714 }, { "epoch": 0.19339673041991665, "grad_norm": 3.0837810039520264, "learning_rate": 1.4839276001856965e-05, "loss": 0.5361, "step": 2715 }, { "epoch": 0.19346796310147096, "grad_norm": 2.483865261077881, "learning_rate": 1.4835838228041773e-05, "loss": 0.5535, "step": 2716 }, { "epoch": 0.19353919578302525, "grad_norm": 2.2102794647216797, "learning_rate": 1.4832399708106541e-05, "loss": 0.34, "step": 2717 }, { "epoch": 0.19361042846457954, "grad_norm": 4.309078693389893, "learning_rate": 1.4828960442581802e-05, "loss": 0.579, "step": 2718 }, { "epoch": 0.19368166114613386, "grad_norm": 4.840634822845459, "learning_rate": 1.4825520431998191e-05, "loss": 0.5659, "step": 2719 }, { "epoch": 0.19375289382768815, "grad_norm": 3.2534584999084473, "learning_rate": 1.4822079676886469e-05, "loss": 0.9059, "step": 2720 }, { "epoch": 0.19382412650924244, "grad_norm": 1.8628875017166138, "learning_rate": 1.4818638177777514e-05, "loss": 0.1979, "step": 2721 }, { "epoch": 0.19389535919079673, "grad_norm": 3.609545946121216, "learning_rate": 1.481519593520231e-05, "loss": 0.8065, "step": 2722 }, { "epoch": 0.19396659187235105, "grad_norm": 2.8986871242523193, "learning_rate": 1.4811752949691958e-05, "loss": 0.3702, "step": 2723 }, { "epoch": 0.19403782455390534, "grad_norm": 5.049222469329834, "learning_rate": 1.4808309221777681e-05, "loss": 0.2379, "step": 2724 }, { "epoch": 0.19410905723545963, "grad_norm": 2.8463284969329834, "learning_rate": 1.4804864751990807e-05, "loss": 0.2147, "step": 2725 }, { "epoch": 0.19418028991701392, "grad_norm": 9.900996208190918, "learning_rate": 1.4801419540862779e-05, "loss": 0.2177, "step": 2726 }, { "epoch": 0.19425152259856823, "grad_norm": 3.4885482788085938, "learning_rate": 1.4797973588925163e-05, "loss": 0.9162, "step": 2727 }, { "epoch": 0.19432275528012252, "grad_norm": 2.8240151405334473, "learning_rate": 1.479452689670963e-05, "loss": 0.5082, "step": 2728 }, { "epoch": 0.1943939879616768, "grad_norm": 3.7421364784240723, "learning_rate": 1.4791079464747973e-05, "loss": 0.756, "step": 2729 }, { "epoch": 0.1944652206432311, "grad_norm": 3.0781309604644775, "learning_rate": 1.4787631293572094e-05, "loss": 0.6064, "step": 2730 }, { "epoch": 0.19453645332478542, "grad_norm": 2.738283395767212, "learning_rate": 1.4784182383714005e-05, "loss": 0.4294, "step": 2731 }, { "epoch": 0.1946076860063397, "grad_norm": 5.530115604400635, "learning_rate": 1.4780732735705847e-05, "loss": 0.5436, "step": 2732 }, { "epoch": 0.194678918687894, "grad_norm": 4.285490989685059, "learning_rate": 1.4777282350079858e-05, "loss": 0.3376, "step": 2733 }, { "epoch": 0.19475015136944832, "grad_norm": 3.2633495330810547, "learning_rate": 1.4773831227368399e-05, "loss": 0.3651, "step": 2734 }, { "epoch": 0.1948213840510026, "grad_norm": 5.742598533630371, "learning_rate": 1.477037936810394e-05, "loss": 0.5087, "step": 2735 }, { "epoch": 0.1948926167325569, "grad_norm": 3.1666200160980225, "learning_rate": 1.4766926772819072e-05, "loss": 0.2938, "step": 2736 }, { "epoch": 0.19496384941411118, "grad_norm": 2.922640323638916, "learning_rate": 1.476347344204649e-05, "loss": 0.5728, "step": 2737 }, { "epoch": 0.1950350820956655, "grad_norm": 2.5151517391204834, "learning_rate": 1.4760019376319015e-05, "loss": 0.2345, "step": 2738 }, { "epoch": 0.1951063147772198, "grad_norm": 2.319295883178711, "learning_rate": 1.4756564576169568e-05, "loss": 0.2418, "step": 2739 }, { "epoch": 0.19517754745877408, "grad_norm": 3.2617578506469727, "learning_rate": 1.4753109042131189e-05, "loss": 0.2468, "step": 2740 }, { "epoch": 0.19524878014032837, "grad_norm": 3.3664026260375977, "learning_rate": 1.4749652774737031e-05, "loss": 0.5242, "step": 2741 }, { "epoch": 0.1953200128218827, "grad_norm": 2.637477397918701, "learning_rate": 1.4746195774520365e-05, "loss": 0.5819, "step": 2742 }, { "epoch": 0.19539124550343698, "grad_norm": 3.575623035430908, "learning_rate": 1.4742738042014563e-05, "loss": 0.4707, "step": 2743 }, { "epoch": 0.19546247818499127, "grad_norm": 2.1203150749206543, "learning_rate": 1.4739279577753122e-05, "loss": 0.2371, "step": 2744 }, { "epoch": 0.19553371086654558, "grad_norm": 2.890530586242676, "learning_rate": 1.4735820382269652e-05, "loss": 0.2612, "step": 2745 }, { "epoch": 0.19560494354809987, "grad_norm": 5.764167308807373, "learning_rate": 1.4732360456097862e-05, "loss": 0.4442, "step": 2746 }, { "epoch": 0.19567617622965416, "grad_norm": 2.800431728363037, "learning_rate": 1.4728899799771591e-05, "loss": 0.4622, "step": 2747 }, { "epoch": 0.19574740891120845, "grad_norm": 5.830680847167969, "learning_rate": 1.472543841382478e-05, "loss": 0.5342, "step": 2748 }, { "epoch": 0.19581864159276277, "grad_norm": 2.91980242729187, "learning_rate": 1.472197629879148e-05, "loss": 0.4013, "step": 2749 }, { "epoch": 0.19588987427431706, "grad_norm": 2.8990769386291504, "learning_rate": 1.4718513455205867e-05, "loss": 0.1759, "step": 2750 }, { "epoch": 0.19596110695587135, "grad_norm": 2.3558621406555176, "learning_rate": 1.4715049883602217e-05, "loss": 0.2251, "step": 2751 }, { "epoch": 0.19603233963742564, "grad_norm": 2.874040126800537, "learning_rate": 1.4711585584514927e-05, "loss": 0.4366, "step": 2752 }, { "epoch": 0.19610357231897996, "grad_norm": 3.727233409881592, "learning_rate": 1.4708120558478501e-05, "loss": 0.5654, "step": 2753 }, { "epoch": 0.19617480500053425, "grad_norm": 4.439034938812256, "learning_rate": 1.4704654806027558e-05, "loss": 0.5687, "step": 2754 }, { "epoch": 0.19624603768208854, "grad_norm": 2.070781946182251, "learning_rate": 1.4701188327696825e-05, "loss": 0.2378, "step": 2755 }, { "epoch": 0.19631727036364283, "grad_norm": 2.392538070678711, "learning_rate": 1.4697721124021149e-05, "loss": 0.332, "step": 2756 }, { "epoch": 0.19638850304519714, "grad_norm": 3.2432451248168945, "learning_rate": 1.4694253195535478e-05, "loss": 0.4244, "step": 2757 }, { "epoch": 0.19645973572675143, "grad_norm": 1.5696322917938232, "learning_rate": 1.469078454277488e-05, "loss": 0.2245, "step": 2758 }, { "epoch": 0.19653096840830572, "grad_norm": 6.530178546905518, "learning_rate": 1.4687315166274535e-05, "loss": 0.6166, "step": 2759 }, { "epoch": 0.19660220108986004, "grad_norm": 2.6803038120269775, "learning_rate": 1.4683845066569727e-05, "loss": 0.4312, "step": 2760 }, { "epoch": 0.19667343377141433, "grad_norm": 3.441145420074463, "learning_rate": 1.4680374244195861e-05, "loss": 0.4046, "step": 2761 }, { "epoch": 0.19674466645296862, "grad_norm": 3.0965728759765625, "learning_rate": 1.467690269968845e-05, "loss": 0.8718, "step": 2762 }, { "epoch": 0.1968158991345229, "grad_norm": 2.349693775177002, "learning_rate": 1.4673430433583114e-05, "loss": 0.3567, "step": 2763 }, { "epoch": 0.19688713181607723, "grad_norm": 2.4899632930755615, "learning_rate": 1.4669957446415588e-05, "loss": 0.2077, "step": 2764 }, { "epoch": 0.19695836449763152, "grad_norm": 3.171365737915039, "learning_rate": 1.4666483738721719e-05, "loss": 0.5073, "step": 2765 }, { "epoch": 0.1970295971791858, "grad_norm": 4.199099063873291, "learning_rate": 1.4663009311037464e-05, "loss": 0.701, "step": 2766 }, { "epoch": 0.1971008298607401, "grad_norm": 3.530271530151367, "learning_rate": 1.4659534163898894e-05, "loss": 0.6312, "step": 2767 }, { "epoch": 0.1971720625422944, "grad_norm": 3.6834206581115723, "learning_rate": 1.4656058297842185e-05, "loss": 0.4591, "step": 2768 }, { "epoch": 0.1972432952238487, "grad_norm": 3.7613213062286377, "learning_rate": 1.465258171340363e-05, "loss": 0.6349, "step": 2769 }, { "epoch": 0.197314527905403, "grad_norm": 3.973776340484619, "learning_rate": 1.464910441111963e-05, "loss": 0.3542, "step": 2770 }, { "epoch": 0.1973857605869573, "grad_norm": 12.4153470993042, "learning_rate": 1.4645626391526694e-05, "loss": 0.4015, "step": 2771 }, { "epoch": 0.1974569932685116, "grad_norm": 3.374846935272217, "learning_rate": 1.4642147655161445e-05, "loss": 0.505, "step": 2772 }, { "epoch": 0.1975282259500659, "grad_norm": 2.4913992881774902, "learning_rate": 1.463866820256062e-05, "loss": 0.463, "step": 2773 }, { "epoch": 0.19759945863162018, "grad_norm": 3.2956254482269287, "learning_rate": 1.4635188034261059e-05, "loss": 0.8621, "step": 2774 }, { "epoch": 0.1976706913131745, "grad_norm": 2.3766069412231445, "learning_rate": 1.4631707150799718e-05, "loss": 0.3879, "step": 2775 }, { "epoch": 0.19774192399472879, "grad_norm": 1.5376824140548706, "learning_rate": 1.4628225552713662e-05, "loss": 0.1812, "step": 2776 }, { "epoch": 0.19781315667628308, "grad_norm": 3.981269121170044, "learning_rate": 1.4624743240540064e-05, "loss": 0.7068, "step": 2777 }, { "epoch": 0.19788438935783736, "grad_norm": 5.377039432525635, "learning_rate": 1.4621260214816211e-05, "loss": 0.7182, "step": 2778 }, { "epoch": 0.19795562203939168, "grad_norm": 2.746382713317871, "learning_rate": 1.4617776476079495e-05, "loss": 0.3006, "step": 2779 }, { "epoch": 0.19802685472094597, "grad_norm": 2.7375028133392334, "learning_rate": 1.461429202486742e-05, "loss": 0.6175, "step": 2780 }, { "epoch": 0.19809808740250026, "grad_norm": 3.198803663253784, "learning_rate": 1.4610806861717607e-05, "loss": 0.5849, "step": 2781 }, { "epoch": 0.19816932008405455, "grad_norm": 3.32126784324646, "learning_rate": 1.4607320987167778e-05, "loss": 0.7404, "step": 2782 }, { "epoch": 0.19824055276560887, "grad_norm": 4.1025495529174805, "learning_rate": 1.4603834401755766e-05, "loss": 0.662, "step": 2783 }, { "epoch": 0.19831178544716316, "grad_norm": 2.493117570877075, "learning_rate": 1.4600347106019514e-05, "loss": 0.6765, "step": 2784 }, { "epoch": 0.19838301812871745, "grad_norm": 3.2716894149780273, "learning_rate": 1.4596859100497083e-05, "loss": 0.6521, "step": 2785 }, { "epoch": 0.19845425081027177, "grad_norm": 2.9164483547210693, "learning_rate": 1.4593370385726627e-05, "loss": 0.5467, "step": 2786 }, { "epoch": 0.19852548349182605, "grad_norm": 5.131321430206299, "learning_rate": 1.4589880962246424e-05, "loss": 0.3002, "step": 2787 }, { "epoch": 0.19859671617338034, "grad_norm": 2.367499351501465, "learning_rate": 1.4586390830594856e-05, "loss": 0.568, "step": 2788 }, { "epoch": 0.19866794885493463, "grad_norm": 4.389141082763672, "learning_rate": 1.4582899991310412e-05, "loss": 0.4962, "step": 2789 }, { "epoch": 0.19873918153648895, "grad_norm": 2.3945887088775635, "learning_rate": 1.4579408444931696e-05, "loss": 0.526, "step": 2790 }, { "epoch": 0.19881041421804324, "grad_norm": 4.347743988037109, "learning_rate": 1.4575916191997415e-05, "loss": 0.4706, "step": 2791 }, { "epoch": 0.19888164689959753, "grad_norm": 2.539599895477295, "learning_rate": 1.4572423233046386e-05, "loss": 0.284, "step": 2792 }, { "epoch": 0.19895287958115182, "grad_norm": 3.1948952674865723, "learning_rate": 1.4568929568617542e-05, "loss": 0.1346, "step": 2793 }, { "epoch": 0.19902411226270614, "grad_norm": 2.9564099311828613, "learning_rate": 1.4565435199249915e-05, "loss": 0.4337, "step": 2794 }, { "epoch": 0.19909534494426043, "grad_norm": 4.476072788238525, "learning_rate": 1.4561940125482652e-05, "loss": 0.2807, "step": 2795 }, { "epoch": 0.19916657762581472, "grad_norm": 4.111037731170654, "learning_rate": 1.4558444347855008e-05, "loss": 0.5597, "step": 2796 }, { "epoch": 0.19923781030736903, "grad_norm": 2.541555643081665, "learning_rate": 1.455494786690634e-05, "loss": 0.3962, "step": 2797 }, { "epoch": 0.19930904298892332, "grad_norm": 4.7090840339660645, "learning_rate": 1.4551450683176127e-05, "loss": 0.8078, "step": 2798 }, { "epoch": 0.1993802756704776, "grad_norm": 4.6335248947143555, "learning_rate": 1.4547952797203944e-05, "loss": 0.4763, "step": 2799 }, { "epoch": 0.1994515083520319, "grad_norm": 3.0581953525543213, "learning_rate": 1.454445420952948e-05, "loss": 0.314, "step": 2800 }, { "epoch": 0.19952274103358622, "grad_norm": 2.3661816120147705, "learning_rate": 1.4540954920692528e-05, "loss": 0.2033, "step": 2801 }, { "epoch": 0.1995939737151405, "grad_norm": 2.6277964115142822, "learning_rate": 1.4537454931232994e-05, "loss": 0.5861, "step": 2802 }, { "epoch": 0.1996652063966948, "grad_norm": 3.6803951263427734, "learning_rate": 1.4533954241690891e-05, "loss": 0.7429, "step": 2803 }, { "epoch": 0.1997364390782491, "grad_norm": 1.5415135622024536, "learning_rate": 1.453045285260634e-05, "loss": 0.1405, "step": 2804 }, { "epoch": 0.1998076717598034, "grad_norm": 2.2406492233276367, "learning_rate": 1.452695076451957e-05, "loss": 0.4035, "step": 2805 }, { "epoch": 0.1998789044413577, "grad_norm": 2.527981996536255, "learning_rate": 1.4523447977970913e-05, "loss": 0.3141, "step": 2806 }, { "epoch": 0.199950137122912, "grad_norm": 2.6876847743988037, "learning_rate": 1.451994449350082e-05, "loss": 0.4004, "step": 2807 }, { "epoch": 0.20002136980446628, "grad_norm": 3.1427247524261475, "learning_rate": 1.4516440311649835e-05, "loss": 0.1498, "step": 2808 }, { "epoch": 0.2000926024860206, "grad_norm": 4.687466621398926, "learning_rate": 1.451293543295862e-05, "loss": 0.888, "step": 2809 }, { "epoch": 0.20016383516757488, "grad_norm": 3.7184011936187744, "learning_rate": 1.450942985796794e-05, "loss": 0.5785, "step": 2810 }, { "epoch": 0.20023506784912917, "grad_norm": 2.586395740509033, "learning_rate": 1.4505923587218673e-05, "loss": 0.1764, "step": 2811 }, { "epoch": 0.2003063005306835, "grad_norm": 2.15071177482605, "learning_rate": 1.4502416621251798e-05, "loss": 0.3961, "step": 2812 }, { "epoch": 0.20037753321223778, "grad_norm": 2.629173994064331, "learning_rate": 1.4498908960608407e-05, "loss": 0.4736, "step": 2813 }, { "epoch": 0.20044876589379207, "grad_norm": 2.5709235668182373, "learning_rate": 1.449540060582969e-05, "loss": 0.2436, "step": 2814 }, { "epoch": 0.20051999857534636, "grad_norm": 3.1722304821014404, "learning_rate": 1.4491891557456956e-05, "loss": 0.5195, "step": 2815 }, { "epoch": 0.20059123125690068, "grad_norm": 3.076782464981079, "learning_rate": 1.448838181603161e-05, "loss": 0.7196, "step": 2816 }, { "epoch": 0.20066246393845497, "grad_norm": 2.976076364517212, "learning_rate": 1.4484871382095172e-05, "loss": 0.5136, "step": 2817 }, { "epoch": 0.20073369662000926, "grad_norm": 3.1436052322387695, "learning_rate": 1.4481360256189266e-05, "loss": 0.5278, "step": 2818 }, { "epoch": 0.20080492930156355, "grad_norm": 3.0955941677093506, "learning_rate": 1.4477848438855619e-05, "loss": 0.3912, "step": 2819 }, { "epoch": 0.20087616198311786, "grad_norm": 2.0197954177856445, "learning_rate": 1.447433593063607e-05, "loss": 0.1908, "step": 2820 }, { "epoch": 0.20094739466467215, "grad_norm": 3.73455548286438, "learning_rate": 1.4470822732072567e-05, "loss": 0.4907, "step": 2821 }, { "epoch": 0.20101862734622644, "grad_norm": 5.817769527435303, "learning_rate": 1.4467308843707155e-05, "loss": 0.5047, "step": 2822 }, { "epoch": 0.20108986002778076, "grad_norm": 2.074063777923584, "learning_rate": 1.4463794266081994e-05, "loss": 0.2951, "step": 2823 }, { "epoch": 0.20116109270933505, "grad_norm": 4.010993957519531, "learning_rate": 1.4460278999739346e-05, "loss": 0.5602, "step": 2824 }, { "epoch": 0.20123232539088934, "grad_norm": 2.9956047534942627, "learning_rate": 1.445676304522158e-05, "loss": 0.4513, "step": 2825 }, { "epoch": 0.20130355807244363, "grad_norm": 5.75505256652832, "learning_rate": 1.445324640307117e-05, "loss": 0.4179, "step": 2826 }, { "epoch": 0.20137479075399795, "grad_norm": 3.5126559734344482, "learning_rate": 1.4449729073830703e-05, "loss": 0.6033, "step": 2827 }, { "epoch": 0.20144602343555224, "grad_norm": 5.827909469604492, "learning_rate": 1.444621105804286e-05, "loss": 0.2801, "step": 2828 }, { "epoch": 0.20151725611710652, "grad_norm": 3.311288595199585, "learning_rate": 1.4442692356250443e-05, "loss": 0.4807, "step": 2829 }, { "epoch": 0.20158848879866081, "grad_norm": 2.2838194370269775, "learning_rate": 1.4439172968996343e-05, "loss": 0.1567, "step": 2830 }, { "epoch": 0.20165972148021513, "grad_norm": 2.2193562984466553, "learning_rate": 1.4435652896823565e-05, "loss": 0.1931, "step": 2831 }, { "epoch": 0.20173095416176942, "grad_norm": 3.731250762939453, "learning_rate": 1.4432132140275229e-05, "loss": 0.266, "step": 2832 }, { "epoch": 0.2018021868433237, "grad_norm": 3.889559030532837, "learning_rate": 1.4428610699894542e-05, "loss": 0.5636, "step": 2833 }, { "epoch": 0.201873419524878, "grad_norm": 3.547395944595337, "learning_rate": 1.442508857622483e-05, "loss": 0.4816, "step": 2834 }, { "epoch": 0.20194465220643232, "grad_norm": 3.910055160522461, "learning_rate": 1.4421565769809523e-05, "loss": 0.4109, "step": 2835 }, { "epoch": 0.2020158848879866, "grad_norm": 2.1562464237213135, "learning_rate": 1.4418042281192151e-05, "loss": 0.135, "step": 2836 }, { "epoch": 0.2020871175695409, "grad_norm": 2.961430311203003, "learning_rate": 1.4414518110916352e-05, "loss": 0.2933, "step": 2837 }, { "epoch": 0.20215835025109521, "grad_norm": 1.761440396308899, "learning_rate": 1.4410993259525868e-05, "loss": 0.194, "step": 2838 }, { "epoch": 0.2022295829326495, "grad_norm": 3.3940560817718506, "learning_rate": 1.4407467727564548e-05, "loss": 0.8417, "step": 2839 }, { "epoch": 0.2023008156142038, "grad_norm": 2.307361364364624, "learning_rate": 1.4403941515576344e-05, "loss": 0.5408, "step": 2840 }, { "epoch": 0.20237204829575808, "grad_norm": 2.8579890727996826, "learning_rate": 1.4400414624105319e-05, "loss": 0.3979, "step": 2841 }, { "epoch": 0.2024432809773124, "grad_norm": 4.163303375244141, "learning_rate": 1.4396887053695631e-05, "loss": 0.4255, "step": 2842 }, { "epoch": 0.2025145136588667, "grad_norm": 3.466451644897461, "learning_rate": 1.439335880489155e-05, "loss": 0.4612, "step": 2843 }, { "epoch": 0.20258574634042098, "grad_norm": 4.981315612792969, "learning_rate": 1.4389829878237451e-05, "loss": 0.7164, "step": 2844 }, { "epoch": 0.20265697902197527, "grad_norm": 3.4372358322143555, "learning_rate": 1.438630027427781e-05, "loss": 0.5127, "step": 2845 }, { "epoch": 0.2027282117035296, "grad_norm": 4.266355991363525, "learning_rate": 1.4382769993557202e-05, "loss": 0.7257, "step": 2846 }, { "epoch": 0.20279944438508388, "grad_norm": 1.9193538427352905, "learning_rate": 1.4379239036620319e-05, "loss": 0.3777, "step": 2847 }, { "epoch": 0.20287067706663817, "grad_norm": 3.8028876781463623, "learning_rate": 1.4375707404011949e-05, "loss": 0.4909, "step": 2848 }, { "epoch": 0.20294190974819248, "grad_norm": 1.9436227083206177, "learning_rate": 1.4372175096276988e-05, "loss": 0.2108, "step": 2849 }, { "epoch": 0.20301314242974677, "grad_norm": 3.8489603996276855, "learning_rate": 1.4368642113960436e-05, "loss": 0.5568, "step": 2850 }, { "epoch": 0.20308437511130106, "grad_norm": 2.669351100921631, "learning_rate": 1.4365108457607396e-05, "loss": 0.5145, "step": 2851 }, { "epoch": 0.20315560779285535, "grad_norm": 2.300764560699463, "learning_rate": 1.4361574127763069e-05, "loss": 0.3416, "step": 2852 }, { "epoch": 0.20322684047440967, "grad_norm": 4.926682949066162, "learning_rate": 1.4358039124972771e-05, "loss": 0.6671, "step": 2853 }, { "epoch": 0.20329807315596396, "grad_norm": 2.7485740184783936, "learning_rate": 1.4354503449781914e-05, "loss": 0.5454, "step": 2854 }, { "epoch": 0.20336930583751825, "grad_norm": 2.4433374404907227, "learning_rate": 1.435096710273602e-05, "loss": 0.6279, "step": 2855 }, { "epoch": 0.20344053851907254, "grad_norm": 2.1449320316314697, "learning_rate": 1.4347430084380705e-05, "loss": 0.2724, "step": 2856 }, { "epoch": 0.20351177120062686, "grad_norm": 5.255427837371826, "learning_rate": 1.4343892395261699e-05, "loss": 0.8079, "step": 2857 }, { "epoch": 0.20358300388218115, "grad_norm": 1.1558208465576172, "learning_rate": 1.434035403592483e-05, "loss": 0.0774, "step": 2858 }, { "epoch": 0.20365423656373544, "grad_norm": 2.8192968368530273, "learning_rate": 1.4336815006916032e-05, "loss": 0.3387, "step": 2859 }, { "epoch": 0.20372546924528973, "grad_norm": 2.310081958770752, "learning_rate": 1.4333275308781338e-05, "loss": 0.2642, "step": 2860 }, { "epoch": 0.20379670192684404, "grad_norm": 2.4729628562927246, "learning_rate": 1.4329734942066889e-05, "loss": 0.2888, "step": 2861 }, { "epoch": 0.20386793460839833, "grad_norm": 3.559253692626953, "learning_rate": 1.4326193907318924e-05, "loss": 0.8018, "step": 2862 }, { "epoch": 0.20393916728995262, "grad_norm": 2.5654313564300537, "learning_rate": 1.432265220508379e-05, "loss": 0.5407, "step": 2863 }, { "epoch": 0.20401039997150694, "grad_norm": 5.596227169036865, "learning_rate": 1.4319109835907936e-05, "loss": 0.7996, "step": 2864 }, { "epoch": 0.20408163265306123, "grad_norm": 2.686269521713257, "learning_rate": 1.4315566800337914e-05, "loss": 0.2225, "step": 2865 }, { "epoch": 0.20415286533461552, "grad_norm": 2.604785203933716, "learning_rate": 1.4312023098920374e-05, "loss": 0.4437, "step": 2866 }, { "epoch": 0.2042240980161698, "grad_norm": 3.767986536026001, "learning_rate": 1.430847873220208e-05, "loss": 0.3066, "step": 2867 }, { "epoch": 0.20429533069772413, "grad_norm": 2.93733811378479, "learning_rate": 1.4304933700729882e-05, "loss": 0.5952, "step": 2868 }, { "epoch": 0.20436656337927842, "grad_norm": 4.671230792999268, "learning_rate": 1.4301388005050746e-05, "loss": 0.4864, "step": 2869 }, { "epoch": 0.2044377960608327, "grad_norm": 3.5968849658966064, "learning_rate": 1.4297841645711738e-05, "loss": 0.3297, "step": 2870 }, { "epoch": 0.204509028742387, "grad_norm": 3.5603911876678467, "learning_rate": 1.4294294623260024e-05, "loss": 0.525, "step": 2871 }, { "epoch": 0.2045802614239413, "grad_norm": 3.902127265930176, "learning_rate": 1.429074693824287e-05, "loss": 0.4284, "step": 2872 }, { "epoch": 0.2046514941054956, "grad_norm": 3.006416082382202, "learning_rate": 1.428719859120765e-05, "loss": 0.3868, "step": 2873 }, { "epoch": 0.2047227267870499, "grad_norm": 2.4999449253082275, "learning_rate": 1.428364958270184e-05, "loss": 0.3668, "step": 2874 }, { "epoch": 0.2047939594686042, "grad_norm": 3.316601276397705, "learning_rate": 1.428009991327301e-05, "loss": 0.0996, "step": 2875 }, { "epoch": 0.2048651921501585, "grad_norm": 2.353342294692993, "learning_rate": 1.4276549583468842e-05, "loss": 0.4095, "step": 2876 }, { "epoch": 0.2049364248317128, "grad_norm": 3.30751895904541, "learning_rate": 1.4272998593837108e-05, "loss": 0.5046, "step": 2877 }, { "epoch": 0.20500765751326708, "grad_norm": 3.2800986766815186, "learning_rate": 1.42694469449257e-05, "loss": 0.3699, "step": 2878 }, { "epoch": 0.2050788901948214, "grad_norm": 2.7391271591186523, "learning_rate": 1.4265894637282594e-05, "loss": 0.6908, "step": 2879 }, { "epoch": 0.20515012287637568, "grad_norm": 3.4226839542388916, "learning_rate": 1.4262341671455873e-05, "loss": 0.661, "step": 2880 }, { "epoch": 0.20522135555792997, "grad_norm": 2.198756217956543, "learning_rate": 1.4258788047993726e-05, "loss": 0.5564, "step": 2881 }, { "epoch": 0.20529258823948426, "grad_norm": 3.0985498428344727, "learning_rate": 1.4255233767444443e-05, "loss": 0.4844, "step": 2882 }, { "epoch": 0.20536382092103858, "grad_norm": 4.383757591247559, "learning_rate": 1.4251678830356408e-05, "loss": 0.2189, "step": 2883 }, { "epoch": 0.20543505360259287, "grad_norm": 2.3619327545166016, "learning_rate": 1.4248123237278116e-05, "loss": 0.3734, "step": 2884 }, { "epoch": 0.20550628628414716, "grad_norm": 3.1367411613464355, "learning_rate": 1.4244566988758152e-05, "loss": 0.4369, "step": 2885 }, { "epoch": 0.20557751896570145, "grad_norm": 2.969975471496582, "learning_rate": 1.4241010085345216e-05, "loss": 0.7057, "step": 2886 }, { "epoch": 0.20564875164725577, "grad_norm": 3.348392963409424, "learning_rate": 1.4237452527588094e-05, "loss": 0.6673, "step": 2887 }, { "epoch": 0.20571998432881006, "grad_norm": 2.2267396450042725, "learning_rate": 1.4233894316035683e-05, "loss": 0.3809, "step": 2888 }, { "epoch": 0.20579121701036435, "grad_norm": 2.516047239303589, "learning_rate": 1.4230335451236988e-05, "loss": 0.326, "step": 2889 }, { "epoch": 0.20586244969191866, "grad_norm": 4.311049461364746, "learning_rate": 1.422677593374109e-05, "loss": 0.5663, "step": 2890 }, { "epoch": 0.20593368237347295, "grad_norm": 3.189842939376831, "learning_rate": 1.4223215764097194e-05, "loss": 0.4829, "step": 2891 }, { "epoch": 0.20600491505502724, "grad_norm": 3.528754711151123, "learning_rate": 1.4219654942854598e-05, "loss": 0.1456, "step": 2892 }, { "epoch": 0.20607614773658153, "grad_norm": 2.949629783630371, "learning_rate": 1.4216093470562698e-05, "loss": 0.2374, "step": 2893 }, { "epoch": 0.20614738041813585, "grad_norm": 4.95701265335083, "learning_rate": 1.4212531347770987e-05, "loss": 0.5811, "step": 2894 }, { "epoch": 0.20621861309969014, "grad_norm": 2.7754762172698975, "learning_rate": 1.4208968575029077e-05, "loss": 0.2193, "step": 2895 }, { "epoch": 0.20628984578124443, "grad_norm": 2.412060260772705, "learning_rate": 1.4205405152886658e-05, "loss": 0.3909, "step": 2896 }, { "epoch": 0.20636107846279872, "grad_norm": 3.8613641262054443, "learning_rate": 1.4201841081893531e-05, "loss": 0.1285, "step": 2897 }, { "epoch": 0.20643231114435304, "grad_norm": 4.324449062347412, "learning_rate": 1.4198276362599597e-05, "loss": 0.3088, "step": 2898 }, { "epoch": 0.20650354382590733, "grad_norm": 3.0298495292663574, "learning_rate": 1.4194710995554852e-05, "loss": 0.7315, "step": 2899 }, { "epoch": 0.20657477650746162, "grad_norm": 2.875023365020752, "learning_rate": 1.4191144981309397e-05, "loss": 0.2262, "step": 2900 }, { "epoch": 0.20664600918901593, "grad_norm": 3.4419519901275635, "learning_rate": 1.4187578320413434e-05, "loss": 0.5112, "step": 2901 }, { "epoch": 0.20671724187057022, "grad_norm": 4.092258930206299, "learning_rate": 1.4184011013417258e-05, "loss": 0.6264, "step": 2902 }, { "epoch": 0.2067884745521245, "grad_norm": 3.132176399230957, "learning_rate": 1.4180443060871269e-05, "loss": 0.237, "step": 2903 }, { "epoch": 0.2068597072336788, "grad_norm": 2.873509645462036, "learning_rate": 1.4176874463325967e-05, "loss": 0.2463, "step": 2904 }, { "epoch": 0.20693093991523312, "grad_norm": 4.368157863616943, "learning_rate": 1.4173305221331953e-05, "loss": 0.4299, "step": 2905 }, { "epoch": 0.2070021725967874, "grad_norm": 2.6002917289733887, "learning_rate": 1.4169735335439914e-05, "loss": 0.7337, "step": 2906 }, { "epoch": 0.2070734052783417, "grad_norm": 3.575671672821045, "learning_rate": 1.4166164806200655e-05, "loss": 0.5694, "step": 2907 }, { "epoch": 0.207144637959896, "grad_norm": 2.8650765419006348, "learning_rate": 1.416259363416507e-05, "loss": 0.3335, "step": 2908 }, { "epoch": 0.2072158706414503, "grad_norm": 3.0150632858276367, "learning_rate": 1.415902181988415e-05, "loss": 0.6576, "step": 2909 }, { "epoch": 0.2072871033230046, "grad_norm": 2.8182218074798584, "learning_rate": 1.4155449363908997e-05, "loss": 0.5865, "step": 2910 }, { "epoch": 0.20735833600455889, "grad_norm": 2.695887327194214, "learning_rate": 1.4151876266790801e-05, "loss": 0.3933, "step": 2911 }, { "epoch": 0.20742956868611317, "grad_norm": 3.182853937149048, "learning_rate": 1.414830252908085e-05, "loss": 0.685, "step": 2912 }, { "epoch": 0.2075008013676675, "grad_norm": 4.38975191116333, "learning_rate": 1.414472815133054e-05, "loss": 0.472, "step": 2913 }, { "epoch": 0.20757203404922178, "grad_norm": 2.762667179107666, "learning_rate": 1.4141153134091357e-05, "loss": 0.562, "step": 2914 }, { "epoch": 0.20764326673077607, "grad_norm": 5.646838188171387, "learning_rate": 1.4137577477914892e-05, "loss": 0.7568, "step": 2915 }, { "epoch": 0.2077144994123304, "grad_norm": 3.047260284423828, "learning_rate": 1.4134001183352833e-05, "loss": 0.5514, "step": 2916 }, { "epoch": 0.20778573209388468, "grad_norm": 2.5291748046875, "learning_rate": 1.4130424250956958e-05, "loss": 0.3956, "step": 2917 }, { "epoch": 0.20785696477543897, "grad_norm": 4.726107120513916, "learning_rate": 1.4126846681279161e-05, "loss": 0.4764, "step": 2918 }, { "epoch": 0.20792819745699326, "grad_norm": 2.5397825241088867, "learning_rate": 1.4123268474871417e-05, "loss": 0.5885, "step": 2919 }, { "epoch": 0.20799943013854758, "grad_norm": 3.0951335430145264, "learning_rate": 1.4119689632285812e-05, "loss": 0.2681, "step": 2920 }, { "epoch": 0.20807066282010186, "grad_norm": 2.1807827949523926, "learning_rate": 1.4116110154074518e-05, "loss": 0.3015, "step": 2921 }, { "epoch": 0.20814189550165615, "grad_norm": 2.8456904888153076, "learning_rate": 1.4112530040789816e-05, "loss": 0.5389, "step": 2922 }, { "epoch": 0.20821312818321044, "grad_norm": 4.780940055847168, "learning_rate": 1.4108949292984077e-05, "loss": 0.6494, "step": 2923 }, { "epoch": 0.20828436086476476, "grad_norm": 3.019838333129883, "learning_rate": 1.410536791120978e-05, "loss": 0.3304, "step": 2924 }, { "epoch": 0.20835559354631905, "grad_norm": 4.376544952392578, "learning_rate": 1.410178589601949e-05, "loss": 0.4814, "step": 2925 }, { "epoch": 0.20842682622787334, "grad_norm": 5.459171295166016, "learning_rate": 1.4098203247965876e-05, "loss": 0.2226, "step": 2926 }, { "epoch": 0.20849805890942766, "grad_norm": 3.7817089557647705, "learning_rate": 1.4094619967601707e-05, "loss": 0.7171, "step": 2927 }, { "epoch": 0.20856929159098195, "grad_norm": 4.119266033172607, "learning_rate": 1.409103605547984e-05, "loss": 0.5133, "step": 2928 }, { "epoch": 0.20864052427253624, "grad_norm": 2.7612411975860596, "learning_rate": 1.4087451512153241e-05, "loss": 0.3623, "step": 2929 }, { "epoch": 0.20871175695409053, "grad_norm": 2.919844388961792, "learning_rate": 1.4083866338174964e-05, "loss": 0.3029, "step": 2930 }, { "epoch": 0.20878298963564484, "grad_norm": 3.4894421100616455, "learning_rate": 1.4080280534098168e-05, "loss": 0.5936, "step": 2931 }, { "epoch": 0.20885422231719913, "grad_norm": 2.0852701663970947, "learning_rate": 1.4076694100476104e-05, "loss": 0.2663, "step": 2932 }, { "epoch": 0.20892545499875342, "grad_norm": 3.2228572368621826, "learning_rate": 1.4073107037862124e-05, "loss": 0.458, "step": 2933 }, { "epoch": 0.2089966876803077, "grad_norm": 3.796372413635254, "learning_rate": 1.4069519346809673e-05, "loss": 0.4013, "step": 2934 }, { "epoch": 0.20906792036186203, "grad_norm": 2.6343941688537598, "learning_rate": 1.4065931027872293e-05, "loss": 0.2617, "step": 2935 }, { "epoch": 0.20913915304341632, "grad_norm": 2.1551499366760254, "learning_rate": 1.4062342081603626e-05, "loss": 0.5095, "step": 2936 }, { "epoch": 0.2092103857249706, "grad_norm": 6.848942279815674, "learning_rate": 1.405875250855741e-05, "loss": 0.4338, "step": 2937 }, { "epoch": 0.20928161840652493, "grad_norm": 3.388190269470215, "learning_rate": 1.4055162309287477e-05, "loss": 0.4875, "step": 2938 }, { "epoch": 0.20935285108807922, "grad_norm": 3.0708160400390625, "learning_rate": 1.4051571484347766e-05, "loss": 0.3945, "step": 2939 }, { "epoch": 0.2094240837696335, "grad_norm": 2.8989148139953613, "learning_rate": 1.4047980034292292e-05, "loss": 0.3414, "step": 2940 }, { "epoch": 0.2094953164511878, "grad_norm": 2.4291703701019287, "learning_rate": 1.4044387959675187e-05, "loss": 0.6478, "step": 2941 }, { "epoch": 0.2095665491327421, "grad_norm": 2.7079007625579834, "learning_rate": 1.4040795261050671e-05, "loss": 0.2483, "step": 2942 }, { "epoch": 0.2096377818142964, "grad_norm": 2.057833433151245, "learning_rate": 1.4037201938973057e-05, "loss": 0.251, "step": 2943 }, { "epoch": 0.2097090144958507, "grad_norm": 2.0785927772521973, "learning_rate": 1.4033607993996758e-05, "loss": 0.4024, "step": 2944 }, { "epoch": 0.20978024717740498, "grad_norm": 4.475035667419434, "learning_rate": 1.4030013426676283e-05, "loss": 0.8571, "step": 2945 }, { "epoch": 0.2098514798589593, "grad_norm": 2.89471173286438, "learning_rate": 1.4026418237566239e-05, "loss": 0.5634, "step": 2946 }, { "epoch": 0.2099227125405136, "grad_norm": 3.9313178062438965, "learning_rate": 1.4022822427221325e-05, "loss": 0.8983, "step": 2947 }, { "epoch": 0.20999394522206788, "grad_norm": 3.2409443855285645, "learning_rate": 1.4019225996196335e-05, "loss": 0.5774, "step": 2948 }, { "epoch": 0.21006517790362217, "grad_norm": 4.710606098175049, "learning_rate": 1.4015628945046169e-05, "loss": 0.6991, "step": 2949 }, { "epoch": 0.2101364105851765, "grad_norm": 2.4557249546051025, "learning_rate": 1.4012031274325808e-05, "loss": 0.4579, "step": 2950 }, { "epoch": 0.21020764326673078, "grad_norm": 3.7474663257598877, "learning_rate": 1.4008432984590333e-05, "loss": 0.6694, "step": 2951 }, { "epoch": 0.21027887594828507, "grad_norm": 4.29076623916626, "learning_rate": 1.4004834076394931e-05, "loss": 0.7078, "step": 2952 }, { "epoch": 0.21035010862983938, "grad_norm": 3.4286258220672607, "learning_rate": 1.4001234550294873e-05, "loss": 0.5618, "step": 2953 }, { "epoch": 0.21042134131139367, "grad_norm": 2.7125141620635986, "learning_rate": 1.3997634406845526e-05, "loss": 0.5996, "step": 2954 }, { "epoch": 0.21049257399294796, "grad_norm": 3.680410146713257, "learning_rate": 1.3994033646602359e-05, "loss": 0.528, "step": 2955 }, { "epoch": 0.21056380667450225, "grad_norm": 2.6398961544036865, "learning_rate": 1.3990432270120933e-05, "loss": 0.2019, "step": 2956 }, { "epoch": 0.21063503935605657, "grad_norm": 2.0972087383270264, "learning_rate": 1.3986830277956899e-05, "loss": 0.2404, "step": 2957 }, { "epoch": 0.21070627203761086, "grad_norm": 3.5284502506256104, "learning_rate": 1.3983227670666011e-05, "loss": 0.8189, "step": 2958 }, { "epoch": 0.21077750471916515, "grad_norm": 3.402261972427368, "learning_rate": 1.3979624448804112e-05, "loss": 0.6176, "step": 2959 }, { "epoch": 0.21084873740071944, "grad_norm": 2.836343765258789, "learning_rate": 1.3976020612927141e-05, "loss": 0.5727, "step": 2960 }, { "epoch": 0.21091997008227376, "grad_norm": 2.824113130569458, "learning_rate": 1.3972416163591138e-05, "loss": 0.4047, "step": 2961 }, { "epoch": 0.21099120276382805, "grad_norm": 1.8547451496124268, "learning_rate": 1.3968811101352226e-05, "loss": 0.1833, "step": 2962 }, { "epoch": 0.21106243544538233, "grad_norm": 4.43675422668457, "learning_rate": 1.3965205426766632e-05, "loss": 0.6272, "step": 2963 }, { "epoch": 0.21113366812693665, "grad_norm": 1.8892507553100586, "learning_rate": 1.3961599140390675e-05, "loss": 0.368, "step": 2964 }, { "epoch": 0.21120490080849094, "grad_norm": 4.427581310272217, "learning_rate": 1.3957992242780768e-05, "loss": 0.7773, "step": 2965 }, { "epoch": 0.21127613349004523, "grad_norm": 5.270426273345947, "learning_rate": 1.3954384734493418e-05, "loss": 0.4904, "step": 2966 }, { "epoch": 0.21134736617159952, "grad_norm": 3.9143972396850586, "learning_rate": 1.3950776616085224e-05, "loss": 0.8249, "step": 2967 }, { "epoch": 0.21141859885315384, "grad_norm": 2.536550283432007, "learning_rate": 1.3947167888112882e-05, "loss": 0.3833, "step": 2968 }, { "epoch": 0.21148983153470813, "grad_norm": 2.637701988220215, "learning_rate": 1.3943558551133186e-05, "loss": 0.6141, "step": 2969 }, { "epoch": 0.21156106421626242, "grad_norm": 2.8889098167419434, "learning_rate": 1.3939948605703015e-05, "loss": 0.0571, "step": 2970 }, { "epoch": 0.2116322968978167, "grad_norm": 3.5129740238189697, "learning_rate": 1.393633805237935e-05, "loss": 0.6493, "step": 2971 }, { "epoch": 0.21170352957937102, "grad_norm": 3.7539031505584717, "learning_rate": 1.3932726891719259e-05, "loss": 0.5492, "step": 2972 }, { "epoch": 0.21177476226092531, "grad_norm": 2.3248071670532227, "learning_rate": 1.3929115124279906e-05, "loss": 0.3505, "step": 2973 }, { "epoch": 0.2118459949424796, "grad_norm": 2.794280767440796, "learning_rate": 1.392550275061855e-05, "loss": 0.4299, "step": 2974 }, { "epoch": 0.2119172276240339, "grad_norm": 2.3730711936950684, "learning_rate": 1.3921889771292546e-05, "loss": 0.2806, "step": 2975 }, { "epoch": 0.2119884603055882, "grad_norm": 2.7040975093841553, "learning_rate": 1.391827618685934e-05, "loss": 0.5048, "step": 2976 }, { "epoch": 0.2120596929871425, "grad_norm": 3.109279155731201, "learning_rate": 1.3914661997876467e-05, "loss": 0.3459, "step": 2977 }, { "epoch": 0.2121309256686968, "grad_norm": 2.3572018146514893, "learning_rate": 1.391104720490156e-05, "loss": 0.5263, "step": 2978 }, { "epoch": 0.2122021583502511, "grad_norm": 2.8684561252593994, "learning_rate": 1.3907431808492348e-05, "loss": 0.495, "step": 2979 }, { "epoch": 0.2122733910318054, "grad_norm": 3.702266216278076, "learning_rate": 1.3903815809206646e-05, "loss": 0.8183, "step": 2980 }, { "epoch": 0.2123446237133597, "grad_norm": 2.847968578338623, "learning_rate": 1.3900199207602365e-05, "loss": 0.2974, "step": 2981 }, { "epoch": 0.21241585639491398, "grad_norm": 3.0347888469696045, "learning_rate": 1.3896582004237514e-05, "loss": 0.9352, "step": 2982 }, { "epoch": 0.2124870890764683, "grad_norm": 2.7027230262756348, "learning_rate": 1.3892964199670181e-05, "loss": 0.665, "step": 2983 }, { "epoch": 0.21255832175802258, "grad_norm": 2.9549410343170166, "learning_rate": 1.3889345794458563e-05, "loss": 0.3167, "step": 2984 }, { "epoch": 0.21262955443957687, "grad_norm": 3.699366569519043, "learning_rate": 1.3885726789160943e-05, "loss": 0.7017, "step": 2985 }, { "epoch": 0.21270078712113116, "grad_norm": 4.052136421203613, "learning_rate": 1.3882107184335696e-05, "loss": 0.448, "step": 2986 }, { "epoch": 0.21277201980268548, "grad_norm": 1.900744915008545, "learning_rate": 1.3878486980541289e-05, "loss": 0.1298, "step": 2987 }, { "epoch": 0.21284325248423977, "grad_norm": 5.278896808624268, "learning_rate": 1.3874866178336277e-05, "loss": 0.4921, "step": 2988 }, { "epoch": 0.21291448516579406, "grad_norm": 2.8815674781799316, "learning_rate": 1.387124477827932e-05, "loss": 0.1985, "step": 2989 }, { "epoch": 0.21298571784734838, "grad_norm": 2.768470048904419, "learning_rate": 1.386762278092916e-05, "loss": 0.3197, "step": 2990 }, { "epoch": 0.21305695052890267, "grad_norm": 1.5538630485534668, "learning_rate": 1.3864000186844631e-05, "loss": 0.1142, "step": 2991 }, { "epoch": 0.21312818321045696, "grad_norm": 2.7608470916748047, "learning_rate": 1.3860376996584667e-05, "loss": 0.4571, "step": 2992 }, { "epoch": 0.21319941589201125, "grad_norm": 3.5218348503112793, "learning_rate": 1.3856753210708288e-05, "loss": 0.7102, "step": 2993 }, { "epoch": 0.21327064857356556, "grad_norm": 3.557003974914551, "learning_rate": 1.3853128829774605e-05, "loss": 0.6116, "step": 2994 }, { "epoch": 0.21334188125511985, "grad_norm": 5.363908767700195, "learning_rate": 1.3849503854342823e-05, "loss": 0.5732, "step": 2995 }, { "epoch": 0.21341311393667414, "grad_norm": 2.808804512023926, "learning_rate": 1.3845878284972237e-05, "loss": 0.6675, "step": 2996 }, { "epoch": 0.21348434661822843, "grad_norm": 5.526084899902344, "learning_rate": 1.3842252122222235e-05, "loss": 0.1691, "step": 2997 }, { "epoch": 0.21355557929978275, "grad_norm": 1.8786733150482178, "learning_rate": 1.38386253666523e-05, "loss": 0.3098, "step": 2998 }, { "epoch": 0.21362681198133704, "grad_norm": 4.320495128631592, "learning_rate": 1.3834998018822004e-05, "loss": 0.4704, "step": 2999 }, { "epoch": 0.21369804466289133, "grad_norm": 3.159491539001465, "learning_rate": 1.3831370079291002e-05, "loss": 0.8825, "step": 3000 }, { "epoch": 0.21376927734444562, "grad_norm": 2.6163759231567383, "learning_rate": 1.3827741548619054e-05, "loss": 0.3902, "step": 3001 }, { "epoch": 0.21384051002599994, "grad_norm": 3.478778123855591, "learning_rate": 1.3824112427366003e-05, "loss": 0.3351, "step": 3002 }, { "epoch": 0.21391174270755423, "grad_norm": 4.076736927032471, "learning_rate": 1.3820482716091786e-05, "loss": 0.987, "step": 3003 }, { "epoch": 0.21398297538910852, "grad_norm": 6.964200496673584, "learning_rate": 1.381685241535643e-05, "loss": 0.7715, "step": 3004 }, { "epoch": 0.21405420807066283, "grad_norm": 3.683234453201294, "learning_rate": 1.381322152572005e-05, "loss": 0.4397, "step": 3005 }, { "epoch": 0.21412544075221712, "grad_norm": 3.8743646144866943, "learning_rate": 1.3809590047742858e-05, "loss": 0.6681, "step": 3006 }, { "epoch": 0.2141966734337714, "grad_norm": 3.0336692333221436, "learning_rate": 1.3805957981985154e-05, "loss": 0.1852, "step": 3007 }, { "epoch": 0.2142679061153257, "grad_norm": 2.8447251319885254, "learning_rate": 1.3802325329007324e-05, "loss": 0.057, "step": 3008 }, { "epoch": 0.21433913879688002, "grad_norm": 3.60489821434021, "learning_rate": 1.3798692089369855e-05, "loss": 0.6797, "step": 3009 }, { "epoch": 0.2144103714784343, "grad_norm": 4.952525615692139, "learning_rate": 1.3795058263633316e-05, "loss": 0.9026, "step": 3010 }, { "epoch": 0.2144816041599886, "grad_norm": 2.735532283782959, "learning_rate": 1.3791423852358365e-05, "loss": 0.6944, "step": 3011 }, { "epoch": 0.2145528368415429, "grad_norm": 1.8782761096954346, "learning_rate": 1.3787788856105762e-05, "loss": 0.226, "step": 3012 }, { "epoch": 0.2146240695230972, "grad_norm": 2.8857781887054443, "learning_rate": 1.3784153275436345e-05, "loss": 0.5932, "step": 3013 }, { "epoch": 0.2146953022046515, "grad_norm": 3.492868423461914, "learning_rate": 1.3780517110911042e-05, "loss": 0.5652, "step": 3014 }, { "epoch": 0.21476653488620578, "grad_norm": 5.897347927093506, "learning_rate": 1.3776880363090883e-05, "loss": 0.4262, "step": 3015 }, { "epoch": 0.2148377675677601, "grad_norm": 1.4158928394317627, "learning_rate": 1.377324303253698e-05, "loss": 0.0847, "step": 3016 }, { "epoch": 0.2149090002493144, "grad_norm": 2.1914985179901123, "learning_rate": 1.3769605119810533e-05, "loss": 0.2098, "step": 3017 }, { "epoch": 0.21498023293086868, "grad_norm": 4.1557416915893555, "learning_rate": 1.3765966625472837e-05, "loss": 0.5737, "step": 3018 }, { "epoch": 0.21505146561242297, "grad_norm": 3.649855852127075, "learning_rate": 1.376232755008527e-05, "loss": 0.1958, "step": 3019 }, { "epoch": 0.2151226982939773, "grad_norm": 4.564761638641357, "learning_rate": 1.3758687894209307e-05, "loss": 0.4708, "step": 3020 }, { "epoch": 0.21519393097553158, "grad_norm": 5.430371284484863, "learning_rate": 1.375504765840651e-05, "loss": 0.6465, "step": 3021 }, { "epoch": 0.21526516365708587, "grad_norm": 3.473217010498047, "learning_rate": 1.3751406843238526e-05, "loss": 0.4272, "step": 3022 }, { "epoch": 0.21533639633864016, "grad_norm": 3.646695852279663, "learning_rate": 1.37477654492671e-05, "loss": 0.673, "step": 3023 }, { "epoch": 0.21540762902019447, "grad_norm": 3.2755343914031982, "learning_rate": 1.374412347705406e-05, "loss": 0.2482, "step": 3024 }, { "epoch": 0.21547886170174876, "grad_norm": 3.102573871612549, "learning_rate": 1.3740480927161326e-05, "loss": 0.6463, "step": 3025 }, { "epoch": 0.21555009438330305, "grad_norm": 5.486429214477539, "learning_rate": 1.3736837800150903e-05, "loss": 0.4374, "step": 3026 }, { "epoch": 0.21562132706485734, "grad_norm": 3.1481997966766357, "learning_rate": 1.373319409658489e-05, "loss": 0.7499, "step": 3027 }, { "epoch": 0.21569255974641166, "grad_norm": 2.8331973552703857, "learning_rate": 1.3729549817025472e-05, "loss": 0.5112, "step": 3028 }, { "epoch": 0.21576379242796595, "grad_norm": 1.9950281381607056, "learning_rate": 1.3725904962034923e-05, "loss": 0.5062, "step": 3029 }, { "epoch": 0.21583502510952024, "grad_norm": 3.7141971588134766, "learning_rate": 1.372225953217561e-05, "loss": 0.7219, "step": 3030 }, { "epoch": 0.21590625779107456, "grad_norm": 4.8200860023498535, "learning_rate": 1.3718613528009982e-05, "loss": 0.7574, "step": 3031 }, { "epoch": 0.21597749047262885, "grad_norm": 3.0601437091827393, "learning_rate": 1.371496695010058e-05, "loss": 0.3679, "step": 3032 }, { "epoch": 0.21604872315418314, "grad_norm": 3.346085548400879, "learning_rate": 1.3711319799010037e-05, "loss": 0.3085, "step": 3033 }, { "epoch": 0.21611995583573743, "grad_norm": 2.2385668754577637, "learning_rate": 1.3707672075301064e-05, "loss": 0.3635, "step": 3034 }, { "epoch": 0.21619118851729174, "grad_norm": 2.2620458602905273, "learning_rate": 1.3704023779536475e-05, "loss": 0.3654, "step": 3035 }, { "epoch": 0.21626242119884603, "grad_norm": 2.7462148666381836, "learning_rate": 1.3700374912279159e-05, "loss": 0.5412, "step": 3036 }, { "epoch": 0.21633365388040032, "grad_norm": 5.6550774574279785, "learning_rate": 1.3696725474092098e-05, "loss": 0.2116, "step": 3037 }, { "epoch": 0.2164048865619546, "grad_norm": 3.944655179977417, "learning_rate": 1.369307546553837e-05, "loss": 0.7599, "step": 3038 }, { "epoch": 0.21647611924350893, "grad_norm": 2.2319393157958984, "learning_rate": 1.3689424887181129e-05, "loss": 0.0974, "step": 3039 }, { "epoch": 0.21654735192506322, "grad_norm": 3.7255098819732666, "learning_rate": 1.368577373958362e-05, "loss": 0.5188, "step": 3040 }, { "epoch": 0.2166185846066175, "grad_norm": 2.8769326210021973, "learning_rate": 1.3682122023309179e-05, "loss": 0.4985, "step": 3041 }, { "epoch": 0.21668981728817183, "grad_norm": 4.12813138961792, "learning_rate": 1.3678469738921228e-05, "loss": 0.5473, "step": 3042 }, { "epoch": 0.21676104996972612, "grad_norm": 2.5217244625091553, "learning_rate": 1.3674816886983275e-05, "loss": 0.2536, "step": 3043 }, { "epoch": 0.2168322826512804, "grad_norm": 6.188952922821045, "learning_rate": 1.3671163468058924e-05, "loss": 0.2309, "step": 3044 }, { "epoch": 0.2169035153328347, "grad_norm": 1.937841773033142, "learning_rate": 1.3667509482711851e-05, "loss": 0.1635, "step": 3045 }, { "epoch": 0.216974748014389, "grad_norm": 2.4110190868377686, "learning_rate": 1.3663854931505838e-05, "loss": 0.1411, "step": 3046 }, { "epoch": 0.2170459806959433, "grad_norm": 2.2464160919189453, "learning_rate": 1.366019981500474e-05, "loss": 0.4768, "step": 3047 }, { "epoch": 0.2171172133774976, "grad_norm": 4.740327835083008, "learning_rate": 1.3656544133772499e-05, "loss": 0.6484, "step": 3048 }, { "epoch": 0.21718844605905188, "grad_norm": 3.021939754486084, "learning_rate": 1.3652887888373155e-05, "loss": 0.4013, "step": 3049 }, { "epoch": 0.2172596787406062, "grad_norm": 1.9491933584213257, "learning_rate": 1.3649231079370825e-05, "loss": 0.434, "step": 3050 }, { "epoch": 0.2173309114221605, "grad_norm": 4.1623854637146, "learning_rate": 1.364557370732972e-05, "loss": 0.199, "step": 3051 }, { "epoch": 0.21740214410371478, "grad_norm": 3.4553935527801514, "learning_rate": 1.3641915772814137e-05, "loss": 0.3856, "step": 3052 }, { "epoch": 0.21747337678526907, "grad_norm": 2.184267282485962, "learning_rate": 1.3638257276388454e-05, "loss": 0.3992, "step": 3053 }, { "epoch": 0.21754460946682339, "grad_norm": 3.3647100925445557, "learning_rate": 1.3634598218617138e-05, "loss": 0.678, "step": 3054 }, { "epoch": 0.21761584214837768, "grad_norm": 3.5547118186950684, "learning_rate": 1.3630938600064748e-05, "loss": 0.8089, "step": 3055 }, { "epoch": 0.21768707482993196, "grad_norm": 2.876230001449585, "learning_rate": 1.3627278421295925e-05, "loss": 0.3554, "step": 3056 }, { "epoch": 0.21775830751148628, "grad_norm": 7.009502410888672, "learning_rate": 1.362361768287539e-05, "loss": 0.2808, "step": 3057 }, { "epoch": 0.21782954019304057, "grad_norm": 2.9945616722106934, "learning_rate": 1.3619956385367964e-05, "loss": 0.7361, "step": 3058 }, { "epoch": 0.21790077287459486, "grad_norm": 2.110137462615967, "learning_rate": 1.3616294529338547e-05, "loss": 0.2447, "step": 3059 }, { "epoch": 0.21797200555614915, "grad_norm": 5.234311103820801, "learning_rate": 1.3612632115352126e-05, "loss": 0.6281, "step": 3060 }, { "epoch": 0.21804323823770347, "grad_norm": 5.308831214904785, "learning_rate": 1.3608969143973771e-05, "loss": 0.1909, "step": 3061 }, { "epoch": 0.21811447091925776, "grad_norm": 2.7986550331115723, "learning_rate": 1.3605305615768645e-05, "loss": 0.5146, "step": 3062 }, { "epoch": 0.21818570360081205, "grad_norm": 2.169100522994995, "learning_rate": 1.3601641531301988e-05, "loss": 0.4331, "step": 3063 }, { "epoch": 0.21825693628236634, "grad_norm": 3.693246364593506, "learning_rate": 1.3597976891139132e-05, "loss": 0.4992, "step": 3064 }, { "epoch": 0.21832816896392065, "grad_norm": 1.9032320976257324, "learning_rate": 1.3594311695845494e-05, "loss": 0.1766, "step": 3065 }, { "epoch": 0.21839940164547494, "grad_norm": 3.2918782234191895, "learning_rate": 1.3590645945986577e-05, "loss": 0.4945, "step": 3066 }, { "epoch": 0.21847063432702923, "grad_norm": 2.628973960876465, "learning_rate": 1.3586979642127964e-05, "loss": 0.2436, "step": 3067 }, { "epoch": 0.21854186700858355, "grad_norm": 2.6664226055145264, "learning_rate": 1.3583312784835332e-05, "loss": 0.6309, "step": 3068 }, { "epoch": 0.21861309969013784, "grad_norm": 3.8960487842559814, "learning_rate": 1.3579645374674442e-05, "loss": 1.0037, "step": 3069 }, { "epoch": 0.21868433237169213, "grad_norm": 3.5874221324920654, "learning_rate": 1.3575977412211132e-05, "loss": 0.3031, "step": 3070 }, { "epoch": 0.21875556505324642, "grad_norm": 3.316882610321045, "learning_rate": 1.3572308898011328e-05, "loss": 0.6959, "step": 3071 }, { "epoch": 0.21882679773480074, "grad_norm": 5.960402011871338, "learning_rate": 1.3568639832641055e-05, "loss": 0.4367, "step": 3072 }, { "epoch": 0.21889803041635503, "grad_norm": 3.58998966217041, "learning_rate": 1.3564970216666402e-05, "loss": 0.3803, "step": 3073 }, { "epoch": 0.21896926309790932, "grad_norm": 3.634350299835205, "learning_rate": 1.3561300050653556e-05, "loss": 0.4315, "step": 3074 }, { "epoch": 0.2190404957794636, "grad_norm": 3.2565650939941406, "learning_rate": 1.3557629335168789e-05, "loss": 0.5917, "step": 3075 }, { "epoch": 0.21911172846101792, "grad_norm": 1.8956308364868164, "learning_rate": 1.3553958070778452e-05, "loss": 0.369, "step": 3076 }, { "epoch": 0.2191829611425722, "grad_norm": 3.44262433052063, "learning_rate": 1.3550286258048984e-05, "loss": 0.6602, "step": 3077 }, { "epoch": 0.2192541938241265, "grad_norm": 3.046152353286743, "learning_rate": 1.3546613897546905e-05, "loss": 0.6415, "step": 3078 }, { "epoch": 0.2193254265056808, "grad_norm": 2.8780922889709473, "learning_rate": 1.3542940989838824e-05, "loss": 0.4067, "step": 3079 }, { "epoch": 0.2193966591872351, "grad_norm": 2.8475496768951416, "learning_rate": 1.3539267535491436e-05, "loss": 0.403, "step": 3080 }, { "epoch": 0.2194678918687894, "grad_norm": 6.384998798370361, "learning_rate": 1.3535593535071515e-05, "loss": 0.4707, "step": 3081 }, { "epoch": 0.2195391245503437, "grad_norm": 1.5761198997497559, "learning_rate": 1.3531918989145919e-05, "loss": 0.2898, "step": 3082 }, { "epoch": 0.219610357231898, "grad_norm": 2.3828749656677246, "learning_rate": 1.3528243898281595e-05, "loss": 0.586, "step": 3083 }, { "epoch": 0.2196815899134523, "grad_norm": 3.816232442855835, "learning_rate": 1.3524568263045572e-05, "loss": 0.4785, "step": 3084 }, { "epoch": 0.21975282259500659, "grad_norm": 3.137086868286133, "learning_rate": 1.3520892084004961e-05, "loss": 0.2512, "step": 3085 }, { "epoch": 0.21982405527656088, "grad_norm": 3.640415668487549, "learning_rate": 1.3517215361726963e-05, "loss": 0.6176, "step": 3086 }, { "epoch": 0.2198952879581152, "grad_norm": 2.7917439937591553, "learning_rate": 1.3513538096778853e-05, "loss": 0.5463, "step": 3087 }, { "epoch": 0.21996652063966948, "grad_norm": 2.3442864418029785, "learning_rate": 1.3509860289727994e-05, "loss": 0.2859, "step": 3088 }, { "epoch": 0.22003775332122377, "grad_norm": 4.303785800933838, "learning_rate": 1.350618194114184e-05, "loss": 0.9046, "step": 3089 }, { "epoch": 0.22010898600277806, "grad_norm": 2.7540645599365234, "learning_rate": 1.3502503051587921e-05, "loss": 0.6083, "step": 3090 }, { "epoch": 0.22018021868433238, "grad_norm": 3.0876832008361816, "learning_rate": 1.3498823621633848e-05, "loss": 0.5063, "step": 3091 }, { "epoch": 0.22025145136588667, "grad_norm": 1.6173527240753174, "learning_rate": 1.349514365184732e-05, "loss": 0.2322, "step": 3092 }, { "epoch": 0.22032268404744096, "grad_norm": 2.568364381790161, "learning_rate": 1.3491463142796121e-05, "loss": 0.3468, "step": 3093 }, { "epoch": 0.22039391672899528, "grad_norm": 4.058254241943359, "learning_rate": 1.3487782095048112e-05, "loss": 0.6313, "step": 3094 }, { "epoch": 0.22046514941054957, "grad_norm": 3.8260889053344727, "learning_rate": 1.3484100509171246e-05, "loss": 0.8353, "step": 3095 }, { "epoch": 0.22053638209210386, "grad_norm": 4.092189311981201, "learning_rate": 1.3480418385733549e-05, "loss": 0.5966, "step": 3096 }, { "epoch": 0.22060761477365814, "grad_norm": 1.5195233821868896, "learning_rate": 1.3476735725303134e-05, "loss": 0.1646, "step": 3097 }, { "epoch": 0.22067884745521246, "grad_norm": 1.8107589483261108, "learning_rate": 1.3473052528448203e-05, "loss": 0.1747, "step": 3098 }, { "epoch": 0.22075008013676675, "grad_norm": 2.44699764251709, "learning_rate": 1.3469368795737033e-05, "loss": 0.3637, "step": 3099 }, { "epoch": 0.22082131281832104, "grad_norm": 4.338643550872803, "learning_rate": 1.3465684527737986e-05, "loss": 0.6244, "step": 3100 }, { "epoch": 0.22089254549987533, "grad_norm": 2.5442519187927246, "learning_rate": 1.3461999725019506e-05, "loss": 0.3793, "step": 3101 }, { "epoch": 0.22096377818142965, "grad_norm": 3.2040863037109375, "learning_rate": 1.3458314388150115e-05, "loss": 0.4292, "step": 3102 }, { "epoch": 0.22103501086298394, "grad_norm": 4.508277893066406, "learning_rate": 1.3454628517698431e-05, "loss": 0.753, "step": 3103 }, { "epoch": 0.22110624354453823, "grad_norm": 3.719189167022705, "learning_rate": 1.3450942114233145e-05, "loss": 0.8003, "step": 3104 }, { "epoch": 0.22117747622609252, "grad_norm": 2.3285646438598633, "learning_rate": 1.3447255178323025e-05, "loss": 0.2012, "step": 3105 }, { "epoch": 0.22124870890764683, "grad_norm": 2.5770881175994873, "learning_rate": 1.3443567710536931e-05, "loss": 0.2635, "step": 3106 }, { "epoch": 0.22131994158920112, "grad_norm": 2.894160509109497, "learning_rate": 1.3439879711443807e-05, "loss": 0.286, "step": 3107 }, { "epoch": 0.22139117427075541, "grad_norm": 4.02253532409668, "learning_rate": 1.3436191181612662e-05, "loss": 0.686, "step": 3108 }, { "epoch": 0.22146240695230973, "grad_norm": 4.331364154815674, "learning_rate": 1.3432502121612602e-05, "loss": 0.5572, "step": 3109 }, { "epoch": 0.22153363963386402, "grad_norm": 2.6199488639831543, "learning_rate": 1.3428812532012816e-05, "loss": 0.5311, "step": 3110 }, { "epoch": 0.2216048723154183, "grad_norm": 2.369678258895874, "learning_rate": 1.3425122413382563e-05, "loss": 0.2964, "step": 3111 }, { "epoch": 0.2216761049969726, "grad_norm": 3.281947374343872, "learning_rate": 1.3421431766291198e-05, "loss": 0.166, "step": 3112 }, { "epoch": 0.22174733767852692, "grad_norm": 2.34667706489563, "learning_rate": 1.3417740591308142e-05, "loss": 0.5363, "step": 3113 }, { "epoch": 0.2218185703600812, "grad_norm": 3.6198198795318604, "learning_rate": 1.341404888900291e-05, "loss": 0.4894, "step": 3114 }, { "epoch": 0.2218898030416355, "grad_norm": 3.8416216373443604, "learning_rate": 1.3410356659945095e-05, "loss": 0.5772, "step": 3115 }, { "epoch": 0.2219610357231898, "grad_norm": 3.084214925765991, "learning_rate": 1.3406663904704362e-05, "loss": 0.7667, "step": 3116 }, { "epoch": 0.2220322684047441, "grad_norm": 2.4548404216766357, "learning_rate": 1.3402970623850474e-05, "loss": 0.379, "step": 3117 }, { "epoch": 0.2221035010862984, "grad_norm": 3.1341934204101562, "learning_rate": 1.339927681795326e-05, "loss": 0.2107, "step": 3118 }, { "epoch": 0.22217473376785268, "grad_norm": 2.82271671295166, "learning_rate": 1.3395582487582639e-05, "loss": 0.4212, "step": 3119 }, { "epoch": 0.222245966449407, "grad_norm": 3.549679756164551, "learning_rate": 1.3391887633308609e-05, "loss": 0.5441, "step": 3120 }, { "epoch": 0.2223171991309613, "grad_norm": 4.972689151763916, "learning_rate": 1.3388192255701249e-05, "loss": 0.3244, "step": 3121 }, { "epoch": 0.22238843181251558, "grad_norm": 2.936882257461548, "learning_rate": 1.3384496355330714e-05, "loss": 0.3135, "step": 3122 }, { "epoch": 0.22245966449406987, "grad_norm": 2.2426626682281494, "learning_rate": 1.3380799932767243e-05, "loss": 0.2774, "step": 3123 }, { "epoch": 0.2225308971756242, "grad_norm": 4.293651580810547, "learning_rate": 1.3377102988581162e-05, "loss": 0.4414, "step": 3124 }, { "epoch": 0.22260212985717848, "grad_norm": 1.9233635663986206, "learning_rate": 1.3373405523342862e-05, "loss": 0.2704, "step": 3125 }, { "epoch": 0.22267336253873277, "grad_norm": 2.420651912689209, "learning_rate": 1.336970753762283e-05, "loss": 0.1026, "step": 3126 }, { "epoch": 0.22274459522028706, "grad_norm": 2.9903526306152344, "learning_rate": 1.336600903199163e-05, "loss": 0.5466, "step": 3127 }, { "epoch": 0.22281582790184137, "grad_norm": 2.6274752616882324, "learning_rate": 1.3362310007019897e-05, "loss": 0.4829, "step": 3128 }, { "epoch": 0.22288706058339566, "grad_norm": 4.274594783782959, "learning_rate": 1.3358610463278357e-05, "loss": 0.7007, "step": 3129 }, { "epoch": 0.22295829326494995, "grad_norm": 3.30477237701416, "learning_rate": 1.335491040133781e-05, "loss": 0.4811, "step": 3130 }, { "epoch": 0.22302952594650424, "grad_norm": 3.403719425201416, "learning_rate": 1.335120982176913e-05, "loss": 0.4831, "step": 3131 }, { "epoch": 0.22310075862805856, "grad_norm": 3.352402448654175, "learning_rate": 1.3347508725143292e-05, "loss": 0.4891, "step": 3132 }, { "epoch": 0.22317199130961285, "grad_norm": 3.9723169803619385, "learning_rate": 1.3343807112031329e-05, "loss": 0.2857, "step": 3133 }, { "epoch": 0.22324322399116714, "grad_norm": 2.6055943965911865, "learning_rate": 1.3340104983004363e-05, "loss": 0.261, "step": 3134 }, { "epoch": 0.22331445667272146, "grad_norm": 1.797678828239441, "learning_rate": 1.3336402338633593e-05, "loss": 0.1678, "step": 3135 }, { "epoch": 0.22338568935427575, "grad_norm": 2.635219097137451, "learning_rate": 1.3332699179490302e-05, "loss": 0.6006, "step": 3136 }, { "epoch": 0.22345692203583004, "grad_norm": 3.417555809020996, "learning_rate": 1.3328995506145849e-05, "loss": 0.9316, "step": 3137 }, { "epoch": 0.22352815471738433, "grad_norm": 3.778444528579712, "learning_rate": 1.3325291319171669e-05, "loss": 0.2709, "step": 3138 }, { "epoch": 0.22359938739893864, "grad_norm": 3.2056493759155273, "learning_rate": 1.3321586619139285e-05, "loss": 0.328, "step": 3139 }, { "epoch": 0.22367062008049293, "grad_norm": 3.0002474784851074, "learning_rate": 1.3317881406620287e-05, "loss": 0.6473, "step": 3140 }, { "epoch": 0.22374185276204722, "grad_norm": 2.1553986072540283, "learning_rate": 1.3314175682186358e-05, "loss": 0.4645, "step": 3141 }, { "epoch": 0.2238130854436015, "grad_norm": 2.812248706817627, "learning_rate": 1.3310469446409251e-05, "loss": 0.4859, "step": 3142 }, { "epoch": 0.22388431812515583, "grad_norm": 3.2999706268310547, "learning_rate": 1.33067626998608e-05, "loss": 0.6887, "step": 3143 }, { "epoch": 0.22395555080671012, "grad_norm": 3.349360466003418, "learning_rate": 1.3303055443112918e-05, "loss": 0.7391, "step": 3144 }, { "epoch": 0.2240267834882644, "grad_norm": 9.514159202575684, "learning_rate": 1.3299347676737595e-05, "loss": 0.6841, "step": 3145 }, { "epoch": 0.22409801616981873, "grad_norm": 1.8376832008361816, "learning_rate": 1.32956394013069e-05, "loss": 0.2894, "step": 3146 }, { "epoch": 0.22416924885137302, "grad_norm": 4.2699360847473145, "learning_rate": 1.329193061739299e-05, "loss": 0.6234, "step": 3147 }, { "epoch": 0.2242404815329273, "grad_norm": 2.692051649093628, "learning_rate": 1.328822132556808e-05, "loss": 0.4673, "step": 3148 }, { "epoch": 0.2243117142144816, "grad_norm": 3.5391042232513428, "learning_rate": 1.3284511526404485e-05, "loss": 0.4099, "step": 3149 }, { "epoch": 0.2243829468960359, "grad_norm": 8.395517349243164, "learning_rate": 1.3280801220474585e-05, "loss": 0.714, "step": 3150 }, { "epoch": 0.2244541795775902, "grad_norm": 2.9848687648773193, "learning_rate": 1.3277090408350841e-05, "loss": 0.5783, "step": 3151 }, { "epoch": 0.2245254122591445, "grad_norm": 2.52195143699646, "learning_rate": 1.3273379090605796e-05, "loss": 0.6826, "step": 3152 }, { "epoch": 0.22459664494069878, "grad_norm": 2.1882691383361816, "learning_rate": 1.3269667267812066e-05, "loss": 0.0456, "step": 3153 }, { "epoch": 0.2246678776222531, "grad_norm": 5.166197776794434, "learning_rate": 1.3265954940542344e-05, "loss": 0.6394, "step": 3154 }, { "epoch": 0.2247391103038074, "grad_norm": 3.313779592514038, "learning_rate": 1.3262242109369412e-05, "loss": 0.4971, "step": 3155 }, { "epoch": 0.22481034298536168, "grad_norm": 2.7379274368286133, "learning_rate": 1.3258528774866115e-05, "loss": 0.5985, "step": 3156 }, { "epoch": 0.224881575666916, "grad_norm": 4.050948619842529, "learning_rate": 1.3254814937605385e-05, "loss": 0.349, "step": 3157 }, { "epoch": 0.22495280834847028, "grad_norm": 3.9323642253875732, "learning_rate": 1.325110059816023e-05, "loss": 0.2968, "step": 3158 }, { "epoch": 0.22502404103002457, "grad_norm": 4.032215595245361, "learning_rate": 1.324738575710373e-05, "loss": 0.7339, "step": 3159 }, { "epoch": 0.22509527371157886, "grad_norm": 4.498744010925293, "learning_rate": 1.324367041500905e-05, "loss": 0.4277, "step": 3160 }, { "epoch": 0.22516650639313318, "grad_norm": 1.5881863832473755, "learning_rate": 1.323995457244943e-05, "loss": 0.1277, "step": 3161 }, { "epoch": 0.22523773907468747, "grad_norm": 3.6910109519958496, "learning_rate": 1.3236238229998181e-05, "loss": 0.3207, "step": 3162 }, { "epoch": 0.22530897175624176, "grad_norm": 2.673894166946411, "learning_rate": 1.3232521388228703e-05, "loss": 0.5902, "step": 3163 }, { "epoch": 0.22538020443779605, "grad_norm": 1.844895601272583, "learning_rate": 1.3228804047714462e-05, "loss": 0.2357, "step": 3164 }, { "epoch": 0.22545143711935037, "grad_norm": 2.835265874862671, "learning_rate": 1.3225086209029008e-05, "loss": 0.1934, "step": 3165 }, { "epoch": 0.22552266980090466, "grad_norm": 4.385526657104492, "learning_rate": 1.3221367872745962e-05, "loss": 0.467, "step": 3166 }, { "epoch": 0.22559390248245895, "grad_norm": 2.7246689796447754, "learning_rate": 1.321764903943903e-05, "loss": 0.5103, "step": 3167 }, { "epoch": 0.22566513516401324, "grad_norm": 3.4900851249694824, "learning_rate": 1.3213929709681986e-05, "loss": 0.1875, "step": 3168 }, { "epoch": 0.22573636784556755, "grad_norm": 2.537065029144287, "learning_rate": 1.321020988404868e-05, "loss": 0.4297, "step": 3169 }, { "epoch": 0.22580760052712184, "grad_norm": 3.0506558418273926, "learning_rate": 1.3206489563113054e-05, "loss": 0.6846, "step": 3170 }, { "epoch": 0.22587883320867613, "grad_norm": 3.0648868083953857, "learning_rate": 1.3202768747449104e-05, "loss": 0.3627, "step": 3171 }, { "epoch": 0.22595006589023045, "grad_norm": 4.5410614013671875, "learning_rate": 1.3199047437630921e-05, "loss": 0.3982, "step": 3172 }, { "epoch": 0.22602129857178474, "grad_norm": 3.2024331092834473, "learning_rate": 1.3195325634232662e-05, "loss": 0.5272, "step": 3173 }, { "epoch": 0.22609253125333903, "grad_norm": 2.819218635559082, "learning_rate": 1.3191603337828563e-05, "loss": 0.6254, "step": 3174 }, { "epoch": 0.22616376393489332, "grad_norm": 3.723343849182129, "learning_rate": 1.3187880548992937e-05, "loss": 0.3534, "step": 3175 }, { "epoch": 0.22623499661644764, "grad_norm": 2.16211199760437, "learning_rate": 1.3184157268300168e-05, "loss": 0.4077, "step": 3176 }, { "epoch": 0.22630622929800193, "grad_norm": 3.948180913925171, "learning_rate": 1.3180433496324724e-05, "loss": 0.5451, "step": 3177 }, { "epoch": 0.22637746197955622, "grad_norm": 4.720814228057861, "learning_rate": 1.3176709233641147e-05, "loss": 0.6729, "step": 3178 }, { "epoch": 0.2264486946611105, "grad_norm": 3.2507498264312744, "learning_rate": 1.3172984480824045e-05, "loss": 0.3923, "step": 3179 }, { "epoch": 0.22651992734266482, "grad_norm": 3.688411235809326, "learning_rate": 1.3169259238448115e-05, "loss": 0.4078, "step": 3180 }, { "epoch": 0.2265911600242191, "grad_norm": 5.542478084564209, "learning_rate": 1.3165533507088122e-05, "loss": 0.373, "step": 3181 }, { "epoch": 0.2266623927057734, "grad_norm": 1.6204471588134766, "learning_rate": 1.3161807287318906e-05, "loss": 0.3445, "step": 3182 }, { "epoch": 0.22673362538732772, "grad_norm": 2.593773365020752, "learning_rate": 1.3158080579715389e-05, "loss": 0.5306, "step": 3183 }, { "epoch": 0.226804858068882, "grad_norm": 5.383662223815918, "learning_rate": 1.3154353384852559e-05, "loss": 0.6757, "step": 3184 }, { "epoch": 0.2268760907504363, "grad_norm": 4.951272010803223, "learning_rate": 1.315062570330548e-05, "loss": 0.1136, "step": 3185 }, { "epoch": 0.2269473234319906, "grad_norm": 5.008998394012451, "learning_rate": 1.3146897535649305e-05, "loss": 0.2683, "step": 3186 }, { "epoch": 0.2270185561135449, "grad_norm": 5.1770477294921875, "learning_rate": 1.3143168882459247e-05, "loss": 1.0101, "step": 3187 }, { "epoch": 0.2270897887950992, "grad_norm": 2.690631628036499, "learning_rate": 1.3139439744310599e-05, "loss": 0.5246, "step": 3188 }, { "epoch": 0.22716102147665349, "grad_norm": 0.8243559002876282, "learning_rate": 1.3135710121778729e-05, "loss": 0.0316, "step": 3189 }, { "epoch": 0.22723225415820777, "grad_norm": 4.481201171875, "learning_rate": 1.3131980015439079e-05, "loss": 0.1211, "step": 3190 }, { "epoch": 0.2273034868397621, "grad_norm": 2.7919325828552246, "learning_rate": 1.3128249425867161e-05, "loss": 0.6031, "step": 3191 }, { "epoch": 0.22737471952131638, "grad_norm": 5.444344520568848, "learning_rate": 1.3124518353638575e-05, "loss": 0.3309, "step": 3192 }, { "epoch": 0.22744595220287067, "grad_norm": 2.6765480041503906, "learning_rate": 1.3120786799328982e-05, "loss": 0.5337, "step": 3193 }, { "epoch": 0.22751718488442496, "grad_norm": 2.752760648727417, "learning_rate": 1.3117054763514126e-05, "loss": 0.394, "step": 3194 }, { "epoch": 0.22758841756597928, "grad_norm": 3.930600166320801, "learning_rate": 1.3113322246769817e-05, "loss": 0.5098, "step": 3195 }, { "epoch": 0.22765965024753357, "grad_norm": 2.221040725708008, "learning_rate": 1.3109589249671947e-05, "loss": 0.1508, "step": 3196 }, { "epoch": 0.22773088292908786, "grad_norm": 4.718826770782471, "learning_rate": 1.3105855772796482e-05, "loss": 0.7006, "step": 3197 }, { "epoch": 0.22780211561064218, "grad_norm": 3.213949203491211, "learning_rate": 1.3102121816719453e-05, "loss": 0.4406, "step": 3198 }, { "epoch": 0.22787334829219646, "grad_norm": 1.5256524085998535, "learning_rate": 1.3098387382016971e-05, "loss": 0.2373, "step": 3199 }, { "epoch": 0.22794458097375075, "grad_norm": 2.5661566257476807, "learning_rate": 1.3094652469265225e-05, "loss": 0.4435, "step": 3200 }, { "epoch": 0.22801581365530504, "grad_norm": 5.934866428375244, "learning_rate": 1.309091707904047e-05, "loss": 0.5315, "step": 3201 }, { "epoch": 0.22808704633685936, "grad_norm": 2.3731346130371094, "learning_rate": 1.3087181211919043e-05, "loss": 0.3413, "step": 3202 }, { "epoch": 0.22815827901841365, "grad_norm": 3.613952398300171, "learning_rate": 1.3083444868477344e-05, "loss": 0.6633, "step": 3203 }, { "epoch": 0.22822951169996794, "grad_norm": 4.178106307983398, "learning_rate": 1.3079708049291857e-05, "loss": 0.6155, "step": 3204 }, { "epoch": 0.22830074438152223, "grad_norm": 3.2255847454071045, "learning_rate": 1.3075970754939134e-05, "loss": 0.6598, "step": 3205 }, { "epoch": 0.22837197706307655, "grad_norm": 4.815492153167725, "learning_rate": 1.3072232985995798e-05, "loss": 0.7234, "step": 3206 }, { "epoch": 0.22844320974463084, "grad_norm": 5.594132423400879, "learning_rate": 1.306849474303855e-05, "loss": 0.563, "step": 3207 }, { "epoch": 0.22851444242618513, "grad_norm": 2.9139885902404785, "learning_rate": 1.306475602664416e-05, "loss": 0.2917, "step": 3208 }, { "epoch": 0.22858567510773944, "grad_norm": 3.3308522701263428, "learning_rate": 1.3061016837389482e-05, "loss": 0.5838, "step": 3209 }, { "epoch": 0.22865690778929373, "grad_norm": 2.8994510173797607, "learning_rate": 1.3057277175851426e-05, "loss": 0.4405, "step": 3210 }, { "epoch": 0.22872814047084802, "grad_norm": 4.081943035125732, "learning_rate": 1.3053537042606985e-05, "loss": 0.6819, "step": 3211 }, { "epoch": 0.2287993731524023, "grad_norm": 2.5543839931488037, "learning_rate": 1.3049796438233225e-05, "loss": 0.5329, "step": 3212 }, { "epoch": 0.22887060583395663, "grad_norm": 2.172142505645752, "learning_rate": 1.3046055363307277e-05, "loss": 0.2022, "step": 3213 }, { "epoch": 0.22894183851551092, "grad_norm": 3.079927682876587, "learning_rate": 1.3042313818406359e-05, "loss": 0.2689, "step": 3214 }, { "epoch": 0.2290130711970652, "grad_norm": 1.8382865190505981, "learning_rate": 1.3038571804107747e-05, "loss": 0.2542, "step": 3215 }, { "epoch": 0.2290843038786195, "grad_norm": 2.7130544185638428, "learning_rate": 1.3034829320988796e-05, "loss": 0.466, "step": 3216 }, { "epoch": 0.22915553656017382, "grad_norm": 2.152255058288574, "learning_rate": 1.3031086369626934e-05, "loss": 0.286, "step": 3217 }, { "epoch": 0.2292267692417281, "grad_norm": 1.9394997358322144, "learning_rate": 1.302734295059966e-05, "loss": 0.0659, "step": 3218 }, { "epoch": 0.2292980019232824, "grad_norm": 3.4846251010894775, "learning_rate": 1.3023599064484546e-05, "loss": 0.6891, "step": 3219 }, { "epoch": 0.22936923460483669, "grad_norm": 4.0591254234313965, "learning_rate": 1.3019854711859233e-05, "loss": 0.4479, "step": 3220 }, { "epoch": 0.229440467286391, "grad_norm": 1.584997534751892, "learning_rate": 1.3016109893301434e-05, "loss": 0.0958, "step": 3221 }, { "epoch": 0.2295116999679453, "grad_norm": 3.3534634113311768, "learning_rate": 1.3012364609388939e-05, "loss": 0.5778, "step": 3222 }, { "epoch": 0.22958293264949958, "grad_norm": 3.350318431854248, "learning_rate": 1.3008618860699607e-05, "loss": 0.6025, "step": 3223 }, { "epoch": 0.2296541653310539, "grad_norm": 3.30584979057312, "learning_rate": 1.3004872647811365e-05, "loss": 0.4442, "step": 3224 }, { "epoch": 0.2297253980126082, "grad_norm": 3.611548900604248, "learning_rate": 1.300112597130222e-05, "loss": 0.5124, "step": 3225 }, { "epoch": 0.22979663069416248, "grad_norm": 3.501070261001587, "learning_rate": 1.2997378831750242e-05, "loss": 0.5709, "step": 3226 }, { "epoch": 0.22986786337571677, "grad_norm": 3.4287948608398438, "learning_rate": 1.2993631229733584e-05, "loss": 0.4446, "step": 3227 }, { "epoch": 0.2299390960572711, "grad_norm": 2.3798749446868896, "learning_rate": 1.2989883165830448e-05, "loss": 0.2856, "step": 3228 }, { "epoch": 0.23001032873882538, "grad_norm": 6.276019096374512, "learning_rate": 1.298613464061913e-05, "loss": 0.3374, "step": 3229 }, { "epoch": 0.23008156142037967, "grad_norm": 2.592010021209717, "learning_rate": 1.2982385654677989e-05, "loss": 0.4146, "step": 3230 }, { "epoch": 0.23015279410193396, "grad_norm": 2.6251397132873535, "learning_rate": 1.2978636208585456e-05, "loss": 0.3293, "step": 3231 }, { "epoch": 0.23022402678348827, "grad_norm": 3.400576114654541, "learning_rate": 1.2974886302920029e-05, "loss": 0.5628, "step": 3232 }, { "epoch": 0.23029525946504256, "grad_norm": 2.8045527935028076, "learning_rate": 1.297113593826028e-05, "loss": 0.626, "step": 3233 }, { "epoch": 0.23036649214659685, "grad_norm": 2.493760585784912, "learning_rate": 1.2967385115184854e-05, "loss": 0.3798, "step": 3234 }, { "epoch": 0.23043772482815117, "grad_norm": 3.705566167831421, "learning_rate": 1.2963633834272463e-05, "loss": 0.4784, "step": 3235 }, { "epoch": 0.23050895750970546, "grad_norm": 4.996864318847656, "learning_rate": 1.2959882096101888e-05, "loss": 0.532, "step": 3236 }, { "epoch": 0.23058019019125975, "grad_norm": 4.200279235839844, "learning_rate": 1.2956129901251988e-05, "loss": 0.5811, "step": 3237 }, { "epoch": 0.23065142287281404, "grad_norm": 1.9550002813339233, "learning_rate": 1.2952377250301689e-05, "loss": 0.3888, "step": 3238 }, { "epoch": 0.23072265555436836, "grad_norm": 4.2254486083984375, "learning_rate": 1.294862414382998e-05, "loss": 0.4533, "step": 3239 }, { "epoch": 0.23079388823592265, "grad_norm": 2.8803679943084717, "learning_rate": 1.2944870582415931e-05, "loss": 0.4242, "step": 3240 }, { "epoch": 0.23086512091747693, "grad_norm": 2.883255958557129, "learning_rate": 1.2941116566638681e-05, "loss": 0.5351, "step": 3241 }, { "epoch": 0.23093635359903122, "grad_norm": 3.383302927017212, "learning_rate": 1.293736209707743e-05, "loss": 0.6449, "step": 3242 }, { "epoch": 0.23100758628058554, "grad_norm": 3.1909542083740234, "learning_rate": 1.2933607174311458e-05, "loss": 0.608, "step": 3243 }, { "epoch": 0.23107881896213983, "grad_norm": 4.183938026428223, "learning_rate": 1.2929851798920108e-05, "loss": 0.3726, "step": 3244 }, { "epoch": 0.23115005164369412, "grad_norm": 3.315225124359131, "learning_rate": 1.2926095971482795e-05, "loss": 0.199, "step": 3245 }, { "epoch": 0.2312212843252484, "grad_norm": 4.205860614776611, "learning_rate": 1.2922339692579008e-05, "loss": 0.4862, "step": 3246 }, { "epoch": 0.23129251700680273, "grad_norm": 3.880666494369507, "learning_rate": 1.2918582962788301e-05, "loss": 0.5751, "step": 3247 }, { "epoch": 0.23136374968835702, "grad_norm": 3.8120996952056885, "learning_rate": 1.2914825782690299e-05, "loss": 0.517, "step": 3248 }, { "epoch": 0.2314349823699113, "grad_norm": 2.0660767555236816, "learning_rate": 1.2911068152864697e-05, "loss": 0.3373, "step": 3249 }, { "epoch": 0.23150621505146562, "grad_norm": 2.766209363937378, "learning_rate": 1.2907310073891255e-05, "loss": 0.2084, "step": 3250 }, { "epoch": 0.23157744773301991, "grad_norm": 2.8225326538085938, "learning_rate": 1.2903551546349809e-05, "loss": 0.313, "step": 3251 }, { "epoch": 0.2316486804145742, "grad_norm": 2.8889780044555664, "learning_rate": 1.289979257082026e-05, "loss": 0.5273, "step": 3252 }, { "epoch": 0.2317199130961285, "grad_norm": 3.295088052749634, "learning_rate": 1.2896033147882576e-05, "loss": 0.5191, "step": 3253 }, { "epoch": 0.2317911457776828, "grad_norm": 2.585541009902954, "learning_rate": 1.2892273278116805e-05, "loss": 0.1926, "step": 3254 }, { "epoch": 0.2318623784592371, "grad_norm": 3.449702262878418, "learning_rate": 1.288851296210305e-05, "loss": 0.6619, "step": 3255 }, { "epoch": 0.2319336111407914, "grad_norm": 6.398964881896973, "learning_rate": 1.2884752200421493e-05, "loss": 0.6456, "step": 3256 }, { "epoch": 0.23200484382234568, "grad_norm": 4.241541385650635, "learning_rate": 1.2880990993652379e-05, "loss": 0.8553, "step": 3257 }, { "epoch": 0.2320760765039, "grad_norm": 2.633449077606201, "learning_rate": 1.287722934237602e-05, "loss": 0.4067, "step": 3258 }, { "epoch": 0.2321473091854543, "grad_norm": 3.4917221069335938, "learning_rate": 1.2873467247172804e-05, "loss": 0.7851, "step": 3259 }, { "epoch": 0.23221854186700858, "grad_norm": 2.3043243885040283, "learning_rate": 1.2869704708623184e-05, "loss": 0.2014, "step": 3260 }, { "epoch": 0.2322897745485629, "grad_norm": 3.311764717102051, "learning_rate": 1.286594172730768e-05, "loss": 0.6358, "step": 3261 }, { "epoch": 0.23236100723011718, "grad_norm": 2.867713212966919, "learning_rate": 1.2862178303806878e-05, "loss": 0.43, "step": 3262 }, { "epoch": 0.23243223991167147, "grad_norm": 3.3778038024902344, "learning_rate": 1.285841443870144e-05, "loss": 0.4703, "step": 3263 }, { "epoch": 0.23250347259322576, "grad_norm": 3.011502981185913, "learning_rate": 1.285465013257209e-05, "loss": 0.3894, "step": 3264 }, { "epoch": 0.23257470527478008, "grad_norm": 2.4098150730133057, "learning_rate": 1.2850885385999626e-05, "loss": 0.3646, "step": 3265 }, { "epoch": 0.23264593795633437, "grad_norm": 2.3337957859039307, "learning_rate": 1.28471201995649e-05, "loss": 0.3577, "step": 3266 }, { "epoch": 0.23271717063788866, "grad_norm": 2.8828160762786865, "learning_rate": 1.2843354573848849e-05, "loss": 0.5539, "step": 3267 }, { "epoch": 0.23278840331944295, "grad_norm": 2.266707181930542, "learning_rate": 1.2839588509432466e-05, "loss": 0.3016, "step": 3268 }, { "epoch": 0.23285963600099727, "grad_norm": 2.592521905899048, "learning_rate": 1.283582200689682e-05, "loss": 0.1536, "step": 3269 }, { "epoch": 0.23293086868255156, "grad_norm": 2.7040910720825195, "learning_rate": 1.283205506682304e-05, "loss": 0.5422, "step": 3270 }, { "epoch": 0.23300210136410585, "grad_norm": 4.2206878662109375, "learning_rate": 1.2828287689792331e-05, "loss": 0.6228, "step": 3271 }, { "epoch": 0.23307333404566014, "grad_norm": 3.509248733520508, "learning_rate": 1.2824519876385957e-05, "loss": 0.2423, "step": 3272 }, { "epoch": 0.23314456672721445, "grad_norm": 4.703032970428467, "learning_rate": 1.2820751627185248e-05, "loss": 0.4507, "step": 3273 }, { "epoch": 0.23321579940876874, "grad_norm": 2.2744638919830322, "learning_rate": 1.2816982942771616e-05, "loss": 0.0646, "step": 3274 }, { "epoch": 0.23328703209032303, "grad_norm": 2.981922149658203, "learning_rate": 1.2813213823726524e-05, "loss": 0.266, "step": 3275 }, { "epoch": 0.23335826477187735, "grad_norm": 2.587778091430664, "learning_rate": 1.2809444270631508e-05, "loss": 0.4567, "step": 3276 }, { "epoch": 0.23342949745343164, "grad_norm": 2.439598560333252, "learning_rate": 1.2805674284068175e-05, "loss": 0.281, "step": 3277 }, { "epoch": 0.23350073013498593, "grad_norm": 2.986294984817505, "learning_rate": 1.2801903864618193e-05, "loss": 0.359, "step": 3278 }, { "epoch": 0.23357196281654022, "grad_norm": 3.4137494564056396, "learning_rate": 1.2798133012863297e-05, "loss": 0.2016, "step": 3279 }, { "epoch": 0.23364319549809454, "grad_norm": 9.057896614074707, "learning_rate": 1.2794361729385291e-05, "loss": 0.3677, "step": 3280 }, { "epoch": 0.23371442817964883, "grad_norm": 2.336866617202759, "learning_rate": 1.279059001476605e-05, "loss": 0.5337, "step": 3281 }, { "epoch": 0.23378566086120312, "grad_norm": 7.053333282470703, "learning_rate": 1.2786817869587504e-05, "loss": 0.5332, "step": 3282 }, { "epoch": 0.2338568935427574, "grad_norm": 3.451514482498169, "learning_rate": 1.2783045294431662e-05, "loss": 0.5816, "step": 3283 }, { "epoch": 0.23392812622431172, "grad_norm": 2.9398908615112305, "learning_rate": 1.2779272289880589e-05, "loss": 0.3112, "step": 3284 }, { "epoch": 0.233999358905866, "grad_norm": 2.9238970279693604, "learning_rate": 1.2775498856516422e-05, "loss": 0.3527, "step": 3285 }, { "epoch": 0.2340705915874203, "grad_norm": 2.026508092880249, "learning_rate": 1.2771724994921367e-05, "loss": 0.4231, "step": 3286 }, { "epoch": 0.23414182426897462, "grad_norm": 2.724583387374878, "learning_rate": 1.2767950705677685e-05, "loss": 0.4338, "step": 3287 }, { "epoch": 0.2342130569505289, "grad_norm": 2.7300941944122314, "learning_rate": 1.2764175989367717e-05, "loss": 0.4363, "step": 3288 }, { "epoch": 0.2342842896320832, "grad_norm": 2.717825174331665, "learning_rate": 1.2760400846573858e-05, "loss": 0.3678, "step": 3289 }, { "epoch": 0.2343555223136375, "grad_norm": 3.5612311363220215, "learning_rate": 1.2756625277878571e-05, "loss": 1.0356, "step": 3290 }, { "epoch": 0.2344267549951918, "grad_norm": 2.452353000640869, "learning_rate": 1.2752849283864395e-05, "loss": 0.3302, "step": 3291 }, { "epoch": 0.2344979876767461, "grad_norm": 2.5166015625, "learning_rate": 1.2749072865113926e-05, "loss": 0.2024, "step": 3292 }, { "epoch": 0.23456922035830038, "grad_norm": 4.728134632110596, "learning_rate": 1.274529602220982e-05, "loss": 0.444, "step": 3293 }, { "epoch": 0.23464045303985467, "grad_norm": 4.234918117523193, "learning_rate": 1.2741518755734809e-05, "loss": 0.4938, "step": 3294 }, { "epoch": 0.234711685721409, "grad_norm": 2.7846450805664062, "learning_rate": 1.2737741066271689e-05, "loss": 0.5573, "step": 3295 }, { "epoch": 0.23478291840296328, "grad_norm": 2.1404500007629395, "learning_rate": 1.2733962954403311e-05, "loss": 0.4102, "step": 3296 }, { "epoch": 0.23485415108451757, "grad_norm": 2.9079549312591553, "learning_rate": 1.2730184420712605e-05, "loss": 0.2711, "step": 3297 }, { "epoch": 0.23492538376607186, "grad_norm": 2.5269439220428467, "learning_rate": 1.2726405465782562e-05, "loss": 0.3539, "step": 3298 }, { "epoch": 0.23499661644762618, "grad_norm": 3.9239883422851562, "learning_rate": 1.2722626090196229e-05, "loss": 0.5104, "step": 3299 }, { "epoch": 0.23506784912918047, "grad_norm": 33.15471267700195, "learning_rate": 1.2718846294536729e-05, "loss": 0.5483, "step": 3300 }, { "epoch": 0.23513908181073476, "grad_norm": 8.32680892944336, "learning_rate": 1.2715066079387243e-05, "loss": 0.415, "step": 3301 }, { "epoch": 0.23521031449228907, "grad_norm": 3.861215591430664, "learning_rate": 1.2711285445331023e-05, "loss": 0.3452, "step": 3302 }, { "epoch": 0.23528154717384336, "grad_norm": 3.4071009159088135, "learning_rate": 1.270750439295138e-05, "loss": 0.4742, "step": 3303 }, { "epoch": 0.23535277985539765, "grad_norm": 4.002985000610352, "learning_rate": 1.270372292283169e-05, "loss": 0.5626, "step": 3304 }, { "epoch": 0.23542401253695194, "grad_norm": 3.251643657684326, "learning_rate": 1.2699941035555394e-05, "loss": 0.5259, "step": 3305 }, { "epoch": 0.23549524521850626, "grad_norm": 2.704543352127075, "learning_rate": 1.2696158731706e-05, "loss": 0.3762, "step": 3306 }, { "epoch": 0.23556647790006055, "grad_norm": 4.320241451263428, "learning_rate": 1.269237601186708e-05, "loss": 0.6601, "step": 3307 }, { "epoch": 0.23563771058161484, "grad_norm": 5.9147629737854, "learning_rate": 1.2688592876622268e-05, "loss": 0.5852, "step": 3308 }, { "epoch": 0.23570894326316913, "grad_norm": 4.227715969085693, "learning_rate": 1.2684809326555266e-05, "loss": 0.8049, "step": 3309 }, { "epoch": 0.23578017594472345, "grad_norm": 4.4104204177856445, "learning_rate": 1.2681025362249826e-05, "loss": 0.6554, "step": 3310 }, { "epoch": 0.23585140862627774, "grad_norm": 3.8864474296569824, "learning_rate": 1.2677240984289787e-05, "loss": 0.7617, "step": 3311 }, { "epoch": 0.23592264130783203, "grad_norm": 2.488257884979248, "learning_rate": 1.2673456193259033e-05, "loss": 0.4083, "step": 3312 }, { "epoch": 0.23599387398938634, "grad_norm": 3.1482884883880615, "learning_rate": 1.2669670989741519e-05, "loss": 0.7027, "step": 3313 }, { "epoch": 0.23606510667094063, "grad_norm": 1.7119680643081665, "learning_rate": 1.2665885374321263e-05, "loss": 0.1579, "step": 3314 }, { "epoch": 0.23613633935249492, "grad_norm": 2.1228365898132324, "learning_rate": 1.2662099347582348e-05, "loss": 0.4017, "step": 3315 }, { "epoch": 0.2362075720340492, "grad_norm": 2.784837007522583, "learning_rate": 1.2658312910108919e-05, "loss": 0.7161, "step": 3316 }, { "epoch": 0.23627880471560353, "grad_norm": 4.222367763519287, "learning_rate": 1.2654526062485182e-05, "loss": 0.5553, "step": 3317 }, { "epoch": 0.23635003739715782, "grad_norm": 4.2711405754089355, "learning_rate": 1.265073880529541e-05, "loss": 0.774, "step": 3318 }, { "epoch": 0.2364212700787121, "grad_norm": 3.499077558517456, "learning_rate": 1.2646951139123935e-05, "loss": 0.7083, "step": 3319 }, { "epoch": 0.2364925027602664, "grad_norm": 2.5083000659942627, "learning_rate": 1.2643163064555163e-05, "loss": 0.3443, "step": 3320 }, { "epoch": 0.23656373544182072, "grad_norm": 3.3029255867004395, "learning_rate": 1.2639374582173548e-05, "loss": 0.7353, "step": 3321 }, { "epoch": 0.236634968123375, "grad_norm": 2.231818914413452, "learning_rate": 1.263558569256361e-05, "loss": 0.4059, "step": 3322 }, { "epoch": 0.2367062008049293, "grad_norm": 2.5797617435455322, "learning_rate": 1.2631796396309945e-05, "loss": 0.6079, "step": 3323 }, { "epoch": 0.23677743348648359, "grad_norm": 5.4496541023254395, "learning_rate": 1.2628006693997199e-05, "loss": 0.6804, "step": 3324 }, { "epoch": 0.2368486661680379, "grad_norm": 3.6114888191223145, "learning_rate": 1.2624216586210084e-05, "loss": 0.8573, "step": 3325 }, { "epoch": 0.2369198988495922, "grad_norm": 6.205987453460693, "learning_rate": 1.2620426073533371e-05, "loss": 0.552, "step": 3326 }, { "epoch": 0.23699113153114648, "grad_norm": 3.1477444171905518, "learning_rate": 1.2616635156551902e-05, "loss": 0.381, "step": 3327 }, { "epoch": 0.2370623642127008, "grad_norm": 4.404491424560547, "learning_rate": 1.2612843835850574e-05, "loss": 0.3346, "step": 3328 }, { "epoch": 0.2371335968942551, "grad_norm": 1.5592106580734253, "learning_rate": 1.2609052112014349e-05, "loss": 0.1427, "step": 3329 }, { "epoch": 0.23720482957580938, "grad_norm": 2.691866636276245, "learning_rate": 1.2605259985628248e-05, "loss": 0.3725, "step": 3330 }, { "epoch": 0.23727606225736367, "grad_norm": 7.5460028648376465, "learning_rate": 1.2601467457277368e-05, "loss": 0.7455, "step": 3331 }, { "epoch": 0.23734729493891799, "grad_norm": 2.6163172721862793, "learning_rate": 1.2597674527546846e-05, "loss": 0.3712, "step": 3332 }, { "epoch": 0.23741852762047227, "grad_norm": 4.180835723876953, "learning_rate": 1.259388119702189e-05, "loss": 0.5355, "step": 3333 }, { "epoch": 0.23748976030202656, "grad_norm": 3.9360790252685547, "learning_rate": 1.2590087466287783e-05, "loss": 0.5211, "step": 3334 }, { "epoch": 0.23756099298358085, "grad_norm": 2.6131579875946045, "learning_rate": 1.2586293335929851e-05, "loss": 0.4702, "step": 3335 }, { "epoch": 0.23763222566513517, "grad_norm": 2.19614839553833, "learning_rate": 1.258249880653349e-05, "loss": 0.4537, "step": 3336 }, { "epoch": 0.23770345834668946, "grad_norm": 2.1984195709228516, "learning_rate": 1.2578703878684158e-05, "loss": 0.2857, "step": 3337 }, { "epoch": 0.23777469102824375, "grad_norm": 4.303745746612549, "learning_rate": 1.2574908552967374e-05, "loss": 0.7013, "step": 3338 }, { "epoch": 0.23784592370979807, "grad_norm": 3.2480037212371826, "learning_rate": 1.2571112829968716e-05, "loss": 0.6661, "step": 3339 }, { "epoch": 0.23791715639135236, "grad_norm": 1.639064908027649, "learning_rate": 1.256731671027383e-05, "loss": 0.0865, "step": 3340 }, { "epoch": 0.23798838907290665, "grad_norm": 2.6295809745788574, "learning_rate": 1.2563520194468408e-05, "loss": 0.2689, "step": 3341 }, { "epoch": 0.23805962175446094, "grad_norm": 2.967778205871582, "learning_rate": 1.2559723283138219e-05, "loss": 0.4444, "step": 3342 }, { "epoch": 0.23813085443601525, "grad_norm": 4.767229080200195, "learning_rate": 1.255592597686909e-05, "loss": 0.5769, "step": 3343 }, { "epoch": 0.23820208711756954, "grad_norm": 4.0398430824279785, "learning_rate": 1.2552128276246905e-05, "loss": 0.5513, "step": 3344 }, { "epoch": 0.23827331979912383, "grad_norm": 4.162031173706055, "learning_rate": 1.2548330181857605e-05, "loss": 0.2809, "step": 3345 }, { "epoch": 0.23834455248067812, "grad_norm": 6.54805326461792, "learning_rate": 1.2544531694287203e-05, "loss": 0.5935, "step": 3346 }, { "epoch": 0.23841578516223244, "grad_norm": 3.028130054473877, "learning_rate": 1.2540732814121763e-05, "loss": 0.5341, "step": 3347 }, { "epoch": 0.23848701784378673, "grad_norm": 3.004371404647827, "learning_rate": 1.2536933541947416e-05, "loss": 0.7373, "step": 3348 }, { "epoch": 0.23855825052534102, "grad_norm": 7.93870210647583, "learning_rate": 1.2533133878350348e-05, "loss": 0.6315, "step": 3349 }, { "epoch": 0.2386294832068953, "grad_norm": 2.209244966506958, "learning_rate": 1.2529333823916807e-05, "loss": 0.4771, "step": 3350 }, { "epoch": 0.23870071588844963, "grad_norm": 2.5349698066711426, "learning_rate": 1.2525533379233108e-05, "loss": 0.3322, "step": 3351 }, { "epoch": 0.23877194857000392, "grad_norm": 3.6230716705322266, "learning_rate": 1.2521732544885614e-05, "loss": 0.7977, "step": 3352 }, { "epoch": 0.2388431812515582, "grad_norm": 2.4170360565185547, "learning_rate": 1.2517931321460756e-05, "loss": 0.5028, "step": 3353 }, { "epoch": 0.23891441393311252, "grad_norm": 2.146090269088745, "learning_rate": 1.251412970954503e-05, "loss": 0.5165, "step": 3354 }, { "epoch": 0.2389856466146668, "grad_norm": 2.5318262577056885, "learning_rate": 1.2510327709724976e-05, "loss": 0.2576, "step": 3355 }, { "epoch": 0.2390568792962211, "grad_norm": 2.6806833744049072, "learning_rate": 1.2506525322587207e-05, "loss": 0.2573, "step": 3356 }, { "epoch": 0.2391281119777754, "grad_norm": 2.758495807647705, "learning_rate": 1.2502722548718396e-05, "loss": 0.5434, "step": 3357 }, { "epoch": 0.2391993446593297, "grad_norm": 5.8894267082214355, "learning_rate": 1.2498919388705266e-05, "loss": 0.5328, "step": 3358 }, { "epoch": 0.239270577340884, "grad_norm": 4.7074737548828125, "learning_rate": 1.2495115843134608e-05, "loss": 0.5944, "step": 3359 }, { "epoch": 0.2393418100224383, "grad_norm": 3.347437858581543, "learning_rate": 1.249131191259327e-05, "loss": 0.6659, "step": 3360 }, { "epoch": 0.23941304270399258, "grad_norm": 2.753903388977051, "learning_rate": 1.2487507597668163e-05, "loss": 0.4529, "step": 3361 }, { "epoch": 0.2394842753855469, "grad_norm": 2.063889980316162, "learning_rate": 1.2483702898946249e-05, "loss": 0.5583, "step": 3362 }, { "epoch": 0.23955550806710119, "grad_norm": 2.3503544330596924, "learning_rate": 1.2479897817014553e-05, "loss": 0.3513, "step": 3363 }, { "epoch": 0.23962674074865548, "grad_norm": 2.646869421005249, "learning_rate": 1.2476092352460161e-05, "loss": 0.3288, "step": 3364 }, { "epoch": 0.2396979734302098, "grad_norm": 2.8686046600341797, "learning_rate": 1.2472286505870222e-05, "loss": 0.6524, "step": 3365 }, { "epoch": 0.23976920611176408, "grad_norm": 3.940523386001587, "learning_rate": 1.246848027783193e-05, "loss": 0.4289, "step": 3366 }, { "epoch": 0.23984043879331837, "grad_norm": 4.02206563949585, "learning_rate": 1.2464673668932555e-05, "loss": 0.4074, "step": 3367 }, { "epoch": 0.23991167147487266, "grad_norm": 3.8400986194610596, "learning_rate": 1.2460866679759412e-05, "loss": 0.2385, "step": 3368 }, { "epoch": 0.23998290415642698, "grad_norm": 4.0806803703308105, "learning_rate": 1.2457059310899887e-05, "loss": 0.2938, "step": 3369 }, { "epoch": 0.24005413683798127, "grad_norm": 3.5449867248535156, "learning_rate": 1.2453251562941406e-05, "loss": 0.6055, "step": 3370 }, { "epoch": 0.24012536951953556, "grad_norm": 2.919361114501953, "learning_rate": 1.2449443436471476e-05, "loss": 0.4685, "step": 3371 }, { "epoch": 0.24019660220108985, "grad_norm": 3.5845181941986084, "learning_rate": 1.2445634932077648e-05, "loss": 0.5818, "step": 3372 }, { "epoch": 0.24026783488264417, "grad_norm": 3.3722307682037354, "learning_rate": 1.2441826050347535e-05, "loss": 0.645, "step": 3373 }, { "epoch": 0.24033906756419846, "grad_norm": 5.214705467224121, "learning_rate": 1.243801679186881e-05, "loss": 0.6078, "step": 3374 }, { "epoch": 0.24041030024575274, "grad_norm": 2.421875, "learning_rate": 1.24342071572292e-05, "loss": 0.3327, "step": 3375 }, { "epoch": 0.24048153292730706, "grad_norm": 3.1314074993133545, "learning_rate": 1.243039714701649e-05, "loss": 0.6539, "step": 3376 }, { "epoch": 0.24055276560886135, "grad_norm": 2.9714548587799072, "learning_rate": 1.2426586761818533e-05, "loss": 0.6407, "step": 3377 }, { "epoch": 0.24062399829041564, "grad_norm": 3.9230997562408447, "learning_rate": 1.2422776002223226e-05, "loss": 0.5481, "step": 3378 }, { "epoch": 0.24069523097196993, "grad_norm": 2.345991373062134, "learning_rate": 1.2418964868818529e-05, "loss": 0.4975, "step": 3379 }, { "epoch": 0.24076646365352425, "grad_norm": 2.520709991455078, "learning_rate": 1.2415153362192466e-05, "loss": 0.3993, "step": 3380 }, { "epoch": 0.24083769633507854, "grad_norm": 2.8910231590270996, "learning_rate": 1.241134148293311e-05, "loss": 0.4273, "step": 3381 }, { "epoch": 0.24090892901663283, "grad_norm": 2.3572516441345215, "learning_rate": 1.2407529231628595e-05, "loss": 0.5159, "step": 3382 }, { "epoch": 0.24098016169818712, "grad_norm": 3.2695374488830566, "learning_rate": 1.2403716608867111e-05, "loss": 0.6545, "step": 3383 }, { "epoch": 0.24105139437974143, "grad_norm": 3.022430658340454, "learning_rate": 1.239990361523691e-05, "loss": 0.6152, "step": 3384 }, { "epoch": 0.24112262706129572, "grad_norm": 2.7370541095733643, "learning_rate": 1.2396090251326296e-05, "loss": 0.2997, "step": 3385 }, { "epoch": 0.24119385974285001, "grad_norm": 3.4290292263031006, "learning_rate": 1.239227651772363e-05, "loss": 0.4443, "step": 3386 }, { "epoch": 0.2412650924244043, "grad_norm": 3.7362382411956787, "learning_rate": 1.2388462415017331e-05, "loss": 0.6672, "step": 3387 }, { "epoch": 0.24133632510595862, "grad_norm": 4.0514631271362305, "learning_rate": 1.238464794379588e-05, "loss": 0.7168, "step": 3388 }, { "epoch": 0.2414075577875129, "grad_norm": 2.640996217727661, "learning_rate": 1.2380833104647807e-05, "loss": 0.6466, "step": 3389 }, { "epoch": 0.2414787904690672, "grad_norm": 3.688347816467285, "learning_rate": 1.2377017898161703e-05, "loss": 0.5967, "step": 3390 }, { "epoch": 0.24155002315062152, "grad_norm": 2.3397138118743896, "learning_rate": 1.2373202324926222e-05, "loss": 0.5373, "step": 3391 }, { "epoch": 0.2416212558321758, "grad_norm": 3.542295455932617, "learning_rate": 1.2369386385530055e-05, "loss": 0.5708, "step": 3392 }, { "epoch": 0.2416924885137301, "grad_norm": 2.964590311050415, "learning_rate": 1.2365570080561971e-05, "loss": 0.5425, "step": 3393 }, { "epoch": 0.2417637211952844, "grad_norm": 3.5611345767974854, "learning_rate": 1.2361753410610784e-05, "loss": 0.6908, "step": 3394 }, { "epoch": 0.2418349538768387, "grad_norm": 2.6296164989471436, "learning_rate": 1.2357936376265367e-05, "loss": 0.4449, "step": 3395 }, { "epoch": 0.241906186558393, "grad_norm": 4.133703231811523, "learning_rate": 1.2354118978114648e-05, "loss": 0.1781, "step": 3396 }, { "epoch": 0.24197741923994728, "grad_norm": 2.360924005508423, "learning_rate": 1.2350301216747615e-05, "loss": 0.3979, "step": 3397 }, { "epoch": 0.24204865192150157, "grad_norm": 3.244323253631592, "learning_rate": 1.2346483092753307e-05, "loss": 0.4664, "step": 3398 }, { "epoch": 0.2421198846030559, "grad_norm": 3.4410550594329834, "learning_rate": 1.2342664606720823e-05, "loss": 0.2411, "step": 3399 }, { "epoch": 0.24219111728461018, "grad_norm": 4.013761043548584, "learning_rate": 1.2338845759239315e-05, "loss": 0.4687, "step": 3400 }, { "epoch": 0.24226234996616447, "grad_norm": 2.2635154724121094, "learning_rate": 1.233502655089799e-05, "loss": 0.4648, "step": 3401 }, { "epoch": 0.2423335826477188, "grad_norm": 3.476940870285034, "learning_rate": 1.2331206982286114e-05, "loss": 0.5755, "step": 3402 }, { "epoch": 0.24240481532927308, "grad_norm": 3.183173179626465, "learning_rate": 1.232738705399301e-05, "loss": 0.621, "step": 3403 }, { "epoch": 0.24247604801082737, "grad_norm": 3.2186105251312256, "learning_rate": 1.2323566766608049e-05, "loss": 0.4567, "step": 3404 }, { "epoch": 0.24254728069238166, "grad_norm": 3.053614854812622, "learning_rate": 1.2319746120720665e-05, "loss": 0.4581, "step": 3405 }, { "epoch": 0.24261851337393597, "grad_norm": 3.9510257244110107, "learning_rate": 1.2315925116920342e-05, "loss": 0.6316, "step": 3406 }, { "epoch": 0.24268974605549026, "grad_norm": 4.16535758972168, "learning_rate": 1.2312103755796625e-05, "loss": 0.5598, "step": 3407 }, { "epoch": 0.24276097873704455, "grad_norm": 2.6339001655578613, "learning_rate": 1.2308282037939108e-05, "loss": 0.1964, "step": 3408 }, { "epoch": 0.24283221141859884, "grad_norm": 2.944469928741455, "learning_rate": 1.2304459963937443e-05, "loss": 0.5365, "step": 3409 }, { "epoch": 0.24290344410015316, "grad_norm": 2.9309425354003906, "learning_rate": 1.2300637534381336e-05, "loss": 0.6214, "step": 3410 }, { "epoch": 0.24297467678170745, "grad_norm": 2.2446606159210205, "learning_rate": 1.229681474986055e-05, "loss": 0.0481, "step": 3411 }, { "epoch": 0.24304590946326174, "grad_norm": 3.942143440246582, "learning_rate": 1.2292991610964902e-05, "loss": 0.732, "step": 3412 }, { "epoch": 0.24311714214481603, "grad_norm": 4.180193901062012, "learning_rate": 1.228916811828426e-05, "loss": 0.4142, "step": 3413 }, { "epoch": 0.24318837482637035, "grad_norm": 3.2792911529541016, "learning_rate": 1.2285344272408553e-05, "loss": 0.7516, "step": 3414 }, { "epoch": 0.24325960750792464, "grad_norm": 2.9258148670196533, "learning_rate": 1.2281520073927757e-05, "loss": 0.659, "step": 3415 }, { "epoch": 0.24333084018947893, "grad_norm": 2.4951045513153076, "learning_rate": 1.227769552343191e-05, "loss": 0.2981, "step": 3416 }, { "epoch": 0.24340207287103324, "grad_norm": 3.801884889602661, "learning_rate": 1.2273870621511098e-05, "loss": 0.468, "step": 3417 }, { "epoch": 0.24347330555258753, "grad_norm": 2.359889030456543, "learning_rate": 1.2270045368755467e-05, "loss": 0.2991, "step": 3418 }, { "epoch": 0.24354453823414182, "grad_norm": 3.7081899642944336, "learning_rate": 1.2266219765755211e-05, "loss": 0.0885, "step": 3419 }, { "epoch": 0.2436157709156961, "grad_norm": 3.581299066543579, "learning_rate": 1.2262393813100584e-05, "loss": 0.7535, "step": 3420 }, { "epoch": 0.24368700359725043, "grad_norm": 4.836090564727783, "learning_rate": 1.2258567511381891e-05, "loss": 0.7644, "step": 3421 }, { "epoch": 0.24375823627880472, "grad_norm": 3.32586669921875, "learning_rate": 1.225474086118949e-05, "loss": 0.7076, "step": 3422 }, { "epoch": 0.243829468960359, "grad_norm": 3.373288869857788, "learning_rate": 1.2250913863113792e-05, "loss": 0.2724, "step": 3423 }, { "epoch": 0.2439007016419133, "grad_norm": 3.7408254146575928, "learning_rate": 1.2247086517745262e-05, "loss": 0.7513, "step": 3424 }, { "epoch": 0.24397193432346762, "grad_norm": 4.487040042877197, "learning_rate": 1.2243258825674424e-05, "loss": 0.6661, "step": 3425 }, { "epoch": 0.2440431670050219, "grad_norm": 7.970608711242676, "learning_rate": 1.2239430787491853e-05, "loss": 0.6003, "step": 3426 }, { "epoch": 0.2441143996865762, "grad_norm": 3.440584182739258, "learning_rate": 1.2235602403788172e-05, "loss": 0.6593, "step": 3427 }, { "epoch": 0.2441856323681305, "grad_norm": 3.4741291999816895, "learning_rate": 1.2231773675154062e-05, "loss": 0.3938, "step": 3428 }, { "epoch": 0.2442568650496848, "grad_norm": 2.9722347259521484, "learning_rate": 1.222794460218026e-05, "loss": 0.1881, "step": 3429 }, { "epoch": 0.2443280977312391, "grad_norm": 2.5545270442962646, "learning_rate": 1.2224115185457543e-05, "loss": 0.4057, "step": 3430 }, { "epoch": 0.24439933041279338, "grad_norm": 2.646820306777954, "learning_rate": 1.222028542557676e-05, "loss": 0.579, "step": 3431 }, { "epoch": 0.2444705630943477, "grad_norm": 1.872708797454834, "learning_rate": 1.2216455323128801e-05, "loss": 0.1775, "step": 3432 }, { "epoch": 0.244541795775902, "grad_norm": 1.844294548034668, "learning_rate": 1.2212624878704612e-05, "loss": 0.1995, "step": 3433 }, { "epoch": 0.24461302845745628, "grad_norm": 4.761245250701904, "learning_rate": 1.2208794092895187e-05, "loss": 0.1798, "step": 3434 }, { "epoch": 0.24468426113901057, "grad_norm": 3.1367688179016113, "learning_rate": 1.220496296629158e-05, "loss": 0.4977, "step": 3435 }, { "epoch": 0.24475549382056488, "grad_norm": 2.419638156890869, "learning_rate": 1.2201131499484896e-05, "loss": 0.5821, "step": 3436 }, { "epoch": 0.24482672650211917, "grad_norm": 3.5790061950683594, "learning_rate": 1.219729969306629e-05, "loss": 0.3405, "step": 3437 }, { "epoch": 0.24489795918367346, "grad_norm": 2.9731693267822266, "learning_rate": 1.2193467547626966e-05, "loss": 0.6198, "step": 3438 }, { "epoch": 0.24496919186522775, "grad_norm": 3.4840381145477295, "learning_rate": 1.2189635063758188e-05, "loss": 0.7732, "step": 3439 }, { "epoch": 0.24504042454678207, "grad_norm": 8.817876815795898, "learning_rate": 1.2185802242051267e-05, "loss": 0.475, "step": 3440 }, { "epoch": 0.24511165722833636, "grad_norm": 2.2354822158813477, "learning_rate": 1.218196908309757e-05, "loss": 0.2825, "step": 3441 }, { "epoch": 0.24518288990989065, "grad_norm": 2.7878990173339844, "learning_rate": 1.2178135587488515e-05, "loss": 0.8176, "step": 3442 }, { "epoch": 0.24525412259144497, "grad_norm": 3.355527639389038, "learning_rate": 1.2174301755815572e-05, "loss": 0.6823, "step": 3443 }, { "epoch": 0.24532535527299926, "grad_norm": 2.58919095993042, "learning_rate": 1.2170467588670256e-05, "loss": 0.3604, "step": 3444 }, { "epoch": 0.24539658795455355, "grad_norm": 3.426776885986328, "learning_rate": 1.2166633086644142e-05, "loss": 0.7132, "step": 3445 }, { "epoch": 0.24546782063610784, "grad_norm": 5.839648723602295, "learning_rate": 1.2162798250328857e-05, "loss": 0.4485, "step": 3446 }, { "epoch": 0.24553905331766215, "grad_norm": 5.154189586639404, "learning_rate": 1.2158963080316071e-05, "loss": 0.3813, "step": 3447 }, { "epoch": 0.24561028599921644, "grad_norm": 3.5704076290130615, "learning_rate": 1.2155127577197519e-05, "loss": 0.5534, "step": 3448 }, { "epoch": 0.24568151868077073, "grad_norm": 3.572700023651123, "learning_rate": 1.2151291741564972e-05, "loss": 0.2763, "step": 3449 }, { "epoch": 0.24575275136232502, "grad_norm": 3.7805871963500977, "learning_rate": 1.2147455574010263e-05, "loss": 0.5307, "step": 3450 }, { "epoch": 0.24582398404387934, "grad_norm": 5.166235446929932, "learning_rate": 1.2143619075125277e-05, "loss": 0.751, "step": 3451 }, { "epoch": 0.24589521672543363, "grad_norm": 3.6623151302337646, "learning_rate": 1.2139782245501942e-05, "loss": 0.6181, "step": 3452 }, { "epoch": 0.24596644940698792, "grad_norm": 2.798804759979248, "learning_rate": 1.213594508573224e-05, "loss": 0.2311, "step": 3453 }, { "epoch": 0.24603768208854224, "grad_norm": 2.7651963233947754, "learning_rate": 1.2132107596408207e-05, "loss": 0.1785, "step": 3454 }, { "epoch": 0.24610891477009653, "grad_norm": 3.3953697681427, "learning_rate": 1.212826977812193e-05, "loss": 0.5845, "step": 3455 }, { "epoch": 0.24618014745165082, "grad_norm": 2.7298104763031006, "learning_rate": 1.212443163146554e-05, "loss": 0.0806, "step": 3456 }, { "epoch": 0.2462513801332051, "grad_norm": 3.8366918563842773, "learning_rate": 1.2120593157031231e-05, "loss": 0.7555, "step": 3457 }, { "epoch": 0.24632261281475942, "grad_norm": 4.272348403930664, "learning_rate": 1.2116754355411233e-05, "loss": 0.746, "step": 3458 }, { "epoch": 0.2463938454963137, "grad_norm": 3.172146797180176, "learning_rate": 1.2112915227197836e-05, "loss": 0.3397, "step": 3459 }, { "epoch": 0.246465078177868, "grad_norm": 4.443112850189209, "learning_rate": 1.2109075772983383e-05, "loss": 0.9535, "step": 3460 }, { "epoch": 0.2465363108594223, "grad_norm": 4.144311428070068, "learning_rate": 1.2105235993360252e-05, "loss": 0.8665, "step": 3461 }, { "epoch": 0.2466075435409766, "grad_norm": 3.211911201477051, "learning_rate": 1.2101395888920888e-05, "loss": 0.5279, "step": 3462 }, { "epoch": 0.2466787762225309, "grad_norm": 3.652310848236084, "learning_rate": 1.2097555460257779e-05, "loss": 0.7956, "step": 3463 }, { "epoch": 0.2467500089040852, "grad_norm": 5.18585729598999, "learning_rate": 1.2093714707963464e-05, "loss": 0.5841, "step": 3464 }, { "epoch": 0.24682124158563948, "grad_norm": 3.7618236541748047, "learning_rate": 1.2089873632630531e-05, "loss": 0.2651, "step": 3465 }, { "epoch": 0.2468924742671938, "grad_norm": 2.896047830581665, "learning_rate": 1.2086032234851616e-05, "loss": 0.5158, "step": 3466 }, { "epoch": 0.24696370694874809, "grad_norm": 2.567082405090332, "learning_rate": 1.2082190515219412e-05, "loss": 0.3787, "step": 3467 }, { "epoch": 0.24703493963030237, "grad_norm": 3.1349456310272217, "learning_rate": 1.2078348474326652e-05, "loss": 0.7827, "step": 3468 }, { "epoch": 0.2471061723118567, "grad_norm": 2.7591936588287354, "learning_rate": 1.2074506112766127e-05, "loss": 0.3366, "step": 3469 }, { "epoch": 0.24717740499341098, "grad_norm": 2.8999931812286377, "learning_rate": 1.2070663431130666e-05, "loss": 0.6674, "step": 3470 }, { "epoch": 0.24724863767496527, "grad_norm": 1.2289211750030518, "learning_rate": 1.2066820430013168e-05, "loss": 0.1048, "step": 3471 }, { "epoch": 0.24731987035651956, "grad_norm": 3.3717403411865234, "learning_rate": 1.2062977110006559e-05, "loss": 0.5637, "step": 3472 }, { "epoch": 0.24739110303807388, "grad_norm": 3.7943639755249023, "learning_rate": 1.205913347170383e-05, "loss": 0.4985, "step": 3473 }, { "epoch": 0.24746233571962817, "grad_norm": 3.986232280731201, "learning_rate": 1.2055289515698008e-05, "loss": 0.7415, "step": 3474 }, { "epoch": 0.24753356840118246, "grad_norm": 3.537716865539551, "learning_rate": 1.205144524258218e-05, "loss": 0.3452, "step": 3475 }, { "epoch": 0.24760480108273675, "grad_norm": 2.6177000999450684, "learning_rate": 1.2047600652949476e-05, "loss": 0.4336, "step": 3476 }, { "epoch": 0.24767603376429106, "grad_norm": 3.245483875274658, "learning_rate": 1.2043755747393077e-05, "loss": 0.7477, "step": 3477 }, { "epoch": 0.24774726644584535, "grad_norm": 2.809401512145996, "learning_rate": 1.203991052650621e-05, "loss": 0.2162, "step": 3478 }, { "epoch": 0.24781849912739964, "grad_norm": 2.353790521621704, "learning_rate": 1.2036064990882162e-05, "loss": 0.5282, "step": 3479 }, { "epoch": 0.24788973180895396, "grad_norm": 3.0466346740722656, "learning_rate": 1.2032219141114253e-05, "loss": 0.4149, "step": 3480 }, { "epoch": 0.24796096449050825, "grad_norm": 3.991736888885498, "learning_rate": 1.2028372977795854e-05, "loss": 0.421, "step": 3481 }, { "epoch": 0.24803219717206254, "grad_norm": 3.3412389755249023, "learning_rate": 1.2024526501520398e-05, "loss": 0.3665, "step": 3482 }, { "epoch": 0.24810342985361683, "grad_norm": 2.5686872005462646, "learning_rate": 1.2020679712881347e-05, "loss": 0.5787, "step": 3483 }, { "epoch": 0.24817466253517115, "grad_norm": 3.7030344009399414, "learning_rate": 1.2016832612472225e-05, "loss": 0.687, "step": 3484 }, { "epoch": 0.24824589521672544, "grad_norm": 3.169741630554199, "learning_rate": 1.2012985200886602e-05, "loss": 0.7302, "step": 3485 }, { "epoch": 0.24831712789827973, "grad_norm": 2.3285505771636963, "learning_rate": 1.2009137478718093e-05, "loss": 0.3514, "step": 3486 }, { "epoch": 0.24838836057983402, "grad_norm": 2.5512173175811768, "learning_rate": 1.2005289446560357e-05, "loss": 0.5172, "step": 3487 }, { "epoch": 0.24845959326138833, "grad_norm": 4.091785430908203, "learning_rate": 1.2001441105007114e-05, "loss": 0.4514, "step": 3488 }, { "epoch": 0.24853082594294262, "grad_norm": 3.7817108631134033, "learning_rate": 1.199759245465212e-05, "loss": 0.3344, "step": 3489 }, { "epoch": 0.2486020586244969, "grad_norm": 2.456705093383789, "learning_rate": 1.199374349608918e-05, "loss": 0.3108, "step": 3490 }, { "epoch": 0.2486732913060512, "grad_norm": 3.7280704975128174, "learning_rate": 1.198989422991215e-05, "loss": 0.3461, "step": 3491 }, { "epoch": 0.24874452398760552, "grad_norm": 1.3634617328643799, "learning_rate": 1.1986044656714933e-05, "loss": 0.044, "step": 3492 }, { "epoch": 0.2488157566691598, "grad_norm": 3.3870794773101807, "learning_rate": 1.1982194777091476e-05, "loss": 0.4444, "step": 3493 }, { "epoch": 0.2488869893507141, "grad_norm": 3.413191795349121, "learning_rate": 1.1978344591635779e-05, "loss": 0.3754, "step": 3494 }, { "epoch": 0.24895822203226842, "grad_norm": 2.4845645427703857, "learning_rate": 1.1974494100941884e-05, "loss": 0.3579, "step": 3495 }, { "epoch": 0.2490294547138227, "grad_norm": 7.896911144256592, "learning_rate": 1.1970643305603885e-05, "loss": 0.3251, "step": 3496 }, { "epoch": 0.249100687395377, "grad_norm": 3.1977970600128174, "learning_rate": 1.1966792206215914e-05, "loss": 0.5384, "step": 3497 }, { "epoch": 0.24917192007693129, "grad_norm": 3.1998472213745117, "learning_rate": 1.1962940803372158e-05, "loss": 0.7068, "step": 3498 }, { "epoch": 0.2492431527584856, "grad_norm": 3.3920156955718994, "learning_rate": 1.1959089097666853e-05, "loss": 0.7976, "step": 3499 }, { "epoch": 0.2493143854400399, "grad_norm": 3.3412859439849854, "learning_rate": 1.1955237089694279e-05, "loss": 0.5712, "step": 3500 }, { "epoch": 0.24938561812159418, "grad_norm": 4.001209259033203, "learning_rate": 1.1951384780048752e-05, "loss": 0.5331, "step": 3501 }, { "epoch": 0.24945685080314847, "grad_norm": 2.8858442306518555, "learning_rate": 1.1947532169324649e-05, "loss": 0.4994, "step": 3502 }, { "epoch": 0.2495280834847028, "grad_norm": 4.16473388671875, "learning_rate": 1.194367925811639e-05, "loss": 0.6606, "step": 3503 }, { "epoch": 0.24959931616625708, "grad_norm": 3.159484624862671, "learning_rate": 1.1939826047018436e-05, "loss": 0.5339, "step": 3504 }, { "epoch": 0.24967054884781137, "grad_norm": 1.9360827207565308, "learning_rate": 1.1935972536625302e-05, "loss": 0.3426, "step": 3505 }, { "epoch": 0.2497417815293657, "grad_norm": 2.794954299926758, "learning_rate": 1.1932118727531541e-05, "loss": 0.2654, "step": 3506 }, { "epoch": 0.24981301421091998, "grad_norm": 3.0443692207336426, "learning_rate": 1.1928264620331755e-05, "loss": 0.5385, "step": 3507 }, { "epoch": 0.24988424689247427, "grad_norm": 5.135855674743652, "learning_rate": 1.1924410215620596e-05, "loss": 0.7256, "step": 3508 }, { "epoch": 0.24995547957402856, "grad_norm": 4.152370452880859, "learning_rate": 1.192055551399276e-05, "loss": 0.5536, "step": 3509 }, { "epoch": 0.2500267122555829, "grad_norm": 2.6929328441619873, "learning_rate": 1.1916700516042986e-05, "loss": 0.4494, "step": 3510 }, { "epoch": 0.25009794493713716, "grad_norm": 3.192110300064087, "learning_rate": 1.1912845222366061e-05, "loss": 0.4668, "step": 3511 }, { "epoch": 0.25016917761869145, "grad_norm": 3.305152177810669, "learning_rate": 1.1908989633556816e-05, "loss": 0.4184, "step": 3512 }, { "epoch": 0.25024041030024574, "grad_norm": 2.223947763442993, "learning_rate": 1.1905133750210126e-05, "loss": 0.0864, "step": 3513 }, { "epoch": 0.25031164298180003, "grad_norm": 3.0986592769622803, "learning_rate": 1.1901277572920922e-05, "loss": 0.6987, "step": 3514 }, { "epoch": 0.2503828756633543, "grad_norm": 2.6953861713409424, "learning_rate": 1.1897421102284166e-05, "loss": 0.5623, "step": 3515 }, { "epoch": 0.25045410834490867, "grad_norm": 3.9850735664367676, "learning_rate": 1.1893564338894872e-05, "loss": 0.3279, "step": 3516 }, { "epoch": 0.25052534102646296, "grad_norm": 2.507908344268799, "learning_rate": 1.1889707283348104e-05, "loss": 0.3854, "step": 3517 }, { "epoch": 0.25059657370801725, "grad_norm": 3.131206512451172, "learning_rate": 1.188584993623896e-05, "loss": 0.2103, "step": 3518 }, { "epoch": 0.25066780638957153, "grad_norm": 3.107682943344116, "learning_rate": 1.1881992298162593e-05, "loss": 0.5544, "step": 3519 }, { "epoch": 0.2507390390711258, "grad_norm": 3.4062798023223877, "learning_rate": 1.1878134369714193e-05, "loss": 0.4174, "step": 3520 }, { "epoch": 0.2508102717526801, "grad_norm": 5.131697177886963, "learning_rate": 1.1874276151489002e-05, "loss": 0.5112, "step": 3521 }, { "epoch": 0.2508815044342344, "grad_norm": 3.134702682495117, "learning_rate": 1.1870417644082304e-05, "loss": 0.7744, "step": 3522 }, { "epoch": 0.25095273711578875, "grad_norm": 3.4121854305267334, "learning_rate": 1.1866558848089422e-05, "loss": 0.6329, "step": 3523 }, { "epoch": 0.25102396979734304, "grad_norm": 3.0653622150421143, "learning_rate": 1.1862699764105731e-05, "loss": 0.3985, "step": 3524 }, { "epoch": 0.25109520247889733, "grad_norm": 2.69709849357605, "learning_rate": 1.1858840392726652e-05, "loss": 0.6027, "step": 3525 }, { "epoch": 0.2511664351604516, "grad_norm": 3.212862968444824, "learning_rate": 1.185498073454764e-05, "loss": 1.0307, "step": 3526 }, { "epoch": 0.2512376678420059, "grad_norm": 2.463923931121826, "learning_rate": 1.1851120790164206e-05, "loss": 0.2633, "step": 3527 }, { "epoch": 0.2513089005235602, "grad_norm": 2.319136381149292, "learning_rate": 1.1847260560171895e-05, "loss": 0.5443, "step": 3528 }, { "epoch": 0.2513801332051145, "grad_norm": 2.804433584213257, "learning_rate": 1.1843400045166305e-05, "loss": 0.581, "step": 3529 }, { "epoch": 0.2514513658866688, "grad_norm": 3.4666216373443604, "learning_rate": 1.1839539245743066e-05, "loss": 0.6017, "step": 3530 }, { "epoch": 0.2515225985682231, "grad_norm": 3.5864920616149902, "learning_rate": 1.183567816249787e-05, "loss": 0.5007, "step": 3531 }, { "epoch": 0.2515938312497774, "grad_norm": 3.2405028343200684, "learning_rate": 1.1831816796026434e-05, "loss": 0.6276, "step": 3532 }, { "epoch": 0.2516650639313317, "grad_norm": 5.3313422203063965, "learning_rate": 1.1827955146924532e-05, "loss": 0.7845, "step": 3533 }, { "epoch": 0.251736296612886, "grad_norm": 3.5416691303253174, "learning_rate": 1.1824093215787977e-05, "loss": 0.7626, "step": 3534 }, { "epoch": 0.2518075292944403, "grad_norm": 5.838059425354004, "learning_rate": 1.182023100321262e-05, "loss": 0.5066, "step": 3535 }, { "epoch": 0.25187876197599457, "grad_norm": 2.570932626724243, "learning_rate": 1.1816368509794365e-05, "loss": 0.5701, "step": 3536 }, { "epoch": 0.25194999465754886, "grad_norm": 2.2317867279052734, "learning_rate": 1.1812505736129156e-05, "loss": 0.623, "step": 3537 }, { "epoch": 0.2520212273391032, "grad_norm": 2.2696824073791504, "learning_rate": 1.1808642682812973e-05, "loss": 0.4696, "step": 3538 }, { "epoch": 0.2520924600206575, "grad_norm": 4.601973056793213, "learning_rate": 1.1804779350441852e-05, "loss": 0.4109, "step": 3539 }, { "epoch": 0.2521636927022118, "grad_norm": 7.56576681137085, "learning_rate": 1.1800915739611865e-05, "loss": 0.2228, "step": 3540 }, { "epoch": 0.2522349253837661, "grad_norm": 2.628319025039673, "learning_rate": 1.1797051850919123e-05, "loss": 0.5162, "step": 3541 }, { "epoch": 0.25230615806532036, "grad_norm": 3.137352228164673, "learning_rate": 1.1793187684959786e-05, "loss": 0.5783, "step": 3542 }, { "epoch": 0.25237739074687465, "grad_norm": 3.666346549987793, "learning_rate": 1.1789323242330057e-05, "loss": 0.681, "step": 3543 }, { "epoch": 0.25244862342842894, "grad_norm": 4.8120503425598145, "learning_rate": 1.1785458523626177e-05, "loss": 0.5991, "step": 3544 }, { "epoch": 0.2525198561099833, "grad_norm": 1.7714455127716064, "learning_rate": 1.1781593529444432e-05, "loss": 0.0928, "step": 3545 }, { "epoch": 0.2525910887915376, "grad_norm": 4.320879936218262, "learning_rate": 1.1777728260381154e-05, "loss": 0.6632, "step": 3546 }, { "epoch": 0.25266232147309187, "grad_norm": 3.3937041759490967, "learning_rate": 1.1773862717032711e-05, "loss": 0.6132, "step": 3547 }, { "epoch": 0.25273355415464616, "grad_norm": 2.079728603363037, "learning_rate": 1.176999689999552e-05, "loss": 0.2873, "step": 3548 }, { "epoch": 0.25280478683620045, "grad_norm": 2.6121668815612793, "learning_rate": 1.1766130809866037e-05, "loss": 0.4933, "step": 3549 }, { "epoch": 0.25287601951775474, "grad_norm": 3.452188491821289, "learning_rate": 1.1762264447240753e-05, "loss": 0.4332, "step": 3550 }, { "epoch": 0.252947252199309, "grad_norm": 3.164970874786377, "learning_rate": 1.1758397812716216e-05, "loss": 0.3739, "step": 3551 }, { "epoch": 0.2530184848808633, "grad_norm": 6.692883014678955, "learning_rate": 1.1754530906889e-05, "loss": 0.6652, "step": 3552 }, { "epoch": 0.25308971756241766, "grad_norm": 4.3864946365356445, "learning_rate": 1.1750663730355737e-05, "loss": 0.4123, "step": 3553 }, { "epoch": 0.25316095024397195, "grad_norm": 5.440397262573242, "learning_rate": 1.174679628371309e-05, "loss": 0.2522, "step": 3554 }, { "epoch": 0.25323218292552624, "grad_norm": 3.122704029083252, "learning_rate": 1.174292856755776e-05, "loss": 0.8215, "step": 3555 }, { "epoch": 0.25330341560708053, "grad_norm": 2.368931293487549, "learning_rate": 1.1739060582486506e-05, "loss": 0.3127, "step": 3556 }, { "epoch": 0.2533746482886348, "grad_norm": 3.359668016433716, "learning_rate": 1.173519232909611e-05, "loss": 0.5556, "step": 3557 }, { "epoch": 0.2534458809701891, "grad_norm": 2.571807622909546, "learning_rate": 1.1731323807983406e-05, "loss": 0.4974, "step": 3558 }, { "epoch": 0.2535171136517434, "grad_norm": 3.785839080810547, "learning_rate": 1.1727455019745269e-05, "loss": 0.6297, "step": 3559 }, { "epoch": 0.25358834633329774, "grad_norm": 3.113342046737671, "learning_rate": 1.1723585964978612e-05, "loss": 0.3969, "step": 3560 }, { "epoch": 0.25365957901485203, "grad_norm": 1.9507261514663696, "learning_rate": 1.1719716644280388e-05, "loss": 0.2489, "step": 3561 }, { "epoch": 0.2537308116964063, "grad_norm": 3.594454526901245, "learning_rate": 1.1715847058247599e-05, "loss": 0.541, "step": 3562 }, { "epoch": 0.2538020443779606, "grad_norm": 2.394909620285034, "learning_rate": 1.1711977207477276e-05, "loss": 0.22, "step": 3563 }, { "epoch": 0.2538732770595149, "grad_norm": 2.3210220336914062, "learning_rate": 1.1708107092566501e-05, "loss": 0.3141, "step": 3564 }, { "epoch": 0.2539445097410692, "grad_norm": 4.046879291534424, "learning_rate": 1.170423671411239e-05, "loss": 0.484, "step": 3565 }, { "epoch": 0.2540157424226235, "grad_norm": 1.7023299932479858, "learning_rate": 1.1700366072712108e-05, "loss": 0.1858, "step": 3566 }, { "epoch": 0.25408697510417777, "grad_norm": 3.202017307281494, "learning_rate": 1.1696495168962848e-05, "loss": 0.7025, "step": 3567 }, { "epoch": 0.2541582077857321, "grad_norm": 2.7147397994995117, "learning_rate": 1.1692624003461854e-05, "loss": 0.6127, "step": 3568 }, { "epoch": 0.2542294404672864, "grad_norm": 1.9901703596115112, "learning_rate": 1.168875257680641e-05, "loss": 0.3054, "step": 3569 }, { "epoch": 0.2543006731488407, "grad_norm": 4.00234317779541, "learning_rate": 1.168488088959383e-05, "loss": 0.6245, "step": 3570 }, { "epoch": 0.254371905830395, "grad_norm": 4.661257743835449, "learning_rate": 1.1681008942421484e-05, "loss": 0.6628, "step": 3571 }, { "epoch": 0.2544431385119493, "grad_norm": 4.365265369415283, "learning_rate": 1.1677136735886767e-05, "loss": 0.2516, "step": 3572 }, { "epoch": 0.25451437119350356, "grad_norm": 3.3826115131378174, "learning_rate": 1.1673264270587122e-05, "loss": 0.4562, "step": 3573 }, { "epoch": 0.25458560387505785, "grad_norm": 4.228590488433838, "learning_rate": 1.1669391547120032e-05, "loss": 0.4619, "step": 3574 }, { "epoch": 0.2546568365566122, "grad_norm": 4.380306243896484, "learning_rate": 1.1665518566083016e-05, "loss": 0.9418, "step": 3575 }, { "epoch": 0.2547280692381665, "grad_norm": 8.1712007522583, "learning_rate": 1.1661645328073641e-05, "loss": 0.8188, "step": 3576 }, { "epoch": 0.2547993019197208, "grad_norm": 3.00211763381958, "learning_rate": 1.16577718336895e-05, "loss": 0.4716, "step": 3577 }, { "epoch": 0.25487053460127507, "grad_norm": 3.0481960773468018, "learning_rate": 1.165389808352824e-05, "loss": 0.3244, "step": 3578 }, { "epoch": 0.25494176728282936, "grad_norm": 3.06108021736145, "learning_rate": 1.1650024078187534e-05, "loss": 0.6729, "step": 3579 }, { "epoch": 0.25501299996438365, "grad_norm": 4.200718879699707, "learning_rate": 1.1646149818265107e-05, "loss": 0.8074, "step": 3580 }, { "epoch": 0.25508423264593794, "grad_norm": 2.94278621673584, "learning_rate": 1.1642275304358713e-05, "loss": 0.4997, "step": 3581 }, { "epoch": 0.2551554653274922, "grad_norm": 2.8631937503814697, "learning_rate": 1.1638400537066152e-05, "loss": 0.6604, "step": 3582 }, { "epoch": 0.25522669800904657, "grad_norm": 2.2186992168426514, "learning_rate": 1.1634525516985264e-05, "loss": 0.2596, "step": 3583 }, { "epoch": 0.25529793069060086, "grad_norm": 3.205430507659912, "learning_rate": 1.1630650244713917e-05, "loss": 0.8016, "step": 3584 }, { "epoch": 0.25536916337215515, "grad_norm": 2.9316141605377197, "learning_rate": 1.1626774720850031e-05, "loss": 0.4172, "step": 3585 }, { "epoch": 0.25544039605370944, "grad_norm": 3.7130119800567627, "learning_rate": 1.1622898945991559e-05, "loss": 0.1317, "step": 3586 }, { "epoch": 0.25551162873526373, "grad_norm": 4.265615463256836, "learning_rate": 1.1619022920736491e-05, "loss": 0.4341, "step": 3587 }, { "epoch": 0.255582861416818, "grad_norm": 2.245126724243164, "learning_rate": 1.161514664568286e-05, "loss": 0.3455, "step": 3588 }, { "epoch": 0.2556540940983723, "grad_norm": 2.417196273803711, "learning_rate": 1.1611270121428736e-05, "loss": 0.4062, "step": 3589 }, { "epoch": 0.25572532677992665, "grad_norm": 4.761309623718262, "learning_rate": 1.160739334857222e-05, "loss": 0.7847, "step": 3590 }, { "epoch": 0.25579655946148094, "grad_norm": 2.775887966156006, "learning_rate": 1.1603516327711466e-05, "loss": 0.5323, "step": 3591 }, { "epoch": 0.25586779214303523, "grad_norm": 2.195596218109131, "learning_rate": 1.1599639059444657e-05, "loss": 0.2538, "step": 3592 }, { "epoch": 0.2559390248245895, "grad_norm": 2.63349986076355, "learning_rate": 1.1595761544370015e-05, "loss": 0.5187, "step": 3593 }, { "epoch": 0.2560102575061438, "grad_norm": 3.723637819290161, "learning_rate": 1.1591883783085799e-05, "loss": 0.474, "step": 3594 }, { "epoch": 0.2560814901876981, "grad_norm": 4.991243362426758, "learning_rate": 1.1588005776190305e-05, "loss": 0.2126, "step": 3595 }, { "epoch": 0.2561527228692524, "grad_norm": 2.812591075897217, "learning_rate": 1.1584127524281877e-05, "loss": 0.5286, "step": 3596 }, { "epoch": 0.25622395555080674, "grad_norm": 3.0670981407165527, "learning_rate": 1.1580249027958883e-05, "loss": 0.3132, "step": 3597 }, { "epoch": 0.256295188232361, "grad_norm": 4.875831604003906, "learning_rate": 1.1576370287819737e-05, "loss": 0.2494, "step": 3598 }, { "epoch": 0.2563664209139153, "grad_norm": 3.286660671234131, "learning_rate": 1.1572491304462891e-05, "loss": 0.3008, "step": 3599 }, { "epoch": 0.2564376535954696, "grad_norm": 2.6820900440216064, "learning_rate": 1.156861207848683e-05, "loss": 0.5848, "step": 3600 }, { "epoch": 0.2565088862770239, "grad_norm": 2.3977859020233154, "learning_rate": 1.156473261049008e-05, "loss": 0.6206, "step": 3601 }, { "epoch": 0.2565801189585782, "grad_norm": 4.024013042449951, "learning_rate": 1.15608529010712e-05, "loss": 0.4244, "step": 3602 }, { "epoch": 0.2566513516401325, "grad_norm": 3.644878387451172, "learning_rate": 1.1556972950828791e-05, "loss": 0.6476, "step": 3603 }, { "epoch": 0.25672258432168676, "grad_norm": 4.871897220611572, "learning_rate": 1.1553092760361488e-05, "loss": 0.6045, "step": 3604 }, { "epoch": 0.2567938170032411, "grad_norm": 2.1923139095306396, "learning_rate": 1.1549212330267969e-05, "loss": 0.1366, "step": 3605 }, { "epoch": 0.2568650496847954, "grad_norm": 2.574031114578247, "learning_rate": 1.1545331661146941e-05, "loss": 0.4875, "step": 3606 }, { "epoch": 0.2569362823663497, "grad_norm": 2.9249372482299805, "learning_rate": 1.1541450753597147e-05, "loss": 0.6937, "step": 3607 }, { "epoch": 0.257007515047904, "grad_norm": 2.675241470336914, "learning_rate": 1.1537569608217381e-05, "loss": 0.4707, "step": 3608 }, { "epoch": 0.25707874772945827, "grad_norm": 2.506319999694824, "learning_rate": 1.1533688225606458e-05, "loss": 0.3581, "step": 3609 }, { "epoch": 0.25714998041101256, "grad_norm": 3.4105536937713623, "learning_rate": 1.1529806606363234e-05, "loss": 0.5304, "step": 3610 }, { "epoch": 0.25722121309256685, "grad_norm": 2.6695327758789062, "learning_rate": 1.1525924751086603e-05, "loss": 0.7959, "step": 3611 }, { "epoch": 0.2572924457741212, "grad_norm": 2.9633021354675293, "learning_rate": 1.15220426603755e-05, "loss": 0.5921, "step": 3612 }, { "epoch": 0.2573636784556755, "grad_norm": 2.8402535915374756, "learning_rate": 1.1518160334828885e-05, "loss": 0.2846, "step": 3613 }, { "epoch": 0.25743491113722977, "grad_norm": 3.8807644844055176, "learning_rate": 1.1514277775045768e-05, "loss": 0.6995, "step": 3614 }, { "epoch": 0.25750614381878406, "grad_norm": 2.800935983657837, "learning_rate": 1.1510394981625184e-05, "loss": 0.481, "step": 3615 }, { "epoch": 0.25757737650033835, "grad_norm": 3.4788875579833984, "learning_rate": 1.1506511955166206e-05, "loss": 0.7977, "step": 3616 }, { "epoch": 0.25764860918189264, "grad_norm": 3.0510096549987793, "learning_rate": 1.150262869626795e-05, "loss": 0.649, "step": 3617 }, { "epoch": 0.25771984186344693, "grad_norm": 3.5058555603027344, "learning_rate": 1.1498745205529558e-05, "loss": 0.7477, "step": 3618 }, { "epoch": 0.2577910745450012, "grad_norm": 3.084942579269409, "learning_rate": 1.1494861483550216e-05, "loss": 0.2429, "step": 3619 }, { "epoch": 0.25786230722655556, "grad_norm": 3.871434450149536, "learning_rate": 1.1490977530929141e-05, "loss": 0.3928, "step": 3620 }, { "epoch": 0.25793353990810985, "grad_norm": 4.387081146240234, "learning_rate": 1.1487093348265585e-05, "loss": 0.4904, "step": 3621 }, { "epoch": 0.25800477258966414, "grad_norm": 2.8228070735931396, "learning_rate": 1.1483208936158842e-05, "loss": 0.7455, "step": 3622 }, { "epoch": 0.25807600527121843, "grad_norm": 1.3927373886108398, "learning_rate": 1.1479324295208234e-05, "loss": 0.1401, "step": 3623 }, { "epoch": 0.2581472379527727, "grad_norm": 4.089735507965088, "learning_rate": 1.1475439426013122e-05, "loss": 0.7267, "step": 3624 }, { "epoch": 0.258218470634327, "grad_norm": 2.0236051082611084, "learning_rate": 1.14715543291729e-05, "loss": 0.3649, "step": 3625 }, { "epoch": 0.2582897033158813, "grad_norm": 3.7066752910614014, "learning_rate": 1.1467669005286999e-05, "loss": 0.4505, "step": 3626 }, { "epoch": 0.25836093599743565, "grad_norm": 8.184478759765625, "learning_rate": 1.1463783454954883e-05, "loss": 0.1109, "step": 3627 }, { "epoch": 0.25843216867898994, "grad_norm": 3.202418327331543, "learning_rate": 1.1459897678776055e-05, "loss": 0.3424, "step": 3628 }, { "epoch": 0.2585034013605442, "grad_norm": 3.0126278400421143, "learning_rate": 1.1456011677350052e-05, "loss": 0.551, "step": 3629 }, { "epoch": 0.2585746340420985, "grad_norm": 5.342653274536133, "learning_rate": 1.1452125451276435e-05, "loss": 0.5023, "step": 3630 }, { "epoch": 0.2586458667236528, "grad_norm": 4.617044448852539, "learning_rate": 1.1448239001154821e-05, "loss": 0.3969, "step": 3631 }, { "epoch": 0.2587170994052071, "grad_norm": 2.6997230052948, "learning_rate": 1.144435232758484e-05, "loss": 0.264, "step": 3632 }, { "epoch": 0.2587883320867614, "grad_norm": 3.599173069000244, "learning_rate": 1.144046543116617e-05, "loss": 0.5781, "step": 3633 }, { "epoch": 0.25885956476831573, "grad_norm": 3.1188127994537354, "learning_rate": 1.1436578312498518e-05, "loss": 0.3622, "step": 3634 }, { "epoch": 0.25893079744987, "grad_norm": 4.734534740447998, "learning_rate": 1.1432690972181624e-05, "loss": 0.2919, "step": 3635 }, { "epoch": 0.2590020301314243, "grad_norm": 2.876941680908203, "learning_rate": 1.1428803410815268e-05, "loss": 0.2941, "step": 3636 }, { "epoch": 0.2590732628129786, "grad_norm": 5.632003307342529, "learning_rate": 1.1424915628999261e-05, "loss": 0.4458, "step": 3637 }, { "epoch": 0.2591444954945329, "grad_norm": 3.4588048458099365, "learning_rate": 1.1421027627333445e-05, "loss": 0.8233, "step": 3638 }, { "epoch": 0.2592157281760872, "grad_norm": 3.4155566692352295, "learning_rate": 1.14171394064177e-05, "loss": 0.7805, "step": 3639 }, { "epoch": 0.25928696085764147, "grad_norm": 3.404533624649048, "learning_rate": 1.141325096685194e-05, "loss": 0.495, "step": 3640 }, { "epoch": 0.25935819353919576, "grad_norm": 3.6122097969055176, "learning_rate": 1.1409362309236107e-05, "loss": 0.8349, "step": 3641 }, { "epoch": 0.2594294262207501, "grad_norm": 3.2106356620788574, "learning_rate": 1.1405473434170185e-05, "loss": 0.5768, "step": 3642 }, { "epoch": 0.2595006589023044, "grad_norm": 2.0744831562042236, "learning_rate": 1.1401584342254183e-05, "loss": 0.4134, "step": 3643 }, { "epoch": 0.2595718915838587, "grad_norm": 4.678126811981201, "learning_rate": 1.1397695034088152e-05, "loss": 0.6698, "step": 3644 }, { "epoch": 0.259643124265413, "grad_norm": 4.343177318572998, "learning_rate": 1.1393805510272171e-05, "loss": 0.6129, "step": 3645 }, { "epoch": 0.25971435694696726, "grad_norm": 2.9252431392669678, "learning_rate": 1.1389915771406354e-05, "loss": 0.8011, "step": 3646 }, { "epoch": 0.25978558962852155, "grad_norm": 2.5129618644714355, "learning_rate": 1.1386025818090847e-05, "loss": 0.3728, "step": 3647 }, { "epoch": 0.25985682231007584, "grad_norm": 1.7155877351760864, "learning_rate": 1.138213565092583e-05, "loss": 0.248, "step": 3648 }, { "epoch": 0.2599280549916302, "grad_norm": 1.8366961479187012, "learning_rate": 1.1378245270511512e-05, "loss": 0.23, "step": 3649 }, { "epoch": 0.2599992876731845, "grad_norm": 3.415086507797241, "learning_rate": 1.1374354677448145e-05, "loss": 0.9364, "step": 3650 }, { "epoch": 0.26007052035473877, "grad_norm": 3.0105371475219727, "learning_rate": 1.1370463872336004e-05, "loss": 0.3543, "step": 3651 }, { "epoch": 0.26014175303629306, "grad_norm": 3.244387149810791, "learning_rate": 1.1366572855775397e-05, "loss": 0.7327, "step": 3652 }, { "epoch": 0.26021298571784734, "grad_norm": 3.6394155025482178, "learning_rate": 1.1362681628366676e-05, "loss": 0.579, "step": 3653 }, { "epoch": 0.26028421839940163, "grad_norm": 3.9524190425872803, "learning_rate": 1.1358790190710213e-05, "loss": 0.6663, "step": 3654 }, { "epoch": 0.2603554510809559, "grad_norm": 4.8753485679626465, "learning_rate": 1.1354898543406411e-05, "loss": 0.2928, "step": 3655 }, { "epoch": 0.2604266837625102, "grad_norm": 3.5505259037017822, "learning_rate": 1.1351006687055722e-05, "loss": 0.5399, "step": 3656 }, { "epoch": 0.26049791644406456, "grad_norm": 0.9308469295501709, "learning_rate": 1.1347114622258613e-05, "loss": 0.0442, "step": 3657 }, { "epoch": 0.26056914912561885, "grad_norm": 3.6820852756500244, "learning_rate": 1.1343222349615585e-05, "loss": 0.9854, "step": 3658 }, { "epoch": 0.26064038180717314, "grad_norm": 2.779742479324341, "learning_rate": 1.1339329869727187e-05, "loss": 0.5523, "step": 3659 }, { "epoch": 0.26071161448872743, "grad_norm": 2.3575146198272705, "learning_rate": 1.133543718319398e-05, "loss": 0.3043, "step": 3660 }, { "epoch": 0.2607828471702817, "grad_norm": 2.4427132606506348, "learning_rate": 1.1331544290616569e-05, "loss": 0.4473, "step": 3661 }, { "epoch": 0.260854079851836, "grad_norm": 5.184088230133057, "learning_rate": 1.1327651192595588e-05, "loss": 0.7606, "step": 3662 }, { "epoch": 0.2609253125333903, "grad_norm": 3.323179006576538, "learning_rate": 1.1323757889731697e-05, "loss": 0.3586, "step": 3663 }, { "epoch": 0.26099654521494464, "grad_norm": 3.127376079559326, "learning_rate": 1.1319864382625595e-05, "loss": 0.5659, "step": 3664 }, { "epoch": 0.26106777789649893, "grad_norm": 5.065756797790527, "learning_rate": 1.1315970671878014e-05, "loss": 0.3321, "step": 3665 }, { "epoch": 0.2611390105780532, "grad_norm": 2.582815408706665, "learning_rate": 1.1312076758089708e-05, "loss": 0.3103, "step": 3666 }, { "epoch": 0.2612102432596075, "grad_norm": 2.945856809616089, "learning_rate": 1.130818264186147e-05, "loss": 0.733, "step": 3667 }, { "epoch": 0.2612814759411618, "grad_norm": 1.7865132093429565, "learning_rate": 1.1304288323794121e-05, "loss": 0.1511, "step": 3668 }, { "epoch": 0.2613527086227161, "grad_norm": 3.446915626525879, "learning_rate": 1.1300393804488519e-05, "loss": 0.648, "step": 3669 }, { "epoch": 0.2614239413042704, "grad_norm": 2.204465627670288, "learning_rate": 1.1296499084545543e-05, "loss": 0.1849, "step": 3670 }, { "epoch": 0.26149517398582467, "grad_norm": 3.655649423599243, "learning_rate": 1.1292604164566108e-05, "loss": 0.6722, "step": 3671 }, { "epoch": 0.261566406667379, "grad_norm": 3.4007489681243896, "learning_rate": 1.1288709045151161e-05, "loss": 0.5589, "step": 3672 }, { "epoch": 0.2616376393489333, "grad_norm": 2.353759527206421, "learning_rate": 1.128481372690168e-05, "loss": 0.2156, "step": 3673 }, { "epoch": 0.2617088720304876, "grad_norm": 2.753774881362915, "learning_rate": 1.1280918210418674e-05, "loss": 0.724, "step": 3674 }, { "epoch": 0.2617801047120419, "grad_norm": 3.871187210083008, "learning_rate": 1.1277022496303178e-05, "loss": 0.3259, "step": 3675 }, { "epoch": 0.2618513373935962, "grad_norm": 2.8721401691436768, "learning_rate": 1.1273126585156262e-05, "loss": 0.5068, "step": 3676 }, { "epoch": 0.26192257007515046, "grad_norm": 3.7088444232940674, "learning_rate": 1.1269230477579025e-05, "loss": 0.21, "step": 3677 }, { "epoch": 0.26199380275670475, "grad_norm": 2.393704891204834, "learning_rate": 1.1265334174172593e-05, "loss": 0.2428, "step": 3678 }, { "epoch": 0.2620650354382591, "grad_norm": 2.0165064334869385, "learning_rate": 1.1261437675538132e-05, "loss": 0.2582, "step": 3679 }, { "epoch": 0.2621362681198134, "grad_norm": 3.0158703327178955, "learning_rate": 1.1257540982276827e-05, "loss": 0.7575, "step": 3680 }, { "epoch": 0.2622075008013677, "grad_norm": 2.250725507736206, "learning_rate": 1.1253644094989895e-05, "loss": 0.2243, "step": 3681 }, { "epoch": 0.26227873348292197, "grad_norm": 1.7925552129745483, "learning_rate": 1.1249747014278594e-05, "loss": 0.1491, "step": 3682 }, { "epoch": 0.26234996616447626, "grad_norm": 3.4534828662872314, "learning_rate": 1.1245849740744198e-05, "loss": 0.4914, "step": 3683 }, { "epoch": 0.26242119884603055, "grad_norm": 2.6600072383880615, "learning_rate": 1.1241952274988015e-05, "loss": 0.4985, "step": 3684 }, { "epoch": 0.26249243152758484, "grad_norm": 2.701455593109131, "learning_rate": 1.1238054617611384e-05, "loss": 0.3367, "step": 3685 }, { "epoch": 0.2625636642091392, "grad_norm": 3.1828064918518066, "learning_rate": 1.1234156769215678e-05, "loss": 0.6782, "step": 3686 }, { "epoch": 0.26263489689069347, "grad_norm": 6.033182144165039, "learning_rate": 1.123025873040229e-05, "loss": 0.7165, "step": 3687 }, { "epoch": 0.26270612957224776, "grad_norm": 2.674203634262085, "learning_rate": 1.122636050177265e-05, "loss": 0.4643, "step": 3688 }, { "epoch": 0.26277736225380205, "grad_norm": 2.702440023422241, "learning_rate": 1.1222462083928215e-05, "loss": 0.3882, "step": 3689 }, { "epoch": 0.26284859493535634, "grad_norm": 3.651021957397461, "learning_rate": 1.1218563477470465e-05, "loss": 0.6968, "step": 3690 }, { "epoch": 0.26291982761691063, "grad_norm": 2.155545949935913, "learning_rate": 1.1214664683000927e-05, "loss": 0.2986, "step": 3691 }, { "epoch": 0.2629910602984649, "grad_norm": 4.708418846130371, "learning_rate": 1.121076570112113e-05, "loss": 0.6567, "step": 3692 }, { "epoch": 0.2630622929800192, "grad_norm": 2.173466682434082, "learning_rate": 1.1206866532432657e-05, "loss": 0.3899, "step": 3693 }, { "epoch": 0.26313352566157355, "grad_norm": 4.751148700714111, "learning_rate": 1.1202967177537105e-05, "loss": 0.6044, "step": 3694 }, { "epoch": 0.26320475834312784, "grad_norm": 3.168107271194458, "learning_rate": 1.1199067637036106e-05, "loss": 0.5822, "step": 3695 }, { "epoch": 0.26327599102468213, "grad_norm": 3.145500898361206, "learning_rate": 1.1195167911531317e-05, "loss": 0.3314, "step": 3696 }, { "epoch": 0.2633472237062364, "grad_norm": 3.27828049659729, "learning_rate": 1.1191268001624431e-05, "loss": 0.4445, "step": 3697 }, { "epoch": 0.2634184563877907, "grad_norm": 2.825721025466919, "learning_rate": 1.1187367907917158e-05, "loss": 0.2304, "step": 3698 }, { "epoch": 0.263489689069345, "grad_norm": 2.9981513023376465, "learning_rate": 1.1183467631011245e-05, "loss": 0.238, "step": 3699 }, { "epoch": 0.2635609217508993, "grad_norm": 4.935610294342041, "learning_rate": 1.1179567171508463e-05, "loss": 0.5301, "step": 3700 }, { "epoch": 0.26363215443245364, "grad_norm": 3.057166337966919, "learning_rate": 1.1175666530010612e-05, "loss": 0.3017, "step": 3701 }, { "epoch": 0.2637033871140079, "grad_norm": 2.1174027919769287, "learning_rate": 1.1171765707119525e-05, "loss": 0.3175, "step": 3702 }, { "epoch": 0.2637746197955622, "grad_norm": 3.9208812713623047, "learning_rate": 1.1167864703437054e-05, "loss": 0.4347, "step": 3703 }, { "epoch": 0.2638458524771165, "grad_norm": 2.4447011947631836, "learning_rate": 1.1163963519565086e-05, "loss": 0.4742, "step": 3704 }, { "epoch": 0.2639170851586708, "grad_norm": 4.527089595794678, "learning_rate": 1.1160062156105536e-05, "loss": 0.541, "step": 3705 }, { "epoch": 0.2639883178402251, "grad_norm": 6.247172832489014, "learning_rate": 1.1156160613660341e-05, "loss": 0.8555, "step": 3706 }, { "epoch": 0.2640595505217794, "grad_norm": 3.2188031673431396, "learning_rate": 1.1152258892831468e-05, "loss": 0.7659, "step": 3707 }, { "epoch": 0.26413078320333366, "grad_norm": 2.2456958293914795, "learning_rate": 1.1148356994220917e-05, "loss": 0.4824, "step": 3708 }, { "epoch": 0.264202015884888, "grad_norm": 3.0288383960723877, "learning_rate": 1.1144454918430703e-05, "loss": 0.5659, "step": 3709 }, { "epoch": 0.2642732485664423, "grad_norm": 5.137568950653076, "learning_rate": 1.1140552666062883e-05, "loss": 0.7955, "step": 3710 }, { "epoch": 0.2643444812479966, "grad_norm": 4.480634689331055, "learning_rate": 1.1136650237719534e-05, "loss": 0.5557, "step": 3711 }, { "epoch": 0.2644157139295509, "grad_norm": 4.531589508056641, "learning_rate": 1.1132747634002754e-05, "loss": 0.3102, "step": 3712 }, { "epoch": 0.26448694661110517, "grad_norm": 2.814554214477539, "learning_rate": 1.1128844855514684e-05, "loss": 0.2302, "step": 3713 }, { "epoch": 0.26455817929265946, "grad_norm": 3.1511616706848145, "learning_rate": 1.1124941902857475e-05, "loss": 0.5871, "step": 3714 }, { "epoch": 0.26462941197421375, "grad_norm": 4.077970027923584, "learning_rate": 1.1121038776633315e-05, "loss": 0.5695, "step": 3715 }, { "epoch": 0.2647006446557681, "grad_norm": 2.6516337394714355, "learning_rate": 1.1117135477444417e-05, "loss": 0.5519, "step": 3716 }, { "epoch": 0.2647718773373224, "grad_norm": 3.108184337615967, "learning_rate": 1.111323200589302e-05, "loss": 0.6254, "step": 3717 }, { "epoch": 0.26484311001887667, "grad_norm": 3.569167137145996, "learning_rate": 1.1109328362581385e-05, "loss": 0.397, "step": 3718 }, { "epoch": 0.26491434270043096, "grad_norm": 3.0389554500579834, "learning_rate": 1.110542454811181e-05, "loss": 0.3356, "step": 3719 }, { "epoch": 0.26498557538198525, "grad_norm": 2.15948486328125, "learning_rate": 1.1101520563086612e-05, "loss": 0.4579, "step": 3720 }, { "epoch": 0.26505680806353954, "grad_norm": 2.828796863555908, "learning_rate": 1.1097616408108134e-05, "loss": 0.3658, "step": 3721 }, { "epoch": 0.26512804074509383, "grad_norm": 2.041861057281494, "learning_rate": 1.1093712083778748e-05, "loss": 0.2742, "step": 3722 }, { "epoch": 0.2651992734266481, "grad_norm": 2.9132652282714844, "learning_rate": 1.1089807590700848e-05, "loss": 0.2156, "step": 3723 }, { "epoch": 0.26527050610820246, "grad_norm": 2.6068027019500732, "learning_rate": 1.108590292947686e-05, "loss": 0.2975, "step": 3724 }, { "epoch": 0.26534173878975675, "grad_norm": 3.5217790603637695, "learning_rate": 1.1081998100709232e-05, "loss": 0.4233, "step": 3725 }, { "epoch": 0.26541297147131104, "grad_norm": 2.3894035816192627, "learning_rate": 1.1078093105000441e-05, "loss": 0.5117, "step": 3726 }, { "epoch": 0.26548420415286533, "grad_norm": 3.67254376411438, "learning_rate": 1.1074187942952985e-05, "loss": 0.583, "step": 3727 }, { "epoch": 0.2655554368344196, "grad_norm": 3.798628091812134, "learning_rate": 1.1070282615169395e-05, "loss": 0.3962, "step": 3728 }, { "epoch": 0.2656266695159739, "grad_norm": 2.224250078201294, "learning_rate": 1.1066377122252216e-05, "loss": 0.3773, "step": 3729 }, { "epoch": 0.2656979021975282, "grad_norm": 3.529259443283081, "learning_rate": 1.106247146480403e-05, "loss": 0.5591, "step": 3730 }, { "epoch": 0.26576913487908255, "grad_norm": 4.19018030166626, "learning_rate": 1.1058565643427439e-05, "loss": 0.5306, "step": 3731 }, { "epoch": 0.26584036756063684, "grad_norm": 2.885582208633423, "learning_rate": 1.1054659658725067e-05, "loss": 0.3259, "step": 3732 }, { "epoch": 0.2659116002421911, "grad_norm": 4.5262064933776855, "learning_rate": 1.1050753511299572e-05, "loss": 0.6477, "step": 3733 }, { "epoch": 0.2659828329237454, "grad_norm": 2.135200262069702, "learning_rate": 1.1046847201753632e-05, "loss": 0.3686, "step": 3734 }, { "epoch": 0.2660540656052997, "grad_norm": 3.7207143306732178, "learning_rate": 1.104294073068995e-05, "loss": 0.5381, "step": 3735 }, { "epoch": 0.266125298286854, "grad_norm": 5.085023880004883, "learning_rate": 1.1039034098711251e-05, "loss": 0.6265, "step": 3736 }, { "epoch": 0.2661965309684083, "grad_norm": 2.645787239074707, "learning_rate": 1.1035127306420295e-05, "loss": 0.2976, "step": 3737 }, { "epoch": 0.26626776364996263, "grad_norm": 4.211857795715332, "learning_rate": 1.1031220354419849e-05, "loss": 0.492, "step": 3738 }, { "epoch": 0.2663389963315169, "grad_norm": 3.319849967956543, "learning_rate": 1.1027313243312726e-05, "loss": 0.3546, "step": 3739 }, { "epoch": 0.2664102290130712, "grad_norm": 2.5895490646362305, "learning_rate": 1.1023405973701746e-05, "loss": 0.5442, "step": 3740 }, { "epoch": 0.2664814616946255, "grad_norm": 3.7252049446105957, "learning_rate": 1.1019498546189765e-05, "loss": 0.6962, "step": 3741 }, { "epoch": 0.2665526943761798, "grad_norm": 2.81919002532959, "learning_rate": 1.1015590961379657e-05, "loss": 0.6204, "step": 3742 }, { "epoch": 0.2666239270577341, "grad_norm": 1.2668300867080688, "learning_rate": 1.1011683219874324e-05, "loss": 0.0892, "step": 3743 }, { "epoch": 0.26669515973928837, "grad_norm": 2.1310102939605713, "learning_rate": 1.1007775322276687e-05, "loss": 0.4156, "step": 3744 }, { "epoch": 0.26676639242084266, "grad_norm": 1.7789279222488403, "learning_rate": 1.1003867269189696e-05, "loss": 0.1967, "step": 3745 }, { "epoch": 0.266837625102397, "grad_norm": 3.088484764099121, "learning_rate": 1.099995906121632e-05, "loss": 0.7831, "step": 3746 }, { "epoch": 0.2669088577839513, "grad_norm": 3.2557520866394043, "learning_rate": 1.0996050698959561e-05, "loss": 0.527, "step": 3747 }, { "epoch": 0.2669800904655056, "grad_norm": 3.046210765838623, "learning_rate": 1.0992142183022438e-05, "loss": 0.3716, "step": 3748 }, { "epoch": 0.26705132314705987, "grad_norm": 2.425692319869995, "learning_rate": 1.0988233514007991e-05, "loss": 0.2805, "step": 3749 }, { "epoch": 0.26712255582861416, "grad_norm": 5.119366645812988, "learning_rate": 1.0984324692519292e-05, "loss": 0.3553, "step": 3750 }, { "epoch": 0.26719378851016845, "grad_norm": 3.0071003437042236, "learning_rate": 1.098041571915943e-05, "loss": 0.4337, "step": 3751 }, { "epoch": 0.26726502119172274, "grad_norm": 1.797238826751709, "learning_rate": 1.0976506594531515e-05, "loss": 0.2208, "step": 3752 }, { "epoch": 0.2673362538732771, "grad_norm": 3.3324978351593018, "learning_rate": 1.0972597319238692e-05, "loss": 0.6073, "step": 3753 }, { "epoch": 0.2674074865548314, "grad_norm": 2.1432762145996094, "learning_rate": 1.0968687893884118e-05, "loss": 0.1642, "step": 3754 }, { "epoch": 0.26747871923638566, "grad_norm": 6.025820255279541, "learning_rate": 1.0964778319070974e-05, "loss": 0.6484, "step": 3755 }, { "epoch": 0.26754995191793995, "grad_norm": 3.0018486976623535, "learning_rate": 1.0960868595402474e-05, "loss": 0.2674, "step": 3756 }, { "epoch": 0.26762118459949424, "grad_norm": 1.3066145181655884, "learning_rate": 1.0956958723481845e-05, "loss": 0.043, "step": 3757 }, { "epoch": 0.26769241728104853, "grad_norm": 3.419909715652466, "learning_rate": 1.095304870391234e-05, "loss": 0.599, "step": 3758 }, { "epoch": 0.2677636499626028, "grad_norm": 4.294625282287598, "learning_rate": 1.0949138537297233e-05, "loss": 0.61, "step": 3759 }, { "epoch": 0.2678348826441571, "grad_norm": 2.7719345092773438, "learning_rate": 1.0945228224239823e-05, "loss": 0.577, "step": 3760 }, { "epoch": 0.26790611532571146, "grad_norm": 2.2853844165802, "learning_rate": 1.0941317765343433e-05, "loss": 0.2632, "step": 3761 }, { "epoch": 0.26797734800726575, "grad_norm": 3.8973686695098877, "learning_rate": 1.0937407161211406e-05, "loss": 0.7137, "step": 3762 }, { "epoch": 0.26804858068882004, "grad_norm": 2.856279134750366, "learning_rate": 1.0933496412447105e-05, "loss": 0.3328, "step": 3763 }, { "epoch": 0.2681198133703743, "grad_norm": 4.078315734863281, "learning_rate": 1.0929585519653924e-05, "loss": 0.8471, "step": 3764 }, { "epoch": 0.2681910460519286, "grad_norm": 2.4278085231781006, "learning_rate": 1.092567448343527e-05, "loss": 0.2876, "step": 3765 }, { "epoch": 0.2682622787334829, "grad_norm": 2.4311022758483887, "learning_rate": 1.0921763304394574e-05, "loss": 0.4048, "step": 3766 }, { "epoch": 0.2683335114150372, "grad_norm": 3.76811146736145, "learning_rate": 1.0917851983135294e-05, "loss": 0.6532, "step": 3767 }, { "epoch": 0.26840474409659154, "grad_norm": 2.73417329788208, "learning_rate": 1.0913940520260906e-05, "loss": 0.5325, "step": 3768 }, { "epoch": 0.26847597677814583, "grad_norm": 3.941840410232544, "learning_rate": 1.0910028916374904e-05, "loss": 0.4066, "step": 3769 }, { "epoch": 0.2685472094597001, "grad_norm": 3.3807907104492188, "learning_rate": 1.0906117172080812e-05, "loss": 0.389, "step": 3770 }, { "epoch": 0.2686184421412544, "grad_norm": 4.02963924407959, "learning_rate": 1.0902205287982175e-05, "loss": 0.4411, "step": 3771 }, { "epoch": 0.2686896748228087, "grad_norm": 2.696909189224243, "learning_rate": 1.0898293264682549e-05, "loss": 0.5796, "step": 3772 }, { "epoch": 0.268760907504363, "grad_norm": 3.563265085220337, "learning_rate": 1.0894381102785527e-05, "loss": 0.3862, "step": 3773 }, { "epoch": 0.2688321401859173, "grad_norm": 4.659812927246094, "learning_rate": 1.0890468802894712e-05, "loss": 0.6003, "step": 3774 }, { "epoch": 0.26890337286747157, "grad_norm": 3.2148096561431885, "learning_rate": 1.0886556365613725e-05, "loss": 0.4601, "step": 3775 }, { "epoch": 0.2689746055490259, "grad_norm": 2.056544542312622, "learning_rate": 1.0882643791546224e-05, "loss": 0.26, "step": 3776 }, { "epoch": 0.2690458382305802, "grad_norm": 4.29584264755249, "learning_rate": 1.0878731081295874e-05, "loss": 0.8609, "step": 3777 }, { "epoch": 0.2691170709121345, "grad_norm": 2.8776376247406006, "learning_rate": 1.0874818235466366e-05, "loss": 0.4226, "step": 3778 }, { "epoch": 0.2691883035936888, "grad_norm": 5.725050926208496, "learning_rate": 1.0870905254661418e-05, "loss": 0.4496, "step": 3779 }, { "epoch": 0.26925953627524307, "grad_norm": 3.60516095161438, "learning_rate": 1.0866992139484755e-05, "loss": 0.5052, "step": 3780 }, { "epoch": 0.26933076895679736, "grad_norm": 3.603090763092041, "learning_rate": 1.0863078890540133e-05, "loss": 0.4388, "step": 3781 }, { "epoch": 0.26940200163835165, "grad_norm": 2.875850200653076, "learning_rate": 1.0859165508431329e-05, "loss": 0.4885, "step": 3782 }, { "epoch": 0.269473234319906, "grad_norm": 2.614314317703247, "learning_rate": 1.085525199376213e-05, "loss": 0.3753, "step": 3783 }, { "epoch": 0.2695444670014603, "grad_norm": 4.031411647796631, "learning_rate": 1.0851338347136358e-05, "loss": 0.6793, "step": 3784 }, { "epoch": 0.2696156996830146, "grad_norm": 3.7255804538726807, "learning_rate": 1.0847424569157847e-05, "loss": 0.3666, "step": 3785 }, { "epoch": 0.26968693236456887, "grad_norm": 2.82472562789917, "learning_rate": 1.0843510660430447e-05, "loss": 0.4502, "step": 3786 }, { "epoch": 0.26975816504612316, "grad_norm": 3.3472681045532227, "learning_rate": 1.0839596621558045e-05, "loss": 0.5415, "step": 3787 }, { "epoch": 0.26982939772767744, "grad_norm": 2.0194215774536133, "learning_rate": 1.0835682453144527e-05, "loss": 0.1314, "step": 3788 }, { "epoch": 0.26990063040923173, "grad_norm": 2.5426218509674072, "learning_rate": 1.0831768155793814e-05, "loss": 0.5477, "step": 3789 }, { "epoch": 0.2699718630907861, "grad_norm": 2.5591790676116943, "learning_rate": 1.082785373010984e-05, "loss": 0.6087, "step": 3790 }, { "epoch": 0.27004309577234037, "grad_norm": 3.4339845180511475, "learning_rate": 1.0823939176696561e-05, "loss": 0.6947, "step": 3791 }, { "epoch": 0.27011432845389466, "grad_norm": 2.794717311859131, "learning_rate": 1.082002449615795e-05, "loss": 0.6789, "step": 3792 }, { "epoch": 0.27018556113544895, "grad_norm": 4.306746482849121, "learning_rate": 1.0816109689098004e-05, "loss": 0.4055, "step": 3793 }, { "epoch": 0.27025679381700324, "grad_norm": 2.8307979106903076, "learning_rate": 1.081219475612074e-05, "loss": 0.5663, "step": 3794 }, { "epoch": 0.2703280264985575, "grad_norm": 3.2222375869750977, "learning_rate": 1.0808279697830188e-05, "loss": 0.8351, "step": 3795 }, { "epoch": 0.2703992591801118, "grad_norm": 8.03504753112793, "learning_rate": 1.08043645148304e-05, "loss": 0.5912, "step": 3796 }, { "epoch": 0.2704704918616661, "grad_norm": 2.616081714630127, "learning_rate": 1.0800449207725453e-05, "loss": 0.3707, "step": 3797 }, { "epoch": 0.27054172454322045, "grad_norm": 4.020095348358154, "learning_rate": 1.0796533777119435e-05, "loss": 0.2737, "step": 3798 }, { "epoch": 0.27061295722477474, "grad_norm": 2.30271315574646, "learning_rate": 1.079261822361646e-05, "loss": 0.3611, "step": 3799 }, { "epoch": 0.27068418990632903, "grad_norm": 2.558283567428589, "learning_rate": 1.0788702547820654e-05, "loss": 0.2127, "step": 3800 }, { "epoch": 0.2707554225878833, "grad_norm": 3.301825761795044, "learning_rate": 1.0784786750336165e-05, "loss": 0.4553, "step": 3801 }, { "epoch": 0.2708266552694376, "grad_norm": 2.6234638690948486, "learning_rate": 1.0780870831767166e-05, "loss": 0.3355, "step": 3802 }, { "epoch": 0.2708978879509919, "grad_norm": 4.620428562164307, "learning_rate": 1.0776954792717835e-05, "loss": 0.5712, "step": 3803 }, { "epoch": 0.2709691206325462, "grad_norm": 2.7336337566375732, "learning_rate": 1.0773038633792385e-05, "loss": 0.5235, "step": 3804 }, { "epoch": 0.27104035331410053, "grad_norm": 4.384819030761719, "learning_rate": 1.0769122355595031e-05, "loss": 0.6109, "step": 3805 }, { "epoch": 0.2711115859956548, "grad_norm": 1.910244345664978, "learning_rate": 1.0765205958730018e-05, "loss": 0.1717, "step": 3806 }, { "epoch": 0.2711828186772091, "grad_norm": 3.3928394317626953, "learning_rate": 1.0761289443801608e-05, "loss": 0.3994, "step": 3807 }, { "epoch": 0.2712540513587634, "grad_norm": 2.732597827911377, "learning_rate": 1.0757372811414075e-05, "loss": 0.2826, "step": 3808 }, { "epoch": 0.2713252840403177, "grad_norm": 2.508465051651001, "learning_rate": 1.0753456062171716e-05, "loss": 0.3466, "step": 3809 }, { "epoch": 0.271396516721872, "grad_norm": 3.824949026107788, "learning_rate": 1.0749539196678849e-05, "loss": 0.5221, "step": 3810 }, { "epoch": 0.2714677494034263, "grad_norm": 3.110821485519409, "learning_rate": 1.0745622215539801e-05, "loss": 0.4748, "step": 3811 }, { "epoch": 0.27153898208498056, "grad_norm": 2.5455687046051025, "learning_rate": 1.0741705119358922e-05, "loss": 0.1416, "step": 3812 }, { "epoch": 0.2716102147665349, "grad_norm": 2.2108943462371826, "learning_rate": 1.0737787908740582e-05, "loss": 0.2075, "step": 3813 }, { "epoch": 0.2716814474480892, "grad_norm": 2.6503005027770996, "learning_rate": 1.0733870584289168e-05, "loss": 0.2903, "step": 3814 }, { "epoch": 0.2717526801296435, "grad_norm": 3.6650614738464355, "learning_rate": 1.0729953146609076e-05, "loss": 0.6992, "step": 3815 }, { "epoch": 0.2718239128111978, "grad_norm": 3.6820168495178223, "learning_rate": 1.0726035596304733e-05, "loss": 0.3496, "step": 3816 }, { "epoch": 0.27189514549275207, "grad_norm": 2.7986361980438232, "learning_rate": 1.0722117933980574e-05, "loss": 0.4207, "step": 3817 }, { "epoch": 0.27196637817430636, "grad_norm": 3.707007646560669, "learning_rate": 1.0718200160241054e-05, "loss": 0.2155, "step": 3818 }, { "epoch": 0.27203761085586065, "grad_norm": 5.068442344665527, "learning_rate": 1.0714282275690646e-05, "loss": 0.3042, "step": 3819 }, { "epoch": 0.272108843537415, "grad_norm": 7.671543598175049, "learning_rate": 1.0710364280933839e-05, "loss": 0.6178, "step": 3820 }, { "epoch": 0.2721800762189693, "grad_norm": 3.229414701461792, "learning_rate": 1.0706446176575137e-05, "loss": 0.5073, "step": 3821 }, { "epoch": 0.27225130890052357, "grad_norm": 3.2111544609069824, "learning_rate": 1.0702527963219064e-05, "loss": 0.4509, "step": 3822 }, { "epoch": 0.27232254158207786, "grad_norm": 1.3086886405944824, "learning_rate": 1.0698609641470161e-05, "loss": 0.0965, "step": 3823 }, { "epoch": 0.27239377426363215, "grad_norm": 2.498168468475342, "learning_rate": 1.0694691211932986e-05, "loss": 0.7801, "step": 3824 }, { "epoch": 0.27246500694518644, "grad_norm": 7.320872783660889, "learning_rate": 1.0690772675212112e-05, "loss": 0.7946, "step": 3825 }, { "epoch": 0.27253623962674073, "grad_norm": 2.4044313430786133, "learning_rate": 1.0686854031912126e-05, "loss": 0.3475, "step": 3826 }, { "epoch": 0.2726074723082951, "grad_norm": 2.7024996280670166, "learning_rate": 1.0682935282637638e-05, "loss": 0.4859, "step": 3827 }, { "epoch": 0.27267870498984936, "grad_norm": 2.6739890575408936, "learning_rate": 1.0679016427993267e-05, "loss": 0.5308, "step": 3828 }, { "epoch": 0.27274993767140365, "grad_norm": 3.5651278495788574, "learning_rate": 1.0675097468583653e-05, "loss": 0.1329, "step": 3829 }, { "epoch": 0.27282117035295794, "grad_norm": 3.3097734451293945, "learning_rate": 1.0671178405013454e-05, "loss": 0.6535, "step": 3830 }, { "epoch": 0.27289240303451223, "grad_norm": 3.4227092266082764, "learning_rate": 1.066725923788734e-05, "loss": 0.5042, "step": 3831 }, { "epoch": 0.2729636357160665, "grad_norm": 1.9806092977523804, "learning_rate": 1.0663339967809991e-05, "loss": 0.3194, "step": 3832 }, { "epoch": 0.2730348683976208, "grad_norm": 2.7527287006378174, "learning_rate": 1.0659420595386123e-05, "loss": 0.8666, "step": 3833 }, { "epoch": 0.2731061010791751, "grad_norm": 4.411495685577393, "learning_rate": 1.0655501121220446e-05, "loss": 0.6204, "step": 3834 }, { "epoch": 0.27317733376072945, "grad_norm": 3.1394524574279785, "learning_rate": 1.0651581545917693e-05, "loss": 0.507, "step": 3835 }, { "epoch": 0.27324856644228374, "grad_norm": 3.3547956943511963, "learning_rate": 1.064766187008262e-05, "loss": 0.7349, "step": 3836 }, { "epoch": 0.273319799123838, "grad_norm": 3.3744821548461914, "learning_rate": 1.0643742094319991e-05, "loss": 0.5125, "step": 3837 }, { "epoch": 0.2733910318053923, "grad_norm": 2.182025909423828, "learning_rate": 1.0639822219234583e-05, "loss": 0.1717, "step": 3838 }, { "epoch": 0.2734622644869466, "grad_norm": 3.4100775718688965, "learning_rate": 1.0635902245431198e-05, "loss": 0.4668, "step": 3839 }, { "epoch": 0.2735334971685009, "grad_norm": 3.37913179397583, "learning_rate": 1.0631982173514645e-05, "loss": 0.6252, "step": 3840 }, { "epoch": 0.2736047298500552, "grad_norm": 4.628815174102783, "learning_rate": 1.062806200408975e-05, "loss": 0.2171, "step": 3841 }, { "epoch": 0.27367596253160953, "grad_norm": 3.587124824523926, "learning_rate": 1.0624141737761356e-05, "loss": 0.6102, "step": 3842 }, { "epoch": 0.2737471952131638, "grad_norm": 7.878421783447266, "learning_rate": 1.0620221375134319e-05, "loss": 0.9022, "step": 3843 }, { "epoch": 0.2738184278947181, "grad_norm": 3.764225721359253, "learning_rate": 1.0616300916813509e-05, "loss": 0.7219, "step": 3844 }, { "epoch": 0.2738896605762724, "grad_norm": 2.9367425441741943, "learning_rate": 1.0612380363403818e-05, "loss": 0.3761, "step": 3845 }, { "epoch": 0.2739608932578267, "grad_norm": 3.421659231185913, "learning_rate": 1.060845971551014e-05, "loss": 0.6407, "step": 3846 }, { "epoch": 0.274032125939381, "grad_norm": 2.4257500171661377, "learning_rate": 1.0604538973737394e-05, "loss": 0.2747, "step": 3847 }, { "epoch": 0.27410335862093527, "grad_norm": 2.536060094833374, "learning_rate": 1.0600618138690514e-05, "loss": 0.2826, "step": 3848 }, { "epoch": 0.27417459130248956, "grad_norm": 2.980879306793213, "learning_rate": 1.0596697210974436e-05, "loss": 0.4578, "step": 3849 }, { "epoch": 0.2742458239840439, "grad_norm": 2.8685288429260254, "learning_rate": 1.0592776191194126e-05, "loss": 0.6518, "step": 3850 }, { "epoch": 0.2743170566655982, "grad_norm": 2.718792676925659, "learning_rate": 1.0588855079954552e-05, "loss": 0.5455, "step": 3851 }, { "epoch": 0.2743882893471525, "grad_norm": 1.7652561664581299, "learning_rate": 1.05849338778607e-05, "loss": 0.3052, "step": 3852 }, { "epoch": 0.27445952202870677, "grad_norm": 3.7951531410217285, "learning_rate": 1.058101258551758e-05, "loss": 0.5759, "step": 3853 }, { "epoch": 0.27453075471026106, "grad_norm": 7.102584362030029, "learning_rate": 1.05770912035302e-05, "loss": 0.4608, "step": 3854 }, { "epoch": 0.27460198739181535, "grad_norm": 2.754162073135376, "learning_rate": 1.0573169732503592e-05, "loss": 0.2811, "step": 3855 }, { "epoch": 0.27467322007336964, "grad_norm": 2.7301621437072754, "learning_rate": 1.0569248173042793e-05, "loss": 0.6623, "step": 3856 }, { "epoch": 0.274744452754924, "grad_norm": 1.3477485179901123, "learning_rate": 1.0565326525752866e-05, "loss": 0.1169, "step": 3857 }, { "epoch": 0.2748156854364783, "grad_norm": 2.504746675491333, "learning_rate": 1.0561404791238875e-05, "loss": 0.277, "step": 3858 }, { "epoch": 0.27488691811803256, "grad_norm": 2.8365049362182617, "learning_rate": 1.0557482970105907e-05, "loss": 0.3983, "step": 3859 }, { "epoch": 0.27495815079958685, "grad_norm": 3.3723552227020264, "learning_rate": 1.0553561062959056e-05, "loss": 0.4195, "step": 3860 }, { "epoch": 0.27502938348114114, "grad_norm": 2.487466812133789, "learning_rate": 1.0549639070403437e-05, "loss": 0.3933, "step": 3861 }, { "epoch": 0.27510061616269543, "grad_norm": 3.0408339500427246, "learning_rate": 1.0545716993044168e-05, "loss": 0.6309, "step": 3862 }, { "epoch": 0.2751718488442497, "grad_norm": 2.519923210144043, "learning_rate": 1.0541794831486388e-05, "loss": 0.6883, "step": 3863 }, { "epoch": 0.275243081525804, "grad_norm": 5.323497772216797, "learning_rate": 1.0537872586335245e-05, "loss": 0.1696, "step": 3864 }, { "epoch": 0.27531431420735836, "grad_norm": 2.4515230655670166, "learning_rate": 1.05339502581959e-05, "loss": 0.4217, "step": 3865 }, { "epoch": 0.27538554688891265, "grad_norm": 5.0334320068359375, "learning_rate": 1.0530027847673526e-05, "loss": 0.2902, "step": 3866 }, { "epoch": 0.27545677957046694, "grad_norm": 2.420335531234741, "learning_rate": 1.0526105355373318e-05, "loss": 0.4406, "step": 3867 }, { "epoch": 0.2755280122520212, "grad_norm": 3.035008668899536, "learning_rate": 1.0522182781900467e-05, "loss": 0.4929, "step": 3868 }, { "epoch": 0.2755992449335755, "grad_norm": 3.4333488941192627, "learning_rate": 1.0518260127860192e-05, "loss": 0.7738, "step": 3869 }, { "epoch": 0.2756704776151298, "grad_norm": 2.5185914039611816, "learning_rate": 1.0514337393857718e-05, "loss": 0.0844, "step": 3870 }, { "epoch": 0.2757417102966841, "grad_norm": 2.5678791999816895, "learning_rate": 1.0510414580498283e-05, "loss": 0.2767, "step": 3871 }, { "epoch": 0.27581294297823844, "grad_norm": 3.6680610179901123, "learning_rate": 1.0506491688387128e-05, "loss": 0.7247, "step": 3872 }, { "epoch": 0.27588417565979273, "grad_norm": 4.791692733764648, "learning_rate": 1.0502568718129526e-05, "loss": 0.5866, "step": 3873 }, { "epoch": 0.275955408341347, "grad_norm": 2.725526809692383, "learning_rate": 1.0498645670330746e-05, "loss": 0.4891, "step": 3874 }, { "epoch": 0.2760266410229013, "grad_norm": 3.3196821212768555, "learning_rate": 1.049472254559607e-05, "loss": 0.2727, "step": 3875 }, { "epoch": 0.2760978737044556, "grad_norm": 2.285648822784424, "learning_rate": 1.0490799344530804e-05, "loss": 0.2302, "step": 3876 }, { "epoch": 0.2761691063860099, "grad_norm": 2.936091899871826, "learning_rate": 1.0486876067740253e-05, "loss": 0.6863, "step": 3877 }, { "epoch": 0.2762403390675642, "grad_norm": 4.228646755218506, "learning_rate": 1.0482952715829737e-05, "loss": 0.3564, "step": 3878 }, { "epoch": 0.2763115717491185, "grad_norm": 1.596107840538025, "learning_rate": 1.0479029289404592e-05, "loss": 0.0896, "step": 3879 }, { "epoch": 0.2763828044306728, "grad_norm": 3.2190206050872803, "learning_rate": 1.0475105789070157e-05, "loss": 0.2397, "step": 3880 }, { "epoch": 0.2764540371122271, "grad_norm": 2.8374760150909424, "learning_rate": 1.0471182215431796e-05, "loss": 0.3733, "step": 3881 }, { "epoch": 0.2765252697937814, "grad_norm": 2.7054426670074463, "learning_rate": 1.046725856909487e-05, "loss": 0.6133, "step": 3882 }, { "epoch": 0.2765965024753357, "grad_norm": 3.421339273452759, "learning_rate": 1.0463334850664757e-05, "loss": 0.7132, "step": 3883 }, { "epoch": 0.27666773515688997, "grad_norm": 3.0930140018463135, "learning_rate": 1.0459411060746848e-05, "loss": 0.5394, "step": 3884 }, { "epoch": 0.27673896783844426, "grad_norm": 5.802558422088623, "learning_rate": 1.0455487199946547e-05, "loss": 1.2879, "step": 3885 }, { "epoch": 0.27681020051999855, "grad_norm": 3.0546061992645264, "learning_rate": 1.0451563268869258e-05, "loss": 0.5622, "step": 3886 }, { "epoch": 0.2768814332015529, "grad_norm": 5.880954265594482, "learning_rate": 1.0447639268120409e-05, "loss": 0.2588, "step": 3887 }, { "epoch": 0.2769526658831072, "grad_norm": 4.112024307250977, "learning_rate": 1.0443715198305432e-05, "loss": 0.2988, "step": 3888 }, { "epoch": 0.2770238985646615, "grad_norm": 5.283123970031738, "learning_rate": 1.0439791060029765e-05, "loss": 0.4675, "step": 3889 }, { "epoch": 0.27709513124621576, "grad_norm": 3.742169141769409, "learning_rate": 1.0435866853898869e-05, "loss": 0.5699, "step": 3890 }, { "epoch": 0.27716636392777005, "grad_norm": 3.190629243850708, "learning_rate": 1.0431942580518207e-05, "loss": 0.2774, "step": 3891 }, { "epoch": 0.27723759660932434, "grad_norm": 3.296903371810913, "learning_rate": 1.0428018240493247e-05, "loss": 0.2245, "step": 3892 }, { "epoch": 0.27730882929087863, "grad_norm": 8.980363845825195, "learning_rate": 1.0424093834429487e-05, "loss": 0.6866, "step": 3893 }, { "epoch": 0.277380061972433, "grad_norm": 3.2339370250701904, "learning_rate": 1.0420169362932416e-05, "loss": 0.594, "step": 3894 }, { "epoch": 0.27745129465398727, "grad_norm": 1.7887054681777954, "learning_rate": 1.0416244826607533e-05, "loss": 0.2706, "step": 3895 }, { "epoch": 0.27752252733554156, "grad_norm": 3.71588134765625, "learning_rate": 1.0412320226060364e-05, "loss": 0.3346, "step": 3896 }, { "epoch": 0.27759376001709585, "grad_norm": 4.470887184143066, "learning_rate": 1.0408395561896429e-05, "loss": 0.416, "step": 3897 }, { "epoch": 0.27766499269865014, "grad_norm": 2.283400774002075, "learning_rate": 1.0404470834721265e-05, "loss": 0.1466, "step": 3898 }, { "epoch": 0.2777362253802044, "grad_norm": 3.5580077171325684, "learning_rate": 1.0400546045140416e-05, "loss": 0.5137, "step": 3899 }, { "epoch": 0.2778074580617587, "grad_norm": 4.743161201477051, "learning_rate": 1.039662119375944e-05, "loss": 0.96, "step": 3900 }, { "epoch": 0.277878690743313, "grad_norm": 3.46093487739563, "learning_rate": 1.0392696281183893e-05, "loss": 1.0116, "step": 3901 }, { "epoch": 0.27794992342486735, "grad_norm": 2.868712902069092, "learning_rate": 1.0388771308019359e-05, "loss": 0.4292, "step": 3902 }, { "epoch": 0.27802115610642164, "grad_norm": 2.1031494140625, "learning_rate": 1.0384846274871412e-05, "loss": 0.2291, "step": 3903 }, { "epoch": 0.27809238878797593, "grad_norm": 2.693235158920288, "learning_rate": 1.038092118234565e-05, "loss": 0.5869, "step": 3904 }, { "epoch": 0.2781636214695302, "grad_norm": 1.9385250806808472, "learning_rate": 1.037699603104767e-05, "loss": 0.763, "step": 3905 }, { "epoch": 0.2782348541510845, "grad_norm": 1.9607189893722534, "learning_rate": 1.0373070821583084e-05, "loss": 0.3652, "step": 3906 }, { "epoch": 0.2783060868326388, "grad_norm": 3.069441318511963, "learning_rate": 1.0369145554557516e-05, "loss": 0.6127, "step": 3907 }, { "epoch": 0.2783773195141931, "grad_norm": 2.92572283744812, "learning_rate": 1.0365220230576592e-05, "loss": 0.2334, "step": 3908 }, { "epoch": 0.27844855219574743, "grad_norm": 1.8921220302581787, "learning_rate": 1.0361294850245942e-05, "loss": 0.3301, "step": 3909 }, { "epoch": 0.2785197848773017, "grad_norm": 3.6129791736602783, "learning_rate": 1.0357369414171219e-05, "loss": 0.7368, "step": 3910 }, { "epoch": 0.278591017558856, "grad_norm": 2.9379639625549316, "learning_rate": 1.0353443922958078e-05, "loss": 0.3929, "step": 3911 }, { "epoch": 0.2786622502404103, "grad_norm": 3.7315878868103027, "learning_rate": 1.0349518377212175e-05, "loss": 0.7212, "step": 3912 }, { "epoch": 0.2787334829219646, "grad_norm": 2.2774295806884766, "learning_rate": 1.0345592777539189e-05, "loss": 0.379, "step": 3913 }, { "epoch": 0.2788047156035189, "grad_norm": 3.386545181274414, "learning_rate": 1.0341667124544797e-05, "loss": 0.4904, "step": 3914 }, { "epoch": 0.27887594828507317, "grad_norm": 3.073446035385132, "learning_rate": 1.0337741418834683e-05, "loss": 0.3331, "step": 3915 }, { "epoch": 0.27894718096662746, "grad_norm": 3.149291515350342, "learning_rate": 1.033381566101455e-05, "loss": 0.1857, "step": 3916 }, { "epoch": 0.2790184136481818, "grad_norm": 3.3099701404571533, "learning_rate": 1.0329889851690094e-05, "loss": 0.4284, "step": 3917 }, { "epoch": 0.2790896463297361, "grad_norm": 5.379390239715576, "learning_rate": 1.0325963991467031e-05, "loss": 0.611, "step": 3918 }, { "epoch": 0.2791608790112904, "grad_norm": 3.637929916381836, "learning_rate": 1.0322038080951084e-05, "loss": 0.4697, "step": 3919 }, { "epoch": 0.2792321116928447, "grad_norm": 2.6739730834960938, "learning_rate": 1.0318112120747977e-05, "loss": 0.5089, "step": 3920 }, { "epoch": 0.27930334437439897, "grad_norm": 4.159818172454834, "learning_rate": 1.0314186111463444e-05, "loss": 0.5933, "step": 3921 }, { "epoch": 0.27937457705595325, "grad_norm": 1.8235028982162476, "learning_rate": 1.0310260053703231e-05, "loss": 0.1898, "step": 3922 }, { "epoch": 0.27944580973750754, "grad_norm": 2.0513579845428467, "learning_rate": 1.0306333948073089e-05, "loss": 0.4056, "step": 3923 }, { "epoch": 0.2795170424190619, "grad_norm": 2.868013858795166, "learning_rate": 1.030240779517877e-05, "loss": 0.4353, "step": 3924 }, { "epoch": 0.2795882751006162, "grad_norm": 4.070772171020508, "learning_rate": 1.0298481595626045e-05, "loss": 0.5309, "step": 3925 }, { "epoch": 0.27965950778217047, "grad_norm": 5.546368598937988, "learning_rate": 1.0294555350020678e-05, "loss": 0.8482, "step": 3926 }, { "epoch": 0.27973074046372476, "grad_norm": 2.834636688232422, "learning_rate": 1.0290629058968457e-05, "loss": 0.4163, "step": 3927 }, { "epoch": 0.27980197314527905, "grad_norm": 4.114894390106201, "learning_rate": 1.0286702723075167e-05, "loss": 0.5893, "step": 3928 }, { "epoch": 0.27987320582683334, "grad_norm": 4.7636027336120605, "learning_rate": 1.0282776342946597e-05, "loss": 0.3056, "step": 3929 }, { "epoch": 0.2799444385083876, "grad_norm": 2.850306987762451, "learning_rate": 1.0278849919188551e-05, "loss": 0.4485, "step": 3930 }, { "epoch": 0.280015671189942, "grad_norm": 2.3466413021087646, "learning_rate": 1.0274923452406835e-05, "loss": 0.4309, "step": 3931 }, { "epoch": 0.28008690387149626, "grad_norm": 2.3244268894195557, "learning_rate": 1.0270996943207258e-05, "loss": 0.6322, "step": 3932 }, { "epoch": 0.28015813655305055, "grad_norm": 2.905562400817871, "learning_rate": 1.0267070392195646e-05, "loss": 0.7639, "step": 3933 }, { "epoch": 0.28022936923460484, "grad_norm": 4.227348327636719, "learning_rate": 1.0263143799977824e-05, "loss": 0.571, "step": 3934 }, { "epoch": 0.28030060191615913, "grad_norm": 2.2361409664154053, "learning_rate": 1.025921716715962e-05, "loss": 0.3183, "step": 3935 }, { "epoch": 0.2803718345977134, "grad_norm": 4.44700288772583, "learning_rate": 1.0255290494346877e-05, "loss": 0.3388, "step": 3936 }, { "epoch": 0.2804430672792677, "grad_norm": 4.7434821128845215, "learning_rate": 1.0251363782145443e-05, "loss": 0.6445, "step": 3937 }, { "epoch": 0.280514299960822, "grad_norm": 2.995059013366699, "learning_rate": 1.0247437031161162e-05, "loss": 0.6754, "step": 3938 }, { "epoch": 0.28058553264237635, "grad_norm": 2.12400221824646, "learning_rate": 1.0243510241999898e-05, "loss": 0.3498, "step": 3939 }, { "epoch": 0.28065676532393063, "grad_norm": 3.1953351497650146, "learning_rate": 1.0239583415267509e-05, "loss": 0.7462, "step": 3940 }, { "epoch": 0.2807279980054849, "grad_norm": 2.8267223834991455, "learning_rate": 1.0235656551569868e-05, "loss": 0.5374, "step": 3941 }, { "epoch": 0.2807992306870392, "grad_norm": 2.7785773277282715, "learning_rate": 1.0231729651512847e-05, "loss": 0.3236, "step": 3942 }, { "epoch": 0.2808704633685935, "grad_norm": 2.297187089920044, "learning_rate": 1.0227802715702326e-05, "loss": 0.5016, "step": 3943 }, { "epoch": 0.2809416960501478, "grad_norm": 2.5108392238616943, "learning_rate": 1.0223875744744194e-05, "loss": 0.1972, "step": 3944 }, { "epoch": 0.2810129287317021, "grad_norm": 3.283334493637085, "learning_rate": 1.021994873924434e-05, "loss": 0.6359, "step": 3945 }, { "epoch": 0.28108416141325643, "grad_norm": 2.5186989307403564, "learning_rate": 1.021602169980866e-05, "loss": 0.6366, "step": 3946 }, { "epoch": 0.2811553940948107, "grad_norm": 2.202291250228882, "learning_rate": 1.0212094627043056e-05, "loss": 0.2807, "step": 3947 }, { "epoch": 0.281226626776365, "grad_norm": 4.206649303436279, "learning_rate": 1.0208167521553439e-05, "loss": 0.1952, "step": 3948 }, { "epoch": 0.2812978594579193, "grad_norm": 4.352333068847656, "learning_rate": 1.0204240383945709e-05, "loss": 0.763, "step": 3949 }, { "epoch": 0.2813690921394736, "grad_norm": 5.495899200439453, "learning_rate": 1.0200313214825797e-05, "loss": 0.3064, "step": 3950 }, { "epoch": 0.2814403248210279, "grad_norm": 2.284925937652588, "learning_rate": 1.0196386014799617e-05, "loss": 0.2525, "step": 3951 }, { "epoch": 0.28151155750258217, "grad_norm": 3.0302059650421143, "learning_rate": 1.0192458784473099e-05, "loss": 0.5684, "step": 3952 }, { "epoch": 0.28158279018413646, "grad_norm": 5.723763465881348, "learning_rate": 1.0188531524452173e-05, "loss": 0.5409, "step": 3953 }, { "epoch": 0.2816540228656908, "grad_norm": 3.0475950241088867, "learning_rate": 1.018460423534277e-05, "loss": 0.3253, "step": 3954 }, { "epoch": 0.2817252555472451, "grad_norm": 4.49261474609375, "learning_rate": 1.0180676917750839e-05, "loss": 0.5858, "step": 3955 }, { "epoch": 0.2817964882287994, "grad_norm": 4.2654128074646, "learning_rate": 1.0176749572282318e-05, "loss": 0.3538, "step": 3956 }, { "epoch": 0.28186772091035367, "grad_norm": 1.3470141887664795, "learning_rate": 1.0172822199543155e-05, "loss": 0.1284, "step": 3957 }, { "epoch": 0.28193895359190796, "grad_norm": 6.843236446380615, "learning_rate": 1.0168894800139311e-05, "loss": 0.7593, "step": 3958 }, { "epoch": 0.28201018627346225, "grad_norm": 3.2009077072143555, "learning_rate": 1.0164967374676737e-05, "loss": 0.1655, "step": 3959 }, { "epoch": 0.28208141895501654, "grad_norm": 2.9170150756835938, "learning_rate": 1.0161039923761398e-05, "loss": 0.7612, "step": 3960 }, { "epoch": 0.2821526516365709, "grad_norm": 2.432640790939331, "learning_rate": 1.0157112447999255e-05, "loss": 0.5032, "step": 3961 }, { "epoch": 0.2822238843181252, "grad_norm": 2.879519462585449, "learning_rate": 1.0153184947996282e-05, "loss": 0.1981, "step": 3962 }, { "epoch": 0.28229511699967946, "grad_norm": 3.4942972660064697, "learning_rate": 1.0149257424358445e-05, "loss": 0.4106, "step": 3963 }, { "epoch": 0.28236634968123375, "grad_norm": 2.8611533641815186, "learning_rate": 1.0145329877691725e-05, "loss": 0.6279, "step": 3964 }, { "epoch": 0.28243758236278804, "grad_norm": 4.407961845397949, "learning_rate": 1.0141402308602104e-05, "loss": 0.531, "step": 3965 }, { "epoch": 0.28250881504434233, "grad_norm": 2.4765703678131104, "learning_rate": 1.0137474717695561e-05, "loss": 0.2264, "step": 3966 }, { "epoch": 0.2825800477258966, "grad_norm": 2.5087716579437256, "learning_rate": 1.0133547105578085e-05, "loss": 0.5755, "step": 3967 }, { "epoch": 0.2826512804074509, "grad_norm": 3.234992027282715, "learning_rate": 1.012961947285567e-05, "loss": 0.3239, "step": 3968 }, { "epoch": 0.28272251308900526, "grad_norm": 2.825155258178711, "learning_rate": 1.0125691820134299e-05, "loss": 0.4844, "step": 3969 }, { "epoch": 0.28279374577055955, "grad_norm": 3.819096565246582, "learning_rate": 1.0121764148019977e-05, "loss": 0.6937, "step": 3970 }, { "epoch": 0.28286497845211384, "grad_norm": 4.3054728507995605, "learning_rate": 1.0117836457118701e-05, "loss": 0.1645, "step": 3971 }, { "epoch": 0.2829362111336681, "grad_norm": 2.489229440689087, "learning_rate": 1.0113908748036471e-05, "loss": 0.4596, "step": 3972 }, { "epoch": 0.2830074438152224, "grad_norm": 6.936276435852051, "learning_rate": 1.0109981021379297e-05, "loss": 0.7899, "step": 3973 }, { "epoch": 0.2830786764967767, "grad_norm": 4.9916768074035645, "learning_rate": 1.0106053277753182e-05, "loss": 0.2772, "step": 3974 }, { "epoch": 0.283149909178331, "grad_norm": 3.625915765762329, "learning_rate": 1.0102125517764144e-05, "loss": 0.7682, "step": 3975 }, { "epoch": 0.28322114185988534, "grad_norm": 3.4747960567474365, "learning_rate": 1.0098197742018185e-05, "loss": 0.6625, "step": 3976 }, { "epoch": 0.28329237454143963, "grad_norm": 3.791959285736084, "learning_rate": 1.0094269951121326e-05, "loss": 0.5204, "step": 3977 }, { "epoch": 0.2833636072229939, "grad_norm": 5.595301628112793, "learning_rate": 1.0090342145679584e-05, "loss": 0.9777, "step": 3978 }, { "epoch": 0.2834348399045482, "grad_norm": 4.56858491897583, "learning_rate": 1.008641432629898e-05, "loss": 0.7575, "step": 3979 }, { "epoch": 0.2835060725861025, "grad_norm": 5.759364604949951, "learning_rate": 1.0082486493585535e-05, "loss": 0.4732, "step": 3980 }, { "epoch": 0.2835773052676568, "grad_norm": 4.667255878448486, "learning_rate": 1.0078558648145273e-05, "loss": 0.6935, "step": 3981 }, { "epoch": 0.2836485379492111, "grad_norm": 1.6499096155166626, "learning_rate": 1.0074630790584223e-05, "loss": 0.1368, "step": 3982 }, { "epoch": 0.2837197706307654, "grad_norm": 3.4402241706848145, "learning_rate": 1.0070702921508408e-05, "loss": 0.6838, "step": 3983 }, { "epoch": 0.2837910033123197, "grad_norm": 1.9420626163482666, "learning_rate": 1.0066775041523864e-05, "loss": 0.3453, "step": 3984 }, { "epoch": 0.283862235993874, "grad_norm": 4.078906536102295, "learning_rate": 1.0062847151236616e-05, "loss": 0.4142, "step": 3985 }, { "epoch": 0.2839334686754283, "grad_norm": 4.739321708679199, "learning_rate": 1.00589192512527e-05, "loss": 0.9179, "step": 3986 }, { "epoch": 0.2840047013569826, "grad_norm": 1.9154719114303589, "learning_rate": 1.005499134217815e-05, "loss": 0.2406, "step": 3987 }, { "epoch": 0.28407593403853687, "grad_norm": 3.321984052658081, "learning_rate": 1.0051063424619e-05, "loss": 0.4548, "step": 3988 }, { "epoch": 0.28414716672009116, "grad_norm": 0.3219392001628876, "learning_rate": 1.0047135499181293e-05, "loss": 0.0047, "step": 3989 }, { "epoch": 0.28421839940164545, "grad_norm": 2.3134663105010986, "learning_rate": 1.0043207566471064e-05, "loss": 0.4129, "step": 3990 }, { "epoch": 0.2842896320831998, "grad_norm": 2.31357479095459, "learning_rate": 1.0039279627094352e-05, "loss": 0.3369, "step": 3991 }, { "epoch": 0.2843608647647541, "grad_norm": 3.108046770095825, "learning_rate": 1.0035351681657194e-05, "loss": 0.1054, "step": 3992 }, { "epoch": 0.2844320974463084, "grad_norm": 2.39547061920166, "learning_rate": 1.0031423730765642e-05, "loss": 0.4103, "step": 3993 }, { "epoch": 0.28450333012786266, "grad_norm": 2.4185502529144287, "learning_rate": 1.0027495775025726e-05, "loss": 0.5764, "step": 3994 }, { "epoch": 0.28457456280941695, "grad_norm": 3.4710404872894287, "learning_rate": 1.0023567815043498e-05, "loss": 0.3399, "step": 3995 }, { "epoch": 0.28464579549097124, "grad_norm": 3.3729958534240723, "learning_rate": 1.0019639851424998e-05, "loss": 0.5034, "step": 3996 }, { "epoch": 0.28471702817252553, "grad_norm": 5.744598865509033, "learning_rate": 1.0015711884776274e-05, "loss": 0.8152, "step": 3997 }, { "epoch": 0.2847882608540799, "grad_norm": 2.788661241531372, "learning_rate": 1.0011783915703367e-05, "loss": 0.4456, "step": 3998 }, { "epoch": 0.28485949353563417, "grad_norm": 2.789508819580078, "learning_rate": 1.0007855944812321e-05, "loss": 0.5543, "step": 3999 }, { "epoch": 0.28493072621718846, "grad_norm": 3.627718448638916, "learning_rate": 1.0003927972709182e-05, "loss": 0.448, "step": 4000 }, { "epoch": 0.28500195889874275, "grad_norm": 2.1252119541168213, "learning_rate": 1e-05, "loss": 0.3813, "step": 4001 }, { "epoch": 0.28507319158029704, "grad_norm": 3.964925765991211, "learning_rate": 9.996072027290818e-06, "loss": 0.4164, "step": 4002 }, { "epoch": 0.2851444242618513, "grad_norm": 2.9158616065979004, "learning_rate": 9.992144055187684e-06, "loss": 0.7013, "step": 4003 }, { "epoch": 0.2852156569434056, "grad_norm": 2.7341411113739014, "learning_rate": 9.988216084296637e-06, "loss": 0.4265, "step": 4004 }, { "epoch": 0.2852868896249599, "grad_norm": 2.835594892501831, "learning_rate": 9.984288115223729e-06, "loss": 0.4689, "step": 4005 }, { "epoch": 0.28535812230651425, "grad_norm": 2.0396530628204346, "learning_rate": 9.980360148575006e-06, "loss": 0.3531, "step": 4006 }, { "epoch": 0.28542935498806854, "grad_norm": 2.7819180488586426, "learning_rate": 9.976432184956504e-06, "loss": 0.516, "step": 4007 }, { "epoch": 0.28550058766962283, "grad_norm": 5.5193023681640625, "learning_rate": 9.972504224974274e-06, "loss": 0.6017, "step": 4008 }, { "epoch": 0.2855718203511771, "grad_norm": 3.361731767654419, "learning_rate": 9.968576269234365e-06, "loss": 0.463, "step": 4009 }, { "epoch": 0.2856430530327314, "grad_norm": 1.8981995582580566, "learning_rate": 9.964648318342807e-06, "loss": 0.478, "step": 4010 }, { "epoch": 0.2857142857142857, "grad_norm": 2.9542253017425537, "learning_rate": 9.960720372905651e-06, "loss": 0.3937, "step": 4011 }, { "epoch": 0.28578551839584, "grad_norm": 2.9699480533599854, "learning_rate": 9.95679243352894e-06, "loss": 0.3568, "step": 4012 }, { "epoch": 0.28585675107739433, "grad_norm": 3.619741439819336, "learning_rate": 9.95286450081871e-06, "loss": 0.5656, "step": 4013 }, { "epoch": 0.2859279837589486, "grad_norm": 2.4119086265563965, "learning_rate": 9.948936575381001e-06, "loss": 0.5543, "step": 4014 }, { "epoch": 0.2859992164405029, "grad_norm": 5.564222812652588, "learning_rate": 9.945008657821856e-06, "loss": 0.4076, "step": 4015 }, { "epoch": 0.2860704491220572, "grad_norm": 3.6046664714813232, "learning_rate": 9.941080748747305e-06, "loss": 0.51, "step": 4016 }, { "epoch": 0.2861416818036115, "grad_norm": 3.955432176589966, "learning_rate": 9.937152848763387e-06, "loss": 0.4013, "step": 4017 }, { "epoch": 0.2862129144851658, "grad_norm": 3.9843950271606445, "learning_rate": 9.933224958476143e-06, "loss": 0.5597, "step": 4018 }, { "epoch": 0.28628414716672007, "grad_norm": 2.5851476192474365, "learning_rate": 9.929297078491594e-06, "loss": 0.6175, "step": 4019 }, { "epoch": 0.28635537984827436, "grad_norm": 3.719390392303467, "learning_rate": 9.92536920941578e-06, "loss": 0.4196, "step": 4020 }, { "epoch": 0.2864266125298287, "grad_norm": 2.472452402114868, "learning_rate": 9.921441351854727e-06, "loss": 0.3503, "step": 4021 }, { "epoch": 0.286497845211383, "grad_norm": 3.3514931201934814, "learning_rate": 9.917513506414468e-06, "loss": 0.7573, "step": 4022 }, { "epoch": 0.2865690778929373, "grad_norm": 2.4450571537017822, "learning_rate": 9.913585673701023e-06, "loss": 0.4402, "step": 4023 }, { "epoch": 0.2866403105744916, "grad_norm": 4.8062591552734375, "learning_rate": 9.909657854320417e-06, "loss": 0.6534, "step": 4024 }, { "epoch": 0.28671154325604586, "grad_norm": 10.457976341247559, "learning_rate": 9.905730048878678e-06, "loss": 0.5615, "step": 4025 }, { "epoch": 0.28678277593760015, "grad_norm": 1.6271241903305054, "learning_rate": 9.901802257981819e-06, "loss": 0.1853, "step": 4026 }, { "epoch": 0.28685400861915444, "grad_norm": 1.9878695011138916, "learning_rate": 9.897874482235862e-06, "loss": 0.2765, "step": 4027 }, { "epoch": 0.2869252413007088, "grad_norm": 3.067518472671509, "learning_rate": 9.893946722246821e-06, "loss": 0.427, "step": 4028 }, { "epoch": 0.2869964739822631, "grad_norm": 2.963838815689087, "learning_rate": 9.890018978620706e-06, "loss": 0.6582, "step": 4029 }, { "epoch": 0.28706770666381737, "grad_norm": 2.490366220474243, "learning_rate": 9.886091251963529e-06, "loss": 0.5951, "step": 4030 }, { "epoch": 0.28713893934537166, "grad_norm": 2.5107650756835938, "learning_rate": 9.882163542881304e-06, "loss": 0.3654, "step": 4031 }, { "epoch": 0.28721017202692595, "grad_norm": 4.67518424987793, "learning_rate": 9.878235851980027e-06, "loss": 0.7705, "step": 4032 }, { "epoch": 0.28728140470848024, "grad_norm": 4.030263423919678, "learning_rate": 9.874308179865701e-06, "loss": 0.793, "step": 4033 }, { "epoch": 0.2873526373900345, "grad_norm": 3.190770387649536, "learning_rate": 9.870380527144336e-06, "loss": 0.4656, "step": 4034 }, { "epoch": 0.28742387007158887, "grad_norm": 5.1777191162109375, "learning_rate": 9.866452894421918e-06, "loss": 0.3494, "step": 4035 }, { "epoch": 0.28749510275314316, "grad_norm": 2.7876784801483154, "learning_rate": 9.86252528230444e-06, "loss": 0.5064, "step": 4036 }, { "epoch": 0.28756633543469745, "grad_norm": 3.182436227798462, "learning_rate": 9.858597691397901e-06, "loss": 0.46, "step": 4037 }, { "epoch": 0.28763756811625174, "grad_norm": 2.167931079864502, "learning_rate": 9.854670122308276e-06, "loss": 0.2867, "step": 4038 }, { "epoch": 0.28770880079780603, "grad_norm": 3.4726762771606445, "learning_rate": 9.850742575641557e-06, "loss": 0.7477, "step": 4039 }, { "epoch": 0.2877800334793603, "grad_norm": 2.4985105991363525, "learning_rate": 9.846815052003723e-06, "loss": 0.1281, "step": 4040 }, { "epoch": 0.2878512661609146, "grad_norm": 2.201894521713257, "learning_rate": 9.842887552000746e-06, "loss": 0.443, "step": 4041 }, { "epoch": 0.2879224988424689, "grad_norm": 2.197289228439331, "learning_rate": 9.838960076238604e-06, "loss": 0.4255, "step": 4042 }, { "epoch": 0.28799373152402324, "grad_norm": 3.3056821823120117, "learning_rate": 9.835032625323265e-06, "loss": 0.3975, "step": 4043 }, { "epoch": 0.28806496420557753, "grad_norm": 3.2037341594696045, "learning_rate": 9.83110519986069e-06, "loss": 0.1084, "step": 4044 }, { "epoch": 0.2881361968871318, "grad_norm": 2.3408052921295166, "learning_rate": 9.827177800456843e-06, "loss": 0.4788, "step": 4045 }, { "epoch": 0.2882074295686861, "grad_norm": 2.9474918842315674, "learning_rate": 9.823250427717687e-06, "loss": 0.524, "step": 4046 }, { "epoch": 0.2882786622502404, "grad_norm": 2.689678192138672, "learning_rate": 9.819323082249165e-06, "loss": 0.1368, "step": 4047 }, { "epoch": 0.2883498949317947, "grad_norm": 3.1431353092193604, "learning_rate": 9.81539576465723e-06, "loss": 0.4685, "step": 4048 }, { "epoch": 0.288421127613349, "grad_norm": 3.721433639526367, "learning_rate": 9.811468475547832e-06, "loss": 0.5679, "step": 4049 }, { "epoch": 0.2884923602949033, "grad_norm": 2.432645559310913, "learning_rate": 9.807541215526906e-06, "loss": 0.3927, "step": 4050 }, { "epoch": 0.2885635929764576, "grad_norm": 3.720099925994873, "learning_rate": 9.803613985200385e-06, "loss": 0.5506, "step": 4051 }, { "epoch": 0.2886348256580119, "grad_norm": 4.3786234855651855, "learning_rate": 9.799686785174208e-06, "loss": 0.4188, "step": 4052 }, { "epoch": 0.2887060583395662, "grad_norm": 2.2990334033966064, "learning_rate": 9.795759616054293e-06, "loss": 0.4873, "step": 4053 }, { "epoch": 0.2887772910211205, "grad_norm": 4.898111820220947, "learning_rate": 9.791832478446566e-06, "loss": 0.8392, "step": 4054 }, { "epoch": 0.2888485237026748, "grad_norm": 2.7415847778320312, "learning_rate": 9.787905372956947e-06, "loss": 0.471, "step": 4055 }, { "epoch": 0.28891975638422907, "grad_norm": 3.634413242340088, "learning_rate": 9.783978300191343e-06, "loss": 0.5024, "step": 4056 }, { "epoch": 0.28899098906578335, "grad_norm": 3.2254538536071777, "learning_rate": 9.780051260755663e-06, "loss": 0.3903, "step": 4057 }, { "epoch": 0.2890622217473377, "grad_norm": 3.489807605743408, "learning_rate": 9.776124255255808e-06, "loss": 0.6951, "step": 4058 }, { "epoch": 0.289133454428892, "grad_norm": 3.117826223373413, "learning_rate": 9.772197284297677e-06, "loss": 0.6652, "step": 4059 }, { "epoch": 0.2892046871104463, "grad_norm": 4.064243793487549, "learning_rate": 9.768270348487156e-06, "loss": 0.3175, "step": 4060 }, { "epoch": 0.28927591979200057, "grad_norm": 2.928380012512207, "learning_rate": 9.764343448430132e-06, "loss": 0.3841, "step": 4061 }, { "epoch": 0.28934715247355486, "grad_norm": 4.227704048156738, "learning_rate": 9.760416584732494e-06, "loss": 0.4899, "step": 4062 }, { "epoch": 0.28941838515510915, "grad_norm": 3.5361177921295166, "learning_rate": 9.756489758000105e-06, "loss": 0.3795, "step": 4063 }, { "epoch": 0.28948961783666344, "grad_norm": 5.6958394050598145, "learning_rate": 9.75256296883884e-06, "loss": 0.1869, "step": 4064 }, { "epoch": 0.2895608505182178, "grad_norm": 3.372347593307495, "learning_rate": 9.748636217854562e-06, "loss": 0.4528, "step": 4065 }, { "epoch": 0.2896320831997721, "grad_norm": 2.7643637657165527, "learning_rate": 9.744709505653126e-06, "loss": 0.6424, "step": 4066 }, { "epoch": 0.28970331588132636, "grad_norm": 2.6104061603546143, "learning_rate": 9.740782832840382e-06, "loss": 0.0828, "step": 4067 }, { "epoch": 0.28977454856288065, "grad_norm": 2.064906597137451, "learning_rate": 9.736856200022182e-06, "loss": 0.3261, "step": 4068 }, { "epoch": 0.28984578124443494, "grad_norm": 3.1948282718658447, "learning_rate": 9.732929607804357e-06, "loss": 0.6348, "step": 4069 }, { "epoch": 0.28991701392598923, "grad_norm": 1.9013447761535645, "learning_rate": 9.729003056792742e-06, "loss": 0.1183, "step": 4070 }, { "epoch": 0.2899882466075435, "grad_norm": 2.43251895904541, "learning_rate": 9.72507654759317e-06, "loss": 0.1678, "step": 4071 }, { "epoch": 0.29005947928909787, "grad_norm": 4.001916885375977, "learning_rate": 9.721150080811452e-06, "loss": 0.564, "step": 4072 }, { "epoch": 0.29013071197065216, "grad_norm": 1.966252088546753, "learning_rate": 9.717223657053403e-06, "loss": 0.3736, "step": 4073 }, { "epoch": 0.29020194465220644, "grad_norm": 3.1726672649383545, "learning_rate": 9.713297276924838e-06, "loss": 0.5258, "step": 4074 }, { "epoch": 0.29027317733376073, "grad_norm": 3.973245143890381, "learning_rate": 9.709370941031544e-06, "loss": 0.6712, "step": 4075 }, { "epoch": 0.290344410015315, "grad_norm": 2.954960584640503, "learning_rate": 9.705444649979322e-06, "loss": 0.4191, "step": 4076 }, { "epoch": 0.2904156426968693, "grad_norm": 2.990523099899292, "learning_rate": 9.701518404373962e-06, "loss": 0.3576, "step": 4077 }, { "epoch": 0.2904868753784236, "grad_norm": 2.545441150665283, "learning_rate": 9.697592204821233e-06, "loss": 0.2489, "step": 4078 }, { "epoch": 0.2905581080599779, "grad_norm": 2.608447313308716, "learning_rate": 9.693666051926915e-06, "loss": 0.5325, "step": 4079 }, { "epoch": 0.29062934074153224, "grad_norm": 2.8677151203155518, "learning_rate": 9.689739946296772e-06, "loss": 0.1055, "step": 4080 }, { "epoch": 0.29070057342308653, "grad_norm": 2.477057933807373, "learning_rate": 9.685813888536559e-06, "loss": 0.3781, "step": 4081 }, { "epoch": 0.2907718061046408, "grad_norm": 3.8189141750335693, "learning_rate": 9.681887879252025e-06, "loss": 0.5759, "step": 4082 }, { "epoch": 0.2908430387861951, "grad_norm": 2.05157470703125, "learning_rate": 9.67796191904892e-06, "loss": 0.2463, "step": 4083 }, { "epoch": 0.2909142714677494, "grad_norm": 1.9026663303375244, "learning_rate": 9.67403600853297e-06, "loss": 0.1285, "step": 4084 }, { "epoch": 0.2909855041493037, "grad_norm": 3.9894161224365234, "learning_rate": 9.670110148309907e-06, "loss": 0.4443, "step": 4085 }, { "epoch": 0.291056736830858, "grad_norm": 2.4964449405670166, "learning_rate": 9.666184338985456e-06, "loss": 0.5384, "step": 4086 }, { "epoch": 0.2911279695124123, "grad_norm": 5.529338836669922, "learning_rate": 9.66225858116532e-06, "loss": 0.2575, "step": 4087 }, { "epoch": 0.2911992021939666, "grad_norm": 2.4810118675231934, "learning_rate": 9.658332875455207e-06, "loss": 0.1577, "step": 4088 }, { "epoch": 0.2912704348755209, "grad_norm": 2.457737445831299, "learning_rate": 9.654407222460816e-06, "loss": 0.5966, "step": 4089 }, { "epoch": 0.2913416675570752, "grad_norm": 3.193065881729126, "learning_rate": 9.650481622787829e-06, "loss": 0.5565, "step": 4090 }, { "epoch": 0.2914129002386295, "grad_norm": 2.2091097831726074, "learning_rate": 9.646556077041925e-06, "loss": 0.3571, "step": 4091 }, { "epoch": 0.29148413292018377, "grad_norm": 5.5231218338012695, "learning_rate": 9.642630585828785e-06, "loss": 0.6477, "step": 4092 }, { "epoch": 0.29155536560173806, "grad_norm": 3.355095386505127, "learning_rate": 9.638705149754061e-06, "loss": 0.2466, "step": 4093 }, { "epoch": 0.29162659828329235, "grad_norm": 2.0591320991516113, "learning_rate": 9.634779769423412e-06, "loss": 0.2257, "step": 4094 }, { "epoch": 0.2916978309648467, "grad_norm": 2.7507920265197754, "learning_rate": 9.630854445442486e-06, "loss": 0.4499, "step": 4095 }, { "epoch": 0.291769063646401, "grad_norm": 2.9735898971557617, "learning_rate": 9.626929178416918e-06, "loss": 0.5008, "step": 4096 }, { "epoch": 0.2918402963279553, "grad_norm": 2.633479118347168, "learning_rate": 9.623003968952331e-06, "loss": 0.4977, "step": 4097 }, { "epoch": 0.29191152900950956, "grad_norm": 1.7078570127487183, "learning_rate": 9.619078817654352e-06, "loss": 0.3715, "step": 4098 }, { "epoch": 0.29198276169106385, "grad_norm": 3.160426378250122, "learning_rate": 9.615153725128593e-06, "loss": 0.5665, "step": 4099 }, { "epoch": 0.29205399437261814, "grad_norm": 2.2509007453918457, "learning_rate": 9.611228691980644e-06, "loss": 0.4349, "step": 4100 }, { "epoch": 0.29212522705417243, "grad_norm": 3.0502500534057617, "learning_rate": 9.607303718816108e-06, "loss": 0.6711, "step": 4101 }, { "epoch": 0.2921964597357268, "grad_norm": 3.0315239429473877, "learning_rate": 9.603378806240564e-06, "loss": 0.3989, "step": 4102 }, { "epoch": 0.29226769241728107, "grad_norm": 2.4675397872924805, "learning_rate": 9.599453954859586e-06, "loss": 0.3302, "step": 4103 }, { "epoch": 0.29233892509883536, "grad_norm": 3.3986399173736572, "learning_rate": 9.595529165278736e-06, "loss": 0.6642, "step": 4104 }, { "epoch": 0.29241015778038965, "grad_norm": 2.8814148902893066, "learning_rate": 9.591604438103574e-06, "loss": 0.2675, "step": 4105 }, { "epoch": 0.29248139046194394, "grad_norm": 3.3225972652435303, "learning_rate": 9.587679773939637e-06, "loss": 0.2954, "step": 4106 }, { "epoch": 0.2925526231434982, "grad_norm": 2.616800308227539, "learning_rate": 9.583755173392467e-06, "loss": 0.3479, "step": 4107 }, { "epoch": 0.2926238558250525, "grad_norm": 3.2265069484710693, "learning_rate": 9.57983063706759e-06, "loss": 0.1764, "step": 4108 }, { "epoch": 0.2926950885066068, "grad_norm": 3.2894790172576904, "learning_rate": 9.575906165570515e-06, "loss": 0.4423, "step": 4109 }, { "epoch": 0.29276632118816115, "grad_norm": 3.706843137741089, "learning_rate": 9.571981759506753e-06, "loss": 0.6328, "step": 4110 }, { "epoch": 0.29283755386971544, "grad_norm": 3.004138231277466, "learning_rate": 9.5680574194818e-06, "loss": 0.5313, "step": 4111 }, { "epoch": 0.29290878655126973, "grad_norm": 3.333873748779297, "learning_rate": 9.564133146101134e-06, "loss": 0.386, "step": 4112 }, { "epoch": 0.292980019232824, "grad_norm": 2.4493772983551025, "learning_rate": 9.560208939970236e-06, "loss": 0.203, "step": 4113 }, { "epoch": 0.2930512519143783, "grad_norm": 4.411905765533447, "learning_rate": 9.556284801694573e-06, "loss": 0.171, "step": 4114 }, { "epoch": 0.2931224845959326, "grad_norm": 2.3106377124786377, "learning_rate": 9.552360731879593e-06, "loss": 0.2913, "step": 4115 }, { "epoch": 0.2931937172774869, "grad_norm": 2.136394500732422, "learning_rate": 9.54843673113074e-06, "loss": 0.0971, "step": 4116 }, { "epoch": 0.29326494995904123, "grad_norm": 3.133444309234619, "learning_rate": 9.544512800053457e-06, "loss": 0.3824, "step": 4117 }, { "epoch": 0.2933361826405955, "grad_norm": 2.640105724334717, "learning_rate": 9.540588939253153e-06, "loss": 0.307, "step": 4118 }, { "epoch": 0.2934074153221498, "grad_norm": 2.245999813079834, "learning_rate": 9.536665149335245e-06, "loss": 0.4335, "step": 4119 }, { "epoch": 0.2934786480037041, "grad_norm": 2.433410167694092, "learning_rate": 9.532741430905135e-06, "loss": 0.2754, "step": 4120 }, { "epoch": 0.2935498806852584, "grad_norm": 3.141179323196411, "learning_rate": 9.528817784568207e-06, "loss": 0.68, "step": 4121 }, { "epoch": 0.2936211133668127, "grad_norm": 2.72590708732605, "learning_rate": 9.524894210929843e-06, "loss": 0.3126, "step": 4122 }, { "epoch": 0.29369234604836697, "grad_norm": 1.697399616241455, "learning_rate": 9.520970710595413e-06, "loss": 0.1473, "step": 4123 }, { "epoch": 0.2937635787299213, "grad_norm": 3.1475226879119873, "learning_rate": 9.517047284170266e-06, "loss": 0.1092, "step": 4124 }, { "epoch": 0.2938348114114756, "grad_norm": 4.255743503570557, "learning_rate": 9.51312393225975e-06, "loss": 0.5011, "step": 4125 }, { "epoch": 0.2939060440930299, "grad_norm": 3.522242546081543, "learning_rate": 9.509200655469201e-06, "loss": 0.6405, "step": 4126 }, { "epoch": 0.2939772767745842, "grad_norm": 2.952448606491089, "learning_rate": 9.505277454403932e-06, "loss": 0.3219, "step": 4127 }, { "epoch": 0.2940485094561385, "grad_norm": 5.4855780601501465, "learning_rate": 9.501354329669258e-06, "loss": 0.5414, "step": 4128 }, { "epoch": 0.29411974213769276, "grad_norm": 5.083133220672607, "learning_rate": 9.497431281870479e-06, "loss": 0.8076, "step": 4129 }, { "epoch": 0.29419097481924705, "grad_norm": 2.6162643432617188, "learning_rate": 9.493508311612874e-06, "loss": 0.6476, "step": 4130 }, { "epoch": 0.29426220750080134, "grad_norm": 2.9902071952819824, "learning_rate": 9.48958541950172e-06, "loss": 0.4645, "step": 4131 }, { "epoch": 0.2943334401823557, "grad_norm": 2.3105244636535645, "learning_rate": 9.485662606142285e-06, "loss": 0.1398, "step": 4132 }, { "epoch": 0.29440467286391, "grad_norm": 3.0832414627075195, "learning_rate": 9.48173987213981e-06, "loss": 0.4085, "step": 4133 }, { "epoch": 0.29447590554546427, "grad_norm": 3.9347102642059326, "learning_rate": 9.477817218099535e-06, "loss": 0.16, "step": 4134 }, { "epoch": 0.29454713822701856, "grad_norm": 1.622665286064148, "learning_rate": 9.473894644626684e-06, "loss": 0.1222, "step": 4135 }, { "epoch": 0.29461837090857285, "grad_norm": 2.184812068939209, "learning_rate": 9.469972152326476e-06, "loss": 0.3802, "step": 4136 }, { "epoch": 0.29468960359012714, "grad_norm": 3.3485543727874756, "learning_rate": 9.466049741804104e-06, "loss": 0.3659, "step": 4137 }, { "epoch": 0.2947608362716814, "grad_norm": 4.294192314147949, "learning_rate": 9.462127413664756e-06, "loss": 0.4857, "step": 4138 }, { "epoch": 0.29483206895323577, "grad_norm": 3.800283193588257, "learning_rate": 9.458205168513616e-06, "loss": 0.5518, "step": 4139 }, { "epoch": 0.29490330163479006, "grad_norm": 1.8457146883010864, "learning_rate": 9.454283006955835e-06, "loss": 0.1597, "step": 4140 }, { "epoch": 0.29497453431634435, "grad_norm": 2.472019672393799, "learning_rate": 9.450360929596565e-06, "loss": 0.305, "step": 4141 }, { "epoch": 0.29504576699789864, "grad_norm": 2.9208898544311523, "learning_rate": 9.446438937040947e-06, "loss": 0.5159, "step": 4142 }, { "epoch": 0.29511699967945293, "grad_norm": 2.8645706176757812, "learning_rate": 9.442517029894096e-06, "loss": 0.614, "step": 4143 }, { "epoch": 0.2951882323610072, "grad_norm": 3.4425740242004395, "learning_rate": 9.438595208761127e-06, "loss": 0.7872, "step": 4144 }, { "epoch": 0.2952594650425615, "grad_norm": 4.3796162605285645, "learning_rate": 9.43467347424714e-06, "loss": 0.561, "step": 4145 }, { "epoch": 0.2953306977241158, "grad_norm": 2.6283388137817383, "learning_rate": 9.43075182695721e-06, "loss": 0.3402, "step": 4146 }, { "epoch": 0.29540193040567014, "grad_norm": 2.4041013717651367, "learning_rate": 9.426830267496411e-06, "loss": 0.416, "step": 4147 }, { "epoch": 0.29547316308722443, "grad_norm": 4.143697738647461, "learning_rate": 9.422908796469804e-06, "loss": 0.321, "step": 4148 }, { "epoch": 0.2955443957687787, "grad_norm": 3.7676665782928467, "learning_rate": 9.418987414482422e-06, "loss": 0.5624, "step": 4149 }, { "epoch": 0.295615628450333, "grad_norm": 2.9743547439575195, "learning_rate": 9.415066122139298e-06, "loss": 0.5464, "step": 4150 }, { "epoch": 0.2956868611318873, "grad_norm": 3.300791025161743, "learning_rate": 9.411144920045453e-06, "loss": 0.5148, "step": 4151 }, { "epoch": 0.2957580938134416, "grad_norm": 5.823536396026611, "learning_rate": 9.407223808805878e-06, "loss": 0.4071, "step": 4152 }, { "epoch": 0.2958293264949959, "grad_norm": 2.5024566650390625, "learning_rate": 9.403302789025565e-06, "loss": 0.3922, "step": 4153 }, { "epoch": 0.2959005591765502, "grad_norm": 3.139147996902466, "learning_rate": 9.399381861309491e-06, "loss": 0.2712, "step": 4154 }, { "epoch": 0.2959717918581045, "grad_norm": 3.1788203716278076, "learning_rate": 9.395461026262607e-06, "loss": 0.6731, "step": 4155 }, { "epoch": 0.2960430245396588, "grad_norm": 2.384850025177002, "learning_rate": 9.391540284489862e-06, "loss": 0.3066, "step": 4156 }, { "epoch": 0.2961142572212131, "grad_norm": 4.896937847137451, "learning_rate": 9.387619636596189e-06, "loss": 0.6983, "step": 4157 }, { "epoch": 0.2961854899027674, "grad_norm": 2.1190273761749268, "learning_rate": 9.383699083186493e-06, "loss": 0.3534, "step": 4158 }, { "epoch": 0.2962567225843217, "grad_norm": 2.9610607624053955, "learning_rate": 9.379778624865683e-06, "loss": 0.6792, "step": 4159 }, { "epoch": 0.29632795526587596, "grad_norm": 1.7879056930541992, "learning_rate": 9.375858262238649e-06, "loss": 0.085, "step": 4160 }, { "epoch": 0.29639918794743025, "grad_norm": 5.386721611022949, "learning_rate": 9.371937995910254e-06, "loss": 0.6039, "step": 4161 }, { "epoch": 0.2964704206289846, "grad_norm": 2.8538811206817627, "learning_rate": 9.368017826485358e-06, "loss": 0.4521, "step": 4162 }, { "epoch": 0.2965416533105389, "grad_norm": 2.4626729488372803, "learning_rate": 9.364097754568805e-06, "loss": 0.4031, "step": 4163 }, { "epoch": 0.2966128859920932, "grad_norm": 3.0580620765686035, "learning_rate": 9.36017778076542e-06, "loss": 0.3149, "step": 4164 }, { "epoch": 0.29668411867364747, "grad_norm": 2.5067038536071777, "learning_rate": 9.356257905680012e-06, "loss": 0.309, "step": 4165 }, { "epoch": 0.29675535135520176, "grad_norm": 5.123416900634766, "learning_rate": 9.352338129917384e-06, "loss": 0.6046, "step": 4166 }, { "epoch": 0.29682658403675605, "grad_norm": 4.048019886016846, "learning_rate": 9.348418454082309e-06, "loss": 0.6818, "step": 4167 }, { "epoch": 0.29689781671831034, "grad_norm": 2.170663595199585, "learning_rate": 9.344498878779557e-06, "loss": 0.4243, "step": 4168 }, { "epoch": 0.2969690493998647, "grad_norm": 2.3443777561187744, "learning_rate": 9.34057940461388e-06, "loss": 0.3126, "step": 4169 }, { "epoch": 0.29704028208141897, "grad_norm": 5.4447021484375, "learning_rate": 9.336660032190012e-06, "loss": 0.5645, "step": 4170 }, { "epoch": 0.29711151476297326, "grad_norm": 3.2814650535583496, "learning_rate": 9.332740762112664e-06, "loss": 0.4491, "step": 4171 }, { "epoch": 0.29718274744452755, "grad_norm": 2.601689338684082, "learning_rate": 9.32882159498655e-06, "loss": 0.4202, "step": 4172 }, { "epoch": 0.29725398012608184, "grad_norm": 4.264282703399658, "learning_rate": 9.324902531416348e-06, "loss": 0.4439, "step": 4173 }, { "epoch": 0.29732521280763613, "grad_norm": 2.4646363258361816, "learning_rate": 9.320983572006734e-06, "loss": 0.2532, "step": 4174 }, { "epoch": 0.2973964454891904, "grad_norm": 2.3687684535980225, "learning_rate": 9.317064717362363e-06, "loss": 0.3922, "step": 4175 }, { "epoch": 0.29746767817074476, "grad_norm": 4.296957492828369, "learning_rate": 9.313145968087876e-06, "loss": 0.6854, "step": 4176 }, { "epoch": 0.29753891085229905, "grad_norm": 2.350059747695923, "learning_rate": 9.309227324787892e-06, "loss": 0.2494, "step": 4177 }, { "epoch": 0.29761014353385334, "grad_norm": 6.942707538604736, "learning_rate": 9.305308788067015e-06, "loss": 0.6538, "step": 4178 }, { "epoch": 0.29768137621540763, "grad_norm": 5.558845520019531, "learning_rate": 9.301390358529842e-06, "loss": 0.7226, "step": 4179 }, { "epoch": 0.2977526088969619, "grad_norm": 2.8671700954437256, "learning_rate": 9.297472036780939e-06, "loss": 0.3875, "step": 4180 }, { "epoch": 0.2978238415785162, "grad_norm": 4.167874336242676, "learning_rate": 9.293553823424865e-06, "loss": 0.9383, "step": 4181 }, { "epoch": 0.2978950742600705, "grad_norm": 2.2515015602111816, "learning_rate": 9.289635719066166e-06, "loss": 0.34, "step": 4182 }, { "epoch": 0.2979663069416248, "grad_norm": 3.4035775661468506, "learning_rate": 9.285717724309357e-06, "loss": 0.6153, "step": 4183 }, { "epoch": 0.29803753962317914, "grad_norm": 3.781683921813965, "learning_rate": 9.281799839758949e-06, "loss": 0.5812, "step": 4184 }, { "epoch": 0.2981087723047334, "grad_norm": 3.4622650146484375, "learning_rate": 9.277882066019429e-06, "loss": 0.5229, "step": 4185 }, { "epoch": 0.2981800049862877, "grad_norm": 2.6589276790618896, "learning_rate": 9.27396440369527e-06, "loss": 0.468, "step": 4186 }, { "epoch": 0.298251237667842, "grad_norm": 2.8973019123077393, "learning_rate": 9.270046853390924e-06, "loss": 0.2033, "step": 4187 }, { "epoch": 0.2983224703493963, "grad_norm": 3.884953260421753, "learning_rate": 9.266129415710837e-06, "loss": 0.3775, "step": 4188 }, { "epoch": 0.2983937030309506, "grad_norm": 2.4476912021636963, "learning_rate": 9.26221209125942e-06, "loss": 0.3103, "step": 4189 }, { "epoch": 0.2984649357125049, "grad_norm": 1.704095721244812, "learning_rate": 9.258294880641078e-06, "loss": 0.1707, "step": 4190 }, { "epoch": 0.2985361683940592, "grad_norm": 2.260134220123291, "learning_rate": 9.254377784460202e-06, "loss": 0.2407, "step": 4191 }, { "epoch": 0.2986074010756135, "grad_norm": 2.1794028282165527, "learning_rate": 9.250460803321156e-06, "loss": 0.3514, "step": 4192 }, { "epoch": 0.2986786337571678, "grad_norm": 3.4171605110168457, "learning_rate": 9.246543937828284e-06, "loss": 0.4727, "step": 4193 }, { "epoch": 0.2987498664387221, "grad_norm": 3.384889841079712, "learning_rate": 9.242627188585928e-06, "loss": 0.8188, "step": 4194 }, { "epoch": 0.2988210991202764, "grad_norm": 3.3835582733154297, "learning_rate": 9.238710556198395e-06, "loss": 0.4797, "step": 4195 }, { "epoch": 0.29889233180183067, "grad_norm": 2.116657018661499, "learning_rate": 9.234794041269982e-06, "loss": 0.2992, "step": 4196 }, { "epoch": 0.29896356448338496, "grad_norm": 2.9496450424194336, "learning_rate": 9.230877644404974e-06, "loss": 0.3188, "step": 4197 }, { "epoch": 0.29903479716493925, "grad_norm": 5.167093753814697, "learning_rate": 9.226961366207619e-06, "loss": 0.4239, "step": 4198 }, { "epoch": 0.2991060298464936, "grad_norm": 2.9234161376953125, "learning_rate": 9.223045207282167e-06, "loss": 0.2966, "step": 4199 }, { "epoch": 0.2991772625280479, "grad_norm": 3.522753953933716, "learning_rate": 9.21912916823284e-06, "loss": 0.4791, "step": 4200 }, { "epoch": 0.29924849520960217, "grad_norm": 1.7146542072296143, "learning_rate": 9.215213249663839e-06, "loss": 0.1275, "step": 4201 }, { "epoch": 0.29931972789115646, "grad_norm": 1.8150936365127563, "learning_rate": 9.211297452179348e-06, "loss": 0.3135, "step": 4202 }, { "epoch": 0.29939096057271075, "grad_norm": 5.876014709472656, "learning_rate": 9.207381776383546e-06, "loss": 0.5979, "step": 4203 }, { "epoch": 0.29946219325426504, "grad_norm": 3.2576115131378174, "learning_rate": 9.203466222880567e-06, "loss": 0.6229, "step": 4204 }, { "epoch": 0.29953342593581933, "grad_norm": 3.5881216526031494, "learning_rate": 9.199550792274548e-06, "loss": 0.7513, "step": 4205 }, { "epoch": 0.2996046586173737, "grad_norm": 2.1901488304138184, "learning_rate": 9.195635485169604e-06, "loss": 0.1846, "step": 4206 }, { "epoch": 0.29967589129892797, "grad_norm": 2.0700418949127197, "learning_rate": 9.191720302169815e-06, "loss": 0.3556, "step": 4207 }, { "epoch": 0.29974712398048226, "grad_norm": 3.050637722015381, "learning_rate": 9.187805243879263e-06, "loss": 0.7946, "step": 4208 }, { "epoch": 0.29981835666203654, "grad_norm": 2.864159345626831, "learning_rate": 9.183890310902001e-06, "loss": 0.5633, "step": 4209 }, { "epoch": 0.29988958934359083, "grad_norm": 1.9860117435455322, "learning_rate": 9.179975503842053e-06, "loss": 0.1941, "step": 4210 }, { "epoch": 0.2999608220251451, "grad_norm": 3.463078260421753, "learning_rate": 9.176060823303442e-06, "loss": 0.7851, "step": 4211 }, { "epoch": 0.3000320547066994, "grad_norm": 3.348513126373291, "learning_rate": 9.17214626989016e-06, "loss": 0.5481, "step": 4212 }, { "epoch": 0.3001032873882537, "grad_norm": 3.0467584133148193, "learning_rate": 9.168231844206188e-06, "loss": 0.5316, "step": 4213 }, { "epoch": 0.30017452006980805, "grad_norm": 3.8142752647399902, "learning_rate": 9.164317546855475e-06, "loss": 0.409, "step": 4214 }, { "epoch": 0.30024575275136234, "grad_norm": 1.9620729684829712, "learning_rate": 9.160403378441957e-06, "loss": 0.0993, "step": 4215 }, { "epoch": 0.3003169854329166, "grad_norm": 3.7203292846679688, "learning_rate": 9.156489339569555e-06, "loss": 0.7536, "step": 4216 }, { "epoch": 0.3003882181144709, "grad_norm": 10.8714017868042, "learning_rate": 9.152575430842156e-06, "loss": 0.256, "step": 4217 }, { "epoch": 0.3004594507960252, "grad_norm": 3.9359774589538574, "learning_rate": 9.148661652863644e-06, "loss": 0.6915, "step": 4218 }, { "epoch": 0.3005306834775795, "grad_norm": 4.285372257232666, "learning_rate": 9.144748006237873e-06, "loss": 0.432, "step": 4219 }, { "epoch": 0.3006019161591338, "grad_norm": 2.9534921646118164, "learning_rate": 9.140834491568675e-06, "loss": 0.6149, "step": 4220 }, { "epoch": 0.30067314884068813, "grad_norm": 2.1374928951263428, "learning_rate": 9.136921109459869e-06, "loss": 0.4379, "step": 4221 }, { "epoch": 0.3007443815222424, "grad_norm": 2.0171995162963867, "learning_rate": 9.133007860515248e-06, "loss": 0.3359, "step": 4222 }, { "epoch": 0.3008156142037967, "grad_norm": 4.518317699432373, "learning_rate": 9.129094745338586e-06, "loss": 0.7312, "step": 4223 }, { "epoch": 0.300886846885351, "grad_norm": 3.0956170558929443, "learning_rate": 9.125181764533632e-06, "loss": 0.1923, "step": 4224 }, { "epoch": 0.3009580795669053, "grad_norm": 2.8351054191589355, "learning_rate": 9.12126891870413e-06, "loss": 0.7484, "step": 4225 }, { "epoch": 0.3010293122484596, "grad_norm": 5.95188045501709, "learning_rate": 9.11735620845378e-06, "loss": 0.6748, "step": 4226 }, { "epoch": 0.30110054493001387, "grad_norm": 7.709107875823975, "learning_rate": 9.113443634386277e-06, "loss": 0.3809, "step": 4227 }, { "epoch": 0.3011717776115682, "grad_norm": 2.002079963684082, "learning_rate": 9.109531197105295e-06, "loss": 0.3791, "step": 4228 }, { "epoch": 0.3012430102931225, "grad_norm": 5.689868927001953, "learning_rate": 9.105618897214475e-06, "loss": 0.7795, "step": 4229 }, { "epoch": 0.3013142429746768, "grad_norm": 5.8590288162231445, "learning_rate": 9.101706735317451e-06, "loss": 0.5874, "step": 4230 }, { "epoch": 0.3013854756562311, "grad_norm": 5.056861877441406, "learning_rate": 9.09779471201783e-06, "loss": 0.9011, "step": 4231 }, { "epoch": 0.3014567083377854, "grad_norm": 3.2556560039520264, "learning_rate": 9.09388282791919e-06, "loss": 0.7476, "step": 4232 }, { "epoch": 0.30152794101933966, "grad_norm": 4.825651168823242, "learning_rate": 9.089971083625098e-06, "loss": 0.1173, "step": 4233 }, { "epoch": 0.30159917370089395, "grad_norm": 3.9581308364868164, "learning_rate": 9.086059479739099e-06, "loss": 0.8258, "step": 4234 }, { "epoch": 0.30167040638244824, "grad_norm": 6.5599822998046875, "learning_rate": 9.08214801686471e-06, "loss": 0.4252, "step": 4235 }, { "epoch": 0.3017416390640026, "grad_norm": 2.8494198322296143, "learning_rate": 9.078236695605426e-06, "loss": 0.4221, "step": 4236 }, { "epoch": 0.3018128717455569, "grad_norm": 3.549252510070801, "learning_rate": 9.074325516564734e-06, "loss": 0.7855, "step": 4237 }, { "epoch": 0.30188410442711117, "grad_norm": 2.795511484146118, "learning_rate": 9.07041448034608e-06, "loss": 0.6525, "step": 4238 }, { "epoch": 0.30195533710866546, "grad_norm": 3.5152664184570312, "learning_rate": 9.066503587552895e-06, "loss": 0.6567, "step": 4239 }, { "epoch": 0.30202656979021975, "grad_norm": 2.312873363494873, "learning_rate": 9.0625928387886e-06, "loss": 0.489, "step": 4240 }, { "epoch": 0.30209780247177404, "grad_norm": 2.8336122035980225, "learning_rate": 9.05868223465657e-06, "loss": 0.7738, "step": 4241 }, { "epoch": 0.3021690351533283, "grad_norm": 3.979433536529541, "learning_rate": 9.054771775760179e-06, "loss": 0.4884, "step": 4242 }, { "epoch": 0.30224026783488267, "grad_norm": 2.6046905517578125, "learning_rate": 9.050861462702772e-06, "loss": 0.4968, "step": 4243 }, { "epoch": 0.30231150051643696, "grad_norm": 2.9934370517730713, "learning_rate": 9.046951296087664e-06, "loss": 0.6945, "step": 4244 }, { "epoch": 0.30238273319799125, "grad_norm": 2.9724979400634766, "learning_rate": 9.043041276518158e-06, "loss": 0.3929, "step": 4245 }, { "epoch": 0.30245396587954554, "grad_norm": 2.7028884887695312, "learning_rate": 9.039131404597531e-06, "loss": 0.2709, "step": 4246 }, { "epoch": 0.30252519856109983, "grad_norm": 3.2801566123962402, "learning_rate": 9.035221680929028e-06, "loss": 0.3251, "step": 4247 }, { "epoch": 0.3025964312426541, "grad_norm": 6.440088272094727, "learning_rate": 9.031312106115887e-06, "loss": 0.4223, "step": 4248 }, { "epoch": 0.3026676639242084, "grad_norm": 2.682718515396118, "learning_rate": 9.02740268076131e-06, "loss": 0.498, "step": 4249 }, { "epoch": 0.3027388966057627, "grad_norm": 3.9774861335754395, "learning_rate": 9.023493405468487e-06, "loss": 0.5875, "step": 4250 }, { "epoch": 0.30281012928731704, "grad_norm": 5.889970302581787, "learning_rate": 9.019584280840572e-06, "loss": 0.6406, "step": 4251 }, { "epoch": 0.30288136196887133, "grad_norm": 2.9090187549591064, "learning_rate": 9.01567530748071e-06, "loss": 0.532, "step": 4252 }, { "epoch": 0.3029525946504256, "grad_norm": 3.258918523788452, "learning_rate": 9.011766485992012e-06, "loss": 0.7493, "step": 4253 }, { "epoch": 0.3030238273319799, "grad_norm": 3.460984706878662, "learning_rate": 9.007857816977565e-06, "loss": 0.3691, "step": 4254 }, { "epoch": 0.3030950600135342, "grad_norm": 1.7857024669647217, "learning_rate": 9.003949301040439e-06, "loss": 0.1456, "step": 4255 }, { "epoch": 0.3031662926950885, "grad_norm": 3.9574220180511475, "learning_rate": 9.000040938783681e-06, "loss": 0.3123, "step": 4256 }, { "epoch": 0.3032375253766428, "grad_norm": 2.626986265182495, "learning_rate": 8.996132730810307e-06, "loss": 0.3029, "step": 4257 }, { "epoch": 0.3033087580581971, "grad_norm": 3.649916172027588, "learning_rate": 8.992224677723315e-06, "loss": 0.2524, "step": 4258 }, { "epoch": 0.3033799907397514, "grad_norm": 5.059289455413818, "learning_rate": 8.98831678012568e-06, "loss": 0.6352, "step": 4259 }, { "epoch": 0.3034512234213057, "grad_norm": 2.335726022720337, "learning_rate": 8.984409038620345e-06, "loss": 0.2543, "step": 4260 }, { "epoch": 0.30352245610286, "grad_norm": 3.5034401416778564, "learning_rate": 8.980501453810237e-06, "loss": 0.7411, "step": 4261 }, { "epoch": 0.3035936887844143, "grad_norm": 4.457218647003174, "learning_rate": 8.976594026298257e-06, "loss": 0.7272, "step": 4262 }, { "epoch": 0.3036649214659686, "grad_norm": 3.0218756198883057, "learning_rate": 8.972686756687278e-06, "loss": 0.3799, "step": 4263 }, { "epoch": 0.30373615414752286, "grad_norm": 3.197488784790039, "learning_rate": 8.968779645580153e-06, "loss": 0.4811, "step": 4264 }, { "epoch": 0.30380738682907715, "grad_norm": 2.3019626140594482, "learning_rate": 8.964872693579711e-06, "loss": 0.2158, "step": 4265 }, { "epoch": 0.3038786195106315, "grad_norm": 2.642491340637207, "learning_rate": 8.96096590128875e-06, "loss": 0.0949, "step": 4266 }, { "epoch": 0.3039498521921858, "grad_norm": 5.723636150360107, "learning_rate": 8.957059269310054e-06, "loss": 0.5944, "step": 4267 }, { "epoch": 0.3040210848737401, "grad_norm": 3.176762819290161, "learning_rate": 8.953152798246373e-06, "loss": 0.6032, "step": 4268 }, { "epoch": 0.30409231755529437, "grad_norm": 2.3747763633728027, "learning_rate": 8.949246488700431e-06, "loss": 0.2681, "step": 4269 }, { "epoch": 0.30416355023684866, "grad_norm": 7.002654552459717, "learning_rate": 8.945340341274934e-06, "loss": 0.7507, "step": 4270 }, { "epoch": 0.30423478291840295, "grad_norm": 1.5159443616867065, "learning_rate": 8.941434356572566e-06, "loss": 0.0843, "step": 4271 }, { "epoch": 0.30430601559995724, "grad_norm": 2.130636692047119, "learning_rate": 8.937528535195972e-06, "loss": 0.1511, "step": 4272 }, { "epoch": 0.3043772482815116, "grad_norm": 2.446976661682129, "learning_rate": 8.933622877747784e-06, "loss": 0.421, "step": 4273 }, { "epoch": 0.30444848096306587, "grad_norm": 3.0412063598632812, "learning_rate": 8.929717384830609e-06, "loss": 0.3657, "step": 4274 }, { "epoch": 0.30451971364462016, "grad_norm": 4.62783670425415, "learning_rate": 8.925812057047016e-06, "loss": 0.8408, "step": 4275 }, { "epoch": 0.30459094632617445, "grad_norm": 3.423529624938965, "learning_rate": 8.92190689499956e-06, "loss": 0.5226, "step": 4276 }, { "epoch": 0.30466217900772874, "grad_norm": 2.4135072231292725, "learning_rate": 8.918001899290771e-06, "loss": 0.6016, "step": 4277 }, { "epoch": 0.30473341168928303, "grad_norm": 3.608029842376709, "learning_rate": 8.914097070523143e-06, "loss": 0.6678, "step": 4278 }, { "epoch": 0.3048046443708373, "grad_norm": 2.4737792015075684, "learning_rate": 8.910192409299154e-06, "loss": 0.4822, "step": 4279 }, { "epoch": 0.30487587705239166, "grad_norm": 3.220994234085083, "learning_rate": 8.906287916221259e-06, "loss": 0.4059, "step": 4280 }, { "epoch": 0.30494710973394595, "grad_norm": 3.071329116821289, "learning_rate": 8.90238359189187e-06, "loss": 0.4436, "step": 4281 }, { "epoch": 0.30501834241550024, "grad_norm": 2.6002354621887207, "learning_rate": 8.898479436913391e-06, "loss": 0.5486, "step": 4282 }, { "epoch": 0.30508957509705453, "grad_norm": 6.955035209655762, "learning_rate": 8.894575451888194e-06, "loss": 0.6307, "step": 4283 }, { "epoch": 0.3051608077786088, "grad_norm": 6.71929407119751, "learning_rate": 8.890671637418619e-06, "loss": 0.1837, "step": 4284 }, { "epoch": 0.3052320404601631, "grad_norm": 2.3510310649871826, "learning_rate": 8.886767994106984e-06, "loss": 0.7406, "step": 4285 }, { "epoch": 0.3053032731417174, "grad_norm": 6.6554107666015625, "learning_rate": 8.882864522555588e-06, "loss": 0.5555, "step": 4286 }, { "epoch": 0.3053745058232717, "grad_norm": 3.0292935371398926, "learning_rate": 8.878961223366687e-06, "loss": 0.4901, "step": 4287 }, { "epoch": 0.30544573850482604, "grad_norm": 2.9911577701568604, "learning_rate": 8.875058097142527e-06, "loss": 0.7081, "step": 4288 }, { "epoch": 0.3055169711863803, "grad_norm": 1.7461791038513184, "learning_rate": 8.87115514448532e-06, "loss": 0.3233, "step": 4289 }, { "epoch": 0.3055882038679346, "grad_norm": 3.7038135528564453, "learning_rate": 8.867252365997249e-06, "loss": 0.2044, "step": 4290 }, { "epoch": 0.3056594365494889, "grad_norm": 3.610542058944702, "learning_rate": 8.86334976228047e-06, "loss": 0.392, "step": 4291 }, { "epoch": 0.3057306692310432, "grad_norm": 3.078010082244873, "learning_rate": 8.859447333937117e-06, "loss": 0.7098, "step": 4292 }, { "epoch": 0.3058019019125975, "grad_norm": 3.692584276199341, "learning_rate": 8.8555450815693e-06, "loss": 0.6724, "step": 4293 }, { "epoch": 0.3058731345941518, "grad_norm": 3.304020643234253, "learning_rate": 8.851643005779087e-06, "loss": 0.5025, "step": 4294 }, { "epoch": 0.3059443672757061, "grad_norm": 3.647250175476074, "learning_rate": 8.847741107168532e-06, "loss": 0.3942, "step": 4295 }, { "epoch": 0.3060155999572604, "grad_norm": 5.1784234046936035, "learning_rate": 8.843839386339662e-06, "loss": 0.7857, "step": 4296 }, { "epoch": 0.3060868326388147, "grad_norm": 3.123933792114258, "learning_rate": 8.839937843894466e-06, "loss": 0.2389, "step": 4297 }, { "epoch": 0.306158065320369, "grad_norm": 4.345554351806641, "learning_rate": 8.836036480434914e-06, "loss": 0.6279, "step": 4298 }, { "epoch": 0.3062292980019233, "grad_norm": 2.5086960792541504, "learning_rate": 8.832135296562949e-06, "loss": 0.435, "step": 4299 }, { "epoch": 0.30630053068347757, "grad_norm": 3.213310956954956, "learning_rate": 8.828234292880479e-06, "loss": 0.4052, "step": 4300 }, { "epoch": 0.30637176336503186, "grad_norm": 3.5031986236572266, "learning_rate": 8.824333469989388e-06, "loss": 0.5453, "step": 4301 }, { "epoch": 0.30644299604658615, "grad_norm": 3.4869906902313232, "learning_rate": 8.820432828491542e-06, "loss": 0.7998, "step": 4302 }, { "epoch": 0.3065142287281405, "grad_norm": 4.167101860046387, "learning_rate": 8.816532368988758e-06, "loss": 0.94, "step": 4303 }, { "epoch": 0.3065854614096948, "grad_norm": 6.634154796600342, "learning_rate": 8.812632092082846e-06, "loss": 1.0642, "step": 4304 }, { "epoch": 0.30665669409124907, "grad_norm": 3.9220054149627686, "learning_rate": 8.808731998375572e-06, "loss": 0.7417, "step": 4305 }, { "epoch": 0.30672792677280336, "grad_norm": 3.577909469604492, "learning_rate": 8.804832088468685e-06, "loss": 0.5755, "step": 4306 }, { "epoch": 0.30679915945435765, "grad_norm": 4.973033905029297, "learning_rate": 8.800932362963896e-06, "loss": 0.3755, "step": 4307 }, { "epoch": 0.30687039213591194, "grad_norm": 1.9983112812042236, "learning_rate": 8.7970328224629e-06, "loss": 0.1827, "step": 4308 }, { "epoch": 0.30694162481746623, "grad_norm": 3.3571298122406006, "learning_rate": 8.793133467567346e-06, "loss": 0.5166, "step": 4309 }, { "epoch": 0.3070128574990206, "grad_norm": 3.3155674934387207, "learning_rate": 8.78923429887887e-06, "loss": 0.4796, "step": 4310 }, { "epoch": 0.30708409018057486, "grad_norm": 4.0278120040893555, "learning_rate": 8.785335316999078e-06, "loss": 0.7429, "step": 4311 }, { "epoch": 0.30715532286212915, "grad_norm": 5.494455337524414, "learning_rate": 8.781436522529537e-06, "loss": 0.3249, "step": 4312 }, { "epoch": 0.30722655554368344, "grad_norm": 3.5573971271514893, "learning_rate": 8.777537916071787e-06, "loss": 0.7431, "step": 4313 }, { "epoch": 0.30729778822523773, "grad_norm": 2.1015360355377197, "learning_rate": 8.773639498227355e-06, "loss": 0.3039, "step": 4314 }, { "epoch": 0.307369020906792, "grad_norm": 2.864010810852051, "learning_rate": 8.769741269597713e-06, "loss": 0.537, "step": 4315 }, { "epoch": 0.3074402535883463, "grad_norm": 3.660888671875, "learning_rate": 8.765843230784324e-06, "loss": 0.4762, "step": 4316 }, { "epoch": 0.30751148626990066, "grad_norm": 3.155728816986084, "learning_rate": 8.761945382388619e-06, "loss": 0.7367, "step": 4317 }, { "epoch": 0.30758271895145495, "grad_norm": 3.0535717010498047, "learning_rate": 8.758047725011988e-06, "loss": 0.4165, "step": 4318 }, { "epoch": 0.30765395163300924, "grad_norm": 2.8847055435180664, "learning_rate": 8.754150259255807e-06, "loss": 0.5853, "step": 4319 }, { "epoch": 0.3077251843145635, "grad_norm": 2.4009904861450195, "learning_rate": 8.75025298572141e-06, "loss": 0.291, "step": 4320 }, { "epoch": 0.3077964169961178, "grad_norm": 5.743163108825684, "learning_rate": 8.746355905010108e-06, "loss": 0.4365, "step": 4321 }, { "epoch": 0.3078676496776721, "grad_norm": 2.930565118789673, "learning_rate": 8.742459017723176e-06, "loss": 0.6367, "step": 4322 }, { "epoch": 0.3079388823592264, "grad_norm": 3.3156073093414307, "learning_rate": 8.738562324461873e-06, "loss": 0.1763, "step": 4323 }, { "epoch": 0.3080101150407807, "grad_norm": 3.4233405590057373, "learning_rate": 8.734665825827408e-06, "loss": 0.7505, "step": 4324 }, { "epoch": 0.30808134772233503, "grad_norm": 3.2908341884613037, "learning_rate": 8.730769522420978e-06, "loss": 0.9244, "step": 4325 }, { "epoch": 0.3081525804038893, "grad_norm": 2.17008638381958, "learning_rate": 8.72687341484374e-06, "loss": 0.2163, "step": 4326 }, { "epoch": 0.3082238130854436, "grad_norm": 3.7256693840026855, "learning_rate": 8.722977503696824e-06, "loss": 0.5983, "step": 4327 }, { "epoch": 0.3082950457669979, "grad_norm": 2.8390135765075684, "learning_rate": 8.719081789581329e-06, "loss": 0.5715, "step": 4328 }, { "epoch": 0.3083662784485522, "grad_norm": 2.892637014389038, "learning_rate": 8.715186273098319e-06, "loss": 0.559, "step": 4329 }, { "epoch": 0.3084375111301065, "grad_norm": 3.614980459213257, "learning_rate": 8.711290954848842e-06, "loss": 0.4518, "step": 4330 }, { "epoch": 0.30850874381166077, "grad_norm": 2.3450896739959717, "learning_rate": 8.707395835433895e-06, "loss": 0.2146, "step": 4331 }, { "epoch": 0.3085799764932151, "grad_norm": 2.527146100997925, "learning_rate": 8.703500915454458e-06, "loss": 0.5268, "step": 4332 }, { "epoch": 0.3086512091747694, "grad_norm": 2.843637466430664, "learning_rate": 8.699606195511484e-06, "loss": 0.3483, "step": 4333 }, { "epoch": 0.3087224418563237, "grad_norm": 1.9623680114746094, "learning_rate": 8.69571167620588e-06, "loss": 0.2089, "step": 4334 }, { "epoch": 0.308793674537878, "grad_norm": 2.9874658584594727, "learning_rate": 8.691817358138532e-06, "loss": 0.3427, "step": 4335 }, { "epoch": 0.30886490721943227, "grad_norm": 3.9050164222717285, "learning_rate": 8.687923241910297e-06, "loss": 0.4223, "step": 4336 }, { "epoch": 0.30893613990098656, "grad_norm": 3.972507953643799, "learning_rate": 8.68402932812199e-06, "loss": 0.5613, "step": 4337 }, { "epoch": 0.30900737258254085, "grad_norm": 1.8221428394317627, "learning_rate": 8.680135617374406e-06, "loss": 0.1856, "step": 4338 }, { "epoch": 0.30907860526409514, "grad_norm": 3.954200267791748, "learning_rate": 8.676242110268308e-06, "loss": 0.4775, "step": 4339 }, { "epoch": 0.3091498379456495, "grad_norm": 3.850858449935913, "learning_rate": 8.672348807404416e-06, "loss": 0.7816, "step": 4340 }, { "epoch": 0.3092210706272038, "grad_norm": 5.372876167297363, "learning_rate": 8.668455709383433e-06, "loss": 0.4018, "step": 4341 }, { "epoch": 0.30929230330875807, "grad_norm": 4.272623538970947, "learning_rate": 8.664562816806022e-06, "loss": 0.4583, "step": 4342 }, { "epoch": 0.30936353599031235, "grad_norm": 3.26196551322937, "learning_rate": 8.660670130272816e-06, "loss": 0.5534, "step": 4343 }, { "epoch": 0.30943476867186664, "grad_norm": 2.6067628860473633, "learning_rate": 8.656777650384415e-06, "loss": 0.3647, "step": 4344 }, { "epoch": 0.30950600135342093, "grad_norm": 2.8704769611358643, "learning_rate": 8.652885377741394e-06, "loss": 0.5823, "step": 4345 }, { "epoch": 0.3095772340349752, "grad_norm": 2.750764846801758, "learning_rate": 8.648993312944282e-06, "loss": 0.5034, "step": 4346 }, { "epoch": 0.30964846671652957, "grad_norm": 3.1148273944854736, "learning_rate": 8.645101456593589e-06, "loss": 0.6934, "step": 4347 }, { "epoch": 0.30971969939808386, "grad_norm": 2.6871626377105713, "learning_rate": 8.641209809289792e-06, "loss": 0.2252, "step": 4348 }, { "epoch": 0.30979093207963815, "grad_norm": 3.621628522872925, "learning_rate": 8.637318371633326e-06, "loss": 0.495, "step": 4349 }, { "epoch": 0.30986216476119244, "grad_norm": 2.5049853324890137, "learning_rate": 8.633427144224603e-06, "loss": 0.5087, "step": 4350 }, { "epoch": 0.3099333974427467, "grad_norm": 3.9354910850524902, "learning_rate": 8.629536127664002e-06, "loss": 0.4605, "step": 4351 }, { "epoch": 0.310004630124301, "grad_norm": 3.864823818206787, "learning_rate": 8.625645322551858e-06, "loss": 0.6788, "step": 4352 }, { "epoch": 0.3100758628058553, "grad_norm": 2.8699984550476074, "learning_rate": 8.621754729488488e-06, "loss": 0.3906, "step": 4353 }, { "epoch": 0.3101470954874096, "grad_norm": 3.6552860736846924, "learning_rate": 8.617864349074176e-06, "loss": 0.6528, "step": 4354 }, { "epoch": 0.31021832816896394, "grad_norm": 2.7592854499816895, "learning_rate": 8.613974181909155e-06, "loss": 0.5664, "step": 4355 }, { "epoch": 0.31028956085051823, "grad_norm": 2.4188621044158936, "learning_rate": 8.610084228593649e-06, "loss": 0.4558, "step": 4356 }, { "epoch": 0.3103607935320725, "grad_norm": 2.075303554534912, "learning_rate": 8.60619448972783e-06, "loss": 0.2153, "step": 4357 }, { "epoch": 0.3104320262136268, "grad_norm": 4.724642276763916, "learning_rate": 8.602304965911851e-06, "loss": 0.8414, "step": 4358 }, { "epoch": 0.3105032588951811, "grad_norm": 2.2997043132781982, "learning_rate": 8.598415657745819e-06, "loss": 0.2479, "step": 4359 }, { "epoch": 0.3105744915767354, "grad_norm": 2.76338791847229, "learning_rate": 8.59452656582982e-06, "loss": 0.5045, "step": 4360 }, { "epoch": 0.3106457242582897, "grad_norm": 2.7777702808380127, "learning_rate": 8.590637690763896e-06, "loss": 0.7634, "step": 4361 }, { "epoch": 0.310716956939844, "grad_norm": 3.6921584606170654, "learning_rate": 8.586749033148063e-06, "loss": 0.3805, "step": 4362 }, { "epoch": 0.3107881896213983, "grad_norm": 3.760831832885742, "learning_rate": 8.582860593582301e-06, "loss": 0.4976, "step": 4363 }, { "epoch": 0.3108594223029526, "grad_norm": 2.332559108734131, "learning_rate": 8.578972372666557e-06, "loss": 0.2321, "step": 4364 }, { "epoch": 0.3109306549845069, "grad_norm": 2.484287738800049, "learning_rate": 8.57508437100074e-06, "loss": 0.4344, "step": 4365 }, { "epoch": 0.3110018876660612, "grad_norm": 3.3389015197753906, "learning_rate": 8.571196589184732e-06, "loss": 0.3611, "step": 4366 }, { "epoch": 0.3110731203476155, "grad_norm": 3.2353460788726807, "learning_rate": 8.56730902781838e-06, "loss": 0.8007, "step": 4367 }, { "epoch": 0.31114435302916976, "grad_norm": 5.115993022918701, "learning_rate": 8.563421687501485e-06, "loss": 0.6583, "step": 4368 }, { "epoch": 0.3112155857107241, "grad_norm": 7.229234218597412, "learning_rate": 8.559534568833832e-06, "loss": 0.3066, "step": 4369 }, { "epoch": 0.3112868183922784, "grad_norm": 6.541014194488525, "learning_rate": 8.555647672415162e-06, "loss": 0.5981, "step": 4370 }, { "epoch": 0.3113580510738327, "grad_norm": 4.501908779144287, "learning_rate": 8.55176099884518e-06, "loss": 0.7477, "step": 4371 }, { "epoch": 0.311429283755387, "grad_norm": 2.5065789222717285, "learning_rate": 8.547874548723565e-06, "loss": 0.6611, "step": 4372 }, { "epoch": 0.31150051643694127, "grad_norm": 5.1555914878845215, "learning_rate": 8.543988322649954e-06, "loss": 0.983, "step": 4373 }, { "epoch": 0.31157174911849556, "grad_norm": 2.9244301319122314, "learning_rate": 8.540102321223947e-06, "loss": 0.561, "step": 4374 }, { "epoch": 0.31164298180004985, "grad_norm": 4.186335563659668, "learning_rate": 8.536216545045117e-06, "loss": 0.7613, "step": 4375 }, { "epoch": 0.31171421448160413, "grad_norm": 3.199697971343994, "learning_rate": 8.532330994713006e-06, "loss": 0.636, "step": 4376 }, { "epoch": 0.3117854471631585, "grad_norm": 2.045879364013672, "learning_rate": 8.528445670827103e-06, "loss": 0.2349, "step": 4377 }, { "epoch": 0.31185667984471277, "grad_norm": 4.107431888580322, "learning_rate": 8.52456057398688e-06, "loss": 0.7413, "step": 4378 }, { "epoch": 0.31192791252626706, "grad_norm": 1.489036202430725, "learning_rate": 8.52067570479177e-06, "loss": 0.3244, "step": 4379 }, { "epoch": 0.31199914520782135, "grad_norm": 4.790122985839844, "learning_rate": 8.516791063841161e-06, "loss": 0.5021, "step": 4380 }, { "epoch": 0.31207037788937564, "grad_norm": 2.744105100631714, "learning_rate": 8.512906651734416e-06, "loss": 0.1354, "step": 4381 }, { "epoch": 0.31214161057092993, "grad_norm": 2.57492733001709, "learning_rate": 8.509022469070864e-06, "loss": 0.7096, "step": 4382 }, { "epoch": 0.3122128432524842, "grad_norm": 4.030538558959961, "learning_rate": 8.505138516449786e-06, "loss": 0.6723, "step": 4383 }, { "epoch": 0.31228407593403856, "grad_norm": 2.3406288623809814, "learning_rate": 8.501254794470443e-06, "loss": 0.2123, "step": 4384 }, { "epoch": 0.31235530861559285, "grad_norm": 2.666560173034668, "learning_rate": 8.497371303732054e-06, "loss": 0.1661, "step": 4385 }, { "epoch": 0.31242654129714714, "grad_norm": 2.773963451385498, "learning_rate": 8.493488044833796e-06, "loss": 0.4051, "step": 4386 }, { "epoch": 0.31249777397870143, "grad_norm": 2.573760509490967, "learning_rate": 8.48960501837482e-06, "loss": 0.4835, "step": 4387 }, { "epoch": 0.3125690066602557, "grad_norm": 4.401871204376221, "learning_rate": 8.485722224954237e-06, "loss": 0.8127, "step": 4388 }, { "epoch": 0.31264023934181, "grad_norm": 3.291414737701416, "learning_rate": 8.481839665171117e-06, "loss": 0.634, "step": 4389 }, { "epoch": 0.3127114720233643, "grad_norm": 2.2779970169067383, "learning_rate": 8.477957339624502e-06, "loss": 0.138, "step": 4390 }, { "epoch": 0.3127827047049186, "grad_norm": 2.2870450019836426, "learning_rate": 8.4740752489134e-06, "loss": 0.5013, "step": 4391 }, { "epoch": 0.31285393738647294, "grad_norm": 2.9726130962371826, "learning_rate": 8.47019339363677e-06, "loss": 0.586, "step": 4392 }, { "epoch": 0.3129251700680272, "grad_norm": 2.6184306144714355, "learning_rate": 8.466311774393544e-06, "loss": 0.5438, "step": 4393 }, { "epoch": 0.3129964027495815, "grad_norm": 3.2587482929229736, "learning_rate": 8.462430391782622e-06, "loss": 0.5315, "step": 4394 }, { "epoch": 0.3130676354311358, "grad_norm": 2.489175319671631, "learning_rate": 8.458549246402854e-06, "loss": 0.5304, "step": 4395 }, { "epoch": 0.3131388681126901, "grad_norm": 3.6700615882873535, "learning_rate": 8.454668338853062e-06, "loss": 0.3323, "step": 4396 }, { "epoch": 0.3132101007942444, "grad_norm": 0.7506953477859497, "learning_rate": 8.450787669732036e-06, "loss": 0.0386, "step": 4397 }, { "epoch": 0.3132813334757987, "grad_norm": 3.6840474605560303, "learning_rate": 8.446907239638514e-06, "loss": 0.6268, "step": 4398 }, { "epoch": 0.313352566157353, "grad_norm": 3.6493093967437744, "learning_rate": 8.44302704917121e-06, "loss": 0.3824, "step": 4399 }, { "epoch": 0.3134237988389073, "grad_norm": 2.600205183029175, "learning_rate": 8.439147098928805e-06, "loss": 0.122, "step": 4400 }, { "epoch": 0.3134950315204616, "grad_norm": 5.963331699371338, "learning_rate": 8.435267389509924e-06, "loss": 0.6551, "step": 4401 }, { "epoch": 0.3135662642020159, "grad_norm": 2.776197910308838, "learning_rate": 8.431387921513172e-06, "loss": 0.2669, "step": 4402 }, { "epoch": 0.3136374968835702, "grad_norm": 4.269049644470215, "learning_rate": 8.42750869553711e-06, "loss": 0.3746, "step": 4403 }, { "epoch": 0.31370872956512447, "grad_norm": 3.32574462890625, "learning_rate": 8.423629712180265e-06, "loss": 0.5968, "step": 4404 }, { "epoch": 0.31377996224667876, "grad_norm": 2.749732255935669, "learning_rate": 8.419750972041119e-06, "loss": 0.4035, "step": 4405 }, { "epoch": 0.31385119492823305, "grad_norm": 4.92340087890625, "learning_rate": 8.415872475718125e-06, "loss": 0.3812, "step": 4406 }, { "epoch": 0.3139224276097874, "grad_norm": 2.4102323055267334, "learning_rate": 8.411994223809698e-06, "loss": 0.5628, "step": 4407 }, { "epoch": 0.3139936602913417, "grad_norm": 4.5714921951293945, "learning_rate": 8.408116216914205e-06, "loss": 0.6251, "step": 4408 }, { "epoch": 0.31406489297289597, "grad_norm": 3.4814178943634033, "learning_rate": 8.404238455629989e-06, "loss": 0.6529, "step": 4409 }, { "epoch": 0.31413612565445026, "grad_norm": 3.4257383346557617, "learning_rate": 8.400360940555348e-06, "loss": 0.4147, "step": 4410 }, { "epoch": 0.31420735833600455, "grad_norm": 2.700793981552124, "learning_rate": 8.396483672288536e-06, "loss": 0.1153, "step": 4411 }, { "epoch": 0.31427859101755884, "grad_norm": 3.3001439571380615, "learning_rate": 8.392606651427781e-06, "loss": 0.475, "step": 4412 }, { "epoch": 0.31434982369911313, "grad_norm": 2.3875627517700195, "learning_rate": 8.38872987857127e-06, "loss": 0.5932, "step": 4413 }, { "epoch": 0.3144210563806675, "grad_norm": 4.137585639953613, "learning_rate": 8.384853354317141e-06, "loss": 0.5182, "step": 4414 }, { "epoch": 0.31449228906222176, "grad_norm": 4.447866439819336, "learning_rate": 8.380977079263509e-06, "loss": 0.8212, "step": 4415 }, { "epoch": 0.31456352174377605, "grad_norm": 2.7754032611846924, "learning_rate": 8.377101054008445e-06, "loss": 0.307, "step": 4416 }, { "epoch": 0.31463475442533034, "grad_norm": 3.928285598754883, "learning_rate": 8.373225279149972e-06, "loss": 0.6961, "step": 4417 }, { "epoch": 0.31470598710688463, "grad_norm": 2.7293524742126465, "learning_rate": 8.369349755286084e-06, "loss": 0.6783, "step": 4418 }, { "epoch": 0.3147772197884389, "grad_norm": 3.072671890258789, "learning_rate": 8.365474483014741e-06, "loss": 0.5277, "step": 4419 }, { "epoch": 0.3148484524699932, "grad_norm": 2.8192696571350098, "learning_rate": 8.36159946293385e-06, "loss": 0.6713, "step": 4420 }, { "epoch": 0.31491968515154756, "grad_norm": 2.830188512802124, "learning_rate": 8.357724695641287e-06, "loss": 0.579, "step": 4421 }, { "epoch": 0.31499091783310185, "grad_norm": 3.408909797668457, "learning_rate": 8.353850181734898e-06, "loss": 0.3048, "step": 4422 }, { "epoch": 0.31506215051465614, "grad_norm": 3.6482813358306885, "learning_rate": 8.349975921812468e-06, "loss": 0.6855, "step": 4423 }, { "epoch": 0.3151333831962104, "grad_norm": 2.8937413692474365, "learning_rate": 8.346101916471764e-06, "loss": 0.5801, "step": 4424 }, { "epoch": 0.3152046158777647, "grad_norm": 3.733053207397461, "learning_rate": 8.342228166310502e-06, "loss": 0.3779, "step": 4425 }, { "epoch": 0.315275848559319, "grad_norm": 5.714366912841797, "learning_rate": 8.338354671926364e-06, "loss": 0.4348, "step": 4426 }, { "epoch": 0.3153470812408733, "grad_norm": 2.509014368057251, "learning_rate": 8.334481433916984e-06, "loss": 0.3126, "step": 4427 }, { "epoch": 0.3154183139224276, "grad_norm": 3.8814613819122314, "learning_rate": 8.330608452879972e-06, "loss": 0.6278, "step": 4428 }, { "epoch": 0.31548954660398193, "grad_norm": 1.7270816564559937, "learning_rate": 8.32673572941288e-06, "loss": 0.2556, "step": 4429 }, { "epoch": 0.3155607792855362, "grad_norm": 2.5820960998535156, "learning_rate": 8.322863264113235e-06, "loss": 0.4628, "step": 4430 }, { "epoch": 0.3156320119670905, "grad_norm": 3.401972532272339, "learning_rate": 8.31899105757852e-06, "loss": 0.8946, "step": 4431 }, { "epoch": 0.3157032446486448, "grad_norm": 3.9253885746002197, "learning_rate": 8.315119110406172e-06, "loss": 0.5319, "step": 4432 }, { "epoch": 0.3157744773301991, "grad_norm": 4.602275371551514, "learning_rate": 8.311247423193594e-06, "loss": 0.8001, "step": 4433 }, { "epoch": 0.3158457100117534, "grad_norm": 2.9514331817626953, "learning_rate": 8.30737599653815e-06, "loss": 0.2414, "step": 4434 }, { "epoch": 0.31591694269330767, "grad_norm": 3.3624184131622314, "learning_rate": 8.303504831037154e-06, "loss": 0.0718, "step": 4435 }, { "epoch": 0.315988175374862, "grad_norm": 2.628964900970459, "learning_rate": 8.299633927287894e-06, "loss": 0.315, "step": 4436 }, { "epoch": 0.3160594080564163, "grad_norm": 1.6750332117080688, "learning_rate": 8.295763285887613e-06, "loss": 0.0947, "step": 4437 }, { "epoch": 0.3161306407379706, "grad_norm": 3.8497378826141357, "learning_rate": 8.2918929074335e-06, "loss": 0.6111, "step": 4438 }, { "epoch": 0.3162018734195249, "grad_norm": 2.3965680599212646, "learning_rate": 8.288022792522726e-06, "loss": 0.5426, "step": 4439 }, { "epoch": 0.31627310610107917, "grad_norm": 4.0932722091674805, "learning_rate": 8.284152941752403e-06, "loss": 0.6243, "step": 4440 }, { "epoch": 0.31634433878263346, "grad_norm": 5.200390338897705, "learning_rate": 8.280283355719614e-06, "loss": 0.7271, "step": 4441 }, { "epoch": 0.31641557146418775, "grad_norm": 2.275804281234741, "learning_rate": 8.276414035021391e-06, "loss": 0.1584, "step": 4442 }, { "epoch": 0.31648680414574204, "grad_norm": 2.261564254760742, "learning_rate": 8.272544980254731e-06, "loss": 0.2835, "step": 4443 }, { "epoch": 0.3165580368272964, "grad_norm": 2.1738851070404053, "learning_rate": 8.268676192016598e-06, "loss": 0.5394, "step": 4444 }, { "epoch": 0.3166292695088507, "grad_norm": 3.636981725692749, "learning_rate": 8.264807670903891e-06, "loss": 0.7477, "step": 4445 }, { "epoch": 0.31670050219040496, "grad_norm": 4.05054235458374, "learning_rate": 8.260939417513498e-06, "loss": 0.745, "step": 4446 }, { "epoch": 0.31677173487195925, "grad_norm": 3.1484787464141846, "learning_rate": 8.25707143244224e-06, "loss": 0.3281, "step": 4447 }, { "epoch": 0.31684296755351354, "grad_norm": 2.506338357925415, "learning_rate": 8.253203716286914e-06, "loss": 0.2846, "step": 4448 }, { "epoch": 0.31691420023506783, "grad_norm": 2.0730350017547607, "learning_rate": 8.249336269644264e-06, "loss": 0.3353, "step": 4449 }, { "epoch": 0.3169854329166221, "grad_norm": 3.939392328262329, "learning_rate": 8.245469093111002e-06, "loss": 0.6692, "step": 4450 }, { "epoch": 0.31705666559817647, "grad_norm": 2.979550838470459, "learning_rate": 8.241602187283789e-06, "loss": 0.4395, "step": 4451 }, { "epoch": 0.31712789827973076, "grad_norm": 2.324251174926758, "learning_rate": 8.237735552759247e-06, "loss": 0.2037, "step": 4452 }, { "epoch": 0.31719913096128505, "grad_norm": 2.8187148571014404, "learning_rate": 8.233869190133968e-06, "loss": 0.3709, "step": 4453 }, { "epoch": 0.31727036364283934, "grad_norm": 4.807701587677002, "learning_rate": 8.230003100004481e-06, "loss": 0.3476, "step": 4454 }, { "epoch": 0.3173415963243936, "grad_norm": 2.4146344661712646, "learning_rate": 8.226137282967289e-06, "loss": 0.1372, "step": 4455 }, { "epoch": 0.3174128290059479, "grad_norm": 2.2070720195770264, "learning_rate": 8.222271739618851e-06, "loss": 0.3128, "step": 4456 }, { "epoch": 0.3174840616875022, "grad_norm": 1.4466958045959473, "learning_rate": 8.218406470555571e-06, "loss": 0.2105, "step": 4457 }, { "epoch": 0.3175552943690565, "grad_norm": 3.713785409927368, "learning_rate": 8.214541476373824e-06, "loss": 0.615, "step": 4458 }, { "epoch": 0.31762652705061084, "grad_norm": 3.0894675254821777, "learning_rate": 8.210676757669948e-06, "loss": 0.4278, "step": 4459 }, { "epoch": 0.31769775973216513, "grad_norm": 3.8719382286071777, "learning_rate": 8.206812315040215e-06, "loss": 0.7386, "step": 4460 }, { "epoch": 0.3177689924137194, "grad_norm": 2.9212899208068848, "learning_rate": 8.20294814908088e-06, "loss": 0.5934, "step": 4461 }, { "epoch": 0.3178402250952737, "grad_norm": 3.402566909790039, "learning_rate": 8.199084260388139e-06, "loss": 0.5917, "step": 4462 }, { "epoch": 0.317911457776828, "grad_norm": 4.302872180938721, "learning_rate": 8.19522064955815e-06, "loss": 0.4772, "step": 4463 }, { "epoch": 0.3179826904583823, "grad_norm": 3.3145859241485596, "learning_rate": 8.191357317187028e-06, "loss": 0.3567, "step": 4464 }, { "epoch": 0.3180539231399366, "grad_norm": 3.119938611984253, "learning_rate": 8.18749426387085e-06, "loss": 0.6256, "step": 4465 }, { "epoch": 0.3181251558214909, "grad_norm": 1.7869086265563965, "learning_rate": 8.183631490205636e-06, "loss": 0.3056, "step": 4466 }, { "epoch": 0.3181963885030452, "grad_norm": 5.733473300933838, "learning_rate": 8.179768996787381e-06, "loss": 0.7241, "step": 4467 }, { "epoch": 0.3182676211845995, "grad_norm": 3.6154534816741943, "learning_rate": 8.175906784212028e-06, "loss": 0.797, "step": 4468 }, { "epoch": 0.3183388538661538, "grad_norm": 2.6774485111236572, "learning_rate": 8.17204485307547e-06, "loss": 0.2612, "step": 4469 }, { "epoch": 0.3184100865477081, "grad_norm": 2.9226455688476562, "learning_rate": 8.168183203973568e-06, "loss": 0.2085, "step": 4470 }, { "epoch": 0.31848131922926237, "grad_norm": 2.1213719844818115, "learning_rate": 8.164321837502136e-06, "loss": 0.3904, "step": 4471 }, { "epoch": 0.31855255191081666, "grad_norm": 3.699047565460205, "learning_rate": 8.160460754256937e-06, "loss": 0.69, "step": 4472 }, { "epoch": 0.318623784592371, "grad_norm": 2.9604880809783936, "learning_rate": 8.156599954833699e-06, "loss": 0.5845, "step": 4473 }, { "epoch": 0.3186950172739253, "grad_norm": 3.7761027812957764, "learning_rate": 8.15273943982811e-06, "loss": 0.9336, "step": 4474 }, { "epoch": 0.3187662499554796, "grad_norm": 3.9788568019866943, "learning_rate": 8.148879209835797e-06, "loss": 0.6471, "step": 4475 }, { "epoch": 0.3188374826370339, "grad_norm": 3.8471152782440186, "learning_rate": 8.145019265452361e-06, "loss": 0.5217, "step": 4476 }, { "epoch": 0.31890871531858817, "grad_norm": 1.9961928129196167, "learning_rate": 8.141159607273352e-06, "loss": 0.2338, "step": 4477 }, { "epoch": 0.31897994800014245, "grad_norm": 2.6634116172790527, "learning_rate": 8.13730023589427e-06, "loss": 0.5327, "step": 4478 }, { "epoch": 0.31905118068169674, "grad_norm": 3.2887463569641113, "learning_rate": 8.13344115191058e-06, "loss": 0.6676, "step": 4479 }, { "epoch": 0.31912241336325103, "grad_norm": 2.1521730422973633, "learning_rate": 8.129582355917698e-06, "loss": 0.3967, "step": 4480 }, { "epoch": 0.3191936460448054, "grad_norm": 2.7322535514831543, "learning_rate": 8.125723848511e-06, "loss": 0.2166, "step": 4481 }, { "epoch": 0.31926487872635967, "grad_norm": 3.1086537837982178, "learning_rate": 8.121865630285809e-06, "loss": 0.5537, "step": 4482 }, { "epoch": 0.31933611140791396, "grad_norm": 2.3792033195495605, "learning_rate": 8.118007701837409e-06, "loss": 0.5259, "step": 4483 }, { "epoch": 0.31940734408946825, "grad_norm": 4.470916271209717, "learning_rate": 8.114150063761041e-06, "loss": 0.6563, "step": 4484 }, { "epoch": 0.31947857677102254, "grad_norm": 3.0836472511291504, "learning_rate": 8.110292716651899e-06, "loss": 0.3809, "step": 4485 }, { "epoch": 0.3195498094525768, "grad_norm": 2.4211645126342773, "learning_rate": 8.106435661105127e-06, "loss": 0.6181, "step": 4486 }, { "epoch": 0.3196210421341311, "grad_norm": 2.940680503845215, "learning_rate": 8.102578897715839e-06, "loss": 0.5881, "step": 4487 }, { "epoch": 0.31969227481568546, "grad_norm": 2.457775592803955, "learning_rate": 8.098722427079082e-06, "loss": 0.5124, "step": 4488 }, { "epoch": 0.31976350749723975, "grad_norm": 3.705986499786377, "learning_rate": 8.094866249789874e-06, "loss": 0.3843, "step": 4489 }, { "epoch": 0.31983474017879404, "grad_norm": 3.8257763385772705, "learning_rate": 8.091010366443189e-06, "loss": 0.6262, "step": 4490 }, { "epoch": 0.31990597286034833, "grad_norm": 3.2959768772125244, "learning_rate": 8.087154777633942e-06, "loss": 0.4607, "step": 4491 }, { "epoch": 0.3199772055419026, "grad_norm": 2.0767459869384766, "learning_rate": 8.083299483957016e-06, "loss": 0.1657, "step": 4492 }, { "epoch": 0.3200484382234569, "grad_norm": 3.225154161453247, "learning_rate": 8.079444486007244e-06, "loss": 0.5528, "step": 4493 }, { "epoch": 0.3201196709050112, "grad_norm": 2.881007194519043, "learning_rate": 8.075589784379407e-06, "loss": 0.3944, "step": 4494 }, { "epoch": 0.3201909035865655, "grad_norm": 1.881653904914856, "learning_rate": 8.071735379668246e-06, "loss": 0.3294, "step": 4495 }, { "epoch": 0.32026213626811983, "grad_norm": 2.6200625896453857, "learning_rate": 8.067881272468465e-06, "loss": 0.3137, "step": 4496 }, { "epoch": 0.3203333689496741, "grad_norm": 2.064474582672119, "learning_rate": 8.064027463374702e-06, "loss": 0.2364, "step": 4497 }, { "epoch": 0.3204046016312284, "grad_norm": 3.235304355621338, "learning_rate": 8.060173952981565e-06, "loss": 0.551, "step": 4498 }, { "epoch": 0.3204758343127827, "grad_norm": 2.450329065322876, "learning_rate": 8.056320741883613e-06, "loss": 0.2824, "step": 4499 }, { "epoch": 0.320547066994337, "grad_norm": 3.1044278144836426, "learning_rate": 8.052467830675353e-06, "loss": 0.7086, "step": 4500 }, { "epoch": 0.3206182996758913, "grad_norm": 4.297508239746094, "learning_rate": 8.04861521995125e-06, "loss": 0.6575, "step": 4501 }, { "epoch": 0.3206895323574456, "grad_norm": 3.545212984085083, "learning_rate": 8.044762910305726e-06, "loss": 0.7571, "step": 4502 }, { "epoch": 0.3207607650389999, "grad_norm": 5.9295830726623535, "learning_rate": 8.040910902333149e-06, "loss": 0.559, "step": 4503 }, { "epoch": 0.3208319977205542, "grad_norm": 3.588926076889038, "learning_rate": 8.03705919662784e-06, "loss": 0.6984, "step": 4504 }, { "epoch": 0.3209032304021085, "grad_norm": 2.7164862155914307, "learning_rate": 8.033207793784091e-06, "loss": 0.6762, "step": 4505 }, { "epoch": 0.3209744630836628, "grad_norm": 2.8593807220458984, "learning_rate": 8.02935669439612e-06, "loss": 0.236, "step": 4506 }, { "epoch": 0.3210456957652171, "grad_norm": 3.242243528366089, "learning_rate": 8.025505899058119e-06, "loss": 0.3652, "step": 4507 }, { "epoch": 0.32111692844677137, "grad_norm": 2.851223945617676, "learning_rate": 8.021655408364227e-06, "loss": 0.5216, "step": 4508 }, { "epoch": 0.32118816112832566, "grad_norm": 3.1122169494628906, "learning_rate": 8.017805222908528e-06, "loss": 0.3615, "step": 4509 }, { "epoch": 0.32125939380988, "grad_norm": 3.517717123031616, "learning_rate": 8.01395534328507e-06, "loss": 0.4183, "step": 4510 }, { "epoch": 0.3213306264914343, "grad_norm": 2.393929958343506, "learning_rate": 8.010105770087854e-06, "loss": 0.525, "step": 4511 }, { "epoch": 0.3214018591729886, "grad_norm": 4.979245185852051, "learning_rate": 8.006256503910823e-06, "loss": 0.7319, "step": 4512 }, { "epoch": 0.32147309185454287, "grad_norm": 3.8899667263031006, "learning_rate": 8.002407545347881e-06, "loss": 0.2945, "step": 4513 }, { "epoch": 0.32154432453609716, "grad_norm": 2.365586280822754, "learning_rate": 7.998558894992888e-06, "loss": 0.3564, "step": 4514 }, { "epoch": 0.32161555721765145, "grad_norm": 4.2528300285339355, "learning_rate": 7.994710553439646e-06, "loss": 0.5788, "step": 4515 }, { "epoch": 0.32168678989920574, "grad_norm": 2.933894634246826, "learning_rate": 7.99086252128191e-06, "loss": 0.881, "step": 4516 }, { "epoch": 0.32175802258076003, "grad_norm": 2.5104267597198486, "learning_rate": 7.987014799113398e-06, "loss": 0.432, "step": 4517 }, { "epoch": 0.3218292552623144, "grad_norm": 4.739585876464844, "learning_rate": 7.983167387527778e-06, "loss": 0.779, "step": 4518 }, { "epoch": 0.32190048794386866, "grad_norm": 2.640138626098633, "learning_rate": 7.979320287118656e-06, "loss": 0.3971, "step": 4519 }, { "epoch": 0.32197172062542295, "grad_norm": 3.054812431335449, "learning_rate": 7.975473498479607e-06, "loss": 0.3405, "step": 4520 }, { "epoch": 0.32204295330697724, "grad_norm": 2.281710147857666, "learning_rate": 7.971627022204148e-06, "loss": 0.4895, "step": 4521 }, { "epoch": 0.32211418598853153, "grad_norm": 2.9100067615509033, "learning_rate": 7.967780858885753e-06, "loss": 0.4195, "step": 4522 }, { "epoch": 0.3221854186700858, "grad_norm": 4.047180652618408, "learning_rate": 7.963935009117838e-06, "loss": 0.1587, "step": 4523 }, { "epoch": 0.3222566513516401, "grad_norm": 2.785841941833496, "learning_rate": 7.960089473493791e-06, "loss": 0.776, "step": 4524 }, { "epoch": 0.32232788403319446, "grad_norm": 3.47473406791687, "learning_rate": 7.956244252606926e-06, "loss": 0.6557, "step": 4525 }, { "epoch": 0.32239911671474875, "grad_norm": 2.132091760635376, "learning_rate": 7.952399347050526e-06, "loss": 0.1952, "step": 4526 }, { "epoch": 0.32247034939630304, "grad_norm": 8.148548126220703, "learning_rate": 7.948554757417825e-06, "loss": 0.6685, "step": 4527 }, { "epoch": 0.3225415820778573, "grad_norm": 3.5397679805755615, "learning_rate": 7.944710484301995e-06, "loss": 0.6919, "step": 4528 }, { "epoch": 0.3226128147594116, "grad_norm": 3.5699567794799805, "learning_rate": 7.940866528296175e-06, "loss": 0.6959, "step": 4529 }, { "epoch": 0.3226840474409659, "grad_norm": 2.644759178161621, "learning_rate": 7.937022889993444e-06, "loss": 0.2857, "step": 4530 }, { "epoch": 0.3227552801225202, "grad_norm": 2.7537612915039062, "learning_rate": 7.933179569986834e-06, "loss": 0.5026, "step": 4531 }, { "epoch": 0.3228265128040745, "grad_norm": 2.5867912769317627, "learning_rate": 7.929336568869332e-06, "loss": 0.6441, "step": 4532 }, { "epoch": 0.32289774548562883, "grad_norm": 3.075565814971924, "learning_rate": 7.92549388723388e-06, "loss": 0.6591, "step": 4533 }, { "epoch": 0.3229689781671831, "grad_norm": 3.4691169261932373, "learning_rate": 7.92165152567335e-06, "loss": 0.4089, "step": 4534 }, { "epoch": 0.3230402108487374, "grad_norm": 4.3929619789123535, "learning_rate": 7.91780948478059e-06, "loss": 0.6102, "step": 4535 }, { "epoch": 0.3231114435302917, "grad_norm": 3.8183257579803467, "learning_rate": 7.913967765148386e-06, "loss": 0.784, "step": 4536 }, { "epoch": 0.323182676211846, "grad_norm": 3.6905879974365234, "learning_rate": 7.910126367369474e-06, "loss": 0.3524, "step": 4537 }, { "epoch": 0.3232539088934003, "grad_norm": 5.592245101928711, "learning_rate": 7.906285292036538e-06, "loss": 0.2589, "step": 4538 }, { "epoch": 0.32332514157495457, "grad_norm": 3.5177814960479736, "learning_rate": 7.902444539742224e-06, "loss": 0.7367, "step": 4539 }, { "epoch": 0.3233963742565089, "grad_norm": 6.986121654510498, "learning_rate": 7.898604111079115e-06, "loss": 0.1526, "step": 4540 }, { "epoch": 0.3234676069380632, "grad_norm": 5.070945739746094, "learning_rate": 7.89476400663975e-06, "loss": 0.7114, "step": 4541 }, { "epoch": 0.3235388396196175, "grad_norm": 2.7060396671295166, "learning_rate": 7.890924227016624e-06, "loss": 0.3718, "step": 4542 }, { "epoch": 0.3236100723011718, "grad_norm": 3.456747055053711, "learning_rate": 7.887084772802165e-06, "loss": 0.3377, "step": 4543 }, { "epoch": 0.32368130498272607, "grad_norm": 3.9936141967773438, "learning_rate": 7.88324564458877e-06, "loss": 0.3136, "step": 4544 }, { "epoch": 0.32375253766428036, "grad_norm": 5.6660895347595215, "learning_rate": 7.879406842968772e-06, "loss": 0.9249, "step": 4545 }, { "epoch": 0.32382377034583465, "grad_norm": 2.350403070449829, "learning_rate": 7.875568368534463e-06, "loss": 0.3062, "step": 4546 }, { "epoch": 0.32389500302738894, "grad_norm": 2.6557421684265137, "learning_rate": 7.871730221878073e-06, "loss": 0.2844, "step": 4547 }, { "epoch": 0.3239662357089433, "grad_norm": 3.822779893875122, "learning_rate": 7.867892403591798e-06, "loss": 0.3122, "step": 4548 }, { "epoch": 0.3240374683904976, "grad_norm": 4.6804280281066895, "learning_rate": 7.864054914267765e-06, "loss": 0.2875, "step": 4549 }, { "epoch": 0.32410870107205186, "grad_norm": 2.595519542694092, "learning_rate": 7.86021775449806e-06, "loss": 0.3329, "step": 4550 }, { "epoch": 0.32417993375360615, "grad_norm": 2.402388095855713, "learning_rate": 7.856380924874726e-06, "loss": 0.2289, "step": 4551 }, { "epoch": 0.32425116643516044, "grad_norm": 2.6725473403930664, "learning_rate": 7.85254442598974e-06, "loss": 0.2148, "step": 4552 }, { "epoch": 0.32432239911671473, "grad_norm": 2.1424524784088135, "learning_rate": 7.848708258435031e-06, "loss": 0.3694, "step": 4553 }, { "epoch": 0.324393631798269, "grad_norm": 4.9008989334106445, "learning_rate": 7.844872422802483e-06, "loss": 0.6466, "step": 4554 }, { "epoch": 0.32446486447982337, "grad_norm": 3.9144139289855957, "learning_rate": 7.841036919683932e-06, "loss": 0.3504, "step": 4555 }, { "epoch": 0.32453609716137766, "grad_norm": 1.9470840692520142, "learning_rate": 7.837201749671146e-06, "loss": 0.1572, "step": 4556 }, { "epoch": 0.32460732984293195, "grad_norm": 3.6906204223632812, "learning_rate": 7.833366913355858e-06, "loss": 0.5351, "step": 4557 }, { "epoch": 0.32467856252448624, "grad_norm": 1.7850043773651123, "learning_rate": 7.829532411329747e-06, "loss": 0.0532, "step": 4558 }, { "epoch": 0.3247497952060405, "grad_norm": 2.333930253982544, "learning_rate": 7.825698244184432e-06, "loss": 0.3129, "step": 4559 }, { "epoch": 0.3248210278875948, "grad_norm": 4.41744327545166, "learning_rate": 7.821864412511485e-06, "loss": 0.7986, "step": 4560 }, { "epoch": 0.3248922605691491, "grad_norm": 2.2313714027404785, "learning_rate": 7.818030916902433e-06, "loss": 0.1922, "step": 4561 }, { "epoch": 0.32496349325070345, "grad_norm": 3.120946168899536, "learning_rate": 7.814197757948734e-06, "loss": 0.5856, "step": 4562 }, { "epoch": 0.32503472593225774, "grad_norm": 3.4811017513275146, "learning_rate": 7.810364936241814e-06, "loss": 0.4534, "step": 4563 }, { "epoch": 0.32510595861381203, "grad_norm": 2.605380058288574, "learning_rate": 7.80653245237304e-06, "loss": 0.578, "step": 4564 }, { "epoch": 0.3251771912953663, "grad_norm": 3.1055521965026855, "learning_rate": 7.802700306933716e-06, "loss": 0.8082, "step": 4565 }, { "epoch": 0.3252484239769206, "grad_norm": 2.0455164909362793, "learning_rate": 7.798868500515106e-06, "loss": 0.2166, "step": 4566 }, { "epoch": 0.3253196566584749, "grad_norm": 5.699638843536377, "learning_rate": 7.795037033708422e-06, "loss": 0.4951, "step": 4567 }, { "epoch": 0.3253908893400292, "grad_norm": 4.108621120452881, "learning_rate": 7.791205907104816e-06, "loss": 0.3265, "step": 4568 }, { "epoch": 0.3254621220215835, "grad_norm": 4.042733669281006, "learning_rate": 7.78737512129539e-06, "loss": 0.8858, "step": 4569 }, { "epoch": 0.3255333547031378, "grad_norm": 4.6612653732299805, "learning_rate": 7.783544676871202e-06, "loss": 1.2869, "step": 4570 }, { "epoch": 0.3256045873846921, "grad_norm": 3.4782512187957764, "learning_rate": 7.779714574423241e-06, "loss": 0.4234, "step": 4571 }, { "epoch": 0.3256758200662464, "grad_norm": 3.257732391357422, "learning_rate": 7.775884814542457e-06, "loss": 0.734, "step": 4572 }, { "epoch": 0.3257470527478007, "grad_norm": 4.562542915344238, "learning_rate": 7.772055397819745e-06, "loss": 0.296, "step": 4573 }, { "epoch": 0.325818285429355, "grad_norm": 3.4161343574523926, "learning_rate": 7.768226324845942e-06, "loss": 0.6361, "step": 4574 }, { "epoch": 0.32588951811090927, "grad_norm": 2.360234498977661, "learning_rate": 7.76439759621183e-06, "loss": 0.4928, "step": 4575 }, { "epoch": 0.32596075079246356, "grad_norm": 3.5725317001342773, "learning_rate": 7.76056921250815e-06, "loss": 0.388, "step": 4576 }, { "epoch": 0.3260319834740179, "grad_norm": 1.9611711502075195, "learning_rate": 7.756741174325578e-06, "loss": 0.3361, "step": 4577 }, { "epoch": 0.3261032161555722, "grad_norm": 4.694680690765381, "learning_rate": 7.75291348225474e-06, "loss": 0.6027, "step": 4578 }, { "epoch": 0.3261744488371265, "grad_norm": 2.772019147872925, "learning_rate": 7.749086136886215e-06, "loss": 0.7122, "step": 4579 }, { "epoch": 0.3262456815186808, "grad_norm": 4.558719635009766, "learning_rate": 7.745259138810514e-06, "loss": 0.4281, "step": 4580 }, { "epoch": 0.32631691420023506, "grad_norm": 5.141706466674805, "learning_rate": 7.741432488618112e-06, "loss": 0.3339, "step": 4581 }, { "epoch": 0.32638814688178935, "grad_norm": 1.9551162719726562, "learning_rate": 7.737606186899417e-06, "loss": 0.255, "step": 4582 }, { "epoch": 0.32645937956334364, "grad_norm": 2.4619193077087402, "learning_rate": 7.733780234244792e-06, "loss": 0.3912, "step": 4583 }, { "epoch": 0.32653061224489793, "grad_norm": 2.9057390689849854, "learning_rate": 7.729954631244536e-06, "loss": 0.5563, "step": 4584 }, { "epoch": 0.3266018449264523, "grad_norm": 1.938174843788147, "learning_rate": 7.726129378488907e-06, "loss": 0.2119, "step": 4585 }, { "epoch": 0.32667307760800657, "grad_norm": 2.8859925270080566, "learning_rate": 7.722304476568095e-06, "loss": 0.4962, "step": 4586 }, { "epoch": 0.32674431028956086, "grad_norm": 1.6316781044006348, "learning_rate": 7.718479926072244e-06, "loss": 0.1065, "step": 4587 }, { "epoch": 0.32681554297111515, "grad_norm": 4.93433141708374, "learning_rate": 7.714655727591452e-06, "loss": 0.3778, "step": 4588 }, { "epoch": 0.32688677565266944, "grad_norm": 2.676443576812744, "learning_rate": 7.710831881715742e-06, "loss": 0.3417, "step": 4589 }, { "epoch": 0.3269580083342237, "grad_norm": 2.7512731552124023, "learning_rate": 7.707008389035102e-06, "loss": 0.6069, "step": 4590 }, { "epoch": 0.327029241015778, "grad_norm": 3.981785297393799, "learning_rate": 7.703185250139455e-06, "loss": 0.555, "step": 4591 }, { "epoch": 0.32710047369733236, "grad_norm": 6.251470565795898, "learning_rate": 7.699362465618667e-06, "loss": 0.41, "step": 4592 }, { "epoch": 0.32717170637888665, "grad_norm": 4.044583320617676, "learning_rate": 7.695540036062559e-06, "loss": 0.4484, "step": 4593 }, { "epoch": 0.32724293906044094, "grad_norm": 3.7384438514709473, "learning_rate": 7.691717962060892e-06, "loss": 0.5585, "step": 4594 }, { "epoch": 0.32731417174199523, "grad_norm": 3.7445502281188965, "learning_rate": 7.687896244203377e-06, "loss": 0.8024, "step": 4595 }, { "epoch": 0.3273854044235495, "grad_norm": 2.5019145011901855, "learning_rate": 7.68407488307966e-06, "loss": 0.4192, "step": 4596 }, { "epoch": 0.3274566371051038, "grad_norm": 2.562692165374756, "learning_rate": 7.680253879279335e-06, "loss": 0.2131, "step": 4597 }, { "epoch": 0.3275278697866581, "grad_norm": 3.2250378131866455, "learning_rate": 7.676433233391955e-06, "loss": 0.649, "step": 4598 }, { "epoch": 0.3275991024682124, "grad_norm": 2.993208646774292, "learning_rate": 7.672612946006992e-06, "loss": 0.4245, "step": 4599 }, { "epoch": 0.32767033514976673, "grad_norm": 2.910825490951538, "learning_rate": 7.668793017713886e-06, "loss": 0.3925, "step": 4600 }, { "epoch": 0.327741567831321, "grad_norm": 3.759235143661499, "learning_rate": 7.664973449102013e-06, "loss": 0.6488, "step": 4601 }, { "epoch": 0.3278128005128753, "grad_norm": 2.6061203479766846, "learning_rate": 7.661154240760687e-06, "loss": 0.1184, "step": 4602 }, { "epoch": 0.3278840331944296, "grad_norm": 2.377718687057495, "learning_rate": 7.657335393279179e-06, "loss": 0.5074, "step": 4603 }, { "epoch": 0.3279552658759839, "grad_norm": 2.818444013595581, "learning_rate": 7.653516907246696e-06, "loss": 0.5383, "step": 4604 }, { "epoch": 0.3280264985575382, "grad_norm": 2.6577837467193604, "learning_rate": 7.649698783252388e-06, "loss": 0.6466, "step": 4605 }, { "epoch": 0.32809773123909247, "grad_norm": 3.7307822704315186, "learning_rate": 7.645881021885353e-06, "loss": 0.6858, "step": 4606 }, { "epoch": 0.3281689639206468, "grad_norm": 2.094904661178589, "learning_rate": 7.642063623734638e-06, "loss": 0.3562, "step": 4607 }, { "epoch": 0.3282401966022011, "grad_norm": 2.169494152069092, "learning_rate": 7.63824658938922e-06, "loss": 0.3266, "step": 4608 }, { "epoch": 0.3283114292837554, "grad_norm": 5.226212501525879, "learning_rate": 7.63442991943803e-06, "loss": 0.2559, "step": 4609 }, { "epoch": 0.3283826619653097, "grad_norm": 2.299149751663208, "learning_rate": 7.630613614469948e-06, "loss": 0.5388, "step": 4610 }, { "epoch": 0.328453894646864, "grad_norm": 2.557478189468384, "learning_rate": 7.626797675073783e-06, "loss": 0.4269, "step": 4611 }, { "epoch": 0.32852512732841826, "grad_norm": 3.0766587257385254, "learning_rate": 7.6229821018382965e-06, "loss": 0.4497, "step": 4612 }, { "epoch": 0.32859636000997255, "grad_norm": 2.3657829761505127, "learning_rate": 7.619166895352197e-06, "loss": 0.3358, "step": 4613 }, { "epoch": 0.3286675926915269, "grad_norm": 2.5033559799194336, "learning_rate": 7.615352056204124e-06, "loss": 0.5953, "step": 4614 }, { "epoch": 0.3287388253730812, "grad_norm": 2.063525676727295, "learning_rate": 7.61153758498267e-06, "loss": 0.0826, "step": 4615 }, { "epoch": 0.3288100580546355, "grad_norm": 3.3564841747283936, "learning_rate": 7.607723482276375e-06, "loss": 0.2124, "step": 4616 }, { "epoch": 0.32888129073618977, "grad_norm": 3.1650850772857666, "learning_rate": 7.6039097486737075e-06, "loss": 0.8242, "step": 4617 }, { "epoch": 0.32895252341774406, "grad_norm": 2.7581937313079834, "learning_rate": 7.600096384763093e-06, "loss": 0.5817, "step": 4618 }, { "epoch": 0.32902375609929835, "grad_norm": 3.3289825916290283, "learning_rate": 7.596283391132892e-06, "loss": 0.3756, "step": 4619 }, { "epoch": 0.32909498878085264, "grad_norm": 3.778393268585205, "learning_rate": 7.592470768371409e-06, "loss": 0.5461, "step": 4620 }, { "epoch": 0.3291662214624069, "grad_norm": 2.7573530673980713, "learning_rate": 7.588658517066893e-06, "loss": 0.4557, "step": 4621 }, { "epoch": 0.3292374541439613, "grad_norm": 2.9152297973632812, "learning_rate": 7.5848466378075395e-06, "loss": 0.6307, "step": 4622 }, { "epoch": 0.32930868682551556, "grad_norm": 4.435698986053467, "learning_rate": 7.581035131181473e-06, "loss": 0.5251, "step": 4623 }, { "epoch": 0.32937991950706985, "grad_norm": 2.1514344215393066, "learning_rate": 7.577223997776777e-06, "loss": 0.3338, "step": 4624 }, { "epoch": 0.32945115218862414, "grad_norm": 4.411548137664795, "learning_rate": 7.573413238181473e-06, "loss": 1.0803, "step": 4625 }, { "epoch": 0.32952238487017843, "grad_norm": 4.6066389083862305, "learning_rate": 7.569602852983511e-06, "loss": 0.6619, "step": 4626 }, { "epoch": 0.3295936175517327, "grad_norm": 2.8345000743865967, "learning_rate": 7.565792842770805e-06, "loss": 0.5672, "step": 4627 }, { "epoch": 0.329664850233287, "grad_norm": 2.3425025939941406, "learning_rate": 7.561983208131196e-06, "loss": 0.3391, "step": 4628 }, { "epoch": 0.32973608291484136, "grad_norm": 6.406670093536377, "learning_rate": 7.558173949652468e-06, "loss": 0.5353, "step": 4629 }, { "epoch": 0.32980731559639564, "grad_norm": 3.075216293334961, "learning_rate": 7.554365067922353e-06, "loss": 0.71, "step": 4630 }, { "epoch": 0.32987854827794993, "grad_norm": 4.827171325683594, "learning_rate": 7.550556563528524e-06, "loss": 0.3367, "step": 4631 }, { "epoch": 0.3299497809595042, "grad_norm": 2.769355535507202, "learning_rate": 7.546748437058596e-06, "loss": 0.6986, "step": 4632 }, { "epoch": 0.3300210136410585, "grad_norm": 2.575420379638672, "learning_rate": 7.542940689100117e-06, "loss": 0.5216, "step": 4633 }, { "epoch": 0.3300922463226128, "grad_norm": 4.132898330688477, "learning_rate": 7.539133320240589e-06, "loss": 0.7484, "step": 4634 }, { "epoch": 0.3301634790041671, "grad_norm": 2.709299325942993, "learning_rate": 7.53532633106745e-06, "loss": 0.3475, "step": 4635 }, { "epoch": 0.3302347116857214, "grad_norm": 2.271324396133423, "learning_rate": 7.531519722168072e-06, "loss": 0.3007, "step": 4636 }, { "epoch": 0.33030594436727573, "grad_norm": 4.0888214111328125, "learning_rate": 7.527713494129781e-06, "loss": 0.6332, "step": 4637 }, { "epoch": 0.33037717704883, "grad_norm": 3.9122986793518066, "learning_rate": 7.523907647539841e-06, "loss": 0.245, "step": 4638 }, { "epoch": 0.3304484097303843, "grad_norm": 2.100900173187256, "learning_rate": 7.520102182985449e-06, "loss": 0.3393, "step": 4639 }, { "epoch": 0.3305196424119386, "grad_norm": 3.5663528442382812, "learning_rate": 7.516297101053754e-06, "loss": 0.3497, "step": 4640 }, { "epoch": 0.3305908750934929, "grad_norm": 3.3819496631622314, "learning_rate": 7.51249240233184e-06, "loss": 0.5059, "step": 4641 }, { "epoch": 0.3306621077750472, "grad_norm": 2.3407459259033203, "learning_rate": 7.508688087406731e-06, "loss": 0.3237, "step": 4642 }, { "epoch": 0.33073334045660147, "grad_norm": 2.4289486408233643, "learning_rate": 7.504884156865393e-06, "loss": 0.2734, "step": 4643 }, { "epoch": 0.3308045731381558, "grad_norm": 3.331967353820801, "learning_rate": 7.501080611294739e-06, "loss": 0.7209, "step": 4644 }, { "epoch": 0.3308758058197101, "grad_norm": 3.7075650691986084, "learning_rate": 7.497277451281609e-06, "loss": 0.3987, "step": 4645 }, { "epoch": 0.3309470385012644, "grad_norm": 5.549514293670654, "learning_rate": 7.493474677412795e-06, "loss": 0.6422, "step": 4646 }, { "epoch": 0.3310182711828187, "grad_norm": 2.4122467041015625, "learning_rate": 7.48967229027503e-06, "loss": 0.4127, "step": 4647 }, { "epoch": 0.33108950386437297, "grad_norm": 2.575321674346924, "learning_rate": 7.485870290454974e-06, "loss": 0.5699, "step": 4648 }, { "epoch": 0.33116073654592726, "grad_norm": 1.947758674621582, "learning_rate": 7.482068678539245e-06, "loss": 0.2583, "step": 4649 }, { "epoch": 0.33123196922748155, "grad_norm": 3.092860221862793, "learning_rate": 7.478267455114391e-06, "loss": 0.4261, "step": 4650 }, { "epoch": 0.33130320190903584, "grad_norm": 2.9631783962249756, "learning_rate": 7.474466620766896e-06, "loss": 0.5673, "step": 4651 }, { "epoch": 0.3313744345905902, "grad_norm": 6.532232761383057, "learning_rate": 7.470666176083193e-06, "loss": 0.7001, "step": 4652 }, { "epoch": 0.3314456672721445, "grad_norm": 2.5220866203308105, "learning_rate": 7.466866121649656e-06, "loss": 0.2411, "step": 4653 }, { "epoch": 0.33151689995369876, "grad_norm": 3.19050931930542, "learning_rate": 7.463066458052586e-06, "loss": 0.2293, "step": 4654 }, { "epoch": 0.33158813263525305, "grad_norm": 3.9880757331848145, "learning_rate": 7.4592671858782365e-06, "loss": 0.593, "step": 4655 }, { "epoch": 0.33165936531680734, "grad_norm": 2.4444644451141357, "learning_rate": 7.455468305712801e-06, "loss": 0.4386, "step": 4656 }, { "epoch": 0.33173059799836163, "grad_norm": 3.051908493041992, "learning_rate": 7.451669818142398e-06, "loss": 0.4582, "step": 4657 }, { "epoch": 0.3318018306799159, "grad_norm": 2.750734806060791, "learning_rate": 7.447871723753098e-06, "loss": 0.4514, "step": 4658 }, { "epoch": 0.33187306336147027, "grad_norm": 2.960251808166504, "learning_rate": 7.444074023130914e-06, "loss": 0.6395, "step": 4659 }, { "epoch": 0.33194429604302456, "grad_norm": 3.255450963973999, "learning_rate": 7.440276716861783e-06, "loss": 0.5704, "step": 4660 }, { "epoch": 0.33201552872457885, "grad_norm": 4.107110977172852, "learning_rate": 7.436479805531595e-06, "loss": 0.6391, "step": 4661 }, { "epoch": 0.33208676140613314, "grad_norm": 3.2023603916168213, "learning_rate": 7.432683289726177e-06, "loss": 0.6267, "step": 4662 }, { "epoch": 0.3321579940876874, "grad_norm": 3.2898502349853516, "learning_rate": 7.428887170031285e-06, "loss": 0.7551, "step": 4663 }, { "epoch": 0.3322292267692417, "grad_norm": 3.108536720275879, "learning_rate": 7.425091447032629e-06, "loss": 0.5993, "step": 4664 }, { "epoch": 0.332300459450796, "grad_norm": 4.85275411605835, "learning_rate": 7.421296121315844e-06, "loss": 0.3924, "step": 4665 }, { "epoch": 0.33237169213235035, "grad_norm": 3.734985113143921, "learning_rate": 7.417501193466513e-06, "loss": 0.6086, "step": 4666 }, { "epoch": 0.33244292481390464, "grad_norm": 3.3824589252471924, "learning_rate": 7.413706664070151e-06, "loss": 0.4355, "step": 4667 }, { "epoch": 0.33251415749545893, "grad_norm": 3.119063377380371, "learning_rate": 7.409912533712218e-06, "loss": 0.4371, "step": 4668 }, { "epoch": 0.3325853901770132, "grad_norm": 5.4882636070251465, "learning_rate": 7.406118802978111e-06, "loss": 0.7406, "step": 4669 }, { "epoch": 0.3326566228585675, "grad_norm": 4.121089458465576, "learning_rate": 7.402325472453158e-06, "loss": 0.6643, "step": 4670 }, { "epoch": 0.3327278555401218, "grad_norm": 0.992104709148407, "learning_rate": 7.398532542722635e-06, "loss": 0.0342, "step": 4671 }, { "epoch": 0.3327990882216761, "grad_norm": 2.6238863468170166, "learning_rate": 7.394740014371753e-06, "loss": 0.6238, "step": 4672 }, { "epoch": 0.3328703209032304, "grad_norm": 1.8163622617721558, "learning_rate": 7.390947887985654e-06, "loss": 0.1784, "step": 4673 }, { "epoch": 0.3329415535847847, "grad_norm": 2.6156461238861084, "learning_rate": 7.387156164149427e-06, "loss": 0.3072, "step": 4674 }, { "epoch": 0.333012786266339, "grad_norm": 2.7611193656921387, "learning_rate": 7.383364843448102e-06, "loss": 0.4721, "step": 4675 }, { "epoch": 0.3330840189478933, "grad_norm": 3.305583953857422, "learning_rate": 7.379573926466631e-06, "loss": 0.5965, "step": 4676 }, { "epoch": 0.3331552516294476, "grad_norm": 2.5730974674224854, "learning_rate": 7.375783413789918e-06, "loss": 0.6213, "step": 4677 }, { "epoch": 0.3332264843110019, "grad_norm": 4.834242820739746, "learning_rate": 7.371993306002804e-06, "loss": 0.1784, "step": 4678 }, { "epoch": 0.33329771699255617, "grad_norm": 4.2106804847717285, "learning_rate": 7.368203603690057e-06, "loss": 0.8218, "step": 4679 }, { "epoch": 0.33336894967411046, "grad_norm": 3.863966464996338, "learning_rate": 7.36441430743639e-06, "loss": 0.6513, "step": 4680 }, { "epoch": 0.3334401823556648, "grad_norm": 4.0816545486450195, "learning_rate": 7.360625417826459e-06, "loss": 0.3725, "step": 4681 }, { "epoch": 0.3335114150372191, "grad_norm": 2.1528608798980713, "learning_rate": 7.356836935444841e-06, "loss": 0.3763, "step": 4682 }, { "epoch": 0.3335826477187734, "grad_norm": 2.5440661907196045, "learning_rate": 7.3530488608760645e-06, "loss": 0.5329, "step": 4683 }, { "epoch": 0.3336538804003277, "grad_norm": 3.0768258571624756, "learning_rate": 7.349261194704596e-06, "loss": 0.6678, "step": 4684 }, { "epoch": 0.33372511308188196, "grad_norm": 2.475461006164551, "learning_rate": 7.345473937514822e-06, "loss": 0.083, "step": 4685 }, { "epoch": 0.33379634576343625, "grad_norm": 2.7890825271606445, "learning_rate": 7.341687089891085e-06, "loss": 0.2912, "step": 4686 }, { "epoch": 0.33386757844499054, "grad_norm": 3.426164388656616, "learning_rate": 7.337900652417656e-06, "loss": 0.6457, "step": 4687 }, { "epoch": 0.33393881112654483, "grad_norm": 2.4080028533935547, "learning_rate": 7.334114625678741e-06, "loss": 0.5408, "step": 4688 }, { "epoch": 0.3340100438080992, "grad_norm": 3.2693088054656982, "learning_rate": 7.330329010258483e-06, "loss": 0.5842, "step": 4689 }, { "epoch": 0.33408127648965347, "grad_norm": 3.40964412689209, "learning_rate": 7.3265438067409725e-06, "loss": 0.6186, "step": 4690 }, { "epoch": 0.33415250917120776, "grad_norm": 3.0939197540283203, "learning_rate": 7.3227590157102165e-06, "loss": 0.3751, "step": 4691 }, { "epoch": 0.33422374185276205, "grad_norm": 2.7414748668670654, "learning_rate": 7.318974637750174e-06, "loss": 0.5271, "step": 4692 }, { "epoch": 0.33429497453431634, "grad_norm": 2.5110113620758057, "learning_rate": 7.31519067344474e-06, "loss": 0.3427, "step": 4693 }, { "epoch": 0.3343662072158706, "grad_norm": 2.220691204071045, "learning_rate": 7.311407123377734e-06, "loss": 0.322, "step": 4694 }, { "epoch": 0.3344374398974249, "grad_norm": 2.3473405838012695, "learning_rate": 7.307623988132921e-06, "loss": 0.293, "step": 4695 }, { "epoch": 0.33450867257897926, "grad_norm": 3.252110242843628, "learning_rate": 7.303841268294004e-06, "loss": 0.1875, "step": 4696 }, { "epoch": 0.33457990526053355, "grad_norm": 3.829645872116089, "learning_rate": 7.30005896444461e-06, "loss": 0.6002, "step": 4697 }, { "epoch": 0.33465113794208784, "grad_norm": 4.331864356994629, "learning_rate": 7.2962770771683144e-06, "loss": 0.6226, "step": 4698 }, { "epoch": 0.33472237062364213, "grad_norm": 3.861477851867676, "learning_rate": 7.292495607048626e-06, "loss": 0.1657, "step": 4699 }, { "epoch": 0.3347936033051964, "grad_norm": 2.3378777503967285, "learning_rate": 7.28871455466898e-06, "loss": 0.1094, "step": 4700 }, { "epoch": 0.3348648359867507, "grad_norm": 3.6868815422058105, "learning_rate": 7.284933920612759e-06, "loss": 0.4487, "step": 4701 }, { "epoch": 0.334936068668305, "grad_norm": 2.693251371383667, "learning_rate": 7.281153705463275e-06, "loss": 0.5801, "step": 4702 }, { "epoch": 0.3350073013498593, "grad_norm": 5.3297953605651855, "learning_rate": 7.277373909803774e-06, "loss": 0.7727, "step": 4703 }, { "epoch": 0.33507853403141363, "grad_norm": 2.272310495376587, "learning_rate": 7.273594534217441e-06, "loss": 0.3066, "step": 4704 }, { "epoch": 0.3351497667129679, "grad_norm": 3.0324172973632812, "learning_rate": 7.269815579287398e-06, "loss": 0.4281, "step": 4705 }, { "epoch": 0.3352209993945222, "grad_norm": 2.915126323699951, "learning_rate": 7.266037045596692e-06, "loss": 0.628, "step": 4706 }, { "epoch": 0.3352922320760765, "grad_norm": 3.726095199584961, "learning_rate": 7.262258933728314e-06, "loss": 0.6237, "step": 4707 }, { "epoch": 0.3353634647576308, "grad_norm": 3.920219898223877, "learning_rate": 7.258481244265193e-06, "loss": 0.258, "step": 4708 }, { "epoch": 0.3354346974391851, "grad_norm": 4.451740741729736, "learning_rate": 7.254703977790183e-06, "loss": 0.5678, "step": 4709 }, { "epoch": 0.33550593012073937, "grad_norm": 6.515570640563965, "learning_rate": 7.2509271348860785e-06, "loss": 0.177, "step": 4710 }, { "epoch": 0.3355771628022937, "grad_norm": 2.8827931880950928, "learning_rate": 7.247150716135605e-06, "loss": 0.6024, "step": 4711 }, { "epoch": 0.335648395483848, "grad_norm": 2.2882485389709473, "learning_rate": 7.243374722121431e-06, "loss": 0.3374, "step": 4712 }, { "epoch": 0.3357196281654023, "grad_norm": 3.288632869720459, "learning_rate": 7.2395991534261456e-06, "loss": 0.4757, "step": 4713 }, { "epoch": 0.3357908608469566, "grad_norm": 3.0226128101348877, "learning_rate": 7.235824010632284e-06, "loss": 0.641, "step": 4714 }, { "epoch": 0.3358620935285109, "grad_norm": 2.041171073913574, "learning_rate": 7.232049294322316e-06, "loss": 0.1676, "step": 4715 }, { "epoch": 0.33593332621006516, "grad_norm": 2.9793519973754883, "learning_rate": 7.2282750050786374e-06, "loss": 0.3894, "step": 4716 }, { "epoch": 0.33600455889161945, "grad_norm": 2.543403387069702, "learning_rate": 7.2245011434835775e-06, "loss": 0.3843, "step": 4717 }, { "epoch": 0.3360757915731738, "grad_norm": 3.9400813579559326, "learning_rate": 7.220727710119415e-06, "loss": 0.784, "step": 4718 }, { "epoch": 0.3361470242547281, "grad_norm": 3.1653478145599365, "learning_rate": 7.216954705568342e-06, "loss": 0.3279, "step": 4719 }, { "epoch": 0.3362182569362824, "grad_norm": 4.219123840332031, "learning_rate": 7.2131821304124974e-06, "loss": 0.8888, "step": 4720 }, { "epoch": 0.33628948961783667, "grad_norm": 3.61352801322937, "learning_rate": 7.209409985233955e-06, "loss": 0.5115, "step": 4721 }, { "epoch": 0.33636072229939096, "grad_norm": 3.388252019882202, "learning_rate": 7.20563827061471e-06, "loss": 0.4284, "step": 4722 }, { "epoch": 0.33643195498094525, "grad_norm": 2.386537790298462, "learning_rate": 7.201866987136706e-06, "loss": 0.557, "step": 4723 }, { "epoch": 0.33650318766249954, "grad_norm": 6.358150005340576, "learning_rate": 7.198096135381811e-06, "loss": 0.674, "step": 4724 }, { "epoch": 0.3365744203440538, "grad_norm": 3.321011543273926, "learning_rate": 7.1943257159318295e-06, "loss": 0.403, "step": 4725 }, { "epoch": 0.33664565302560817, "grad_norm": 3.3497908115386963, "learning_rate": 7.190555729368492e-06, "loss": 0.7883, "step": 4726 }, { "epoch": 0.33671688570716246, "grad_norm": 3.562044858932495, "learning_rate": 7.18678617627348e-06, "loss": 0.4854, "step": 4727 }, { "epoch": 0.33678811838871675, "grad_norm": 4.476166248321533, "learning_rate": 7.183017057228386e-06, "loss": 0.5739, "step": 4728 }, { "epoch": 0.33685935107027104, "grad_norm": 2.8775153160095215, "learning_rate": 7.179248372814751e-06, "loss": 0.4623, "step": 4729 }, { "epoch": 0.33693058375182533, "grad_norm": 2.684236526489258, "learning_rate": 7.175480123614048e-06, "loss": 0.561, "step": 4730 }, { "epoch": 0.3370018164333796, "grad_norm": 2.7175350189208984, "learning_rate": 7.17171231020767e-06, "loss": 0.6431, "step": 4731 }, { "epoch": 0.3370730491149339, "grad_norm": 4.444867134094238, "learning_rate": 7.16794493317696e-06, "loss": 0.4072, "step": 4732 }, { "epoch": 0.33714428179648825, "grad_norm": 2.5962066650390625, "learning_rate": 7.164177993103185e-06, "loss": 0.5839, "step": 4733 }, { "epoch": 0.33721551447804254, "grad_norm": 3.9787557125091553, "learning_rate": 7.160411490567536e-06, "loss": 0.4617, "step": 4734 }, { "epoch": 0.33728674715959683, "grad_norm": 3.309234619140625, "learning_rate": 7.156645426151154e-06, "loss": 0.555, "step": 4735 }, { "epoch": 0.3373579798411511, "grad_norm": 5.822576522827148, "learning_rate": 7.152879800435104e-06, "loss": 0.7978, "step": 4736 }, { "epoch": 0.3374292125227054, "grad_norm": 3.013986587524414, "learning_rate": 7.149114614000378e-06, "loss": 0.4046, "step": 4737 }, { "epoch": 0.3375004452042597, "grad_norm": 2.131957530975342, "learning_rate": 7.145349867427911e-06, "loss": 0.5895, "step": 4738 }, { "epoch": 0.337571677885814, "grad_norm": 3.5478086471557617, "learning_rate": 7.141585561298563e-06, "loss": 0.4921, "step": 4739 }, { "epoch": 0.3376429105673683, "grad_norm": 3.1063308715820312, "learning_rate": 7.137821696193126e-06, "loss": 0.7237, "step": 4740 }, { "epoch": 0.3377141432489226, "grad_norm": 2.1285247802734375, "learning_rate": 7.1340582726923235e-06, "loss": 0.4996, "step": 4741 }, { "epoch": 0.3377853759304769, "grad_norm": 2.4835000038146973, "learning_rate": 7.1302952913768205e-06, "loss": 0.3966, "step": 4742 }, { "epoch": 0.3378566086120312, "grad_norm": 7.871196269989014, "learning_rate": 7.1265327528272e-06, "loss": 0.716, "step": 4743 }, { "epoch": 0.3379278412935855, "grad_norm": 4.366952419281006, "learning_rate": 7.122770657623982e-06, "loss": 0.6048, "step": 4744 }, { "epoch": 0.3379990739751398, "grad_norm": 3.2660560607910156, "learning_rate": 7.119009006347625e-06, "loss": 0.7338, "step": 4745 }, { "epoch": 0.3380703066566941, "grad_norm": 3.306685447692871, "learning_rate": 7.1152477995785095e-06, "loss": 0.2875, "step": 4746 }, { "epoch": 0.33814153933824836, "grad_norm": 2.4365556240081787, "learning_rate": 7.111487037896951e-06, "loss": 0.2322, "step": 4747 }, { "epoch": 0.3382127720198027, "grad_norm": 2.6193106174468994, "learning_rate": 7.107726721883196e-06, "loss": 0.4654, "step": 4748 }, { "epoch": 0.338284004701357, "grad_norm": 3.8142549991607666, "learning_rate": 7.1039668521174256e-06, "loss": 0.4697, "step": 4749 }, { "epoch": 0.3383552373829113, "grad_norm": 5.384443283081055, "learning_rate": 7.100207429179744e-06, "loss": 0.545, "step": 4750 }, { "epoch": 0.3384264700644656, "grad_norm": 3.665336847305298, "learning_rate": 7.096448453650193e-06, "loss": 0.5577, "step": 4751 }, { "epoch": 0.33849770274601987, "grad_norm": 2.60298752784729, "learning_rate": 7.092689926108749e-06, "loss": 0.1817, "step": 4752 }, { "epoch": 0.33856893542757416, "grad_norm": 1.9959498643875122, "learning_rate": 7.088931847135305e-06, "loss": 0.1655, "step": 4753 }, { "epoch": 0.33864016810912845, "grad_norm": 2.7104592323303223, "learning_rate": 7.085174217309703e-06, "loss": 0.5599, "step": 4754 }, { "epoch": 0.3387114007906828, "grad_norm": 2.35239839553833, "learning_rate": 7.081417037211702e-06, "loss": 0.3947, "step": 4755 }, { "epoch": 0.3387826334722371, "grad_norm": 3.9957025051116943, "learning_rate": 7.077660307420995e-06, "loss": 0.4625, "step": 4756 }, { "epoch": 0.33885386615379137, "grad_norm": 3.202535629272461, "learning_rate": 7.073904028517207e-06, "loss": 0.578, "step": 4757 }, { "epoch": 0.33892509883534566, "grad_norm": 2.579317092895508, "learning_rate": 7.070148201079898e-06, "loss": 0.5306, "step": 4758 }, { "epoch": 0.33899633151689995, "grad_norm": 3.778904914855957, "learning_rate": 7.066392825688546e-06, "loss": 0.5105, "step": 4759 }, { "epoch": 0.33906756419845424, "grad_norm": 1.2797620296478271, "learning_rate": 7.0626379029225735e-06, "loss": 0.1164, "step": 4760 }, { "epoch": 0.33913879688000853, "grad_norm": 3.023770332336426, "learning_rate": 7.058883433361323e-06, "loss": 0.4621, "step": 4761 }, { "epoch": 0.3392100295615628, "grad_norm": 3.004737377166748, "learning_rate": 7.05512941758407e-06, "loss": 0.7048, "step": 4762 }, { "epoch": 0.33928126224311717, "grad_norm": 2.357022523880005, "learning_rate": 7.051375856170022e-06, "loss": 0.5985, "step": 4763 }, { "epoch": 0.33935249492467146, "grad_norm": 3.745877981185913, "learning_rate": 7.047622749698317e-06, "loss": 0.6592, "step": 4764 }, { "epoch": 0.33942372760622574, "grad_norm": 2.6589133739471436, "learning_rate": 7.043870098748013e-06, "loss": 0.6308, "step": 4765 }, { "epoch": 0.33949496028778003, "grad_norm": 2.8550667762756348, "learning_rate": 7.040117903898112e-06, "loss": 0.6624, "step": 4766 }, { "epoch": 0.3395661929693343, "grad_norm": 5.4388604164123535, "learning_rate": 7.036366165727542e-06, "loss": 0.7204, "step": 4767 }, { "epoch": 0.3396374256508886, "grad_norm": 2.1139230728149414, "learning_rate": 7.0326148848151485e-06, "loss": 0.2086, "step": 4768 }, { "epoch": 0.3397086583324429, "grad_norm": 2.4438607692718506, "learning_rate": 7.028864061739722e-06, "loss": 0.2652, "step": 4769 }, { "epoch": 0.33977989101399725, "grad_norm": 2.440413475036621, "learning_rate": 7.025113697079977e-06, "loss": 0.3051, "step": 4770 }, { "epoch": 0.33985112369555154, "grad_norm": 2.243920087814331, "learning_rate": 7.021363791414548e-06, "loss": 0.3436, "step": 4771 }, { "epoch": 0.3399223563771058, "grad_norm": 3.3423895835876465, "learning_rate": 7.017614345322012e-06, "loss": 0.5154, "step": 4772 }, { "epoch": 0.3399935890586601, "grad_norm": 2.8725674152374268, "learning_rate": 7.0138653593808736e-06, "loss": 0.5797, "step": 4773 }, { "epoch": 0.3400648217402144, "grad_norm": 3.0918385982513428, "learning_rate": 7.0101168341695556e-06, "loss": 0.3637, "step": 4774 }, { "epoch": 0.3401360544217687, "grad_norm": 3.099296808242798, "learning_rate": 7.006368770266421e-06, "loss": 0.708, "step": 4775 }, { "epoch": 0.340207287103323, "grad_norm": 1.9412039518356323, "learning_rate": 7.002621168249759e-06, "loss": 0.415, "step": 4776 }, { "epoch": 0.3402785197848773, "grad_norm": 7.266351699829102, "learning_rate": 6.998874028697782e-06, "loss": 0.4894, "step": 4777 }, { "epoch": 0.3403497524664316, "grad_norm": 3.3867685794830322, "learning_rate": 6.995127352188635e-06, "loss": 0.7751, "step": 4778 }, { "epoch": 0.3404209851479859, "grad_norm": 3.605632781982422, "learning_rate": 6.9913811393003985e-06, "loss": 0.5706, "step": 4779 }, { "epoch": 0.3404922178295402, "grad_norm": 2.5975501537323, "learning_rate": 6.987635390611065e-06, "loss": 0.74, "step": 4780 }, { "epoch": 0.3405634505110945, "grad_norm": 2.4731690883636475, "learning_rate": 6.983890106698567e-06, "loss": 0.2506, "step": 4781 }, { "epoch": 0.3406346831926488, "grad_norm": 3.398949384689331, "learning_rate": 6.980145288140772e-06, "loss": 0.501, "step": 4782 }, { "epoch": 0.34070591587420307, "grad_norm": 3.1422410011291504, "learning_rate": 6.976400935515457e-06, "loss": 0.5294, "step": 4783 }, { "epoch": 0.34077714855575736, "grad_norm": 9.75432014465332, "learning_rate": 6.972657049400342e-06, "loss": 0.6878, "step": 4784 }, { "epoch": 0.3408483812373117, "grad_norm": 1.9270435571670532, "learning_rate": 6.968913630373066e-06, "loss": 0.1229, "step": 4785 }, { "epoch": 0.340919613918866, "grad_norm": 2.232792854309082, "learning_rate": 6.965170679011207e-06, "loss": 0.2111, "step": 4786 }, { "epoch": 0.3409908466004203, "grad_norm": 3.170010805130005, "learning_rate": 6.961428195892256e-06, "loss": 0.5353, "step": 4787 }, { "epoch": 0.3410620792819746, "grad_norm": 2.7573797702789307, "learning_rate": 6.957686181593642e-06, "loss": 0.6759, "step": 4788 }, { "epoch": 0.34113331196352886, "grad_norm": 2.215785503387451, "learning_rate": 6.953944636692727e-06, "loss": 0.2477, "step": 4789 }, { "epoch": 0.34120454464508315, "grad_norm": 2.5929205417633057, "learning_rate": 6.95020356176678e-06, "loss": 0.442, "step": 4790 }, { "epoch": 0.34127577732663744, "grad_norm": 1.2272286415100098, "learning_rate": 6.946462957393019e-06, "loss": 0.1416, "step": 4791 }, { "epoch": 0.34134701000819173, "grad_norm": 2.244680404663086, "learning_rate": 6.94272282414858e-06, "loss": 0.2266, "step": 4792 }, { "epoch": 0.3414182426897461, "grad_norm": 2.0117759704589844, "learning_rate": 6.938983162610522e-06, "loss": 0.2998, "step": 4793 }, { "epoch": 0.34148947537130037, "grad_norm": 2.9041683673858643, "learning_rate": 6.935243973355839e-06, "loss": 0.668, "step": 4794 }, { "epoch": 0.34156070805285466, "grad_norm": 2.700071096420288, "learning_rate": 6.931505256961454e-06, "loss": 0.4382, "step": 4795 }, { "epoch": 0.34163194073440895, "grad_norm": 4.15911865234375, "learning_rate": 6.9277670140042055e-06, "loss": 0.4111, "step": 4796 }, { "epoch": 0.34170317341596324, "grad_norm": 3.085056781768799, "learning_rate": 6.924029245060868e-06, "loss": 0.6224, "step": 4797 }, { "epoch": 0.3417744060975175, "grad_norm": 6.057473659515381, "learning_rate": 6.920291950708144e-06, "loss": 0.2882, "step": 4798 }, { "epoch": 0.3418456387790718, "grad_norm": 5.635477542877197, "learning_rate": 6.916555131522657e-06, "loss": 0.4232, "step": 4799 }, { "epoch": 0.34191687146062616, "grad_norm": 2.725903034210205, "learning_rate": 6.9128187880809595e-06, "loss": 0.4424, "step": 4800 }, { "epoch": 0.34198810414218045, "grad_norm": 2.6637706756591797, "learning_rate": 6.909082920959534e-06, "loss": 0.601, "step": 4801 }, { "epoch": 0.34205933682373474, "grad_norm": 3.359829902648926, "learning_rate": 6.905347530734778e-06, "loss": 0.6736, "step": 4802 }, { "epoch": 0.34213056950528903, "grad_norm": 2.4741768836975098, "learning_rate": 6.90161261798303e-06, "loss": 0.3358, "step": 4803 }, { "epoch": 0.3422018021868433, "grad_norm": 2.2299506664276123, "learning_rate": 6.897878183280553e-06, "loss": 0.3462, "step": 4804 }, { "epoch": 0.3422730348683976, "grad_norm": 3.1643290519714355, "learning_rate": 6.894144227203521e-06, "loss": 0.68, "step": 4805 }, { "epoch": 0.3423442675499519, "grad_norm": 4.19067907333374, "learning_rate": 6.890410750328054e-06, "loss": 0.3393, "step": 4806 }, { "epoch": 0.34241550023150624, "grad_norm": 4.077755928039551, "learning_rate": 6.886677753230184e-06, "loss": 0.8221, "step": 4807 }, { "epoch": 0.34248673291306053, "grad_norm": 1.9689630270004272, "learning_rate": 6.8829452364858776e-06, "loss": 0.1426, "step": 4808 }, { "epoch": 0.3425579655946148, "grad_norm": 3.5670950412750244, "learning_rate": 6.8792132006710175e-06, "loss": 0.4754, "step": 4809 }, { "epoch": 0.3426291982761691, "grad_norm": 2.9554128646850586, "learning_rate": 6.875481646361428e-06, "loss": 0.569, "step": 4810 }, { "epoch": 0.3427004309577234, "grad_norm": 3.624121904373169, "learning_rate": 6.871750574132841e-06, "loss": 0.5342, "step": 4811 }, { "epoch": 0.3427716636392777, "grad_norm": 2.841165781021118, "learning_rate": 6.868019984560925e-06, "loss": 0.4276, "step": 4812 }, { "epoch": 0.342842896320832, "grad_norm": 2.2719078063964844, "learning_rate": 6.864289878221275e-06, "loss": 0.2339, "step": 4813 }, { "epoch": 0.34291412900238627, "grad_norm": 4.226673126220703, "learning_rate": 6.8605602556894056e-06, "loss": 0.7367, "step": 4814 }, { "epoch": 0.3429853616839406, "grad_norm": 3.7817306518554688, "learning_rate": 6.8568311175407546e-06, "loss": 0.6392, "step": 4815 }, { "epoch": 0.3430565943654949, "grad_norm": 2.490841865539551, "learning_rate": 6.853102464350698e-06, "loss": 0.1758, "step": 4816 }, { "epoch": 0.3431278270470492, "grad_norm": 5.021557807922363, "learning_rate": 6.849374296694522e-06, "loss": 0.7158, "step": 4817 }, { "epoch": 0.3431990597286035, "grad_norm": 2.3700428009033203, "learning_rate": 6.845646615147445e-06, "loss": 0.2074, "step": 4818 }, { "epoch": 0.3432702924101578, "grad_norm": 3.45526123046875, "learning_rate": 6.841919420284618e-06, "loss": 0.4776, "step": 4819 }, { "epoch": 0.34334152509171206, "grad_norm": 2.8978049755096436, "learning_rate": 6.8381927126810965e-06, "loss": 0.4579, "step": 4820 }, { "epoch": 0.34341275777326635, "grad_norm": 3.1322989463806152, "learning_rate": 6.834466492911882e-06, "loss": 0.5128, "step": 4821 }, { "epoch": 0.3434839904548207, "grad_norm": 2.163713216781616, "learning_rate": 6.8307407615518865e-06, "loss": 0.2235, "step": 4822 }, { "epoch": 0.343555223136375, "grad_norm": 3.480546474456787, "learning_rate": 6.827015519175958e-06, "loss": 0.5567, "step": 4823 }, { "epoch": 0.3436264558179293, "grad_norm": 2.81010103225708, "learning_rate": 6.823290766358857e-06, "loss": 0.38, "step": 4824 }, { "epoch": 0.34369768849948357, "grad_norm": 3.623692274093628, "learning_rate": 6.819566503675274e-06, "loss": 0.4088, "step": 4825 }, { "epoch": 0.34376892118103786, "grad_norm": 1.9832253456115723, "learning_rate": 6.815842731699834e-06, "loss": 0.1664, "step": 4826 }, { "epoch": 0.34384015386259215, "grad_norm": 2.6226706504821777, "learning_rate": 6.812119451007067e-06, "loss": 0.5828, "step": 4827 }, { "epoch": 0.34391138654414644, "grad_norm": 1.2843459844589233, "learning_rate": 6.808396662171439e-06, "loss": 0.1556, "step": 4828 }, { "epoch": 0.3439826192257007, "grad_norm": 2.8886210918426514, "learning_rate": 6.804674365767341e-06, "loss": 0.6026, "step": 4829 }, { "epoch": 0.34405385190725507, "grad_norm": 3.151761054992676, "learning_rate": 6.8009525623690805e-06, "loss": 0.3041, "step": 4830 }, { "epoch": 0.34412508458880936, "grad_norm": 3.1509010791778564, "learning_rate": 6.797231252550895e-06, "loss": 0.4654, "step": 4831 }, { "epoch": 0.34419631727036365, "grad_norm": 1.6157203912734985, "learning_rate": 6.793510436886951e-06, "loss": 0.107, "step": 4832 }, { "epoch": 0.34426754995191794, "grad_norm": 3.401339530944824, "learning_rate": 6.78979011595132e-06, "loss": 0.5701, "step": 4833 }, { "epoch": 0.34433878263347223, "grad_norm": 2.2026162147521973, "learning_rate": 6.7860702903180165e-06, "loss": 0.2869, "step": 4834 }, { "epoch": 0.3444100153150265, "grad_norm": 2.592362880706787, "learning_rate": 6.782350960560973e-06, "loss": 0.1868, "step": 4835 }, { "epoch": 0.3444812479965808, "grad_norm": 4.046363353729248, "learning_rate": 6.778632127254039e-06, "loss": 0.7324, "step": 4836 }, { "epoch": 0.34455248067813515, "grad_norm": 2.8826816082000732, "learning_rate": 6.774913790970994e-06, "loss": 0.447, "step": 4837 }, { "epoch": 0.34462371335968944, "grad_norm": 2.591515302658081, "learning_rate": 6.771195952285541e-06, "loss": 0.1675, "step": 4838 }, { "epoch": 0.34469494604124373, "grad_norm": 3.100597858428955, "learning_rate": 6.7674786117712985e-06, "loss": 0.2698, "step": 4839 }, { "epoch": 0.344766178722798, "grad_norm": 3.3664910793304443, "learning_rate": 6.763761770001817e-06, "loss": 0.2972, "step": 4840 }, { "epoch": 0.3448374114043523, "grad_norm": 3.6919467449188232, "learning_rate": 6.760045427550574e-06, "loss": 0.4537, "step": 4841 }, { "epoch": 0.3449086440859066, "grad_norm": 2.146289587020874, "learning_rate": 6.75632958499095e-06, "loss": 0.2069, "step": 4842 }, { "epoch": 0.3449798767674609, "grad_norm": 5.881043434143066, "learning_rate": 6.752614242896271e-06, "loss": 0.7743, "step": 4843 }, { "epoch": 0.3450511094490152, "grad_norm": 1.9740699529647827, "learning_rate": 6.748899401839774e-06, "loss": 0.3582, "step": 4844 }, { "epoch": 0.3451223421305695, "grad_norm": 2.859685182571411, "learning_rate": 6.745185062394617e-06, "loss": 0.5597, "step": 4845 }, { "epoch": 0.3451935748121238, "grad_norm": 3.1009833812713623, "learning_rate": 6.741471225133886e-06, "loss": 0.1753, "step": 4846 }, { "epoch": 0.3452648074936781, "grad_norm": 3.394461154937744, "learning_rate": 6.737757890630593e-06, "loss": 0.3417, "step": 4847 }, { "epoch": 0.3453360401752324, "grad_norm": 2.6882078647613525, "learning_rate": 6.734045059457658e-06, "loss": 0.6623, "step": 4848 }, { "epoch": 0.3454072728567867, "grad_norm": 2.342116355895996, "learning_rate": 6.7303327321879375e-06, "loss": 0.2812, "step": 4849 }, { "epoch": 0.345478505538341, "grad_norm": 3.7741892337799072, "learning_rate": 6.7266209093942104e-06, "loss": 0.4175, "step": 4850 }, { "epoch": 0.34554973821989526, "grad_norm": 2.045198678970337, "learning_rate": 6.722909591649163e-06, "loss": 0.2299, "step": 4851 }, { "epoch": 0.3456209709014496, "grad_norm": 2.545976161956787, "learning_rate": 6.7191987795254195e-06, "loss": 0.1207, "step": 4852 }, { "epoch": 0.3456922035830039, "grad_norm": 2.971576452255249, "learning_rate": 6.715488473595522e-06, "loss": 0.2739, "step": 4853 }, { "epoch": 0.3457634362645582, "grad_norm": 2.9075963497161865, "learning_rate": 6.7117786744319235e-06, "loss": 0.5607, "step": 4854 }, { "epoch": 0.3458346689461125, "grad_norm": 4.331082820892334, "learning_rate": 6.708069382607015e-06, "loss": 0.3487, "step": 4855 }, { "epoch": 0.34590590162766677, "grad_norm": 4.891874313354492, "learning_rate": 6.704360598693103e-06, "loss": 0.7439, "step": 4856 }, { "epoch": 0.34597713430922106, "grad_norm": 4.84330940246582, "learning_rate": 6.700652323262409e-06, "loss": 0.7868, "step": 4857 }, { "epoch": 0.34604836699077535, "grad_norm": 4.130517482757568, "learning_rate": 6.696944556887086e-06, "loss": 0.8271, "step": 4858 }, { "epoch": 0.3461195996723297, "grad_norm": 1.6717966794967651, "learning_rate": 6.693237300139201e-06, "loss": 0.2593, "step": 4859 }, { "epoch": 0.346190832353884, "grad_norm": 2.7709686756134033, "learning_rate": 6.6895305535907515e-06, "loss": 0.2399, "step": 4860 }, { "epoch": 0.34626206503543827, "grad_norm": 2.3098409175872803, "learning_rate": 6.6858243178136425e-06, "loss": 0.5491, "step": 4861 }, { "epoch": 0.34633329771699256, "grad_norm": 2.548084259033203, "learning_rate": 6.682118593379713e-06, "loss": 0.4264, "step": 4862 }, { "epoch": 0.34640453039854685, "grad_norm": 6.9492902755737305, "learning_rate": 6.67841338086072e-06, "loss": 0.7408, "step": 4863 }, { "epoch": 0.34647576308010114, "grad_norm": 2.9930858612060547, "learning_rate": 6.674708680828332e-06, "loss": 0.6781, "step": 4864 }, { "epoch": 0.34654699576165543, "grad_norm": 6.3806352615356445, "learning_rate": 6.671004493854154e-06, "loss": 0.3448, "step": 4865 }, { "epoch": 0.3466182284432097, "grad_norm": 3.4364588260650635, "learning_rate": 6.6673008205097e-06, "loss": 0.6224, "step": 4866 }, { "epoch": 0.34668946112476406, "grad_norm": 2.871328353881836, "learning_rate": 6.66359766136641e-06, "loss": 0.5827, "step": 4867 }, { "epoch": 0.34676069380631835, "grad_norm": 4.184079647064209, "learning_rate": 6.659895016995639e-06, "loss": 0.2079, "step": 4868 }, { "epoch": 0.34683192648787264, "grad_norm": 2.856982469558716, "learning_rate": 6.656192887968675e-06, "loss": 0.6883, "step": 4869 }, { "epoch": 0.34690315916942693, "grad_norm": 3.907534122467041, "learning_rate": 6.652491274856711e-06, "loss": 0.651, "step": 4870 }, { "epoch": 0.3469743918509812, "grad_norm": 2.6139230728149414, "learning_rate": 6.6487901782308685e-06, "loss": 0.2549, "step": 4871 }, { "epoch": 0.3470456245325355, "grad_norm": 3.611663579940796, "learning_rate": 6.645089598662197e-06, "loss": 0.572, "step": 4872 }, { "epoch": 0.3471168572140898, "grad_norm": 2.354820966720581, "learning_rate": 6.641389536721646e-06, "loss": 0.1621, "step": 4873 }, { "epoch": 0.34718808989564415, "grad_norm": 1.9360687732696533, "learning_rate": 6.637689992980105e-06, "loss": 0.16, "step": 4874 }, { "epoch": 0.34725932257719844, "grad_norm": 2.9454941749572754, "learning_rate": 6.633990968008374e-06, "loss": 0.196, "step": 4875 }, { "epoch": 0.3473305552587527, "grad_norm": 2.488171100616455, "learning_rate": 6.630292462377172e-06, "loss": 0.471, "step": 4876 }, { "epoch": 0.347401787940307, "grad_norm": 1.4446258544921875, "learning_rate": 6.62659447665714e-06, "loss": 0.0585, "step": 4877 }, { "epoch": 0.3474730206218613, "grad_norm": 2.420793056488037, "learning_rate": 6.622897011418845e-06, "loss": 0.3122, "step": 4878 }, { "epoch": 0.3475442533034156, "grad_norm": 5.568025588989258, "learning_rate": 6.619200067232758e-06, "loss": 0.8038, "step": 4879 }, { "epoch": 0.3476154859849699, "grad_norm": 2.899669885635376, "learning_rate": 6.6155036446692895e-06, "loss": 0.5081, "step": 4880 }, { "epoch": 0.3476867186665242, "grad_norm": 2.1028101444244385, "learning_rate": 6.6118077442987545e-06, "loss": 0.1814, "step": 4881 }, { "epoch": 0.3477579513480785, "grad_norm": 5.894082069396973, "learning_rate": 6.608112366691393e-06, "loss": 0.708, "step": 4882 }, { "epoch": 0.3478291840296328, "grad_norm": 2.637317180633545, "learning_rate": 6.604417512417362e-06, "loss": 0.2134, "step": 4883 }, { "epoch": 0.3479004167111871, "grad_norm": 2.6783599853515625, "learning_rate": 6.600723182046744e-06, "loss": 0.465, "step": 4884 }, { "epoch": 0.3479716493927414, "grad_norm": 4.516765117645264, "learning_rate": 6.5970293761495305e-06, "loss": 0.6863, "step": 4885 }, { "epoch": 0.3480428820742957, "grad_norm": 3.7485268115997314, "learning_rate": 6.593336095295639e-06, "loss": 0.7217, "step": 4886 }, { "epoch": 0.34811411475584997, "grad_norm": 2.700948715209961, "learning_rate": 6.589643340054911e-06, "loss": 0.6704, "step": 4887 }, { "epoch": 0.34818534743740426, "grad_norm": 5.567882061004639, "learning_rate": 6.585951110997092e-06, "loss": 0.5483, "step": 4888 }, { "epoch": 0.3482565801189586, "grad_norm": 4.203105449676514, "learning_rate": 6.58225940869186e-06, "loss": 0.3753, "step": 4889 }, { "epoch": 0.3483278128005129, "grad_norm": 3.416846752166748, "learning_rate": 6.5785682337088085e-06, "loss": 0.6087, "step": 4890 }, { "epoch": 0.3483990454820672, "grad_norm": 2.2568891048431396, "learning_rate": 6.574877586617439e-06, "loss": 0.1811, "step": 4891 }, { "epoch": 0.34847027816362147, "grad_norm": 1.501835823059082, "learning_rate": 6.571187467987187e-06, "loss": 0.2242, "step": 4892 }, { "epoch": 0.34854151084517576, "grad_norm": 4.210564613342285, "learning_rate": 6.567497878387402e-06, "loss": 0.3943, "step": 4893 }, { "epoch": 0.34861274352673005, "grad_norm": 2.154968500137329, "learning_rate": 6.563808818387342e-06, "loss": 0.2769, "step": 4894 }, { "epoch": 0.34868397620828434, "grad_norm": 3.29455828666687, "learning_rate": 6.560120288556197e-06, "loss": 0.4375, "step": 4895 }, { "epoch": 0.34875520888983863, "grad_norm": 3.2181155681610107, "learning_rate": 6.5564322894630705e-06, "loss": 0.6408, "step": 4896 }, { "epoch": 0.348826441571393, "grad_norm": 1.9203405380249023, "learning_rate": 6.552744821676978e-06, "loss": 0.1255, "step": 4897 }, { "epoch": 0.34889767425294727, "grad_norm": 4.009979724884033, "learning_rate": 6.549057885766859e-06, "loss": 0.5405, "step": 4898 }, { "epoch": 0.34896890693450155, "grad_norm": 5.469440460205078, "learning_rate": 6.545371482301568e-06, "loss": 0.9117, "step": 4899 }, { "epoch": 0.34904013961605584, "grad_norm": 1.86632239818573, "learning_rate": 6.5416856118498874e-06, "loss": 0.0533, "step": 4900 }, { "epoch": 0.34911137229761013, "grad_norm": 2.4831655025482178, "learning_rate": 6.538000274980498e-06, "loss": 0.2106, "step": 4901 }, { "epoch": 0.3491826049791644, "grad_norm": 2.7112908363342285, "learning_rate": 6.5343154722620174e-06, "loss": 0.1925, "step": 4902 }, { "epoch": 0.3492538376607187, "grad_norm": 2.3886537551879883, "learning_rate": 6.53063120426297e-06, "loss": 0.3178, "step": 4903 }, { "epoch": 0.34932507034227306, "grad_norm": 2.8526394367218018, "learning_rate": 6.526947471551799e-06, "loss": 0.4598, "step": 4904 }, { "epoch": 0.34939630302382735, "grad_norm": 2.6318199634552, "learning_rate": 6.5232642746968655e-06, "loss": 0.5237, "step": 4905 }, { "epoch": 0.34946753570538164, "grad_norm": 3.1273269653320312, "learning_rate": 6.519581614266456e-06, "loss": 0.4734, "step": 4906 }, { "epoch": 0.3495387683869359, "grad_norm": 3.2111191749572754, "learning_rate": 6.515899490828758e-06, "loss": 0.6437, "step": 4907 }, { "epoch": 0.3496100010684902, "grad_norm": 2.1019387245178223, "learning_rate": 6.512217904951889e-06, "loss": 0.2826, "step": 4908 }, { "epoch": 0.3496812337500445, "grad_norm": 3.0514652729034424, "learning_rate": 6.508536857203884e-06, "loss": 0.3563, "step": 4909 }, { "epoch": 0.3497524664315988, "grad_norm": 3.1299827098846436, "learning_rate": 6.504856348152682e-06, "loss": 0.6711, "step": 4910 }, { "epoch": 0.34982369911315314, "grad_norm": 3.3095428943634033, "learning_rate": 6.5011763783661564e-06, "loss": 0.3252, "step": 4911 }, { "epoch": 0.34989493179470743, "grad_norm": 4.09190034866333, "learning_rate": 6.497496948412085e-06, "loss": 0.685, "step": 4912 }, { "epoch": 0.3499661644762617, "grad_norm": 1.902666687965393, "learning_rate": 6.493818058858161e-06, "loss": 0.1539, "step": 4913 }, { "epoch": 0.350037397157816, "grad_norm": 4.025412082672119, "learning_rate": 6.490139710272005e-06, "loss": 0.7091, "step": 4914 }, { "epoch": 0.3501086298393703, "grad_norm": 4.2406439781188965, "learning_rate": 6.486461903221153e-06, "loss": 0.7841, "step": 4915 }, { "epoch": 0.3501798625209246, "grad_norm": 6.561128616333008, "learning_rate": 6.482784638273041e-06, "loss": 0.2683, "step": 4916 }, { "epoch": 0.3502510952024789, "grad_norm": 3.6483399868011475, "learning_rate": 6.479107915995038e-06, "loss": 0.3507, "step": 4917 }, { "epoch": 0.35032232788403317, "grad_norm": 2.7682273387908936, "learning_rate": 6.475431736954431e-06, "loss": 0.2136, "step": 4918 }, { "epoch": 0.3503935605655875, "grad_norm": 3.4236812591552734, "learning_rate": 6.471756101718408e-06, "loss": 0.6024, "step": 4919 }, { "epoch": 0.3504647932471418, "grad_norm": 2.8326056003570557, "learning_rate": 6.468081010854084e-06, "loss": 0.6347, "step": 4920 }, { "epoch": 0.3505360259286961, "grad_norm": 2.528933048248291, "learning_rate": 6.46440646492849e-06, "loss": 0.8224, "step": 4921 }, { "epoch": 0.3506072586102504, "grad_norm": 4.727286338806152, "learning_rate": 6.460732464508567e-06, "loss": 0.5849, "step": 4922 }, { "epoch": 0.3506784912918047, "grad_norm": 2.0962390899658203, "learning_rate": 6.4570590101611765e-06, "loss": 0.2386, "step": 4923 }, { "epoch": 0.35074972397335896, "grad_norm": 4.197616100311279, "learning_rate": 6.453386102453099e-06, "loss": 0.6308, "step": 4924 }, { "epoch": 0.35082095665491325, "grad_norm": 4.995429992675781, "learning_rate": 6.449713741951021e-06, "loss": 0.7515, "step": 4925 }, { "epoch": 0.3508921893364676, "grad_norm": 2.714306116104126, "learning_rate": 6.446041929221551e-06, "loss": 0.3117, "step": 4926 }, { "epoch": 0.3509634220180219, "grad_norm": 3.7896111011505127, "learning_rate": 6.442370664831214e-06, "loss": 0.8929, "step": 4927 }, { "epoch": 0.3510346546995762, "grad_norm": 3.3625729084014893, "learning_rate": 6.438699949346446e-06, "loss": 0.5496, "step": 4928 }, { "epoch": 0.35110588738113047, "grad_norm": 5.24247407913208, "learning_rate": 6.435029783333599e-06, "loss": 0.2365, "step": 4929 }, { "epoch": 0.35117712006268476, "grad_norm": 3.3277909755706787, "learning_rate": 6.431360167358951e-06, "loss": 0.8642, "step": 4930 }, { "epoch": 0.35124835274423905, "grad_norm": 3.726087808609009, "learning_rate": 6.427691101988673e-06, "loss": 0.6221, "step": 4931 }, { "epoch": 0.35131958542579333, "grad_norm": 3.4819467067718506, "learning_rate": 6.424022587788872e-06, "loss": 0.369, "step": 4932 }, { "epoch": 0.3513908181073476, "grad_norm": 5.185949802398682, "learning_rate": 6.4203546253255635e-06, "loss": 0.4882, "step": 4933 }, { "epoch": 0.35146205078890197, "grad_norm": 2.5879862308502197, "learning_rate": 6.416687215164671e-06, "loss": 0.4588, "step": 4934 }, { "epoch": 0.35153328347045626, "grad_norm": 2.7873852252960205, "learning_rate": 6.413020357872038e-06, "loss": 0.255, "step": 4935 }, { "epoch": 0.35160451615201055, "grad_norm": 5.153915882110596, "learning_rate": 6.409354054013425e-06, "loss": 0.697, "step": 4936 }, { "epoch": 0.35167574883356484, "grad_norm": 3.2566888332366943, "learning_rate": 6.405688304154509e-06, "loss": 0.3414, "step": 4937 }, { "epoch": 0.35174698151511913, "grad_norm": 2.2290549278259277, "learning_rate": 6.4020231088608695e-06, "loss": 0.4382, "step": 4938 }, { "epoch": 0.3518182141966734, "grad_norm": 3.1046807765960693, "learning_rate": 6.398358468698013e-06, "loss": 0.3978, "step": 4939 }, { "epoch": 0.3518894468782277, "grad_norm": 1.6713565587997437, "learning_rate": 6.394694384231358e-06, "loss": 0.2021, "step": 4940 }, { "epoch": 0.35196067955978205, "grad_norm": 2.1259443759918213, "learning_rate": 6.3910308560262305e-06, "loss": 0.4378, "step": 4941 }, { "epoch": 0.35203191224133634, "grad_norm": 3.006228446960449, "learning_rate": 6.387367884647875e-06, "loss": 0.3842, "step": 4942 }, { "epoch": 0.35210314492289063, "grad_norm": 2.852097272872925, "learning_rate": 6.383705470661456e-06, "loss": 0.2442, "step": 4943 }, { "epoch": 0.3521743776044449, "grad_norm": 2.7902326583862305, "learning_rate": 6.380043614632037e-06, "loss": 0.5378, "step": 4944 }, { "epoch": 0.3522456102859992, "grad_norm": 3.7820193767547607, "learning_rate": 6.376382317124612e-06, "loss": 0.6053, "step": 4945 }, { "epoch": 0.3523168429675535, "grad_norm": 3.2754032611846924, "learning_rate": 6.372721578704082e-06, "loss": 0.632, "step": 4946 }, { "epoch": 0.3523880756491078, "grad_norm": 3.046107292175293, "learning_rate": 6.369061399935255e-06, "loss": 0.3955, "step": 4947 }, { "epoch": 0.35245930833066214, "grad_norm": 5.519771099090576, "learning_rate": 6.365401781382865e-06, "loss": 0.3953, "step": 4948 }, { "epoch": 0.3525305410122164, "grad_norm": 3.903052568435669, "learning_rate": 6.361742723611551e-06, "loss": 0.4848, "step": 4949 }, { "epoch": 0.3526017736937707, "grad_norm": 2.805708169937134, "learning_rate": 6.358084227185866e-06, "loss": 0.5111, "step": 4950 }, { "epoch": 0.352673006375325, "grad_norm": 2.670527219772339, "learning_rate": 6.354426292670279e-06, "loss": 0.5153, "step": 4951 }, { "epoch": 0.3527442390568793, "grad_norm": 3.328160047531128, "learning_rate": 6.350768920629179e-06, "loss": 0.3756, "step": 4952 }, { "epoch": 0.3528154717384336, "grad_norm": 4.490268707275391, "learning_rate": 6.3471121116268494e-06, "loss": 0.2544, "step": 4953 }, { "epoch": 0.3528867044199879, "grad_norm": 2.6095545291900635, "learning_rate": 6.343455866227504e-06, "loss": 0.524, "step": 4954 }, { "epoch": 0.35295793710154216, "grad_norm": 4.65035343170166, "learning_rate": 6.339800184995266e-06, "loss": 0.4576, "step": 4955 }, { "epoch": 0.3530291697830965, "grad_norm": 1.8903993368148804, "learning_rate": 6.3361450684941664e-06, "loss": 0.1661, "step": 4956 }, { "epoch": 0.3531004024646508, "grad_norm": 3.325025796890259, "learning_rate": 6.332490517288148e-06, "loss": 0.4737, "step": 4957 }, { "epoch": 0.3531716351462051, "grad_norm": 2.6033575534820557, "learning_rate": 6.328836531941081e-06, "loss": 0.51, "step": 4958 }, { "epoch": 0.3532428678277594, "grad_norm": 3.282172918319702, "learning_rate": 6.3251831130167264e-06, "loss": 0.5928, "step": 4959 }, { "epoch": 0.35331410050931367, "grad_norm": 3.89958119392395, "learning_rate": 6.321530261078774e-06, "loss": 0.6234, "step": 4960 }, { "epoch": 0.35338533319086796, "grad_norm": 1.8258476257324219, "learning_rate": 6.317877976690826e-06, "loss": 0.0981, "step": 4961 }, { "epoch": 0.35345656587242225, "grad_norm": 2.5995824337005615, "learning_rate": 6.314226260416383e-06, "loss": 0.2556, "step": 4962 }, { "epoch": 0.3535277985539766, "grad_norm": 2.511730909347534, "learning_rate": 6.3105751128188756e-06, "loss": 0.4793, "step": 4963 }, { "epoch": 0.3535990312355309, "grad_norm": 2.7253923416137695, "learning_rate": 6.306924534461633e-06, "loss": 0.1428, "step": 4964 }, { "epoch": 0.35367026391708517, "grad_norm": 2.593585252761841, "learning_rate": 6.303274525907903e-06, "loss": 0.4354, "step": 4965 }, { "epoch": 0.35374149659863946, "grad_norm": 2.6137259006500244, "learning_rate": 6.299625087720844e-06, "loss": 0.3361, "step": 4966 }, { "epoch": 0.35381272928019375, "grad_norm": 2.1749002933502197, "learning_rate": 6.295976220463531e-06, "loss": 0.4978, "step": 4967 }, { "epoch": 0.35388396196174804, "grad_norm": 3.1679799556732178, "learning_rate": 6.2923279246989385e-06, "loss": 0.5973, "step": 4968 }, { "epoch": 0.35395519464330233, "grad_norm": 1.6930060386657715, "learning_rate": 6.288680200989967e-06, "loss": 0.1907, "step": 4969 }, { "epoch": 0.3540264273248566, "grad_norm": 3.9466800689697266, "learning_rate": 6.2850330498994235e-06, "loss": 0.7003, "step": 4970 }, { "epoch": 0.35409766000641096, "grad_norm": 4.798286437988281, "learning_rate": 6.281386471990021e-06, "loss": 0.3576, "step": 4971 }, { "epoch": 0.35416889268796525, "grad_norm": 4.012714385986328, "learning_rate": 6.277740467824394e-06, "loss": 0.6653, "step": 4972 }, { "epoch": 0.35424012536951954, "grad_norm": 2.4088292121887207, "learning_rate": 6.2740950379650775e-06, "loss": 0.5054, "step": 4973 }, { "epoch": 0.35431135805107383, "grad_norm": 2.8004422187805176, "learning_rate": 6.270450182974532e-06, "loss": 0.3993, "step": 4974 }, { "epoch": 0.3543825907326281, "grad_norm": 2.834042549133301, "learning_rate": 6.266805903415112e-06, "loss": 0.1791, "step": 4975 }, { "epoch": 0.3544538234141824, "grad_norm": 3.7773311138153076, "learning_rate": 6.2631621998490965e-06, "loss": 0.7879, "step": 4976 }, { "epoch": 0.3545250560957367, "grad_norm": 4.728562831878662, "learning_rate": 6.259519072838676e-06, "loss": 0.767, "step": 4977 }, { "epoch": 0.35459628877729105, "grad_norm": 3.296926259994507, "learning_rate": 6.255876522945941e-06, "loss": 0.3819, "step": 4978 }, { "epoch": 0.35466752145884534, "grad_norm": 3.1713685989379883, "learning_rate": 6.2522345507329e-06, "loss": 0.7598, "step": 4979 }, { "epoch": 0.3547387541403996, "grad_norm": 1.8790316581726074, "learning_rate": 6.248593156761477e-06, "loss": 0.1259, "step": 4980 }, { "epoch": 0.3548099868219539, "grad_norm": 5.50286865234375, "learning_rate": 6.244952341593493e-06, "loss": 0.4429, "step": 4981 }, { "epoch": 0.3548812195035082, "grad_norm": 3.676938772201538, "learning_rate": 6.2413121057906934e-06, "loss": 0.9434, "step": 4982 }, { "epoch": 0.3549524521850625, "grad_norm": 2.3141655921936035, "learning_rate": 6.237672449914734e-06, "loss": 0.4006, "step": 4983 }, { "epoch": 0.3550236848666168, "grad_norm": 4.797348976135254, "learning_rate": 6.234033374527166e-06, "loss": 0.6175, "step": 4984 }, { "epoch": 0.3550949175481711, "grad_norm": 3.3693203926086426, "learning_rate": 6.230394880189468e-06, "loss": 0.4675, "step": 4985 }, { "epoch": 0.3551661502297254, "grad_norm": 3.4799957275390625, "learning_rate": 6.226756967463023e-06, "loss": 0.79, "step": 4986 }, { "epoch": 0.3552373829112797, "grad_norm": 3.1214773654937744, "learning_rate": 6.223119636909118e-06, "loss": 0.5433, "step": 4987 }, { "epoch": 0.355308615592834, "grad_norm": 3.268890142440796, "learning_rate": 6.219482889088959e-06, "loss": 0.816, "step": 4988 }, { "epoch": 0.3553798482743883, "grad_norm": 2.7500734329223633, "learning_rate": 6.215846724563661e-06, "loss": 0.7197, "step": 4989 }, { "epoch": 0.3554510809559426, "grad_norm": 2.284510850906372, "learning_rate": 6.21221114389424e-06, "loss": 0.2395, "step": 4990 }, { "epoch": 0.35552231363749687, "grad_norm": 2.8178398609161377, "learning_rate": 6.208576147641634e-06, "loss": 0.3524, "step": 4991 }, { "epoch": 0.35559354631905116, "grad_norm": 2.743208408355713, "learning_rate": 6.204941736366688e-06, "loss": 0.3602, "step": 4992 }, { "epoch": 0.3556647790006055, "grad_norm": 4.3017578125, "learning_rate": 6.2013079106301454e-06, "loss": 0.8706, "step": 4993 }, { "epoch": 0.3557360116821598, "grad_norm": 3.1211514472961426, "learning_rate": 6.1976746709926775e-06, "loss": 0.6856, "step": 4994 }, { "epoch": 0.3558072443637141, "grad_norm": 4.545780658721924, "learning_rate": 6.194042018014852e-06, "loss": 0.6716, "step": 4995 }, { "epoch": 0.35587847704526837, "grad_norm": 2.1025888919830322, "learning_rate": 6.1904099522571445e-06, "loss": 0.3117, "step": 4996 }, { "epoch": 0.35594970972682266, "grad_norm": 2.353506326675415, "learning_rate": 6.186778474279951e-06, "loss": 0.1946, "step": 4997 }, { "epoch": 0.35602094240837695, "grad_norm": 3.7508530616760254, "learning_rate": 6.183147584643575e-06, "loss": 0.8869, "step": 4998 }, { "epoch": 0.35609217508993124, "grad_norm": 1.8804210424423218, "learning_rate": 6.179517283908217e-06, "loss": 0.3292, "step": 4999 }, { "epoch": 0.3561634077714856, "grad_norm": 2.5393002033233643, "learning_rate": 6.175887572633998e-06, "loss": 0.5306, "step": 5000 }, { "epoch": 0.3562346404530399, "grad_norm": 3.170372724533081, "learning_rate": 6.172258451380949e-06, "loss": 0.5594, "step": 5001 }, { "epoch": 0.35630587313459416, "grad_norm": 1.7196346521377563, "learning_rate": 6.168629920709002e-06, "loss": 0.2016, "step": 5002 }, { "epoch": 0.35637710581614845, "grad_norm": 2.8047094345092773, "learning_rate": 6.165001981178e-06, "loss": 0.7683, "step": 5003 }, { "epoch": 0.35644833849770274, "grad_norm": 2.8255302906036377, "learning_rate": 6.161374633347703e-06, "loss": 0.3927, "step": 5004 }, { "epoch": 0.35651957117925703, "grad_norm": 5.569559097290039, "learning_rate": 6.157747877777766e-06, "loss": 0.0915, "step": 5005 }, { "epoch": 0.3565908038608113, "grad_norm": 1.8585257530212402, "learning_rate": 6.154121715027765e-06, "loss": 0.2722, "step": 5006 }, { "epoch": 0.3566620365423656, "grad_norm": 5.137579441070557, "learning_rate": 6.150496145657183e-06, "loss": 0.3584, "step": 5007 }, { "epoch": 0.35673326922391996, "grad_norm": 2.568105936050415, "learning_rate": 6.146871170225398e-06, "loss": 0.7545, "step": 5008 }, { "epoch": 0.35680450190547425, "grad_norm": 2.023817300796509, "learning_rate": 6.143246789291715e-06, "loss": 0.5163, "step": 5009 }, { "epoch": 0.35687573458702854, "grad_norm": 3.6654419898986816, "learning_rate": 6.139623003415336e-06, "loss": 0.3677, "step": 5010 }, { "epoch": 0.3569469672685828, "grad_norm": 4.065236568450928, "learning_rate": 6.135999813155371e-06, "loss": 0.4701, "step": 5011 }, { "epoch": 0.3570181999501371, "grad_norm": 2.170534372329712, "learning_rate": 6.132377219070842e-06, "loss": 0.1979, "step": 5012 }, { "epoch": 0.3570894326316914, "grad_norm": 1.7756844758987427, "learning_rate": 6.128755221720682e-06, "loss": 0.3072, "step": 5013 }, { "epoch": 0.3571606653132457, "grad_norm": 6.470966815948486, "learning_rate": 6.1251338216637255e-06, "loss": 0.7425, "step": 5014 }, { "epoch": 0.35723189799480004, "grad_norm": 2.656968832015991, "learning_rate": 6.121513019458715e-06, "loss": 0.5155, "step": 5015 }, { "epoch": 0.35730313067635433, "grad_norm": 4.61340856552124, "learning_rate": 6.117892815664306e-06, "loss": 0.6815, "step": 5016 }, { "epoch": 0.3573743633579086, "grad_norm": 2.2831223011016846, "learning_rate": 6.11427321083906e-06, "loss": 0.2628, "step": 5017 }, { "epoch": 0.3574455960394629, "grad_norm": 3.0504324436187744, "learning_rate": 6.110654205541438e-06, "loss": 0.6716, "step": 5018 }, { "epoch": 0.3575168287210172, "grad_norm": 5.5111083984375, "learning_rate": 6.1070358003298215e-06, "loss": 0.4622, "step": 5019 }, { "epoch": 0.3575880614025715, "grad_norm": 1.9373031854629517, "learning_rate": 6.103417995762493e-06, "loss": 0.4711, "step": 5020 }, { "epoch": 0.3576592940841258, "grad_norm": 2.752192735671997, "learning_rate": 6.099800792397636e-06, "loss": 0.5852, "step": 5021 }, { "epoch": 0.35773052676568007, "grad_norm": 3.8595123291015625, "learning_rate": 6.096184190793357e-06, "loss": 0.5801, "step": 5022 }, { "epoch": 0.3578017594472344, "grad_norm": 4.104828357696533, "learning_rate": 6.092568191507655e-06, "loss": 0.4988, "step": 5023 }, { "epoch": 0.3578729921287887, "grad_norm": 1.9905734062194824, "learning_rate": 6.088952795098442e-06, "loss": 0.2007, "step": 5024 }, { "epoch": 0.357944224810343, "grad_norm": 3.577303171157837, "learning_rate": 6.085338002123534e-06, "loss": 0.4485, "step": 5025 }, { "epoch": 0.3580154574918973, "grad_norm": 2.729759454727173, "learning_rate": 6.081723813140664e-06, "loss": 0.536, "step": 5026 }, { "epoch": 0.35808669017345157, "grad_norm": 4.737212657928467, "learning_rate": 6.078110228707454e-06, "loss": 0.6961, "step": 5027 }, { "epoch": 0.35815792285500586, "grad_norm": 3.966489315032959, "learning_rate": 6.07449724938145e-06, "loss": 0.8147, "step": 5028 }, { "epoch": 0.35822915553656015, "grad_norm": 3.8794734477996826, "learning_rate": 6.0708848757200975e-06, "loss": 0.5151, "step": 5029 }, { "epoch": 0.3583003882181145, "grad_norm": 2.377228021621704, "learning_rate": 6.067273108280745e-06, "loss": 0.4824, "step": 5030 }, { "epoch": 0.3583716208996688, "grad_norm": 2.587641954421997, "learning_rate": 6.0636619476206534e-06, "loss": 0.4795, "step": 5031 }, { "epoch": 0.3584428535812231, "grad_norm": 3.141328811645508, "learning_rate": 6.060051394296989e-06, "loss": 1.003, "step": 5032 }, { "epoch": 0.35851408626277737, "grad_norm": 3.860771417617798, "learning_rate": 6.056441448866817e-06, "loss": 0.5971, "step": 5033 }, { "epoch": 0.35858531894433165, "grad_norm": 1.7530887126922607, "learning_rate": 6.052832111887117e-06, "loss": 0.141, "step": 5034 }, { "epoch": 0.35865655162588594, "grad_norm": 4.015860557556152, "learning_rate": 6.04922338391478e-06, "loss": 0.3124, "step": 5035 }, { "epoch": 0.35872778430744023, "grad_norm": 4.108837127685547, "learning_rate": 6.045615265506585e-06, "loss": 0.6174, "step": 5036 }, { "epoch": 0.3587990169889945, "grad_norm": 4.915424823760986, "learning_rate": 6.0420077572192325e-06, "loss": 0.6172, "step": 5037 }, { "epoch": 0.35887024967054887, "grad_norm": 10.135419845581055, "learning_rate": 6.038400859609327e-06, "loss": 0.3152, "step": 5038 }, { "epoch": 0.35894148235210316, "grad_norm": 3.2003345489501953, "learning_rate": 6.034794573233371e-06, "loss": 0.4493, "step": 5039 }, { "epoch": 0.35901271503365745, "grad_norm": 3.138489246368408, "learning_rate": 6.031188898647776e-06, "loss": 0.5534, "step": 5040 }, { "epoch": 0.35908394771521174, "grad_norm": 2.1118359565734863, "learning_rate": 6.027583836408868e-06, "loss": 0.316, "step": 5041 }, { "epoch": 0.359155180396766, "grad_norm": 2.736588954925537, "learning_rate": 6.023979387072861e-06, "loss": 0.3571, "step": 5042 }, { "epoch": 0.3592264130783203, "grad_norm": 3.48360538482666, "learning_rate": 6.020375551195891e-06, "loss": 0.3475, "step": 5043 }, { "epoch": 0.3592976457598746, "grad_norm": 3.654322862625122, "learning_rate": 6.016772329333993e-06, "loss": 0.8132, "step": 5044 }, { "epoch": 0.35936887844142895, "grad_norm": 1.9628994464874268, "learning_rate": 6.013169722043104e-06, "loss": 0.2083, "step": 5045 }, { "epoch": 0.35944011112298324, "grad_norm": 3.6618309020996094, "learning_rate": 6.009567729879071e-06, "loss": 0.9074, "step": 5046 }, { "epoch": 0.35951134380453753, "grad_norm": 2.680354356765747, "learning_rate": 6.005966353397643e-06, "loss": 0.7771, "step": 5047 }, { "epoch": 0.3595825764860918, "grad_norm": 4.100203990936279, "learning_rate": 6.002365593154478e-06, "loss": 0.5298, "step": 5048 }, { "epoch": 0.3596538091676461, "grad_norm": 6.864127159118652, "learning_rate": 5.998765449705131e-06, "loss": 0.2657, "step": 5049 }, { "epoch": 0.3597250418492004, "grad_norm": 2.1657145023345947, "learning_rate": 5.9951659236050695e-06, "loss": 0.387, "step": 5050 }, { "epoch": 0.3597962745307547, "grad_norm": 3.7088398933410645, "learning_rate": 5.99156701540967e-06, "loss": 0.6678, "step": 5051 }, { "epoch": 0.35986750721230903, "grad_norm": 2.5352044105529785, "learning_rate": 5.987968725674196e-06, "loss": 0.3917, "step": 5052 }, { "epoch": 0.3599387398938633, "grad_norm": 3.1292169094085693, "learning_rate": 5.9843710549538346e-06, "loss": 0.4832, "step": 5053 }, { "epoch": 0.3600099725754176, "grad_norm": 3.595350503921509, "learning_rate": 5.980774003803668e-06, "loss": 0.6271, "step": 5054 }, { "epoch": 0.3600812052569719, "grad_norm": 3.6907522678375244, "learning_rate": 5.977177572778679e-06, "loss": 0.3946, "step": 5055 }, { "epoch": 0.3601524379385262, "grad_norm": 3.1163337230682373, "learning_rate": 5.973581762433763e-06, "loss": 0.4946, "step": 5056 }, { "epoch": 0.3602236706200805, "grad_norm": 3.5774126052856445, "learning_rate": 5.969986573323721e-06, "loss": 0.7904, "step": 5057 }, { "epoch": 0.3602949033016348, "grad_norm": 3.0254621505737305, "learning_rate": 5.966392006003245e-06, "loss": 0.5367, "step": 5058 }, { "epoch": 0.36036613598318906, "grad_norm": 3.366608142852783, "learning_rate": 5.9627980610269445e-06, "loss": 0.6734, "step": 5059 }, { "epoch": 0.3604373686647434, "grad_norm": 3.498688220977783, "learning_rate": 5.959204738949334e-06, "loss": 0.6919, "step": 5060 }, { "epoch": 0.3605086013462977, "grad_norm": 3.4901282787323, "learning_rate": 5.955612040324815e-06, "loss": 0.5543, "step": 5061 }, { "epoch": 0.360579834027852, "grad_norm": 5.704268932342529, "learning_rate": 5.952019965707709e-06, "loss": 0.2227, "step": 5062 }, { "epoch": 0.3606510667094063, "grad_norm": 4.989097595214844, "learning_rate": 5.948428515652241e-06, "loss": 0.71, "step": 5063 }, { "epoch": 0.36072229939096057, "grad_norm": 4.233643531799316, "learning_rate": 5.944837690712524e-06, "loss": 0.6295, "step": 5064 }, { "epoch": 0.36079353207251486, "grad_norm": 2.54221510887146, "learning_rate": 5.941247491442592e-06, "loss": 0.3868, "step": 5065 }, { "epoch": 0.36086476475406914, "grad_norm": 6.523016452789307, "learning_rate": 5.9376579183963775e-06, "loss": 0.3374, "step": 5066 }, { "epoch": 0.3609359974356235, "grad_norm": 2.8802051544189453, "learning_rate": 5.9340689721277116e-06, "loss": 0.6337, "step": 5067 }, { "epoch": 0.3610072301171778, "grad_norm": 3.0962767601013184, "learning_rate": 5.930480653190331e-06, "loss": 0.3268, "step": 5068 }, { "epoch": 0.36107846279873207, "grad_norm": 3.375247001647949, "learning_rate": 5.9268929621378805e-06, "loss": 0.4944, "step": 5069 }, { "epoch": 0.36114969548028636, "grad_norm": 4.414747714996338, "learning_rate": 5.923305899523899e-06, "loss": 0.4419, "step": 5070 }, { "epoch": 0.36122092816184065, "grad_norm": 2.5014588832855225, "learning_rate": 5.919719465901834e-06, "loss": 0.3932, "step": 5071 }, { "epoch": 0.36129216084339494, "grad_norm": 2.8766348361968994, "learning_rate": 5.916133661825041e-06, "loss": 0.6227, "step": 5072 }, { "epoch": 0.36136339352494923, "grad_norm": 1.8717265129089355, "learning_rate": 5.9125484878467635e-06, "loss": 0.1281, "step": 5073 }, { "epoch": 0.3614346262065035, "grad_norm": 2.056068181991577, "learning_rate": 5.908963944520162e-06, "loss": 0.1621, "step": 5074 }, { "epoch": 0.36150585888805786, "grad_norm": 2.4367051124572754, "learning_rate": 5.9053800323982976e-06, "loss": 0.4776, "step": 5075 }, { "epoch": 0.36157709156961215, "grad_norm": 2.192336320877075, "learning_rate": 5.901796752034128e-06, "loss": 0.2336, "step": 5076 }, { "epoch": 0.36164832425116644, "grad_norm": 3.639108419418335, "learning_rate": 5.8982141039805115e-06, "loss": 0.6269, "step": 5077 }, { "epoch": 0.36171955693272073, "grad_norm": 3.1763546466827393, "learning_rate": 5.894632088790224e-06, "loss": 0.412, "step": 5078 }, { "epoch": 0.361790789614275, "grad_norm": 3.1476147174835205, "learning_rate": 5.891050707015924e-06, "loss": 0.8583, "step": 5079 }, { "epoch": 0.3618620222958293, "grad_norm": 2.5450055599212646, "learning_rate": 5.887469959210186e-06, "loss": 0.5845, "step": 5080 }, { "epoch": 0.3619332549773836, "grad_norm": 6.806457996368408, "learning_rate": 5.883889845925487e-06, "loss": 0.984, "step": 5081 }, { "epoch": 0.36200448765893795, "grad_norm": 4.39885139465332, "learning_rate": 5.880310367714192e-06, "loss": 0.1811, "step": 5082 }, { "epoch": 0.36207572034049224, "grad_norm": 3.9083189964294434, "learning_rate": 5.8767315251285854e-06, "loss": 0.6724, "step": 5083 }, { "epoch": 0.3621469530220465, "grad_norm": 1.8400399684906006, "learning_rate": 5.873153318720842e-06, "loss": 0.2643, "step": 5084 }, { "epoch": 0.3622181857036008, "grad_norm": 4.277239799499512, "learning_rate": 5.869575749043044e-06, "loss": 0.4775, "step": 5085 }, { "epoch": 0.3622894183851551, "grad_norm": 4.021451473236084, "learning_rate": 5.8659988166471715e-06, "loss": 0.4581, "step": 5086 }, { "epoch": 0.3623606510667094, "grad_norm": 3.1463193893432617, "learning_rate": 5.862422522085108e-06, "loss": 0.6196, "step": 5087 }, { "epoch": 0.3624318837482637, "grad_norm": 2.0717227458953857, "learning_rate": 5.858846865908645e-06, "loss": 0.1643, "step": 5088 }, { "epoch": 0.362503116429818, "grad_norm": 5.194045066833496, "learning_rate": 5.855271848669462e-06, "loss": 0.1643, "step": 5089 }, { "epoch": 0.3625743491113723, "grad_norm": 2.2697293758392334, "learning_rate": 5.851697470919151e-06, "loss": 0.4453, "step": 5090 }, { "epoch": 0.3626455817929266, "grad_norm": 2.0966989994049072, "learning_rate": 5.8481237332092014e-06, "loss": 0.0778, "step": 5091 }, { "epoch": 0.3627168144744809, "grad_norm": 6.715454578399658, "learning_rate": 5.844550636091004e-06, "loss": 1.0224, "step": 5092 }, { "epoch": 0.3627880471560352, "grad_norm": 2.6903114318847656, "learning_rate": 5.840978180115848e-06, "loss": 0.5281, "step": 5093 }, { "epoch": 0.3628592798375895, "grad_norm": 3.7631919384002686, "learning_rate": 5.837406365834934e-06, "loss": 0.3574, "step": 5094 }, { "epoch": 0.36293051251914377, "grad_norm": 4.7824177742004395, "learning_rate": 5.8338351937993476e-06, "loss": 0.7387, "step": 5095 }, { "epoch": 0.36300174520069806, "grad_norm": 3.380218982696533, "learning_rate": 5.830264664560087e-06, "loss": 0.8021, "step": 5096 }, { "epoch": 0.3630729778822524, "grad_norm": 3.749358892440796, "learning_rate": 5.826694778668053e-06, "loss": 0.1327, "step": 5097 }, { "epoch": 0.3631442105638067, "grad_norm": 2.631617784500122, "learning_rate": 5.823125536674032e-06, "loss": 0.4221, "step": 5098 }, { "epoch": 0.363215443245361, "grad_norm": 3.1428048610687256, "learning_rate": 5.81955693912873e-06, "loss": 0.6084, "step": 5099 }, { "epoch": 0.36328667592691527, "grad_norm": 2.052899122238159, "learning_rate": 5.815988986582745e-06, "loss": 0.2552, "step": 5100 }, { "epoch": 0.36335790860846956, "grad_norm": 3.7767717838287354, "learning_rate": 5.812421679586569e-06, "loss": 0.7691, "step": 5101 }, { "epoch": 0.36342914129002385, "grad_norm": 4.876655578613281, "learning_rate": 5.808855018690607e-06, "loss": 0.3523, "step": 5102 }, { "epoch": 0.36350037397157814, "grad_norm": 2.4513397216796875, "learning_rate": 5.805289004445155e-06, "loss": 0.3168, "step": 5103 }, { "epoch": 0.3635716066531325, "grad_norm": 3.390458822250366, "learning_rate": 5.801723637400409e-06, "loss": 0.6582, "step": 5104 }, { "epoch": 0.3636428393346868, "grad_norm": 2.0400948524475098, "learning_rate": 5.798158918106471e-06, "loss": 0.3978, "step": 5105 }, { "epoch": 0.36371407201624106, "grad_norm": 2.9527714252471924, "learning_rate": 5.7945948471133466e-06, "loss": 0.5293, "step": 5106 }, { "epoch": 0.36378530469779535, "grad_norm": 2.319704055786133, "learning_rate": 5.791031424970926e-06, "loss": 0.4979, "step": 5107 }, { "epoch": 0.36385653737934964, "grad_norm": 3.5873255729675293, "learning_rate": 5.787468652229012e-06, "loss": 0.8461, "step": 5108 }, { "epoch": 0.36392777006090393, "grad_norm": 3.6547844409942627, "learning_rate": 5.783906529437309e-06, "loss": 0.6641, "step": 5109 }, { "epoch": 0.3639990027424582, "grad_norm": 3.6282925605773926, "learning_rate": 5.7803450571454066e-06, "loss": 0.8693, "step": 5110 }, { "epoch": 0.3640702354240125, "grad_norm": 4.748791217803955, "learning_rate": 5.776784235902807e-06, "loss": 0.6015, "step": 5111 }, { "epoch": 0.36414146810556686, "grad_norm": 3.490449905395508, "learning_rate": 5.773224066258913e-06, "loss": 0.5426, "step": 5112 }, { "epoch": 0.36421270078712115, "grad_norm": 1.5949841737747192, "learning_rate": 5.769664548763016e-06, "loss": 0.2456, "step": 5113 }, { "epoch": 0.36428393346867544, "grad_norm": 2.5029327869415283, "learning_rate": 5.766105683964314e-06, "loss": 0.3353, "step": 5114 }, { "epoch": 0.3643551661502297, "grad_norm": 2.6595144271850586, "learning_rate": 5.762547472411909e-06, "loss": 0.3662, "step": 5115 }, { "epoch": 0.364426398831784, "grad_norm": 4.029576301574707, "learning_rate": 5.758989914654787e-06, "loss": 0.406, "step": 5116 }, { "epoch": 0.3644976315133383, "grad_norm": 4.70437479019165, "learning_rate": 5.755433011241851e-06, "loss": 0.4259, "step": 5117 }, { "epoch": 0.3645688641948926, "grad_norm": 4.351320743560791, "learning_rate": 5.751876762721887e-06, "loss": 0.4787, "step": 5118 }, { "epoch": 0.36464009687644694, "grad_norm": 2.286245822906494, "learning_rate": 5.748321169643596e-06, "loss": 0.3048, "step": 5119 }, { "epoch": 0.36471132955800123, "grad_norm": 1.6730796098709106, "learning_rate": 5.744766232555561e-06, "loss": 0.4181, "step": 5120 }, { "epoch": 0.3647825622395555, "grad_norm": 2.2171716690063477, "learning_rate": 5.741211952006278e-06, "loss": 0.325, "step": 5121 }, { "epoch": 0.3648537949211098, "grad_norm": 2.6701037883758545, "learning_rate": 5.737658328544131e-06, "loss": 0.2406, "step": 5122 }, { "epoch": 0.3649250276026641, "grad_norm": 2.2267675399780273, "learning_rate": 5.73410536271741e-06, "loss": 0.0864, "step": 5123 }, { "epoch": 0.3649962602842184, "grad_norm": 2.930978536605835, "learning_rate": 5.730553055074306e-06, "loss": 0.4306, "step": 5124 }, { "epoch": 0.3650674929657727, "grad_norm": 3.900667905807495, "learning_rate": 5.7270014061628935e-06, "loss": 0.4877, "step": 5125 }, { "epoch": 0.36513872564732697, "grad_norm": 1.8955894708633423, "learning_rate": 5.7234504165311626e-06, "loss": 0.2132, "step": 5126 }, { "epoch": 0.3652099583288813, "grad_norm": 2.6402740478515625, "learning_rate": 5.71990008672699e-06, "loss": 0.2662, "step": 5127 }, { "epoch": 0.3652811910104356, "grad_norm": 3.319077253341675, "learning_rate": 5.716350417298163e-06, "loss": 0.7514, "step": 5128 }, { "epoch": 0.3653524236919899, "grad_norm": 3.190946102142334, "learning_rate": 5.71280140879235e-06, "loss": 0.616, "step": 5129 }, { "epoch": 0.3654236563735442, "grad_norm": 2.6532938480377197, "learning_rate": 5.7092530617571284e-06, "loss": 0.3307, "step": 5130 }, { "epoch": 0.36549488905509847, "grad_norm": 3.5585408210754395, "learning_rate": 5.7057053767399784e-06, "loss": 0.6445, "step": 5131 }, { "epoch": 0.36556612173665276, "grad_norm": 3.298318386077881, "learning_rate": 5.702158354288265e-06, "loss": 0.7126, "step": 5132 }, { "epoch": 0.36563735441820705, "grad_norm": 2.3601877689361572, "learning_rate": 5.698611994949257e-06, "loss": 0.4722, "step": 5133 }, { "epoch": 0.3657085870997614, "grad_norm": 5.407787322998047, "learning_rate": 5.6950662992701245e-06, "loss": 0.5085, "step": 5134 }, { "epoch": 0.3657798197813157, "grad_norm": 2.5420305728912354, "learning_rate": 5.691521267797926e-06, "loss": 0.3187, "step": 5135 }, { "epoch": 0.36585105246287, "grad_norm": 2.7561185359954834, "learning_rate": 5.687976901079626e-06, "loss": 0.4282, "step": 5136 }, { "epoch": 0.36592228514442426, "grad_norm": 2.5718801021575928, "learning_rate": 5.684433199662091e-06, "loss": 0.0784, "step": 5137 }, { "epoch": 0.36599351782597855, "grad_norm": 4.457485198974609, "learning_rate": 5.680890164092065e-06, "loss": 0.6699, "step": 5138 }, { "epoch": 0.36606475050753284, "grad_norm": 2.0419631004333496, "learning_rate": 5.67734779491621e-06, "loss": 0.0507, "step": 5139 }, { "epoch": 0.36613598318908713, "grad_norm": 2.397392511367798, "learning_rate": 5.67380609268108e-06, "loss": 0.2879, "step": 5140 }, { "epoch": 0.3662072158706414, "grad_norm": 2.1281380653381348, "learning_rate": 5.670265057933114e-06, "loss": 0.2852, "step": 5141 }, { "epoch": 0.36627844855219577, "grad_norm": 2.2734503746032715, "learning_rate": 5.666724691218663e-06, "loss": 0.3703, "step": 5142 }, { "epoch": 0.36634968123375006, "grad_norm": 3.1041204929351807, "learning_rate": 5.663184993083971e-06, "loss": 0.7841, "step": 5143 }, { "epoch": 0.36642091391530435, "grad_norm": 3.8422999382019043, "learning_rate": 5.65964596407517e-06, "loss": 0.177, "step": 5144 }, { "epoch": 0.36649214659685864, "grad_norm": 3.11572527885437, "learning_rate": 5.6561076047383e-06, "loss": 0.5939, "step": 5145 }, { "epoch": 0.3665633792784129, "grad_norm": 4.410401344299316, "learning_rate": 5.652569915619297e-06, "loss": 0.3825, "step": 5146 }, { "epoch": 0.3666346119599672, "grad_norm": 3.136495351791382, "learning_rate": 5.649032897263986e-06, "loss": 0.69, "step": 5147 }, { "epoch": 0.3667058446415215, "grad_norm": 3.8893179893493652, "learning_rate": 5.645496550218089e-06, "loss": 1.0291, "step": 5148 }, { "epoch": 0.36677707732307585, "grad_norm": 3.2902939319610596, "learning_rate": 5.6419608750272355e-06, "loss": 0.3832, "step": 5149 }, { "epoch": 0.36684831000463014, "grad_norm": 6.382895469665527, "learning_rate": 5.638425872236937e-06, "loss": 0.2127, "step": 5150 }, { "epoch": 0.36691954268618443, "grad_norm": 8.663464546203613, "learning_rate": 5.634891542392608e-06, "loss": 0.5823, "step": 5151 }, { "epoch": 0.3669907753677387, "grad_norm": 1.889776587486267, "learning_rate": 5.631357886039568e-06, "loss": 0.1758, "step": 5152 }, { "epoch": 0.367062008049293, "grad_norm": 5.661950588226318, "learning_rate": 5.627824903723014e-06, "loss": 0.2662, "step": 5153 }, { "epoch": 0.3671332407308473, "grad_norm": 3.0446584224700928, "learning_rate": 5.624292595988052e-06, "loss": 0.6166, "step": 5154 }, { "epoch": 0.3672044734124016, "grad_norm": 10.577920913696289, "learning_rate": 5.620760963379686e-06, "loss": 0.5034, "step": 5155 }, { "epoch": 0.36727570609395593, "grad_norm": 3.3664727210998535, "learning_rate": 5.617230006442802e-06, "loss": 0.5681, "step": 5156 }, { "epoch": 0.3673469387755102, "grad_norm": 2.269740104675293, "learning_rate": 5.6136997257221946e-06, "loss": 0.5145, "step": 5157 }, { "epoch": 0.3674181714570645, "grad_norm": 2.287292718887329, "learning_rate": 5.610170121762553e-06, "loss": 0.3947, "step": 5158 }, { "epoch": 0.3674894041386188, "grad_norm": 3.113588571548462, "learning_rate": 5.60664119510845e-06, "loss": 0.3633, "step": 5159 }, { "epoch": 0.3675606368201731, "grad_norm": 4.172954559326172, "learning_rate": 5.603112946304368e-06, "loss": 0.8138, "step": 5160 }, { "epoch": 0.3676318695017274, "grad_norm": 3.523003578186035, "learning_rate": 5.599585375894684e-06, "loss": 0.5373, "step": 5161 }, { "epoch": 0.36770310218328167, "grad_norm": 2.1641969680786133, "learning_rate": 5.5960584844236565e-06, "loss": 0.4443, "step": 5162 }, { "epoch": 0.36777433486483596, "grad_norm": 4.354353904724121, "learning_rate": 5.592532272435458e-06, "loss": 0.6132, "step": 5163 }, { "epoch": 0.3678455675463903, "grad_norm": 3.5036916732788086, "learning_rate": 5.5890067404741365e-06, "loss": 0.5586, "step": 5164 }, { "epoch": 0.3679168002279446, "grad_norm": 3.957042932510376, "learning_rate": 5.585481889083655e-06, "loss": 0.779, "step": 5165 }, { "epoch": 0.3679880329094989, "grad_norm": 2.1320390701293945, "learning_rate": 5.581957718807854e-06, "loss": 0.1094, "step": 5166 }, { "epoch": 0.3680592655910532, "grad_norm": 2.326507806777954, "learning_rate": 5.578434230190478e-06, "loss": 0.3233, "step": 5167 }, { "epoch": 0.36813049827260746, "grad_norm": 3.305370330810547, "learning_rate": 5.574911423775173e-06, "loss": 0.695, "step": 5168 }, { "epoch": 0.36820173095416175, "grad_norm": 2.4906585216522217, "learning_rate": 5.571389300105461e-06, "loss": 0.2105, "step": 5169 }, { "epoch": 0.36827296363571604, "grad_norm": 2.7665276527404785, "learning_rate": 5.567867859724774e-06, "loss": 0.4158, "step": 5170 }, { "epoch": 0.3683441963172704, "grad_norm": 2.325636625289917, "learning_rate": 5.5643471031764375e-06, "loss": 0.4565, "step": 5171 }, { "epoch": 0.3684154289988247, "grad_norm": 2.178807497024536, "learning_rate": 5.560827031003661e-06, "loss": 0.2765, "step": 5172 }, { "epoch": 0.36848666168037897, "grad_norm": 2.9055440425872803, "learning_rate": 5.557307643749559e-06, "loss": 0.408, "step": 5173 }, { "epoch": 0.36855789436193326, "grad_norm": 3.43481707572937, "learning_rate": 5.553788941957141e-06, "loss": 0.8084, "step": 5174 }, { "epoch": 0.36862912704348755, "grad_norm": 4.279539585113525, "learning_rate": 5.550270926169298e-06, "loss": 0.5645, "step": 5175 }, { "epoch": 0.36870035972504184, "grad_norm": 3.141995668411255, "learning_rate": 5.546753596928831e-06, "loss": 0.4479, "step": 5176 }, { "epoch": 0.3687715924065961, "grad_norm": 3.813063383102417, "learning_rate": 5.543236954778421e-06, "loss": 0.3563, "step": 5177 }, { "epoch": 0.3688428250881504, "grad_norm": 1.5742281675338745, "learning_rate": 5.539721000260658e-06, "loss": 0.1271, "step": 5178 }, { "epoch": 0.36891405776970476, "grad_norm": 3.3790054321289062, "learning_rate": 5.5362057339180075e-06, "loss": 0.5135, "step": 5179 }, { "epoch": 0.36898529045125905, "grad_norm": 3.277750015258789, "learning_rate": 5.532691156292849e-06, "loss": 0.7392, "step": 5180 }, { "epoch": 0.36905652313281334, "grad_norm": 2.600461483001709, "learning_rate": 5.529177267927437e-06, "loss": 0.1593, "step": 5181 }, { "epoch": 0.36912775581436763, "grad_norm": 4.592249870300293, "learning_rate": 5.52566406936393e-06, "loss": 0.5743, "step": 5182 }, { "epoch": 0.3691989884959219, "grad_norm": 2.201206684112549, "learning_rate": 5.522151561144386e-06, "loss": 0.1779, "step": 5183 }, { "epoch": 0.3692702211774762, "grad_norm": 3.2065393924713135, "learning_rate": 5.518639743810738e-06, "loss": 0.4686, "step": 5184 }, { "epoch": 0.3693414538590305, "grad_norm": 2.29915714263916, "learning_rate": 5.51512861790483e-06, "loss": 0.5137, "step": 5185 }, { "epoch": 0.36941268654058484, "grad_norm": 1.337724208831787, "learning_rate": 5.5116181839683944e-06, "loss": 0.0588, "step": 5186 }, { "epoch": 0.36948391922213913, "grad_norm": 1.8576438426971436, "learning_rate": 5.508108442543048e-06, "loss": 0.2019, "step": 5187 }, { "epoch": 0.3695551519036934, "grad_norm": 4.213878154754639, "learning_rate": 5.5045993941703094e-06, "loss": 0.5675, "step": 5188 }, { "epoch": 0.3696263845852477, "grad_norm": 3.672503709793091, "learning_rate": 5.501091039391596e-06, "loss": 0.2699, "step": 5189 }, { "epoch": 0.369697617266802, "grad_norm": 3.1179604530334473, "learning_rate": 5.497583378748201e-06, "loss": 0.3539, "step": 5190 }, { "epoch": 0.3697688499483563, "grad_norm": 3.2660715579986572, "learning_rate": 5.49407641278133e-06, "loss": 0.3782, "step": 5191 }, { "epoch": 0.3698400826299106, "grad_norm": 3.031370162963867, "learning_rate": 5.490570142032061e-06, "loss": 0.6116, "step": 5192 }, { "epoch": 0.3699113153114649, "grad_norm": 2.431097984313965, "learning_rate": 5.487064567041387e-06, "loss": 0.3225, "step": 5193 }, { "epoch": 0.3699825479930192, "grad_norm": 2.753305435180664, "learning_rate": 5.48355968835017e-06, "loss": 0.4141, "step": 5194 }, { "epoch": 0.3700537806745735, "grad_norm": 1.5050686597824097, "learning_rate": 5.480055506499187e-06, "loss": 0.0539, "step": 5195 }, { "epoch": 0.3701250133561278, "grad_norm": 3.644435405731201, "learning_rate": 5.476552022029089e-06, "loss": 0.517, "step": 5196 }, { "epoch": 0.3701962460376821, "grad_norm": 2.366736888885498, "learning_rate": 5.473049235480432e-06, "loss": 0.3169, "step": 5197 }, { "epoch": 0.3702674787192364, "grad_norm": 3.4471700191497803, "learning_rate": 5.4695471473936636e-06, "loss": 0.6878, "step": 5198 }, { "epoch": 0.37033871140079067, "grad_norm": 2.2166430950164795, "learning_rate": 5.466045758309111e-06, "loss": 0.3384, "step": 5199 }, { "epoch": 0.37040994408234496, "grad_norm": 3.269068717956543, "learning_rate": 5.462545068767008e-06, "loss": 0.5518, "step": 5200 }, { "epoch": 0.3704811767638993, "grad_norm": 5.45072603225708, "learning_rate": 5.459045079307473e-06, "loss": 0.5051, "step": 5201 }, { "epoch": 0.3705524094454536, "grad_norm": 1.9272286891937256, "learning_rate": 5.455545790470524e-06, "loss": 0.2735, "step": 5202 }, { "epoch": 0.3706236421270079, "grad_norm": 5.968324184417725, "learning_rate": 5.452047202796058e-06, "loss": 0.0582, "step": 5203 }, { "epoch": 0.37069487480856217, "grad_norm": 3.2584598064422607, "learning_rate": 5.448549316823873e-06, "loss": 0.4121, "step": 5204 }, { "epoch": 0.37076610749011646, "grad_norm": 2.1712706089019775, "learning_rate": 5.44505213309366e-06, "loss": 0.2347, "step": 5205 }, { "epoch": 0.37083734017167075, "grad_norm": 3.907010793685913, "learning_rate": 5.4415556521449944e-06, "loss": 0.4689, "step": 5206 }, { "epoch": 0.37090857285322504, "grad_norm": 2.7959144115448, "learning_rate": 5.4380598745173495e-06, "loss": 0.339, "step": 5207 }, { "epoch": 0.3709798055347794, "grad_norm": 2.808856248855591, "learning_rate": 5.434564800750091e-06, "loss": 0.6465, "step": 5208 }, { "epoch": 0.3710510382163337, "grad_norm": 3.7866051197052, "learning_rate": 5.431070431382461e-06, "loss": 0.5056, "step": 5209 }, { "epoch": 0.37112227089788796, "grad_norm": 7.685214042663574, "learning_rate": 5.427576766953615e-06, "loss": 0.4302, "step": 5210 }, { "epoch": 0.37119350357944225, "grad_norm": 2.2866435050964355, "learning_rate": 5.424083808002591e-06, "loss": 0.4406, "step": 5211 }, { "epoch": 0.37126473626099654, "grad_norm": 3.591377019882202, "learning_rate": 5.420591555068308e-06, "loss": 0.2952, "step": 5212 }, { "epoch": 0.37133596894255083, "grad_norm": 2.9376778602600098, "learning_rate": 5.417100008689588e-06, "loss": 0.1059, "step": 5213 }, { "epoch": 0.3714072016241051, "grad_norm": 3.1877260208129883, "learning_rate": 5.413609169405148e-06, "loss": 0.7944, "step": 5214 }, { "epoch": 0.3714784343056594, "grad_norm": 2.474876880645752, "learning_rate": 5.4101190377535785e-06, "loss": 0.4322, "step": 5215 }, { "epoch": 0.37154966698721376, "grad_norm": 2.5932421684265137, "learning_rate": 5.406629614273373e-06, "loss": 0.1686, "step": 5216 }, { "epoch": 0.37162089966876805, "grad_norm": 3.1602461338043213, "learning_rate": 5.403140899502921e-06, "loss": 0.2718, "step": 5217 }, { "epoch": 0.37169213235032234, "grad_norm": 3.386528491973877, "learning_rate": 5.399652893980486e-06, "loss": 0.6754, "step": 5218 }, { "epoch": 0.3717633650318766, "grad_norm": 2.536076307296753, "learning_rate": 5.396165598244234e-06, "loss": 0.3412, "step": 5219 }, { "epoch": 0.3718345977134309, "grad_norm": 3.767047643661499, "learning_rate": 5.392679012832225e-06, "loss": 0.2884, "step": 5220 }, { "epoch": 0.3719058303949852, "grad_norm": 2.522303342819214, "learning_rate": 5.389193138282393e-06, "loss": 0.7325, "step": 5221 }, { "epoch": 0.3719770630765395, "grad_norm": 2.0414650440216064, "learning_rate": 5.385707975132582e-06, "loss": 0.1823, "step": 5222 }, { "epoch": 0.37204829575809384, "grad_norm": 2.2441892623901367, "learning_rate": 5.382223523920511e-06, "loss": 0.3262, "step": 5223 }, { "epoch": 0.37211952843964813, "grad_norm": 2.9956045150756836, "learning_rate": 5.378739785183794e-06, "loss": 0.8222, "step": 5224 }, { "epoch": 0.3721907611212024, "grad_norm": 2.307767152786255, "learning_rate": 5.375256759459939e-06, "loss": 0.4132, "step": 5225 }, { "epoch": 0.3722619938027567, "grad_norm": 3.3371293544769287, "learning_rate": 5.371774447286343e-06, "loss": 0.65, "step": 5226 }, { "epoch": 0.372333226484311, "grad_norm": 2.5902955532073975, "learning_rate": 5.368292849200285e-06, "loss": 0.319, "step": 5227 }, { "epoch": 0.3724044591658653, "grad_norm": 3.6710944175720215, "learning_rate": 5.364811965738943e-06, "loss": 0.4626, "step": 5228 }, { "epoch": 0.3724756918474196, "grad_norm": 3.5370724201202393, "learning_rate": 5.361331797439384e-06, "loss": 0.7051, "step": 5229 }, { "epoch": 0.37254692452897387, "grad_norm": 2.677950143814087, "learning_rate": 5.357852344838557e-06, "loss": 0.2064, "step": 5230 }, { "epoch": 0.3726181572105282, "grad_norm": 4.344975471496582, "learning_rate": 5.354373608473309e-06, "loss": 0.4786, "step": 5231 }, { "epoch": 0.3726893898920825, "grad_norm": 2.7465405464172363, "learning_rate": 5.350895588880376e-06, "loss": 0.2985, "step": 5232 }, { "epoch": 0.3727606225736368, "grad_norm": 2.91294264793396, "learning_rate": 5.347418286596372e-06, "loss": 0.6861, "step": 5233 }, { "epoch": 0.3728318552551911, "grad_norm": 3.03464412689209, "learning_rate": 5.3439417021578154e-06, "loss": 0.5687, "step": 5234 }, { "epoch": 0.37290308793674537, "grad_norm": 3.3087644577026367, "learning_rate": 5.340465836101109e-06, "loss": 0.4663, "step": 5235 }, { "epoch": 0.37297432061829966, "grad_norm": 1.6568076610565186, "learning_rate": 5.336990688962537e-06, "loss": 0.3408, "step": 5236 }, { "epoch": 0.37304555329985395, "grad_norm": 3.784289836883545, "learning_rate": 5.333516261278285e-06, "loss": 0.6043, "step": 5237 }, { "epoch": 0.3731167859814083, "grad_norm": 6.312084197998047, "learning_rate": 5.330042553584416e-06, "loss": 0.7335, "step": 5238 }, { "epoch": 0.3731880186629626, "grad_norm": 2.1365065574645996, "learning_rate": 5.3265695664168926e-06, "loss": 0.3152, "step": 5239 }, { "epoch": 0.3732592513445169, "grad_norm": 4.986188888549805, "learning_rate": 5.323097300311553e-06, "loss": 0.1529, "step": 5240 }, { "epoch": 0.37333048402607116, "grad_norm": 3.699708938598633, "learning_rate": 5.3196257558041386e-06, "loss": 0.4927, "step": 5241 }, { "epoch": 0.37340171670762545, "grad_norm": 6.765391826629639, "learning_rate": 5.316154933430276e-06, "loss": 0.7302, "step": 5242 }, { "epoch": 0.37347294938917974, "grad_norm": 2.0511136054992676, "learning_rate": 5.312684833725468e-06, "loss": 0.268, "step": 5243 }, { "epoch": 0.37354418207073403, "grad_norm": 3.9395008087158203, "learning_rate": 5.309215457225121e-06, "loss": 0.5104, "step": 5244 }, { "epoch": 0.3736154147522884, "grad_norm": 3.771597385406494, "learning_rate": 5.305746804464526e-06, "loss": 0.4323, "step": 5245 }, { "epoch": 0.37368664743384267, "grad_norm": 3.018202066421509, "learning_rate": 5.302278875978855e-06, "loss": 0.3269, "step": 5246 }, { "epoch": 0.37375788011539696, "grad_norm": 4.065255165100098, "learning_rate": 5.298811672303174e-06, "loss": 0.4817, "step": 5247 }, { "epoch": 0.37382911279695125, "grad_norm": 3.679704189300537, "learning_rate": 5.295345193972445e-06, "loss": 0.9309, "step": 5248 }, { "epoch": 0.37390034547850554, "grad_norm": 2.506425380706787, "learning_rate": 5.291879441521499e-06, "loss": 0.2729, "step": 5249 }, { "epoch": 0.3739715781600598, "grad_norm": 1.869818925857544, "learning_rate": 5.288414415485072e-06, "loss": 0.3951, "step": 5250 }, { "epoch": 0.3740428108416141, "grad_norm": 1.9025349617004395, "learning_rate": 5.2849501163977846e-06, "loss": 0.1546, "step": 5251 }, { "epoch": 0.3741140435231684, "grad_norm": 3.2186548709869385, "learning_rate": 5.281486544794139e-06, "loss": 0.2747, "step": 5252 }, { "epoch": 0.37418527620472275, "grad_norm": 2.7010586261749268, "learning_rate": 5.278023701208523e-06, "loss": 0.7668, "step": 5253 }, { "epoch": 0.37425650888627704, "grad_norm": 5.536628246307373, "learning_rate": 5.274561586175226e-06, "loss": 0.3763, "step": 5254 }, { "epoch": 0.37432774156783133, "grad_norm": 1.5852733850479126, "learning_rate": 5.271100200228412e-06, "loss": 0.2726, "step": 5255 }, { "epoch": 0.3743989742493856, "grad_norm": 2.367849588394165, "learning_rate": 5.2676395439021385e-06, "loss": 0.5295, "step": 5256 }, { "epoch": 0.3744702069309399, "grad_norm": 2.6395163536071777, "learning_rate": 5.264179617730353e-06, "loss": 0.3424, "step": 5257 }, { "epoch": 0.3745414396124942, "grad_norm": 3.978299140930176, "learning_rate": 5.260720422246879e-06, "loss": 0.7637, "step": 5258 }, { "epoch": 0.3746126722940485, "grad_norm": 2.2724416255950928, "learning_rate": 5.257261957985438e-06, "loss": 0.3259, "step": 5259 }, { "epoch": 0.37468390497560283, "grad_norm": 4.685385704040527, "learning_rate": 5.253804225479642e-06, "loss": 0.3404, "step": 5260 }, { "epoch": 0.3747551376571571, "grad_norm": 1.692594289779663, "learning_rate": 5.250347225262972e-06, "loss": 0.1737, "step": 5261 }, { "epoch": 0.3748263703387114, "grad_norm": 10.34023666381836, "learning_rate": 5.246890957868813e-06, "loss": 0.6727, "step": 5262 }, { "epoch": 0.3748976030202657, "grad_norm": 2.7189598083496094, "learning_rate": 5.243435423830436e-06, "loss": 0.5281, "step": 5263 }, { "epoch": 0.37496883570182, "grad_norm": 5.697129249572754, "learning_rate": 5.239980623680987e-06, "loss": 0.6634, "step": 5264 }, { "epoch": 0.3750400683833743, "grad_norm": 3.403594732284546, "learning_rate": 5.236526557953508e-06, "loss": 0.8327, "step": 5265 }, { "epoch": 0.37511130106492857, "grad_norm": 2.308908700942993, "learning_rate": 5.233073227180932e-06, "loss": 0.3954, "step": 5266 }, { "epoch": 0.37518253374648286, "grad_norm": 4.196118354797363, "learning_rate": 5.229620631896065e-06, "loss": 0.6282, "step": 5267 }, { "epoch": 0.3752537664280372, "grad_norm": 3.0745368003845215, "learning_rate": 5.226168772631606e-06, "loss": 0.42, "step": 5268 }, { "epoch": 0.3753249991095915, "grad_norm": 3.7624170780181885, "learning_rate": 5.22271764992015e-06, "loss": 0.3715, "step": 5269 }, { "epoch": 0.3753962317911458, "grad_norm": 2.748811960220337, "learning_rate": 5.219267264294159e-06, "loss": 0.7761, "step": 5270 }, { "epoch": 0.3754674644727001, "grad_norm": 3.386136054992676, "learning_rate": 5.215817616285996e-06, "loss": 0.4193, "step": 5271 }, { "epoch": 0.37553869715425436, "grad_norm": 3.583883285522461, "learning_rate": 5.212368706427913e-06, "loss": 0.9263, "step": 5272 }, { "epoch": 0.37560992983580865, "grad_norm": 3.2337419986724854, "learning_rate": 5.20892053525203e-06, "loss": 0.2837, "step": 5273 }, { "epoch": 0.37568116251736294, "grad_norm": 2.3270890712738037, "learning_rate": 5.2054731032903704e-06, "loss": 0.3863, "step": 5274 }, { "epoch": 0.3757523951989173, "grad_norm": 3.1922647953033447, "learning_rate": 5.202026411074841e-06, "loss": 0.9222, "step": 5275 }, { "epoch": 0.3758236278804716, "grad_norm": 1.7449414730072021, "learning_rate": 5.198580459137224e-06, "loss": 0.2712, "step": 5276 }, { "epoch": 0.37589486056202587, "grad_norm": 3.465014696121216, "learning_rate": 5.195135248009196e-06, "loss": 0.7284, "step": 5277 }, { "epoch": 0.37596609324358016, "grad_norm": 3.145000457763672, "learning_rate": 5.191690778222318e-06, "loss": 0.4971, "step": 5278 }, { "epoch": 0.37603732592513445, "grad_norm": 4.97108793258667, "learning_rate": 5.188247050308042e-06, "loss": 0.6996, "step": 5279 }, { "epoch": 0.37610855860668874, "grad_norm": 3.9546384811401367, "learning_rate": 5.18480406479769e-06, "loss": 0.8112, "step": 5280 }, { "epoch": 0.376179791288243, "grad_norm": 3.635969877243042, "learning_rate": 5.181361822222488e-06, "loss": 0.3639, "step": 5281 }, { "epoch": 0.3762510239697973, "grad_norm": 3.030975580215454, "learning_rate": 5.177920323113531e-06, "loss": 0.5625, "step": 5282 }, { "epoch": 0.37632225665135166, "grad_norm": 3.6028575897216797, "learning_rate": 5.174479568001813e-06, "loss": 0.52, "step": 5283 }, { "epoch": 0.37639348933290595, "grad_norm": 3.9165384769439697, "learning_rate": 5.1710395574182026e-06, "loss": 0.5861, "step": 5284 }, { "epoch": 0.37646472201446024, "grad_norm": 3.040771722793579, "learning_rate": 5.167600291893462e-06, "loss": 0.5466, "step": 5285 }, { "epoch": 0.37653595469601453, "grad_norm": 2.9416229724884033, "learning_rate": 5.16416177195823e-06, "loss": 0.5185, "step": 5286 }, { "epoch": 0.3766071873775688, "grad_norm": 1.8767646551132202, "learning_rate": 5.1607239981430355e-06, "loss": 0.1594, "step": 5287 }, { "epoch": 0.3766784200591231, "grad_norm": 2.115520477294922, "learning_rate": 5.1572869709782965e-06, "loss": 0.4016, "step": 5288 }, { "epoch": 0.3767496527406774, "grad_norm": 4.32617712020874, "learning_rate": 5.153850690994306e-06, "loss": 0.5453, "step": 5289 }, { "epoch": 0.37682088542223174, "grad_norm": 2.9918909072875977, "learning_rate": 5.150415158721247e-06, "loss": 0.5264, "step": 5290 }, { "epoch": 0.37689211810378603, "grad_norm": 2.884268283843994, "learning_rate": 5.146980374689192e-06, "loss": 0.4601, "step": 5291 }, { "epoch": 0.3769633507853403, "grad_norm": 3.2273361682891846, "learning_rate": 5.143546339428085e-06, "loss": 0.9025, "step": 5292 }, { "epoch": 0.3770345834668946, "grad_norm": 2.852571487426758, "learning_rate": 5.140113053467765e-06, "loss": 0.6162, "step": 5293 }, { "epoch": 0.3771058161484489, "grad_norm": 3.5077083110809326, "learning_rate": 5.1366805173379575e-06, "loss": 0.7244, "step": 5294 }, { "epoch": 0.3771770488300032, "grad_norm": 3.26444149017334, "learning_rate": 5.133248731568261e-06, "loss": 0.6581, "step": 5295 }, { "epoch": 0.3772482815115575, "grad_norm": 2.4205944538116455, "learning_rate": 5.1298176966881705e-06, "loss": 0.5068, "step": 5296 }, { "epoch": 0.3773195141931118, "grad_norm": 3.180215835571289, "learning_rate": 5.126387413227053e-06, "loss": 0.4755, "step": 5297 }, { "epoch": 0.3773907468746661, "grad_norm": 5.2896599769592285, "learning_rate": 5.122957881714172e-06, "loss": 0.5426, "step": 5298 }, { "epoch": 0.3774619795562204, "grad_norm": 2.517531394958496, "learning_rate": 5.119529102678665e-06, "loss": 0.5878, "step": 5299 }, { "epoch": 0.3775332122377747, "grad_norm": 3.075084924697876, "learning_rate": 5.116101076649559e-06, "loss": 0.5218, "step": 5300 }, { "epoch": 0.377604444919329, "grad_norm": 2.4947307109832764, "learning_rate": 5.112673804155759e-06, "loss": 0.5533, "step": 5301 }, { "epoch": 0.3776756776008833, "grad_norm": 3.974768877029419, "learning_rate": 5.109247285726062e-06, "loss": 0.708, "step": 5302 }, { "epoch": 0.37774691028243756, "grad_norm": 3.3327901363372803, "learning_rate": 5.105821521889147e-06, "loss": 0.5607, "step": 5303 }, { "epoch": 0.37781814296399185, "grad_norm": 2.408461093902588, "learning_rate": 5.102396513173569e-06, "loss": 0.4362, "step": 5304 }, { "epoch": 0.3778893756455462, "grad_norm": 3.0205514430999756, "learning_rate": 5.098972260107771e-06, "loss": 0.415, "step": 5305 }, { "epoch": 0.3779606083271005, "grad_norm": 2.2652175426483154, "learning_rate": 5.0955487632200885e-06, "loss": 0.7525, "step": 5306 }, { "epoch": 0.3780318410086548, "grad_norm": 2.579035758972168, "learning_rate": 5.0921260230387195e-06, "loss": 0.3262, "step": 5307 }, { "epoch": 0.37810307369020907, "grad_norm": 4.143674373626709, "learning_rate": 5.088704040091765e-06, "loss": 0.5135, "step": 5308 }, { "epoch": 0.37817430637176336, "grad_norm": 4.871361255645752, "learning_rate": 5.085282814907205e-06, "loss": 0.3105, "step": 5309 }, { "epoch": 0.37824553905331765, "grad_norm": 2.3559675216674805, "learning_rate": 5.081862348012892e-06, "loss": 0.4371, "step": 5310 }, { "epoch": 0.37831677173487194, "grad_norm": 4.551689624786377, "learning_rate": 5.0784426399365725e-06, "loss": 0.5603, "step": 5311 }, { "epoch": 0.3783880044164263, "grad_norm": 3.184277057647705, "learning_rate": 5.075023691205869e-06, "loss": 0.433, "step": 5312 }, { "epoch": 0.37845923709798057, "grad_norm": 1.9837640523910522, "learning_rate": 5.071605502348297e-06, "loss": 0.2795, "step": 5313 }, { "epoch": 0.37853046977953486, "grad_norm": 2.5657989978790283, "learning_rate": 5.068188073891238e-06, "loss": 0.5214, "step": 5314 }, { "epoch": 0.37860170246108915, "grad_norm": 1.8417726755142212, "learning_rate": 5.064771406361973e-06, "loss": 0.2699, "step": 5315 }, { "epoch": 0.37867293514264344, "grad_norm": 2.7844743728637695, "learning_rate": 5.06135550028766e-06, "loss": 0.6991, "step": 5316 }, { "epoch": 0.37874416782419773, "grad_norm": 2.8952975273132324, "learning_rate": 5.057940356195332e-06, "loss": 0.601, "step": 5317 }, { "epoch": 0.378815400505752, "grad_norm": 2.5124740600585938, "learning_rate": 5.054525974611913e-06, "loss": 0.4911, "step": 5318 }, { "epoch": 0.3788866331873063, "grad_norm": 2.91882061958313, "learning_rate": 5.051112356064212e-06, "loss": 0.4462, "step": 5319 }, { "epoch": 0.37895786586886065, "grad_norm": 2.247272253036499, "learning_rate": 5.047699501078905e-06, "loss": 0.4297, "step": 5320 }, { "epoch": 0.37902909855041494, "grad_norm": 2.373887538909912, "learning_rate": 5.044287410182568e-06, "loss": 0.538, "step": 5321 }, { "epoch": 0.37910033123196923, "grad_norm": 2.4624013900756836, "learning_rate": 5.040876083901654e-06, "loss": 0.2018, "step": 5322 }, { "epoch": 0.3791715639135235, "grad_norm": 2.4546000957489014, "learning_rate": 5.037465522762486e-06, "loss": 0.171, "step": 5323 }, { "epoch": 0.3792427965950778, "grad_norm": 2.6258351802825928, "learning_rate": 5.034055727291283e-06, "loss": 0.5857, "step": 5324 }, { "epoch": 0.3793140292766321, "grad_norm": 3.4285097122192383, "learning_rate": 5.0306466980141475e-06, "loss": 0.6069, "step": 5325 }, { "epoch": 0.3793852619581864, "grad_norm": 3.2171807289123535, "learning_rate": 5.027238435457047e-06, "loss": 0.6955, "step": 5326 }, { "epoch": 0.37945649463974074, "grad_norm": 2.9188919067382812, "learning_rate": 5.023830940145851e-06, "loss": 0.5608, "step": 5327 }, { "epoch": 0.379527727321295, "grad_norm": 4.411522388458252, "learning_rate": 5.0204242126062964e-06, "loss": 0.3404, "step": 5328 }, { "epoch": 0.3795989600028493, "grad_norm": 1.6638033390045166, "learning_rate": 5.017018253364001e-06, "loss": 0.2311, "step": 5329 }, { "epoch": 0.3796701926844036, "grad_norm": 3.142287015914917, "learning_rate": 5.0136130629444755e-06, "loss": 0.5407, "step": 5330 }, { "epoch": 0.3797414253659579, "grad_norm": 4.9604902267456055, "learning_rate": 5.010208641873109e-06, "loss": 0.8272, "step": 5331 }, { "epoch": 0.3798126580475122, "grad_norm": 3.6930861473083496, "learning_rate": 5.006804990675158e-06, "loss": 0.6353, "step": 5332 }, { "epoch": 0.3798838907290665, "grad_norm": 2.7751119136810303, "learning_rate": 5.003402109875779e-06, "loss": 0.5352, "step": 5333 }, { "epoch": 0.37995512341062077, "grad_norm": 3.4887232780456543, "learning_rate": 5.000000000000003e-06, "loss": 0.3533, "step": 5334 }, { "epoch": 0.3800263560921751, "grad_norm": 2.6418051719665527, "learning_rate": 4.996598661572732e-06, "loss": 0.3587, "step": 5335 }, { "epoch": 0.3800975887737294, "grad_norm": 1.5619817972183228, "learning_rate": 4.993198095118763e-06, "loss": 0.2268, "step": 5336 }, { "epoch": 0.3801688214552837, "grad_norm": 3.086181402206421, "learning_rate": 4.989798301162772e-06, "loss": 0.4321, "step": 5337 }, { "epoch": 0.380240054136838, "grad_norm": 6.343308448791504, "learning_rate": 4.986399280229304e-06, "loss": 0.6707, "step": 5338 }, { "epoch": 0.38031128681839227, "grad_norm": 3.152008533477783, "learning_rate": 4.983001032842797e-06, "loss": 0.642, "step": 5339 }, { "epoch": 0.38038251949994656, "grad_norm": 3.8764894008636475, "learning_rate": 4.979603559527569e-06, "loss": 0.5858, "step": 5340 }, { "epoch": 0.38045375218150085, "grad_norm": 2.9446773529052734, "learning_rate": 4.976206860807808e-06, "loss": 0.369, "step": 5341 }, { "epoch": 0.3805249848630552, "grad_norm": 5.104753017425537, "learning_rate": 4.972810937207599e-06, "loss": 0.51, "step": 5342 }, { "epoch": 0.3805962175446095, "grad_norm": 7.593939304351807, "learning_rate": 4.96941578925089e-06, "loss": 0.6832, "step": 5343 }, { "epoch": 0.3806674502261638, "grad_norm": 3.3738250732421875, "learning_rate": 4.9660214174615165e-06, "loss": 0.6555, "step": 5344 }, { "epoch": 0.38073868290771806, "grad_norm": 2.160325765609741, "learning_rate": 4.9626278223631985e-06, "loss": 0.3457, "step": 5345 }, { "epoch": 0.38080991558927235, "grad_norm": 2.178147792816162, "learning_rate": 4.959235004479537e-06, "loss": 0.2082, "step": 5346 }, { "epoch": 0.38088114827082664, "grad_norm": 3.705958366394043, "learning_rate": 4.955842964334e-06, "loss": 0.7573, "step": 5347 }, { "epoch": 0.38095238095238093, "grad_norm": 3.4374706745147705, "learning_rate": 4.952451702449949e-06, "loss": 0.5438, "step": 5348 }, { "epoch": 0.3810236136339353, "grad_norm": 4.831919193267822, "learning_rate": 4.949061219350624e-06, "loss": 0.4935, "step": 5349 }, { "epoch": 0.38109484631548957, "grad_norm": 3.2510929107666016, "learning_rate": 4.945671515559135e-06, "loss": 0.6739, "step": 5350 }, { "epoch": 0.38116607899704386, "grad_norm": 0.9691728949546814, "learning_rate": 4.942282591598481e-06, "loss": 0.0811, "step": 5351 }, { "epoch": 0.38123731167859815, "grad_norm": 4.118435382843018, "learning_rate": 4.938894447991544e-06, "loss": 0.5358, "step": 5352 }, { "epoch": 0.38130854436015243, "grad_norm": 4.566526889801025, "learning_rate": 4.935507085261069e-06, "loss": 1.175, "step": 5353 }, { "epoch": 0.3813797770417067, "grad_norm": 4.992936134338379, "learning_rate": 4.932120503929696e-06, "loss": 0.761, "step": 5354 }, { "epoch": 0.381451009723261, "grad_norm": 3.2740235328674316, "learning_rate": 4.928734704519945e-06, "loss": 0.8418, "step": 5355 }, { "epoch": 0.3815222424048153, "grad_norm": 3.4658758640289307, "learning_rate": 4.925349687554201e-06, "loss": 0.6232, "step": 5356 }, { "epoch": 0.38159347508636965, "grad_norm": 4.0251946449279785, "learning_rate": 4.921965453554747e-06, "loss": 0.8338, "step": 5357 }, { "epoch": 0.38166470776792394, "grad_norm": 3.5260560512542725, "learning_rate": 4.918582003043724e-06, "loss": 0.36, "step": 5358 }, { "epoch": 0.38173594044947823, "grad_norm": 2.7100532054901123, "learning_rate": 4.9151993365431735e-06, "loss": 0.2598, "step": 5359 }, { "epoch": 0.3818071731310325, "grad_norm": 3.2447924613952637, "learning_rate": 4.911817454575e-06, "loss": 0.385, "step": 5360 }, { "epoch": 0.3818784058125868, "grad_norm": 2.8225882053375244, "learning_rate": 4.908436357660993e-06, "loss": 0.6512, "step": 5361 }, { "epoch": 0.3819496384941411, "grad_norm": 5.253851890563965, "learning_rate": 4.905056046322828e-06, "loss": 0.4348, "step": 5362 }, { "epoch": 0.3820208711756954, "grad_norm": 2.639955759048462, "learning_rate": 4.901676521082043e-06, "loss": 0.3102, "step": 5363 }, { "epoch": 0.38209210385724973, "grad_norm": 3.3638997077941895, "learning_rate": 4.8982977824600685e-06, "loss": 0.4492, "step": 5364 }, { "epoch": 0.382163336538804, "grad_norm": 3.124363899230957, "learning_rate": 4.894919830978212e-06, "loss": 0.3435, "step": 5365 }, { "epoch": 0.3822345692203583, "grad_norm": 2.9621386528015137, "learning_rate": 4.89154266715765e-06, "loss": 0.6987, "step": 5366 }, { "epoch": 0.3823058019019126, "grad_norm": 1.6036373376846313, "learning_rate": 4.888166291519449e-06, "loss": 0.2214, "step": 5367 }, { "epoch": 0.3823770345834669, "grad_norm": 2.5020911693573, "learning_rate": 4.884790704584549e-06, "loss": 0.2542, "step": 5368 }, { "epoch": 0.3824482672650212, "grad_norm": 3.5128824710845947, "learning_rate": 4.881415906873763e-06, "loss": 0.4099, "step": 5369 }, { "epoch": 0.38251949994657547, "grad_norm": 3.64870023727417, "learning_rate": 4.878041898907793e-06, "loss": 0.3268, "step": 5370 }, { "epoch": 0.38259073262812976, "grad_norm": 17.458532333374023, "learning_rate": 4.874668681207215e-06, "loss": 0.1792, "step": 5371 }, { "epoch": 0.3826619653096841, "grad_norm": 1.768079161643982, "learning_rate": 4.871296254292479e-06, "loss": 0.1837, "step": 5372 }, { "epoch": 0.3827331979912384, "grad_norm": 2.7318339347839355, "learning_rate": 4.867924618683911e-06, "loss": 0.6093, "step": 5373 }, { "epoch": 0.3828044306727927, "grad_norm": 1.6154614686965942, "learning_rate": 4.8645537749017295e-06, "loss": 0.1497, "step": 5374 }, { "epoch": 0.382875663354347, "grad_norm": 2.399480104446411, "learning_rate": 4.861183723466011e-06, "loss": 0.3912, "step": 5375 }, { "epoch": 0.38294689603590126, "grad_norm": 2.2017369270324707, "learning_rate": 4.857814464896724e-06, "loss": 0.3213, "step": 5376 }, { "epoch": 0.38301812871745555, "grad_norm": 10.204183578491211, "learning_rate": 4.854445999713715e-06, "loss": 0.252, "step": 5377 }, { "epoch": 0.38308936139900984, "grad_norm": 4.357161998748779, "learning_rate": 4.851078328436696e-06, "loss": 0.8561, "step": 5378 }, { "epoch": 0.3831605940805642, "grad_norm": 3.9144515991210938, "learning_rate": 4.847711451585266e-06, "loss": 0.4583, "step": 5379 }, { "epoch": 0.3832318267621185, "grad_norm": 5.315963268280029, "learning_rate": 4.8443453696789055e-06, "loss": 0.7948, "step": 5380 }, { "epoch": 0.38330305944367277, "grad_norm": 2.6430373191833496, "learning_rate": 4.840980083236958e-06, "loss": 0.2688, "step": 5381 }, { "epoch": 0.38337429212522706, "grad_norm": 2.867421865463257, "learning_rate": 4.837615592778655e-06, "loss": 0.68, "step": 5382 }, { "epoch": 0.38344552480678135, "grad_norm": 3.2502732276916504, "learning_rate": 4.834251898823108e-06, "loss": 0.3054, "step": 5383 }, { "epoch": 0.38351675748833564, "grad_norm": 1.8342745304107666, "learning_rate": 4.8308890018892914e-06, "loss": 0.1748, "step": 5384 }, { "epoch": 0.3835879901698899, "grad_norm": 2.5994575023651123, "learning_rate": 4.827526902496073e-06, "loss": 0.3646, "step": 5385 }, { "epoch": 0.3836592228514442, "grad_norm": 4.606513023376465, "learning_rate": 4.8241656011621886e-06, "loss": 0.1794, "step": 5386 }, { "epoch": 0.38373045553299856, "grad_norm": 2.330953359603882, "learning_rate": 4.8208050984062515e-06, "loss": 0.4305, "step": 5387 }, { "epoch": 0.38380168821455285, "grad_norm": 5.77682638168335, "learning_rate": 4.817445394746749e-06, "loss": 1.0778, "step": 5388 }, { "epoch": 0.38387292089610714, "grad_norm": 3.005635976791382, "learning_rate": 4.814086490702056e-06, "loss": 0.52, "step": 5389 }, { "epoch": 0.38394415357766143, "grad_norm": 1.9073559045791626, "learning_rate": 4.810728386790409e-06, "loss": 0.1333, "step": 5390 }, { "epoch": 0.3840153862592157, "grad_norm": 2.8290255069732666, "learning_rate": 4.807371083529933e-06, "loss": 0.4866, "step": 5391 }, { "epoch": 0.38408661894077, "grad_norm": 2.3270747661590576, "learning_rate": 4.8040145814386245e-06, "loss": 0.2577, "step": 5392 }, { "epoch": 0.3841578516223243, "grad_norm": 3.344780921936035, "learning_rate": 4.800658881034362e-06, "loss": 0.3089, "step": 5393 }, { "epoch": 0.38422908430387864, "grad_norm": 4.0259904861450195, "learning_rate": 4.797303982834887e-06, "loss": 0.4092, "step": 5394 }, { "epoch": 0.38430031698543293, "grad_norm": 3.027527093887329, "learning_rate": 4.79394988735783e-06, "loss": 0.2674, "step": 5395 }, { "epoch": 0.3843715496669872, "grad_norm": 2.659032106399536, "learning_rate": 4.790596595120699e-06, "loss": 0.5004, "step": 5396 }, { "epoch": 0.3844427823485415, "grad_norm": 2.2824671268463135, "learning_rate": 4.787244106640861e-06, "loss": 0.5776, "step": 5397 }, { "epoch": 0.3845140150300958, "grad_norm": 2.7044055461883545, "learning_rate": 4.783892422435577e-06, "loss": 0.345, "step": 5398 }, { "epoch": 0.3845852477116501, "grad_norm": 4.290084362030029, "learning_rate": 4.780541543021981e-06, "loss": 0.6442, "step": 5399 }, { "epoch": 0.3846564803932044, "grad_norm": 2.6613588333129883, "learning_rate": 4.7771914689170704e-06, "loss": 0.5965, "step": 5400 }, { "epoch": 0.3847277130747587, "grad_norm": 2.0299010276794434, "learning_rate": 4.773842200637736e-06, "loss": 0.1355, "step": 5401 }, { "epoch": 0.384798945756313, "grad_norm": 3.160139560699463, "learning_rate": 4.770493738700727e-06, "loss": 1.0432, "step": 5402 }, { "epoch": 0.3848701784378673, "grad_norm": 3.267690658569336, "learning_rate": 4.7671460836226845e-06, "loss": 0.2986, "step": 5403 }, { "epoch": 0.3849414111194216, "grad_norm": 2.637984275817871, "learning_rate": 4.763799235920109e-06, "loss": 0.8078, "step": 5404 }, { "epoch": 0.3850126438009759, "grad_norm": 2.713236093521118, "learning_rate": 4.760453196109394e-06, "loss": 0.259, "step": 5405 }, { "epoch": 0.3850838764825302, "grad_norm": 3.1431984901428223, "learning_rate": 4.757107964706788e-06, "loss": 0.6037, "step": 5406 }, { "epoch": 0.38515510916408446, "grad_norm": 3.5138754844665527, "learning_rate": 4.753763542228433e-06, "loss": 0.7508, "step": 5407 }, { "epoch": 0.38522634184563875, "grad_norm": 3.132248640060425, "learning_rate": 4.750419929190342e-06, "loss": 0.5891, "step": 5408 }, { "epoch": 0.3852975745271931, "grad_norm": 3.5287442207336426, "learning_rate": 4.7470771261083915e-06, "loss": 0.4236, "step": 5409 }, { "epoch": 0.3853688072087474, "grad_norm": 2.518059015274048, "learning_rate": 4.743735133498346e-06, "loss": 0.3604, "step": 5410 }, { "epoch": 0.3854400398903017, "grad_norm": 4.807882308959961, "learning_rate": 4.740393951875843e-06, "loss": 0.7433, "step": 5411 }, { "epoch": 0.38551127257185597, "grad_norm": 2.8606789112091064, "learning_rate": 4.737053581756387e-06, "loss": 0.5343, "step": 5412 }, { "epoch": 0.38558250525341026, "grad_norm": 3.5589258670806885, "learning_rate": 4.733714023655366e-06, "loss": 0.4531, "step": 5413 }, { "epoch": 0.38565373793496455, "grad_norm": 4.331246376037598, "learning_rate": 4.730375278088042e-06, "loss": 0.7903, "step": 5414 }, { "epoch": 0.38572497061651884, "grad_norm": 2.233358383178711, "learning_rate": 4.727037345569542e-06, "loss": 0.32, "step": 5415 }, { "epoch": 0.3857962032980732, "grad_norm": 4.009251594543457, "learning_rate": 4.723700226614882e-06, "loss": 0.7624, "step": 5416 }, { "epoch": 0.38586743597962747, "grad_norm": 3.0656464099884033, "learning_rate": 4.7203639217389385e-06, "loss": 0.7133, "step": 5417 }, { "epoch": 0.38593866866118176, "grad_norm": 3.8103580474853516, "learning_rate": 4.717028431456475e-06, "loss": 0.4179, "step": 5418 }, { "epoch": 0.38600990134273605, "grad_norm": 4.634645462036133, "learning_rate": 4.713693756282118e-06, "loss": 0.9891, "step": 5419 }, { "epoch": 0.38608113402429034, "grad_norm": 2.9469528198242188, "learning_rate": 4.710359896730379e-06, "loss": 0.5288, "step": 5420 }, { "epoch": 0.38615236670584463, "grad_norm": 2.5168957710266113, "learning_rate": 4.7070268533156315e-06, "loss": 0.1005, "step": 5421 }, { "epoch": 0.3862235993873989, "grad_norm": 5.091553211212158, "learning_rate": 4.7036946265521335e-06, "loss": 0.3574, "step": 5422 }, { "epoch": 0.3862948320689532, "grad_norm": 2.7958130836486816, "learning_rate": 4.700363216954017e-06, "loss": 0.5072, "step": 5423 }, { "epoch": 0.38636606475050755, "grad_norm": 4.513320446014404, "learning_rate": 4.697032625035277e-06, "loss": 0.6518, "step": 5424 }, { "epoch": 0.38643729743206184, "grad_norm": 3.9788544178009033, "learning_rate": 4.693702851309793e-06, "loss": 0.5489, "step": 5425 }, { "epoch": 0.38650853011361613, "grad_norm": 6.1895527839660645, "learning_rate": 4.690373896291318e-06, "loss": 0.6358, "step": 5426 }, { "epoch": 0.3865797627951704, "grad_norm": 4.21924352645874, "learning_rate": 4.687045760493468e-06, "loss": 0.5407, "step": 5427 }, { "epoch": 0.3866509954767247, "grad_norm": 2.345857620239258, "learning_rate": 4.683718444429746e-06, "loss": 0.2184, "step": 5428 }, { "epoch": 0.386722228158279, "grad_norm": 2.6752989292144775, "learning_rate": 4.680391948613523e-06, "loss": 0.5875, "step": 5429 }, { "epoch": 0.3867934608398333, "grad_norm": 1.9354078769683838, "learning_rate": 4.677066273558038e-06, "loss": 0.4303, "step": 5430 }, { "epoch": 0.38686469352138764, "grad_norm": 3.815338134765625, "learning_rate": 4.673741419776414e-06, "loss": 0.6423, "step": 5431 }, { "epoch": 0.3869359262029419, "grad_norm": 2.8641517162323, "learning_rate": 4.670417387781638e-06, "loss": 0.4952, "step": 5432 }, { "epoch": 0.3870071588844962, "grad_norm": 4.008536338806152, "learning_rate": 4.6670941780865765e-06, "loss": 0.507, "step": 5433 }, { "epoch": 0.3870783915660505, "grad_norm": 3.254265308380127, "learning_rate": 4.663771791203961e-06, "loss": 0.3633, "step": 5434 }, { "epoch": 0.3871496242476048, "grad_norm": 4.915028095245361, "learning_rate": 4.660450227646407e-06, "loss": 0.8511, "step": 5435 }, { "epoch": 0.3872208569291591, "grad_norm": 2.1317648887634277, "learning_rate": 4.657129487926398e-06, "loss": 0.2629, "step": 5436 }, { "epoch": 0.3872920896107134, "grad_norm": 4.369820594787598, "learning_rate": 4.653809572556286e-06, "loss": 0.5346, "step": 5437 }, { "epoch": 0.3873633222922677, "grad_norm": 1.8494757413864136, "learning_rate": 4.650490482048302e-06, "loss": 0.2246, "step": 5438 }, { "epoch": 0.387434554973822, "grad_norm": 2.797700881958008, "learning_rate": 4.647172216914551e-06, "loss": 0.7113, "step": 5439 }, { "epoch": 0.3875057876553763, "grad_norm": 3.5628528594970703, "learning_rate": 4.643854777666998e-06, "loss": 0.6173, "step": 5440 }, { "epoch": 0.3875770203369306, "grad_norm": 7.4370036125183105, "learning_rate": 4.6405381648174976e-06, "loss": 0.4624, "step": 5441 }, { "epoch": 0.3876482530184849, "grad_norm": 7.65494966506958, "learning_rate": 4.637222378877768e-06, "loss": 0.3795, "step": 5442 }, { "epoch": 0.38771948570003917, "grad_norm": 3.912517547607422, "learning_rate": 4.633907420359397e-06, "loss": 0.7722, "step": 5443 }, { "epoch": 0.38779071838159346, "grad_norm": 2.1134748458862305, "learning_rate": 4.630593289773852e-06, "loss": 0.3042, "step": 5444 }, { "epoch": 0.38786195106314775, "grad_norm": 3.722994804382324, "learning_rate": 4.62727998763247e-06, "loss": 0.9333, "step": 5445 }, { "epoch": 0.3879331837447021, "grad_norm": 2.0258405208587646, "learning_rate": 4.623967514446455e-06, "loss": 0.2034, "step": 5446 }, { "epoch": 0.3880044164262564, "grad_norm": 2.417832851409912, "learning_rate": 4.620655870726893e-06, "loss": 0.2725, "step": 5447 }, { "epoch": 0.38807564910781067, "grad_norm": 4.126152515411377, "learning_rate": 4.617345056984734e-06, "loss": 0.613, "step": 5448 }, { "epoch": 0.38814688178936496, "grad_norm": 3.6341519355773926, "learning_rate": 4.614035073730798e-06, "loss": 0.4941, "step": 5449 }, { "epoch": 0.38821811447091925, "grad_norm": 1.897237777709961, "learning_rate": 4.610725921475786e-06, "loss": 0.3891, "step": 5450 }, { "epoch": 0.38828934715247354, "grad_norm": 2.7553465366363525, "learning_rate": 4.60741760073027e-06, "loss": 0.3485, "step": 5451 }, { "epoch": 0.38836057983402783, "grad_norm": 2.9252262115478516, "learning_rate": 4.60411011200468e-06, "loss": 0.632, "step": 5452 }, { "epoch": 0.3884318125155822, "grad_norm": 3.7318358421325684, "learning_rate": 4.600803455809334e-06, "loss": 0.803, "step": 5453 }, { "epoch": 0.38850304519713647, "grad_norm": 1.4613131284713745, "learning_rate": 4.597497632654416e-06, "loss": 0.1084, "step": 5454 }, { "epoch": 0.38857427787869075, "grad_norm": 4.025689601898193, "learning_rate": 4.594192643049976e-06, "loss": 0.3529, "step": 5455 }, { "epoch": 0.38864551056024504, "grad_norm": 2.926999807357788, "learning_rate": 4.590888487505941e-06, "loss": 0.4734, "step": 5456 }, { "epoch": 0.38871674324179933, "grad_norm": 3.092235565185547, "learning_rate": 4.587585166532115e-06, "loss": 0.5577, "step": 5457 }, { "epoch": 0.3887879759233536, "grad_norm": 1.7285566329956055, "learning_rate": 4.584282680638155e-06, "loss": 0.1814, "step": 5458 }, { "epoch": 0.3888592086049079, "grad_norm": 2.8334414958953857, "learning_rate": 4.580981030333606e-06, "loss": 0.4959, "step": 5459 }, { "epoch": 0.3889304412864622, "grad_norm": 4.873475074768066, "learning_rate": 4.577680216127885e-06, "loss": 0.775, "step": 5460 }, { "epoch": 0.38900167396801655, "grad_norm": 3.5511558055877686, "learning_rate": 4.574380238530262e-06, "loss": 0.6279, "step": 5461 }, { "epoch": 0.38907290664957084, "grad_norm": 2.5205962657928467, "learning_rate": 4.5710810980498996e-06, "loss": 0.3534, "step": 5462 }, { "epoch": 0.3891441393311251, "grad_norm": 2.4594614505767822, "learning_rate": 4.567782795195816e-06, "loss": 0.2933, "step": 5463 }, { "epoch": 0.3892153720126794, "grad_norm": 3.255882740020752, "learning_rate": 4.564485330476903e-06, "loss": 0.4725, "step": 5464 }, { "epoch": 0.3892866046942337, "grad_norm": 3.693554162979126, "learning_rate": 4.561188704401929e-06, "loss": 0.3781, "step": 5465 }, { "epoch": 0.389357837375788, "grad_norm": 3.5549910068511963, "learning_rate": 4.557892917479532e-06, "loss": 0.3915, "step": 5466 }, { "epoch": 0.3894290700573423, "grad_norm": 2.399181842803955, "learning_rate": 4.5545979702182105e-06, "loss": 0.6313, "step": 5467 }, { "epoch": 0.38950030273889663, "grad_norm": 4.199453830718994, "learning_rate": 4.551303863126346e-06, "loss": 0.6143, "step": 5468 }, { "epoch": 0.3895715354204509, "grad_norm": 4.368882179260254, "learning_rate": 4.5480105967121855e-06, "loss": 1.1257, "step": 5469 }, { "epoch": 0.3896427681020052, "grad_norm": 2.684546947479248, "learning_rate": 4.544718171483849e-06, "loss": 0.444, "step": 5470 }, { "epoch": 0.3897140007835595, "grad_norm": 2.443817138671875, "learning_rate": 4.541426587949315e-06, "loss": 0.4569, "step": 5471 }, { "epoch": 0.3897852334651138, "grad_norm": 2.0067057609558105, "learning_rate": 4.538135846616447e-06, "loss": 0.3088, "step": 5472 }, { "epoch": 0.3898564661466681, "grad_norm": 2.245345115661621, "learning_rate": 4.534845947992975e-06, "loss": 0.2745, "step": 5473 }, { "epoch": 0.38992769882822237, "grad_norm": 2.4802393913269043, "learning_rate": 4.53155689258649e-06, "loss": 0.3808, "step": 5474 }, { "epoch": 0.38999893150977666, "grad_norm": 5.1353678703308105, "learning_rate": 4.528268680904465e-06, "loss": 0.6405, "step": 5475 }, { "epoch": 0.390070164191331, "grad_norm": 5.4735870361328125, "learning_rate": 4.524981313454232e-06, "loss": 0.4588, "step": 5476 }, { "epoch": 0.3901413968728853, "grad_norm": 3.213718891143799, "learning_rate": 4.521694790743003e-06, "loss": 0.6549, "step": 5477 }, { "epoch": 0.3902126295544396, "grad_norm": 2.543975353240967, "learning_rate": 4.51840911327785e-06, "loss": 0.4541, "step": 5478 }, { "epoch": 0.3902838622359939, "grad_norm": 4.584540367126465, "learning_rate": 4.515124281565724e-06, "loss": 0.9319, "step": 5479 }, { "epoch": 0.39035509491754816, "grad_norm": 1.551682949066162, "learning_rate": 4.511840296113434e-06, "loss": 0.1428, "step": 5480 }, { "epoch": 0.39042632759910245, "grad_norm": 2.9222652912139893, "learning_rate": 4.50855715742767e-06, "loss": 0.5957, "step": 5481 }, { "epoch": 0.39049756028065674, "grad_norm": 2.363218069076538, "learning_rate": 4.505274866014989e-06, "loss": 0.1984, "step": 5482 }, { "epoch": 0.3905687929622111, "grad_norm": 3.1833598613739014, "learning_rate": 4.501993422381807e-06, "loss": 0.6644, "step": 5483 }, { "epoch": 0.3906400256437654, "grad_norm": 1.9959990978240967, "learning_rate": 4.4987128270344224e-06, "loss": 0.314, "step": 5484 }, { "epoch": 0.39071125832531967, "grad_norm": 2.7094383239746094, "learning_rate": 4.4954330804790004e-06, "loss": 0.2055, "step": 5485 }, { "epoch": 0.39078249100687396, "grad_norm": 1.846490502357483, "learning_rate": 4.492154183221565e-06, "loss": 0.2338, "step": 5486 }, { "epoch": 0.39085372368842825, "grad_norm": 4.188443183898926, "learning_rate": 4.488876135768017e-06, "loss": 0.6356, "step": 5487 }, { "epoch": 0.39092495636998253, "grad_norm": 3.230543375015259, "learning_rate": 4.485598938624133e-06, "loss": 0.282, "step": 5488 }, { "epoch": 0.3909961890515368, "grad_norm": 3.3256776332855225, "learning_rate": 4.482322592295541e-06, "loss": 0.5929, "step": 5489 }, { "epoch": 0.39106742173309117, "grad_norm": 2.893441915512085, "learning_rate": 4.479047097287752e-06, "loss": 0.4849, "step": 5490 }, { "epoch": 0.39113865441464546, "grad_norm": 3.9326224327087402, "learning_rate": 4.475772454106144e-06, "loss": 0.2996, "step": 5491 }, { "epoch": 0.39120988709619975, "grad_norm": 2.55621075630188, "learning_rate": 4.47249866325596e-06, "loss": 0.6829, "step": 5492 }, { "epoch": 0.39128111977775404, "grad_norm": 3.620805263519287, "learning_rate": 4.469225725242304e-06, "loss": 0.4839, "step": 5493 }, { "epoch": 0.39135235245930833, "grad_norm": 3.0690131187438965, "learning_rate": 4.465953640570167e-06, "loss": 0.6178, "step": 5494 }, { "epoch": 0.3914235851408626, "grad_norm": 2.6632163524627686, "learning_rate": 4.462682409744391e-06, "loss": 0.1753, "step": 5495 }, { "epoch": 0.3914948178224169, "grad_norm": 4.269992828369141, "learning_rate": 4.459412033269695e-06, "loss": 0.8571, "step": 5496 }, { "epoch": 0.3915660505039712, "grad_norm": 2.856400489807129, "learning_rate": 4.456142511650669e-06, "loss": 0.7619, "step": 5497 }, { "epoch": 0.39163728318552554, "grad_norm": 3.953866720199585, "learning_rate": 4.452873845391759e-06, "loss": 0.6128, "step": 5498 }, { "epoch": 0.39170851586707983, "grad_norm": 3.2647199630737305, "learning_rate": 4.44960603499729e-06, "loss": 0.5558, "step": 5499 }, { "epoch": 0.3917797485486341, "grad_norm": 4.533194065093994, "learning_rate": 4.4463390809714566e-06, "loss": 0.7052, "step": 5500 }, { "epoch": 0.3918509812301884, "grad_norm": 3.3634755611419678, "learning_rate": 4.4430729838183065e-06, "loss": 0.6264, "step": 5501 }, { "epoch": 0.3919222139117427, "grad_norm": 3.106708526611328, "learning_rate": 4.43980774404177e-06, "loss": 0.2476, "step": 5502 }, { "epoch": 0.391993446593297, "grad_norm": 4.899411201477051, "learning_rate": 4.436543362145643e-06, "loss": 0.5122, "step": 5503 }, { "epoch": 0.3920646792748513, "grad_norm": 4.1242804527282715, "learning_rate": 4.433279838633581e-06, "loss": 0.5258, "step": 5504 }, { "epoch": 0.3921359119564056, "grad_norm": 4.868536472320557, "learning_rate": 4.430017174009111e-06, "loss": 0.3375, "step": 5505 }, { "epoch": 0.3922071446379599, "grad_norm": 2.811708688735962, "learning_rate": 4.426755368775637e-06, "loss": 0.7203, "step": 5506 }, { "epoch": 0.3922783773195142, "grad_norm": 3.1068015098571777, "learning_rate": 4.423494423436415e-06, "loss": 0.4292, "step": 5507 }, { "epoch": 0.3923496100010685, "grad_norm": 4.9009199142456055, "learning_rate": 4.420234338494574e-06, "loss": 0.3909, "step": 5508 }, { "epoch": 0.3924208426826228, "grad_norm": 3.1386559009552, "learning_rate": 4.416975114453114e-06, "loss": 0.5027, "step": 5509 }, { "epoch": 0.3924920753641771, "grad_norm": 2.3926899433135986, "learning_rate": 4.4137167518149025e-06, "loss": 0.2896, "step": 5510 }, { "epoch": 0.39256330804573136, "grad_norm": 2.331049919128418, "learning_rate": 4.410459251082666e-06, "loss": 0.2595, "step": 5511 }, { "epoch": 0.39263454072728565, "grad_norm": 3.1107561588287354, "learning_rate": 4.407202612759005e-06, "loss": 0.475, "step": 5512 }, { "epoch": 0.39270577340884, "grad_norm": 3.5783321857452393, "learning_rate": 4.40394683734639e-06, "loss": 0.5751, "step": 5513 }, { "epoch": 0.3927770060903943, "grad_norm": 3.172025680541992, "learning_rate": 4.400691925347147e-06, "loss": 0.7812, "step": 5514 }, { "epoch": 0.3928482387719486, "grad_norm": 2.526502847671509, "learning_rate": 4.397437877263478e-06, "loss": 0.1673, "step": 5515 }, { "epoch": 0.39291947145350287, "grad_norm": 2.57446551322937, "learning_rate": 4.394184693597452e-06, "loss": 0.2173, "step": 5516 }, { "epoch": 0.39299070413505716, "grad_norm": 2.521430015563965, "learning_rate": 4.390932374850996e-06, "loss": 0.6448, "step": 5517 }, { "epoch": 0.39306193681661145, "grad_norm": 2.2509264945983887, "learning_rate": 4.387680921525912e-06, "loss": 0.2478, "step": 5518 }, { "epoch": 0.39313316949816574, "grad_norm": 2.628680944442749, "learning_rate": 4.38443033412387e-06, "loss": 0.113, "step": 5519 }, { "epoch": 0.3932044021797201, "grad_norm": 1.793778896331787, "learning_rate": 4.381180613146396e-06, "loss": 0.2062, "step": 5520 }, { "epoch": 0.39327563486127437, "grad_norm": 2.8083643913269043, "learning_rate": 4.377931759094892e-06, "loss": 0.2791, "step": 5521 }, { "epoch": 0.39334686754282866, "grad_norm": 3.3780946731567383, "learning_rate": 4.374683772470619e-06, "loss": 0.6995, "step": 5522 }, { "epoch": 0.39341810022438295, "grad_norm": 1.8015897274017334, "learning_rate": 4.371436653774714e-06, "loss": 0.3793, "step": 5523 }, { "epoch": 0.39348933290593724, "grad_norm": 2.5770375728607178, "learning_rate": 4.368190403508167e-06, "loss": 0.4898, "step": 5524 }, { "epoch": 0.39356056558749153, "grad_norm": 4.36686897277832, "learning_rate": 4.364945022171847e-06, "loss": 0.5951, "step": 5525 }, { "epoch": 0.3936317982690458, "grad_norm": 2.2994542121887207, "learning_rate": 4.361700510266477e-06, "loss": 0.472, "step": 5526 }, { "epoch": 0.3937030309506001, "grad_norm": 2.05240797996521, "learning_rate": 4.3584568682926555e-06, "loss": 0.1974, "step": 5527 }, { "epoch": 0.39377426363215445, "grad_norm": 4.21865701675415, "learning_rate": 4.355214096750846e-06, "loss": 0.2739, "step": 5528 }, { "epoch": 0.39384549631370874, "grad_norm": 3.7059342861175537, "learning_rate": 4.351972196141368e-06, "loss": 0.3747, "step": 5529 }, { "epoch": 0.39391672899526303, "grad_norm": 5.179049968719482, "learning_rate": 4.348731166964415e-06, "loss": 0.7822, "step": 5530 }, { "epoch": 0.3939879616768173, "grad_norm": 2.7061095237731934, "learning_rate": 4.345491009720052e-06, "loss": 0.4232, "step": 5531 }, { "epoch": 0.3940591943583716, "grad_norm": 3.061569929122925, "learning_rate": 4.342251724908191e-06, "loss": 0.4466, "step": 5532 }, { "epoch": 0.3941304270399259, "grad_norm": 2.8525450229644775, "learning_rate": 4.339013313028626e-06, "loss": 0.2157, "step": 5533 }, { "epoch": 0.3942016597214802, "grad_norm": 4.641369819641113, "learning_rate": 4.3357757745810126e-06, "loss": 0.3824, "step": 5534 }, { "epoch": 0.39427289240303454, "grad_norm": 2.9891068935394287, "learning_rate": 4.332539110064864e-06, "loss": 0.5526, "step": 5535 }, { "epoch": 0.3943441250845888, "grad_norm": 3.9507100582122803, "learning_rate": 4.329303319979571e-06, "loss": 0.4296, "step": 5536 }, { "epoch": 0.3944153577661431, "grad_norm": 2.312575578689575, "learning_rate": 4.326068404824375e-06, "loss": 0.3086, "step": 5537 }, { "epoch": 0.3944865904476974, "grad_norm": 3.338559865951538, "learning_rate": 4.322834365098398e-06, "loss": 0.5138, "step": 5538 }, { "epoch": 0.3945578231292517, "grad_norm": 4.130187511444092, "learning_rate": 4.319601201300611e-06, "loss": 0.4645, "step": 5539 }, { "epoch": 0.394629055810806, "grad_norm": 4.241756439208984, "learning_rate": 4.316368913929864e-06, "loss": 0.6608, "step": 5540 }, { "epoch": 0.3947002884923603, "grad_norm": 2.5561025142669678, "learning_rate": 4.3131375034848624e-06, "loss": 0.4796, "step": 5541 }, { "epoch": 0.3947715211739146, "grad_norm": 2.5069406032562256, "learning_rate": 4.30990697046418e-06, "loss": 0.4648, "step": 5542 }, { "epoch": 0.3948427538554689, "grad_norm": 2.4875786304473877, "learning_rate": 4.306677315366258e-06, "loss": 0.2874, "step": 5543 }, { "epoch": 0.3949139865370232, "grad_norm": 3.1734437942504883, "learning_rate": 4.303448538689393e-06, "loss": 0.7319, "step": 5544 }, { "epoch": 0.3949852192185775, "grad_norm": 3.853637933731079, "learning_rate": 4.300220640931756e-06, "loss": 0.4545, "step": 5545 }, { "epoch": 0.3950564519001318, "grad_norm": 1.8386651277542114, "learning_rate": 4.296993622591377e-06, "loss": 0.1033, "step": 5546 }, { "epoch": 0.39512768458168607, "grad_norm": 3.3511757850646973, "learning_rate": 4.293767484166157e-06, "loss": 0.6425, "step": 5547 }, { "epoch": 0.39519891726324036, "grad_norm": 4.008768081665039, "learning_rate": 4.290542226153847e-06, "loss": 0.4949, "step": 5548 }, { "epoch": 0.39527014994479465, "grad_norm": 3.494267702102661, "learning_rate": 4.287317849052075e-06, "loss": 0.7049, "step": 5549 }, { "epoch": 0.395341382626349, "grad_norm": 5.585061550140381, "learning_rate": 4.284094353358334e-06, "loss": 0.5785, "step": 5550 }, { "epoch": 0.3954126153079033, "grad_norm": 4.198767185211182, "learning_rate": 4.280871739569972e-06, "loss": 0.1196, "step": 5551 }, { "epoch": 0.39548384798945757, "grad_norm": 2.7005066871643066, "learning_rate": 4.277650008184201e-06, "loss": 0.2907, "step": 5552 }, { "epoch": 0.39555508067101186, "grad_norm": 3.2378079891204834, "learning_rate": 4.274429159698109e-06, "loss": 0.529, "step": 5553 }, { "epoch": 0.39562631335256615, "grad_norm": 3.4584784507751465, "learning_rate": 4.271209194608631e-06, "loss": 0.6684, "step": 5554 }, { "epoch": 0.39569754603412044, "grad_norm": 2.4293625354766846, "learning_rate": 4.26799011341258e-06, "loss": 0.4219, "step": 5555 }, { "epoch": 0.39576877871567473, "grad_norm": 3.337925434112549, "learning_rate": 4.26477191660663e-06, "loss": 0.6591, "step": 5556 }, { "epoch": 0.3958400113972291, "grad_norm": 5.615551471710205, "learning_rate": 4.261554604687308e-06, "loss": 0.6732, "step": 5557 }, { "epoch": 0.39591124407878336, "grad_norm": 2.9540719985961914, "learning_rate": 4.2583381781510156e-06, "loss": 0.3019, "step": 5558 }, { "epoch": 0.39598247676033765, "grad_norm": 2.1680104732513428, "learning_rate": 4.255122637494018e-06, "loss": 0.3133, "step": 5559 }, { "epoch": 0.39605370944189194, "grad_norm": 2.062800645828247, "learning_rate": 4.251907983212435e-06, "loss": 0.2134, "step": 5560 }, { "epoch": 0.39612494212344623, "grad_norm": 6.755223751068115, "learning_rate": 4.248694215802254e-06, "loss": 0.5154, "step": 5561 }, { "epoch": 0.3961961748050005, "grad_norm": 3.0557901859283447, "learning_rate": 4.245481335759333e-06, "loss": 0.6599, "step": 5562 }, { "epoch": 0.3962674074865548, "grad_norm": 2.9683494567871094, "learning_rate": 4.2422693435793785e-06, "loss": 0.4865, "step": 5563 }, { "epoch": 0.3963386401681091, "grad_norm": 4.353297710418701, "learning_rate": 4.23905823975797e-06, "loss": 0.3867, "step": 5564 }, { "epoch": 0.39640987284966345, "grad_norm": 2.336637258529663, "learning_rate": 4.2358480247905535e-06, "loss": 0.1486, "step": 5565 }, { "epoch": 0.39648110553121774, "grad_norm": 2.6797850131988525, "learning_rate": 4.2326386991724235e-06, "loss": 0.3882, "step": 5566 }, { "epoch": 0.396552338212772, "grad_norm": 2.8672971725463867, "learning_rate": 4.229430263398754e-06, "loss": 0.39, "step": 5567 }, { "epoch": 0.3966235708943263, "grad_norm": 3.5295283794403076, "learning_rate": 4.2262227179645685e-06, "loss": 0.5657, "step": 5568 }, { "epoch": 0.3966948035758806, "grad_norm": 3.0469577312469482, "learning_rate": 4.2230160633647565e-06, "loss": 0.4126, "step": 5569 }, { "epoch": 0.3967660362574349, "grad_norm": 3.4283294677734375, "learning_rate": 4.2198103000940735e-06, "loss": 0.36, "step": 5570 }, { "epoch": 0.3968372689389892, "grad_norm": 2.640026330947876, "learning_rate": 4.216605428647141e-06, "loss": 0.28, "step": 5571 }, { "epoch": 0.39690850162054353, "grad_norm": 4.8043107986450195, "learning_rate": 4.213401449518431e-06, "loss": 0.6074, "step": 5572 }, { "epoch": 0.3969797343020978, "grad_norm": 3.286510944366455, "learning_rate": 4.210198363202286e-06, "loss": 0.7746, "step": 5573 }, { "epoch": 0.3970509669836521, "grad_norm": 3.3863909244537354, "learning_rate": 4.206996170192913e-06, "loss": 0.4477, "step": 5574 }, { "epoch": 0.3971221996652064, "grad_norm": 2.093454122543335, "learning_rate": 4.203794870984371e-06, "loss": 0.3696, "step": 5575 }, { "epoch": 0.3971934323467607, "grad_norm": 2.256032705307007, "learning_rate": 4.200594466070592e-06, "loss": 0.3355, "step": 5576 }, { "epoch": 0.397264665028315, "grad_norm": 2.3062922954559326, "learning_rate": 4.197394955945368e-06, "loss": 0.1724, "step": 5577 }, { "epoch": 0.39733589770986927, "grad_norm": 2.978522777557373, "learning_rate": 4.1941963411023425e-06, "loss": 0.7614, "step": 5578 }, { "epoch": 0.39740713039142356, "grad_norm": 5.180523872375488, "learning_rate": 4.190998622035034e-06, "loss": 0.5384, "step": 5579 }, { "epoch": 0.3974783630729779, "grad_norm": 3.5489413738250732, "learning_rate": 4.1878017992368205e-06, "loss": 0.5807, "step": 5580 }, { "epoch": 0.3975495957545322, "grad_norm": 3.232182502746582, "learning_rate": 4.184605873200932e-06, "loss": 0.4119, "step": 5581 }, { "epoch": 0.3976208284360865, "grad_norm": 5.883045673370361, "learning_rate": 4.181410844420473e-06, "loss": 0.5685, "step": 5582 }, { "epoch": 0.39769206111764077, "grad_norm": 3.288722038269043, "learning_rate": 4.1782167133883985e-06, "loss": 0.6259, "step": 5583 }, { "epoch": 0.39776329379919506, "grad_norm": 4.009815692901611, "learning_rate": 4.1750234805975355e-06, "loss": 0.3425, "step": 5584 }, { "epoch": 0.39783452648074935, "grad_norm": 3.8598456382751465, "learning_rate": 4.17183114654056e-06, "loss": 0.7768, "step": 5585 }, { "epoch": 0.39790575916230364, "grad_norm": 3.2334201335906982, "learning_rate": 4.168639711710019e-06, "loss": 0.4411, "step": 5586 }, { "epoch": 0.397976991843858, "grad_norm": 3.11377215385437, "learning_rate": 4.165449176598325e-06, "loss": 0.4507, "step": 5587 }, { "epoch": 0.3980482245254123, "grad_norm": 3.379246711730957, "learning_rate": 4.162259541697734e-06, "loss": 0.6168, "step": 5588 }, { "epoch": 0.39811945720696656, "grad_norm": 2.658778429031372, "learning_rate": 4.159070807500378e-06, "loss": 0.347, "step": 5589 }, { "epoch": 0.39819068988852085, "grad_norm": 3.2053604125976562, "learning_rate": 4.155882974498251e-06, "loss": 0.7045, "step": 5590 }, { "epoch": 0.39826192257007514, "grad_norm": 3.5838980674743652, "learning_rate": 4.152696043183194e-06, "loss": 0.6407, "step": 5591 }, { "epoch": 0.39833315525162943, "grad_norm": 2.1597185134887695, "learning_rate": 4.149510014046922e-06, "loss": 0.3473, "step": 5592 }, { "epoch": 0.3984043879331837, "grad_norm": 3.209625005722046, "learning_rate": 4.14632488758101e-06, "loss": 0.4571, "step": 5593 }, { "epoch": 0.39847562061473807, "grad_norm": 3.292127847671509, "learning_rate": 4.143140664276884e-06, "loss": 0.4027, "step": 5594 }, { "epoch": 0.39854685329629236, "grad_norm": 3.22642183303833, "learning_rate": 4.139957344625843e-06, "loss": 0.3247, "step": 5595 }, { "epoch": 0.39861808597784665, "grad_norm": 2.6260876655578613, "learning_rate": 4.136774929119033e-06, "loss": 0.6195, "step": 5596 }, { "epoch": 0.39868931865940094, "grad_norm": 2.67214298248291, "learning_rate": 4.133593418247474e-06, "loss": 0.3484, "step": 5597 }, { "epoch": 0.3987605513409552, "grad_norm": 3.258960008621216, "learning_rate": 4.130412812502037e-06, "loss": 0.539, "step": 5598 }, { "epoch": 0.3988317840225095, "grad_norm": 1.685829520225525, "learning_rate": 4.12723311237346e-06, "loss": 0.1472, "step": 5599 }, { "epoch": 0.3989030167040638, "grad_norm": 3.7060458660125732, "learning_rate": 4.124054318352333e-06, "loss": 0.3885, "step": 5600 }, { "epoch": 0.3989742493856181, "grad_norm": 3.5457687377929688, "learning_rate": 4.120876430929115e-06, "loss": 0.4721, "step": 5601 }, { "epoch": 0.39904548206717244, "grad_norm": 3.4391109943389893, "learning_rate": 4.117699450594122e-06, "loss": 0.3538, "step": 5602 }, { "epoch": 0.39911671474872673, "grad_norm": 2.9192721843719482, "learning_rate": 4.114523377837526e-06, "loss": 0.2232, "step": 5603 }, { "epoch": 0.399187947430281, "grad_norm": 1.5902012586593628, "learning_rate": 4.1113482131493635e-06, "loss": 0.1144, "step": 5604 }, { "epoch": 0.3992591801118353, "grad_norm": 4.557648658752441, "learning_rate": 4.108173957019534e-06, "loss": 0.6449, "step": 5605 }, { "epoch": 0.3993304127933896, "grad_norm": 3.6590940952301025, "learning_rate": 4.1050006099377846e-06, "loss": 0.4986, "step": 5606 }, { "epoch": 0.3994016454749439, "grad_norm": 2.0602104663848877, "learning_rate": 4.101828172393734e-06, "loss": 0.2932, "step": 5607 }, { "epoch": 0.3994728781564982, "grad_norm": 4.471996784210205, "learning_rate": 4.098656644876863e-06, "loss": 0.7359, "step": 5608 }, { "epoch": 0.3995441108380525, "grad_norm": 3.4700565338134766, "learning_rate": 4.095486027876494e-06, "loss": 0.5374, "step": 5609 }, { "epoch": 0.3996153435196068, "grad_norm": 3.1076204776763916, "learning_rate": 4.0923163218818265e-06, "loss": 0.4785, "step": 5610 }, { "epoch": 0.3996865762011611, "grad_norm": 1.8565099239349365, "learning_rate": 4.089147527381917e-06, "loss": 0.2021, "step": 5611 }, { "epoch": 0.3997578088827154, "grad_norm": 1.9633798599243164, "learning_rate": 4.085979644865674e-06, "loss": 0.2435, "step": 5612 }, { "epoch": 0.3998290415642697, "grad_norm": 2.6718318462371826, "learning_rate": 4.082812674821865e-06, "loss": 0.2632, "step": 5613 }, { "epoch": 0.399900274245824, "grad_norm": 4.0243988037109375, "learning_rate": 4.079646617739129e-06, "loss": 0.7216, "step": 5614 }, { "epoch": 0.39997150692737826, "grad_norm": 2.5741164684295654, "learning_rate": 4.076481474105949e-06, "loss": 0.4216, "step": 5615 }, { "epoch": 0.40004273960893255, "grad_norm": 2.7743241786956787, "learning_rate": 4.073317244410677e-06, "loss": 0.3926, "step": 5616 }, { "epoch": 0.4001139722904869, "grad_norm": 1.742374062538147, "learning_rate": 4.070153929141524e-06, "loss": 0.1124, "step": 5617 }, { "epoch": 0.4001852049720412, "grad_norm": 3.0507986545562744, "learning_rate": 4.066991528786551e-06, "loss": 0.6692, "step": 5618 }, { "epoch": 0.4002564376535955, "grad_norm": 3.0860185623168945, "learning_rate": 4.063830043833688e-06, "loss": 0.4879, "step": 5619 }, { "epoch": 0.40032767033514977, "grad_norm": 5.363525390625, "learning_rate": 4.060669474770716e-06, "loss": 0.6333, "step": 5620 }, { "epoch": 0.40039890301670406, "grad_norm": 2.4268500804901123, "learning_rate": 4.057509822085286e-06, "loss": 0.5724, "step": 5621 }, { "epoch": 0.40047013569825834, "grad_norm": 2.060286521911621, "learning_rate": 4.054351086264891e-06, "loss": 0.3674, "step": 5622 }, { "epoch": 0.40054136837981263, "grad_norm": 2.843463659286499, "learning_rate": 4.051193267796894e-06, "loss": 0.3543, "step": 5623 }, { "epoch": 0.400612601061367, "grad_norm": 2.9496917724609375, "learning_rate": 4.048036367168521e-06, "loss": 0.5371, "step": 5624 }, { "epoch": 0.40068383374292127, "grad_norm": 3.627714157104492, "learning_rate": 4.0448803848668374e-06, "loss": 0.6321, "step": 5625 }, { "epoch": 0.40075506642447556, "grad_norm": 3.9404892921447754, "learning_rate": 4.0417253213787885e-06, "loss": 0.7789, "step": 5626 }, { "epoch": 0.40082629910602985, "grad_norm": 3.093587636947632, "learning_rate": 4.038571177191164e-06, "loss": 0.3352, "step": 5627 }, { "epoch": 0.40089753178758414, "grad_norm": 3.0548393726348877, "learning_rate": 4.035417952790613e-06, "loss": 0.5563, "step": 5628 }, { "epoch": 0.40096876446913843, "grad_norm": 4.6604509353637695, "learning_rate": 4.032265648663649e-06, "loss": 0.6275, "step": 5629 }, { "epoch": 0.4010399971506927, "grad_norm": 2.6506800651550293, "learning_rate": 4.029114265296642e-06, "loss": 0.5077, "step": 5630 }, { "epoch": 0.40111122983224706, "grad_norm": 2.462827682495117, "learning_rate": 4.025963803175813e-06, "loss": 0.1702, "step": 5631 }, { "epoch": 0.40118246251380135, "grad_norm": 3.7797000408172607, "learning_rate": 4.022814262787248e-06, "loss": 0.3234, "step": 5632 }, { "epoch": 0.40125369519535564, "grad_norm": 3.530513048171997, "learning_rate": 4.0196656446168925e-06, "loss": 0.4783, "step": 5633 }, { "epoch": 0.40132492787690993, "grad_norm": 5.419461250305176, "learning_rate": 4.01651794915054e-06, "loss": 0.5485, "step": 5634 }, { "epoch": 0.4013961605584642, "grad_norm": 2.8266677856445312, "learning_rate": 4.013371176873849e-06, "loss": 0.358, "step": 5635 }, { "epoch": 0.4014673932400185, "grad_norm": 2.5641632080078125, "learning_rate": 4.0102253282723394e-06, "loss": 0.6311, "step": 5636 }, { "epoch": 0.4015386259215728, "grad_norm": 3.914853811264038, "learning_rate": 4.007080403831374e-06, "loss": 0.6704, "step": 5637 }, { "epoch": 0.4016098586031271, "grad_norm": 2.8380489349365234, "learning_rate": 4.003936404036188e-06, "loss": 0.6349, "step": 5638 }, { "epoch": 0.40168109128468144, "grad_norm": 4.017549514770508, "learning_rate": 4.000793329371872e-06, "loss": 0.2996, "step": 5639 }, { "epoch": 0.4017523239662357, "grad_norm": 1.7629992961883545, "learning_rate": 3.99765118032336e-06, "loss": 0.233, "step": 5640 }, { "epoch": 0.40182355664779, "grad_norm": 2.658208131790161, "learning_rate": 3.9945099573754635e-06, "loss": 0.2826, "step": 5641 }, { "epoch": 0.4018947893293443, "grad_norm": 3.1066253185272217, "learning_rate": 3.991369661012831e-06, "loss": 0.6721, "step": 5642 }, { "epoch": 0.4019660220108986, "grad_norm": 2.8789381980895996, "learning_rate": 3.988230291719987e-06, "loss": 0.6318, "step": 5643 }, { "epoch": 0.4020372546924529, "grad_norm": 5.662254810333252, "learning_rate": 3.9850918499812976e-06, "loss": 0.7761, "step": 5644 }, { "epoch": 0.4021084873740072, "grad_norm": 2.5553088188171387, "learning_rate": 3.981954336280996e-06, "loss": 0.1043, "step": 5645 }, { "epoch": 0.4021797200555615, "grad_norm": 3.878598928451538, "learning_rate": 3.978817751103163e-06, "loss": 0.5155, "step": 5646 }, { "epoch": 0.4022509527371158, "grad_norm": 2.507641553878784, "learning_rate": 3.975682094931747e-06, "loss": 0.4187, "step": 5647 }, { "epoch": 0.4023221854186701, "grad_norm": 1.9053068161010742, "learning_rate": 3.972547368250547e-06, "loss": 0.0809, "step": 5648 }, { "epoch": 0.4023934181002244, "grad_norm": 3.15293550491333, "learning_rate": 3.969413571543214e-06, "loss": 0.3922, "step": 5649 }, { "epoch": 0.4024646507817787, "grad_norm": 4.846325397491455, "learning_rate": 3.9662807052932625e-06, "loss": 0.3441, "step": 5650 }, { "epoch": 0.40253588346333297, "grad_norm": 4.171367168426514, "learning_rate": 3.963148769984069e-06, "loss": 0.8133, "step": 5651 }, { "epoch": 0.40260711614488726, "grad_norm": 5.379081726074219, "learning_rate": 3.960017766098847e-06, "loss": 0.6523, "step": 5652 }, { "epoch": 0.40267834882644155, "grad_norm": 1.9458099603652954, "learning_rate": 3.956887694120685e-06, "loss": 0.0842, "step": 5653 }, { "epoch": 0.4027495815079959, "grad_norm": 2.2699942588806152, "learning_rate": 3.953758554532523e-06, "loss": 0.4191, "step": 5654 }, { "epoch": 0.4028208141895502, "grad_norm": 4.668844223022461, "learning_rate": 3.950630347817148e-06, "loss": 0.6042, "step": 5655 }, { "epoch": 0.40289204687110447, "grad_norm": 3.8662681579589844, "learning_rate": 3.947503074457219e-06, "loss": 0.8306, "step": 5656 }, { "epoch": 0.40296327955265876, "grad_norm": 3.1911513805389404, "learning_rate": 3.9443767349352315e-06, "loss": 0.7461, "step": 5657 }, { "epoch": 0.40303451223421305, "grad_norm": 2.8855390548706055, "learning_rate": 3.9412513297335574e-06, "loss": 0.3531, "step": 5658 }, { "epoch": 0.40310574491576734, "grad_norm": 3.4259185791015625, "learning_rate": 3.938126859334407e-06, "loss": 0.5247, "step": 5659 }, { "epoch": 0.40317697759732163, "grad_norm": 3.20544695854187, "learning_rate": 3.935003324219856e-06, "loss": 0.7279, "step": 5660 }, { "epoch": 0.403248210278876, "grad_norm": 3.3866467475891113, "learning_rate": 3.931880724871838e-06, "loss": 0.5715, "step": 5661 }, { "epoch": 0.40331944296043026, "grad_norm": 1.7271698713302612, "learning_rate": 3.928759061772132e-06, "loss": 0.0776, "step": 5662 }, { "epoch": 0.40339067564198455, "grad_norm": 3.5191445350646973, "learning_rate": 3.9256383354023804e-06, "loss": 0.5635, "step": 5663 }, { "epoch": 0.40346190832353884, "grad_norm": 3.7514805793762207, "learning_rate": 3.922518546244084e-06, "loss": 0.3154, "step": 5664 }, { "epoch": 0.40353314100509313, "grad_norm": 3.208941698074341, "learning_rate": 3.919399694778586e-06, "loss": 0.6876, "step": 5665 }, { "epoch": 0.4036043736866474, "grad_norm": 2.3509411811828613, "learning_rate": 3.916281781487098e-06, "loss": 0.4117, "step": 5666 }, { "epoch": 0.4036756063682017, "grad_norm": 2.328493118286133, "learning_rate": 3.913164806850683e-06, "loss": 0.3136, "step": 5667 }, { "epoch": 0.403746839049756, "grad_norm": 3.501387119293213, "learning_rate": 3.910048771350253e-06, "loss": 0.8216, "step": 5668 }, { "epoch": 0.40381807173131035, "grad_norm": 4.427062034606934, "learning_rate": 3.906933675466584e-06, "loss": 0.3659, "step": 5669 }, { "epoch": 0.40388930441286464, "grad_norm": 2.813295602798462, "learning_rate": 3.9038195196803055e-06, "loss": 0.283, "step": 5670 }, { "epoch": 0.4039605370944189, "grad_norm": 4.333924770355225, "learning_rate": 3.900706304471896e-06, "loss": 0.4413, "step": 5671 }, { "epoch": 0.4040317697759732, "grad_norm": 1.7037440538406372, "learning_rate": 3.89759403032169e-06, "loss": 0.096, "step": 5672 }, { "epoch": 0.4041030024575275, "grad_norm": 3.028991222381592, "learning_rate": 3.8944826977098856e-06, "loss": 0.4569, "step": 5673 }, { "epoch": 0.4041742351390818, "grad_norm": 2.7144863605499268, "learning_rate": 3.891372307116523e-06, "loss": 0.5916, "step": 5674 }, { "epoch": 0.4042454678206361, "grad_norm": 3.2331299781799316, "learning_rate": 3.888262859021508e-06, "loss": 0.7344, "step": 5675 }, { "epoch": 0.40431670050219043, "grad_norm": 3.451601266860962, "learning_rate": 3.885154353904598e-06, "loss": 0.4605, "step": 5676 }, { "epoch": 0.4043879331837447, "grad_norm": 2.3365018367767334, "learning_rate": 3.882046792245395e-06, "loss": 0.1498, "step": 5677 }, { "epoch": 0.404459165865299, "grad_norm": 3.6335320472717285, "learning_rate": 3.878940174523371e-06, "loss": 0.6881, "step": 5678 }, { "epoch": 0.4045303985468533, "grad_norm": 5.393756866455078, "learning_rate": 3.875834501217847e-06, "loss": 0.7355, "step": 5679 }, { "epoch": 0.4046016312284076, "grad_norm": 2.724384069442749, "learning_rate": 3.872729772807989e-06, "loss": 0.7294, "step": 5680 }, { "epoch": 0.4046728639099619, "grad_norm": 2.409464120864868, "learning_rate": 3.869625989772828e-06, "loss": 0.5674, "step": 5681 }, { "epoch": 0.40474409659151617, "grad_norm": 2.431658983230591, "learning_rate": 3.8665231525912505e-06, "loss": 0.4594, "step": 5682 }, { "epoch": 0.4048153292730705, "grad_norm": 2.8031795024871826, "learning_rate": 3.863421261741983e-06, "loss": 0.5357, "step": 5683 }, { "epoch": 0.4048865619546248, "grad_norm": 3.1389405727386475, "learning_rate": 3.860320317703622e-06, "loss": 0.7543, "step": 5684 }, { "epoch": 0.4049577946361791, "grad_norm": 1.7408441305160522, "learning_rate": 3.857220320954612e-06, "loss": 0.1774, "step": 5685 }, { "epoch": 0.4050290273177334, "grad_norm": 2.610588550567627, "learning_rate": 3.854121271973245e-06, "loss": 0.1872, "step": 5686 }, { "epoch": 0.40510025999928767, "grad_norm": 3.602039337158203, "learning_rate": 3.851023171237678e-06, "loss": 0.3202, "step": 5687 }, { "epoch": 0.40517149268084196, "grad_norm": 4.114919662475586, "learning_rate": 3.8479260192259135e-06, "loss": 0.3744, "step": 5688 }, { "epoch": 0.40524272536239625, "grad_norm": 3.1582579612731934, "learning_rate": 3.844829816415808e-06, "loss": 0.4391, "step": 5689 }, { "epoch": 0.40531395804395054, "grad_norm": 3.3042242527008057, "learning_rate": 3.841734563285076e-06, "loss": 0.3553, "step": 5690 }, { "epoch": 0.4053851907255049, "grad_norm": 2.7244980335235596, "learning_rate": 3.8386402603112845e-06, "loss": 0.5238, "step": 5691 }, { "epoch": 0.4054564234070592, "grad_norm": 3.092437267303467, "learning_rate": 3.835546907971849e-06, "loss": 0.6357, "step": 5692 }, { "epoch": 0.40552765608861346, "grad_norm": 3.4852359294891357, "learning_rate": 3.832454506744043e-06, "loss": 0.2806, "step": 5693 }, { "epoch": 0.40559888877016775, "grad_norm": 3.577840805053711, "learning_rate": 3.829363057104998e-06, "loss": 0.6292, "step": 5694 }, { "epoch": 0.40567012145172204, "grad_norm": 3.367783546447754, "learning_rate": 3.8262725595316845e-06, "loss": 0.3552, "step": 5695 }, { "epoch": 0.40574135413327633, "grad_norm": 1.8072013854980469, "learning_rate": 3.823183014500937e-06, "loss": 0.2247, "step": 5696 }, { "epoch": 0.4058125868148306, "grad_norm": 2.6634907722473145, "learning_rate": 3.820094422489442e-06, "loss": 0.5397, "step": 5697 }, { "epoch": 0.40588381949638497, "grad_norm": 3.3671035766601562, "learning_rate": 3.81700678397374e-06, "loss": 0.6367, "step": 5698 }, { "epoch": 0.40595505217793926, "grad_norm": 2.9392037391662598, "learning_rate": 3.813920099430215e-06, "loss": 0.1725, "step": 5699 }, { "epoch": 0.40602628485949355, "grad_norm": 3.306840658187866, "learning_rate": 3.810834369335118e-06, "loss": 0.1003, "step": 5700 }, { "epoch": 0.40609751754104784, "grad_norm": 3.4035587310791016, "learning_rate": 3.8077495941645392e-06, "loss": 0.6465, "step": 5701 }, { "epoch": 0.4061687502226021, "grad_norm": 2.6996712684631348, "learning_rate": 3.8046657743944327e-06, "loss": 0.4227, "step": 5702 }, { "epoch": 0.4062399829041564, "grad_norm": 2.7821662425994873, "learning_rate": 3.801582910500594e-06, "loss": 0.2156, "step": 5703 }, { "epoch": 0.4063112155857107, "grad_norm": 7.856212139129639, "learning_rate": 3.7985010029586856e-06, "loss": 0.2497, "step": 5704 }, { "epoch": 0.406382448267265, "grad_norm": 2.317035436630249, "learning_rate": 3.795420052244205e-06, "loss": 0.5712, "step": 5705 }, { "epoch": 0.40645368094881934, "grad_norm": 1.6571614742279053, "learning_rate": 3.7923400588325156e-06, "loss": 0.149, "step": 5706 }, { "epoch": 0.40652491363037363, "grad_norm": 4.691710472106934, "learning_rate": 3.7892610231988313e-06, "loss": 0.9178, "step": 5707 }, { "epoch": 0.4065961463119279, "grad_norm": 2.8446550369262695, "learning_rate": 3.786182945818211e-06, "loss": 0.5734, "step": 5708 }, { "epoch": 0.4066673789934822, "grad_norm": 2.895655632019043, "learning_rate": 3.7831058271655707e-06, "loss": 0.4076, "step": 5709 }, { "epoch": 0.4067386116750365, "grad_norm": 2.9800350666046143, "learning_rate": 3.7800296677156844e-06, "loss": 0.5477, "step": 5710 }, { "epoch": 0.4068098443565908, "grad_norm": 5.302313327789307, "learning_rate": 3.7769544679431624e-06, "loss": 0.676, "step": 5711 }, { "epoch": 0.4068810770381451, "grad_norm": 3.2127163410186768, "learning_rate": 3.773880228322482e-06, "loss": 0.4672, "step": 5712 }, { "epoch": 0.4069523097196994, "grad_norm": 2.145594596862793, "learning_rate": 3.7708069493279687e-06, "loss": 0.2894, "step": 5713 }, { "epoch": 0.4070235424012537, "grad_norm": 3.660222291946411, "learning_rate": 3.7677346314337913e-06, "loss": 0.462, "step": 5714 }, { "epoch": 0.407094775082808, "grad_norm": 3.74989914894104, "learning_rate": 3.7646632751139844e-06, "loss": 0.943, "step": 5715 }, { "epoch": 0.4071660077643623, "grad_norm": 2.8418540954589844, "learning_rate": 3.7615928808424184e-06, "loss": 0.5426, "step": 5716 }, { "epoch": 0.4072372404459166, "grad_norm": 2.2776432037353516, "learning_rate": 3.7585234490928313e-06, "loss": 0.3791, "step": 5717 }, { "epoch": 0.40730847312747087, "grad_norm": 2.9694786071777344, "learning_rate": 3.7554549803387984e-06, "loss": 0.6099, "step": 5718 }, { "epoch": 0.40737970580902516, "grad_norm": 3.7176294326782227, "learning_rate": 3.7523874750537593e-06, "loss": 0.5608, "step": 5719 }, { "epoch": 0.40745093849057945, "grad_norm": 2.1857492923736572, "learning_rate": 3.7493209337109904e-06, "loss": 0.3549, "step": 5720 }, { "epoch": 0.4075221711721338, "grad_norm": 2.4857680797576904, "learning_rate": 3.7462553567836324e-06, "loss": 0.2466, "step": 5721 }, { "epoch": 0.4075934038536881, "grad_norm": 1.8188310861587524, "learning_rate": 3.743190744744675e-06, "loss": 0.2449, "step": 5722 }, { "epoch": 0.4076646365352424, "grad_norm": 2.9187815189361572, "learning_rate": 3.740127098066949e-06, "loss": 0.3741, "step": 5723 }, { "epoch": 0.40773586921679666, "grad_norm": 2.459627866744995, "learning_rate": 3.7370644172231485e-06, "loss": 0.3682, "step": 5724 }, { "epoch": 0.40780710189835095, "grad_norm": 2.6050169467926025, "learning_rate": 3.734002702685816e-06, "loss": 0.5526, "step": 5725 }, { "epoch": 0.40787833457990524, "grad_norm": 6.309970378875732, "learning_rate": 3.730941954927335e-06, "loss": 0.577, "step": 5726 }, { "epoch": 0.40794956726145953, "grad_norm": 3.64062762260437, "learning_rate": 3.7278821744199524e-06, "loss": 0.5523, "step": 5727 }, { "epoch": 0.4080207999430139, "grad_norm": 3.1729862689971924, "learning_rate": 3.7248233616357633e-06, "loss": 0.5882, "step": 5728 }, { "epoch": 0.40809203262456817, "grad_norm": 2.5477304458618164, "learning_rate": 3.7217655170467035e-06, "loss": 0.4195, "step": 5729 }, { "epoch": 0.40816326530612246, "grad_norm": 3.1363909244537354, "learning_rate": 3.7187086411245723e-06, "loss": 0.4425, "step": 5730 }, { "epoch": 0.40823449798767675, "grad_norm": 3.6493606567382812, "learning_rate": 3.715652734341015e-06, "loss": 0.7686, "step": 5731 }, { "epoch": 0.40830573066923104, "grad_norm": 3.1102566719055176, "learning_rate": 3.7125977971675264e-06, "loss": 0.3404, "step": 5732 }, { "epoch": 0.4083769633507853, "grad_norm": 3.133927345275879, "learning_rate": 3.709543830075445e-06, "loss": 0.3367, "step": 5733 }, { "epoch": 0.4084481960323396, "grad_norm": 2.6895968914031982, "learning_rate": 3.7064908335359716e-06, "loss": 0.7036, "step": 5734 }, { "epoch": 0.40851942871389396, "grad_norm": 3.32208251953125, "learning_rate": 3.7034388080201557e-06, "loss": 0.2798, "step": 5735 }, { "epoch": 0.40859066139544825, "grad_norm": 2.8323776721954346, "learning_rate": 3.7003877539988866e-06, "loss": 0.4226, "step": 5736 }, { "epoch": 0.40866189407700254, "grad_norm": 3.6139140129089355, "learning_rate": 3.6973376719429134e-06, "loss": 0.1382, "step": 5737 }, { "epoch": 0.40873312675855683, "grad_norm": 5.190925121307373, "learning_rate": 3.6942885623228353e-06, "loss": 0.4783, "step": 5738 }, { "epoch": 0.4088043594401111, "grad_norm": 3.7236456871032715, "learning_rate": 3.691240425609093e-06, "loss": 0.9872, "step": 5739 }, { "epoch": 0.4088755921216654, "grad_norm": 2.975264072418213, "learning_rate": 3.6881932622719853e-06, "loss": 0.554, "step": 5740 }, { "epoch": 0.4089468248032197, "grad_norm": 2.64538311958313, "learning_rate": 3.6851470727816617e-06, "loss": 0.5256, "step": 5741 }, { "epoch": 0.409018057484774, "grad_norm": 2.177189826965332, "learning_rate": 3.6821018576081114e-06, "loss": 0.3922, "step": 5742 }, { "epoch": 0.40908929016632833, "grad_norm": 2.672985792160034, "learning_rate": 3.679057617221181e-06, "loss": 0.1679, "step": 5743 }, { "epoch": 0.4091605228478826, "grad_norm": 2.1918694972991943, "learning_rate": 3.6760143520905724e-06, "loss": 0.4394, "step": 5744 }, { "epoch": 0.4092317555294369, "grad_norm": 3.7113778591156006, "learning_rate": 3.6729720626858213e-06, "loss": 0.6048, "step": 5745 }, { "epoch": 0.4093029882109912, "grad_norm": 4.328812599182129, "learning_rate": 3.669930749476327e-06, "loss": 0.6039, "step": 5746 }, { "epoch": 0.4093742208925455, "grad_norm": 4.09529447555542, "learning_rate": 3.666890412931332e-06, "loss": 0.4966, "step": 5747 }, { "epoch": 0.4094454535740998, "grad_norm": 3.480304718017578, "learning_rate": 3.6638510535199245e-06, "loss": 0.5117, "step": 5748 }, { "epoch": 0.40951668625565407, "grad_norm": 2.4372963905334473, "learning_rate": 3.660812671711049e-06, "loss": 0.4272, "step": 5749 }, { "epoch": 0.4095879189372084, "grad_norm": 3.395240068435669, "learning_rate": 3.6577752679735023e-06, "loss": 0.5598, "step": 5750 }, { "epoch": 0.4096591516187627, "grad_norm": 2.992455244064331, "learning_rate": 3.6547388427759144e-06, "loss": 0.6243, "step": 5751 }, { "epoch": 0.409730384300317, "grad_norm": 2.1866371631622314, "learning_rate": 3.651703396586781e-06, "loss": 0.2496, "step": 5752 }, { "epoch": 0.4098016169818713, "grad_norm": 2.5140860080718994, "learning_rate": 3.6486689298744406e-06, "loss": 0.376, "step": 5753 }, { "epoch": 0.4098728496634256, "grad_norm": 3.548611879348755, "learning_rate": 3.645635443107076e-06, "loss": 0.459, "step": 5754 }, { "epoch": 0.40994408234497987, "grad_norm": 2.3242385387420654, "learning_rate": 3.642602936752724e-06, "loss": 0.438, "step": 5755 }, { "epoch": 0.41001531502653416, "grad_norm": 2.6350109577178955, "learning_rate": 3.6395714112792744e-06, "loss": 0.3674, "step": 5756 }, { "epoch": 0.41008654770808844, "grad_norm": 2.766688585281372, "learning_rate": 3.6365408671544534e-06, "loss": 0.5747, "step": 5757 }, { "epoch": 0.4101577803896428, "grad_norm": 3.2196035385131836, "learning_rate": 3.633511304845845e-06, "loss": 0.4137, "step": 5758 }, { "epoch": 0.4102290130711971, "grad_norm": 2.8769912719726562, "learning_rate": 3.630482724820884e-06, "loss": 0.3082, "step": 5759 }, { "epoch": 0.41030024575275137, "grad_norm": 2.9503118991851807, "learning_rate": 3.627455127546842e-06, "loss": 0.5596, "step": 5760 }, { "epoch": 0.41037147843430566, "grad_norm": 3.2712960243225098, "learning_rate": 3.6244285134908517e-06, "loss": 0.4346, "step": 5761 }, { "epoch": 0.41044271111585995, "grad_norm": 2.507336378097534, "learning_rate": 3.6214028831198833e-06, "loss": 0.394, "step": 5762 }, { "epoch": 0.41051394379741424, "grad_norm": 3.6601672172546387, "learning_rate": 3.618378236900767e-06, "loss": 0.8257, "step": 5763 }, { "epoch": 0.4105851764789685, "grad_norm": 3.8679351806640625, "learning_rate": 3.6153545753001663e-06, "loss": 0.4487, "step": 5764 }, { "epoch": 0.4106564091605229, "grad_norm": 2.196439743041992, "learning_rate": 3.612331898784609e-06, "loss": 0.2857, "step": 5765 }, { "epoch": 0.41072764184207716, "grad_norm": 4.112130165100098, "learning_rate": 3.6093102078204566e-06, "loss": 0.7589, "step": 5766 }, { "epoch": 0.41079887452363145, "grad_norm": 1.563562035560608, "learning_rate": 3.6062895028739287e-06, "loss": 0.1439, "step": 5767 }, { "epoch": 0.41087010720518574, "grad_norm": 3.957127332687378, "learning_rate": 3.6032697844110896e-06, "loss": 0.6235, "step": 5768 }, { "epoch": 0.41094133988674003, "grad_norm": 3.1094839572906494, "learning_rate": 3.6002510528978473e-06, "loss": 0.5537, "step": 5769 }, { "epoch": 0.4110125725682943, "grad_norm": 4.865985870361328, "learning_rate": 3.5972333087999622e-06, "loss": 0.2522, "step": 5770 }, { "epoch": 0.4110838052498486, "grad_norm": 2.6582376956939697, "learning_rate": 3.594216552583045e-06, "loss": 0.4576, "step": 5771 }, { "epoch": 0.4111550379314029, "grad_norm": 3.1502950191497803, "learning_rate": 3.591200784712543e-06, "loss": 0.3661, "step": 5772 }, { "epoch": 0.41122627061295725, "grad_norm": 4.050885200500488, "learning_rate": 3.588186005653763e-06, "loss": 0.8423, "step": 5773 }, { "epoch": 0.41129750329451154, "grad_norm": 2.9523370265960693, "learning_rate": 3.5851722158718537e-06, "loss": 0.5806, "step": 5774 }, { "epoch": 0.4113687359760658, "grad_norm": 12.979750633239746, "learning_rate": 3.582159415831814e-06, "loss": 0.1394, "step": 5775 }, { "epoch": 0.4114399686576201, "grad_norm": 5.2561469078063965, "learning_rate": 3.5791476059984866e-06, "loss": 0.6055, "step": 5776 }, { "epoch": 0.4115112013391744, "grad_norm": 3.129910945892334, "learning_rate": 3.576136786836557e-06, "loss": 0.1439, "step": 5777 }, { "epoch": 0.4115824340207287, "grad_norm": 4.608836650848389, "learning_rate": 3.5731269588105723e-06, "loss": 0.7348, "step": 5778 }, { "epoch": 0.411653666702283, "grad_norm": 5.528323650360107, "learning_rate": 3.57011812238491e-06, "loss": 0.6113, "step": 5779 }, { "epoch": 0.41172489938383733, "grad_norm": 4.0324177742004395, "learning_rate": 3.5671102780238066e-06, "loss": 0.3194, "step": 5780 }, { "epoch": 0.4117961320653916, "grad_norm": 3.425563335418701, "learning_rate": 3.5641034261913454e-06, "loss": 0.5847, "step": 5781 }, { "epoch": 0.4118673647469459, "grad_norm": 2.5811498165130615, "learning_rate": 3.561097567351445e-06, "loss": 0.3706, "step": 5782 }, { "epoch": 0.4119385974285002, "grad_norm": 2.5910189151763916, "learning_rate": 3.5580927019678812e-06, "loss": 0.4622, "step": 5783 }, { "epoch": 0.4120098301100545, "grad_norm": 3.65690016746521, "learning_rate": 3.5550888305042785e-06, "loss": 0.7718, "step": 5784 }, { "epoch": 0.4120810627916088, "grad_norm": 2.402601480484009, "learning_rate": 3.552085953424096e-06, "loss": 0.4461, "step": 5785 }, { "epoch": 0.41215229547316307, "grad_norm": 2.2140214443206787, "learning_rate": 3.5490840711906506e-06, "loss": 0.1683, "step": 5786 }, { "epoch": 0.4122235281547174, "grad_norm": 2.390075206756592, "learning_rate": 3.546083184267105e-06, "loss": 0.4589, "step": 5787 }, { "epoch": 0.4122947608362717, "grad_norm": 6.741397380828857, "learning_rate": 3.5430832931164584e-06, "loss": 0.701, "step": 5788 }, { "epoch": 0.412365993517826, "grad_norm": 4.902034282684326, "learning_rate": 3.540084398201565e-06, "loss": 0.7662, "step": 5789 }, { "epoch": 0.4124372261993803, "grad_norm": 2.4830827713012695, "learning_rate": 3.5370864999851296e-06, "loss": 0.5846, "step": 5790 }, { "epoch": 0.41250845888093457, "grad_norm": 3.7896769046783447, "learning_rate": 3.534089598929691e-06, "loss": 0.7019, "step": 5791 }, { "epoch": 0.41257969156248886, "grad_norm": 3.8020808696746826, "learning_rate": 3.5310936954976383e-06, "loss": 0.4768, "step": 5792 }, { "epoch": 0.41265092424404315, "grad_norm": 3.1487534046173096, "learning_rate": 3.5280987901512142e-06, "loss": 0.3684, "step": 5793 }, { "epoch": 0.41272215692559744, "grad_norm": 3.2812628746032715, "learning_rate": 3.525104883352497e-06, "loss": 0.5794, "step": 5794 }, { "epoch": 0.4127933896071518, "grad_norm": 3.4387784004211426, "learning_rate": 3.522111975563417e-06, "loss": 0.7564, "step": 5795 }, { "epoch": 0.4128646222887061, "grad_norm": 2.9661507606506348, "learning_rate": 3.519120067245754e-06, "loss": 0.6039, "step": 5796 }, { "epoch": 0.41293585497026036, "grad_norm": 2.9425723552703857, "learning_rate": 3.51612915886112e-06, "loss": 0.2652, "step": 5797 }, { "epoch": 0.41300708765181465, "grad_norm": 13.936104774475098, "learning_rate": 3.513139250870986e-06, "loss": 0.6388, "step": 5798 }, { "epoch": 0.41307832033336894, "grad_norm": 4.141279220581055, "learning_rate": 3.5101503437366678e-06, "loss": 0.4624, "step": 5799 }, { "epoch": 0.41314955301492323, "grad_norm": 3.0625016689300537, "learning_rate": 3.507162437919316e-06, "loss": 0.2907, "step": 5800 }, { "epoch": 0.4132207856964775, "grad_norm": 3.0329840183258057, "learning_rate": 3.5041755338799354e-06, "loss": 0.3531, "step": 5801 }, { "epoch": 0.41329201837803187, "grad_norm": 2.0237717628479004, "learning_rate": 3.5011896320793802e-06, "loss": 0.2994, "step": 5802 }, { "epoch": 0.41336325105958616, "grad_norm": 2.257962703704834, "learning_rate": 3.4982047329783362e-06, "loss": 0.2695, "step": 5803 }, { "epoch": 0.41343448374114045, "grad_norm": 2.380916118621826, "learning_rate": 3.4952208370373475e-06, "loss": 0.2794, "step": 5804 }, { "epoch": 0.41350571642269474, "grad_norm": 3.8829567432403564, "learning_rate": 3.4922379447167997e-06, "loss": 0.5413, "step": 5805 }, { "epoch": 0.413576949104249, "grad_norm": 2.566236734390259, "learning_rate": 3.4892560564769164e-06, "loss": 0.2842, "step": 5806 }, { "epoch": 0.4136481817858033, "grad_norm": 2.37231183052063, "learning_rate": 3.48627517277778e-06, "loss": 0.4377, "step": 5807 }, { "epoch": 0.4137194144673576, "grad_norm": 5.109662055969238, "learning_rate": 3.4832952940793054e-06, "loss": 0.3858, "step": 5808 }, { "epoch": 0.4137906471489119, "grad_norm": 3.0737504959106445, "learning_rate": 3.4803164208412543e-06, "loss": 0.4978, "step": 5809 }, { "epoch": 0.41386187983046624, "grad_norm": 3.6257059574127197, "learning_rate": 3.4773385535232408e-06, "loss": 0.5509, "step": 5810 }, { "epoch": 0.41393311251202053, "grad_norm": 1.8700114488601685, "learning_rate": 3.4743616925847167e-06, "loss": 0.3415, "step": 5811 }, { "epoch": 0.4140043451935748, "grad_norm": 2.794699192047119, "learning_rate": 3.4713858384849873e-06, "loss": 0.2078, "step": 5812 }, { "epoch": 0.4140755778751291, "grad_norm": 2.37082839012146, "learning_rate": 3.4684109916831866e-06, "loss": 0.3149, "step": 5813 }, { "epoch": 0.4141468105566834, "grad_norm": 2.566359281539917, "learning_rate": 3.465437152638308e-06, "loss": 0.3591, "step": 5814 }, { "epoch": 0.4142180432382377, "grad_norm": 3.55096435546875, "learning_rate": 3.462464321809188e-06, "loss": 0.3233, "step": 5815 }, { "epoch": 0.414289275919792, "grad_norm": 5.469696998596191, "learning_rate": 3.4594924996544952e-06, "loss": 1.0963, "step": 5816 }, { "epoch": 0.4143605086013463, "grad_norm": 2.8506247997283936, "learning_rate": 3.4565216866327556e-06, "loss": 0.5401, "step": 5817 }, { "epoch": 0.4144317412829006, "grad_norm": 3.807304620742798, "learning_rate": 3.4535518832023383e-06, "loss": 0.508, "step": 5818 }, { "epoch": 0.4145029739644549, "grad_norm": 2.1295957565307617, "learning_rate": 3.4505830898214466e-06, "loss": 0.4138, "step": 5819 }, { "epoch": 0.4145742066460092, "grad_norm": 3.1757092475891113, "learning_rate": 3.447615306948142e-06, "loss": 0.4594, "step": 5820 }, { "epoch": 0.4146454393275635, "grad_norm": 1.917020320892334, "learning_rate": 3.4446485350403145e-06, "loss": 0.2694, "step": 5821 }, { "epoch": 0.41471667200911777, "grad_norm": 2.268717050552368, "learning_rate": 3.441682774555716e-06, "loss": 0.1525, "step": 5822 }, { "epoch": 0.41478790469067206, "grad_norm": 4.086832523345947, "learning_rate": 3.438718025951924e-06, "loss": 0.7008, "step": 5823 }, { "epoch": 0.41485913737222635, "grad_norm": 2.729065179824829, "learning_rate": 3.435754289686375e-06, "loss": 0.4417, "step": 5824 }, { "epoch": 0.4149303700537807, "grad_norm": 4.267711162567139, "learning_rate": 3.432791566216338e-06, "loss": 0.5255, "step": 5825 }, { "epoch": 0.415001602735335, "grad_norm": 2.435910701751709, "learning_rate": 3.429829855998933e-06, "loss": 0.5032, "step": 5826 }, { "epoch": 0.4150728354168893, "grad_norm": 4.265196323394775, "learning_rate": 3.426869159491124e-06, "loss": 0.4877, "step": 5827 }, { "epoch": 0.41514406809844356, "grad_norm": 5.158475875854492, "learning_rate": 3.4239094771497104e-06, "loss": 0.9795, "step": 5828 }, { "epoch": 0.41521530077999785, "grad_norm": 3.392801284790039, "learning_rate": 3.420950809431345e-06, "loss": 0.3986, "step": 5829 }, { "epoch": 0.41528653346155214, "grad_norm": 1.617279291152954, "learning_rate": 3.4179931567925216e-06, "loss": 0.2131, "step": 5830 }, { "epoch": 0.41535776614310643, "grad_norm": 3.186418056488037, "learning_rate": 3.4150365196895686e-06, "loss": 0.5289, "step": 5831 }, { "epoch": 0.4154289988246608, "grad_norm": 3.2884984016418457, "learning_rate": 3.412080898578669e-06, "loss": 0.3289, "step": 5832 }, { "epoch": 0.41550023150621507, "grad_norm": 3.683457612991333, "learning_rate": 3.4091262939158477e-06, "loss": 0.5345, "step": 5833 }, { "epoch": 0.41557146418776936, "grad_norm": 2.60968279838562, "learning_rate": 3.406172706156963e-06, "loss": 0.5502, "step": 5834 }, { "epoch": 0.41564269686932365, "grad_norm": 2.9558746814727783, "learning_rate": 3.4032201357577287e-06, "loss": 0.3939, "step": 5835 }, { "epoch": 0.41571392955087794, "grad_norm": 4.7877349853515625, "learning_rate": 3.4002685831736917e-06, "loss": 0.5207, "step": 5836 }, { "epoch": 0.4157851622324322, "grad_norm": 5.138524532318115, "learning_rate": 3.3973180488602508e-06, "loss": 0.641, "step": 5837 }, { "epoch": 0.4158563949139865, "grad_norm": 2.8362767696380615, "learning_rate": 3.3943685332726385e-06, "loss": 0.2536, "step": 5838 }, { "epoch": 0.41592762759554086, "grad_norm": 3.4082226753234863, "learning_rate": 3.391420036865939e-06, "loss": 0.4654, "step": 5839 }, { "epoch": 0.41599886027709515, "grad_norm": 3.0881187915802, "learning_rate": 3.3884725600950687e-06, "loss": 0.5405, "step": 5840 }, { "epoch": 0.41607009295864944, "grad_norm": 2.4080569744110107, "learning_rate": 3.385526103414798e-06, "loss": 0.4796, "step": 5841 }, { "epoch": 0.41614132564020373, "grad_norm": 2.028332233428955, "learning_rate": 3.3825806672797355e-06, "loss": 0.2537, "step": 5842 }, { "epoch": 0.416212558321758, "grad_norm": 4.885587215423584, "learning_rate": 3.379636252144328e-06, "loss": 0.305, "step": 5843 }, { "epoch": 0.4162837910033123, "grad_norm": 5.135656356811523, "learning_rate": 3.37669285846287e-06, "loss": 0.493, "step": 5844 }, { "epoch": 0.4163550236848666, "grad_norm": 2.813612937927246, "learning_rate": 3.3737504866895e-06, "loss": 0.3538, "step": 5845 }, { "epoch": 0.4164262563664209, "grad_norm": 3.7911124229431152, "learning_rate": 3.3708091372781893e-06, "loss": 0.4591, "step": 5846 }, { "epoch": 0.41649748904797523, "grad_norm": 4.865674018859863, "learning_rate": 3.3678688106827616e-06, "loss": 0.4581, "step": 5847 }, { "epoch": 0.4165687217295295, "grad_norm": 4.061943054199219, "learning_rate": 3.364929507356881e-06, "loss": 0.7078, "step": 5848 }, { "epoch": 0.4166399544110838, "grad_norm": 2.3554224967956543, "learning_rate": 3.361991227754048e-06, "loss": 0.5069, "step": 5849 }, { "epoch": 0.4167111870926381, "grad_norm": 3.3657209873199463, "learning_rate": 3.3590539723276083e-06, "loss": 0.5589, "step": 5850 }, { "epoch": 0.4167824197741924, "grad_norm": 2.6310629844665527, "learning_rate": 3.3561177415307566e-06, "loss": 0.4534, "step": 5851 }, { "epoch": 0.4168536524557467, "grad_norm": 2.306216239929199, "learning_rate": 3.3531825358165184e-06, "loss": 0.3168, "step": 5852 }, { "epoch": 0.41692488513730097, "grad_norm": 2.92730450630188, "learning_rate": 3.3502483556377628e-06, "loss": 0.4118, "step": 5853 }, { "epoch": 0.4169961178188553, "grad_norm": 4.864584922790527, "learning_rate": 3.3473152014472064e-06, "loss": 0.7867, "step": 5854 }, { "epoch": 0.4170673505004096, "grad_norm": 2.7155354022979736, "learning_rate": 3.344383073697408e-06, "loss": 0.6435, "step": 5855 }, { "epoch": 0.4171385831819639, "grad_norm": 2.141765594482422, "learning_rate": 3.341451972840759e-06, "loss": 0.4811, "step": 5856 }, { "epoch": 0.4172098158635182, "grad_norm": 3.431333065032959, "learning_rate": 3.338521899329501e-06, "loss": 0.3207, "step": 5857 }, { "epoch": 0.4172810485450725, "grad_norm": 2.286010503768921, "learning_rate": 3.335592853615717e-06, "loss": 0.2496, "step": 5858 }, { "epoch": 0.41735228122662676, "grad_norm": 2.4122233390808105, "learning_rate": 3.3326648361513227e-06, "loss": 0.4134, "step": 5859 }, { "epoch": 0.41742351390818105, "grad_norm": 5.179589748382568, "learning_rate": 3.3297378473880836e-06, "loss": 0.7679, "step": 5860 }, { "epoch": 0.41749474658973534, "grad_norm": 1.9929968118667603, "learning_rate": 3.326811887777607e-06, "loss": 0.2046, "step": 5861 }, { "epoch": 0.4175659792712897, "grad_norm": 3.1979784965515137, "learning_rate": 3.323886957771333e-06, "loss": 0.5664, "step": 5862 }, { "epoch": 0.417637211952844, "grad_norm": 3.760260581970215, "learning_rate": 3.32096305782055e-06, "loss": 0.6218, "step": 5863 }, { "epoch": 0.41770844463439827, "grad_norm": 4.5561347007751465, "learning_rate": 3.31804018837639e-06, "loss": 0.7181, "step": 5864 }, { "epoch": 0.41777967731595256, "grad_norm": 1.7691640853881836, "learning_rate": 3.3151183498898155e-06, "loss": 0.2184, "step": 5865 }, { "epoch": 0.41785090999750685, "grad_norm": 1.9864453077316284, "learning_rate": 3.3121975428116414e-06, "loss": 0.3076, "step": 5866 }, { "epoch": 0.41792214267906114, "grad_norm": 2.5290815830230713, "learning_rate": 3.3092777675925145e-06, "loss": 0.2906, "step": 5867 }, { "epoch": 0.4179933753606154, "grad_norm": 4.411247253417969, "learning_rate": 3.306359024682925e-06, "loss": 0.5499, "step": 5868 }, { "epoch": 0.41806460804216977, "grad_norm": 1.8862189054489136, "learning_rate": 3.3034413145332065e-06, "loss": 0.3449, "step": 5869 }, { "epoch": 0.41813584072372406, "grad_norm": 3.9999032020568848, "learning_rate": 3.300524637593535e-06, "loss": 0.3174, "step": 5870 }, { "epoch": 0.41820707340527835, "grad_norm": 3.961629867553711, "learning_rate": 3.297608994313918e-06, "loss": 0.3854, "step": 5871 }, { "epoch": 0.41827830608683264, "grad_norm": 3.772949695587158, "learning_rate": 3.29469438514421e-06, "loss": 0.4975, "step": 5872 }, { "epoch": 0.41834953876838693, "grad_norm": 3.010425329208374, "learning_rate": 3.291780810534112e-06, "loss": 0.2708, "step": 5873 }, { "epoch": 0.4184207714499412, "grad_norm": 3.0631988048553467, "learning_rate": 3.288868270933151e-06, "loss": 0.6659, "step": 5874 }, { "epoch": 0.4184920041314955, "grad_norm": 2.2251992225646973, "learning_rate": 3.285956766790703e-06, "loss": 0.4072, "step": 5875 }, { "epoch": 0.41856323681304985, "grad_norm": 3.3241264820098877, "learning_rate": 3.2830462985559884e-06, "loss": 0.219, "step": 5876 }, { "epoch": 0.41863446949460414, "grad_norm": 2.9354608058929443, "learning_rate": 3.2801368666780552e-06, "loss": 0.2575, "step": 5877 }, { "epoch": 0.41870570217615843, "grad_norm": 1.9910895824432373, "learning_rate": 3.2772284716058032e-06, "loss": 0.2675, "step": 5878 }, { "epoch": 0.4187769348577127, "grad_norm": 1.636130928993225, "learning_rate": 3.2743211137879693e-06, "loss": 0.0424, "step": 5879 }, { "epoch": 0.418848167539267, "grad_norm": 2.0796895027160645, "learning_rate": 3.2714147936731234e-06, "loss": 0.2632, "step": 5880 }, { "epoch": 0.4189194002208213, "grad_norm": 2.628232717514038, "learning_rate": 3.268509511709688e-06, "loss": 0.7924, "step": 5881 }, { "epoch": 0.4189906329023756, "grad_norm": 4.657520771026611, "learning_rate": 3.2656052683459094e-06, "loss": 0.2415, "step": 5882 }, { "epoch": 0.4190618655839299, "grad_norm": 4.081112384796143, "learning_rate": 3.26270206402989e-06, "loss": 0.6442, "step": 5883 }, { "epoch": 0.4191330982654842, "grad_norm": 3.2694077491760254, "learning_rate": 3.259799899209559e-06, "loss": 0.5557, "step": 5884 }, { "epoch": 0.4192043309470385, "grad_norm": 2.520056962966919, "learning_rate": 3.2568987743326964e-06, "loss": 0.4884, "step": 5885 }, { "epoch": 0.4192755636285928, "grad_norm": 3.5012922286987305, "learning_rate": 3.2539986898469088e-06, "loss": 0.3143, "step": 5886 }, { "epoch": 0.4193467963101471, "grad_norm": 3.1474220752716064, "learning_rate": 3.2510996461996523e-06, "loss": 0.6138, "step": 5887 }, { "epoch": 0.4194180289917014, "grad_norm": 2.5489768981933594, "learning_rate": 3.2482016438382215e-06, "loss": 0.4725, "step": 5888 }, { "epoch": 0.4194892616732557, "grad_norm": 4.584717750549316, "learning_rate": 3.245304683209749e-06, "loss": 1.1145, "step": 5889 }, { "epoch": 0.41956049435480997, "grad_norm": 5.1821393966674805, "learning_rate": 3.242408764761201e-06, "loss": 0.4749, "step": 5890 }, { "epoch": 0.4196317270363643, "grad_norm": 4.121595859527588, "learning_rate": 3.2395138889393918e-06, "loss": 0.5538, "step": 5891 }, { "epoch": 0.4197029597179186, "grad_norm": 3.1257870197296143, "learning_rate": 3.236620056190972e-06, "loss": 0.6205, "step": 5892 }, { "epoch": 0.4197741923994729, "grad_norm": 1.9533977508544922, "learning_rate": 3.233727266962425e-06, "loss": 0.2622, "step": 5893 }, { "epoch": 0.4198454250810272, "grad_norm": 2.10304856300354, "learning_rate": 3.230835521700083e-06, "loss": 0.3131, "step": 5894 }, { "epoch": 0.41991665776258147, "grad_norm": 2.149953603744507, "learning_rate": 3.2279448208501128e-06, "loss": 0.2375, "step": 5895 }, { "epoch": 0.41998789044413576, "grad_norm": 3.154350757598877, "learning_rate": 3.2250551648585194e-06, "loss": 0.6617, "step": 5896 }, { "epoch": 0.42005912312569005, "grad_norm": 2.19071364402771, "learning_rate": 3.222166554171141e-06, "loss": 0.2506, "step": 5897 }, { "epoch": 0.42013035580724434, "grad_norm": 4.702168941497803, "learning_rate": 3.2192789892336694e-06, "loss": 0.8829, "step": 5898 }, { "epoch": 0.4202015884887987, "grad_norm": 3.302645206451416, "learning_rate": 3.216392470491618e-06, "loss": 0.5653, "step": 5899 }, { "epoch": 0.420272821170353, "grad_norm": 1.5432441234588623, "learning_rate": 3.213506998390351e-06, "loss": 0.2184, "step": 5900 }, { "epoch": 0.42034405385190726, "grad_norm": 2.796114444732666, "learning_rate": 3.2106225733750707e-06, "loss": 0.3605, "step": 5901 }, { "epoch": 0.42041528653346155, "grad_norm": 3.417865753173828, "learning_rate": 3.2077391958908065e-06, "loss": 0.4013, "step": 5902 }, { "epoch": 0.42048651921501584, "grad_norm": 2.990722179412842, "learning_rate": 3.2048568663824375e-06, "loss": 0.4846, "step": 5903 }, { "epoch": 0.42055775189657013, "grad_norm": 3.2043960094451904, "learning_rate": 3.20197558529468e-06, "loss": 0.2488, "step": 5904 }, { "epoch": 0.4206289845781244, "grad_norm": 4.050618648529053, "learning_rate": 3.199095353072081e-06, "loss": 0.3506, "step": 5905 }, { "epoch": 0.42070021725967877, "grad_norm": 2.680234909057617, "learning_rate": 3.1962161701590342e-06, "loss": 0.0901, "step": 5906 }, { "epoch": 0.42077144994123306, "grad_norm": 2.487370729446411, "learning_rate": 3.193338036999769e-06, "loss": 0.327, "step": 5907 }, { "epoch": 0.42084268262278735, "grad_norm": 3.835371971130371, "learning_rate": 3.1904609540383467e-06, "loss": 0.7373, "step": 5908 }, { "epoch": 0.42091391530434163, "grad_norm": 2.894747257232666, "learning_rate": 3.187584921718675e-06, "loss": 0.3001, "step": 5909 }, { "epoch": 0.4209851479858959, "grad_norm": 2.4739139080047607, "learning_rate": 3.1847099404844984e-06, "loss": 0.5406, "step": 5910 }, { "epoch": 0.4210563806674502, "grad_norm": 2.3715295791625977, "learning_rate": 3.1818360107793933e-06, "loss": 0.5186, "step": 5911 }, { "epoch": 0.4211276133490045, "grad_norm": 1.9227428436279297, "learning_rate": 3.178963133046776e-06, "loss": 0.3828, "step": 5912 }, { "epoch": 0.4211988460305588, "grad_norm": 2.7741031646728516, "learning_rate": 3.1760913077299072e-06, "loss": 0.4108, "step": 5913 }, { "epoch": 0.42127007871211314, "grad_norm": 1.2976555824279785, "learning_rate": 3.173220535271874e-06, "loss": 0.0528, "step": 5914 }, { "epoch": 0.42134131139366743, "grad_norm": 3.78861927986145, "learning_rate": 3.1703508161156095e-06, "loss": 0.7211, "step": 5915 }, { "epoch": 0.4214125440752217, "grad_norm": 2.691472291946411, "learning_rate": 3.1674821507038857e-06, "loss": 0.2489, "step": 5916 }, { "epoch": 0.421483776756776, "grad_norm": 3.0975186824798584, "learning_rate": 3.1646145394793017e-06, "loss": 0.9288, "step": 5917 }, { "epoch": 0.4215550094383303, "grad_norm": 1.560837745666504, "learning_rate": 3.1617479828843023e-06, "loss": 0.1666, "step": 5918 }, { "epoch": 0.4216262421198846, "grad_norm": 2.432648181915283, "learning_rate": 3.158882481361173e-06, "loss": 0.4876, "step": 5919 }, { "epoch": 0.4216974748014389, "grad_norm": 1.124995470046997, "learning_rate": 3.156018035352024e-06, "loss": 0.0921, "step": 5920 }, { "epoch": 0.4217687074829932, "grad_norm": 3.6031908988952637, "learning_rate": 3.1531546452988127e-06, "loss": 0.6577, "step": 5921 }, { "epoch": 0.4218399401645475, "grad_norm": 2.2725067138671875, "learning_rate": 3.1502923116433324e-06, "loss": 0.3768, "step": 5922 }, { "epoch": 0.4219111728461018, "grad_norm": 3.239386558532715, "learning_rate": 3.1474310348272084e-06, "loss": 0.2541, "step": 5923 }, { "epoch": 0.4219824055276561, "grad_norm": 3.0167651176452637, "learning_rate": 3.1445708152919075e-06, "loss": 0.6099, "step": 5924 }, { "epoch": 0.4220536382092104, "grad_norm": 3.846212863922119, "learning_rate": 3.141711653478736e-06, "loss": 0.6529, "step": 5925 }, { "epoch": 0.42212487089076467, "grad_norm": 2.429652452468872, "learning_rate": 3.1388535498288265e-06, "loss": 0.5353, "step": 5926 }, { "epoch": 0.42219610357231896, "grad_norm": 2.791120767593384, "learning_rate": 3.135996504783161e-06, "loss": 0.3939, "step": 5927 }, { "epoch": 0.4222673362538733, "grad_norm": 2.7103259563446045, "learning_rate": 3.1331405187825457e-06, "loss": 0.4359, "step": 5928 }, { "epoch": 0.4223385689354276, "grad_norm": 2.8195302486419678, "learning_rate": 3.130285592267638e-06, "loss": 0.2435, "step": 5929 }, { "epoch": 0.4224098016169819, "grad_norm": 2.0412495136260986, "learning_rate": 3.1274317256789144e-06, "loss": 0.338, "step": 5930 }, { "epoch": 0.4224810342985362, "grad_norm": 2.803955316543579, "learning_rate": 3.1245789194567024e-06, "loss": 0.5007, "step": 5931 }, { "epoch": 0.42255226698009046, "grad_norm": 3.375281810760498, "learning_rate": 3.1217271740411626e-06, "loss": 0.694, "step": 5932 }, { "epoch": 0.42262349966164475, "grad_norm": 1.6051783561706543, "learning_rate": 3.1188764898722843e-06, "loss": 0.0655, "step": 5933 }, { "epoch": 0.42269473234319904, "grad_norm": 3.445279121398926, "learning_rate": 3.116026867389903e-06, "loss": 0.6707, "step": 5934 }, { "epoch": 0.42276596502475333, "grad_norm": 2.5768442153930664, "learning_rate": 3.1131783070336872e-06, "loss": 0.5422, "step": 5935 }, { "epoch": 0.4228371977063077, "grad_norm": 3.241521120071411, "learning_rate": 3.110330809243134e-06, "loss": 0.6491, "step": 5936 }, { "epoch": 0.42290843038786197, "grad_norm": 2.8832991123199463, "learning_rate": 3.1074843744575877e-06, "loss": 0.5434, "step": 5937 }, { "epoch": 0.42297966306941626, "grad_norm": 3.267740249633789, "learning_rate": 3.1046390031162265e-06, "loss": 0.6108, "step": 5938 }, { "epoch": 0.42305089575097055, "grad_norm": 4.659745693206787, "learning_rate": 3.1017946956580557e-06, "loss": 0.5786, "step": 5939 }, { "epoch": 0.42312212843252484, "grad_norm": 7.030442714691162, "learning_rate": 3.098951452521929e-06, "loss": 0.6909, "step": 5940 }, { "epoch": 0.4231933611140791, "grad_norm": 2.0534324645996094, "learning_rate": 3.0961092741465226e-06, "loss": 0.3873, "step": 5941 }, { "epoch": 0.4232645937956334, "grad_norm": 2.7586405277252197, "learning_rate": 3.093268160970362e-06, "loss": 0.6527, "step": 5942 }, { "epoch": 0.42333582647718776, "grad_norm": 1.5705060958862305, "learning_rate": 3.090428113431795e-06, "loss": 0.2773, "step": 5943 }, { "epoch": 0.42340705915874205, "grad_norm": 2.448012113571167, "learning_rate": 3.0875891319690188e-06, "loss": 0.3926, "step": 5944 }, { "epoch": 0.42347829184029634, "grad_norm": 3.0236589908599854, "learning_rate": 3.0847512170200523e-06, "loss": 0.774, "step": 5945 }, { "epoch": 0.42354952452185063, "grad_norm": 3.1207871437072754, "learning_rate": 3.0819143690227602e-06, "loss": 0.4732, "step": 5946 }, { "epoch": 0.4236207572034049, "grad_norm": 3.1402413845062256, "learning_rate": 3.0790785884148413e-06, "loss": 0.4967, "step": 5947 }, { "epoch": 0.4236919898849592, "grad_norm": 2.5310606956481934, "learning_rate": 3.0762438756338207e-06, "loss": 0.5383, "step": 5948 }, { "epoch": 0.4237632225665135, "grad_norm": 4.0721540451049805, "learning_rate": 3.0734102311170697e-06, "loss": 0.7533, "step": 5949 }, { "epoch": 0.4238344552480678, "grad_norm": 4.205306053161621, "learning_rate": 3.070577655301793e-06, "loss": 0.2963, "step": 5950 }, { "epoch": 0.42390568792962213, "grad_norm": 8.106493949890137, "learning_rate": 3.0677461486250226e-06, "loss": 0.4833, "step": 5951 }, { "epoch": 0.4239769206111764, "grad_norm": 4.6053147315979, "learning_rate": 3.0649157115236315e-06, "loss": 0.5493, "step": 5952 }, { "epoch": 0.4240481532927307, "grad_norm": 1.61564302444458, "learning_rate": 3.062086344434333e-06, "loss": 0.1508, "step": 5953 }, { "epoch": 0.424119385974285, "grad_norm": 2.9903087615966797, "learning_rate": 3.0592580477936606e-06, "loss": 0.3922, "step": 5954 }, { "epoch": 0.4241906186558393, "grad_norm": 2.4572722911834717, "learning_rate": 3.0564308220380003e-06, "loss": 0.3473, "step": 5955 }, { "epoch": 0.4242618513373936, "grad_norm": 1.631314754486084, "learning_rate": 3.0536046676035546e-06, "loss": 0.1918, "step": 5956 }, { "epoch": 0.42433308401894787, "grad_norm": 3.8796427249908447, "learning_rate": 3.050779584926379e-06, "loss": 0.6341, "step": 5957 }, { "epoch": 0.4244043167005022, "grad_norm": 2.8013503551483154, "learning_rate": 3.0479555744423463e-06, "loss": 0.4643, "step": 5958 }, { "epoch": 0.4244755493820565, "grad_norm": 3.2017855644226074, "learning_rate": 3.045132636587179e-06, "loss": 0.5603, "step": 5959 }, { "epoch": 0.4245467820636108, "grad_norm": 2.810589075088501, "learning_rate": 3.042310771796423e-06, "loss": 0.4, "step": 5960 }, { "epoch": 0.4246180147451651, "grad_norm": 7.750619411468506, "learning_rate": 3.0394899805054635e-06, "loss": 0.5066, "step": 5961 }, { "epoch": 0.4246892474267194, "grad_norm": 4.326363563537598, "learning_rate": 3.0366702631495237e-06, "loss": 0.6674, "step": 5962 }, { "epoch": 0.42476048010827366, "grad_norm": 4.1295013427734375, "learning_rate": 3.0338516201636516e-06, "loss": 0.4938, "step": 5963 }, { "epoch": 0.42483171278982795, "grad_norm": 4.214746952056885, "learning_rate": 3.031034051982735e-06, "loss": 0.5811, "step": 5964 }, { "epoch": 0.42490294547138224, "grad_norm": 2.6721715927124023, "learning_rate": 3.0282175590415e-06, "loss": 0.7005, "step": 5965 }, { "epoch": 0.4249741781529366, "grad_norm": 6.079374313354492, "learning_rate": 3.0254021417745027e-06, "loss": 0.2287, "step": 5966 }, { "epoch": 0.4250454108344909, "grad_norm": 2.40307879447937, "learning_rate": 3.022587800616127e-06, "loss": 0.291, "step": 5967 }, { "epoch": 0.42511664351604517, "grad_norm": 3.1208317279815674, "learning_rate": 3.0197745360006004e-06, "loss": 0.9415, "step": 5968 }, { "epoch": 0.42518787619759946, "grad_norm": 5.767766952514648, "learning_rate": 3.0169623483619824e-06, "loss": 0.9464, "step": 5969 }, { "epoch": 0.42525910887915375, "grad_norm": 3.6249587535858154, "learning_rate": 3.014151238134161e-06, "loss": 0.4517, "step": 5970 }, { "epoch": 0.42533034156070804, "grad_norm": 2.8416125774383545, "learning_rate": 3.011341205750866e-06, "loss": 0.469, "step": 5971 }, { "epoch": 0.4254015742422623, "grad_norm": 2.5742063522338867, "learning_rate": 3.0085322516456537e-06, "loss": 0.4274, "step": 5972 }, { "epoch": 0.42547280692381667, "grad_norm": 4.46505069732666, "learning_rate": 3.0057243762519137e-06, "loss": 0.4069, "step": 5973 }, { "epoch": 0.42554403960537096, "grad_norm": 4.611293792724609, "learning_rate": 3.002917580002875e-06, "loss": 0.6683, "step": 5974 }, { "epoch": 0.42561527228692525, "grad_norm": 2.0561046600341797, "learning_rate": 3.0001118633316018e-06, "loss": 0.4518, "step": 5975 }, { "epoch": 0.42568650496847954, "grad_norm": 3.842665195465088, "learning_rate": 2.997307226670979e-06, "loss": 0.3233, "step": 5976 }, { "epoch": 0.42575773765003383, "grad_norm": 4.0659918785095215, "learning_rate": 2.9945036704537376e-06, "loss": 0.3663, "step": 5977 }, { "epoch": 0.4258289703315881, "grad_norm": 2.13629412651062, "learning_rate": 2.991701195112441e-06, "loss": 0.5216, "step": 5978 }, { "epoch": 0.4259002030131424, "grad_norm": 3.8066139221191406, "learning_rate": 2.9888998010794745e-06, "loss": 0.3611, "step": 5979 }, { "epoch": 0.42597143569469675, "grad_norm": 2.027794122695923, "learning_rate": 2.986099488787069e-06, "loss": 0.2481, "step": 5980 }, { "epoch": 0.42604266837625104, "grad_norm": 3.7776620388031006, "learning_rate": 2.9833002586672855e-06, "loss": 0.6287, "step": 5981 }, { "epoch": 0.42611390105780533, "grad_norm": 8.443127632141113, "learning_rate": 2.9805021111520105e-06, "loss": 0.1336, "step": 5982 }, { "epoch": 0.4261851337393596, "grad_norm": 3.8639707565307617, "learning_rate": 2.977705046672974e-06, "loss": 0.6639, "step": 5983 }, { "epoch": 0.4262563664209139, "grad_norm": 2.877847909927368, "learning_rate": 2.9749090656617363e-06, "loss": 0.6011, "step": 5984 }, { "epoch": 0.4263275991024682, "grad_norm": 5.080527305603027, "learning_rate": 2.9721141685496825e-06, "loss": 0.6421, "step": 5985 }, { "epoch": 0.4263988317840225, "grad_norm": 4.840533256530762, "learning_rate": 2.9693203557680415e-06, "loss": 0.4655, "step": 5986 }, { "epoch": 0.4264700644655768, "grad_norm": 2.7496745586395264, "learning_rate": 2.9665276277478672e-06, "loss": 0.314, "step": 5987 }, { "epoch": 0.4265412971471311, "grad_norm": 3.2551255226135254, "learning_rate": 2.9637359849200474e-06, "loss": 0.5522, "step": 5988 }, { "epoch": 0.4266125298286854, "grad_norm": 2.3603501319885254, "learning_rate": 2.960945427715305e-06, "loss": 0.2658, "step": 5989 }, { "epoch": 0.4266837625102397, "grad_norm": 2.6150097846984863, "learning_rate": 2.9581559565641983e-06, "loss": 0.5978, "step": 5990 }, { "epoch": 0.426754995191794, "grad_norm": 3.2339701652526855, "learning_rate": 2.9553675718971065e-06, "loss": 0.2404, "step": 5991 }, { "epoch": 0.4268262278733483, "grad_norm": 3.8630542755126953, "learning_rate": 2.9525802741442532e-06, "loss": 0.8692, "step": 5992 }, { "epoch": 0.4268974605549026, "grad_norm": 3.0960123538970947, "learning_rate": 2.9497940637356924e-06, "loss": 0.6562, "step": 5993 }, { "epoch": 0.42696869323645686, "grad_norm": 2.2975823879241943, "learning_rate": 2.9470089411013014e-06, "loss": 0.5617, "step": 5994 }, { "epoch": 0.4270399259180112, "grad_norm": 2.9853670597076416, "learning_rate": 2.9442249066707993e-06, "loss": 0.637, "step": 5995 }, { "epoch": 0.4271111585995655, "grad_norm": 1.947108268737793, "learning_rate": 2.9414419608737366e-06, "loss": 0.2983, "step": 5996 }, { "epoch": 0.4271823912811198, "grad_norm": 3.07067608833313, "learning_rate": 2.938660104139487e-06, "loss": 0.2958, "step": 5997 }, { "epoch": 0.4272536239626741, "grad_norm": 2.6872668266296387, "learning_rate": 2.935879336897265e-06, "loss": 0.5462, "step": 5998 }, { "epoch": 0.42732485664422837, "grad_norm": 2.9805033206939697, "learning_rate": 2.9330996595761184e-06, "loss": 0.3433, "step": 5999 }, { "epoch": 0.42739608932578266, "grad_norm": 6.275449275970459, "learning_rate": 2.930321072604917e-06, "loss": 0.2172, "step": 6000 }, { "epoch": 0.42746732200733695, "grad_norm": 2.743175506591797, "learning_rate": 2.927543576412373e-06, "loss": 0.5137, "step": 6001 }, { "epoch": 0.42753855468889124, "grad_norm": 16.41115951538086, "learning_rate": 2.9247671714270198e-06, "loss": 0.2632, "step": 6002 }, { "epoch": 0.4276097873704456, "grad_norm": 4.434884548187256, "learning_rate": 2.921991858077234e-06, "loss": 0.8324, "step": 6003 }, { "epoch": 0.42768102005199987, "grad_norm": 3.4875993728637695, "learning_rate": 2.919217636791213e-06, "loss": 0.6165, "step": 6004 }, { "epoch": 0.42775225273355416, "grad_norm": 4.481368541717529, "learning_rate": 2.916444507996993e-06, "loss": 0.5302, "step": 6005 }, { "epoch": 0.42782348541510845, "grad_norm": 5.535424709320068, "learning_rate": 2.9136724721224406e-06, "loss": 0.5229, "step": 6006 }, { "epoch": 0.42789471809666274, "grad_norm": 1.3435949087142944, "learning_rate": 2.910901529595248e-06, "loss": 0.1502, "step": 6007 }, { "epoch": 0.42796595077821703, "grad_norm": 3.9474449157714844, "learning_rate": 2.908131680842946e-06, "loss": 0.9388, "step": 6008 }, { "epoch": 0.4280371834597713, "grad_norm": 3.1376893520355225, "learning_rate": 2.9053629262928974e-06, "loss": 0.4994, "step": 6009 }, { "epoch": 0.42810841614132567, "grad_norm": 2.210268259048462, "learning_rate": 2.9025952663722845e-06, "loss": 0.2447, "step": 6010 }, { "epoch": 0.42817964882287995, "grad_norm": 2.2321877479553223, "learning_rate": 2.899828701508133e-06, "loss": 0.3228, "step": 6011 }, { "epoch": 0.42825088150443424, "grad_norm": 6.62808895111084, "learning_rate": 2.8970632321272983e-06, "loss": 0.5703, "step": 6012 }, { "epoch": 0.42832211418598853, "grad_norm": 4.9213104248046875, "learning_rate": 2.894298858656458e-06, "loss": 0.2121, "step": 6013 }, { "epoch": 0.4283933468675428, "grad_norm": 2.304107427597046, "learning_rate": 2.8915355815221293e-06, "loss": 0.1811, "step": 6014 }, { "epoch": 0.4284645795490971, "grad_norm": 2.7848689556121826, "learning_rate": 2.88877340115066e-06, "loss": 0.6056, "step": 6015 }, { "epoch": 0.4285358122306514, "grad_norm": 3.4945685863494873, "learning_rate": 2.8860123179682244e-06, "loss": 0.3703, "step": 6016 }, { "epoch": 0.4286070449122057, "grad_norm": 3.9626829624176025, "learning_rate": 2.883252332400823e-06, "loss": 0.335, "step": 6017 }, { "epoch": 0.42867827759376004, "grad_norm": 4.496028423309326, "learning_rate": 2.8804934448743037e-06, "loss": 0.5457, "step": 6018 }, { "epoch": 0.4287495102753143, "grad_norm": 2.8165223598480225, "learning_rate": 2.8777356558143255e-06, "loss": 0.2145, "step": 6019 }, { "epoch": 0.4288207429568686, "grad_norm": 2.85192608833313, "learning_rate": 2.87497896564639e-06, "loss": 0.6519, "step": 6020 }, { "epoch": 0.4288919756384229, "grad_norm": 2.535290241241455, "learning_rate": 2.8722233747958295e-06, "loss": 0.4469, "step": 6021 }, { "epoch": 0.4289632083199772, "grad_norm": 2.2683210372924805, "learning_rate": 2.869468883687798e-06, "loss": 0.4465, "step": 6022 }, { "epoch": 0.4290344410015315, "grad_norm": 2.456676721572876, "learning_rate": 2.8667154927472875e-06, "loss": 0.5097, "step": 6023 }, { "epoch": 0.4291056736830858, "grad_norm": 2.68650484085083, "learning_rate": 2.8639632023991204e-06, "loss": 0.198, "step": 6024 }, { "epoch": 0.4291769063646401, "grad_norm": 2.3490242958068848, "learning_rate": 2.861212013067941e-06, "loss": 0.5629, "step": 6025 }, { "epoch": 0.4292481390461944, "grad_norm": 2.0896289348602295, "learning_rate": 2.858461925178233e-06, "loss": 0.2407, "step": 6026 }, { "epoch": 0.4293193717277487, "grad_norm": 4.267187595367432, "learning_rate": 2.855712939154309e-06, "loss": 0.4436, "step": 6027 }, { "epoch": 0.429390604409303, "grad_norm": 20.263036727905273, "learning_rate": 2.852965055420305e-06, "loss": 0.2572, "step": 6028 }, { "epoch": 0.4294618370908573, "grad_norm": 2.179213285446167, "learning_rate": 2.8502182744001903e-06, "loss": 0.3819, "step": 6029 }, { "epoch": 0.42953306977241157, "grad_norm": 6.564464569091797, "learning_rate": 2.8474725965177717e-06, "loss": 0.639, "step": 6030 }, { "epoch": 0.42960430245396586, "grad_norm": 3.020772933959961, "learning_rate": 2.8447280221966754e-06, "loss": 0.6663, "step": 6031 }, { "epoch": 0.4296755351355202, "grad_norm": 2.2091822624206543, "learning_rate": 2.841984551860356e-06, "loss": 0.2487, "step": 6032 }, { "epoch": 0.4297467678170745, "grad_norm": 1.5691063404083252, "learning_rate": 2.8392421859321105e-06, "loss": 0.1527, "step": 6033 }, { "epoch": 0.4298180004986288, "grad_norm": 2.915527105331421, "learning_rate": 2.8365009248350515e-06, "loss": 0.3078, "step": 6034 }, { "epoch": 0.4298892331801831, "grad_norm": 1.8336684703826904, "learning_rate": 2.83376076899213e-06, "loss": 0.2098, "step": 6035 }, { "epoch": 0.42996046586173736, "grad_norm": 3.1131937503814697, "learning_rate": 2.831021718826126e-06, "loss": 0.7711, "step": 6036 }, { "epoch": 0.43003169854329165, "grad_norm": 3.981036424636841, "learning_rate": 2.8282837747596428e-06, "loss": 0.2669, "step": 6037 }, { "epoch": 0.43010293122484594, "grad_norm": 3.143646717071533, "learning_rate": 2.8255469372151178e-06, "loss": 0.4907, "step": 6038 }, { "epoch": 0.43017416390640023, "grad_norm": 2.679574966430664, "learning_rate": 2.8228112066148173e-06, "loss": 0.5169, "step": 6039 }, { "epoch": 0.4302453965879546, "grad_norm": 2.714155912399292, "learning_rate": 2.8200765833808406e-06, "loss": 0.258, "step": 6040 }, { "epoch": 0.43031662926950887, "grad_norm": 3.3589987754821777, "learning_rate": 2.8173430679351055e-06, "loss": 0.1674, "step": 6041 }, { "epoch": 0.43038786195106316, "grad_norm": 3.6199777126312256, "learning_rate": 2.8146106606993674e-06, "loss": 0.377, "step": 6042 }, { "epoch": 0.43045909463261744, "grad_norm": 2.7418365478515625, "learning_rate": 2.8118793620952125e-06, "loss": 0.2001, "step": 6043 }, { "epoch": 0.43053032731417173, "grad_norm": 3.1965506076812744, "learning_rate": 2.8091491725440454e-06, "loss": 0.5511, "step": 6044 }, { "epoch": 0.430601559995726, "grad_norm": 3.3078057765960693, "learning_rate": 2.8064200924671137e-06, "loss": 0.2572, "step": 6045 }, { "epoch": 0.4306727926772803, "grad_norm": 2.8705358505249023, "learning_rate": 2.8036921222854776e-06, "loss": 0.3823, "step": 6046 }, { "epoch": 0.43074402535883466, "grad_norm": 2.9755191802978516, "learning_rate": 2.8009652624200436e-06, "loss": 0.5147, "step": 6047 }, { "epoch": 0.43081525804038895, "grad_norm": 3.286656379699707, "learning_rate": 2.7982395132915295e-06, "loss": 0.3065, "step": 6048 }, { "epoch": 0.43088649072194324, "grad_norm": 1.8481969833374023, "learning_rate": 2.7955148753204995e-06, "loss": 0.2318, "step": 6049 }, { "epoch": 0.43095772340349753, "grad_norm": 3.235415458679199, "learning_rate": 2.7927913489273284e-06, "loss": 0.5142, "step": 6050 }, { "epoch": 0.4310289560850518, "grad_norm": 2.9287281036376953, "learning_rate": 2.790068934532232e-06, "loss": 0.1151, "step": 6051 }, { "epoch": 0.4311001887666061, "grad_norm": 2.3082165718078613, "learning_rate": 2.7873476325552538e-06, "loss": 0.3626, "step": 6052 }, { "epoch": 0.4311714214481604, "grad_norm": 4.116361618041992, "learning_rate": 2.784627443416258e-06, "loss": 0.5812, "step": 6053 }, { "epoch": 0.4312426541297147, "grad_norm": 3.1144957542419434, "learning_rate": 2.7819083675349436e-06, "loss": 0.4005, "step": 6054 }, { "epoch": 0.43131388681126903, "grad_norm": 3.3604133129119873, "learning_rate": 2.779190405330838e-06, "loss": 0.4077, "step": 6055 }, { "epoch": 0.4313851194928233, "grad_norm": 2.6656253337860107, "learning_rate": 2.7764735572232916e-06, "loss": 0.5571, "step": 6056 }, { "epoch": 0.4314563521743776, "grad_norm": 4.352930068969727, "learning_rate": 2.773757823631487e-06, "loss": 0.71, "step": 6057 }, { "epoch": 0.4315275848559319, "grad_norm": 3.0335240364074707, "learning_rate": 2.7710432049744363e-06, "loss": 0.5544, "step": 6058 }, { "epoch": 0.4315988175374862, "grad_norm": 3.964775800704956, "learning_rate": 2.768329701670972e-06, "loss": 0.5323, "step": 6059 }, { "epoch": 0.4316700502190405, "grad_norm": 3.691495895385742, "learning_rate": 2.765617314139767e-06, "loss": 0.4363, "step": 6060 }, { "epoch": 0.43174128290059477, "grad_norm": 2.1823525428771973, "learning_rate": 2.7629060427993072e-06, "loss": 0.4808, "step": 6061 }, { "epoch": 0.4318125155821491, "grad_norm": 2.003344774246216, "learning_rate": 2.7601958880679204e-06, "loss": 0.424, "step": 6062 }, { "epoch": 0.4318837482637034, "grad_norm": 3.534123659133911, "learning_rate": 2.7574868503637496e-06, "loss": 0.7701, "step": 6063 }, { "epoch": 0.4319549809452577, "grad_norm": 3.0005640983581543, "learning_rate": 2.754778930104778e-06, "loss": 0.4568, "step": 6064 }, { "epoch": 0.432026213626812, "grad_norm": 2.007812023162842, "learning_rate": 2.7520721277088023e-06, "loss": 0.2958, "step": 6065 }, { "epoch": 0.4320974463083663, "grad_norm": 2.962883949279785, "learning_rate": 2.7493664435934574e-06, "loss": 0.3147, "step": 6066 }, { "epoch": 0.43216867898992056, "grad_norm": 2.7531275749206543, "learning_rate": 2.7466618781762077e-06, "loss": 0.3228, "step": 6067 }, { "epoch": 0.43223991167147485, "grad_norm": 3.29634165763855, "learning_rate": 2.743958431874332e-06, "loss": 0.6531, "step": 6068 }, { "epoch": 0.4323111443530292, "grad_norm": 2.935725688934326, "learning_rate": 2.7412561051049468e-06, "loss": 0.5842, "step": 6069 }, { "epoch": 0.4323823770345835, "grad_norm": 3.219648599624634, "learning_rate": 2.7385548982849974e-06, "loss": 0.7566, "step": 6070 }, { "epoch": 0.4324536097161378, "grad_norm": 3.5358026027679443, "learning_rate": 2.7358548118312455e-06, "loss": 0.5976, "step": 6071 }, { "epoch": 0.43252484239769207, "grad_norm": 2.8701465129852295, "learning_rate": 2.7331558461602905e-06, "loss": 0.2917, "step": 6072 }, { "epoch": 0.43259607507924636, "grad_norm": 3.6476309299468994, "learning_rate": 2.7304580016885564e-06, "loss": 0.3795, "step": 6073 }, { "epoch": 0.43266730776080065, "grad_norm": 3.0663974285125732, "learning_rate": 2.727761278832288e-06, "loss": 0.541, "step": 6074 }, { "epoch": 0.43273854044235494, "grad_norm": 2.7777254581451416, "learning_rate": 2.725065678007568e-06, "loss": 0.4607, "step": 6075 }, { "epoch": 0.4328097731239092, "grad_norm": 3.635348081588745, "learning_rate": 2.7223711996302935e-06, "loss": 0.6006, "step": 6076 }, { "epoch": 0.43288100580546357, "grad_norm": 2.6531476974487305, "learning_rate": 2.719677844116202e-06, "loss": 0.4674, "step": 6077 }, { "epoch": 0.43295223848701786, "grad_norm": 3.2027394771575928, "learning_rate": 2.7169856118808414e-06, "loss": 0.6344, "step": 6078 }, { "epoch": 0.43302347116857215, "grad_norm": 3.6097819805145264, "learning_rate": 2.714294503339602e-06, "loss": 0.6015, "step": 6079 }, { "epoch": 0.43309470385012644, "grad_norm": 2.8800911903381348, "learning_rate": 2.7116045189076946e-06, "loss": 0.7907, "step": 6080 }, { "epoch": 0.43316593653168073, "grad_norm": 3.9535090923309326, "learning_rate": 2.708915659000151e-06, "loss": 0.7398, "step": 6081 }, { "epoch": 0.433237169213235, "grad_norm": 2.504897356033325, "learning_rate": 2.706227924031838e-06, "loss": 0.5224, "step": 6082 }, { "epoch": 0.4333084018947893, "grad_norm": 1.9366168975830078, "learning_rate": 2.7035413144174472e-06, "loss": 0.2736, "step": 6083 }, { "epoch": 0.43337963457634365, "grad_norm": 3.590627908706665, "learning_rate": 2.7008558305714905e-06, "loss": 0.204, "step": 6084 }, { "epoch": 0.43345086725789794, "grad_norm": 3.0440661907196045, "learning_rate": 2.698171472908312e-06, "loss": 0.466, "step": 6085 }, { "epoch": 0.43352209993945223, "grad_norm": 5.294552803039551, "learning_rate": 2.6954882418420836e-06, "loss": 0.6696, "step": 6086 }, { "epoch": 0.4335933326210065, "grad_norm": 5.678048610687256, "learning_rate": 2.6928061377867954e-06, "loss": 0.6522, "step": 6087 }, { "epoch": 0.4336645653025608, "grad_norm": 3.580132484436035, "learning_rate": 2.6901251611562695e-06, "loss": 0.2967, "step": 6088 }, { "epoch": 0.4337357979841151, "grad_norm": 6.5536298751831055, "learning_rate": 2.6874453123641585e-06, "loss": 0.533, "step": 6089 }, { "epoch": 0.4338070306656694, "grad_norm": 2.6375815868377686, "learning_rate": 2.6847665918239273e-06, "loss": 0.4722, "step": 6090 }, { "epoch": 0.4338782633472237, "grad_norm": 2.282749891281128, "learning_rate": 2.682088999948882e-06, "loss": 0.4727, "step": 6091 }, { "epoch": 0.433949496028778, "grad_norm": 2.2045931816101074, "learning_rate": 2.679412537152143e-06, "loss": 0.3003, "step": 6092 }, { "epoch": 0.4340207287103323, "grad_norm": 2.5593273639678955, "learning_rate": 2.67673720384666e-06, "loss": 0.6758, "step": 6093 }, { "epoch": 0.4340919613918866, "grad_norm": 3.4127109050750732, "learning_rate": 2.6740630004452115e-06, "loss": 0.7395, "step": 6094 }, { "epoch": 0.4341631940734409, "grad_norm": 2.5266528129577637, "learning_rate": 2.6713899273604027e-06, "loss": 0.2208, "step": 6095 }, { "epoch": 0.4342344267549952, "grad_norm": 4.041965484619141, "learning_rate": 2.668717985004654e-06, "loss": 0.7606, "step": 6096 }, { "epoch": 0.4343056594365495, "grad_norm": 2.888692855834961, "learning_rate": 2.6660471737902228e-06, "loss": 0.7565, "step": 6097 }, { "epoch": 0.43437689211810376, "grad_norm": 4.8775858879089355, "learning_rate": 2.6633774941291912e-06, "loss": 0.7512, "step": 6098 }, { "epoch": 0.4344481247996581, "grad_norm": 3.2126784324645996, "learning_rate": 2.6607089464334567e-06, "loss": 0.7371, "step": 6099 }, { "epoch": 0.4345193574812124, "grad_norm": 2.9081318378448486, "learning_rate": 2.658041531114751e-06, "loss": 0.3071, "step": 6100 }, { "epoch": 0.4345905901627667, "grad_norm": 1.5211617946624756, "learning_rate": 2.6553752485846327e-06, "loss": 0.1943, "step": 6101 }, { "epoch": 0.434661822844321, "grad_norm": 3.076270580291748, "learning_rate": 2.652710099254476e-06, "loss": 0.5118, "step": 6102 }, { "epoch": 0.43473305552587527, "grad_norm": 3.941903591156006, "learning_rate": 2.650046083535489e-06, "loss": 0.4606, "step": 6103 }, { "epoch": 0.43480428820742956, "grad_norm": 3.8666813373565674, "learning_rate": 2.6473832018387034e-06, "loss": 0.2514, "step": 6104 }, { "epoch": 0.43487552088898385, "grad_norm": 2.6958167552948, "learning_rate": 2.64472145457497e-06, "loss": 0.5169, "step": 6105 }, { "epoch": 0.43494675357053814, "grad_norm": 3.686375379562378, "learning_rate": 2.642060842154974e-06, "loss": 0.2367, "step": 6106 }, { "epoch": 0.4350179862520925, "grad_norm": 3.763493061065674, "learning_rate": 2.639401364989218e-06, "loss": 0.5553, "step": 6107 }, { "epoch": 0.43508921893364677, "grad_norm": 2.842517375946045, "learning_rate": 2.6367430234880286e-06, "loss": 0.2479, "step": 6108 }, { "epoch": 0.43516045161520106, "grad_norm": 6.214869022369385, "learning_rate": 2.634085818061565e-06, "loss": 0.8702, "step": 6109 }, { "epoch": 0.43523168429675535, "grad_norm": 2.159609079360962, "learning_rate": 2.631429749119807e-06, "loss": 0.1654, "step": 6110 }, { "epoch": 0.43530291697830964, "grad_norm": 4.108116626739502, "learning_rate": 2.6287748170725545e-06, "loss": 0.8272, "step": 6111 }, { "epoch": 0.43537414965986393, "grad_norm": 4.207865238189697, "learning_rate": 2.62612102232944e-06, "loss": 0.1953, "step": 6112 }, { "epoch": 0.4354453823414182, "grad_norm": 3.1795010566711426, "learning_rate": 2.6234683652999173e-06, "loss": 0.9058, "step": 6113 }, { "epoch": 0.43551661502297256, "grad_norm": 3.267263412475586, "learning_rate": 2.6208168463932595e-06, "loss": 0.5501, "step": 6114 }, { "epoch": 0.43558784770452685, "grad_norm": 3.8285255432128906, "learning_rate": 2.618166466018571e-06, "loss": 0.3212, "step": 6115 }, { "epoch": 0.43565908038608114, "grad_norm": 2.491241455078125, "learning_rate": 2.6155172245847793e-06, "loss": 0.2842, "step": 6116 }, { "epoch": 0.43573031306763543, "grad_norm": 5.0723185539245605, "learning_rate": 2.6128691225006376e-06, "loss": 0.6022, "step": 6117 }, { "epoch": 0.4358015457491897, "grad_norm": 3.5263123512268066, "learning_rate": 2.6102221601747136e-06, "loss": 0.5744, "step": 6118 }, { "epoch": 0.435872778430744, "grad_norm": 4.295783042907715, "learning_rate": 2.607576338015414e-06, "loss": 0.9513, "step": 6119 }, { "epoch": 0.4359440111122983, "grad_norm": 2.653439998626709, "learning_rate": 2.6049316564309546e-06, "loss": 0.5835, "step": 6120 }, { "epoch": 0.43601524379385265, "grad_norm": 3.5158331394195557, "learning_rate": 2.60228811582939e-06, "loss": 0.4165, "step": 6121 }, { "epoch": 0.43608647647540694, "grad_norm": 5.0115156173706055, "learning_rate": 2.599645716618584e-06, "loss": 0.6539, "step": 6122 }, { "epoch": 0.4361577091569612, "grad_norm": 4.068092346191406, "learning_rate": 2.597004459206238e-06, "loss": 0.5221, "step": 6123 }, { "epoch": 0.4362289418385155, "grad_norm": 3.8212995529174805, "learning_rate": 2.5943643439998644e-06, "loss": 0.5941, "step": 6124 }, { "epoch": 0.4363001745200698, "grad_norm": 3.0293517112731934, "learning_rate": 2.5917253714068104e-06, "loss": 0.0974, "step": 6125 }, { "epoch": 0.4363714072016241, "grad_norm": 2.8544628620147705, "learning_rate": 2.589087541834243e-06, "loss": 0.4711, "step": 6126 }, { "epoch": 0.4364426398831784, "grad_norm": 3.1250340938568115, "learning_rate": 2.5864508556891475e-06, "loss": 0.5931, "step": 6127 }, { "epoch": 0.4365138725647327, "grad_norm": 3.275975227355957, "learning_rate": 2.5838153133783405e-06, "loss": 0.3779, "step": 6128 }, { "epoch": 0.436585105246287, "grad_norm": 2.4528961181640625, "learning_rate": 2.581180915308461e-06, "loss": 0.3912, "step": 6129 }, { "epoch": 0.4366563379278413, "grad_norm": 2.3589892387390137, "learning_rate": 2.578547661885965e-06, "loss": 0.2333, "step": 6130 }, { "epoch": 0.4367275706093956, "grad_norm": 2.5266566276550293, "learning_rate": 2.5759155535171388e-06, "loss": 0.5275, "step": 6131 }, { "epoch": 0.4367988032909499, "grad_norm": 2.8523449897766113, "learning_rate": 2.5732845906080915e-06, "loss": 0.4839, "step": 6132 }, { "epoch": 0.4368700359725042, "grad_norm": 5.010465145111084, "learning_rate": 2.570654773564749e-06, "loss": 0.1182, "step": 6133 }, { "epoch": 0.43694126865405847, "grad_norm": 2.234947681427002, "learning_rate": 2.5680261027928676e-06, "loss": 0.1451, "step": 6134 }, { "epoch": 0.43701250133561276, "grad_norm": 2.678058385848999, "learning_rate": 2.565398578698026e-06, "loss": 0.2879, "step": 6135 }, { "epoch": 0.4370837340171671, "grad_norm": 2.5277462005615234, "learning_rate": 2.5627722016856237e-06, "loss": 0.5409, "step": 6136 }, { "epoch": 0.4371549666987214, "grad_norm": 3.31471586227417, "learning_rate": 2.5601469721608786e-06, "loss": 0.5628, "step": 6137 }, { "epoch": 0.4372261993802757, "grad_norm": 1.1814305782318115, "learning_rate": 2.557522890528842e-06, "loss": 0.046, "step": 6138 }, { "epoch": 0.43729743206182997, "grad_norm": 6.225907802581787, "learning_rate": 2.554899957194379e-06, "loss": 0.7667, "step": 6139 }, { "epoch": 0.43736866474338426, "grad_norm": 3.072890520095825, "learning_rate": 2.5522781725621814e-06, "loss": 0.6614, "step": 6140 }, { "epoch": 0.43743989742493855, "grad_norm": 2.457834482192993, "learning_rate": 2.549657537036769e-06, "loss": 0.2034, "step": 6141 }, { "epoch": 0.43751113010649284, "grad_norm": 1.6625484228134155, "learning_rate": 2.547038051022472e-06, "loss": 0.1172, "step": 6142 }, { "epoch": 0.43758236278804713, "grad_norm": 3.0896036624908447, "learning_rate": 2.544419714923454e-06, "loss": 0.5071, "step": 6143 }, { "epoch": 0.4376535954696015, "grad_norm": 2.988271713256836, "learning_rate": 2.5418025291436976e-06, "loss": 0.8138, "step": 6144 }, { "epoch": 0.43772482815115576, "grad_norm": 3.3909921646118164, "learning_rate": 2.539186494087005e-06, "loss": 0.3577, "step": 6145 }, { "epoch": 0.43779606083271005, "grad_norm": 3.1512370109558105, "learning_rate": 2.5365716101570036e-06, "loss": 0.2597, "step": 6146 }, { "epoch": 0.43786729351426434, "grad_norm": 4.49468994140625, "learning_rate": 2.533957877757148e-06, "loss": 0.3607, "step": 6147 }, { "epoch": 0.43793852619581863, "grad_norm": 8.536214828491211, "learning_rate": 2.5313452972907027e-06, "loss": 0.6533, "step": 6148 }, { "epoch": 0.4380097588773729, "grad_norm": 3.261131763458252, "learning_rate": 2.5287338691607664e-06, "loss": 0.4125, "step": 6149 }, { "epoch": 0.4380809915589272, "grad_norm": 2.682499408721924, "learning_rate": 2.5261235937702576e-06, "loss": 0.6573, "step": 6150 }, { "epoch": 0.43815222424048156, "grad_norm": 5.785411834716797, "learning_rate": 2.523514471521913e-06, "loss": 0.7226, "step": 6151 }, { "epoch": 0.43822345692203585, "grad_norm": 6.110199451446533, "learning_rate": 2.520906502818289e-06, "loss": 0.358, "step": 6152 }, { "epoch": 0.43829468960359014, "grad_norm": 2.092066764831543, "learning_rate": 2.518299688061772e-06, "loss": 0.4087, "step": 6153 }, { "epoch": 0.4383659222851444, "grad_norm": 2.824615478515625, "learning_rate": 2.5156940276545692e-06, "loss": 0.3101, "step": 6154 }, { "epoch": 0.4384371549666987, "grad_norm": 3.856325149536133, "learning_rate": 2.5130895219987015e-06, "loss": 0.2823, "step": 6155 }, { "epoch": 0.438508387648253, "grad_norm": 3.40096378326416, "learning_rate": 2.5104861714960207e-06, "loss": 0.5964, "step": 6156 }, { "epoch": 0.4385796203298073, "grad_norm": 2.712184190750122, "learning_rate": 2.507883976548199e-06, "loss": 0.2373, "step": 6157 }, { "epoch": 0.4386508530113616, "grad_norm": 3.893972873687744, "learning_rate": 2.5052829375567232e-06, "loss": 0.5082, "step": 6158 }, { "epoch": 0.43872208569291593, "grad_norm": 3.899653434753418, "learning_rate": 2.5026830549229097e-06, "loss": 0.5446, "step": 6159 }, { "epoch": 0.4387933183744702, "grad_norm": 2.5363168716430664, "learning_rate": 2.500084329047896e-06, "loss": 0.3891, "step": 6160 }, { "epoch": 0.4388645510560245, "grad_norm": 2.4369704723358154, "learning_rate": 2.4974867603326337e-06, "loss": 0.3958, "step": 6161 }, { "epoch": 0.4389357837375788, "grad_norm": 3.4272730350494385, "learning_rate": 2.4948903491779032e-06, "loss": 0.7153, "step": 6162 }, { "epoch": 0.4390070164191331, "grad_norm": 2.2510416507720947, "learning_rate": 2.492295095984306e-06, "loss": 0.4146, "step": 6163 }, { "epoch": 0.4390782491006874, "grad_norm": 5.216794013977051, "learning_rate": 2.4897010011522595e-06, "loss": 0.5366, "step": 6164 }, { "epoch": 0.43914948178224167, "grad_norm": 2.2957358360290527, "learning_rate": 2.48710806508201e-06, "loss": 0.3456, "step": 6165 }, { "epoch": 0.439220714463796, "grad_norm": 3.7780020236968994, "learning_rate": 2.484516288173615e-06, "loss": 0.8188, "step": 6166 }, { "epoch": 0.4392919471453503, "grad_norm": 2.52856707572937, "learning_rate": 2.4819256708269655e-06, "loss": 0.5282, "step": 6167 }, { "epoch": 0.4393631798269046, "grad_norm": 6.310977458953857, "learning_rate": 2.47933621344176e-06, "loss": 0.6108, "step": 6168 }, { "epoch": 0.4394344125084589, "grad_norm": 3.3557322025299072, "learning_rate": 2.4767479164175323e-06, "loss": 0.3608, "step": 6169 }, { "epoch": 0.43950564519001317, "grad_norm": 3.2383456230163574, "learning_rate": 2.474160780153624e-06, "loss": 0.3768, "step": 6170 }, { "epoch": 0.43957687787156746, "grad_norm": 3.45382022857666, "learning_rate": 2.471574805049206e-06, "loss": 0.3464, "step": 6171 }, { "epoch": 0.43964811055312175, "grad_norm": 2.983588218688965, "learning_rate": 2.468989991503271e-06, "loss": 0.6759, "step": 6172 }, { "epoch": 0.4397193432346761, "grad_norm": 3.8894338607788086, "learning_rate": 2.4664063399146232e-06, "loss": 0.7045, "step": 6173 }, { "epoch": 0.4397905759162304, "grad_norm": 2.5256385803222656, "learning_rate": 2.4638238506818958e-06, "loss": 0.437, "step": 6174 }, { "epoch": 0.4398618085977847, "grad_norm": 1.9415206909179688, "learning_rate": 2.4612425242035432e-06, "loss": 0.1205, "step": 6175 }, { "epoch": 0.43993304127933897, "grad_norm": 4.39207649230957, "learning_rate": 2.4586623608778324e-06, "loss": 0.25, "step": 6176 }, { "epoch": 0.44000427396089326, "grad_norm": 4.871227741241455, "learning_rate": 2.456083361102858e-06, "loss": 0.7509, "step": 6177 }, { "epoch": 0.44007550664244754, "grad_norm": 3.168877124786377, "learning_rate": 2.453505525276537e-06, "loss": 0.4299, "step": 6178 }, { "epoch": 0.44014673932400183, "grad_norm": 3.197145462036133, "learning_rate": 2.450928853796597e-06, "loss": 0.4528, "step": 6179 }, { "epoch": 0.4402179720055561, "grad_norm": 2.8962011337280273, "learning_rate": 2.4483533470605967e-06, "loss": 0.5495, "step": 6180 }, { "epoch": 0.44028920468711047, "grad_norm": 3.461099624633789, "learning_rate": 2.4457790054659057e-06, "loss": 0.826, "step": 6181 }, { "epoch": 0.44036043736866476, "grad_norm": 2.2828562259674072, "learning_rate": 2.443205829409724e-06, "loss": 0.342, "step": 6182 }, { "epoch": 0.44043167005021905, "grad_norm": 2.1691813468933105, "learning_rate": 2.440633819289059e-06, "loss": 0.4694, "step": 6183 }, { "epoch": 0.44050290273177334, "grad_norm": 2.2506353855133057, "learning_rate": 2.4380629755007524e-06, "loss": 0.434, "step": 6184 }, { "epoch": 0.4405741354133276, "grad_norm": 3.343641996383667, "learning_rate": 2.4354932984414527e-06, "loss": 0.3582, "step": 6185 }, { "epoch": 0.4406453680948819, "grad_norm": 2.8464035987854004, "learning_rate": 2.432924788507638e-06, "loss": 0.6157, "step": 6186 }, { "epoch": 0.4407166007764362, "grad_norm": 3.0613951683044434, "learning_rate": 2.430357446095606e-06, "loss": 0.4461, "step": 6187 }, { "epoch": 0.44078783345799055, "grad_norm": 3.1407933235168457, "learning_rate": 2.427791271601465e-06, "loss": 0.6607, "step": 6188 }, { "epoch": 0.44085906613954484, "grad_norm": 2.873729705810547, "learning_rate": 2.425226265421151e-06, "loss": 0.507, "step": 6189 }, { "epoch": 0.44093029882109913, "grad_norm": 2.2697553634643555, "learning_rate": 2.422662427950423e-06, "loss": 0.439, "step": 6190 }, { "epoch": 0.4410015315026534, "grad_norm": 3.9892892837524414, "learning_rate": 2.4200997595848474e-06, "loss": 0.5307, "step": 6191 }, { "epoch": 0.4410727641842077, "grad_norm": 2.3537118434906006, "learning_rate": 2.4175382607198217e-06, "loss": 0.4376, "step": 6192 }, { "epoch": 0.441143996865762, "grad_norm": 6.134088039398193, "learning_rate": 2.4149779317505574e-06, "loss": 0.3865, "step": 6193 }, { "epoch": 0.4412152295473163, "grad_norm": 3.2272777557373047, "learning_rate": 2.4124187730720916e-06, "loss": 0.6942, "step": 6194 }, { "epoch": 0.4412864622288706, "grad_norm": 2.7457399368286133, "learning_rate": 2.4098607850792712e-06, "loss": 0.6072, "step": 6195 }, { "epoch": 0.4413576949104249, "grad_norm": 2.641655921936035, "learning_rate": 2.4073039681667653e-06, "loss": 0.2694, "step": 6196 }, { "epoch": 0.4414289275919792, "grad_norm": 2.471022367477417, "learning_rate": 2.4047483227290715e-06, "loss": 0.3345, "step": 6197 }, { "epoch": 0.4415001602735335, "grad_norm": 4.0616455078125, "learning_rate": 2.4021938491604912e-06, "loss": 0.5416, "step": 6198 }, { "epoch": 0.4415713929550878, "grad_norm": 3.260835647583008, "learning_rate": 2.3996405478551586e-06, "loss": 1.0051, "step": 6199 }, { "epoch": 0.4416426256366421, "grad_norm": 3.243333101272583, "learning_rate": 2.3970884192070232e-06, "loss": 0.4257, "step": 6200 }, { "epoch": 0.4417138583181964, "grad_norm": 2.7623629570007324, "learning_rate": 2.3945374636098474e-06, "loss": 0.4699, "step": 6201 }, { "epoch": 0.44178509099975066, "grad_norm": 2.1510539054870605, "learning_rate": 2.3919876814572197e-06, "loss": 0.1586, "step": 6202 }, { "epoch": 0.441856323681305, "grad_norm": 3.504040479660034, "learning_rate": 2.3894390731425486e-06, "loss": 0.583, "step": 6203 }, { "epoch": 0.4419275563628593, "grad_norm": 2.916908025741577, "learning_rate": 2.3868916390590524e-06, "loss": 0.5221, "step": 6204 }, { "epoch": 0.4419987890444136, "grad_norm": 2.5921177864074707, "learning_rate": 2.384345379599775e-06, "loss": 0.2169, "step": 6205 }, { "epoch": 0.4420700217259679, "grad_norm": 2.600261926651001, "learning_rate": 2.3818002951575834e-06, "loss": 0.4771, "step": 6206 }, { "epoch": 0.44214125440752217, "grad_norm": 3.1345274448394775, "learning_rate": 2.3792563861251506e-06, "loss": 0.2869, "step": 6207 }, { "epoch": 0.44221248708907646, "grad_norm": 3.2309529781341553, "learning_rate": 2.3767136528949797e-06, "loss": 0.4254, "step": 6208 }, { "epoch": 0.44228371977063075, "grad_norm": 3.671243906021118, "learning_rate": 2.3741720958593896e-06, "loss": 0.5932, "step": 6209 }, { "epoch": 0.44235495245218504, "grad_norm": 2.608288288116455, "learning_rate": 2.371631715410512e-06, "loss": 0.2469, "step": 6210 }, { "epoch": 0.4424261851337394, "grad_norm": 4.417611598968506, "learning_rate": 2.3690925119403065e-06, "loss": 0.6393, "step": 6211 }, { "epoch": 0.44249741781529367, "grad_norm": 2.622506618499756, "learning_rate": 2.3665544858405433e-06, "loss": 0.6337, "step": 6212 }, { "epoch": 0.44256865049684796, "grad_norm": 4.113739967346191, "learning_rate": 2.3640176375028103e-06, "loss": 0.4844, "step": 6213 }, { "epoch": 0.44263988317840225, "grad_norm": 2.8248307704925537, "learning_rate": 2.361481967318521e-06, "loss": 0.5008, "step": 6214 }, { "epoch": 0.44271111585995654, "grad_norm": 3.944446086883545, "learning_rate": 2.3589474756789045e-06, "loss": 0.6232, "step": 6215 }, { "epoch": 0.44278234854151083, "grad_norm": 2.2632646560668945, "learning_rate": 2.3564141629750026e-06, "loss": 0.2187, "step": 6216 }, { "epoch": 0.4428535812230651, "grad_norm": 2.599900960922241, "learning_rate": 2.3538820295976816e-06, "loss": 0.5645, "step": 6217 }, { "epoch": 0.44292481390461946, "grad_norm": 3.003070592880249, "learning_rate": 2.3513510759376266e-06, "loss": 0.5023, "step": 6218 }, { "epoch": 0.44299604658617375, "grad_norm": 2.885899066925049, "learning_rate": 2.3488213023853325e-06, "loss": 0.1337, "step": 6219 }, { "epoch": 0.44306727926772804, "grad_norm": 2.409169912338257, "learning_rate": 2.3462927093311183e-06, "loss": 0.4065, "step": 6220 }, { "epoch": 0.44313851194928233, "grad_norm": 4.791357517242432, "learning_rate": 2.343765297165125e-06, "loss": 0.4647, "step": 6221 }, { "epoch": 0.4432097446308366, "grad_norm": 3.766603708267212, "learning_rate": 2.341239066277299e-06, "loss": 1.1164, "step": 6222 }, { "epoch": 0.4432809773123909, "grad_norm": 1.7544862031936646, "learning_rate": 2.3387140170574154e-06, "loss": 0.1066, "step": 6223 }, { "epoch": 0.4433522099939452, "grad_norm": 3.5059821605682373, "learning_rate": 2.3361901498950656e-06, "loss": 0.542, "step": 6224 }, { "epoch": 0.44342344267549955, "grad_norm": 3.9856226444244385, "learning_rate": 2.333667465179651e-06, "loss": 0.3445, "step": 6225 }, { "epoch": 0.44349467535705384, "grad_norm": 2.767155170440674, "learning_rate": 2.3311459633004006e-06, "loss": 0.6592, "step": 6226 }, { "epoch": 0.4435659080386081, "grad_norm": 8.07968521118164, "learning_rate": 2.328625644646355e-06, "loss": 0.9091, "step": 6227 }, { "epoch": 0.4436371407201624, "grad_norm": 4.427659511566162, "learning_rate": 2.3261065096063696e-06, "loss": 0.3718, "step": 6228 }, { "epoch": 0.4437083734017167, "grad_norm": 3.9593653678894043, "learning_rate": 2.3235885585691243e-06, "loss": 0.532, "step": 6229 }, { "epoch": 0.443779606083271, "grad_norm": 5.650603771209717, "learning_rate": 2.3210717919231117e-06, "loss": 0.7637, "step": 6230 }, { "epoch": 0.4438508387648253, "grad_norm": 4.105041980743408, "learning_rate": 2.318556210056648e-06, "loss": 0.4738, "step": 6231 }, { "epoch": 0.4439220714463796, "grad_norm": 2.9684228897094727, "learning_rate": 2.3160418133578544e-06, "loss": 0.6258, "step": 6232 }, { "epoch": 0.4439933041279339, "grad_norm": 2.26766300201416, "learning_rate": 2.3135286022146785e-06, "loss": 0.1936, "step": 6233 }, { "epoch": 0.4440645368094882, "grad_norm": 2.62985897064209, "learning_rate": 2.3110165770148873e-06, "loss": 0.4906, "step": 6234 }, { "epoch": 0.4441357694910425, "grad_norm": 3.7992422580718994, "learning_rate": 2.308505738146055e-06, "loss": 0.2002, "step": 6235 }, { "epoch": 0.4442070021725968, "grad_norm": 2.6593520641326904, "learning_rate": 2.3059960859955798e-06, "loss": 0.4687, "step": 6236 }, { "epoch": 0.4442782348541511, "grad_norm": 2.7827281951904297, "learning_rate": 2.303487620950677e-06, "loss": 0.5086, "step": 6237 }, { "epoch": 0.44434946753570537, "grad_norm": 1.4494383335113525, "learning_rate": 2.3009803433983744e-06, "loss": 0.1386, "step": 6238 }, { "epoch": 0.44442070021725966, "grad_norm": 3.0305140018463135, "learning_rate": 2.2984742537255233e-06, "loss": 0.294, "step": 6239 }, { "epoch": 0.444491932898814, "grad_norm": 4.01397180557251, "learning_rate": 2.2959693523187808e-06, "loss": 0.4119, "step": 6240 }, { "epoch": 0.4445631655803683, "grad_norm": 3.8499155044555664, "learning_rate": 2.2934656395646336e-06, "loss": 0.8116, "step": 6241 }, { "epoch": 0.4446343982619226, "grad_norm": 2.1110496520996094, "learning_rate": 2.290963115849375e-06, "loss": 0.2815, "step": 6242 }, { "epoch": 0.44470563094347687, "grad_norm": 3.1469733715057373, "learning_rate": 2.2884617815591213e-06, "loss": 0.6202, "step": 6243 }, { "epoch": 0.44477686362503116, "grad_norm": 4.142462253570557, "learning_rate": 2.285961637079799e-06, "loss": 0.3927, "step": 6244 }, { "epoch": 0.44484809630658545, "grad_norm": 3.1516494750976562, "learning_rate": 2.283462682797156e-06, "loss": 0.3785, "step": 6245 }, { "epoch": 0.44491932898813974, "grad_norm": 2.3252005577087402, "learning_rate": 2.2809649190967597e-06, "loss": 0.239, "step": 6246 }, { "epoch": 0.44499056166969403, "grad_norm": 4.6240105628967285, "learning_rate": 2.2784683463639832e-06, "loss": 0.4733, "step": 6247 }, { "epoch": 0.4450617943512484, "grad_norm": 7.6564764976501465, "learning_rate": 2.2759729649840232e-06, "loss": 0.4664, "step": 6248 }, { "epoch": 0.44513302703280266, "grad_norm": 4.207352161407471, "learning_rate": 2.2734787753418965e-06, "loss": 0.5429, "step": 6249 }, { "epoch": 0.44520425971435695, "grad_norm": 4.344802379608154, "learning_rate": 2.2709857778224244e-06, "loss": 0.8192, "step": 6250 }, { "epoch": 0.44527549239591124, "grad_norm": 2.633878469467163, "learning_rate": 2.2684939728102528e-06, "loss": 0.2756, "step": 6251 }, { "epoch": 0.44534672507746553, "grad_norm": 2.53969144821167, "learning_rate": 2.2660033606898447e-06, "loss": 0.5382, "step": 6252 }, { "epoch": 0.4454179577590198, "grad_norm": 4.462120056152344, "learning_rate": 2.263513941845471e-06, "loss": 0.5719, "step": 6253 }, { "epoch": 0.4454891904405741, "grad_norm": 3.4386024475097656, "learning_rate": 2.261025716661225e-06, "loss": 0.793, "step": 6254 }, { "epoch": 0.44556042312212846, "grad_norm": 2.3348007202148438, "learning_rate": 2.2585386855210177e-06, "loss": 0.4461, "step": 6255 }, { "epoch": 0.44563165580368275, "grad_norm": 2.9384138584136963, "learning_rate": 2.256052848808571e-06, "loss": 0.5146, "step": 6256 }, { "epoch": 0.44570288848523704, "grad_norm": 2.879733085632324, "learning_rate": 2.2535682069074183e-06, "loss": 0.6386, "step": 6257 }, { "epoch": 0.4457741211667913, "grad_norm": 3.4434986114501953, "learning_rate": 2.251084760200921e-06, "loss": 0.3884, "step": 6258 }, { "epoch": 0.4458453538483456, "grad_norm": 2.8639168739318848, "learning_rate": 2.248602509072245e-06, "loss": 0.5093, "step": 6259 }, { "epoch": 0.4459165865298999, "grad_norm": 3.270294189453125, "learning_rate": 2.2461214539043773e-06, "loss": 0.5742, "step": 6260 }, { "epoch": 0.4459878192114542, "grad_norm": 2.332632303237915, "learning_rate": 2.2436415950801228e-06, "loss": 0.1386, "step": 6261 }, { "epoch": 0.4460590518930085, "grad_norm": 3.37229061126709, "learning_rate": 2.241162932982093e-06, "loss": 0.5539, "step": 6262 }, { "epoch": 0.44613028457456283, "grad_norm": 3.1763665676116943, "learning_rate": 2.2386854679927215e-06, "loss": 0.5837, "step": 6263 }, { "epoch": 0.4462015172561171, "grad_norm": 1.9344137907028198, "learning_rate": 2.2362092004942583e-06, "loss": 0.2788, "step": 6264 }, { "epoch": 0.4462727499376714, "grad_norm": 2.9812631607055664, "learning_rate": 2.233734130868762e-06, "loss": 0.649, "step": 6265 }, { "epoch": 0.4463439826192257, "grad_norm": 5.884324073791504, "learning_rate": 2.2312602594981126e-06, "loss": 0.7812, "step": 6266 }, { "epoch": 0.44641521530078, "grad_norm": 2.591636896133423, "learning_rate": 2.228787586764004e-06, "loss": 0.6458, "step": 6267 }, { "epoch": 0.4464864479823343, "grad_norm": 2.4004430770874023, "learning_rate": 2.2263161130479405e-06, "loss": 0.3064, "step": 6268 }, { "epoch": 0.44655768066388857, "grad_norm": 3.1187596321105957, "learning_rate": 2.2238458387312476e-06, "loss": 0.2714, "step": 6269 }, { "epoch": 0.4466289133454429, "grad_norm": 2.775867223739624, "learning_rate": 2.2213767641950658e-06, "loss": 0.3771, "step": 6270 }, { "epoch": 0.4467001460269972, "grad_norm": 3.4592645168304443, "learning_rate": 2.2189088898203446e-06, "loss": 0.6352, "step": 6271 }, { "epoch": 0.4467713787085515, "grad_norm": 2.272244930267334, "learning_rate": 2.2164422159878496e-06, "loss": 0.3253, "step": 6272 }, { "epoch": 0.4468426113901058, "grad_norm": 4.355688571929932, "learning_rate": 2.2139767430781654e-06, "loss": 0.7708, "step": 6273 }, { "epoch": 0.44691384407166007, "grad_norm": 1.973746657371521, "learning_rate": 2.211512471471692e-06, "loss": 0.2195, "step": 6274 }, { "epoch": 0.44698507675321436, "grad_norm": 3.7451164722442627, "learning_rate": 2.2090494015486354e-06, "loss": 0.5641, "step": 6275 }, { "epoch": 0.44705630943476865, "grad_norm": 2.5340158939361572, "learning_rate": 2.206587533689025e-06, "loss": 0.1796, "step": 6276 }, { "epoch": 0.447127542116323, "grad_norm": 3.037799596786499, "learning_rate": 2.2041268682727034e-06, "loss": 0.6459, "step": 6277 }, { "epoch": 0.4471987747978773, "grad_norm": 1.384698510169983, "learning_rate": 2.2016674056793232e-06, "loss": 0.0637, "step": 6278 }, { "epoch": 0.4472700074794316, "grad_norm": 2.7998573780059814, "learning_rate": 2.1992091462883537e-06, "loss": 0.4782, "step": 6279 }, { "epoch": 0.44734124016098586, "grad_norm": 3.2574245929718018, "learning_rate": 2.196752090479083e-06, "loss": 0.5286, "step": 6280 }, { "epoch": 0.44741247284254015, "grad_norm": 3.3409149646759033, "learning_rate": 2.194296238630604e-06, "loss": 0.2568, "step": 6281 }, { "epoch": 0.44748370552409444, "grad_norm": 3.1807286739349365, "learning_rate": 2.1918415911218327e-06, "loss": 0.7231, "step": 6282 }, { "epoch": 0.44755493820564873, "grad_norm": 2.5603461265563965, "learning_rate": 2.189388148331498e-06, "loss": 0.435, "step": 6283 }, { "epoch": 0.447626170887203, "grad_norm": 3.1325812339782715, "learning_rate": 2.186935910638136e-06, "loss": 0.5316, "step": 6284 }, { "epoch": 0.44769740356875737, "grad_norm": 3.2706100940704346, "learning_rate": 2.1844848784201067e-06, "loss": 0.4098, "step": 6285 }, { "epoch": 0.44776863625031166, "grad_norm": 4.450396537780762, "learning_rate": 2.182035052055573e-06, "loss": 0.555, "step": 6286 }, { "epoch": 0.44783986893186595, "grad_norm": 6.315605163574219, "learning_rate": 2.1795864319225246e-06, "loss": 0.958, "step": 6287 }, { "epoch": 0.44791110161342024, "grad_norm": 2.987016439437866, "learning_rate": 2.177139018398752e-06, "loss": 0.6766, "step": 6288 }, { "epoch": 0.4479823342949745, "grad_norm": 3.626783609390259, "learning_rate": 2.1746928118618717e-06, "loss": 0.5249, "step": 6289 }, { "epoch": 0.4480535669765288, "grad_norm": 3.2939019203186035, "learning_rate": 2.1722478126893022e-06, "loss": 0.5851, "step": 6290 }, { "epoch": 0.4481247996580831, "grad_norm": 3.8941967487335205, "learning_rate": 2.1698040212582862e-06, "loss": 0.5731, "step": 6291 }, { "epoch": 0.44819603233963745, "grad_norm": 2.622661828994751, "learning_rate": 2.167361437945876e-06, "loss": 0.4494, "step": 6292 }, { "epoch": 0.44826726502119174, "grad_norm": 3.654175043106079, "learning_rate": 2.1649200631289322e-06, "loss": 0.4997, "step": 6293 }, { "epoch": 0.44833849770274603, "grad_norm": 2.8275599479675293, "learning_rate": 2.162479897184139e-06, "loss": 0.5286, "step": 6294 }, { "epoch": 0.4484097303843003, "grad_norm": 2.043207883834839, "learning_rate": 2.1600409404879875e-06, "loss": 0.2082, "step": 6295 }, { "epoch": 0.4484809630658546, "grad_norm": 2.193429708480835, "learning_rate": 2.157603193416781e-06, "loss": 0.1849, "step": 6296 }, { "epoch": 0.4485521957474089, "grad_norm": 2.3078808784484863, "learning_rate": 2.1551666563466413e-06, "loss": 0.2991, "step": 6297 }, { "epoch": 0.4486234284289632, "grad_norm": 3.118712902069092, "learning_rate": 2.152731329653502e-06, "loss": 0.7541, "step": 6298 }, { "epoch": 0.4486946611105175, "grad_norm": 6.2220282554626465, "learning_rate": 2.150297213713105e-06, "loss": 0.529, "step": 6299 }, { "epoch": 0.4487658937920718, "grad_norm": 2.7494149208068848, "learning_rate": 2.1478643089010143e-06, "loss": 0.2644, "step": 6300 }, { "epoch": 0.4488371264736261, "grad_norm": 2.1791200637817383, "learning_rate": 2.1454326155925966e-06, "loss": 0.2333, "step": 6301 }, { "epoch": 0.4489083591551804, "grad_norm": 3.081509590148926, "learning_rate": 2.1430021341630424e-06, "loss": 0.8075, "step": 6302 }, { "epoch": 0.4489795918367347, "grad_norm": 2.457503080368042, "learning_rate": 2.1405728649873458e-06, "loss": 0.5151, "step": 6303 }, { "epoch": 0.449050824518289, "grad_norm": 2.5977909564971924, "learning_rate": 2.138144808440321e-06, "loss": 0.1783, "step": 6304 }, { "epoch": 0.44912205719984327, "grad_norm": 2.006420612335205, "learning_rate": 2.13571796489659e-06, "loss": 0.1461, "step": 6305 }, { "epoch": 0.44919328988139756, "grad_norm": 2.6872735023498535, "learning_rate": 2.133292334730589e-06, "loss": 0.604, "step": 6306 }, { "epoch": 0.4492645225629519, "grad_norm": 3.830747127532959, "learning_rate": 2.1308679183165693e-06, "loss": 0.9836, "step": 6307 }, { "epoch": 0.4493357552445062, "grad_norm": 2.591790199279785, "learning_rate": 2.128444716028597e-06, "loss": 0.3375, "step": 6308 }, { "epoch": 0.4494069879260605, "grad_norm": 4.348878383636475, "learning_rate": 2.12602272824054e-06, "loss": 0.7728, "step": 6309 }, { "epoch": 0.4494782206076148, "grad_norm": 2.914680242538452, "learning_rate": 2.123601955326091e-06, "loss": 0.5439, "step": 6310 }, { "epoch": 0.44954945328916907, "grad_norm": 1.9101747274398804, "learning_rate": 2.1211823976587508e-06, "loss": 0.1617, "step": 6311 }, { "epoch": 0.44962068597072335, "grad_norm": 3.814605712890625, "learning_rate": 2.118764055611828e-06, "loss": 0.9661, "step": 6312 }, { "epoch": 0.44969191865227764, "grad_norm": 3.789152145385742, "learning_rate": 2.1163469295584504e-06, "loss": 0.8037, "step": 6313 }, { "epoch": 0.449763151333832, "grad_norm": 2.4426026344299316, "learning_rate": 2.113931019871559e-06, "loss": 0.137, "step": 6314 }, { "epoch": 0.4498343840153863, "grad_norm": 2.185706615447998, "learning_rate": 2.1115163269238992e-06, "loss": 0.3173, "step": 6315 }, { "epoch": 0.44990561669694057, "grad_norm": 2.9182868003845215, "learning_rate": 2.109102851088033e-06, "loss": 0.4701, "step": 6316 }, { "epoch": 0.44997684937849486, "grad_norm": 2.855363130569458, "learning_rate": 2.106690592736338e-06, "loss": 0.3056, "step": 6317 }, { "epoch": 0.45004808206004915, "grad_norm": 2.9583773612976074, "learning_rate": 2.1042795522409977e-06, "loss": 0.7341, "step": 6318 }, { "epoch": 0.45011931474160344, "grad_norm": 3.6918089389801025, "learning_rate": 2.101869729974011e-06, "loss": 0.5232, "step": 6319 }, { "epoch": 0.4501905474231577, "grad_norm": 5.348743438720703, "learning_rate": 2.099461126307194e-06, "loss": 0.2204, "step": 6320 }, { "epoch": 0.450261780104712, "grad_norm": 3.7337305545806885, "learning_rate": 2.0970537416121617e-06, "loss": 0.3796, "step": 6321 }, { "epoch": 0.45033301278626636, "grad_norm": 2.666795253753662, "learning_rate": 2.0946475762603525e-06, "loss": 0.4037, "step": 6322 }, { "epoch": 0.45040424546782065, "grad_norm": 2.556870222091675, "learning_rate": 2.092242630623016e-06, "loss": 0.3062, "step": 6323 }, { "epoch": 0.45047547814937494, "grad_norm": 2.7773261070251465, "learning_rate": 2.0898389050712044e-06, "loss": 0.6347, "step": 6324 }, { "epoch": 0.45054671083092923, "grad_norm": 2.2585866451263428, "learning_rate": 2.0874363999757906e-06, "loss": 0.2262, "step": 6325 }, { "epoch": 0.4506179435124835, "grad_norm": 2.840648651123047, "learning_rate": 2.08503511570746e-06, "loss": 0.6288, "step": 6326 }, { "epoch": 0.4506891761940378, "grad_norm": 3.353273868560791, "learning_rate": 2.0826350526367e-06, "loss": 0.5778, "step": 6327 }, { "epoch": 0.4507604088755921, "grad_norm": 3.0211691856384277, "learning_rate": 2.0802362111338183e-06, "loss": 0.2682, "step": 6328 }, { "epoch": 0.45083164155714645, "grad_norm": 6.06519889831543, "learning_rate": 2.0778385915689336e-06, "loss": 0.5065, "step": 6329 }, { "epoch": 0.45090287423870073, "grad_norm": 2.8839221000671387, "learning_rate": 2.0754421943119695e-06, "loss": 0.5508, "step": 6330 }, { "epoch": 0.450974106920255, "grad_norm": 2.9481730461120605, "learning_rate": 2.0730470197326702e-06, "loss": 0.3375, "step": 6331 }, { "epoch": 0.4510453396018093, "grad_norm": 4.038989543914795, "learning_rate": 2.0706530682005833e-06, "loss": 0.6119, "step": 6332 }, { "epoch": 0.4511165722833636, "grad_norm": 3.7977545261383057, "learning_rate": 2.06826034008507e-06, "loss": 0.5584, "step": 6333 }, { "epoch": 0.4511878049649179, "grad_norm": 3.963423728942871, "learning_rate": 2.0658688357553036e-06, "loss": 0.8763, "step": 6334 }, { "epoch": 0.4512590376464722, "grad_norm": 4.254647254943848, "learning_rate": 2.063478555580274e-06, "loss": 0.5066, "step": 6335 }, { "epoch": 0.4513302703280265, "grad_norm": 8.69538402557373, "learning_rate": 2.06108949992877e-06, "loss": 0.3331, "step": 6336 }, { "epoch": 0.4514015030095808, "grad_norm": 3.0336554050445557, "learning_rate": 2.0587016691694006e-06, "loss": 0.6768, "step": 6337 }, { "epoch": 0.4514727356911351, "grad_norm": 3.1091878414154053, "learning_rate": 2.0563150636705873e-06, "loss": 0.2836, "step": 6338 }, { "epoch": 0.4515439683726894, "grad_norm": 2.6075050830841064, "learning_rate": 2.053929683800553e-06, "loss": 0.5987, "step": 6339 }, { "epoch": 0.4516152010542437, "grad_norm": 2.797555685043335, "learning_rate": 2.05154552992734e-06, "loss": 0.2512, "step": 6340 }, { "epoch": 0.451686433735798, "grad_norm": 2.2745895385742188, "learning_rate": 2.0491626024188005e-06, "loss": 0.4333, "step": 6341 }, { "epoch": 0.45175766641735227, "grad_norm": 4.45824670791626, "learning_rate": 2.046780901642591e-06, "loss": 0.4453, "step": 6342 }, { "epoch": 0.45182889909890656, "grad_norm": 5.3848466873168945, "learning_rate": 2.0444004279661866e-06, "loss": 0.7062, "step": 6343 }, { "epoch": 0.4519001317804609, "grad_norm": 5.47923469543457, "learning_rate": 2.0420211817568724e-06, "loss": 0.7213, "step": 6344 }, { "epoch": 0.4519713644620152, "grad_norm": 4.919703006744385, "learning_rate": 2.0396431633817348e-06, "loss": 0.3584, "step": 6345 }, { "epoch": 0.4520425971435695, "grad_norm": 1.5846375226974487, "learning_rate": 2.0372663732076847e-06, "loss": 0.1436, "step": 6346 }, { "epoch": 0.45211382982512377, "grad_norm": 2.386284112930298, "learning_rate": 2.03489081160143e-06, "loss": 0.4487, "step": 6347 }, { "epoch": 0.45218506250667806, "grad_norm": 2.2093725204467773, "learning_rate": 2.0325164789295004e-06, "loss": 0.4244, "step": 6348 }, { "epoch": 0.45225629518823235, "grad_norm": 2.9953384399414062, "learning_rate": 2.0301433755582266e-06, "loss": 0.4747, "step": 6349 }, { "epoch": 0.45232752786978664, "grad_norm": 2.634274482727051, "learning_rate": 2.027771501853757e-06, "loss": 0.3182, "step": 6350 }, { "epoch": 0.45239876055134093, "grad_norm": 2.8009018898010254, "learning_rate": 2.025400858182048e-06, "loss": 0.4224, "step": 6351 }, { "epoch": 0.4524699932328953, "grad_norm": 6.4801483154296875, "learning_rate": 2.0230314449088626e-06, "loss": 0.7316, "step": 6352 }, { "epoch": 0.45254122591444956, "grad_norm": 3.899460554122925, "learning_rate": 2.020663262399778e-06, "loss": 0.4124, "step": 6353 }, { "epoch": 0.45261245859600385, "grad_norm": 6.739773750305176, "learning_rate": 2.0182963110201823e-06, "loss": 0.3827, "step": 6354 }, { "epoch": 0.45268369127755814, "grad_norm": 2.761176824569702, "learning_rate": 2.0159305911352688e-06, "loss": 0.4125, "step": 6355 }, { "epoch": 0.45275492395911243, "grad_norm": 2.3198940753936768, "learning_rate": 2.013566103110045e-06, "loss": 0.5653, "step": 6356 }, { "epoch": 0.4528261566406667, "grad_norm": 3.0092873573303223, "learning_rate": 2.0112028473093294e-06, "loss": 0.3433, "step": 6357 }, { "epoch": 0.452897389322221, "grad_norm": 2.53853440284729, "learning_rate": 2.008840824097743e-06, "loss": 0.5559, "step": 6358 }, { "epoch": 0.45296862200377536, "grad_norm": 2.6324641704559326, "learning_rate": 2.006480033839728e-06, "loss": 0.5738, "step": 6359 }, { "epoch": 0.45303985468532965, "grad_norm": 3.0993127822875977, "learning_rate": 2.0041204768995225e-06, "loss": 0.9536, "step": 6360 }, { "epoch": 0.45311108736688394, "grad_norm": 2.3607587814331055, "learning_rate": 2.001762153641189e-06, "loss": 0.1523, "step": 6361 }, { "epoch": 0.4531823200484382, "grad_norm": 2.4994571208953857, "learning_rate": 1.999405064428587e-06, "loss": 0.4516, "step": 6362 }, { "epoch": 0.4532535527299925, "grad_norm": 4.880392074584961, "learning_rate": 1.9970492096253955e-06, "loss": 0.5123, "step": 6363 }, { "epoch": 0.4533247854115468, "grad_norm": 3.2944586277008057, "learning_rate": 1.9946945895950943e-06, "loss": 0.5112, "step": 6364 }, { "epoch": 0.4533960180931011, "grad_norm": 2.6860311031341553, "learning_rate": 1.9923412047009794e-06, "loss": 0.6089, "step": 6365 }, { "epoch": 0.45346725077465544, "grad_norm": 3.4830739498138428, "learning_rate": 1.9899890553061565e-06, "loss": 0.4381, "step": 6366 }, { "epoch": 0.45353848345620973, "grad_norm": 3.7443289756774902, "learning_rate": 1.9876381417735312e-06, "loss": 0.1884, "step": 6367 }, { "epoch": 0.453609716137764, "grad_norm": 3.2143213748931885, "learning_rate": 1.98528846446583e-06, "loss": 0.5967, "step": 6368 }, { "epoch": 0.4536809488193183, "grad_norm": 4.233303070068359, "learning_rate": 1.9829400237455865e-06, "loss": 0.6884, "step": 6369 }, { "epoch": 0.4537521815008726, "grad_norm": 3.418067216873169, "learning_rate": 1.9805928199751336e-06, "loss": 0.7299, "step": 6370 }, { "epoch": 0.4538234141824269, "grad_norm": 4.001426696777344, "learning_rate": 1.9782468535166253e-06, "loss": 0.8095, "step": 6371 }, { "epoch": 0.4538946468639812, "grad_norm": 3.7231404781341553, "learning_rate": 1.975902124732022e-06, "loss": 0.6979, "step": 6372 }, { "epoch": 0.45396587954553547, "grad_norm": 2.9199557304382324, "learning_rate": 1.973558633983087e-06, "loss": 0.7129, "step": 6373 }, { "epoch": 0.4540371122270898, "grad_norm": 2.541935443878174, "learning_rate": 1.971216381631397e-06, "loss": 0.4029, "step": 6374 }, { "epoch": 0.4541083449086441, "grad_norm": 3.0945558547973633, "learning_rate": 1.968875368038342e-06, "loss": 0.5246, "step": 6375 }, { "epoch": 0.4541795775901984, "grad_norm": 3.1955699920654297, "learning_rate": 1.9665355935651133e-06, "loss": 0.5534, "step": 6376 }, { "epoch": 0.4542508102717527, "grad_norm": 2.9667539596557617, "learning_rate": 1.964197058572711e-06, "loss": 0.7607, "step": 6377 }, { "epoch": 0.45432204295330697, "grad_norm": 2.362520933151245, "learning_rate": 1.961859763421953e-06, "loss": 0.428, "step": 6378 }, { "epoch": 0.45439327563486126, "grad_norm": 4.3969244956970215, "learning_rate": 1.959523708473453e-06, "loss": 0.473, "step": 6379 }, { "epoch": 0.45446450831641555, "grad_norm": 3.567908763885498, "learning_rate": 1.9571888940876436e-06, "loss": 0.3078, "step": 6380 }, { "epoch": 0.4545357409979699, "grad_norm": 3.121673107147217, "learning_rate": 1.9548553206247667e-06, "loss": 0.3323, "step": 6381 }, { "epoch": 0.4546069736795242, "grad_norm": 2.7947494983673096, "learning_rate": 1.9525229884448624e-06, "loss": 0.1969, "step": 6382 }, { "epoch": 0.4546782063610785, "grad_norm": 4.290217399597168, "learning_rate": 1.9501918979077874e-06, "loss": 0.7027, "step": 6383 }, { "epoch": 0.45474943904263276, "grad_norm": 3.388784885406494, "learning_rate": 1.947862049373206e-06, "loss": 0.7496, "step": 6384 }, { "epoch": 0.45482067172418705, "grad_norm": 4.435743808746338, "learning_rate": 1.945533443200591e-06, "loss": 0.6381, "step": 6385 }, { "epoch": 0.45489190440574134, "grad_norm": 3.3321945667266846, "learning_rate": 1.9432060797492193e-06, "loss": 0.5644, "step": 6386 }, { "epoch": 0.45496313708729563, "grad_norm": 2.4315483570098877, "learning_rate": 1.94087995937818e-06, "loss": 0.6565, "step": 6387 }, { "epoch": 0.4550343697688499, "grad_norm": 2.1148345470428467, "learning_rate": 1.9385550824463727e-06, "loss": 0.2803, "step": 6388 }, { "epoch": 0.45510560245040427, "grad_norm": 7.497655868530273, "learning_rate": 1.9362314493124965e-06, "loss": 0.4469, "step": 6389 }, { "epoch": 0.45517683513195856, "grad_norm": 9.046839714050293, "learning_rate": 1.9339090603350698e-06, "loss": 0.5036, "step": 6390 }, { "epoch": 0.45524806781351285, "grad_norm": 3.8496623039245605, "learning_rate": 1.9315879158724106e-06, "loss": 0.359, "step": 6391 }, { "epoch": 0.45531930049506714, "grad_norm": 2.717216730117798, "learning_rate": 1.929268016282645e-06, "loss": 0.3884, "step": 6392 }, { "epoch": 0.4553905331766214, "grad_norm": 4.097451686859131, "learning_rate": 1.9269493619237114e-06, "loss": 0.766, "step": 6393 }, { "epoch": 0.4554617658581757, "grad_norm": 3.5936412811279297, "learning_rate": 1.9246319531533574e-06, "loss": 0.5918, "step": 6394 }, { "epoch": 0.45553299853973, "grad_norm": 4.026879787445068, "learning_rate": 1.9223157903291313e-06, "loss": 0.4959, "step": 6395 }, { "epoch": 0.45560423122128435, "grad_norm": 2.3716819286346436, "learning_rate": 1.920000873808394e-06, "loss": 0.5433, "step": 6396 }, { "epoch": 0.45567546390283864, "grad_norm": 2.8551535606384277, "learning_rate": 1.917687203948316e-06, "loss": 0.5482, "step": 6397 }, { "epoch": 0.45574669658439293, "grad_norm": 2.3873302936553955, "learning_rate": 1.91537478110587e-06, "loss": 0.2247, "step": 6398 }, { "epoch": 0.4558179292659472, "grad_norm": 3.8049182891845703, "learning_rate": 1.913063605637838e-06, "loss": 0.129, "step": 6399 }, { "epoch": 0.4558891619475015, "grad_norm": 4.082761287689209, "learning_rate": 1.9107536779008153e-06, "loss": 0.6675, "step": 6400 }, { "epoch": 0.4559603946290558, "grad_norm": 3.3647968769073486, "learning_rate": 1.908444998251194e-06, "loss": 0.6177, "step": 6401 }, { "epoch": 0.4560316273106101, "grad_norm": 3.581413984298706, "learning_rate": 1.9061375670451831e-06, "loss": 0.2609, "step": 6402 }, { "epoch": 0.4561028599921644, "grad_norm": 3.0104637145996094, "learning_rate": 1.903831384638798e-06, "loss": 0.5844, "step": 6403 }, { "epoch": 0.4561740926737187, "grad_norm": 2.767990827560425, "learning_rate": 1.9015264513878528e-06, "loss": 0.4532, "step": 6404 }, { "epoch": 0.456245325355273, "grad_norm": 2.30112624168396, "learning_rate": 1.8992227676479803e-06, "loss": 0.345, "step": 6405 }, { "epoch": 0.4563165580368273, "grad_norm": 2.864879846572876, "learning_rate": 1.8969203337746101e-06, "loss": 0.3506, "step": 6406 }, { "epoch": 0.4563877907183816, "grad_norm": 2.817561149597168, "learning_rate": 1.8946191501229905e-06, "loss": 0.3004, "step": 6407 }, { "epoch": 0.4564590233999359, "grad_norm": 4.448941707611084, "learning_rate": 1.892319217048163e-06, "loss": 0.7499, "step": 6408 }, { "epoch": 0.45653025608149017, "grad_norm": 4.0233330726623535, "learning_rate": 1.8900205349049904e-06, "loss": 0.5481, "step": 6409 }, { "epoch": 0.45660148876304446, "grad_norm": 3.4009838104248047, "learning_rate": 1.8877231040481302e-06, "loss": 0.5824, "step": 6410 }, { "epoch": 0.4566727214445988, "grad_norm": 2.8363654613494873, "learning_rate": 1.8854269248320545e-06, "loss": 0.7246, "step": 6411 }, { "epoch": 0.4567439541261531, "grad_norm": 3.031869411468506, "learning_rate": 1.883131997611043e-06, "loss": 0.6708, "step": 6412 }, { "epoch": 0.4568151868077074, "grad_norm": 2.955056667327881, "learning_rate": 1.8808383227391747e-06, "loss": 0.5351, "step": 6413 }, { "epoch": 0.4568864194892617, "grad_norm": 2.352604866027832, "learning_rate": 1.8785459005703411e-06, "loss": 0.4331, "step": 6414 }, { "epoch": 0.45695765217081596, "grad_norm": 3.237407922744751, "learning_rate": 1.8762547314582435e-06, "loss": 0.3626, "step": 6415 }, { "epoch": 0.45702888485237025, "grad_norm": 4.992914199829102, "learning_rate": 1.8739648157563794e-06, "loss": 0.4984, "step": 6416 }, { "epoch": 0.45710011753392454, "grad_norm": 3.644850969314575, "learning_rate": 1.8716761538180627e-06, "loss": 0.6079, "step": 6417 }, { "epoch": 0.4571713502154789, "grad_norm": 2.506635904312134, "learning_rate": 1.8693887459964123e-06, "loss": 0.2814, "step": 6418 }, { "epoch": 0.4572425828970332, "grad_norm": 4.283682823181152, "learning_rate": 1.8671025926443464e-06, "loss": 0.4468, "step": 6419 }, { "epoch": 0.45731381557858747, "grad_norm": 3.42008113861084, "learning_rate": 1.8648176941146012e-06, "loss": 0.6428, "step": 6420 }, { "epoch": 0.45738504826014176, "grad_norm": 6.659435272216797, "learning_rate": 1.8625340507597056e-06, "loss": 0.3815, "step": 6421 }, { "epoch": 0.45745628094169605, "grad_norm": 2.2295608520507812, "learning_rate": 1.86025166293201e-06, "loss": 0.5507, "step": 6422 }, { "epoch": 0.45752751362325034, "grad_norm": 8.334569931030273, "learning_rate": 1.8579705309836571e-06, "loss": 0.44, "step": 6423 }, { "epoch": 0.4575987463048046, "grad_norm": 2.9834020137786865, "learning_rate": 1.8556906552666042e-06, "loss": 0.3007, "step": 6424 }, { "epoch": 0.4576699789863589, "grad_norm": 2.068946123123169, "learning_rate": 1.8534120361326159e-06, "loss": 0.3188, "step": 6425 }, { "epoch": 0.45774121166791326, "grad_norm": 2.5374064445495605, "learning_rate": 1.8511346739332535e-06, "loss": 0.4464, "step": 6426 }, { "epoch": 0.45781244434946755, "grad_norm": 4.193359851837158, "learning_rate": 1.8488585690198946e-06, "loss": 0.5182, "step": 6427 }, { "epoch": 0.45788367703102184, "grad_norm": 3.3787121772766113, "learning_rate": 1.8465837217437199e-06, "loss": 0.689, "step": 6428 }, { "epoch": 0.45795490971257613, "grad_norm": 5.091341495513916, "learning_rate": 1.8443101324557111e-06, "loss": 0.5854, "step": 6429 }, { "epoch": 0.4580261423941304, "grad_norm": 2.751946210861206, "learning_rate": 1.842037801506661e-06, "loss": 0.4633, "step": 6430 }, { "epoch": 0.4580973750756847, "grad_norm": 1.6666077375411987, "learning_rate": 1.839766729247171e-06, "loss": 0.2048, "step": 6431 }, { "epoch": 0.458168607757239, "grad_norm": 2.7755935192108154, "learning_rate": 1.8374969160276368e-06, "loss": 0.4781, "step": 6432 }, { "epoch": 0.45823984043879334, "grad_norm": 3.2829957008361816, "learning_rate": 1.8352283621982713e-06, "loss": 0.5088, "step": 6433 }, { "epoch": 0.45831107312034763, "grad_norm": 2.1816184520721436, "learning_rate": 1.8329610681090914e-06, "loss": 0.1876, "step": 6434 }, { "epoch": 0.4583823058019019, "grad_norm": 3.8675334453582764, "learning_rate": 1.8306950341099138e-06, "loss": 0.17, "step": 6435 }, { "epoch": 0.4584535384834562, "grad_norm": 2.9724974632263184, "learning_rate": 1.8284302605503624e-06, "loss": 0.413, "step": 6436 }, { "epoch": 0.4585247711650105, "grad_norm": 3.322694778442383, "learning_rate": 1.826166747779874e-06, "loss": 0.5376, "step": 6437 }, { "epoch": 0.4585960038465648, "grad_norm": 1.514337420463562, "learning_rate": 1.8239044961476794e-06, "loss": 0.1774, "step": 6438 }, { "epoch": 0.4586672365281191, "grad_norm": 4.929084777832031, "learning_rate": 1.8216435060028237e-06, "loss": 0.3941, "step": 6439 }, { "epoch": 0.45873846920967337, "grad_norm": 2.613879919052124, "learning_rate": 1.819383777694157e-06, "loss": 0.302, "step": 6440 }, { "epoch": 0.4588097018912277, "grad_norm": 3.503235340118408, "learning_rate": 1.817125311570327e-06, "loss": 0.5738, "step": 6441 }, { "epoch": 0.458880934572782, "grad_norm": 1.6343082189559937, "learning_rate": 1.8148681079797925e-06, "loss": 0.1852, "step": 6442 }, { "epoch": 0.4589521672543363, "grad_norm": 2.3605124950408936, "learning_rate": 1.812612167270823e-06, "loss": 0.171, "step": 6443 }, { "epoch": 0.4590233999358906, "grad_norm": 2.91479754447937, "learning_rate": 1.810357489791479e-06, "loss": 0.2067, "step": 6444 }, { "epoch": 0.4590946326174449, "grad_norm": 3.3121347427368164, "learning_rate": 1.8081040758896361e-06, "loss": 0.7521, "step": 6445 }, { "epoch": 0.45916586529899917, "grad_norm": 3.1146039962768555, "learning_rate": 1.805851925912978e-06, "loss": 0.2852, "step": 6446 }, { "epoch": 0.45923709798055345, "grad_norm": 3.0709993839263916, "learning_rate": 1.803601040208981e-06, "loss": 0.3547, "step": 6447 }, { "epoch": 0.4593083306621078, "grad_norm": 3.309846878051758, "learning_rate": 1.801351419124938e-06, "loss": 0.3278, "step": 6448 }, { "epoch": 0.4593795633436621, "grad_norm": 5.465806484222412, "learning_rate": 1.7991030630079431e-06, "loss": 0.6691, "step": 6449 }, { "epoch": 0.4594507960252164, "grad_norm": 2.5592403411865234, "learning_rate": 1.7968559722048906e-06, "loss": 0.6216, "step": 6450 }, { "epoch": 0.45952202870677067, "grad_norm": 2.4334020614624023, "learning_rate": 1.7946101470624877e-06, "loss": 0.2948, "step": 6451 }, { "epoch": 0.45959326138832496, "grad_norm": 4.775262832641602, "learning_rate": 1.7923655879272395e-06, "loss": 0.7109, "step": 6452 }, { "epoch": 0.45966449406987925, "grad_norm": 2.822122812271118, "learning_rate": 1.7901222951454566e-06, "loss": 0.5362, "step": 6453 }, { "epoch": 0.45973572675143354, "grad_norm": 2.611074924468994, "learning_rate": 1.7878802690632579e-06, "loss": 0.5533, "step": 6454 }, { "epoch": 0.4598069594329878, "grad_norm": 2.6383607387542725, "learning_rate": 1.785639510026569e-06, "loss": 0.4956, "step": 6455 }, { "epoch": 0.4598781921145422, "grad_norm": 2.770038604736328, "learning_rate": 1.7834000183811085e-06, "loss": 0.5189, "step": 6456 }, { "epoch": 0.45994942479609646, "grad_norm": 1.7091611623764038, "learning_rate": 1.7811617944724103e-06, "loss": 0.1821, "step": 6457 }, { "epoch": 0.46002065747765075, "grad_norm": 6.033488750457764, "learning_rate": 1.7789248386458102e-06, "loss": 0.6869, "step": 6458 }, { "epoch": 0.46009189015920504, "grad_norm": 4.0247650146484375, "learning_rate": 1.7766891512464491e-06, "loss": 0.4231, "step": 6459 }, { "epoch": 0.46016312284075933, "grad_norm": 4.170050621032715, "learning_rate": 1.7744547326192662e-06, "loss": 0.513, "step": 6460 }, { "epoch": 0.4602343555223136, "grad_norm": 3.5898590087890625, "learning_rate": 1.7722215831090106e-06, "loss": 0.1435, "step": 6461 }, { "epoch": 0.4603055882038679, "grad_norm": 2.704913854598999, "learning_rate": 1.7699897030602376e-06, "loss": 0.4935, "step": 6462 }, { "epoch": 0.46037682088542226, "grad_norm": 2.485830307006836, "learning_rate": 1.7677590928172994e-06, "loss": 0.3099, "step": 6463 }, { "epoch": 0.46044805356697655, "grad_norm": 2.8727641105651855, "learning_rate": 1.7655297527243587e-06, "loss": 0.5405, "step": 6464 }, { "epoch": 0.46051928624853083, "grad_norm": 3.9489989280700684, "learning_rate": 1.7633016831253757e-06, "loss": 0.8301, "step": 6465 }, { "epoch": 0.4605905189300851, "grad_norm": 3.355722188949585, "learning_rate": 1.7610748843641245e-06, "loss": 0.2879, "step": 6466 }, { "epoch": 0.4606617516116394, "grad_norm": 2.711534023284912, "learning_rate": 1.7588493567841724e-06, "loss": 0.5314, "step": 6467 }, { "epoch": 0.4607329842931937, "grad_norm": 3.5248405933380127, "learning_rate": 1.7566251007288992e-06, "loss": 0.6662, "step": 6468 }, { "epoch": 0.460804216974748, "grad_norm": 1.9741860628128052, "learning_rate": 1.7544021165414793e-06, "loss": 0.2604, "step": 6469 }, { "epoch": 0.46087544965630234, "grad_norm": 3.2706139087677, "learning_rate": 1.7521804045649005e-06, "loss": 0.5181, "step": 6470 }, { "epoch": 0.46094668233785663, "grad_norm": 2.7259857654571533, "learning_rate": 1.7499599651419508e-06, "loss": 0.4199, "step": 6471 }, { "epoch": 0.4610179150194109, "grad_norm": 4.116976737976074, "learning_rate": 1.7477407986152174e-06, "loss": 0.5987, "step": 6472 }, { "epoch": 0.4610891477009652, "grad_norm": 4.288010120391846, "learning_rate": 1.7455229053270973e-06, "loss": 0.6123, "step": 6473 }, { "epoch": 0.4611603803825195, "grad_norm": 2.7004878520965576, "learning_rate": 1.7433062856197902e-06, "loss": 0.4284, "step": 6474 }, { "epoch": 0.4612316130640738, "grad_norm": 2.6090314388275146, "learning_rate": 1.7410909398352937e-06, "loss": 0.4643, "step": 6475 }, { "epoch": 0.4613028457456281, "grad_norm": 7.232074737548828, "learning_rate": 1.7388768683154145e-06, "loss": 0.5699, "step": 6476 }, { "epoch": 0.46137407842718237, "grad_norm": 3.252870798110962, "learning_rate": 1.7366640714017647e-06, "loss": 0.5074, "step": 6477 }, { "epoch": 0.4614453111087367, "grad_norm": 3.0352890491485596, "learning_rate": 1.734452549435749e-06, "loss": 0.6338, "step": 6478 }, { "epoch": 0.461516543790291, "grad_norm": 1.956315279006958, "learning_rate": 1.73224230275859e-06, "loss": 0.2979, "step": 6479 }, { "epoch": 0.4615877764718453, "grad_norm": 1.6142736673355103, "learning_rate": 1.7300333317112983e-06, "loss": 0.2108, "step": 6480 }, { "epoch": 0.4616590091533996, "grad_norm": 2.4774441719055176, "learning_rate": 1.7278256366347034e-06, "loss": 0.6041, "step": 6481 }, { "epoch": 0.46173024183495387, "grad_norm": 2.813711643218994, "learning_rate": 1.725619217869422e-06, "loss": 0.5928, "step": 6482 }, { "epoch": 0.46180147451650816, "grad_norm": 3.444661855697632, "learning_rate": 1.7234140757558892e-06, "loss": 0.5263, "step": 6483 }, { "epoch": 0.46187270719806245, "grad_norm": 2.1454854011535645, "learning_rate": 1.7212102106343287e-06, "loss": 0.4439, "step": 6484 }, { "epoch": 0.4619439398796168, "grad_norm": 3.627229928970337, "learning_rate": 1.7190076228447782e-06, "loss": 0.6154, "step": 6485 }, { "epoch": 0.4620151725611711, "grad_norm": 3.084658622741699, "learning_rate": 1.7168063127270762e-06, "loss": 0.5618, "step": 6486 }, { "epoch": 0.4620864052427254, "grad_norm": 3.5990757942199707, "learning_rate": 1.7146062806208573e-06, "loss": 0.5913, "step": 6487 }, { "epoch": 0.46215763792427966, "grad_norm": 2.359935760498047, "learning_rate": 1.7124075268655672e-06, "loss": 0.3764, "step": 6488 }, { "epoch": 0.46222887060583395, "grad_norm": 3.6043341159820557, "learning_rate": 1.7102100518004517e-06, "loss": 0.6907, "step": 6489 }, { "epoch": 0.46230010328738824, "grad_norm": 2.4643008708953857, "learning_rate": 1.7080138557645543e-06, "loss": 0.4231, "step": 6490 }, { "epoch": 0.46237133596894253, "grad_norm": 10.855515480041504, "learning_rate": 1.7058189390967272e-06, "loss": 0.472, "step": 6491 }, { "epoch": 0.4624425686504968, "grad_norm": 2.5654218196868896, "learning_rate": 1.7036253021356275e-06, "loss": 0.3843, "step": 6492 }, { "epoch": 0.46251380133205117, "grad_norm": 5.8926520347595215, "learning_rate": 1.7014329452197054e-06, "loss": 0.7223, "step": 6493 }, { "epoch": 0.46258503401360546, "grad_norm": 3.413288116455078, "learning_rate": 1.6992418686872203e-06, "loss": 0.6356, "step": 6494 }, { "epoch": 0.46265626669515975, "grad_norm": 10.97032356262207, "learning_rate": 1.6970520728762374e-06, "loss": 0.3436, "step": 6495 }, { "epoch": 0.46272749937671404, "grad_norm": 2.2926626205444336, "learning_rate": 1.6948635581246142e-06, "loss": 0.444, "step": 6496 }, { "epoch": 0.4627987320582683, "grad_norm": 2.4920904636383057, "learning_rate": 1.6926763247700163e-06, "loss": 0.2521, "step": 6497 }, { "epoch": 0.4628699647398226, "grad_norm": 3.9400460720062256, "learning_rate": 1.6904903731499122e-06, "loss": 0.7414, "step": 6498 }, { "epoch": 0.4629411974213769, "grad_norm": 3.4644641876220703, "learning_rate": 1.688305703601575e-06, "loss": 0.6528, "step": 6499 }, { "epoch": 0.46301243010293125, "grad_norm": 2.3085365295410156, "learning_rate": 1.686122316462071e-06, "loss": 0.5324, "step": 6500 }, { "epoch": 0.46308366278448554, "grad_norm": 4.641005516052246, "learning_rate": 1.6839402120682768e-06, "loss": 0.6204, "step": 6501 }, { "epoch": 0.46315489546603983, "grad_norm": 3.5865602493286133, "learning_rate": 1.681759390756873e-06, "loss": 0.4236, "step": 6502 }, { "epoch": 0.4632261281475941, "grad_norm": 1.9229716062545776, "learning_rate": 1.6795798528643304e-06, "loss": 0.1912, "step": 6503 }, { "epoch": 0.4632973608291484, "grad_norm": 1.056739091873169, "learning_rate": 1.677401598726932e-06, "loss": 0.0476, "step": 6504 }, { "epoch": 0.4633685935107027, "grad_norm": 2.1217455863952637, "learning_rate": 1.6752246286807638e-06, "loss": 0.1661, "step": 6505 }, { "epoch": 0.463439826192257, "grad_norm": 2.834165096282959, "learning_rate": 1.6730489430617048e-06, "loss": 0.6505, "step": 6506 }, { "epoch": 0.4635110588738113, "grad_norm": 2.1951916217803955, "learning_rate": 1.670874542205443e-06, "loss": 0.2556, "step": 6507 }, { "epoch": 0.4635822915553656, "grad_norm": 2.890962839126587, "learning_rate": 1.6687014264474677e-06, "loss": 0.5041, "step": 6508 }, { "epoch": 0.4636535242369199, "grad_norm": 1.5172946453094482, "learning_rate": 1.6665295961230644e-06, "loss": 0.2261, "step": 6509 }, { "epoch": 0.4637247569184742, "grad_norm": 5.233238220214844, "learning_rate": 1.664359051567328e-06, "loss": 0.4572, "step": 6510 }, { "epoch": 0.4637959896000285, "grad_norm": 2.722219705581665, "learning_rate": 1.6621897931151498e-06, "loss": 0.3614, "step": 6511 }, { "epoch": 0.4638672222815828, "grad_norm": 2.8046984672546387, "learning_rate": 1.660021821101222e-06, "loss": 0.4138, "step": 6512 }, { "epoch": 0.46393845496313707, "grad_norm": 5.629184722900391, "learning_rate": 1.6578551358600415e-06, "loss": 0.5871, "step": 6513 }, { "epoch": 0.46400968764469136, "grad_norm": 4.361949443817139, "learning_rate": 1.6556897377259085e-06, "loss": 0.7852, "step": 6514 }, { "epoch": 0.4640809203262457, "grad_norm": 3.8532114028930664, "learning_rate": 1.653525627032917e-06, "loss": 0.4143, "step": 6515 }, { "epoch": 0.4641521530078, "grad_norm": 4.3455119132995605, "learning_rate": 1.6513628041149688e-06, "loss": 0.3488, "step": 6516 }, { "epoch": 0.4642233856893543, "grad_norm": 3.508120059967041, "learning_rate": 1.649201269305768e-06, "loss": 0.611, "step": 6517 }, { "epoch": 0.4642946183709086, "grad_norm": 2.5644407272338867, "learning_rate": 1.6470410229388134e-06, "loss": 0.3778, "step": 6518 }, { "epoch": 0.46436585105246286, "grad_norm": 5.747261047363281, "learning_rate": 1.6448820653474084e-06, "loss": 0.2529, "step": 6519 }, { "epoch": 0.46443708373401715, "grad_norm": 5.4300665855407715, "learning_rate": 1.6427243968646632e-06, "loss": 0.6348, "step": 6520 }, { "epoch": 0.46450831641557144, "grad_norm": 2.5853934288024902, "learning_rate": 1.6405680178234784e-06, "loss": 0.6554, "step": 6521 }, { "epoch": 0.4645795490971258, "grad_norm": 3.8222577571868896, "learning_rate": 1.638412928556562e-06, "loss": 0.5517, "step": 6522 }, { "epoch": 0.4646507817786801, "grad_norm": 2.297990083694458, "learning_rate": 1.6362591293964247e-06, "loss": 0.2241, "step": 6523 }, { "epoch": 0.46472201446023437, "grad_norm": 6.138814449310303, "learning_rate": 1.634106620675373e-06, "loss": 0.303, "step": 6524 }, { "epoch": 0.46479324714178866, "grad_norm": 3.318302631378174, "learning_rate": 1.631955402725519e-06, "loss": 0.7415, "step": 6525 }, { "epoch": 0.46486447982334295, "grad_norm": 4.856845378875732, "learning_rate": 1.6298054758787707e-06, "loss": 0.5901, "step": 6526 }, { "epoch": 0.46493571250489724, "grad_norm": 3.0052976608276367, "learning_rate": 1.6276568404668425e-06, "loss": 0.2179, "step": 6527 }, { "epoch": 0.4650069451864515, "grad_norm": 3.4598820209503174, "learning_rate": 1.6255094968212436e-06, "loss": 0.1593, "step": 6528 }, { "epoch": 0.4650781778680058, "grad_norm": 3.439640998840332, "learning_rate": 1.6233634452732916e-06, "loss": 0.7047, "step": 6529 }, { "epoch": 0.46514941054956016, "grad_norm": 3.302255392074585, "learning_rate": 1.6212186861540946e-06, "loss": 0.366, "step": 6530 }, { "epoch": 0.46522064323111445, "grad_norm": 3.0571680068969727, "learning_rate": 1.619075219794569e-06, "loss": 0.393, "step": 6531 }, { "epoch": 0.46529187591266874, "grad_norm": 2.7455904483795166, "learning_rate": 1.616933046525433e-06, "loss": 0.5592, "step": 6532 }, { "epoch": 0.46536310859422303, "grad_norm": 3.5418179035186768, "learning_rate": 1.614792166677197e-06, "loss": 0.4078, "step": 6533 }, { "epoch": 0.4654343412757773, "grad_norm": 7.029465198516846, "learning_rate": 1.6126525805801786e-06, "loss": 0.7612, "step": 6534 }, { "epoch": 0.4655055739573316, "grad_norm": 3.5011441707611084, "learning_rate": 1.610514288564493e-06, "loss": 0.4578, "step": 6535 }, { "epoch": 0.4655768066388859, "grad_norm": 3.096881628036499, "learning_rate": 1.6083772909600614e-06, "loss": 0.3986, "step": 6536 }, { "epoch": 0.46564803932044024, "grad_norm": 1.701206088066101, "learning_rate": 1.6062415880965932e-06, "loss": 0.2835, "step": 6537 }, { "epoch": 0.46571927200199453, "grad_norm": 2.8044943809509277, "learning_rate": 1.60410718030361e-06, "loss": 0.7674, "step": 6538 }, { "epoch": 0.4657905046835488, "grad_norm": 2.957308053970337, "learning_rate": 1.6019740679104301e-06, "loss": 0.2237, "step": 6539 }, { "epoch": 0.4658617373651031, "grad_norm": 2.7347910404205322, "learning_rate": 1.5998422512461687e-06, "loss": 0.6657, "step": 6540 }, { "epoch": 0.4659329700466574, "grad_norm": 3.7717366218566895, "learning_rate": 1.5977117306397394e-06, "loss": 0.2085, "step": 6541 }, { "epoch": 0.4660042027282117, "grad_norm": 2.1448869705200195, "learning_rate": 1.5955825064198671e-06, "loss": 0.2586, "step": 6542 }, { "epoch": 0.466075435409766, "grad_norm": 3.1186156272888184, "learning_rate": 1.5934545789150625e-06, "loss": 0.5558, "step": 6543 }, { "epoch": 0.46614666809132027, "grad_norm": 3.7518088817596436, "learning_rate": 1.591327948453646e-06, "loss": 0.6981, "step": 6544 }, { "epoch": 0.4662179007728746, "grad_norm": 2.4236953258514404, "learning_rate": 1.5892026153637363e-06, "loss": 0.455, "step": 6545 }, { "epoch": 0.4662891334544289, "grad_norm": 1.4875683784484863, "learning_rate": 1.5870785799732459e-06, "loss": 0.1381, "step": 6546 }, { "epoch": 0.4663603661359832, "grad_norm": 3.66121506690979, "learning_rate": 1.5849558426098955e-06, "loss": 0.3265, "step": 6547 }, { "epoch": 0.4664315988175375, "grad_norm": 3.2124414443969727, "learning_rate": 1.5828344036012012e-06, "loss": 0.4937, "step": 6548 }, { "epoch": 0.4665028314990918, "grad_norm": 3.5985605716705322, "learning_rate": 1.5807142632744776e-06, "loss": 0.7212, "step": 6549 }, { "epoch": 0.46657406418064606, "grad_norm": 2.2822561264038086, "learning_rate": 1.57859542195684e-06, "loss": 0.2648, "step": 6550 }, { "epoch": 0.46664529686220035, "grad_norm": 4.280763626098633, "learning_rate": 1.5764778799752079e-06, "loss": 0.4693, "step": 6551 }, { "epoch": 0.4667165295437547, "grad_norm": 2.929579973220825, "learning_rate": 1.5743616376562921e-06, "loss": 0.3696, "step": 6552 }, { "epoch": 0.466787762225309, "grad_norm": 5.497262477874756, "learning_rate": 1.5722466953266068e-06, "loss": 0.2582, "step": 6553 }, { "epoch": 0.4668589949068633, "grad_norm": 3.9009182453155518, "learning_rate": 1.5701330533124704e-06, "loss": 0.7119, "step": 6554 }, { "epoch": 0.46693022758841757, "grad_norm": 3.7729132175445557, "learning_rate": 1.5680207119399926e-06, "loss": 0.7514, "step": 6555 }, { "epoch": 0.46700146026997186, "grad_norm": 3.2792844772338867, "learning_rate": 1.5659096715350842e-06, "loss": 0.3014, "step": 6556 }, { "epoch": 0.46707269295152615, "grad_norm": 2.5813825130462646, "learning_rate": 1.563799932423462e-06, "loss": 0.1855, "step": 6557 }, { "epoch": 0.46714392563308044, "grad_norm": 2.6247329711914062, "learning_rate": 1.5616914949306316e-06, "loss": 0.5623, "step": 6558 }, { "epoch": 0.4672151583146348, "grad_norm": 2.576932907104492, "learning_rate": 1.559584359381906e-06, "loss": 0.3245, "step": 6559 }, { "epoch": 0.46728639099618907, "grad_norm": 3.5239462852478027, "learning_rate": 1.557478526102396e-06, "loss": 0.5016, "step": 6560 }, { "epoch": 0.46735762367774336, "grad_norm": 2.6840529441833496, "learning_rate": 1.5553739954170055e-06, "loss": 0.5234, "step": 6561 }, { "epoch": 0.46742885635929765, "grad_norm": 5.744152069091797, "learning_rate": 1.5532707676504455e-06, "loss": 0.3768, "step": 6562 }, { "epoch": 0.46750008904085194, "grad_norm": 2.9437851905822754, "learning_rate": 1.5511688431272242e-06, "loss": 0.4909, "step": 6563 }, { "epoch": 0.46757132172240623, "grad_norm": 3.071855306625366, "learning_rate": 1.5490682221716413e-06, "loss": 0.5848, "step": 6564 }, { "epoch": 0.4676425544039605, "grad_norm": 2.5293354988098145, "learning_rate": 1.5469689051078041e-06, "loss": 0.3411, "step": 6565 }, { "epoch": 0.4677137870855148, "grad_norm": 2.5778539180755615, "learning_rate": 1.5448708922596178e-06, "loss": 0.2991, "step": 6566 }, { "epoch": 0.46778501976706915, "grad_norm": 3.1205360889434814, "learning_rate": 1.5427741839507804e-06, "loss": 0.5954, "step": 6567 }, { "epoch": 0.46785625244862344, "grad_norm": 2.420848846435547, "learning_rate": 1.540678780504793e-06, "loss": 0.4385, "step": 6568 }, { "epoch": 0.46792748513017773, "grad_norm": 1.990368366241455, "learning_rate": 1.538584682244958e-06, "loss": 0.1023, "step": 6569 }, { "epoch": 0.467998717811732, "grad_norm": 4.09674596786499, "learning_rate": 1.5364918894943682e-06, "loss": 0.5381, "step": 6570 }, { "epoch": 0.4680699504932863, "grad_norm": 3.6959357261657715, "learning_rate": 1.534400402575925e-06, "loss": 0.6571, "step": 6571 }, { "epoch": 0.4681411831748406, "grad_norm": 2.1228511333465576, "learning_rate": 1.5323102218123186e-06, "loss": 0.1544, "step": 6572 }, { "epoch": 0.4682124158563949, "grad_norm": 4.020543575286865, "learning_rate": 1.5302213475260475e-06, "loss": 0.8741, "step": 6573 }, { "epoch": 0.46828364853794924, "grad_norm": 3.395815849304199, "learning_rate": 1.528133780039397e-06, "loss": 0.5079, "step": 6574 }, { "epoch": 0.4683548812195035, "grad_norm": 3.723921537399292, "learning_rate": 1.5260475196744618e-06, "loss": 0.2649, "step": 6575 }, { "epoch": 0.4684261139010578, "grad_norm": 3.9587326049804688, "learning_rate": 1.5239625667531322e-06, "loss": 0.4013, "step": 6576 }, { "epoch": 0.4684973465826121, "grad_norm": 4.867684364318848, "learning_rate": 1.5218789215970897e-06, "loss": 0.6773, "step": 6577 }, { "epoch": 0.4685685792641664, "grad_norm": 2.9534149169921875, "learning_rate": 1.5197965845278217e-06, "loss": 0.4418, "step": 6578 }, { "epoch": 0.4686398119457207, "grad_norm": 2.6740307807922363, "learning_rate": 1.5177155558666135e-06, "loss": 0.4061, "step": 6579 }, { "epoch": 0.468711044627275, "grad_norm": 3.5342018604278564, "learning_rate": 1.5156358359345425e-06, "loss": 0.5769, "step": 6580 }, { "epoch": 0.46878227730882926, "grad_norm": 2.3838906288146973, "learning_rate": 1.5135574250524898e-06, "loss": 0.5104, "step": 6581 }, { "epoch": 0.4688535099903836, "grad_norm": 3.429731607437134, "learning_rate": 1.5114803235411346e-06, "loss": 0.4144, "step": 6582 }, { "epoch": 0.4689247426719379, "grad_norm": 2.609485387802124, "learning_rate": 1.5094045317209493e-06, "loss": 0.3734, "step": 6583 }, { "epoch": 0.4689959753534922, "grad_norm": 2.6277661323547363, "learning_rate": 1.5073300499122113e-06, "loss": 0.5694, "step": 6584 }, { "epoch": 0.4690672080350465, "grad_norm": 2.152160406112671, "learning_rate": 1.5052568784349852e-06, "loss": 0.4805, "step": 6585 }, { "epoch": 0.46913844071660077, "grad_norm": 2.6383533477783203, "learning_rate": 1.5031850176091467e-06, "loss": 0.3358, "step": 6586 }, { "epoch": 0.46920967339815506, "grad_norm": 2.9883861541748047, "learning_rate": 1.5011144677543576e-06, "loss": 0.3322, "step": 6587 }, { "epoch": 0.46928090607970935, "grad_norm": 2.83713436126709, "learning_rate": 1.499045229190087e-06, "loss": 0.5851, "step": 6588 }, { "epoch": 0.4693521387612637, "grad_norm": 5.251065731048584, "learning_rate": 1.4969773022355927e-06, "loss": 0.4719, "step": 6589 }, { "epoch": 0.469423371442818, "grad_norm": 3.278249740600586, "learning_rate": 1.494910687209935e-06, "loss": 0.449, "step": 6590 }, { "epoch": 0.4694946041243723, "grad_norm": 3.5622665882110596, "learning_rate": 1.4928453844319769e-06, "loss": 0.3146, "step": 6591 }, { "epoch": 0.46956583680592656, "grad_norm": 3.0127949714660645, "learning_rate": 1.4907813942203652e-06, "loss": 0.8918, "step": 6592 }, { "epoch": 0.46963706948748085, "grad_norm": 1.4107050895690918, "learning_rate": 1.4887187168935579e-06, "loss": 0.1157, "step": 6593 }, { "epoch": 0.46970830216903514, "grad_norm": 1.560308575630188, "learning_rate": 1.4866573527698047e-06, "loss": 0.2379, "step": 6594 }, { "epoch": 0.46977953485058943, "grad_norm": 3.9353537559509277, "learning_rate": 1.48459730216715e-06, "loss": 0.8477, "step": 6595 }, { "epoch": 0.4698507675321437, "grad_norm": 1.846720814704895, "learning_rate": 1.4825385654034386e-06, "loss": 0.2629, "step": 6596 }, { "epoch": 0.46992200021369807, "grad_norm": 4.0231122970581055, "learning_rate": 1.4804811427963173e-06, "loss": 0.7095, "step": 6597 }, { "epoch": 0.46999323289525236, "grad_norm": 2.9340505599975586, "learning_rate": 1.478425034663219e-06, "loss": 0.4226, "step": 6598 }, { "epoch": 0.47006446557680664, "grad_norm": 2.7364938259124756, "learning_rate": 1.4763702413213843e-06, "loss": 0.5879, "step": 6599 }, { "epoch": 0.47013569825836093, "grad_norm": 2.336080551147461, "learning_rate": 1.474316763087843e-06, "loss": 0.3743, "step": 6600 }, { "epoch": 0.4702069309399152, "grad_norm": 4.173142910003662, "learning_rate": 1.4722646002794294e-06, "loss": 0.3967, "step": 6601 }, { "epoch": 0.4702781636214695, "grad_norm": 4.170112609863281, "learning_rate": 1.470213753212768e-06, "loss": 0.6245, "step": 6602 }, { "epoch": 0.4703493963030238, "grad_norm": 3.4463725090026855, "learning_rate": 1.468164222204287e-06, "loss": 0.6265, "step": 6603 }, { "epoch": 0.47042062898457815, "grad_norm": 3.7944698333740234, "learning_rate": 1.4661160075702018e-06, "loss": 0.5963, "step": 6604 }, { "epoch": 0.47049186166613244, "grad_norm": 2.5857653617858887, "learning_rate": 1.4640691096265358e-06, "loss": 0.2937, "step": 6605 }, { "epoch": 0.47056309434768673, "grad_norm": 2.6213817596435547, "learning_rate": 1.4620235286891049e-06, "loss": 0.3203, "step": 6606 }, { "epoch": 0.470634327029241, "grad_norm": 3.338888645172119, "learning_rate": 1.4599792650735179e-06, "loss": 0.0624, "step": 6607 }, { "epoch": 0.4707055597107953, "grad_norm": 3.766329050064087, "learning_rate": 1.4579363190951845e-06, "loss": 0.6594, "step": 6608 }, { "epoch": 0.4707767923923496, "grad_norm": 1.542273759841919, "learning_rate": 1.4558946910693127e-06, "loss": 0.1329, "step": 6609 }, { "epoch": 0.4708480250739039, "grad_norm": 2.676816701889038, "learning_rate": 1.453854381310902e-06, "loss": 0.3962, "step": 6610 }, { "epoch": 0.47091925775545823, "grad_norm": 2.192960262298584, "learning_rate": 1.451815390134751e-06, "loss": 0.1554, "step": 6611 }, { "epoch": 0.4709904904370125, "grad_norm": 2.4474425315856934, "learning_rate": 1.449777717855455e-06, "loss": 0.4886, "step": 6612 }, { "epoch": 0.4710617231185668, "grad_norm": 5.298205852508545, "learning_rate": 1.4477413647874106e-06, "loss": 0.9578, "step": 6613 }, { "epoch": 0.4711329558001211, "grad_norm": 3.8672733306884766, "learning_rate": 1.4457063312447995e-06, "loss": 0.2686, "step": 6614 }, { "epoch": 0.4712041884816754, "grad_norm": 2.945650577545166, "learning_rate": 1.4436726175416116e-06, "loss": 0.3857, "step": 6615 }, { "epoch": 0.4712754211632297, "grad_norm": 1.3394191265106201, "learning_rate": 1.4416402239916261e-06, "loss": 0.1394, "step": 6616 }, { "epoch": 0.47134665384478397, "grad_norm": 2.064345359802246, "learning_rate": 1.4396091509084175e-06, "loss": 0.5043, "step": 6617 }, { "epoch": 0.47141788652633826, "grad_norm": 5.940158367156982, "learning_rate": 1.4375793986053622e-06, "loss": 0.7498, "step": 6618 }, { "epoch": 0.4714891192078926, "grad_norm": 2.0144550800323486, "learning_rate": 1.4355509673956313e-06, "loss": 0.1001, "step": 6619 }, { "epoch": 0.4715603518894469, "grad_norm": 1.5349955558776855, "learning_rate": 1.4335238575921884e-06, "loss": 0.1669, "step": 6620 }, { "epoch": 0.4716315845710012, "grad_norm": 2.2870047092437744, "learning_rate": 1.431498069507795e-06, "loss": 0.2915, "step": 6621 }, { "epoch": 0.4717028172525555, "grad_norm": 4.914179801940918, "learning_rate": 1.429473603455015e-06, "loss": 0.8856, "step": 6622 }, { "epoch": 0.47177404993410976, "grad_norm": 1.6545122861862183, "learning_rate": 1.4274504597461946e-06, "loss": 0.2067, "step": 6623 }, { "epoch": 0.47184528261566405, "grad_norm": 2.294379949569702, "learning_rate": 1.425428638693489e-06, "loss": 0.3311, "step": 6624 }, { "epoch": 0.47191651529721834, "grad_norm": 3.636535882949829, "learning_rate": 1.4234081406088463e-06, "loss": 0.6868, "step": 6625 }, { "epoch": 0.4719877479787727, "grad_norm": 5.532772541046143, "learning_rate": 1.4213889658040026e-06, "loss": 0.5161, "step": 6626 }, { "epoch": 0.472058980660327, "grad_norm": 2.1767489910125732, "learning_rate": 1.4193711145904988e-06, "loss": 0.4061, "step": 6627 }, { "epoch": 0.47213021334188127, "grad_norm": 2.261375904083252, "learning_rate": 1.4173545872796713e-06, "loss": 0.4357, "step": 6628 }, { "epoch": 0.47220144602343556, "grad_norm": 4.155944347381592, "learning_rate": 1.4153393841826446e-06, "loss": 0.3793, "step": 6629 }, { "epoch": 0.47227267870498985, "grad_norm": 4.631064414978027, "learning_rate": 1.4133255056103478e-06, "loss": 0.4762, "step": 6630 }, { "epoch": 0.47234391138654414, "grad_norm": 2.622234582901001, "learning_rate": 1.4113129518735002e-06, "loss": 0.3891, "step": 6631 }, { "epoch": 0.4724151440680984, "grad_norm": 2.034285068511963, "learning_rate": 1.4093017232826155e-06, "loss": 0.1232, "step": 6632 }, { "epoch": 0.4724863767496527, "grad_norm": 3.184260606765747, "learning_rate": 1.4072918201480078e-06, "loss": 0.4556, "step": 6633 }, { "epoch": 0.47255760943120706, "grad_norm": 5.115048885345459, "learning_rate": 1.405283242779787e-06, "loss": 0.4002, "step": 6634 }, { "epoch": 0.47262884211276135, "grad_norm": 2.875037908554077, "learning_rate": 1.4032759914878501e-06, "loss": 0.366, "step": 6635 }, { "epoch": 0.47270007479431564, "grad_norm": 2.9192068576812744, "learning_rate": 1.401270066581899e-06, "loss": 0.5155, "step": 6636 }, { "epoch": 0.47277130747586993, "grad_norm": 3.3256752490997314, "learning_rate": 1.3992654683714303e-06, "loss": 0.4253, "step": 6637 }, { "epoch": 0.4728425401574242, "grad_norm": 3.0028059482574463, "learning_rate": 1.397262197165725e-06, "loss": 0.2913, "step": 6638 }, { "epoch": 0.4729137728389785, "grad_norm": 4.231089115142822, "learning_rate": 1.3952602532738734e-06, "loss": 0.6412, "step": 6639 }, { "epoch": 0.4729850055205328, "grad_norm": 3.069636583328247, "learning_rate": 1.3932596370047547e-06, "loss": 0.6431, "step": 6640 }, { "epoch": 0.47305623820208714, "grad_norm": 1.7116262912750244, "learning_rate": 1.3912603486670396e-06, "loss": 0.2411, "step": 6641 }, { "epoch": 0.47312747088364143, "grad_norm": 3.493382215499878, "learning_rate": 1.3892623885692003e-06, "loss": 0.6861, "step": 6642 }, { "epoch": 0.4731987035651957, "grad_norm": 3.528439521789551, "learning_rate": 1.3872657570195025e-06, "loss": 0.408, "step": 6643 }, { "epoch": 0.47326993624675, "grad_norm": 3.7245874404907227, "learning_rate": 1.385270454326002e-06, "loss": 0.4869, "step": 6644 }, { "epoch": 0.4733411689283043, "grad_norm": 2.1782777309417725, "learning_rate": 1.3832764807965582e-06, "loss": 0.2917, "step": 6645 }, { "epoch": 0.4734124016098586, "grad_norm": 3.3865835666656494, "learning_rate": 1.3812838367388171e-06, "loss": 0.5119, "step": 6646 }, { "epoch": 0.4734836342914129, "grad_norm": 2.8600010871887207, "learning_rate": 1.379292522460225e-06, "loss": 0.5976, "step": 6647 }, { "epoch": 0.47355486697296717, "grad_norm": 5.083021640777588, "learning_rate": 1.3773025382680195e-06, "loss": 0.4278, "step": 6648 }, { "epoch": 0.4736260996545215, "grad_norm": 3.9565885066986084, "learning_rate": 1.3753138844692348e-06, "loss": 0.6498, "step": 6649 }, { "epoch": 0.4736973323360758, "grad_norm": 3.150566577911377, "learning_rate": 1.3733265613707037e-06, "loss": 0.7928, "step": 6650 }, { "epoch": 0.4737685650176301, "grad_norm": 2.8308160305023193, "learning_rate": 1.3713405692790448e-06, "loss": 0.6548, "step": 6651 }, { "epoch": 0.4738397976991844, "grad_norm": 4.138162612915039, "learning_rate": 1.3693559085006768e-06, "loss": 0.3495, "step": 6652 }, { "epoch": 0.4739110303807387, "grad_norm": 4.102545261383057, "learning_rate": 1.367372579341817e-06, "loss": 0.4397, "step": 6653 }, { "epoch": 0.47398226306229296, "grad_norm": 2.611173391342163, "learning_rate": 1.3653905821084668e-06, "loss": 0.6483, "step": 6654 }, { "epoch": 0.47405349574384725, "grad_norm": 2.3353869915008545, "learning_rate": 1.3634099171064297e-06, "loss": 0.1478, "step": 6655 }, { "epoch": 0.4741247284254016, "grad_norm": 3.054138422012329, "learning_rate": 1.3614305846413056e-06, "loss": 0.0914, "step": 6656 }, { "epoch": 0.4741959611069559, "grad_norm": 4.899764060974121, "learning_rate": 1.3594525850184803e-06, "loss": 0.5379, "step": 6657 }, { "epoch": 0.4742671937885102, "grad_norm": 3.2155354022979736, "learning_rate": 1.3574759185431408e-06, "loss": 0.2781, "step": 6658 }, { "epoch": 0.47433842647006447, "grad_norm": 6.39648962020874, "learning_rate": 1.3555005855202674e-06, "loss": 0.307, "step": 6659 }, { "epoch": 0.47440965915161876, "grad_norm": 3.383511543273926, "learning_rate": 1.3535265862546333e-06, "loss": 0.341, "step": 6660 }, { "epoch": 0.47448089183317305, "grad_norm": 4.3580169677734375, "learning_rate": 1.3515539210508033e-06, "loss": 0.7626, "step": 6661 }, { "epoch": 0.47455212451472734, "grad_norm": 2.9124691486358643, "learning_rate": 1.3495825902131443e-06, "loss": 0.3263, "step": 6662 }, { "epoch": 0.4746233571962817, "grad_norm": 5.067067623138428, "learning_rate": 1.3476125940458062e-06, "loss": 0.4852, "step": 6663 }, { "epoch": 0.47469458987783597, "grad_norm": 2.3191475868225098, "learning_rate": 1.3456439328527426e-06, "loss": 0.2884, "step": 6664 }, { "epoch": 0.47476582255939026, "grad_norm": 2.5844335556030273, "learning_rate": 1.3436766069377006e-06, "loss": 0.3054, "step": 6665 }, { "epoch": 0.47483705524094455, "grad_norm": 2.3713042736053467, "learning_rate": 1.3417106166042127e-06, "loss": 0.3989, "step": 6666 }, { "epoch": 0.47490828792249884, "grad_norm": 1.9888051748275757, "learning_rate": 1.339745962155613e-06, "loss": 0.177, "step": 6667 }, { "epoch": 0.47497952060405313, "grad_norm": 3.774502754211426, "learning_rate": 1.3377826438950315e-06, "loss": 0.5266, "step": 6668 }, { "epoch": 0.4750507532856074, "grad_norm": 2.7767393589019775, "learning_rate": 1.3358206621253812e-06, "loss": 0.4791, "step": 6669 }, { "epoch": 0.4751219859671617, "grad_norm": 2.6602606773376465, "learning_rate": 1.3338600171493787e-06, "loss": 0.2994, "step": 6670 }, { "epoch": 0.47519321864871605, "grad_norm": 2.30703067779541, "learning_rate": 1.3319007092695346e-06, "loss": 0.2419, "step": 6671 }, { "epoch": 0.47526445133027034, "grad_norm": 2.095945119857788, "learning_rate": 1.3299427387881436e-06, "loss": 0.1464, "step": 6672 }, { "epoch": 0.47533568401182463, "grad_norm": 2.71028995513916, "learning_rate": 1.327986106007305e-06, "loss": 0.4484, "step": 6673 }, { "epoch": 0.4754069166933789, "grad_norm": 3.3351969718933105, "learning_rate": 1.3260308112289066e-06, "loss": 0.8554, "step": 6674 }, { "epoch": 0.4754781493749332, "grad_norm": 2.678757905960083, "learning_rate": 1.3240768547546302e-06, "loss": 0.4835, "step": 6675 }, { "epoch": 0.4755493820564875, "grad_norm": 2.534868001937866, "learning_rate": 1.3221242368859489e-06, "loss": 0.3731, "step": 6676 }, { "epoch": 0.4756206147380418, "grad_norm": 2.073946475982666, "learning_rate": 1.320172957924134e-06, "loss": 0.2395, "step": 6677 }, { "epoch": 0.47569184741959614, "grad_norm": 3.585601568222046, "learning_rate": 1.318223018170245e-06, "loss": 0.5856, "step": 6678 }, { "epoch": 0.4757630801011504, "grad_norm": 2.981635093688965, "learning_rate": 1.3162744179251396e-06, "loss": 0.2631, "step": 6679 }, { "epoch": 0.4758343127827047, "grad_norm": 2.280066728591919, "learning_rate": 1.3143271574894677e-06, "loss": 0.2066, "step": 6680 }, { "epoch": 0.475905545464259, "grad_norm": 2.3983232975006104, "learning_rate": 1.3123812371636691e-06, "loss": 0.3237, "step": 6681 }, { "epoch": 0.4759767781458133, "grad_norm": 5.346007347106934, "learning_rate": 1.3104366572479798e-06, "loss": 0.365, "step": 6682 }, { "epoch": 0.4760480108273676, "grad_norm": 3.2261884212493896, "learning_rate": 1.3084934180424324e-06, "loss": 0.5144, "step": 6683 }, { "epoch": 0.4761192435089219, "grad_norm": 2.5675857067108154, "learning_rate": 1.3065515198468425e-06, "loss": 0.563, "step": 6684 }, { "epoch": 0.47619047619047616, "grad_norm": 3.6305530071258545, "learning_rate": 1.3046109629608273e-06, "loss": 1.2937, "step": 6685 }, { "epoch": 0.4762617088720305, "grad_norm": 2.396578311920166, "learning_rate": 1.302671747683798e-06, "loss": 0.2519, "step": 6686 }, { "epoch": 0.4763329415535848, "grad_norm": 2.0750908851623535, "learning_rate": 1.3007338743149511e-06, "loss": 0.3173, "step": 6687 }, { "epoch": 0.4764041742351391, "grad_norm": 3.171759605407715, "learning_rate": 1.2987973431532818e-06, "loss": 0.5694, "step": 6688 }, { "epoch": 0.4764754069166934, "grad_norm": 3.499128580093384, "learning_rate": 1.296862154497579e-06, "loss": 0.1307, "step": 6689 }, { "epoch": 0.47654663959824767, "grad_norm": 2.235219717025757, "learning_rate": 1.2949283086464192e-06, "loss": 0.371, "step": 6690 }, { "epoch": 0.47661787227980196, "grad_norm": 5.18875789642334, "learning_rate": 1.2929958058981796e-06, "loss": 0.6027, "step": 6691 }, { "epoch": 0.47668910496135625, "grad_norm": 2.7113020420074463, "learning_rate": 1.291064646551019e-06, "loss": 0.3685, "step": 6692 }, { "epoch": 0.4767603376429106, "grad_norm": 2.6726129055023193, "learning_rate": 1.2891348309029005e-06, "loss": 0.5461, "step": 6693 }, { "epoch": 0.4768315703244649, "grad_norm": 2.675476312637329, "learning_rate": 1.2872063592515716e-06, "loss": 0.5156, "step": 6694 }, { "epoch": 0.47690280300601917, "grad_norm": 1.73629891872406, "learning_rate": 1.2852792318945773e-06, "loss": 0.245, "step": 6695 }, { "epoch": 0.47697403568757346, "grad_norm": 1.9426484107971191, "learning_rate": 1.2833534491292554e-06, "loss": 0.2578, "step": 6696 }, { "epoch": 0.47704526836912775, "grad_norm": 2.83271861076355, "learning_rate": 1.2814290112527295e-06, "loss": 0.2606, "step": 6697 }, { "epoch": 0.47711650105068204, "grad_norm": 3.453753709793091, "learning_rate": 1.279505918561923e-06, "loss": 0.3171, "step": 6698 }, { "epoch": 0.47718773373223633, "grad_norm": 3.0940284729003906, "learning_rate": 1.2775841713535532e-06, "loss": 0.5619, "step": 6699 }, { "epoch": 0.4772589664137906, "grad_norm": 1.9528040885925293, "learning_rate": 1.2756637699241181e-06, "loss": 0.2193, "step": 6700 }, { "epoch": 0.47733019909534496, "grad_norm": 5.8202667236328125, "learning_rate": 1.273744714569921e-06, "loss": 0.6264, "step": 6701 }, { "epoch": 0.47740143177689925, "grad_norm": 3.7324612140655518, "learning_rate": 1.271827005587054e-06, "loss": 0.1296, "step": 6702 }, { "epoch": 0.47747266445845354, "grad_norm": 1.5996700525283813, "learning_rate": 1.2699106432713947e-06, "loss": 0.2557, "step": 6703 }, { "epoch": 0.47754389714000783, "grad_norm": 2.8573782444000244, "learning_rate": 1.2679956279186234e-06, "loss": 0.629, "step": 6704 }, { "epoch": 0.4776151298215621, "grad_norm": 2.837320566177368, "learning_rate": 1.2660819598242013e-06, "loss": 0.417, "step": 6705 }, { "epoch": 0.4776863625031164, "grad_norm": 4.444062232971191, "learning_rate": 1.2641696392833935e-06, "loss": 0.612, "step": 6706 }, { "epoch": 0.4777575951846707, "grad_norm": 3.2451350688934326, "learning_rate": 1.262258666591246e-06, "loss": 0.5078, "step": 6707 }, { "epoch": 0.47782882786622505, "grad_norm": 2.5801877975463867, "learning_rate": 1.260349042042608e-06, "loss": 0.5273, "step": 6708 }, { "epoch": 0.47790006054777934, "grad_norm": 3.9341275691986084, "learning_rate": 1.2584407659321086e-06, "loss": 0.3881, "step": 6709 }, { "epoch": 0.4779712932293336, "grad_norm": 3.692490339279175, "learning_rate": 1.2565338385541792e-06, "loss": 0.5275, "step": 6710 }, { "epoch": 0.4780425259108879, "grad_norm": 2.8742947578430176, "learning_rate": 1.2546282602030402e-06, "loss": 0.3333, "step": 6711 }, { "epoch": 0.4781137585924422, "grad_norm": 3.216761350631714, "learning_rate": 1.2527240311726985e-06, "loss": 0.4925, "step": 6712 }, { "epoch": 0.4781849912739965, "grad_norm": 2.617849349975586, "learning_rate": 1.2508211517569592e-06, "loss": 0.4977, "step": 6713 }, { "epoch": 0.4782562239555508, "grad_norm": 3.800647735595703, "learning_rate": 1.2489196222494193e-06, "loss": 0.6696, "step": 6714 }, { "epoch": 0.47832745663710513, "grad_norm": 4.229685306549072, "learning_rate": 1.2470194429434601e-06, "loss": 0.2419, "step": 6715 }, { "epoch": 0.4783986893186594, "grad_norm": 4.766760349273682, "learning_rate": 1.2451206141322635e-06, "loss": 0.5912, "step": 6716 }, { "epoch": 0.4784699220002137, "grad_norm": 3.0035996437072754, "learning_rate": 1.243223136108801e-06, "loss": 0.3814, "step": 6717 }, { "epoch": 0.478541154681768, "grad_norm": 3.0768489837646484, "learning_rate": 1.241327009165828e-06, "loss": 0.736, "step": 6718 }, { "epoch": 0.4786123873633223, "grad_norm": 5.041527271270752, "learning_rate": 1.239432233595903e-06, "loss": 0.4569, "step": 6719 }, { "epoch": 0.4786836200448766, "grad_norm": 3.604030132293701, "learning_rate": 1.2375388096913666e-06, "loss": 0.6077, "step": 6720 }, { "epoch": 0.47875485272643087, "grad_norm": 2.6577908992767334, "learning_rate": 1.235646737744357e-06, "loss": 0.3653, "step": 6721 }, { "epoch": 0.47882608540798516, "grad_norm": 5.711407661437988, "learning_rate": 1.2337560180467988e-06, "loss": 0.3567, "step": 6722 }, { "epoch": 0.4788973180895395, "grad_norm": 2.623837947845459, "learning_rate": 1.2318666508904143e-06, "loss": 0.3269, "step": 6723 }, { "epoch": 0.4789685507710938, "grad_norm": 3.6601316928863525, "learning_rate": 1.2299786365667088e-06, "loss": 0.6149, "step": 6724 }, { "epoch": 0.4790397834526481, "grad_norm": 1.7194029092788696, "learning_rate": 1.2280919753669863e-06, "loss": 0.06, "step": 6725 }, { "epoch": 0.47911101613420237, "grad_norm": 3.0804858207702637, "learning_rate": 1.226206667582338e-06, "loss": 0.6511, "step": 6726 }, { "epoch": 0.47918224881575666, "grad_norm": 4.025754451751709, "learning_rate": 1.2243227135036517e-06, "loss": 0.7324, "step": 6727 }, { "epoch": 0.47925348149731095, "grad_norm": 2.6396055221557617, "learning_rate": 1.2224401134215957e-06, "loss": 0.4771, "step": 6728 }, { "epoch": 0.47932471417886524, "grad_norm": 2.993772029876709, "learning_rate": 1.220558867626639e-06, "loss": 0.5827, "step": 6729 }, { "epoch": 0.4793959468604196, "grad_norm": 2.9008493423461914, "learning_rate": 1.2186789764090412e-06, "loss": 0.7085, "step": 6730 }, { "epoch": 0.4794671795419739, "grad_norm": 2.8717167377471924, "learning_rate": 1.216800440058844e-06, "loss": 0.5565, "step": 6731 }, { "epoch": 0.47953841222352817, "grad_norm": 2.9439663887023926, "learning_rate": 1.21492325886589e-06, "loss": 0.5976, "step": 6732 }, { "epoch": 0.47960964490508246, "grad_norm": 2.8542001247406006, "learning_rate": 1.2130474331198106e-06, "loss": 0.3029, "step": 6733 }, { "epoch": 0.47968087758663674, "grad_norm": 2.841121196746826, "learning_rate": 1.2111729631100211e-06, "loss": 0.7306, "step": 6734 }, { "epoch": 0.47975211026819103, "grad_norm": 3.0378100872039795, "learning_rate": 1.209299849125739e-06, "loss": 0.6679, "step": 6735 }, { "epoch": 0.4798233429497453, "grad_norm": 6.575226783752441, "learning_rate": 1.2074280914559634e-06, "loss": 0.7616, "step": 6736 }, { "epoch": 0.4798945756312996, "grad_norm": 3.4652700424194336, "learning_rate": 1.205557690389485e-06, "loss": 0.1701, "step": 6737 }, { "epoch": 0.47996580831285396, "grad_norm": 3.055183172225952, "learning_rate": 1.20368864621489e-06, "loss": 0.5192, "step": 6738 }, { "epoch": 0.48003704099440825, "grad_norm": 2.6634652614593506, "learning_rate": 1.2018209592205542e-06, "loss": 0.4973, "step": 6739 }, { "epoch": 0.48010827367596254, "grad_norm": 2.1813502311706543, "learning_rate": 1.1999546296946386e-06, "loss": 0.2408, "step": 6740 }, { "epoch": 0.4801795063575168, "grad_norm": 2.909623384475708, "learning_rate": 1.198089657925101e-06, "loss": 0.4847, "step": 6741 }, { "epoch": 0.4802507390390711, "grad_norm": 3.9206454753875732, "learning_rate": 1.1962260441996888e-06, "loss": 0.7095, "step": 6742 }, { "epoch": 0.4803219717206254, "grad_norm": 2.910741090774536, "learning_rate": 1.1943637888059346e-06, "loss": 0.596, "step": 6743 }, { "epoch": 0.4803932044021797, "grad_norm": 2.9206740856170654, "learning_rate": 1.1925028920311676e-06, "loss": 0.5275, "step": 6744 }, { "epoch": 0.48046443708373404, "grad_norm": 5.6976165771484375, "learning_rate": 1.1906433541625063e-06, "loss": 0.5067, "step": 6745 }, { "epoch": 0.48053566976528833, "grad_norm": 2.184295892715454, "learning_rate": 1.1887851754868551e-06, "loss": 0.2833, "step": 6746 }, { "epoch": 0.4806069024468426, "grad_norm": 2.880514621734619, "learning_rate": 1.1869283562909128e-06, "loss": 0.4446, "step": 6747 }, { "epoch": 0.4806781351283969, "grad_norm": 1.78355872631073, "learning_rate": 1.1850728968611702e-06, "loss": 0.1126, "step": 6748 }, { "epoch": 0.4807493678099512, "grad_norm": 4.620299816131592, "learning_rate": 1.1832187974839015e-06, "loss": 0.5406, "step": 6749 }, { "epoch": 0.4808206004915055, "grad_norm": 3.034471035003662, "learning_rate": 1.181366058445179e-06, "loss": 0.4019, "step": 6750 }, { "epoch": 0.4808918331730598, "grad_norm": 2.3429527282714844, "learning_rate": 1.17951468003086e-06, "loss": 0.2434, "step": 6751 }, { "epoch": 0.4809630658546141, "grad_norm": 2.3042080402374268, "learning_rate": 1.1776646625265897e-06, "loss": 0.3952, "step": 6752 }, { "epoch": 0.4810342985361684, "grad_norm": 3.3723161220550537, "learning_rate": 1.1758160062178093e-06, "loss": 0.6149, "step": 6753 }, { "epoch": 0.4811055312177227, "grad_norm": 1.8691256046295166, "learning_rate": 1.1739687113897501e-06, "loss": 0.4459, "step": 6754 }, { "epoch": 0.481176763899277, "grad_norm": 5.147912979125977, "learning_rate": 1.1721227783274259e-06, "loss": 0.6916, "step": 6755 }, { "epoch": 0.4812479965808313, "grad_norm": 3.3752524852752686, "learning_rate": 1.1702782073156482e-06, "loss": 0.5254, "step": 6756 }, { "epoch": 0.4813192292623856, "grad_norm": 2.766251564025879, "learning_rate": 1.1684349986390154e-06, "loss": 0.4284, "step": 6757 }, { "epoch": 0.48139046194393986, "grad_norm": 2.2808353900909424, "learning_rate": 1.166593152581914e-06, "loss": 0.6032, "step": 6758 }, { "epoch": 0.48146169462549415, "grad_norm": 2.3275344371795654, "learning_rate": 1.1647526694285216e-06, "loss": 0.363, "step": 6759 }, { "epoch": 0.4815329273070485, "grad_norm": 3.8492236137390137, "learning_rate": 1.1629135494628097e-06, "loss": 0.4515, "step": 6760 }, { "epoch": 0.4816041599886028, "grad_norm": 4.225333213806152, "learning_rate": 1.1610757929685301e-06, "loss": 0.3807, "step": 6761 }, { "epoch": 0.4816753926701571, "grad_norm": 4.6221418380737305, "learning_rate": 1.1592394002292328e-06, "loss": 0.5335, "step": 6762 }, { "epoch": 0.48174662535171137, "grad_norm": 2.655164957046509, "learning_rate": 1.1574043715282557e-06, "loss": 0.3772, "step": 6763 }, { "epoch": 0.48181785803326566, "grad_norm": 3.5539486408233643, "learning_rate": 1.155570707148721e-06, "loss": 0.4585, "step": 6764 }, { "epoch": 0.48188909071481995, "grad_norm": 3.985091209411621, "learning_rate": 1.153738407373548e-06, "loss": 0.4468, "step": 6765 }, { "epoch": 0.48196032339637424, "grad_norm": 2.6089844703674316, "learning_rate": 1.1519074724854373e-06, "loss": 0.5758, "step": 6766 }, { "epoch": 0.4820315560779286, "grad_norm": 4.2450337409973145, "learning_rate": 1.1500779027668885e-06, "loss": 0.5131, "step": 6767 }, { "epoch": 0.48210278875948287, "grad_norm": 2.4200856685638428, "learning_rate": 1.1482496985001812e-06, "loss": 0.4191, "step": 6768 }, { "epoch": 0.48217402144103716, "grad_norm": 3.51727032661438, "learning_rate": 1.1464228599673889e-06, "loss": 0.7026, "step": 6769 }, { "epoch": 0.48224525412259145, "grad_norm": 4.110243797302246, "learning_rate": 1.144597387450378e-06, "loss": 0.9646, "step": 6770 }, { "epoch": 0.48231648680414574, "grad_norm": 2.545754909515381, "learning_rate": 1.1427732812307945e-06, "loss": 0.3414, "step": 6771 }, { "epoch": 0.48238771948570003, "grad_norm": 2.156074047088623, "learning_rate": 1.1409505415900823e-06, "loss": 0.3985, "step": 6772 }, { "epoch": 0.4824589521672543, "grad_norm": 3.190268039703369, "learning_rate": 1.139129168809473e-06, "loss": 0.434, "step": 6773 }, { "epoch": 0.4825301848488086, "grad_norm": 3.390319585800171, "learning_rate": 1.1373091631699817e-06, "loss": 0.9144, "step": 6774 }, { "epoch": 0.48260141753036295, "grad_norm": 1.703062891960144, "learning_rate": 1.1354905249524184e-06, "loss": 0.1667, "step": 6775 }, { "epoch": 0.48267265021191724, "grad_norm": 2.5753061771392822, "learning_rate": 1.133673254437383e-06, "loss": 0.5212, "step": 6776 }, { "epoch": 0.48274388289347153, "grad_norm": 6.860708713531494, "learning_rate": 1.1318573519052556e-06, "loss": 0.3258, "step": 6777 }, { "epoch": 0.4828151155750258, "grad_norm": 3.8226542472839355, "learning_rate": 1.1300428176362155e-06, "loss": 0.6135, "step": 6778 }, { "epoch": 0.4828863482565801, "grad_norm": 2.6273186206817627, "learning_rate": 1.1282296519102277e-06, "loss": 0.0359, "step": 6779 }, { "epoch": 0.4829575809381344, "grad_norm": 1.750291109085083, "learning_rate": 1.1264178550070427e-06, "loss": 0.2321, "step": 6780 }, { "epoch": 0.4830288136196887, "grad_norm": 3.362609624862671, "learning_rate": 1.1246074272062012e-06, "loss": 0.7619, "step": 6781 }, { "epoch": 0.48310004630124304, "grad_norm": 2.6145529747009277, "learning_rate": 1.1227983687870358e-06, "loss": 0.447, "step": 6782 }, { "epoch": 0.4831712789827973, "grad_norm": 2.5232248306274414, "learning_rate": 1.120990680028663e-06, "loss": 0.5158, "step": 6783 }, { "epoch": 0.4832425116643516, "grad_norm": 3.8176090717315674, "learning_rate": 1.119184361209993e-06, "loss": 0.4723, "step": 6784 }, { "epoch": 0.4833137443459059, "grad_norm": 4.417736530303955, "learning_rate": 1.1173794126097226e-06, "loss": 0.4752, "step": 6785 }, { "epoch": 0.4833849770274602, "grad_norm": 2.6440069675445557, "learning_rate": 1.1155758345063328e-06, "loss": 0.5167, "step": 6786 }, { "epoch": 0.4834562097090145, "grad_norm": 3.9820897579193115, "learning_rate": 1.1137736271781007e-06, "loss": 0.1995, "step": 6787 }, { "epoch": 0.4835274423905688, "grad_norm": 3.4895236492156982, "learning_rate": 1.1119727909030897e-06, "loss": 0.5923, "step": 6788 }, { "epoch": 0.48359867507212306, "grad_norm": 3.015716791152954, "learning_rate": 1.1101733259591453e-06, "loss": 0.6348, "step": 6789 }, { "epoch": 0.4836699077536774, "grad_norm": 2.5619330406188965, "learning_rate": 1.1083752326239094e-06, "loss": 0.5646, "step": 6790 }, { "epoch": 0.4837411404352317, "grad_norm": 2.944471597671509, "learning_rate": 1.1065785111748117e-06, "loss": 0.7776, "step": 6791 }, { "epoch": 0.483812373116786, "grad_norm": 5.469735622406006, "learning_rate": 1.1047831618890625e-06, "loss": 0.3934, "step": 6792 }, { "epoch": 0.4838836057983403, "grad_norm": 2.328559637069702, "learning_rate": 1.1029891850436691e-06, "loss": 0.3686, "step": 6793 }, { "epoch": 0.48395483847989457, "grad_norm": 2.23797345161438, "learning_rate": 1.1011965809154245e-06, "loss": 0.2459, "step": 6794 }, { "epoch": 0.48402607116144886, "grad_norm": 3.2184319496154785, "learning_rate": 1.0994053497809077e-06, "loss": 0.5334, "step": 6795 }, { "epoch": 0.48409730384300315, "grad_norm": 3.5021791458129883, "learning_rate": 1.097615491916485e-06, "loss": 0.6976, "step": 6796 }, { "epoch": 0.4841685365245575, "grad_norm": 4.545474052429199, "learning_rate": 1.0958270075983167e-06, "loss": 0.7243, "step": 6797 }, { "epoch": 0.4842397692061118, "grad_norm": 5.447112083435059, "learning_rate": 1.0940398971023447e-06, "loss": 0.5872, "step": 6798 }, { "epoch": 0.48431100188766607, "grad_norm": 2.162572145462036, "learning_rate": 1.0922541607043024e-06, "loss": 0.2455, "step": 6799 }, { "epoch": 0.48438223456922036, "grad_norm": 2.101013422012329, "learning_rate": 1.0904697986797131e-06, "loss": 0.3885, "step": 6800 }, { "epoch": 0.48445346725077465, "grad_norm": 3.637101411819458, "learning_rate": 1.0886868113038817e-06, "loss": 0.5714, "step": 6801 }, { "epoch": 0.48452469993232894, "grad_norm": 4.817246437072754, "learning_rate": 1.0869051988519063e-06, "loss": 0.6792, "step": 6802 }, { "epoch": 0.48459593261388323, "grad_norm": 2.9785044193267822, "learning_rate": 1.0851249615986715e-06, "loss": 0.4883, "step": 6803 }, { "epoch": 0.4846671652954376, "grad_norm": 1.8277772665023804, "learning_rate": 1.0833460998188516e-06, "loss": 0.2329, "step": 6804 }, { "epoch": 0.48473839797699186, "grad_norm": 2.816361904144287, "learning_rate": 1.081568613786903e-06, "loss": 0.4141, "step": 6805 }, { "epoch": 0.48480963065854615, "grad_norm": 2.222522258758545, "learning_rate": 1.079792503777075e-06, "loss": 0.1282, "step": 6806 }, { "epoch": 0.48488086334010044, "grad_norm": 3.156116485595703, "learning_rate": 1.0780177700634053e-06, "loss": 0.6593, "step": 6807 }, { "epoch": 0.48495209602165473, "grad_norm": 3.8780031204223633, "learning_rate": 1.0762444129197136e-06, "loss": 0.3442, "step": 6808 }, { "epoch": 0.485023328703209, "grad_norm": 3.932863235473633, "learning_rate": 1.0744724326196133e-06, "loss": 0.4766, "step": 6809 }, { "epoch": 0.4850945613847633, "grad_norm": 2.660362482070923, "learning_rate": 1.0727018294364999e-06, "loss": 0.4509, "step": 6810 }, { "epoch": 0.4851657940663176, "grad_norm": 2.3607850074768066, "learning_rate": 1.070932603643563e-06, "loss": 0.1815, "step": 6811 }, { "epoch": 0.48523702674787195, "grad_norm": 7.4851789474487305, "learning_rate": 1.0691647555137719e-06, "loss": 1.0609, "step": 6812 }, { "epoch": 0.48530825942942624, "grad_norm": 4.357445240020752, "learning_rate": 1.0673982853198906e-06, "loss": 0.7399, "step": 6813 }, { "epoch": 0.4853794921109805, "grad_norm": 2.899803400039673, "learning_rate": 1.0656331933344643e-06, "loss": 0.5985, "step": 6814 }, { "epoch": 0.4854507247925348, "grad_norm": 2.3384623527526855, "learning_rate": 1.06386947982983e-06, "loss": 0.2726, "step": 6815 }, { "epoch": 0.4855219574740891, "grad_norm": 5.576411247253418, "learning_rate": 1.0621071450781118e-06, "loss": 0.1379, "step": 6816 }, { "epoch": 0.4855931901556434, "grad_norm": 1.7567386627197266, "learning_rate": 1.060346189351218e-06, "loss": 0.1937, "step": 6817 }, { "epoch": 0.4856644228371977, "grad_norm": 2.353579044342041, "learning_rate": 1.0585866129208456e-06, "loss": 0.2131, "step": 6818 }, { "epoch": 0.48573565551875203, "grad_norm": 2.8311655521392822, "learning_rate": 1.0568284160584818e-06, "loss": 0.4819, "step": 6819 }, { "epoch": 0.4858068882003063, "grad_norm": 3.4968879222869873, "learning_rate": 1.0550715990353955e-06, "loss": 0.0768, "step": 6820 }, { "epoch": 0.4858781208818606, "grad_norm": 2.9307119846343994, "learning_rate": 1.0533161621226463e-06, "loss": 0.2942, "step": 6821 }, { "epoch": 0.4859493535634149, "grad_norm": 5.359652996063232, "learning_rate": 1.051562105591082e-06, "loss": 0.7248, "step": 6822 }, { "epoch": 0.4860205862449692, "grad_norm": 2.2696259021759033, "learning_rate": 1.0498094297113314e-06, "loss": 0.3161, "step": 6823 }, { "epoch": 0.4860918189265235, "grad_norm": 7.495211601257324, "learning_rate": 1.0480581347538199e-06, "loss": 0.65, "step": 6824 }, { "epoch": 0.48616305160807777, "grad_norm": 3.442379951477051, "learning_rate": 1.0463082209887477e-06, "loss": 0.5995, "step": 6825 }, { "epoch": 0.48623428428963206, "grad_norm": 3.6434929370880127, "learning_rate": 1.0445596886861143e-06, "loss": 0.6497, "step": 6826 }, { "epoch": 0.4863055169711864, "grad_norm": 4.444101810455322, "learning_rate": 1.0428125381156962e-06, "loss": 0.4133, "step": 6827 }, { "epoch": 0.4863767496527407, "grad_norm": 3.4162282943725586, "learning_rate": 1.0410667695470633e-06, "loss": 0.5382, "step": 6828 }, { "epoch": 0.486447982334295, "grad_norm": 9.455236434936523, "learning_rate": 1.039322383249568e-06, "loss": 0.9076, "step": 6829 }, { "epoch": 0.48651921501584927, "grad_norm": 4.115329742431641, "learning_rate": 1.0375793794923505e-06, "loss": 0.7909, "step": 6830 }, { "epoch": 0.48659044769740356, "grad_norm": 6.62841272354126, "learning_rate": 1.0358377585443424e-06, "loss": 0.5553, "step": 6831 }, { "epoch": 0.48666168037895785, "grad_norm": 2.334926128387451, "learning_rate": 1.0340975206742531e-06, "loss": 0.4455, "step": 6832 }, { "epoch": 0.48673291306051214, "grad_norm": 5.51884126663208, "learning_rate": 1.0323586661505858e-06, "loss": 0.3244, "step": 6833 }, { "epoch": 0.4868041457420665, "grad_norm": 4.566254615783691, "learning_rate": 1.030621195241629e-06, "loss": 0.4909, "step": 6834 }, { "epoch": 0.4868753784236208, "grad_norm": 2.6384363174438477, "learning_rate": 1.0288851082154528e-06, "loss": 0.8499, "step": 6835 }, { "epoch": 0.48694661110517506, "grad_norm": 2.7153141498565674, "learning_rate": 1.0271504053399195e-06, "loss": 0.5196, "step": 6836 }, { "epoch": 0.48701784378672935, "grad_norm": 2.9260287284851074, "learning_rate": 1.0254170868826796e-06, "loss": 0.5672, "step": 6837 }, { "epoch": 0.48708907646828364, "grad_norm": 2.1708598136901855, "learning_rate": 1.0236851531111592e-06, "loss": 0.4821, "step": 6838 }, { "epoch": 0.48716030914983793, "grad_norm": 2.8686411380767822, "learning_rate": 1.0219546042925842e-06, "loss": 0.1863, "step": 6839 }, { "epoch": 0.4872315418313922, "grad_norm": 6.473099708557129, "learning_rate": 1.020225440693956e-06, "loss": 0.6234, "step": 6840 }, { "epoch": 0.4873027745129465, "grad_norm": 2.989523410797119, "learning_rate": 1.0184976625820707e-06, "loss": 0.6415, "step": 6841 }, { "epoch": 0.48737400719450086, "grad_norm": 3.7360410690307617, "learning_rate": 1.0167712702235023e-06, "loss": 0.4253, "step": 6842 }, { "epoch": 0.48744523987605515, "grad_norm": 3.221451997756958, "learning_rate": 1.015046263884617e-06, "loss": 0.2292, "step": 6843 }, { "epoch": 0.48751647255760944, "grad_norm": 2.7488138675689697, "learning_rate": 1.013322643831569e-06, "loss": 0.581, "step": 6844 }, { "epoch": 0.4875877052391637, "grad_norm": 4.380011081695557, "learning_rate": 1.011600410330289e-06, "loss": 0.604, "step": 6845 }, { "epoch": 0.487658937920718, "grad_norm": 4.3793110847473145, "learning_rate": 1.0098795636465042e-06, "loss": 0.367, "step": 6846 }, { "epoch": 0.4877301706022723, "grad_norm": 3.0893375873565674, "learning_rate": 1.0081601040457246e-06, "loss": 0.5985, "step": 6847 }, { "epoch": 0.4878014032838266, "grad_norm": 2.867900848388672, "learning_rate": 1.00644203179324e-06, "loss": 0.7272, "step": 6848 }, { "epoch": 0.48787263596538094, "grad_norm": 2.8948705196380615, "learning_rate": 1.004725347154134e-06, "loss": 0.5653, "step": 6849 }, { "epoch": 0.48794386864693523, "grad_norm": 2.9083571434020996, "learning_rate": 1.0030100503932761e-06, "loss": 0.1797, "step": 6850 }, { "epoch": 0.4880151013284895, "grad_norm": 5.001065731048584, "learning_rate": 1.0012961417753142e-06, "loss": 0.8412, "step": 6851 }, { "epoch": 0.4880863340100438, "grad_norm": 3.7148561477661133, "learning_rate": 9.995836215646892e-07, "loss": 0.2816, "step": 6852 }, { "epoch": 0.4881575666915981, "grad_norm": 2.839829206466675, "learning_rate": 9.978724900256265e-07, "loss": 0.416, "step": 6853 }, { "epoch": 0.4882287993731524, "grad_norm": 2.9991135597229004, "learning_rate": 9.961627474221324e-07, "loss": 0.8047, "step": 6854 }, { "epoch": 0.4883000320547067, "grad_norm": 5.505061626434326, "learning_rate": 9.944543940180074e-07, "loss": 0.4095, "step": 6855 }, { "epoch": 0.488371264736261, "grad_norm": 2.382655143737793, "learning_rate": 9.927474300768303e-07, "loss": 0.3477, "step": 6856 }, { "epoch": 0.4884424974178153, "grad_norm": 3.3052632808685303, "learning_rate": 9.91041855861965e-07, "loss": 0.7244, "step": 6857 }, { "epoch": 0.4885137300993696, "grad_norm": 2.5866942405700684, "learning_rate": 9.893376716365677e-07, "loss": 0.5067, "step": 6858 }, { "epoch": 0.4885849627809239, "grad_norm": 2.290200710296631, "learning_rate": 9.87634877663578e-07, "loss": 0.1635, "step": 6859 }, { "epoch": 0.4886561954624782, "grad_norm": 1.9446641206741333, "learning_rate": 9.859334742057158e-07, "loss": 0.1821, "step": 6860 }, { "epoch": 0.48872742814403247, "grad_norm": 3.0201687812805176, "learning_rate": 9.842334615254901e-07, "loss": 0.3567, "step": 6861 }, { "epoch": 0.48879866082558676, "grad_norm": 10.118295669555664, "learning_rate": 9.825348398851998e-07, "loss": 0.4435, "step": 6862 }, { "epoch": 0.48886989350714105, "grad_norm": 2.012267827987671, "learning_rate": 9.808376095469196e-07, "loss": 0.3766, "step": 6863 }, { "epoch": 0.4889411261886954, "grad_norm": 2.8989098072052, "learning_rate": 9.791417707725171e-07, "loss": 0.5021, "step": 6864 }, { "epoch": 0.4890123588702497, "grad_norm": 5.650821208953857, "learning_rate": 9.774473238236449e-07, "loss": 0.5408, "step": 6865 }, { "epoch": 0.489083591551804, "grad_norm": 3.016995668411255, "learning_rate": 9.757542689617328e-07, "loss": 0.4736, "step": 6866 }, { "epoch": 0.48915482423335827, "grad_norm": 3.0431363582611084, "learning_rate": 9.740626064480063e-07, "loss": 0.3035, "step": 6867 }, { "epoch": 0.48922605691491255, "grad_norm": 3.1950039863586426, "learning_rate": 9.723723365434722e-07, "loss": 0.4621, "step": 6868 }, { "epoch": 0.48929728959646684, "grad_norm": 2.288564443588257, "learning_rate": 9.706834595089187e-07, "loss": 0.1941, "step": 6869 }, { "epoch": 0.48936852227802113, "grad_norm": 2.4997146129608154, "learning_rate": 9.68995975604925e-07, "loss": 0.1678, "step": 6870 }, { "epoch": 0.4894397549595755, "grad_norm": 3.134751081466675, "learning_rate": 9.673098850918506e-07, "loss": 0.6879, "step": 6871 }, { "epoch": 0.48951098764112977, "grad_norm": 3.51460337638855, "learning_rate": 9.656251882298394e-07, "loss": 0.6849, "step": 6872 }, { "epoch": 0.48958222032268406, "grad_norm": 3.1705968379974365, "learning_rate": 9.639418852788274e-07, "loss": 0.534, "step": 6873 }, { "epoch": 0.48965345300423835, "grad_norm": 4.000990390777588, "learning_rate": 9.622599764985297e-07, "loss": 0.5394, "step": 6874 }, { "epoch": 0.48972468568579264, "grad_norm": 5.095405578613281, "learning_rate": 9.605794621484455e-07, "loss": 0.3263, "step": 6875 }, { "epoch": 0.4897959183673469, "grad_norm": 5.771871566772461, "learning_rate": 9.589003424878618e-07, "loss": 0.3636, "step": 6876 }, { "epoch": 0.4898671510489012, "grad_norm": 10.596820831298828, "learning_rate": 9.572226177758514e-07, "loss": 0.1987, "step": 6877 }, { "epoch": 0.4899383837304555, "grad_norm": 2.5591206550598145, "learning_rate": 9.555462882712684e-07, "loss": 0.3733, "step": 6878 }, { "epoch": 0.49000961641200985, "grad_norm": 2.5082337856292725, "learning_rate": 9.538713542327527e-07, "loss": 0.18, "step": 6879 }, { "epoch": 0.49008084909356414, "grad_norm": 4.608580589294434, "learning_rate": 9.521978159187295e-07, "loss": 0.3437, "step": 6880 }, { "epoch": 0.49015208177511843, "grad_norm": 3.233171224594116, "learning_rate": 9.505256735874113e-07, "loss": 0.456, "step": 6881 }, { "epoch": 0.4902233144566727, "grad_norm": 4.31549596786499, "learning_rate": 9.488549274967873e-07, "loss": 0.498, "step": 6882 }, { "epoch": 0.490294547138227, "grad_norm": 1.6665531396865845, "learning_rate": 9.471855779046424e-07, "loss": 0.2603, "step": 6883 }, { "epoch": 0.4903657798197813, "grad_norm": 3.323343276977539, "learning_rate": 9.455176250685338e-07, "loss": 0.4051, "step": 6884 }, { "epoch": 0.4904370125013356, "grad_norm": 5.789434432983398, "learning_rate": 9.438510692458147e-07, "loss": 0.3179, "step": 6885 }, { "epoch": 0.49050824518288993, "grad_norm": 3.379242181777954, "learning_rate": 9.421859106936138e-07, "loss": 0.6076, "step": 6886 }, { "epoch": 0.4905794778644442, "grad_norm": 2.242483139038086, "learning_rate": 9.40522149668851e-07, "loss": 0.4758, "step": 6887 }, { "epoch": 0.4906507105459985, "grad_norm": 1.8197773694992065, "learning_rate": 9.388597864282245e-07, "loss": 0.1404, "step": 6888 }, { "epoch": 0.4907219432275528, "grad_norm": 2.1024560928344727, "learning_rate": 9.371988212282212e-07, "loss": 0.2904, "step": 6889 }, { "epoch": 0.4907931759091071, "grad_norm": 2.4462544918060303, "learning_rate": 9.355392543251119e-07, "loss": 0.2597, "step": 6890 }, { "epoch": 0.4908644085906614, "grad_norm": 3.0274150371551514, "learning_rate": 9.338810859749492e-07, "loss": 0.6057, "step": 6891 }, { "epoch": 0.4909356412722157, "grad_norm": 4.7427167892456055, "learning_rate": 9.322243164335709e-07, "loss": 0.4951, "step": 6892 }, { "epoch": 0.49100687395376996, "grad_norm": 4.452565670013428, "learning_rate": 9.305689459566025e-07, "loss": 0.7306, "step": 6893 }, { "epoch": 0.4910781066353243, "grad_norm": 3.7484536170959473, "learning_rate": 9.289149747994475e-07, "loss": 0.4062, "step": 6894 }, { "epoch": 0.4911493393168786, "grad_norm": 2.2982165813446045, "learning_rate": 9.272624032172972e-07, "loss": 0.6548, "step": 6895 }, { "epoch": 0.4912205719984329, "grad_norm": 6.028674125671387, "learning_rate": 9.2561123146513e-07, "loss": 0.2481, "step": 6896 }, { "epoch": 0.4912918046799872, "grad_norm": 3.776531219482422, "learning_rate": 9.239614597976987e-07, "loss": 0.9238, "step": 6897 }, { "epoch": 0.49136303736154147, "grad_norm": 2.8779706954956055, "learning_rate": 9.223130884695486e-07, "loss": 0.5188, "step": 6898 }, { "epoch": 0.49143427004309576, "grad_norm": 2.0036349296569824, "learning_rate": 9.206661177350096e-07, "loss": 0.2021, "step": 6899 }, { "epoch": 0.49150550272465005, "grad_norm": 2.6250667572021484, "learning_rate": 9.190205478481895e-07, "loss": 0.5555, "step": 6900 }, { "epoch": 0.4915767354062044, "grad_norm": 4.382199764251709, "learning_rate": 9.173763790629808e-07, "loss": 0.3584, "step": 6901 }, { "epoch": 0.4916479680877587, "grad_norm": 3.375939130783081, "learning_rate": 9.15733611633065e-07, "loss": 0.4643, "step": 6902 }, { "epoch": 0.49171920076931297, "grad_norm": 2.935966730117798, "learning_rate": 9.140922458119028e-07, "loss": 0.345, "step": 6903 }, { "epoch": 0.49179043345086726, "grad_norm": 2.2760441303253174, "learning_rate": 9.124522818527393e-07, "loss": 0.2691, "step": 6904 }, { "epoch": 0.49186166613242155, "grad_norm": 3.4746475219726562, "learning_rate": 9.108137200086076e-07, "loss": 0.7823, "step": 6905 }, { "epoch": 0.49193289881397584, "grad_norm": 3.3325369358062744, "learning_rate": 9.091765605323155e-07, "loss": 0.7443, "step": 6906 }, { "epoch": 0.49200413149553013, "grad_norm": 3.0286998748779297, "learning_rate": 9.075408036764633e-07, "loss": 0.6487, "step": 6907 }, { "epoch": 0.4920753641770845, "grad_norm": 3.0871455669403076, "learning_rate": 9.059064496934333e-07, "loss": 0.6201, "step": 6908 }, { "epoch": 0.49214659685863876, "grad_norm": 3.3309617042541504, "learning_rate": 9.042734988353841e-07, "loss": 0.7058, "step": 6909 }, { "epoch": 0.49221782954019305, "grad_norm": 3.254223585128784, "learning_rate": 9.026419513542673e-07, "loss": 0.3298, "step": 6910 }, { "epoch": 0.49228906222174734, "grad_norm": 6.902993679046631, "learning_rate": 9.010118075018137e-07, "loss": 0.4891, "step": 6911 }, { "epoch": 0.49236029490330163, "grad_norm": 1.806561827659607, "learning_rate": 8.993830675295345e-07, "loss": 0.3673, "step": 6912 }, { "epoch": 0.4924315275848559, "grad_norm": 3.0861849784851074, "learning_rate": 8.977557316887309e-07, "loss": 0.608, "step": 6913 }, { "epoch": 0.4925027602664102, "grad_norm": 5.912367820739746, "learning_rate": 8.961298002304841e-07, "loss": 0.3829, "step": 6914 }, { "epoch": 0.4925739929479645, "grad_norm": 3.6014726161956787, "learning_rate": 8.945052734056581e-07, "loss": 0.7214, "step": 6915 }, { "epoch": 0.49264522562951885, "grad_norm": 3.054527997970581, "learning_rate": 8.928821514648977e-07, "loss": 0.7013, "step": 6916 }, { "epoch": 0.49271645831107314, "grad_norm": 1.9560402631759644, "learning_rate": 8.912604346586362e-07, "loss": 0.1256, "step": 6917 }, { "epoch": 0.4927876909926274, "grad_norm": 4.856667518615723, "learning_rate": 8.896401232370889e-07, "loss": 0.4805, "step": 6918 }, { "epoch": 0.4928589236741817, "grad_norm": 4.076572895050049, "learning_rate": 8.880212174502512e-07, "loss": 0.7536, "step": 6919 }, { "epoch": 0.492930156355736, "grad_norm": 4.3914923667907715, "learning_rate": 8.864037175479034e-07, "loss": 0.4675, "step": 6920 }, { "epoch": 0.4930013890372903, "grad_norm": 3.151677131652832, "learning_rate": 8.847876237796127e-07, "loss": 0.8723, "step": 6921 }, { "epoch": 0.4930726217188446, "grad_norm": 2.6197128295898438, "learning_rate": 8.831729363947216e-07, "loss": 0.5949, "step": 6922 }, { "epoch": 0.49314385440039893, "grad_norm": 4.393150329589844, "learning_rate": 8.815596556423611e-07, "loss": 0.4125, "step": 6923 }, { "epoch": 0.4932150870819532, "grad_norm": 2.495401620864868, "learning_rate": 8.799477817714452e-07, "loss": 0.3969, "step": 6924 }, { "epoch": 0.4932863197635075, "grad_norm": 2.5357189178466797, "learning_rate": 8.783373150306663e-07, "loss": 0.2539, "step": 6925 }, { "epoch": 0.4933575524450618, "grad_norm": 1.9841082096099854, "learning_rate": 8.767282556685053e-07, "loss": 0.1705, "step": 6926 }, { "epoch": 0.4934287851266161, "grad_norm": 3.923297166824341, "learning_rate": 8.75120603933225e-07, "loss": 0.4196, "step": 6927 }, { "epoch": 0.4935000178081704, "grad_norm": 2.986379623413086, "learning_rate": 8.735143600728646e-07, "loss": 0.6734, "step": 6928 }, { "epoch": 0.49357125048972467, "grad_norm": 3.682682991027832, "learning_rate": 8.71909524335256e-07, "loss": 0.5074, "step": 6929 }, { "epoch": 0.49364248317127896, "grad_norm": 2.711554765701294, "learning_rate": 8.703060969680055e-07, "loss": 0.4713, "step": 6930 }, { "epoch": 0.4937137158528333, "grad_norm": 2.264694929122925, "learning_rate": 8.687040782185074e-07, "loss": 0.2988, "step": 6931 }, { "epoch": 0.4937849485343876, "grad_norm": 2.9945425987243652, "learning_rate": 8.671034683339352e-07, "loss": 0.9607, "step": 6932 }, { "epoch": 0.4938561812159419, "grad_norm": 2.4905059337615967, "learning_rate": 8.65504267561248e-07, "loss": 0.3987, "step": 6933 }, { "epoch": 0.49392741389749617, "grad_norm": 3.3012304306030273, "learning_rate": 8.639064761471838e-07, "loss": 0.3767, "step": 6934 }, { "epoch": 0.49399864657905046, "grad_norm": 2.1931920051574707, "learning_rate": 8.623100943382667e-07, "loss": 0.2751, "step": 6935 }, { "epoch": 0.49406987926060475, "grad_norm": 2.9938712120056152, "learning_rate": 8.607151223808041e-07, "loss": 0.3261, "step": 6936 }, { "epoch": 0.49414111194215904, "grad_norm": 4.935549736022949, "learning_rate": 8.591215605208791e-07, "loss": 0.6788, "step": 6937 }, { "epoch": 0.4942123446237134, "grad_norm": 5.832387924194336, "learning_rate": 8.575294090043651e-07, "loss": 0.6813, "step": 6938 }, { "epoch": 0.4942835773052677, "grad_norm": 4.491652965545654, "learning_rate": 8.559386680769166e-07, "loss": 0.5747, "step": 6939 }, { "epoch": 0.49435480998682196, "grad_norm": 3.0779848098754883, "learning_rate": 8.543493379839629e-07, "loss": 0.42, "step": 6940 }, { "epoch": 0.49442604266837625, "grad_norm": 1.5512157678604126, "learning_rate": 8.527614189707245e-07, "loss": 0.211, "step": 6941 }, { "epoch": 0.49449727534993054, "grad_norm": 2.7590739727020264, "learning_rate": 8.511749112822032e-07, "loss": 0.5027, "step": 6942 }, { "epoch": 0.49456850803148483, "grad_norm": 2.75046706199646, "learning_rate": 8.495898151631765e-07, "loss": 0.4612, "step": 6943 }, { "epoch": 0.4946397407130391, "grad_norm": 2.5387589931488037, "learning_rate": 8.480061308582122e-07, "loss": 0.332, "step": 6944 }, { "epoch": 0.4947109733945934, "grad_norm": 2.9802141189575195, "learning_rate": 8.464238586116524e-07, "loss": 0.8421, "step": 6945 }, { "epoch": 0.49478220607614776, "grad_norm": 2.658884286880493, "learning_rate": 8.448429986676298e-07, "loss": 0.3173, "step": 6946 }, { "epoch": 0.49485343875770205, "grad_norm": 2.6953814029693604, "learning_rate": 8.432635512700505e-07, "loss": 0.8285, "step": 6947 }, { "epoch": 0.49492467143925634, "grad_norm": 3.1267330646514893, "learning_rate": 8.416855166626114e-07, "loss": 0.5126, "step": 6948 }, { "epoch": 0.4949959041208106, "grad_norm": 4.763150691986084, "learning_rate": 8.401088950887826e-07, "loss": 0.3223, "step": 6949 }, { "epoch": 0.4950671368023649, "grad_norm": 3.278834819793701, "learning_rate": 8.385336867918226e-07, "loss": 0.5392, "step": 6950 }, { "epoch": 0.4951383694839192, "grad_norm": 2.4613542556762695, "learning_rate": 8.369598920147715e-07, "loss": 0.4374, "step": 6951 }, { "epoch": 0.4952096021654735, "grad_norm": 1.959525227546692, "learning_rate": 8.353875110004462e-07, "loss": 0.2579, "step": 6952 }, { "epoch": 0.49528083484702784, "grad_norm": 3.1054019927978516, "learning_rate": 8.338165439914514e-07, "loss": 0.7087, "step": 6953 }, { "epoch": 0.49535206752858213, "grad_norm": 2.4320991039276123, "learning_rate": 8.3224699123017e-07, "loss": 0.1466, "step": 6954 }, { "epoch": 0.4954233002101364, "grad_norm": 3.0900497436523438, "learning_rate": 8.306788529587695e-07, "loss": 0.5381, "step": 6955 }, { "epoch": 0.4954945328916907, "grad_norm": 3.4924776554107666, "learning_rate": 8.291121294191951e-07, "loss": 0.5973, "step": 6956 }, { "epoch": 0.495565765573245, "grad_norm": 7.4852070808410645, "learning_rate": 8.275468208531767e-07, "loss": 0.3036, "step": 6957 }, { "epoch": 0.4956369982547993, "grad_norm": 4.957789421081543, "learning_rate": 8.25982927502228e-07, "loss": 0.6667, "step": 6958 }, { "epoch": 0.4957082309363536, "grad_norm": 3.662119150161743, "learning_rate": 8.244204496076402e-07, "loss": 0.3243, "step": 6959 }, { "epoch": 0.4957794636179079, "grad_norm": 3.690943717956543, "learning_rate": 8.22859387410484e-07, "loss": 0.3485, "step": 6960 }, { "epoch": 0.4958506962994622, "grad_norm": 2.280865430831909, "learning_rate": 8.212997411516199e-07, "loss": 0.3761, "step": 6961 }, { "epoch": 0.4959219289810165, "grad_norm": 10.248139381408691, "learning_rate": 8.197415110716822e-07, "loss": 0.4584, "step": 6962 }, { "epoch": 0.4959931616625708, "grad_norm": 4.0341796875, "learning_rate": 8.181846974110907e-07, "loss": 0.5795, "step": 6963 }, { "epoch": 0.4960643943441251, "grad_norm": 3.3893613815307617, "learning_rate": 8.166293004100478e-07, "loss": 0.3749, "step": 6964 }, { "epoch": 0.49613562702567937, "grad_norm": 1.6842421293258667, "learning_rate": 8.150753203085315e-07, "loss": 0.1938, "step": 6965 }, { "epoch": 0.49620685970723366, "grad_norm": 3.418952465057373, "learning_rate": 8.135227573463067e-07, "loss": 0.6606, "step": 6966 }, { "epoch": 0.49627809238878795, "grad_norm": 2.5414130687713623, "learning_rate": 8.119716117629206e-07, "loss": 0.2615, "step": 6967 }, { "epoch": 0.4963493250703423, "grad_norm": 2.6307597160339355, "learning_rate": 8.10421883797694e-07, "loss": 0.5967, "step": 6968 }, { "epoch": 0.4964205577518966, "grad_norm": 4.818629264831543, "learning_rate": 8.088735736897369e-07, "loss": 0.7667, "step": 6969 }, { "epoch": 0.4964917904334509, "grad_norm": 2.5803542137145996, "learning_rate": 8.07326681677938e-07, "loss": 0.203, "step": 6970 }, { "epoch": 0.49656302311500516, "grad_norm": 4.989433765411377, "learning_rate": 8.057812080009641e-07, "loss": 0.975, "step": 6971 }, { "epoch": 0.49663425579655945, "grad_norm": 3.1245601177215576, "learning_rate": 8.042371528972681e-07, "loss": 0.6288, "step": 6972 }, { "epoch": 0.49670548847811374, "grad_norm": 3.0013954639434814, "learning_rate": 8.026945166050837e-07, "loss": 0.8206, "step": 6973 }, { "epoch": 0.49677672115966803, "grad_norm": 4.0351691246032715, "learning_rate": 8.011532993624194e-07, "loss": 0.4076, "step": 6974 }, { "epoch": 0.4968479538412224, "grad_norm": 2.880011796951294, "learning_rate": 7.996135014070727e-07, "loss": 0.4892, "step": 6975 }, { "epoch": 0.49691918652277667, "grad_norm": 3.493957281112671, "learning_rate": 7.98075122976617e-07, "loss": 0.8013, "step": 6976 }, { "epoch": 0.49699041920433096, "grad_norm": 3.3819124698638916, "learning_rate": 7.965381643084069e-07, "loss": 0.6933, "step": 6977 }, { "epoch": 0.49706165188588525, "grad_norm": 3.0437910556793213, "learning_rate": 7.950026256395804e-07, "loss": 0.2781, "step": 6978 }, { "epoch": 0.49713288456743954, "grad_norm": 3.028006076812744, "learning_rate": 7.934685072070569e-07, "loss": 0.6458, "step": 6979 }, { "epoch": 0.4972041172489938, "grad_norm": 6.487387657165527, "learning_rate": 7.919358092475326e-07, "loss": 0.3331, "step": 6980 }, { "epoch": 0.4972753499305481, "grad_norm": 4.034944534301758, "learning_rate": 7.904045319974885e-07, "loss": 0.7502, "step": 6981 }, { "epoch": 0.4973465826121024, "grad_norm": 11.030532836914062, "learning_rate": 7.888746756931865e-07, "loss": 0.7637, "step": 6982 }, { "epoch": 0.49741781529365675, "grad_norm": 2.9480693340301514, "learning_rate": 7.873462405706633e-07, "loss": 0.6236, "step": 6983 }, { "epoch": 0.49748904797521104, "grad_norm": 3.0904791355133057, "learning_rate": 7.858192268657438e-07, "loss": 0.3578, "step": 6984 }, { "epoch": 0.49756028065676533, "grad_norm": 3.204613447189331, "learning_rate": 7.842936348140317e-07, "loss": 0.4284, "step": 6985 }, { "epoch": 0.4976315133383196, "grad_norm": 2.548462152481079, "learning_rate": 7.827694646509065e-07, "loss": 0.7166, "step": 6986 }, { "epoch": 0.4977027460198739, "grad_norm": 3.3622846603393555, "learning_rate": 7.812467166115334e-07, "loss": 0.5579, "step": 6987 }, { "epoch": 0.4977739787014282, "grad_norm": 3.849102735519409, "learning_rate": 7.797253909308588e-07, "loss": 0.8524, "step": 6988 }, { "epoch": 0.4978452113829825, "grad_norm": 2.029212236404419, "learning_rate": 7.782054878436051e-07, "loss": 0.2338, "step": 6989 }, { "epoch": 0.49791644406453683, "grad_norm": 4.046153545379639, "learning_rate": 7.766870075842792e-07, "loss": 0.8265, "step": 6990 }, { "epoch": 0.4979876767460911, "grad_norm": 3.02897310256958, "learning_rate": 7.751699503871646e-07, "loss": 0.4295, "step": 6991 }, { "epoch": 0.4980589094276454, "grad_norm": 3.3403680324554443, "learning_rate": 7.736543164863319e-07, "loss": 0.5538, "step": 6992 }, { "epoch": 0.4981301421091997, "grad_norm": 3.379850387573242, "learning_rate": 7.721401061156231e-07, "loss": 0.6317, "step": 6993 }, { "epoch": 0.498201374790754, "grad_norm": 4.575530052185059, "learning_rate": 7.706273195086667e-07, "loss": 0.7812, "step": 6994 }, { "epoch": 0.4982726074723083, "grad_norm": 4.768683433532715, "learning_rate": 7.691159568988727e-07, "loss": 0.3357, "step": 6995 }, { "epoch": 0.49834384015386257, "grad_norm": 3.0543744564056396, "learning_rate": 7.676060185194256e-07, "loss": 0.5748, "step": 6996 }, { "epoch": 0.4984150728354169, "grad_norm": 3.42824387550354, "learning_rate": 7.660975046032948e-07, "loss": 0.4535, "step": 6997 }, { "epoch": 0.4984863055169712, "grad_norm": 2.36903715133667, "learning_rate": 7.645904153832295e-07, "loss": 0.4884, "step": 6998 }, { "epoch": 0.4985575381985255, "grad_norm": 4.3946685791015625, "learning_rate": 7.63084751091755e-07, "loss": 0.3233, "step": 6999 }, { "epoch": 0.4986287708800798, "grad_norm": 3.3960068225860596, "learning_rate": 7.615805119611818e-07, "loss": 0.2332, "step": 7000 }, { "epoch": 0.4987000035616341, "grad_norm": 2.8848719596862793, "learning_rate": 7.600776982235992e-07, "loss": 0.666, "step": 7001 }, { "epoch": 0.49877123624318837, "grad_norm": 2.2715649604797363, "learning_rate": 7.585763101108746e-07, "loss": 0.2746, "step": 7002 }, { "epoch": 0.49884246892474265, "grad_norm": 3.673194408416748, "learning_rate": 7.570763478546572e-07, "loss": 0.7834, "step": 7003 }, { "epoch": 0.49891370160629694, "grad_norm": 4.193108558654785, "learning_rate": 7.555778116863755e-07, "loss": 0.1901, "step": 7004 }, { "epoch": 0.4989849342878513, "grad_norm": 3.8741989135742188, "learning_rate": 7.540807018372387e-07, "loss": 0.8202, "step": 7005 }, { "epoch": 0.4990561669694056, "grad_norm": 4.8872480392456055, "learning_rate": 7.525850185382344e-07, "loss": 0.5501, "step": 7006 }, { "epoch": 0.49912739965095987, "grad_norm": 2.549118995666504, "learning_rate": 7.510907620201335e-07, "loss": 0.7379, "step": 7007 }, { "epoch": 0.49919863233251416, "grad_norm": 0.5704389810562134, "learning_rate": 7.495979325134806e-07, "loss": 0.0284, "step": 7008 }, { "epoch": 0.49926986501406845, "grad_norm": 4.41850471496582, "learning_rate": 7.481065302486057e-07, "loss": 0.7527, "step": 7009 }, { "epoch": 0.49934109769562274, "grad_norm": 4.402604103088379, "learning_rate": 7.466165554556193e-07, "loss": 0.4555, "step": 7010 }, { "epoch": 0.499412330377177, "grad_norm": 3.9272332191467285, "learning_rate": 7.451280083644052e-07, "loss": 0.165, "step": 7011 }, { "epoch": 0.4994835630587314, "grad_norm": 3.0579049587249756, "learning_rate": 7.436408892046321e-07, "loss": 0.2472, "step": 7012 }, { "epoch": 0.49955479574028566, "grad_norm": 3.631671190261841, "learning_rate": 7.421551982057496e-07, "loss": 0.7755, "step": 7013 }, { "epoch": 0.49962602842183995, "grad_norm": 8.378089904785156, "learning_rate": 7.406709355969821e-07, "loss": 0.3562, "step": 7014 }, { "epoch": 0.49969726110339424, "grad_norm": 2.406205177307129, "learning_rate": 7.391881016073354e-07, "loss": 0.3843, "step": 7015 }, { "epoch": 0.49976849378494853, "grad_norm": 4.510846138000488, "learning_rate": 7.377066964655987e-07, "loss": 0.6895, "step": 7016 }, { "epoch": 0.4998397264665028, "grad_norm": 2.8362464904785156, "learning_rate": 7.362267204003337e-07, "loss": 0.4048, "step": 7017 }, { "epoch": 0.4999109591480571, "grad_norm": 3.6531670093536377, "learning_rate": 7.347481736398876e-07, "loss": 0.7736, "step": 7018 }, { "epoch": 0.4999821918296114, "grad_norm": 3.4931163787841797, "learning_rate": 7.332710564123869e-07, "loss": 0.5629, "step": 7019 }, { "epoch": 0.5000534245111657, "grad_norm": 3.938913345336914, "learning_rate": 7.317953689457325e-07, "loss": 0.6316, "step": 7020 }, { "epoch": 0.50012465719272, "grad_norm": 4.7716193199157715, "learning_rate": 7.303211114676067e-07, "loss": 0.5894, "step": 7021 }, { "epoch": 0.5001958898742743, "grad_norm": 3.637650728225708, "learning_rate": 7.288482842054767e-07, "loss": 0.8965, "step": 7022 }, { "epoch": 0.5002671225558286, "grad_norm": 3.717162847518921, "learning_rate": 7.273768873865794e-07, "loss": 0.7754, "step": 7023 }, { "epoch": 0.5003383552373829, "grad_norm": 2.7143845558166504, "learning_rate": 7.259069212379399e-07, "loss": 0.5177, "step": 7024 }, { "epoch": 0.5004095879189372, "grad_norm": 3.0608928203582764, "learning_rate": 7.244383859863591e-07, "loss": 0.477, "step": 7025 }, { "epoch": 0.5004808206004915, "grad_norm": 4.5193328857421875, "learning_rate": 7.229712818584134e-07, "loss": 0.5351, "step": 7026 }, { "epoch": 0.5005520532820458, "grad_norm": 4.158038139343262, "learning_rate": 7.215056090804651e-07, "loss": 0.8929, "step": 7027 }, { "epoch": 0.5006232859636001, "grad_norm": 1.9390133619308472, "learning_rate": 7.200413678786522e-07, "loss": 0.108, "step": 7028 }, { "epoch": 0.5006945186451544, "grad_norm": 5.877832412719727, "learning_rate": 7.185785584788896e-07, "loss": 0.6044, "step": 7029 }, { "epoch": 0.5007657513267086, "grad_norm": 2.137346029281616, "learning_rate": 7.171171811068744e-07, "loss": 0.4834, "step": 7030 }, { "epoch": 0.500836984008263, "grad_norm": 2.8656558990478516, "learning_rate": 7.156572359880842e-07, "loss": 0.3832, "step": 7031 }, { "epoch": 0.5009082166898173, "grad_norm": 4.632926940917969, "learning_rate": 7.141987233477732e-07, "loss": 0.7806, "step": 7032 }, { "epoch": 0.5009794493713716, "grad_norm": 3.237718343734741, "learning_rate": 7.127416434109724e-07, "loss": 0.6559, "step": 7033 }, { "epoch": 0.5010506820529259, "grad_norm": 11.359260559082031, "learning_rate": 7.112859964024977e-07, "loss": 0.2754, "step": 7034 }, { "epoch": 0.5011219147344802, "grad_norm": 1.8970661163330078, "learning_rate": 7.098317825469381e-07, "loss": 0.3858, "step": 7035 }, { "epoch": 0.5011931474160345, "grad_norm": 3.170715093612671, "learning_rate": 7.083790020686632e-07, "loss": 0.7492, "step": 7036 }, { "epoch": 0.5012643800975888, "grad_norm": 3.8105626106262207, "learning_rate": 7.069276551918225e-07, "loss": 0.2472, "step": 7037 }, { "epoch": 0.5013356127791431, "grad_norm": 3.4479997158050537, "learning_rate": 7.054777421403469e-07, "loss": 0.6435, "step": 7038 }, { "epoch": 0.5014068454606974, "grad_norm": 4.047771453857422, "learning_rate": 7.040292631379386e-07, "loss": 0.5481, "step": 7039 }, { "epoch": 0.5014780781422516, "grad_norm": 2.8221096992492676, "learning_rate": 7.025822184080844e-07, "loss": 0.404, "step": 7040 }, { "epoch": 0.5015493108238059, "grad_norm": 4.452408790588379, "learning_rate": 7.011366081740512e-07, "loss": 0.6145, "step": 7041 }, { "epoch": 0.5016205435053602, "grad_norm": 2.5688114166259766, "learning_rate": 6.996924326588772e-07, "loss": 0.3619, "step": 7042 }, { "epoch": 0.5016917761869145, "grad_norm": 4.849457740783691, "learning_rate": 6.982496920853876e-07, "loss": 0.605, "step": 7043 }, { "epoch": 0.5017630088684688, "grad_norm": 3.3648898601531982, "learning_rate": 6.968083866761821e-07, "loss": 0.5162, "step": 7044 }, { "epoch": 0.5018342415500231, "grad_norm": 3.184480905532837, "learning_rate": 6.953685166536361e-07, "loss": 0.5609, "step": 7045 }, { "epoch": 0.5019054742315775, "grad_norm": 2.692234516143799, "learning_rate": 6.939300822399086e-07, "loss": 0.5, "step": 7046 }, { "epoch": 0.5019767069131318, "grad_norm": 3.8278088569641113, "learning_rate": 6.924930836569377e-07, "loss": 0.5319, "step": 7047 }, { "epoch": 0.5020479395946861, "grad_norm": 4.263569355010986, "learning_rate": 6.910575211264336e-07, "loss": 0.4289, "step": 7048 }, { "epoch": 0.5021191722762404, "grad_norm": 1.6501566171646118, "learning_rate": 6.896233948698916e-07, "loss": 0.3094, "step": 7049 }, { "epoch": 0.5021904049577947, "grad_norm": 2.4768497943878174, "learning_rate": 6.881907051085801e-07, "loss": 0.5641, "step": 7050 }, { "epoch": 0.502261637639349, "grad_norm": 6.547874450683594, "learning_rate": 6.867594520635512e-07, "loss": 0.8116, "step": 7051 }, { "epoch": 0.5023328703209032, "grad_norm": 3.2118263244628906, "learning_rate": 6.853296359556294e-07, "loss": 0.4591, "step": 7052 }, { "epoch": 0.5024041030024575, "grad_norm": 2.748988151550293, "learning_rate": 6.839012570054249e-07, "loss": 0.1723, "step": 7053 }, { "epoch": 0.5024753356840118, "grad_norm": 4.520891189575195, "learning_rate": 6.824743154333157e-07, "loss": 0.4147, "step": 7054 }, { "epoch": 0.5025465683655661, "grad_norm": 2.10227108001709, "learning_rate": 6.810488114594694e-07, "loss": 0.3429, "step": 7055 }, { "epoch": 0.5026178010471204, "grad_norm": 3.3220009803771973, "learning_rate": 6.796247453038252e-07, "loss": 0.8197, "step": 7056 }, { "epoch": 0.5026890337286747, "grad_norm": 2.5653116703033447, "learning_rate": 6.782021171861008e-07, "loss": 0.6055, "step": 7057 }, { "epoch": 0.502760266410229, "grad_norm": 3.448395252227783, "learning_rate": 6.76780927325793e-07, "loss": 0.4146, "step": 7058 }, { "epoch": 0.5028314990917833, "grad_norm": 2.6981191635131836, "learning_rate": 6.753611759421796e-07, "loss": 0.1345, "step": 7059 }, { "epoch": 0.5029027317733376, "grad_norm": 2.280843496322632, "learning_rate": 6.739428632543099e-07, "loss": 0.5596, "step": 7060 }, { "epoch": 0.502973964454892, "grad_norm": 2.5431878566741943, "learning_rate": 6.725259894810165e-07, "loss": 0.5266, "step": 7061 }, { "epoch": 0.5030451971364462, "grad_norm": 2.644406795501709, "learning_rate": 6.711105548409103e-07, "loss": 0.7285, "step": 7062 }, { "epoch": 0.5031164298180005, "grad_norm": 2.215074062347412, "learning_rate": 6.696965595523741e-07, "loss": 0.1578, "step": 7063 }, { "epoch": 0.5031876624995548, "grad_norm": 4.8253092765808105, "learning_rate": 6.682840038335781e-07, "loss": 0.3884, "step": 7064 }, { "epoch": 0.5032588951811091, "grad_norm": 2.7443697452545166, "learning_rate": 6.6687288790246e-07, "loss": 0.3672, "step": 7065 }, { "epoch": 0.5033301278626634, "grad_norm": 2.5844039916992188, "learning_rate": 6.654632119767446e-07, "loss": 0.3427, "step": 7066 }, { "epoch": 0.5034013605442177, "grad_norm": 3.3051793575286865, "learning_rate": 6.640549762739257e-07, "loss": 0.7149, "step": 7067 }, { "epoch": 0.503472593225772, "grad_norm": 3.9498555660247803, "learning_rate": 6.62648181011284e-07, "loss": 0.2801, "step": 7068 }, { "epoch": 0.5035438259073263, "grad_norm": 2.545088052749634, "learning_rate": 6.612428264058723e-07, "loss": 0.5936, "step": 7069 }, { "epoch": 0.5036150585888806, "grad_norm": 3.494051933288574, "learning_rate": 6.598389126745209e-07, "loss": 0.1903, "step": 7070 }, { "epoch": 0.5036862912704348, "grad_norm": 2.94842791557312, "learning_rate": 6.584364400338395e-07, "loss": 0.3941, "step": 7071 }, { "epoch": 0.5037575239519891, "grad_norm": 3.382988929748535, "learning_rate": 6.570354087002173e-07, "loss": 0.9815, "step": 7072 }, { "epoch": 0.5038287566335434, "grad_norm": 2.7471420764923096, "learning_rate": 6.55635818889817e-07, "loss": 0.4493, "step": 7073 }, { "epoch": 0.5038999893150977, "grad_norm": 3.9391772747039795, "learning_rate": 6.542376708185816e-07, "loss": 0.3606, "step": 7074 }, { "epoch": 0.5039712219966521, "grad_norm": 3.424945116043091, "learning_rate": 6.528409647022316e-07, "loss": 0.6616, "step": 7075 }, { "epoch": 0.5040424546782064, "grad_norm": 2.763226270675659, "learning_rate": 6.514457007562625e-07, "loss": 0.3519, "step": 7076 }, { "epoch": 0.5041136873597607, "grad_norm": 2.2798383235931396, "learning_rate": 6.500518791959498e-07, "loss": 0.4391, "step": 7077 }, { "epoch": 0.504184920041315, "grad_norm": 3.3153367042541504, "learning_rate": 6.486595002363494e-07, "loss": 0.6116, "step": 7078 }, { "epoch": 0.5042561527228693, "grad_norm": 3.8383946418762207, "learning_rate": 6.47268564092286e-07, "loss": 0.6092, "step": 7079 }, { "epoch": 0.5043273854044236, "grad_norm": 3.0323102474212646, "learning_rate": 6.45879070978368e-07, "loss": 0.6208, "step": 7080 }, { "epoch": 0.5043986180859779, "grad_norm": 3.200270175933838, "learning_rate": 6.444910211089827e-07, "loss": 0.3586, "step": 7081 }, { "epoch": 0.5044698507675321, "grad_norm": 1.3147733211517334, "learning_rate": 6.431044146982868e-07, "loss": 0.1346, "step": 7082 }, { "epoch": 0.5045410834490864, "grad_norm": 3.1278791427612305, "learning_rate": 6.417192519602233e-07, "loss": 0.3028, "step": 7083 }, { "epoch": 0.5046123161306407, "grad_norm": 2.4814915657043457, "learning_rate": 6.403355331085092e-07, "loss": 0.3134, "step": 7084 }, { "epoch": 0.504683548812195, "grad_norm": 5.392724990844727, "learning_rate": 6.389532583566338e-07, "loss": 0.5944, "step": 7085 }, { "epoch": 0.5047547814937493, "grad_norm": 2.6514840126037598, "learning_rate": 6.375724279178719e-07, "loss": 0.4138, "step": 7086 }, { "epoch": 0.5048260141753036, "grad_norm": 2.593513250350952, "learning_rate": 6.361930420052709e-07, "loss": 0.1097, "step": 7087 }, { "epoch": 0.5048972468568579, "grad_norm": 2.6298668384552, "learning_rate": 6.348151008316539e-07, "loss": 0.5342, "step": 7088 }, { "epoch": 0.5049684795384122, "grad_norm": 3.504694700241089, "learning_rate": 6.334386046096231e-07, "loss": 0.4233, "step": 7089 }, { "epoch": 0.5050397122199666, "grad_norm": 2.8575878143310547, "learning_rate": 6.320635535515607e-07, "loss": 0.4335, "step": 7090 }, { "epoch": 0.5051109449015209, "grad_norm": 3.0193405151367188, "learning_rate": 6.306899478696193e-07, "loss": 0.3942, "step": 7091 }, { "epoch": 0.5051821775830752, "grad_norm": 2.2725863456726074, "learning_rate": 6.293177877757339e-07, "loss": 0.309, "step": 7092 }, { "epoch": 0.5052534102646294, "grad_norm": 2.568166732788086, "learning_rate": 6.279470734816162e-07, "loss": 0.3785, "step": 7093 }, { "epoch": 0.5053246429461837, "grad_norm": 4.316408157348633, "learning_rate": 6.265778051987492e-07, "loss": 0.6568, "step": 7094 }, { "epoch": 0.505395875627738, "grad_norm": 2.8176040649414062, "learning_rate": 6.252099831384018e-07, "loss": 0.5066, "step": 7095 }, { "epoch": 0.5054671083092923, "grad_norm": 2.924787759780884, "learning_rate": 6.238436075116117e-07, "loss": 0.2594, "step": 7096 }, { "epoch": 0.5055383409908466, "grad_norm": 3.1109111309051514, "learning_rate": 6.22478678529197e-07, "loss": 0.4404, "step": 7097 }, { "epoch": 0.5056095736724009, "grad_norm": 1.8457469940185547, "learning_rate": 6.211151964017503e-07, "loss": 0.3909, "step": 7098 }, { "epoch": 0.5056808063539552, "grad_norm": 2.8827149868011475, "learning_rate": 6.197531613396479e-07, "loss": 0.2626, "step": 7099 }, { "epoch": 0.5057520390355095, "grad_norm": 1.7248761653900146, "learning_rate": 6.183925735530327e-07, "loss": 0.2094, "step": 7100 }, { "epoch": 0.5058232717170638, "grad_norm": 1.9571735858917236, "learning_rate": 6.170334332518325e-07, "loss": 0.2026, "step": 7101 }, { "epoch": 0.505894504398618, "grad_norm": 2.408919334411621, "learning_rate": 6.156757406457481e-07, "loss": 0.2744, "step": 7102 }, { "epoch": 0.5059657370801723, "grad_norm": 3.468468427658081, "learning_rate": 6.143194959442566e-07, "loss": 0.7062, "step": 7103 }, { "epoch": 0.5060369697617266, "grad_norm": 3.052624225616455, "learning_rate": 6.129646993566118e-07, "loss": 0.3128, "step": 7104 }, { "epoch": 0.506108202443281, "grad_norm": 3.1647281646728516, "learning_rate": 6.116113510918476e-07, "loss": 0.642, "step": 7105 }, { "epoch": 0.5061794351248353, "grad_norm": 2.6511964797973633, "learning_rate": 6.102594513587701e-07, "loss": 0.0851, "step": 7106 }, { "epoch": 0.5062506678063896, "grad_norm": 2.8216590881347656, "learning_rate": 6.089090003659637e-07, "loss": 0.2972, "step": 7107 }, { "epoch": 0.5063219004879439, "grad_norm": 4.126372337341309, "learning_rate": 6.075599983217895e-07, "loss": 0.6848, "step": 7108 }, { "epoch": 0.5063931331694982, "grad_norm": 1.6875137090682983, "learning_rate": 6.062124454343832e-07, "loss": 0.1374, "step": 7109 }, { "epoch": 0.5064643658510525, "grad_norm": 2.4085373878479004, "learning_rate": 6.048663419116607e-07, "loss": 0.5314, "step": 7110 }, { "epoch": 0.5065355985326068, "grad_norm": 2.6984362602233887, "learning_rate": 6.035216879613082e-07, "loss": 0.1644, "step": 7111 }, { "epoch": 0.5066068312141611, "grad_norm": 3.6792924404144287, "learning_rate": 6.021784837907962e-07, "loss": 0.4828, "step": 7112 }, { "epoch": 0.5066780638957153, "grad_norm": 3.8966691493988037, "learning_rate": 6.008367296073636e-07, "loss": 0.1986, "step": 7113 }, { "epoch": 0.5067492965772696, "grad_norm": 2.1929075717926025, "learning_rate": 5.994964256180313e-07, "loss": 0.2146, "step": 7114 }, { "epoch": 0.5068205292588239, "grad_norm": 3.6177923679351807, "learning_rate": 5.981575720295963e-07, "loss": 0.2444, "step": 7115 }, { "epoch": 0.5068917619403782, "grad_norm": 3.0888490676879883, "learning_rate": 5.968201690486252e-07, "loss": 0.6589, "step": 7116 }, { "epoch": 0.5069629946219325, "grad_norm": 4.690827369689941, "learning_rate": 5.954842168814679e-07, "loss": 0.9515, "step": 7117 }, { "epoch": 0.5070342273034868, "grad_norm": 3.1460814476013184, "learning_rate": 5.941497157342502e-07, "loss": 0.7351, "step": 7118 }, { "epoch": 0.5071054599850411, "grad_norm": 3.758777618408203, "learning_rate": 5.928166658128687e-07, "loss": 0.781, "step": 7119 }, { "epoch": 0.5071766926665955, "grad_norm": 2.2301180362701416, "learning_rate": 5.914850673229988e-07, "loss": 0.3092, "step": 7120 }, { "epoch": 0.5072479253481498, "grad_norm": 3.0696041584014893, "learning_rate": 5.901549204700974e-07, "loss": 0.4781, "step": 7121 }, { "epoch": 0.5073191580297041, "grad_norm": 2.219264507293701, "learning_rate": 5.888262254593869e-07, "loss": 0.351, "step": 7122 }, { "epoch": 0.5073903907112584, "grad_norm": 3.055255889892578, "learning_rate": 5.874989824958744e-07, "loss": 0.5255, "step": 7123 }, { "epoch": 0.5074616233928126, "grad_norm": 3.5506930351257324, "learning_rate": 5.861731917843383e-07, "loss": 0.5795, "step": 7124 }, { "epoch": 0.5075328560743669, "grad_norm": 4.004717826843262, "learning_rate": 5.848488535293362e-07, "loss": 0.4382, "step": 7125 }, { "epoch": 0.5076040887559212, "grad_norm": 5.44902229309082, "learning_rate": 5.835259679351968e-07, "loss": 0.7967, "step": 7126 }, { "epoch": 0.5076753214374755, "grad_norm": 6.355368137359619, "learning_rate": 5.822045352060313e-07, "loss": 0.6142, "step": 7127 }, { "epoch": 0.5077465541190298, "grad_norm": 3.423246145248413, "learning_rate": 5.808845555457198e-07, "loss": 0.6276, "step": 7128 }, { "epoch": 0.5078177868005841, "grad_norm": 2.3688621520996094, "learning_rate": 5.795660291579241e-07, "loss": 0.2357, "step": 7129 }, { "epoch": 0.5078890194821384, "grad_norm": 4.284274578094482, "learning_rate": 5.782489562460791e-07, "loss": 0.388, "step": 7130 }, { "epoch": 0.5079602521636927, "grad_norm": 3.452381134033203, "learning_rate": 5.769333370133933e-07, "loss": 0.5917, "step": 7131 }, { "epoch": 0.508031484845247, "grad_norm": 3.4803826808929443, "learning_rate": 5.756191716628556e-07, "loss": 0.5408, "step": 7132 }, { "epoch": 0.5081027175268013, "grad_norm": 3.502436399459839, "learning_rate": 5.743064603972282e-07, "loss": 0.8203, "step": 7133 }, { "epoch": 0.5081739502083555, "grad_norm": 3.583850860595703, "learning_rate": 5.729952034190467e-07, "loss": 0.6003, "step": 7134 }, { "epoch": 0.5082451828899099, "grad_norm": 2.9292495250701904, "learning_rate": 5.71685400930626e-07, "loss": 0.6202, "step": 7135 }, { "epoch": 0.5083164155714642, "grad_norm": 2.695899248123169, "learning_rate": 5.703770531340569e-07, "loss": 0.2878, "step": 7136 }, { "epoch": 0.5083876482530185, "grad_norm": 2.6095192432403564, "learning_rate": 5.69070160231201e-07, "loss": 0.3679, "step": 7137 }, { "epoch": 0.5084588809345728, "grad_norm": 2.380871295928955, "learning_rate": 5.677647224236982e-07, "loss": 0.4612, "step": 7138 }, { "epoch": 0.5085301136161271, "grad_norm": 3.180985689163208, "learning_rate": 5.664607399129684e-07, "loss": 0.2766, "step": 7139 }, { "epoch": 0.5086013462976814, "grad_norm": 3.457551956176758, "learning_rate": 5.651582129001987e-07, "loss": 0.8088, "step": 7140 }, { "epoch": 0.5086725789792357, "grad_norm": 3.2328855991363525, "learning_rate": 5.638571415863559e-07, "loss": 0.684, "step": 7141 }, { "epoch": 0.50874381166079, "grad_norm": 2.157127857208252, "learning_rate": 5.625575261721838e-07, "loss": 0.2532, "step": 7142 }, { "epoch": 0.5088150443423443, "grad_norm": 2.56624698638916, "learning_rate": 5.612593668581978e-07, "loss": 0.2591, "step": 7143 }, { "epoch": 0.5088862770238985, "grad_norm": 1.8575184345245361, "learning_rate": 5.599626638446898e-07, "loss": 0.2309, "step": 7144 }, { "epoch": 0.5089575097054528, "grad_norm": 1.5549707412719727, "learning_rate": 5.586674173317308e-07, "loss": 0.0444, "step": 7145 }, { "epoch": 0.5090287423870071, "grad_norm": 3.6813292503356934, "learning_rate": 5.573736275191622e-07, "loss": 0.3947, "step": 7146 }, { "epoch": 0.5090999750685614, "grad_norm": 3.0581016540527344, "learning_rate": 5.560812946066029e-07, "loss": 0.9587, "step": 7147 }, { "epoch": 0.5091712077501157, "grad_norm": 1.9513366222381592, "learning_rate": 5.54790418793445e-07, "loss": 0.3382, "step": 7148 }, { "epoch": 0.50924244043167, "grad_norm": 2.7805681228637695, "learning_rate": 5.53501000278861e-07, "loss": 0.6942, "step": 7149 }, { "epoch": 0.5093136731132244, "grad_norm": 2.583946704864502, "learning_rate": 5.522130392617908e-07, "loss": 0.2142, "step": 7150 }, { "epoch": 0.5093849057947787, "grad_norm": 3.961430072784424, "learning_rate": 5.509265359409544e-07, "loss": 0.576, "step": 7151 }, { "epoch": 0.509456138476333, "grad_norm": 3.6132519245147705, "learning_rate": 5.496414905148495e-07, "loss": 0.7205, "step": 7152 }, { "epoch": 0.5095273711578873, "grad_norm": 3.8218839168548584, "learning_rate": 5.48357903181741e-07, "loss": 0.5112, "step": 7153 }, { "epoch": 0.5095986038394416, "grad_norm": 4.782495975494385, "learning_rate": 5.47075774139676e-07, "loss": 0.7699, "step": 7154 }, { "epoch": 0.5096698365209958, "grad_norm": 2.456644296646118, "learning_rate": 5.457951035864729e-07, "loss": 0.2944, "step": 7155 }, { "epoch": 0.5097410692025501, "grad_norm": 1.7652498483657837, "learning_rate": 5.445158917197246e-07, "loss": 0.1535, "step": 7156 }, { "epoch": 0.5098123018841044, "grad_norm": 4.420324325561523, "learning_rate": 5.432381387368014e-07, "loss": 0.733, "step": 7157 }, { "epoch": 0.5098835345656587, "grad_norm": 4.40230655670166, "learning_rate": 5.419618448348485e-07, "loss": 0.8712, "step": 7158 }, { "epoch": 0.509954767247213, "grad_norm": 3.241487741470337, "learning_rate": 5.40687010210783e-07, "loss": 0.516, "step": 7159 }, { "epoch": 0.5100259999287673, "grad_norm": 2.2996950149536133, "learning_rate": 5.394136350613e-07, "loss": 0.4085, "step": 7160 }, { "epoch": 0.5100972326103216, "grad_norm": 4.399352073669434, "learning_rate": 5.381417195828698e-07, "loss": 0.358, "step": 7161 }, { "epoch": 0.5101684652918759, "grad_norm": 2.4533164501190186, "learning_rate": 5.368712639717311e-07, "loss": 0.6607, "step": 7162 }, { "epoch": 0.5102396979734302, "grad_norm": 4.730014324188232, "learning_rate": 5.35602268423906e-07, "loss": 0.4914, "step": 7163 }, { "epoch": 0.5103109306549845, "grad_norm": 1.989198923110962, "learning_rate": 5.343347331351878e-07, "loss": 0.2332, "step": 7164 }, { "epoch": 0.5103821633365389, "grad_norm": 3.009842872619629, "learning_rate": 5.330686583011413e-07, "loss": 0.6556, "step": 7165 }, { "epoch": 0.5104533960180931, "grad_norm": 2.942322254180908, "learning_rate": 5.318040441171101e-07, "loss": 0.5842, "step": 7166 }, { "epoch": 0.5105246286996474, "grad_norm": 5.381338596343994, "learning_rate": 5.305408907782128e-07, "loss": 0.2975, "step": 7167 }, { "epoch": 0.5105958613812017, "grad_norm": 3.218686819076538, "learning_rate": 5.292791984793388e-07, "loss": 0.7563, "step": 7168 }, { "epoch": 0.510667094062756, "grad_norm": 2.5473177433013916, "learning_rate": 5.280189674151559e-07, "loss": 0.3388, "step": 7169 }, { "epoch": 0.5107383267443103, "grad_norm": 2.428277015686035, "learning_rate": 5.267601977801018e-07, "loss": 0.4497, "step": 7170 }, { "epoch": 0.5108095594258646, "grad_norm": 2.962090492248535, "learning_rate": 5.255028897683956e-07, "loss": 0.463, "step": 7171 }, { "epoch": 0.5108807921074189, "grad_norm": 5.582705974578857, "learning_rate": 5.242470435740232e-07, "loss": 0.3299, "step": 7172 }, { "epoch": 0.5109520247889732, "grad_norm": 2.7032206058502197, "learning_rate": 5.229926593907531e-07, "loss": 0.3225, "step": 7173 }, { "epoch": 0.5110232574705275, "grad_norm": 3.536829710006714, "learning_rate": 5.217397374121192e-07, "loss": 0.3025, "step": 7174 }, { "epoch": 0.5110944901520817, "grad_norm": 2.3112151622772217, "learning_rate": 5.204882778314358e-07, "loss": 0.6975, "step": 7175 }, { "epoch": 0.511165722833636, "grad_norm": 3.7248263359069824, "learning_rate": 5.192382808417939e-07, "loss": 0.6517, "step": 7176 }, { "epoch": 0.5112369555151903, "grad_norm": 3.3052966594696045, "learning_rate": 5.179897466360495e-07, "loss": 0.7765, "step": 7177 }, { "epoch": 0.5113081881967446, "grad_norm": 4.127822399139404, "learning_rate": 5.167426754068427e-07, "loss": 0.7462, "step": 7178 }, { "epoch": 0.511379420878299, "grad_norm": 16.24900245666504, "learning_rate": 5.154970673465831e-07, "loss": 0.4086, "step": 7179 }, { "epoch": 0.5114506535598533, "grad_norm": 1.6410331726074219, "learning_rate": 5.142529226474536e-07, "loss": 0.236, "step": 7180 }, { "epoch": 0.5115218862414076, "grad_norm": 2.5697178840637207, "learning_rate": 5.130102415014137e-07, "loss": 0.4096, "step": 7181 }, { "epoch": 0.5115931189229619, "grad_norm": 3.9551451206207275, "learning_rate": 5.11769024100196e-07, "loss": 0.7269, "step": 7182 }, { "epoch": 0.5116643516045162, "grad_norm": 2.3693883419036865, "learning_rate": 5.105292706353093e-07, "loss": 0.5533, "step": 7183 }, { "epoch": 0.5117355842860705, "grad_norm": 3.283262014389038, "learning_rate": 5.09290981298034e-07, "loss": 0.6624, "step": 7184 }, { "epoch": 0.5118068169676248, "grad_norm": 2.7254750728607178, "learning_rate": 5.080541562794239e-07, "loss": 0.4013, "step": 7185 }, { "epoch": 0.511878049649179, "grad_norm": 2.3901591300964355, "learning_rate": 5.068187957703097e-07, "loss": 0.2652, "step": 7186 }, { "epoch": 0.5119492823307333, "grad_norm": 2.3569319248199463, "learning_rate": 5.055848999612934e-07, "loss": 0.2324, "step": 7187 }, { "epoch": 0.5120205150122876, "grad_norm": 3.012967109680176, "learning_rate": 5.043524690427537e-07, "loss": 0.4579, "step": 7188 }, { "epoch": 0.5120917476938419, "grad_norm": 13.71623420715332, "learning_rate": 5.031215032048431e-07, "loss": 0.3825, "step": 7189 }, { "epoch": 0.5121629803753962, "grad_norm": 3.105757474899292, "learning_rate": 5.018920026374841e-07, "loss": 0.5007, "step": 7190 }, { "epoch": 0.5122342130569505, "grad_norm": 2.266240358352661, "learning_rate": 5.006639675303781e-07, "loss": 0.2406, "step": 7191 }, { "epoch": 0.5123054457385048, "grad_norm": 2.4211995601654053, "learning_rate": 4.994373980729983e-07, "loss": 0.3096, "step": 7192 }, { "epoch": 0.5123766784200591, "grad_norm": 2.2920732498168945, "learning_rate": 4.982122944545908e-07, "loss": 0.2168, "step": 7193 }, { "epoch": 0.5124479111016135, "grad_norm": 2.670802116394043, "learning_rate": 4.969886568641757e-07, "loss": 0.1696, "step": 7194 }, { "epoch": 0.5125191437831678, "grad_norm": 3.296621561050415, "learning_rate": 4.957664854905508e-07, "loss": 0.8539, "step": 7195 }, { "epoch": 0.512590376464722, "grad_norm": 2.631026029586792, "learning_rate": 4.945457805222809e-07, "loss": 0.3557, "step": 7196 }, { "epoch": 0.5126616091462763, "grad_norm": 2.441922187805176, "learning_rate": 4.933265421477096e-07, "loss": 0.367, "step": 7197 }, { "epoch": 0.5127328418278306, "grad_norm": 3.2386527061462402, "learning_rate": 4.921087705549544e-07, "loss": 0.6647, "step": 7198 }, { "epoch": 0.5128040745093849, "grad_norm": 3.7563369274139404, "learning_rate": 4.908924659319037e-07, "loss": 0.4602, "step": 7199 }, { "epoch": 0.5128753071909392, "grad_norm": 3.630751609802246, "learning_rate": 4.896776284662186e-07, "loss": 0.5371, "step": 7200 }, { "epoch": 0.5129465398724935, "grad_norm": 2.7215218544006348, "learning_rate": 4.884642583453403e-07, "loss": 0.3699, "step": 7201 }, { "epoch": 0.5130177725540478, "grad_norm": 4.664460182189941, "learning_rate": 4.872523557564756e-07, "loss": 0.3533, "step": 7202 }, { "epoch": 0.5130890052356021, "grad_norm": 3.4788641929626465, "learning_rate": 4.860419208866096e-07, "loss": 0.5605, "step": 7203 }, { "epoch": 0.5131602379171564, "grad_norm": 3.6448776721954346, "learning_rate": 4.848329539225027e-07, "loss": 0.4927, "step": 7204 }, { "epoch": 0.5132314705987107, "grad_norm": 2.5010929107666016, "learning_rate": 4.836254550506814e-07, "loss": 0.2741, "step": 7205 }, { "epoch": 0.513302703280265, "grad_norm": 2.9513514041900635, "learning_rate": 4.824194244574531e-07, "loss": 0.7751, "step": 7206 }, { "epoch": 0.5133739359618192, "grad_norm": 2.883650779724121, "learning_rate": 4.81214862328897e-07, "loss": 0.7055, "step": 7207 }, { "epoch": 0.5134451686433735, "grad_norm": 2.700439691543579, "learning_rate": 4.80011768850862e-07, "loss": 0.3149, "step": 7208 }, { "epoch": 0.5135164013249279, "grad_norm": 3.024247646331787, "learning_rate": 4.788101442089732e-07, "loss": 0.7762, "step": 7209 }, { "epoch": 0.5135876340064822, "grad_norm": 2.292707681655884, "learning_rate": 4.77609988588632e-07, "loss": 0.333, "step": 7210 }, { "epoch": 0.5136588666880365, "grad_norm": 4.843536376953125, "learning_rate": 4.764113021750061e-07, "loss": 0.5064, "step": 7211 }, { "epoch": 0.5137300993695908, "grad_norm": 2.417818784713745, "learning_rate": 4.752140851530429e-07, "loss": 0.2582, "step": 7212 }, { "epoch": 0.5138013320511451, "grad_norm": 6.513185501098633, "learning_rate": 4.740183377074603e-07, "loss": 0.3231, "step": 7213 }, { "epoch": 0.5138725647326994, "grad_norm": 2.35105037689209, "learning_rate": 4.728240600227496e-07, "loss": 0.3681, "step": 7214 }, { "epoch": 0.5139437974142537, "grad_norm": 2.2293806076049805, "learning_rate": 4.7163125228317565e-07, "loss": 0.3116, "step": 7215 }, { "epoch": 0.514015030095808, "grad_norm": 4.149982929229736, "learning_rate": 4.704399146727767e-07, "loss": 0.6606, "step": 7216 }, { "epoch": 0.5140862627773622, "grad_norm": 6.1696295738220215, "learning_rate": 4.692500473753625e-07, "loss": 0.2956, "step": 7217 }, { "epoch": 0.5141574954589165, "grad_norm": 3.5890140533447266, "learning_rate": 4.6806165057451833e-07, "loss": 0.5625, "step": 7218 }, { "epoch": 0.5142287281404708, "grad_norm": 3.27402663230896, "learning_rate": 4.6687472445360206e-07, "loss": 0.1709, "step": 7219 }, { "epoch": 0.5142999608220251, "grad_norm": 2.7515766620635986, "learning_rate": 4.656892691957426e-07, "loss": 0.2846, "step": 7220 }, { "epoch": 0.5143711935035794, "grad_norm": 3.1696417331695557, "learning_rate": 4.6450528498384493e-07, "loss": 0.5442, "step": 7221 }, { "epoch": 0.5144424261851337, "grad_norm": 2.246270179748535, "learning_rate": 4.6332277200058397e-07, "loss": 0.4428, "step": 7222 }, { "epoch": 0.514513658866688, "grad_norm": 3.408012866973877, "learning_rate": 4.621417304284126e-07, "loss": 0.4448, "step": 7223 }, { "epoch": 0.5145848915482424, "grad_norm": 3.199068546295166, "learning_rate": 4.609621604495507e-07, "loss": 0.3674, "step": 7224 }, { "epoch": 0.5146561242297967, "grad_norm": 2.7457895278930664, "learning_rate": 4.597840622459937e-07, "loss": 0.3773, "step": 7225 }, { "epoch": 0.514727356911351, "grad_norm": 2.2761178016662598, "learning_rate": 4.5860743599951186e-07, "loss": 0.3033, "step": 7226 }, { "epoch": 0.5147985895929053, "grad_norm": 3.4888570308685303, "learning_rate": 4.574322818916443e-07, "loss": 0.71, "step": 7227 }, { "epoch": 0.5148698222744595, "grad_norm": 5.0437822341918945, "learning_rate": 4.5625860010370726e-07, "loss": 0.2366, "step": 7228 }, { "epoch": 0.5149410549560138, "grad_norm": 2.198338508605957, "learning_rate": 4.550863908167846e-07, "loss": 0.4654, "step": 7229 }, { "epoch": 0.5150122876375681, "grad_norm": 3.2792088985443115, "learning_rate": 4.5391565421174065e-07, "loss": 0.3345, "step": 7230 }, { "epoch": 0.5150835203191224, "grad_norm": 4.313316345214844, "learning_rate": 4.527463904692042e-07, "loss": 0.2279, "step": 7231 }, { "epoch": 0.5151547530006767, "grad_norm": 2.4347567558288574, "learning_rate": 4.515785997695832e-07, "loss": 0.2324, "step": 7232 }, { "epoch": 0.515225985682231, "grad_norm": 2.5585811138153076, "learning_rate": 4.5041228229305343e-07, "loss": 0.3454, "step": 7233 }, { "epoch": 0.5152972183637853, "grad_norm": 2.9775772094726562, "learning_rate": 4.492474382195666e-07, "loss": 0.2582, "step": 7234 }, { "epoch": 0.5153684510453396, "grad_norm": 4.047214508056641, "learning_rate": 4.480840677288478e-07, "loss": 0.6604, "step": 7235 }, { "epoch": 0.5154396837268939, "grad_norm": 2.8986968994140625, "learning_rate": 4.4692217100039013e-07, "loss": 0.6183, "step": 7236 }, { "epoch": 0.5155109164084481, "grad_norm": 2.9824278354644775, "learning_rate": 4.457617482134635e-07, "loss": 0.3525, "step": 7237 }, { "epoch": 0.5155821490900024, "grad_norm": 3.7801830768585205, "learning_rate": 4.446027995471114e-07, "loss": 0.8195, "step": 7238 }, { "epoch": 0.5156533817715568, "grad_norm": 2.588034152984619, "learning_rate": 4.4344532518014405e-07, "loss": 0.3125, "step": 7239 }, { "epoch": 0.5157246144531111, "grad_norm": 3.813239812850952, "learning_rate": 4.4228932529114975e-07, "loss": 0.5172, "step": 7240 }, { "epoch": 0.5157958471346654, "grad_norm": 4.110038757324219, "learning_rate": 4.411348000584881e-07, "loss": 0.5821, "step": 7241 }, { "epoch": 0.5158670798162197, "grad_norm": 3.410630941390991, "learning_rate": 4.3998174966028875e-07, "loss": 0.7399, "step": 7242 }, { "epoch": 0.515938312497774, "grad_norm": 3.475984573364258, "learning_rate": 4.3883017427445717e-07, "loss": 0.3586, "step": 7243 }, { "epoch": 0.5160095451793283, "grad_norm": 2.7212443351745605, "learning_rate": 4.3768007407866685e-07, "loss": 0.3546, "step": 7244 }, { "epoch": 0.5160807778608826, "grad_norm": 2.740640640258789, "learning_rate": 4.3653144925037025e-07, "loss": 0.2465, "step": 7245 }, { "epoch": 0.5161520105424369, "grad_norm": 4.34317684173584, "learning_rate": 4.3538429996678567e-07, "loss": 0.8716, "step": 7246 }, { "epoch": 0.5162232432239912, "grad_norm": 3.7734198570251465, "learning_rate": 4.342386264049081e-07, "loss": 0.3242, "step": 7247 }, { "epoch": 0.5162944759055454, "grad_norm": 3.74568247795105, "learning_rate": 4.3309442874150063e-07, "loss": 0.438, "step": 7248 }, { "epoch": 0.5163657085870997, "grad_norm": 3.061685800552368, "learning_rate": 4.319517071531021e-07, "loss": 0.3687, "step": 7249 }, { "epoch": 0.516436941268654, "grad_norm": 3.0279006958007812, "learning_rate": 4.3081046181602583e-07, "loss": 0.8306, "step": 7250 }, { "epoch": 0.5165081739502083, "grad_norm": 3.8167357444763184, "learning_rate": 4.296706929063499e-07, "loss": 0.5398, "step": 7251 }, { "epoch": 0.5165794066317626, "grad_norm": 3.923177719116211, "learning_rate": 4.285324005999303e-07, "loss": 0.7383, "step": 7252 }, { "epoch": 0.5166506393133169, "grad_norm": 3.5786919593811035, "learning_rate": 4.2739558507239543e-07, "loss": 0.4957, "step": 7253 }, { "epoch": 0.5167218719948713, "grad_norm": 4.488072872161865, "learning_rate": 4.2626024649914275e-07, "loss": 0.3529, "step": 7254 }, { "epoch": 0.5167931046764256, "grad_norm": 1.8422389030456543, "learning_rate": 4.251263850553433e-07, "loss": 0.3158, "step": 7255 }, { "epoch": 0.5168643373579799, "grad_norm": 1.9354150295257568, "learning_rate": 4.2399400091594154e-07, "loss": 0.3675, "step": 7256 }, { "epoch": 0.5169355700395342, "grad_norm": 5.893336296081543, "learning_rate": 4.2286309425564997e-07, "loss": 0.2118, "step": 7257 }, { "epoch": 0.5170068027210885, "grad_norm": 3.260591506958008, "learning_rate": 4.2173366524895787e-07, "loss": 0.5953, "step": 7258 }, { "epoch": 0.5170780354026427, "grad_norm": 2.5647685527801514, "learning_rate": 4.2060571407012583e-07, "loss": 0.3911, "step": 7259 }, { "epoch": 0.517149268084197, "grad_norm": 6.826327323913574, "learning_rate": 4.1947924089318247e-07, "loss": 0.6345, "step": 7260 }, { "epoch": 0.5172205007657513, "grad_norm": 3.02736234664917, "learning_rate": 4.1835424589193096e-07, "loss": 0.6495, "step": 7261 }, { "epoch": 0.5172917334473056, "grad_norm": 5.11752462387085, "learning_rate": 4.17230729239948e-07, "loss": 0.6763, "step": 7262 }, { "epoch": 0.5173629661288599, "grad_norm": 1.8962597846984863, "learning_rate": 4.161086911105816e-07, "loss": 0.0951, "step": 7263 }, { "epoch": 0.5174341988104142, "grad_norm": 2.5982837677001953, "learning_rate": 4.1498813167694776e-07, "loss": 0.5689, "step": 7264 }, { "epoch": 0.5175054314919685, "grad_norm": 2.7144675254821777, "learning_rate": 4.138690511119381e-07, "loss": 0.515, "step": 7265 }, { "epoch": 0.5175766641735228, "grad_norm": 4.055663585662842, "learning_rate": 4.127514495882168e-07, "loss": 0.2615, "step": 7266 }, { "epoch": 0.5176478968550771, "grad_norm": 1.7971875667572021, "learning_rate": 4.1163532727821696e-07, "loss": 0.2187, "step": 7267 }, { "epoch": 0.5177191295366315, "grad_norm": 2.8879809379577637, "learning_rate": 4.1052068435414426e-07, "loss": 0.503, "step": 7268 }, { "epoch": 0.5177903622181858, "grad_norm": 3.240346908569336, "learning_rate": 4.094075209879789e-07, "loss": 0.7112, "step": 7269 }, { "epoch": 0.51786159489974, "grad_norm": 2.816014528274536, "learning_rate": 4.082958373514689e-07, "loss": 0.652, "step": 7270 }, { "epoch": 0.5179328275812943, "grad_norm": 3.4296717643737793, "learning_rate": 4.0718563361613396e-07, "loss": 0.3012, "step": 7271 }, { "epoch": 0.5180040602628486, "grad_norm": 4.3344926834106445, "learning_rate": 4.060769099532713e-07, "loss": 0.398, "step": 7272 }, { "epoch": 0.5180752929444029, "grad_norm": 4.981000900268555, "learning_rate": 4.04969666533942e-07, "loss": 0.4613, "step": 7273 }, { "epoch": 0.5181465256259572, "grad_norm": 3.9553542137145996, "learning_rate": 4.0386390352898376e-07, "loss": 0.2875, "step": 7274 }, { "epoch": 0.5182177583075115, "grad_norm": 2.4791500568389893, "learning_rate": 4.0275962110900455e-07, "loss": 0.3777, "step": 7275 }, { "epoch": 0.5182889909890658, "grad_norm": 2.220536231994629, "learning_rate": 4.016568194443826e-07, "loss": 0.2365, "step": 7276 }, { "epoch": 0.5183602236706201, "grad_norm": 2.5073249340057373, "learning_rate": 4.0055549870526955e-07, "loss": 0.3583, "step": 7277 }, { "epoch": 0.5184314563521744, "grad_norm": 4.617092132568359, "learning_rate": 3.9945565906158833e-07, "loss": 0.2868, "step": 7278 }, { "epoch": 0.5185026890337286, "grad_norm": 3.297714948654175, "learning_rate": 3.9835730068303215e-07, "loss": 0.6411, "step": 7279 }, { "epoch": 0.5185739217152829, "grad_norm": 8.427702903747559, "learning_rate": 3.9726042373906536e-07, "loss": 0.127, "step": 7280 }, { "epoch": 0.5186451543968372, "grad_norm": 4.889952659606934, "learning_rate": 3.961650283989282e-07, "loss": 0.4657, "step": 7281 }, { "epoch": 0.5187163870783915, "grad_norm": 3.3820276260375977, "learning_rate": 3.9507111483162554e-07, "loss": 0.6969, "step": 7282 }, { "epoch": 0.5187876197599459, "grad_norm": 2.6312150955200195, "learning_rate": 3.939786832059389e-07, "loss": 0.2511, "step": 7283 }, { "epoch": 0.5188588524415002, "grad_norm": 2.5803451538085938, "learning_rate": 3.928877336904191e-07, "loss": 0.3679, "step": 7284 }, { "epoch": 0.5189300851230545, "grad_norm": 2.2186317443847656, "learning_rate": 3.9179826645338594e-07, "loss": 0.378, "step": 7285 }, { "epoch": 0.5190013178046088, "grad_norm": 3.73803448677063, "learning_rate": 3.90710281662936e-07, "loss": 0.4826, "step": 7286 }, { "epoch": 0.5190725504861631, "grad_norm": 2.616384744644165, "learning_rate": 3.8962377948693395e-07, "loss": 0.0871, "step": 7287 }, { "epoch": 0.5191437831677174, "grad_norm": 2.191718578338623, "learning_rate": 3.885387600930135e-07, "loss": 0.2707, "step": 7288 }, { "epoch": 0.5192150158492717, "grad_norm": 3.404226541519165, "learning_rate": 3.8745522364858513e-07, "loss": 0.1538, "step": 7289 }, { "epoch": 0.519286248530826, "grad_norm": 3.2766611576080322, "learning_rate": 3.86373170320824e-07, "loss": 0.6862, "step": 7290 }, { "epoch": 0.5193574812123802, "grad_norm": 2.045926570892334, "learning_rate": 3.8529260027668325e-07, "loss": 0.3398, "step": 7291 }, { "epoch": 0.5194287138939345, "grad_norm": 2.6399519443511963, "learning_rate": 3.842135136828806e-07, "loss": 0.6511, "step": 7292 }, { "epoch": 0.5194999465754888, "grad_norm": 2.951551675796509, "learning_rate": 3.831359107059096e-07, "loss": 0.4247, "step": 7293 }, { "epoch": 0.5195711792570431, "grad_norm": 3.5684943199157715, "learning_rate": 3.8205979151203274e-07, "loss": 0.5291, "step": 7294 }, { "epoch": 0.5196424119385974, "grad_norm": 2.1468935012817383, "learning_rate": 3.809851562672839e-07, "loss": 0.4874, "step": 7295 }, { "epoch": 0.5197136446201517, "grad_norm": 5.2682881355285645, "learning_rate": 3.799120051374694e-07, "loss": 0.4848, "step": 7296 }, { "epoch": 0.519784877301706, "grad_norm": 3.3029208183288574, "learning_rate": 3.7884033828816556e-07, "loss": 0.561, "step": 7297 }, { "epoch": 0.5198561099832604, "grad_norm": 1.3828883171081543, "learning_rate": 3.77770155884718e-07, "loss": 0.1106, "step": 7298 }, { "epoch": 0.5199273426648147, "grad_norm": 4.840816497802734, "learning_rate": 3.7670145809224567e-07, "loss": 0.763, "step": 7299 }, { "epoch": 0.519998575346369, "grad_norm": 3.0040955543518066, "learning_rate": 3.7563424507563785e-07, "loss": 0.6745, "step": 7300 }, { "epoch": 0.5200698080279232, "grad_norm": 2.844226121902466, "learning_rate": 3.745685169995539e-07, "loss": 0.4684, "step": 7301 }, { "epoch": 0.5201410407094775, "grad_norm": 6.36546516418457, "learning_rate": 3.7350427402842446e-07, "loss": 0.3819, "step": 7302 }, { "epoch": 0.5202122733910318, "grad_norm": 4.70986795425415, "learning_rate": 3.7244151632645387e-07, "loss": 0.3543, "step": 7303 }, { "epoch": 0.5202835060725861, "grad_norm": 2.3736560344696045, "learning_rate": 3.7138024405761197e-07, "loss": 0.5787, "step": 7304 }, { "epoch": 0.5203547387541404, "grad_norm": 3.0161523818969727, "learning_rate": 3.7032045738564114e-07, "loss": 0.8169, "step": 7305 }, { "epoch": 0.5204259714356947, "grad_norm": 3.2749743461608887, "learning_rate": 3.692621564740584e-07, "loss": 0.366, "step": 7306 }, { "epoch": 0.520497204117249, "grad_norm": 2.800293445587158, "learning_rate": 3.682053414861475e-07, "loss": 0.6009, "step": 7307 }, { "epoch": 0.5205684367988033, "grad_norm": 3.1516129970550537, "learning_rate": 3.6715001258496365e-07, "loss": 0.2254, "step": 7308 }, { "epoch": 0.5206396694803576, "grad_norm": 3.0057079792022705, "learning_rate": 3.660961699333343e-07, "loss": 0.5407, "step": 7309 }, { "epoch": 0.5207109021619118, "grad_norm": 2.6712427139282227, "learning_rate": 3.65043813693855e-07, "loss": 0.2754, "step": 7310 }, { "epoch": 0.5207821348434661, "grad_norm": 2.240623950958252, "learning_rate": 3.6399294402889473e-07, "loss": 0.0955, "step": 7311 }, { "epoch": 0.5208533675250204, "grad_norm": 4.042974948883057, "learning_rate": 3.629435611005916e-07, "loss": 0.6393, "step": 7312 }, { "epoch": 0.5209246002065748, "grad_norm": 1.920744776725769, "learning_rate": 3.618956650708549e-07, "loss": 0.2087, "step": 7313 }, { "epoch": 0.5209958328881291, "grad_norm": 4.098276615142822, "learning_rate": 3.608492561013632e-07, "loss": 0.297, "step": 7314 }, { "epoch": 0.5210670655696834, "grad_norm": 3.961179733276367, "learning_rate": 3.598043343535673e-07, "loss": 0.4995, "step": 7315 }, { "epoch": 0.5211382982512377, "grad_norm": 3.6389095783233643, "learning_rate": 3.5876089998868825e-07, "loss": 0.6548, "step": 7316 }, { "epoch": 0.521209530932792, "grad_norm": 3.637761354446411, "learning_rate": 3.577189531677161e-07, "loss": 0.3554, "step": 7317 }, { "epoch": 0.5212807636143463, "grad_norm": 2.4317569732666016, "learning_rate": 3.566784940514145e-07, "loss": 0.7043, "step": 7318 }, { "epoch": 0.5213519962959006, "grad_norm": 4.104288101196289, "learning_rate": 3.55639522800314e-07, "loss": 0.5246, "step": 7319 }, { "epoch": 0.5214232289774549, "grad_norm": 3.0311129093170166, "learning_rate": 3.546020395747163e-07, "loss": 0.2006, "step": 7320 }, { "epoch": 0.5214944616590091, "grad_norm": 1.367026448249817, "learning_rate": 3.5356604453469665e-07, "loss": 0.0559, "step": 7321 }, { "epoch": 0.5215656943405634, "grad_norm": 2.072713851928711, "learning_rate": 3.525315378400962e-07, "loss": 0.2941, "step": 7322 }, { "epoch": 0.5216369270221177, "grad_norm": 2.892385959625244, "learning_rate": 3.514985196505305e-07, "loss": 0.4223, "step": 7323 }, { "epoch": 0.521708159703672, "grad_norm": 3.440812110900879, "learning_rate": 3.504669901253832e-07, "loss": 0.3679, "step": 7324 }, { "epoch": 0.5217793923852263, "grad_norm": 3.121241569519043, "learning_rate": 3.4943694942380704e-07, "loss": 0.8591, "step": 7325 }, { "epoch": 0.5218506250667806, "grad_norm": 3.3079710006713867, "learning_rate": 3.484083977047281e-07, "loss": 0.4663, "step": 7326 }, { "epoch": 0.5219218577483349, "grad_norm": 3.918362617492676, "learning_rate": 3.473813351268429e-07, "loss": 0.5848, "step": 7327 }, { "epoch": 0.5219930904298893, "grad_norm": 3.6097209453582764, "learning_rate": 3.463557618486135e-07, "loss": 0.4479, "step": 7328 }, { "epoch": 0.5220643231114436, "grad_norm": 2.6871843338012695, "learning_rate": 3.453316780282767e-07, "loss": 0.3102, "step": 7329 }, { "epoch": 0.5221355557929979, "grad_norm": 2.9700417518615723, "learning_rate": 3.4430908382383944e-07, "loss": 0.3979, "step": 7330 }, { "epoch": 0.5222067884745522, "grad_norm": 6.1161322593688965, "learning_rate": 3.4328797939307435e-07, "loss": 0.4374, "step": 7331 }, { "epoch": 0.5222780211561064, "grad_norm": 3.087515115737915, "learning_rate": 3.4226836489352987e-07, "loss": 0.495, "step": 7332 }, { "epoch": 0.5223492538376607, "grad_norm": 4.935407638549805, "learning_rate": 3.412502404825224e-07, "loss": 0.392, "step": 7333 }, { "epoch": 0.522420486519215, "grad_norm": 2.417964458465576, "learning_rate": 3.402336063171352e-07, "loss": 0.4972, "step": 7334 }, { "epoch": 0.5224917192007693, "grad_norm": 3.2368688583374023, "learning_rate": 3.392184625542283e-07, "loss": 0.5477, "step": 7335 }, { "epoch": 0.5225629518823236, "grad_norm": 3.095021963119507, "learning_rate": 3.382048093504242e-07, "loss": 0.5834, "step": 7336 }, { "epoch": 0.5226341845638779, "grad_norm": 2.8518571853637695, "learning_rate": 3.371926468621212e-07, "loss": 0.664, "step": 7337 }, { "epoch": 0.5227054172454322, "grad_norm": 2.7889347076416016, "learning_rate": 3.3618197524548534e-07, "loss": 0.5326, "step": 7338 }, { "epoch": 0.5227766499269865, "grad_norm": 5.055323600769043, "learning_rate": 3.3517279465645204e-07, "loss": 0.6304, "step": 7339 }, { "epoch": 0.5228478826085408, "grad_norm": 2.582900047302246, "learning_rate": 3.3416510525072886e-07, "loss": 0.6306, "step": 7340 }, { "epoch": 0.522919115290095, "grad_norm": 2.6048319339752197, "learning_rate": 3.331589071837904e-07, "loss": 0.4795, "step": 7341 }, { "epoch": 0.5229903479716493, "grad_norm": 3.873405694961548, "learning_rate": 3.3215420061088245e-07, "loss": 0.6672, "step": 7342 }, { "epoch": 0.5230615806532037, "grad_norm": 2.859588146209717, "learning_rate": 3.311509856870243e-07, "loss": 0.6175, "step": 7343 }, { "epoch": 0.523132813334758, "grad_norm": 4.445398330688477, "learning_rate": 3.3014926256699665e-07, "loss": 0.4311, "step": 7344 }, { "epoch": 0.5232040460163123, "grad_norm": 2.372067928314209, "learning_rate": 3.2914903140535914e-07, "loss": 0.5268, "step": 7345 }, { "epoch": 0.5232752786978666, "grad_norm": 4.644402980804443, "learning_rate": 3.2815029235643505e-07, "loss": 0.5515, "step": 7346 }, { "epoch": 0.5233465113794209, "grad_norm": 3.2275583744049072, "learning_rate": 3.2715304557431994e-07, "loss": 0.7686, "step": 7347 }, { "epoch": 0.5234177440609752, "grad_norm": 1.9206202030181885, "learning_rate": 3.261572912128796e-07, "loss": 0.4832, "step": 7348 }, { "epoch": 0.5234889767425295, "grad_norm": 15.000120162963867, "learning_rate": 3.2516302942574794e-07, "loss": 0.5139, "step": 7349 }, { "epoch": 0.5235602094240838, "grad_norm": 1.9420912265777588, "learning_rate": 3.241702603663288e-07, "loss": 0.3867, "step": 7350 }, { "epoch": 0.5236314421056381, "grad_norm": 1.9078075885772705, "learning_rate": 3.2317898418779634e-07, "loss": 0.2849, "step": 7351 }, { "epoch": 0.5237026747871923, "grad_norm": 3.054232597351074, "learning_rate": 3.2218920104309605e-07, "loss": 0.5032, "step": 7352 }, { "epoch": 0.5237739074687466, "grad_norm": 3.042529582977295, "learning_rate": 3.212009110849379e-07, "loss": 0.699, "step": 7353 }, { "epoch": 0.5238451401503009, "grad_norm": 1.578521728515625, "learning_rate": 3.2021411446580774e-07, "loss": 0.1221, "step": 7354 }, { "epoch": 0.5239163728318552, "grad_norm": 2.696368455886841, "learning_rate": 3.1922881133795827e-07, "loss": 0.5791, "step": 7355 }, { "epoch": 0.5239876055134095, "grad_norm": 2.097703695297241, "learning_rate": 3.182450018534089e-07, "loss": 0.3598, "step": 7356 }, { "epoch": 0.5240588381949638, "grad_norm": 3.294520139694214, "learning_rate": 3.1726268616395273e-07, "loss": 0.4801, "step": 7357 }, { "epoch": 0.5241300708765182, "grad_norm": 2.670103073120117, "learning_rate": 3.1628186442115294e-07, "loss": 0.5807, "step": 7358 }, { "epoch": 0.5242013035580725, "grad_norm": 3.0777170658111572, "learning_rate": 3.1530253677633625e-07, "loss": 0.5191, "step": 7359 }, { "epoch": 0.5242725362396268, "grad_norm": 1.7526979446411133, "learning_rate": 3.143247033806063e-07, "loss": 0.1918, "step": 7360 }, { "epoch": 0.5243437689211811, "grad_norm": 2.440971612930298, "learning_rate": 3.133483643848323e-07, "loss": 0.5778, "step": 7361 }, { "epoch": 0.5244150016027354, "grad_norm": 3.2063658237457275, "learning_rate": 3.123735199396516e-07, "loss": 0.4531, "step": 7362 }, { "epoch": 0.5244862342842896, "grad_norm": 2.764845609664917, "learning_rate": 3.1140017019547385e-07, "loss": 0.3682, "step": 7363 }, { "epoch": 0.5245574669658439, "grad_norm": 3.3254623413085938, "learning_rate": 3.1042831530247566e-07, "loss": 0.2923, "step": 7364 }, { "epoch": 0.5246286996473982, "grad_norm": 2.667142629623413, "learning_rate": 3.0945795541060696e-07, "loss": 0.2481, "step": 7365 }, { "epoch": 0.5246999323289525, "grad_norm": 1.9321303367614746, "learning_rate": 3.0848909066958035e-07, "loss": 0.2148, "step": 7366 }, { "epoch": 0.5247711650105068, "grad_norm": 2.1404106616973877, "learning_rate": 3.07521721228885e-07, "loss": 0.1969, "step": 7367 }, { "epoch": 0.5248423976920611, "grad_norm": 4.160566806793213, "learning_rate": 3.06555847237775e-07, "loss": 0.5647, "step": 7368 }, { "epoch": 0.5249136303736154, "grad_norm": 2.9812381267547607, "learning_rate": 3.0559146884527324e-07, "loss": 0.5116, "step": 7369 }, { "epoch": 0.5249848630551697, "grad_norm": 2.7775421142578125, "learning_rate": 3.0462858620017633e-07, "loss": 0.4604, "step": 7370 }, { "epoch": 0.525056095736724, "grad_norm": 4.42488431930542, "learning_rate": 3.0366719945104427e-07, "loss": 0.3822, "step": 7371 }, { "epoch": 0.5251273284182784, "grad_norm": 4.2048869132995605, "learning_rate": 3.027073087462107e-07, "loss": 0.5854, "step": 7372 }, { "epoch": 0.5251985610998327, "grad_norm": 5.142922878265381, "learning_rate": 3.0174891423377595e-07, "loss": 0.7469, "step": 7373 }, { "epoch": 0.5252697937813869, "grad_norm": 2.9659533500671387, "learning_rate": 3.007920160616129e-07, "loss": 0.1409, "step": 7374 }, { "epoch": 0.5253410264629412, "grad_norm": 2.7532474994659424, "learning_rate": 2.998366143773579e-07, "loss": 0.7766, "step": 7375 }, { "epoch": 0.5254122591444955, "grad_norm": 2.765733480453491, "learning_rate": 2.988827093284219e-07, "loss": 0.4173, "step": 7376 }, { "epoch": 0.5254834918260498, "grad_norm": 2.918403148651123, "learning_rate": 2.9793030106198164e-07, "loss": 0.6417, "step": 7377 }, { "epoch": 0.5255547245076041, "grad_norm": 3.260082244873047, "learning_rate": 2.9697938972498287e-07, "loss": 0.5489, "step": 7378 }, { "epoch": 0.5256259571891584, "grad_norm": 3.261107921600342, "learning_rate": 2.960299754641438e-07, "loss": 0.579, "step": 7379 }, { "epoch": 0.5256971898707127, "grad_norm": 2.0193488597869873, "learning_rate": 2.9508205842594727e-07, "loss": 0.3042, "step": 7380 }, { "epoch": 0.525768422552267, "grad_norm": 2.193911075592041, "learning_rate": 2.941356387566474e-07, "loss": 0.4339, "step": 7381 }, { "epoch": 0.5258396552338213, "grad_norm": 1.5000758171081543, "learning_rate": 2.9319071660226737e-07, "loss": 0.1546, "step": 7382 }, { "epoch": 0.5259108879153755, "grad_norm": 9.618896484375, "learning_rate": 2.922472921086006e-07, "loss": 0.5134, "step": 7383 }, { "epoch": 0.5259821205969298, "grad_norm": 3.6583285331726074, "learning_rate": 2.913053654212039e-07, "loss": 0.7817, "step": 7384 }, { "epoch": 0.5260533532784841, "grad_norm": 3.841365098953247, "learning_rate": 2.9036493668541e-07, "loss": 0.6614, "step": 7385 }, { "epoch": 0.5261245859600384, "grad_norm": 4.7059149742126465, "learning_rate": 2.894260060463172e-07, "loss": 1.0482, "step": 7386 }, { "epoch": 0.5261958186415928, "grad_norm": 2.0539908409118652, "learning_rate": 2.884885736487919e-07, "loss": 0.1607, "step": 7387 }, { "epoch": 0.5262670513231471, "grad_norm": 2.662811517715454, "learning_rate": 2.875526396374695e-07, "loss": 0.4879, "step": 7388 }, { "epoch": 0.5263382840047014, "grad_norm": 4.3309245109558105, "learning_rate": 2.866182041567567e-07, "loss": 0.5829, "step": 7389 }, { "epoch": 0.5264095166862557, "grad_norm": 3.751035451889038, "learning_rate": 2.856852673508259e-07, "loss": 0.7315, "step": 7390 }, { "epoch": 0.52648074936781, "grad_norm": 3.563852310180664, "learning_rate": 2.8475382936362095e-07, "loss": 0.3748, "step": 7391 }, { "epoch": 0.5265519820493643, "grad_norm": 4.162822723388672, "learning_rate": 2.838238903388524e-07, "loss": 1.0498, "step": 7392 }, { "epoch": 0.5266232147309186, "grad_norm": 2.071932792663574, "learning_rate": 2.828954504199999e-07, "loss": 0.2843, "step": 7393 }, { "epoch": 0.5266944474124728, "grad_norm": 2.120229721069336, "learning_rate": 2.819685097503133e-07, "loss": 0.3156, "step": 7394 }, { "epoch": 0.5267656800940271, "grad_norm": 2.5819547176361084, "learning_rate": 2.810430684728094e-07, "loss": 0.3811, "step": 7395 }, { "epoch": 0.5268369127755814, "grad_norm": 2.03535532951355, "learning_rate": 2.8011912673027274e-07, "loss": 0.153, "step": 7396 }, { "epoch": 0.5269081454571357, "grad_norm": 2.094332456588745, "learning_rate": 2.791966846652594e-07, "loss": 0.2759, "step": 7397 }, { "epoch": 0.52697937813869, "grad_norm": 4.4255523681640625, "learning_rate": 2.7827574242009434e-07, "loss": 0.6836, "step": 7398 }, { "epoch": 0.5270506108202443, "grad_norm": 5.111942768096924, "learning_rate": 2.773563001368673e-07, "loss": 0.5143, "step": 7399 }, { "epoch": 0.5271218435017986, "grad_norm": 3.190901279449463, "learning_rate": 2.764383579574381e-07, "loss": 0.6412, "step": 7400 }, { "epoch": 0.5271930761833529, "grad_norm": 3.3009307384490967, "learning_rate": 2.75521916023439e-07, "loss": 0.485, "step": 7401 }, { "epoch": 0.5272643088649073, "grad_norm": 2.0613911151885986, "learning_rate": 2.7460697447626363e-07, "loss": 0.2546, "step": 7402 }, { "epoch": 0.5273355415464616, "grad_norm": 6.791894435882568, "learning_rate": 2.7369353345708006e-07, "loss": 0.3345, "step": 7403 }, { "epoch": 0.5274067742280159, "grad_norm": 2.8610899448394775, "learning_rate": 2.727815931068234e-07, "loss": 0.5418, "step": 7404 }, { "epoch": 0.5274780069095701, "grad_norm": 3.7295169830322266, "learning_rate": 2.7187115356619553e-07, "loss": 0.6049, "step": 7405 }, { "epoch": 0.5275492395911244, "grad_norm": 3.7452197074890137, "learning_rate": 2.7096221497566853e-07, "loss": 0.3509, "step": 7406 }, { "epoch": 0.5276204722726787, "grad_norm": 3.0864384174346924, "learning_rate": 2.7005477747548245e-07, "loss": 0.6803, "step": 7407 }, { "epoch": 0.527691704954233, "grad_norm": 2.6676981449127197, "learning_rate": 2.691488412056442e-07, "loss": 0.3796, "step": 7408 }, { "epoch": 0.5277629376357873, "grad_norm": 2.1242268085479736, "learning_rate": 2.682444063059331e-07, "loss": 0.2401, "step": 7409 }, { "epoch": 0.5278341703173416, "grad_norm": 3.262446880340576, "learning_rate": 2.6734147291589075e-07, "loss": 0.4289, "step": 7410 }, { "epoch": 0.5279054029988959, "grad_norm": 4.166593551635742, "learning_rate": 2.6644004117483357e-07, "loss": 0.4664, "step": 7411 }, { "epoch": 0.5279766356804502, "grad_norm": 3.845658540725708, "learning_rate": 2.655401112218403e-07, "loss": 0.353, "step": 7412 }, { "epoch": 0.5280478683620045, "grad_norm": 3.3459970951080322, "learning_rate": 2.646416831957621e-07, "loss": 0.5973, "step": 7413 }, { "epoch": 0.5281191010435587, "grad_norm": 1.8883562088012695, "learning_rate": 2.637447572352192e-07, "loss": 0.2508, "step": 7414 }, { "epoch": 0.528190333725113, "grad_norm": 3.6799581050872803, "learning_rate": 2.6284933347859534e-07, "loss": 0.3701, "step": 7415 }, { "epoch": 0.5282615664066673, "grad_norm": 2.713212013244629, "learning_rate": 2.619554120640455e-07, "loss": 0.4351, "step": 7416 }, { "epoch": 0.5283327990882217, "grad_norm": 2.0016391277313232, "learning_rate": 2.610629931294939e-07, "loss": 0.3713, "step": 7417 }, { "epoch": 0.528404031769776, "grad_norm": 2.1827645301818848, "learning_rate": 2.6017207681263033e-07, "loss": 0.2916, "step": 7418 }, { "epoch": 0.5284752644513303, "grad_norm": 3.7150087356567383, "learning_rate": 2.5928266325091377e-07, "loss": 0.4579, "step": 7419 }, { "epoch": 0.5285464971328846, "grad_norm": 2.821340799331665, "learning_rate": 2.583947525815733e-07, "loss": 0.4106, "step": 7420 }, { "epoch": 0.5286177298144389, "grad_norm": 1.9555166959762573, "learning_rate": 2.575083449416038e-07, "loss": 0.2174, "step": 7421 }, { "epoch": 0.5286889624959932, "grad_norm": 3.008690118789673, "learning_rate": 2.5662344046776697e-07, "loss": 0.453, "step": 7422 }, { "epoch": 0.5287601951775475, "grad_norm": 2.893326997756958, "learning_rate": 2.5574003929659697e-07, "loss": 0.383, "step": 7423 }, { "epoch": 0.5288314278591018, "grad_norm": 3.5224547386169434, "learning_rate": 2.548581415643936e-07, "loss": 0.3363, "step": 7424 }, { "epoch": 0.528902660540656, "grad_norm": 4.409058570861816, "learning_rate": 2.5397774740722134e-07, "loss": 0.5546, "step": 7425 }, { "epoch": 0.5289738932222103, "grad_norm": 3.4172451496124268, "learning_rate": 2.5309885696091943e-07, "loss": 0.5982, "step": 7426 }, { "epoch": 0.5290451259037646, "grad_norm": 3.9802308082580566, "learning_rate": 2.5222147036108925e-07, "loss": 0.8131, "step": 7427 }, { "epoch": 0.5291163585853189, "grad_norm": 5.955788612365723, "learning_rate": 2.513455877431037e-07, "loss": 0.702, "step": 7428 }, { "epoch": 0.5291875912668732, "grad_norm": 2.539759635925293, "learning_rate": 2.5047120924210243e-07, "loss": 0.1863, "step": 7429 }, { "epoch": 0.5292588239484275, "grad_norm": 3.1652987003326416, "learning_rate": 2.4959833499299314e-07, "loss": 0.3829, "step": 7430 }, { "epoch": 0.5293300566299818, "grad_norm": 2.597029685974121, "learning_rate": 2.4872696513045025e-07, "loss": 0.6094, "step": 7431 }, { "epoch": 0.5294012893115362, "grad_norm": 3.594470262527466, "learning_rate": 2.478570997889185e-07, "loss": 0.609, "step": 7432 }, { "epoch": 0.5294725219930905, "grad_norm": 1.7371569871902466, "learning_rate": 2.4698873910260824e-07, "loss": 0.0317, "step": 7433 }, { "epoch": 0.5295437546746448, "grad_norm": 3.910377264022827, "learning_rate": 2.46121883205499e-07, "loss": 0.4324, "step": 7434 }, { "epoch": 0.529614987356199, "grad_norm": 1.9691083431243896, "learning_rate": 2.452565322313383e-07, "loss": 0.2484, "step": 7435 }, { "epoch": 0.5296862200377533, "grad_norm": 4.2353315353393555, "learning_rate": 2.4439268631363924e-07, "loss": 0.7567, "step": 7436 }, { "epoch": 0.5297574527193076, "grad_norm": 4.619460105895996, "learning_rate": 2.435303455856863e-07, "loss": 0.6209, "step": 7437 }, { "epoch": 0.5298286854008619, "grad_norm": 2.872312068939209, "learning_rate": 2.426695101805288e-07, "loss": 0.2574, "step": 7438 }, { "epoch": 0.5298999180824162, "grad_norm": 3.793900728225708, "learning_rate": 2.418101802309847e-07, "loss": 0.5387, "step": 7439 }, { "epoch": 0.5299711507639705, "grad_norm": 5.071267604827881, "learning_rate": 2.4095235586963916e-07, "loss": 0.6685, "step": 7440 }, { "epoch": 0.5300423834455248, "grad_norm": 2.7095940113067627, "learning_rate": 2.4009603722884745e-07, "loss": 0.5167, "step": 7441 }, { "epoch": 0.5301136161270791, "grad_norm": 5.421020984649658, "learning_rate": 2.392412244407294e-07, "loss": 0.7642, "step": 7442 }, { "epoch": 0.5301848488086334, "grad_norm": 6.789554595947266, "learning_rate": 2.3838791763717283e-07, "loss": 0.9256, "step": 7443 }, { "epoch": 0.5302560814901877, "grad_norm": 5.3228373527526855, "learning_rate": 2.3753611694983693e-07, "loss": 0.2888, "step": 7444 }, { "epoch": 0.530327314171742, "grad_norm": 2.5435988903045654, "learning_rate": 2.3668582251014316e-07, "loss": 0.3251, "step": 7445 }, { "epoch": 0.5303985468532962, "grad_norm": 4.241490840911865, "learning_rate": 2.3583703444928442e-07, "loss": 0.4739, "step": 7446 }, { "epoch": 0.5304697795348506, "grad_norm": 2.783050298690796, "learning_rate": 2.3498975289822035e-07, "loss": 0.6624, "step": 7447 }, { "epoch": 0.5305410122164049, "grad_norm": 5.086959362030029, "learning_rate": 2.341439779876775e-07, "loss": 0.5904, "step": 7448 }, { "epoch": 0.5306122448979592, "grad_norm": 3.067317008972168, "learning_rate": 2.3329970984814932e-07, "loss": 0.5245, "step": 7449 }, { "epoch": 0.5306834775795135, "grad_norm": 3.6853713989257812, "learning_rate": 2.324569486098982e-07, "loss": 0.685, "step": 7450 }, { "epoch": 0.5307547102610678, "grad_norm": 5.385271072387695, "learning_rate": 2.3161569440295462e-07, "loss": 0.2861, "step": 7451 }, { "epoch": 0.5308259429426221, "grad_norm": 9.763182640075684, "learning_rate": 2.307759473571136e-07, "loss": 0.701, "step": 7452 }, { "epoch": 0.5308971756241764, "grad_norm": 2.2091639041900635, "learning_rate": 2.2993770760194044e-07, "loss": 0.3203, "step": 7453 }, { "epoch": 0.5309684083057307, "grad_norm": 9.340452194213867, "learning_rate": 2.2910097526676723e-07, "loss": 0.5324, "step": 7454 }, { "epoch": 0.531039640987285, "grad_norm": 1.6348552703857422, "learning_rate": 2.2826575048069287e-07, "loss": 0.2522, "step": 7455 }, { "epoch": 0.5311108736688392, "grad_norm": 1.9933967590332031, "learning_rate": 2.2743203337258323e-07, "loss": 0.2696, "step": 7456 }, { "epoch": 0.5311821063503935, "grad_norm": 3.3857033252716064, "learning_rate": 2.2659982407107427e-07, "loss": 0.746, "step": 7457 }, { "epoch": 0.5312533390319478, "grad_norm": 2.291717767715454, "learning_rate": 2.2576912270456442e-07, "loss": 0.2988, "step": 7458 }, { "epoch": 0.5313245717135021, "grad_norm": 3.3275604248046875, "learning_rate": 2.2493992940122334e-07, "loss": 0.7999, "step": 7459 }, { "epoch": 0.5313958043950564, "grad_norm": 2.364797592163086, "learning_rate": 2.241122442889887e-07, "loss": 0.5773, "step": 7460 }, { "epoch": 0.5314670370766108, "grad_norm": 3.609888792037964, "learning_rate": 2.232860674955617e-07, "loss": 0.4963, "step": 7461 }, { "epoch": 0.5315382697581651, "grad_norm": 2.8609507083892822, "learning_rate": 2.224613991484148e-07, "loss": 0.2329, "step": 7462 }, { "epoch": 0.5316095024397194, "grad_norm": 3.411076307296753, "learning_rate": 2.2163823937478512e-07, "loss": 0.4818, "step": 7463 }, { "epoch": 0.5316807351212737, "grad_norm": 3.0571889877319336, "learning_rate": 2.2081658830167552e-07, "loss": 0.6886, "step": 7464 }, { "epoch": 0.531751967802828, "grad_norm": 2.38433837890625, "learning_rate": 2.1999644605586122e-07, "loss": 0.4947, "step": 7465 }, { "epoch": 0.5318232004843823, "grad_norm": 2.5428340435028076, "learning_rate": 2.1917781276388217e-07, "loss": 0.5973, "step": 7466 }, { "epoch": 0.5318944331659365, "grad_norm": 2.847691774368286, "learning_rate": 2.1836068855204174e-07, "loss": 0.3822, "step": 7467 }, { "epoch": 0.5319656658474908, "grad_norm": 2.7947826385498047, "learning_rate": 2.1754507354641686e-07, "loss": 0.4283, "step": 7468 }, { "epoch": 0.5320368985290451, "grad_norm": 3.6702983379364014, "learning_rate": 2.1673096787284686e-07, "loss": 0.3501, "step": 7469 }, { "epoch": 0.5321081312105994, "grad_norm": 1.959649920463562, "learning_rate": 2.1591837165694018e-07, "loss": 0.3753, "step": 7470 }, { "epoch": 0.5321793638921537, "grad_norm": 3.809377670288086, "learning_rate": 2.1510728502407206e-07, "loss": 0.5753, "step": 7471 }, { "epoch": 0.532250596573708, "grad_norm": 2.8163955211639404, "learning_rate": 2.1429770809938577e-07, "loss": 0.608, "step": 7472 }, { "epoch": 0.5323218292552623, "grad_norm": 2.934256076812744, "learning_rate": 2.1348964100778914e-07, "loss": 0.3423, "step": 7473 }, { "epoch": 0.5323930619368166, "grad_norm": 3.502397298812866, "learning_rate": 2.1268308387395908e-07, "loss": 0.558, "step": 7474 }, { "epoch": 0.5324642946183709, "grad_norm": 2.2727811336517334, "learning_rate": 2.1187803682234055e-07, "loss": 0.2669, "step": 7475 }, { "epoch": 0.5325355272999253, "grad_norm": 3.1333260536193848, "learning_rate": 2.110744999771419e-07, "loss": 0.5021, "step": 7476 }, { "epoch": 0.5326067599814795, "grad_norm": 3.3653500080108643, "learning_rate": 2.102724734623407e-07, "loss": 0.6042, "step": 7477 }, { "epoch": 0.5326779926630338, "grad_norm": 4.4091796875, "learning_rate": 2.0947195740168347e-07, "loss": 0.577, "step": 7478 }, { "epoch": 0.5327492253445881, "grad_norm": 3.1905155181884766, "learning_rate": 2.086729519186803e-07, "loss": 0.3945, "step": 7479 }, { "epoch": 0.5328204580261424, "grad_norm": 4.646909236907959, "learning_rate": 2.0787545713660817e-07, "loss": 0.9256, "step": 7480 }, { "epoch": 0.5328916907076967, "grad_norm": 3.8900718688964844, "learning_rate": 2.0707947317851528e-07, "loss": 0.3051, "step": 7481 }, { "epoch": 0.532962923389251, "grad_norm": 5.948149681091309, "learning_rate": 2.062850001672112e-07, "loss": 0.33, "step": 7482 }, { "epoch": 0.5330341560708053, "grad_norm": 2.3961575031280518, "learning_rate": 2.0549203822527675e-07, "loss": 0.2244, "step": 7483 }, { "epoch": 0.5331053887523596, "grad_norm": 3.584526300430298, "learning_rate": 2.0470058747505516e-07, "loss": 0.4482, "step": 7484 }, { "epoch": 0.5331766214339139, "grad_norm": 1.9237407445907593, "learning_rate": 2.0391064803866213e-07, "loss": 0.2255, "step": 7485 }, { "epoch": 0.5332478541154682, "grad_norm": 3.056946277618408, "learning_rate": 2.0312222003797565e-07, "loss": 0.377, "step": 7486 }, { "epoch": 0.5333190867970224, "grad_norm": 2.1142704486846924, "learning_rate": 2.0233530359464183e-07, "loss": 0.3109, "step": 7487 }, { "epoch": 0.5333903194785767, "grad_norm": 1.7463321685791016, "learning_rate": 2.0154989883007458e-07, "loss": 0.1501, "step": 7488 }, { "epoch": 0.533461552160131, "grad_norm": 2.2283706665039062, "learning_rate": 2.007660058654537e-07, "loss": 0.3774, "step": 7489 }, { "epoch": 0.5335327848416853, "grad_norm": 4.428100109100342, "learning_rate": 1.9998362482172462e-07, "loss": 0.4715, "step": 7490 }, { "epoch": 0.5336040175232397, "grad_norm": 2.4414775371551514, "learning_rate": 1.9920275581960303e-07, "loss": 0.392, "step": 7491 }, { "epoch": 0.533675250204794, "grad_norm": 2.979792833328247, "learning_rate": 1.9842339897956585e-07, "loss": 0.386, "step": 7492 }, { "epoch": 0.5337464828863483, "grad_norm": 2.651026964187622, "learning_rate": 1.976455544218625e-07, "loss": 0.5557, "step": 7493 }, { "epoch": 0.5338177155679026, "grad_norm": 3.1157009601593018, "learning_rate": 1.9686922226650584e-07, "loss": 0.6489, "step": 7494 }, { "epoch": 0.5338889482494569, "grad_norm": 3.1939353942871094, "learning_rate": 1.960944026332745e-07, "loss": 0.7253, "step": 7495 }, { "epoch": 0.5339601809310112, "grad_norm": 5.455673694610596, "learning_rate": 1.953210956417162e-07, "loss": 0.7677, "step": 7496 }, { "epoch": 0.5340314136125655, "grad_norm": 2.084951162338257, "learning_rate": 1.9454930141114546e-07, "loss": 0.2948, "step": 7497 }, { "epoch": 0.5341026462941197, "grad_norm": 3.0726513862609863, "learning_rate": 1.9377902006063932e-07, "loss": 0.5351, "step": 7498 }, { "epoch": 0.534173878975674, "grad_norm": 2.8505306243896484, "learning_rate": 1.930102517090471e-07, "loss": 0.5535, "step": 7499 }, { "epoch": 0.5342451116572283, "grad_norm": 2.4944252967834473, "learning_rate": 1.9224299647498058e-07, "loss": 0.5197, "step": 7500 }, { "epoch": 0.5343163443387826, "grad_norm": 2.291916847229004, "learning_rate": 1.9147725447681841e-07, "loss": 0.2014, "step": 7501 }, { "epoch": 0.5343875770203369, "grad_norm": 1.6442835330963135, "learning_rate": 1.9071302583270724e-07, "loss": 0.0763, "step": 7502 }, { "epoch": 0.5344588097018912, "grad_norm": 2.33927059173584, "learning_rate": 1.8995031066056157e-07, "loss": 0.2518, "step": 7503 }, { "epoch": 0.5345300423834455, "grad_norm": 3.528688669204712, "learning_rate": 1.8918910907805733e-07, "loss": 0.1411, "step": 7504 }, { "epoch": 0.5346012750649998, "grad_norm": 2.888627529144287, "learning_rate": 1.8842942120264272e-07, "loss": 0.758, "step": 7505 }, { "epoch": 0.5346725077465542, "grad_norm": 3.1443932056427, "learning_rate": 1.8767124715152962e-07, "loss": 0.6028, "step": 7506 }, { "epoch": 0.5347437404281085, "grad_norm": 2.079202175140381, "learning_rate": 1.8691458704169442e-07, "loss": 0.3904, "step": 7507 }, { "epoch": 0.5348149731096628, "grad_norm": 4.632449150085449, "learning_rate": 1.861594409898826e-07, "loss": 0.4018, "step": 7508 }, { "epoch": 0.534886205791217, "grad_norm": 4.842185020446777, "learning_rate": 1.8540580911260764e-07, "loss": 0.5286, "step": 7509 }, { "epoch": 0.5349574384727713, "grad_norm": 6.618931770324707, "learning_rate": 1.846536915261443e-07, "loss": 0.4377, "step": 7510 }, { "epoch": 0.5350286711543256, "grad_norm": 3.3281478881835938, "learning_rate": 1.839030883465387e-07, "loss": 0.3055, "step": 7511 }, { "epoch": 0.5350999038358799, "grad_norm": 2.0756523609161377, "learning_rate": 1.8315399968960036e-07, "loss": 0.2985, "step": 7512 }, { "epoch": 0.5351711365174342, "grad_norm": 1.6428738832473755, "learning_rate": 1.824064256709046e-07, "loss": 0.2072, "step": 7513 }, { "epoch": 0.5352423691989885, "grad_norm": 4.330747127532959, "learning_rate": 1.8166036640579697e-07, "loss": 0.5614, "step": 7514 }, { "epoch": 0.5353136018805428, "grad_norm": 3.1738357543945312, "learning_rate": 1.8091582200938652e-07, "loss": 0.5013, "step": 7515 }, { "epoch": 0.5353848345620971, "grad_norm": 3.1269166469573975, "learning_rate": 1.8017279259654574e-07, "loss": 0.0855, "step": 7516 }, { "epoch": 0.5354560672436514, "grad_norm": 4.537172794342041, "learning_rate": 1.7943127828191852e-07, "loss": 0.5534, "step": 7517 }, { "epoch": 0.5355272999252056, "grad_norm": 2.8654258251190186, "learning_rate": 1.7869127917991446e-07, "loss": 0.4295, "step": 7518 }, { "epoch": 0.5355985326067599, "grad_norm": 2.903534173965454, "learning_rate": 1.7795279540470446e-07, "loss": 0.2925, "step": 7519 }, { "epoch": 0.5356697652883142, "grad_norm": 3.527278184890747, "learning_rate": 1.7721582707023065e-07, "loss": 0.6114, "step": 7520 }, { "epoch": 0.5357409979698686, "grad_norm": 2.3233299255371094, "learning_rate": 1.7648037429019993e-07, "loss": 0.4386, "step": 7521 }, { "epoch": 0.5358122306514229, "grad_norm": 3.980056047439575, "learning_rate": 1.7574643717808483e-07, "loss": 0.6978, "step": 7522 }, { "epoch": 0.5358834633329772, "grad_norm": 4.7527289390563965, "learning_rate": 1.7501401584712475e-07, "loss": 0.6038, "step": 7523 }, { "epoch": 0.5359546960145315, "grad_norm": 1.9348121881484985, "learning_rate": 1.7428311041032264e-07, "loss": 0.1304, "step": 7524 }, { "epoch": 0.5360259286960858, "grad_norm": 3.1117753982543945, "learning_rate": 1.7355372098045274e-07, "loss": 0.5012, "step": 7525 }, { "epoch": 0.5360971613776401, "grad_norm": 2.0211284160614014, "learning_rate": 1.7282584767005062e-07, "loss": 0.2779, "step": 7526 }, { "epoch": 0.5361683940591944, "grad_norm": 2.9592947959899902, "learning_rate": 1.7209949059142084e-07, "loss": 0.5105, "step": 7527 }, { "epoch": 0.5362396267407487, "grad_norm": 6.04970121383667, "learning_rate": 1.7137464985663045e-07, "loss": 0.6316, "step": 7528 }, { "epoch": 0.5363108594223029, "grad_norm": 1.7231855392456055, "learning_rate": 1.7065132557751662e-07, "loss": 0.2266, "step": 7529 }, { "epoch": 0.5363820921038572, "grad_norm": 2.6249196529388428, "learning_rate": 1.6992951786568123e-07, "loss": 0.4214, "step": 7530 }, { "epoch": 0.5364533247854115, "grad_norm": 3.371922492980957, "learning_rate": 1.6920922683249076e-07, "loss": 0.3796, "step": 7531 }, { "epoch": 0.5365245574669658, "grad_norm": 3.844515323638916, "learning_rate": 1.6849045258907848e-07, "loss": 0.4662, "step": 7532 }, { "epoch": 0.5365957901485201, "grad_norm": 2.7120745182037354, "learning_rate": 1.677731952463446e-07, "loss": 0.2539, "step": 7533 }, { "epoch": 0.5366670228300744, "grad_norm": 4.265021324157715, "learning_rate": 1.6705745491495394e-07, "loss": 0.4434, "step": 7534 }, { "epoch": 0.5367382555116287, "grad_norm": 2.00809383392334, "learning_rate": 1.6634323170533928e-07, "loss": 0.3958, "step": 7535 }, { "epoch": 0.5368094881931831, "grad_norm": 2.3186850547790527, "learning_rate": 1.6563052572769578e-07, "loss": 0.2915, "step": 7536 }, { "epoch": 0.5368807208747374, "grad_norm": 4.7869696617126465, "learning_rate": 1.649193370919888e-07, "loss": 0.5948, "step": 7537 }, { "epoch": 0.5369519535562917, "grad_norm": 1.8938156366348267, "learning_rate": 1.6420966590794617e-07, "loss": 0.2333, "step": 7538 }, { "epoch": 0.537023186237846, "grad_norm": 4.05657434463501, "learning_rate": 1.6350151228506251e-07, "loss": 0.5094, "step": 7539 }, { "epoch": 0.5370944189194002, "grad_norm": 3.710397243499756, "learning_rate": 1.6279487633259926e-07, "loss": 0.478, "step": 7540 }, { "epoch": 0.5371656516009545, "grad_norm": 3.3787853717803955, "learning_rate": 1.620897581595826e-07, "loss": 0.3375, "step": 7541 }, { "epoch": 0.5372368842825088, "grad_norm": 1.9229674339294434, "learning_rate": 1.613861578748066e-07, "loss": 0.3011, "step": 7542 }, { "epoch": 0.5373081169640631, "grad_norm": 4.55526876449585, "learning_rate": 1.6068407558682775e-07, "loss": 0.6053, "step": 7543 }, { "epoch": 0.5373793496456174, "grad_norm": 4.114986896514893, "learning_rate": 1.599835114039705e-07, "loss": 0.5872, "step": 7544 }, { "epoch": 0.5374505823271717, "grad_norm": 2.2125742435455322, "learning_rate": 1.5928446543432507e-07, "loss": 0.1873, "step": 7545 }, { "epoch": 0.537521815008726, "grad_norm": 2.613133430480957, "learning_rate": 1.585869377857474e-07, "loss": 0.5921, "step": 7546 }, { "epoch": 0.5375930476902803, "grad_norm": 2.220241069793701, "learning_rate": 1.5789092856585697e-07, "loss": 0.2446, "step": 7547 }, { "epoch": 0.5376642803718346, "grad_norm": 3.9256701469421387, "learning_rate": 1.571964378820434e-07, "loss": 0.3174, "step": 7548 }, { "epoch": 0.5377355130533888, "grad_norm": 1.8084020614624023, "learning_rate": 1.565034658414577e-07, "loss": 0.2003, "step": 7549 }, { "epoch": 0.5378067457349431, "grad_norm": 3.2130610942840576, "learning_rate": 1.5581201255101874e-07, "loss": 0.6813, "step": 7550 }, { "epoch": 0.5378779784164975, "grad_norm": 2.8691389560699463, "learning_rate": 1.551220781174101e-07, "loss": 0.4197, "step": 7551 }, { "epoch": 0.5379492110980518, "grad_norm": 3.897522211074829, "learning_rate": 1.5443366264708326e-07, "loss": 0.377, "step": 7552 }, { "epoch": 0.5380204437796061, "grad_norm": 2.6542420387268066, "learning_rate": 1.5374676624625218e-07, "loss": 0.3281, "step": 7553 }, { "epoch": 0.5380916764611604, "grad_norm": 2.71775484085083, "learning_rate": 1.5306138902089763e-07, "loss": 0.4352, "step": 7554 }, { "epoch": 0.5381629091427147, "grad_norm": 3.14345645904541, "learning_rate": 1.5237753107676721e-07, "loss": 0.6235, "step": 7555 }, { "epoch": 0.538234141824269, "grad_norm": 4.376299858093262, "learning_rate": 1.5169519251937325e-07, "loss": 0.5914, "step": 7556 }, { "epoch": 0.5383053745058233, "grad_norm": 3.07694411277771, "learning_rate": 1.5101437345399262e-07, "loss": 0.3914, "step": 7557 }, { "epoch": 0.5383766071873776, "grad_norm": 3.5492608547210693, "learning_rate": 1.5033507398567017e-07, "loss": 0.4689, "step": 7558 }, { "epoch": 0.5384478398689319, "grad_norm": 2.24772310256958, "learning_rate": 1.4965729421921425e-07, "loss": 0.4672, "step": 7559 }, { "epoch": 0.5385190725504861, "grad_norm": 1.680410385131836, "learning_rate": 1.4898103425919687e-07, "loss": 0.0898, "step": 7560 }, { "epoch": 0.5385903052320404, "grad_norm": 4.740353584289551, "learning_rate": 1.4830629420996222e-07, "loss": 0.5169, "step": 7561 }, { "epoch": 0.5386615379135947, "grad_norm": 3.7652628421783447, "learning_rate": 1.4763307417561157e-07, "loss": 0.8174, "step": 7562 }, { "epoch": 0.538732770595149, "grad_norm": 2.1879994869232178, "learning_rate": 1.4696137426001844e-07, "loss": 0.3484, "step": 7563 }, { "epoch": 0.5388040032767033, "grad_norm": 3.2241392135620117, "learning_rate": 1.4629119456681884e-07, "loss": 0.885, "step": 7564 }, { "epoch": 0.5388752359582577, "grad_norm": 5.646509170532227, "learning_rate": 1.456225351994156e-07, "loss": 0.3804, "step": 7565 }, { "epoch": 0.538946468639812, "grad_norm": 2.463679790496826, "learning_rate": 1.4495539626097289e-07, "loss": 0.2412, "step": 7566 }, { "epoch": 0.5390177013213663, "grad_norm": 4.645917892456055, "learning_rate": 1.44289777854425e-07, "loss": 0.5753, "step": 7567 }, { "epoch": 0.5390889340029206, "grad_norm": 2.6007442474365234, "learning_rate": 1.4362568008247202e-07, "loss": 0.2479, "step": 7568 }, { "epoch": 0.5391601666844749, "grad_norm": 2.2917702198028564, "learning_rate": 1.4296310304757423e-07, "loss": 0.5459, "step": 7569 }, { "epoch": 0.5392313993660292, "grad_norm": 2.901759147644043, "learning_rate": 1.4230204685196202e-07, "loss": 0.6601, "step": 7570 }, { "epoch": 0.5393026320475834, "grad_norm": 3.9079976081848145, "learning_rate": 1.4164251159762944e-07, "loss": 0.7435, "step": 7571 }, { "epoch": 0.5393738647291377, "grad_norm": 2.972947597503662, "learning_rate": 1.4098449738633614e-07, "loss": 0.4493, "step": 7572 }, { "epoch": 0.539445097410692, "grad_norm": 3.1257569789886475, "learning_rate": 1.4032800431960647e-07, "loss": 0.5069, "step": 7573 }, { "epoch": 0.5395163300922463, "grad_norm": 2.3356142044067383, "learning_rate": 1.3967303249873053e-07, "loss": 0.2478, "step": 7574 }, { "epoch": 0.5395875627738006, "grad_norm": 2.583242893218994, "learning_rate": 1.390195820247653e-07, "loss": 0.5203, "step": 7575 }, { "epoch": 0.5396587954553549, "grad_norm": 4.2791948318481445, "learning_rate": 1.3836765299852894e-07, "loss": 0.5763, "step": 7576 }, { "epoch": 0.5397300281369092, "grad_norm": 4.18467903137207, "learning_rate": 1.3771724552060885e-07, "loss": 0.2734, "step": 7577 }, { "epoch": 0.5398012608184635, "grad_norm": 3.503340244293213, "learning_rate": 1.3706835969135467e-07, "loss": 0.6156, "step": 7578 }, { "epoch": 0.5398724935000178, "grad_norm": 3.0074503421783447, "learning_rate": 1.3642099561088528e-07, "loss": 0.5709, "step": 7579 }, { "epoch": 0.5399437261815722, "grad_norm": 1.8807884454727173, "learning_rate": 1.3577515337908076e-07, "loss": 0.3871, "step": 7580 }, { "epoch": 0.5400149588631264, "grad_norm": 3.0269527435302734, "learning_rate": 1.3513083309558806e-07, "loss": 0.6222, "step": 7581 }, { "epoch": 0.5400861915446807, "grad_norm": 2.3784196376800537, "learning_rate": 1.3448803485981986e-07, "loss": 0.4436, "step": 7582 }, { "epoch": 0.540157424226235, "grad_norm": 2.030333995819092, "learning_rate": 1.3384675877095244e-07, "loss": 0.1974, "step": 7583 }, { "epoch": 0.5402286569077893, "grad_norm": 4.041801452636719, "learning_rate": 1.3320700492792771e-07, "loss": 0.7734, "step": 7584 }, { "epoch": 0.5402998895893436, "grad_norm": 4.120552062988281, "learning_rate": 1.3256877342945452e-07, "loss": 0.8503, "step": 7585 }, { "epoch": 0.5403711222708979, "grad_norm": 2.5376834869384766, "learning_rate": 1.319320643740052e-07, "loss": 0.5076, "step": 7586 }, { "epoch": 0.5404423549524522, "grad_norm": 3.909581184387207, "learning_rate": 1.312968778598167e-07, "loss": 0.6149, "step": 7587 }, { "epoch": 0.5405135876340065, "grad_norm": 3.5810859203338623, "learning_rate": 1.3066321398489178e-07, "loss": 0.577, "step": 7588 }, { "epoch": 0.5405848203155608, "grad_norm": 2.791630744934082, "learning_rate": 1.3003107284699777e-07, "loss": 0.6434, "step": 7589 }, { "epoch": 0.540656052997115, "grad_norm": 4.220392227172852, "learning_rate": 1.294004545436689e-07, "loss": 0.5262, "step": 7590 }, { "epoch": 0.5407272856786693, "grad_norm": 2.050551414489746, "learning_rate": 1.2877135917220173e-07, "loss": 0.1481, "step": 7591 }, { "epoch": 0.5407985183602236, "grad_norm": 2.872183322906494, "learning_rate": 1.281437868296609e-07, "loss": 0.6362, "step": 7592 }, { "epoch": 0.5408697510417779, "grad_norm": 2.6647980213165283, "learning_rate": 1.2751773761287333e-07, "loss": 0.5221, "step": 7593 }, { "epoch": 0.5409409837233322, "grad_norm": 3.8687000274658203, "learning_rate": 1.2689321161843071e-07, "loss": 0.2874, "step": 7594 }, { "epoch": 0.5410122164048866, "grad_norm": 4.327996253967285, "learning_rate": 1.262702089426926e-07, "loss": 0.4405, "step": 7595 }, { "epoch": 0.5410834490864409, "grad_norm": 4.819122791290283, "learning_rate": 1.256487296817821e-07, "loss": 0.503, "step": 7596 }, { "epoch": 0.5411546817679952, "grad_norm": 3.2940890789031982, "learning_rate": 1.2502877393158587e-07, "loss": 0.6965, "step": 7597 }, { "epoch": 0.5412259144495495, "grad_norm": 2.838977813720703, "learning_rate": 1.2441034178775735e-07, "loss": 0.2113, "step": 7598 }, { "epoch": 0.5412971471311038, "grad_norm": 2.1768057346343994, "learning_rate": 1.237934333457147e-07, "loss": 0.3736, "step": 7599 }, { "epoch": 0.5413683798126581, "grad_norm": 3.0969910621643066, "learning_rate": 1.2317804870063954e-07, "loss": 0.356, "step": 7600 }, { "epoch": 0.5414396124942124, "grad_norm": 3.4651834964752197, "learning_rate": 1.2256418794747925e-07, "loss": 0.8168, "step": 7601 }, { "epoch": 0.5415108451757666, "grad_norm": 1.8291802406311035, "learning_rate": 1.219518511809481e-07, "loss": 0.2576, "step": 7602 }, { "epoch": 0.5415820778573209, "grad_norm": 3.8508570194244385, "learning_rate": 1.213410384955227e-07, "loss": 0.6507, "step": 7603 }, { "epoch": 0.5416533105388752, "grad_norm": 3.498600721359253, "learning_rate": 1.2073174998544323e-07, "loss": 0.7818, "step": 7604 }, { "epoch": 0.5417245432204295, "grad_norm": 3.6213083267211914, "learning_rate": 1.2012398574471785e-07, "loss": 0.6391, "step": 7605 }, { "epoch": 0.5417957759019838, "grad_norm": 5.101166248321533, "learning_rate": 1.1951774586711927e-07, "loss": 0.6895, "step": 7606 }, { "epoch": 0.5418670085835381, "grad_norm": 3.561216115951538, "learning_rate": 1.1891303044618275e-07, "loss": 0.5098, "step": 7607 }, { "epoch": 0.5419382412650924, "grad_norm": 3.958801031112671, "learning_rate": 1.1830983957521024e-07, "loss": 0.9107, "step": 7608 }, { "epoch": 0.5420094739466467, "grad_norm": 2.810807704925537, "learning_rate": 1.1770817334726736e-07, "loss": 0.2289, "step": 7609 }, { "epoch": 0.5420807066282011, "grad_norm": 3.409294605255127, "learning_rate": 1.1710803185518537e-07, "loss": 0.5101, "step": 7610 }, { "epoch": 0.5421519393097554, "grad_norm": 3.2524330615997314, "learning_rate": 1.1650941519156023e-07, "loss": 0.228, "step": 7611 }, { "epoch": 0.5422231719913096, "grad_norm": 2.7365777492523193, "learning_rate": 1.1591232344875248e-07, "loss": 0.6463, "step": 7612 }, { "epoch": 0.5422944046728639, "grad_norm": 3.289658546447754, "learning_rate": 1.1531675671888621e-07, "loss": 0.7346, "step": 7613 }, { "epoch": 0.5423656373544182, "grad_norm": 2.7063119411468506, "learning_rate": 1.1472271509385235e-07, "loss": 0.4834, "step": 7614 }, { "epoch": 0.5424368700359725, "grad_norm": 3.3463001251220703, "learning_rate": 1.1413019866530429e-07, "loss": 0.1499, "step": 7615 }, { "epoch": 0.5425081027175268, "grad_norm": 2.155736207962036, "learning_rate": 1.1353920752466219e-07, "loss": 0.1833, "step": 7616 }, { "epoch": 0.5425793353990811, "grad_norm": 5.533310413360596, "learning_rate": 1.129497417631098e-07, "loss": 0.6191, "step": 7617 }, { "epoch": 0.5426505680806354, "grad_norm": 3.1962287425994873, "learning_rate": 1.1236180147159437e-07, "loss": 0.1869, "step": 7618 }, { "epoch": 0.5427218007621897, "grad_norm": 4.616801738739014, "learning_rate": 1.117753867408311e-07, "loss": 0.8899, "step": 7619 }, { "epoch": 0.542793033443744, "grad_norm": 2.998332977294922, "learning_rate": 1.1119049766129652e-07, "loss": 0.3725, "step": 7620 }, { "epoch": 0.5428642661252983, "grad_norm": 3.190378189086914, "learning_rate": 1.1060713432323288e-07, "loss": 0.895, "step": 7621 }, { "epoch": 0.5429354988068525, "grad_norm": 2.6346018314361572, "learning_rate": 1.1002529681664598e-07, "loss": 0.4315, "step": 7622 }, { "epoch": 0.5430067314884068, "grad_norm": 4.30924654006958, "learning_rate": 1.0944498523131064e-07, "loss": 0.7715, "step": 7623 }, { "epoch": 0.5430779641699611, "grad_norm": 3.9343645572662354, "learning_rate": 1.0886619965676082e-07, "loss": 0.2394, "step": 7624 }, { "epoch": 0.5431491968515155, "grad_norm": 3.7590513229370117, "learning_rate": 1.0828894018229619e-07, "loss": 0.6168, "step": 7625 }, { "epoch": 0.5432204295330698, "grad_norm": 2.485750198364258, "learning_rate": 1.0771320689698439e-07, "loss": 0.6024, "step": 7626 }, { "epoch": 0.5432916622146241, "grad_norm": 8.826316833496094, "learning_rate": 1.0713899988965326e-07, "loss": 0.7554, "step": 7627 }, { "epoch": 0.5433628948961784, "grad_norm": 4.413617134094238, "learning_rate": 1.0656631924889749e-07, "loss": 0.3918, "step": 7628 }, { "epoch": 0.5434341275777327, "grad_norm": 2.529816150665283, "learning_rate": 1.059951650630775e-07, "loss": 0.4427, "step": 7629 }, { "epoch": 0.543505360259287, "grad_norm": 3.5218138694763184, "learning_rate": 1.0542553742031392e-07, "loss": 0.5226, "step": 7630 }, { "epoch": 0.5435765929408413, "grad_norm": 4.575818061828613, "learning_rate": 1.0485743640849533e-07, "loss": 0.4971, "step": 7631 }, { "epoch": 0.5436478256223956, "grad_norm": 3.8174943923950195, "learning_rate": 1.0429086211527385e-07, "loss": 0.3132, "step": 7632 }, { "epoch": 0.5437190583039498, "grad_norm": 2.6931822299957275, "learning_rate": 1.037258146280673e-07, "loss": 0.7737, "step": 7633 }, { "epoch": 0.5437902909855041, "grad_norm": 3.152010202407837, "learning_rate": 1.0316229403405487e-07, "loss": 0.6172, "step": 7634 }, { "epoch": 0.5438615236670584, "grad_norm": 3.067268133163452, "learning_rate": 1.0260030042018365e-07, "loss": 0.4707, "step": 7635 }, { "epoch": 0.5439327563486127, "grad_norm": 2.217122793197632, "learning_rate": 1.0203983387316097e-07, "loss": 0.231, "step": 7636 }, { "epoch": 0.544003989030167, "grad_norm": 4.601772785186768, "learning_rate": 1.0148089447946319e-07, "loss": 0.3413, "step": 7637 }, { "epoch": 0.5440752217117213, "grad_norm": 2.3971962928771973, "learning_rate": 1.0092348232532911e-07, "loss": 0.3094, "step": 7638 }, { "epoch": 0.5441464543932756, "grad_norm": 2.931617498397827, "learning_rate": 1.0036759749676106e-07, "loss": 0.8698, "step": 7639 }, { "epoch": 0.54421768707483, "grad_norm": 6.637814998626709, "learning_rate": 9.981324007952486e-08, "loss": 0.7596, "step": 7640 }, { "epoch": 0.5442889197563843, "grad_norm": 2.811720848083496, "learning_rate": 9.926041015915434e-08, "loss": 0.2585, "step": 7641 }, { "epoch": 0.5443601524379386, "grad_norm": 2.801391124725342, "learning_rate": 9.870910782094456e-08, "loss": 0.6888, "step": 7642 }, { "epoch": 0.5444313851194928, "grad_norm": 4.6452789306640625, "learning_rate": 9.81593331499564e-08, "loss": 0.3282, "step": 7643 }, { "epoch": 0.5445026178010471, "grad_norm": 3.150313138961792, "learning_rate": 9.761108623101312e-08, "loss": 0.5436, "step": 7644 }, { "epoch": 0.5445738504826014, "grad_norm": 6.539673805236816, "learning_rate": 9.706436714870482e-08, "loss": 0.9308, "step": 7645 }, { "epoch": 0.5446450831641557, "grad_norm": 2.547029733657837, "learning_rate": 9.651917598738402e-08, "loss": 0.3773, "step": 7646 }, { "epoch": 0.54471631584571, "grad_norm": 4.775746822357178, "learning_rate": 9.597551283116901e-08, "loss": 0.636, "step": 7647 }, { "epoch": 0.5447875485272643, "grad_norm": 4.5785722732543945, "learning_rate": 9.543337776393936e-08, "loss": 0.4596, "step": 7648 }, { "epoch": 0.5448587812088186, "grad_norm": 4.211926460266113, "learning_rate": 9.489277086934257e-08, "loss": 0.63, "step": 7649 }, { "epoch": 0.5449300138903729, "grad_norm": 3.413299798965454, "learning_rate": 9.435369223078861e-08, "loss": 0.7079, "step": 7650 }, { "epoch": 0.5450012465719272, "grad_norm": 3.0871775150299072, "learning_rate": 9.381614193145206e-08, "loss": 0.4492, "step": 7651 }, { "epoch": 0.5450724792534815, "grad_norm": 2.882493495941162, "learning_rate": 9.32801200542699e-08, "loss": 0.9539, "step": 7652 }, { "epoch": 0.5451437119350357, "grad_norm": 3.6491870880126953, "learning_rate": 9.274562668194598e-08, "loss": 0.7623, "step": 7653 }, { "epoch": 0.5452149446165901, "grad_norm": 4.339097023010254, "learning_rate": 9.221266189694767e-08, "loss": 0.5334, "step": 7654 }, { "epoch": 0.5452861772981444, "grad_norm": 2.508916139602661, "learning_rate": 9.168122578150363e-08, "loss": 0.5415, "step": 7655 }, { "epoch": 0.5453574099796987, "grad_norm": 3.3261210918426514, "learning_rate": 9.11513184176116e-08, "loss": 0.6169, "step": 7656 }, { "epoch": 0.545428642661253, "grad_norm": 3.1041765213012695, "learning_rate": 9.062293988702953e-08, "loss": 0.2127, "step": 7657 }, { "epoch": 0.5454998753428073, "grad_norm": 3.4854276180267334, "learning_rate": 9.009609027128108e-08, "loss": 0.551, "step": 7658 }, { "epoch": 0.5455711080243616, "grad_norm": 2.744854688644409, "learning_rate": 8.957076965165234e-08, "loss": 0.4606, "step": 7659 }, { "epoch": 0.5456423407059159, "grad_norm": 2.8889126777648926, "learning_rate": 8.904697810919848e-08, "loss": 0.4295, "step": 7660 }, { "epoch": 0.5457135733874702, "grad_norm": 2.9919095039367676, "learning_rate": 8.852471572473153e-08, "loss": 0.7477, "step": 7661 }, { "epoch": 0.5457848060690245, "grad_norm": 3.1688406467437744, "learning_rate": 8.800398257883146e-08, "loss": 0.5632, "step": 7662 }, { "epoch": 0.5458560387505788, "grad_norm": 3.449676513671875, "learning_rate": 8.748477875184514e-08, "loss": 0.4319, "step": 7663 }, { "epoch": 0.545927271432133, "grad_norm": 2.625154495239258, "learning_rate": 8.696710432387733e-08, "loss": 0.6672, "step": 7664 }, { "epoch": 0.5459985041136873, "grad_norm": 2.3108668327331543, "learning_rate": 8.645095937480086e-08, "loss": 0.2748, "step": 7665 }, { "epoch": 0.5460697367952416, "grad_norm": 6.098298072814941, "learning_rate": 8.593634398425199e-08, "loss": 0.7394, "step": 7666 }, { "epoch": 0.5461409694767959, "grad_norm": 2.121581554412842, "learning_rate": 8.542325823162945e-08, "loss": 0.2788, "step": 7667 }, { "epoch": 0.5462122021583502, "grad_norm": 3.3699097633361816, "learning_rate": 8.491170219609767e-08, "loss": 0.4085, "step": 7668 }, { "epoch": 0.5462834348399046, "grad_norm": 4.401212215423584, "learning_rate": 8.440167595658577e-08, "loss": 0.3976, "step": 7669 }, { "epoch": 0.5463546675214589, "grad_norm": 3.002629041671753, "learning_rate": 8.3893179591783e-08, "loss": 0.3386, "step": 7670 }, { "epoch": 0.5464259002030132, "grad_norm": 1.5456699132919312, "learning_rate": 8.338621318014662e-08, "loss": 0.1727, "step": 7671 }, { "epoch": 0.5464971328845675, "grad_norm": 2.7102391719818115, "learning_rate": 8.288077679989737e-08, "loss": 0.4435, "step": 7672 }, { "epoch": 0.5465683655661218, "grad_norm": 1.956850528717041, "learning_rate": 8.237687052901622e-08, "loss": 0.1716, "step": 7673 }, { "epoch": 0.546639598247676, "grad_norm": 7.857179641723633, "learning_rate": 8.187449444525319e-08, "loss": 1.0892, "step": 7674 }, { "epoch": 0.5467108309292303, "grad_norm": 3.0679595470428467, "learning_rate": 8.137364862611851e-08, "loss": 0.5934, "step": 7675 }, { "epoch": 0.5467820636107846, "grad_norm": 1.5925017595291138, "learning_rate": 8.087433314888815e-08, "loss": 0.1892, "step": 7676 }, { "epoch": 0.5468532962923389, "grad_norm": 3.110698938369751, "learning_rate": 8.037654809059937e-08, "loss": 0.309, "step": 7677 }, { "epoch": 0.5469245289738932, "grad_norm": 5.750886917114258, "learning_rate": 7.988029352805849e-08, "loss": 0.7382, "step": 7678 }, { "epoch": 0.5469957616554475, "grad_norm": 3.967155694961548, "learning_rate": 7.938556953783095e-08, "loss": 0.523, "step": 7679 }, { "epoch": 0.5470669943370018, "grad_norm": 2.5275278091430664, "learning_rate": 7.889237619624679e-08, "loss": 0.3994, "step": 7680 }, { "epoch": 0.5471382270185561, "grad_norm": 5.480907917022705, "learning_rate": 7.840071357940072e-08, "loss": 0.784, "step": 7681 }, { "epoch": 0.5472094597001104, "grad_norm": 2.2464475631713867, "learning_rate": 7.791058176315313e-08, "loss": 0.2815, "step": 7682 }, { "epoch": 0.5472806923816647, "grad_norm": 2.6000003814697266, "learning_rate": 7.742198082312357e-08, "loss": 0.3804, "step": 7683 }, { "epoch": 0.5473519250632191, "grad_norm": 4.387770175933838, "learning_rate": 7.693491083470062e-08, "loss": 0.4119, "step": 7684 }, { "epoch": 0.5474231577447733, "grad_norm": 3.0926270484924316, "learning_rate": 7.644937187303303e-08, "loss": 0.4238, "step": 7685 }, { "epoch": 0.5474943904263276, "grad_norm": 4.185608863830566, "learning_rate": 7.596536401303422e-08, "loss": 0.6656, "step": 7686 }, { "epoch": 0.5475656231078819, "grad_norm": 5.092912197113037, "learning_rate": 7.548288732938225e-08, "loss": 0.3352, "step": 7687 }, { "epoch": 0.5476368557894362, "grad_norm": 3.372498035430908, "learning_rate": 7.500194189651866e-08, "loss": 0.3158, "step": 7688 }, { "epoch": 0.5477080884709905, "grad_norm": 2.6990439891815186, "learning_rate": 7.452252778864632e-08, "loss": 0.5423, "step": 7689 }, { "epoch": 0.5477793211525448, "grad_norm": 3.0789339542388916, "learning_rate": 7.404464507973608e-08, "loss": 0.6628, "step": 7690 }, { "epoch": 0.5478505538340991, "grad_norm": 3.7628068923950195, "learning_rate": 7.356829384351893e-08, "loss": 0.5672, "step": 7691 }, { "epoch": 0.5479217865156534, "grad_norm": 4.761411666870117, "learning_rate": 7.309347415349278e-08, "loss": 0.7628, "step": 7692 }, { "epoch": 0.5479930191972077, "grad_norm": 3.5601933002471924, "learning_rate": 7.262018608291566e-08, "loss": 0.6495, "step": 7693 }, { "epoch": 0.548064251878762, "grad_norm": 2.921278715133667, "learning_rate": 7.214842970481139e-08, "loss": 0.7216, "step": 7694 }, { "epoch": 0.5481354845603162, "grad_norm": 2.719074249267578, "learning_rate": 7.167820509196732e-08, "loss": 0.5956, "step": 7695 }, { "epoch": 0.5482067172418705, "grad_norm": 3.853837251663208, "learning_rate": 7.12095123169343e-08, "loss": 0.4053, "step": 7696 }, { "epoch": 0.5482779499234248, "grad_norm": 3.9845261573791504, "learning_rate": 7.074235145202668e-08, "loss": 0.4641, "step": 7697 }, { "epoch": 0.5483491826049791, "grad_norm": 4.112980365753174, "learning_rate": 7.027672256932238e-08, "loss": 0.4184, "step": 7698 }, { "epoch": 0.5484204152865335, "grad_norm": 3.1945621967315674, "learning_rate": 6.981262574066395e-08, "loss": 0.4536, "step": 7699 }, { "epoch": 0.5484916479680878, "grad_norm": 4.4538116455078125, "learning_rate": 6.93500610376563e-08, "loss": 0.6931, "step": 7700 }, { "epoch": 0.5485628806496421, "grad_norm": 3.6579689979553223, "learning_rate": 6.88890285316679e-08, "loss": 0.4505, "step": 7701 }, { "epoch": 0.5486341133311964, "grad_norm": 3.3298754692077637, "learning_rate": 6.842952829383187e-08, "loss": 0.6356, "step": 7702 }, { "epoch": 0.5487053460127507, "grad_norm": 2.9645657539367676, "learning_rate": 6.797156039504482e-08, "loss": 0.193, "step": 7703 }, { "epoch": 0.548776578694305, "grad_norm": 5.665770053863525, "learning_rate": 6.751512490596467e-08, "loss": 0.7548, "step": 7704 }, { "epoch": 0.5488478113758593, "grad_norm": 2.5123274326324463, "learning_rate": 6.706022189701622e-08, "loss": 0.2955, "step": 7705 }, { "epoch": 0.5489190440574135, "grad_norm": 2.8927993774414062, "learning_rate": 6.660685143838664e-08, "loss": 0.4424, "step": 7706 }, { "epoch": 0.5489902767389678, "grad_norm": 4.220381259918213, "learning_rate": 6.615501360002552e-08, "loss": 0.6421, "step": 7707 }, { "epoch": 0.5490615094205221, "grad_norm": 4.214420318603516, "learning_rate": 6.570470845164712e-08, "loss": 0.9537, "step": 7708 }, { "epoch": 0.5491327421020764, "grad_norm": 3.141754627227783, "learning_rate": 6.525593606272917e-08, "loss": 0.7397, "step": 7709 }, { "epoch": 0.5492039747836307, "grad_norm": 3.570624828338623, "learning_rate": 6.480869650251187e-08, "loss": 0.7172, "step": 7710 }, { "epoch": 0.549275207465185, "grad_norm": 2.5893068313598633, "learning_rate": 6.436298983999889e-08, "loss": 0.3067, "step": 7711 }, { "epoch": 0.5493464401467393, "grad_norm": 2.217454195022583, "learning_rate": 6.391881614396078e-08, "loss": 0.2337, "step": 7712 }, { "epoch": 0.5494176728282936, "grad_norm": 3.6010031700134277, "learning_rate": 6.347617548292717e-08, "loss": 0.5369, "step": 7713 }, { "epoch": 0.549488905509848, "grad_norm": 2.5687592029571533, "learning_rate": 6.303506792519232e-08, "loss": 0.1904, "step": 7714 }, { "epoch": 0.5495601381914023, "grad_norm": 1.8440096378326416, "learning_rate": 6.259549353881623e-08, "loss": 0.3013, "step": 7715 }, { "epoch": 0.5496313708729565, "grad_norm": 4.370350360870361, "learning_rate": 6.215745239162018e-08, "loss": 0.8012, "step": 7716 }, { "epoch": 0.5497026035545108, "grad_norm": 4.230505466461182, "learning_rate": 6.172094455118904e-08, "loss": 0.5392, "step": 7717 }, { "epoch": 0.5497738362360651, "grad_norm": 1.986342430114746, "learning_rate": 6.128597008487225e-08, "loss": 0.2176, "step": 7718 }, { "epoch": 0.5498450689176194, "grad_norm": 3.8707261085510254, "learning_rate": 6.085252905978056e-08, "loss": 0.6813, "step": 7719 }, { "epoch": 0.5499163015991737, "grad_norm": 3.794936180114746, "learning_rate": 6.042062154279049e-08, "loss": 0.2082, "step": 7720 }, { "epoch": 0.549987534280728, "grad_norm": 2.2686924934387207, "learning_rate": 5.999024760054095e-08, "loss": 0.3203, "step": 7721 }, { "epoch": 0.5500587669622823, "grad_norm": 1.9317094087600708, "learning_rate": 5.9561407299433274e-08, "loss": 0.2615, "step": 7722 }, { "epoch": 0.5501299996438366, "grad_norm": 3.0700809955596924, "learning_rate": 5.9134100705634525e-08, "loss": 0.7513, "step": 7723 }, { "epoch": 0.5502012323253909, "grad_norm": 3.888275384902954, "learning_rate": 5.8708327885071966e-08, "loss": 0.4374, "step": 7724 }, { "epoch": 0.5502724650069452, "grad_norm": 3.0837743282318115, "learning_rate": 5.8284088903439726e-08, "loss": 0.3571, "step": 7725 }, { "epoch": 0.5503436976884994, "grad_norm": 5.204401969909668, "learning_rate": 5.786138382619322e-08, "loss": 0.4719, "step": 7726 }, { "epoch": 0.5504149303700537, "grad_norm": 6.175567626953125, "learning_rate": 5.744021271854916e-08, "loss": 0.5744, "step": 7727 }, { "epoch": 0.550486163051608, "grad_norm": 3.8082916736602783, "learning_rate": 5.702057564549335e-08, "loss": 0.7622, "step": 7728 }, { "epoch": 0.5505573957331624, "grad_norm": 2.249260425567627, "learning_rate": 5.660247267176844e-08, "loss": 0.4061, "step": 7729 }, { "epoch": 0.5506286284147167, "grad_norm": 3.773714303970337, "learning_rate": 5.618590386188616e-08, "loss": 0.4051, "step": 7730 }, { "epoch": 0.550699861096271, "grad_norm": 2.533546209335327, "learning_rate": 5.577086928011732e-08, "loss": 0.4, "step": 7731 }, { "epoch": 0.5507710937778253, "grad_norm": 2.8772921562194824, "learning_rate": 5.535736899049626e-08, "loss": 0.5164, "step": 7732 }, { "epoch": 0.5508423264593796, "grad_norm": 2.921257495880127, "learning_rate": 5.4945403056824164e-08, "loss": 0.6854, "step": 7733 }, { "epoch": 0.5509135591409339, "grad_norm": 3.0907161235809326, "learning_rate": 5.453497154266241e-08, "loss": 0.7422, "step": 7734 }, { "epoch": 0.5509847918224882, "grad_norm": 2.7287585735321045, "learning_rate": 5.412607451133478e-08, "loss": 0.3959, "step": 7735 }, { "epoch": 0.5510560245040425, "grad_norm": 1.951749563217163, "learning_rate": 5.371871202593193e-08, "loss": 0.2491, "step": 7736 }, { "epoch": 0.5511272571855967, "grad_norm": 2.5423085689544678, "learning_rate": 5.33128841493058e-08, "loss": 0.5115, "step": 7737 }, { "epoch": 0.551198489867151, "grad_norm": 2.6882338523864746, "learning_rate": 5.290859094406964e-08, "loss": 0.5708, "step": 7738 }, { "epoch": 0.5512697225487053, "grad_norm": 2.332165002822876, "learning_rate": 5.250583247260355e-08, "loss": 0.3605, "step": 7739 }, { "epoch": 0.5513409552302596, "grad_norm": 1.961175799369812, "learning_rate": 5.2104608797047816e-08, "loss": 0.3093, "step": 7740 }, { "epoch": 0.5514121879118139, "grad_norm": 3.2519655227661133, "learning_rate": 5.170491997930627e-08, "loss": 0.6974, "step": 7741 }, { "epoch": 0.5514834205933682, "grad_norm": 4.284727096557617, "learning_rate": 5.1306766081048456e-08, "loss": 0.6803, "step": 7742 }, { "epoch": 0.5515546532749225, "grad_norm": 5.0569305419921875, "learning_rate": 5.091014716370524e-08, "loss": 0.8504, "step": 7743 }, { "epoch": 0.5516258859564769, "grad_norm": 2.860712766647339, "learning_rate": 5.0515063288471e-08, "loss": 0.3867, "step": 7744 }, { "epoch": 0.5516971186380312, "grad_norm": 4.0784711837768555, "learning_rate": 5.012151451630143e-08, "loss": 0.5162, "step": 7745 }, { "epoch": 0.5517683513195855, "grad_norm": 5.706878185272217, "learning_rate": 4.972950090791906e-08, "loss": 0.5555, "step": 7746 }, { "epoch": 0.5518395840011397, "grad_norm": 2.7492282390594482, "learning_rate": 4.933902252380662e-08, "loss": 0.4103, "step": 7747 }, { "epoch": 0.551910816682694, "grad_norm": 3.0709125995635986, "learning_rate": 4.8950079424211484e-08, "loss": 0.6153, "step": 7748 }, { "epoch": 0.5519820493642483, "grad_norm": 2.9208645820617676, "learning_rate": 4.8562671669142304e-08, "loss": 0.6798, "step": 7749 }, { "epoch": 0.5520532820458026, "grad_norm": 5.433892726898193, "learning_rate": 4.8176799318373494e-08, "loss": 0.1475, "step": 7750 }, { "epoch": 0.5521245147273569, "grad_norm": 2.3780665397644043, "learning_rate": 4.7792462431439643e-08, "loss": 0.5178, "step": 7751 }, { "epoch": 0.5521957474089112, "grad_norm": 3.2842283248901367, "learning_rate": 4.740966106764222e-08, "loss": 0.5266, "step": 7752 }, { "epoch": 0.5522669800904655, "grad_norm": 4.342316627502441, "learning_rate": 4.702839528604064e-08, "loss": 0.4664, "step": 7753 }, { "epoch": 0.5523382127720198, "grad_norm": 2.360856533050537, "learning_rate": 4.66486651454634e-08, "loss": 0.3498, "step": 7754 }, { "epoch": 0.5524094454535741, "grad_norm": 2.365292549133301, "learning_rate": 4.627047070449697e-08, "loss": 0.2265, "step": 7755 }, { "epoch": 0.5524806781351284, "grad_norm": 2.648293972015381, "learning_rate": 4.589381202149357e-08, "loss": 0.4681, "step": 7756 }, { "epoch": 0.5525519108166826, "grad_norm": 3.507157325744629, "learning_rate": 4.55186891545667e-08, "loss": 0.4246, "step": 7757 }, { "epoch": 0.552623143498237, "grad_norm": 3.107165813446045, "learning_rate": 4.514510216159562e-08, "loss": 0.5971, "step": 7758 }, { "epoch": 0.5526943761797913, "grad_norm": 2.6098921298980713, "learning_rate": 4.4773051100219787e-08, "loss": 0.3586, "step": 7759 }, { "epoch": 0.5527656088613456, "grad_norm": 3.4834229946136475, "learning_rate": 4.440253602784328e-08, "loss": 0.7843, "step": 7760 }, { "epoch": 0.5528368415428999, "grad_norm": 2.9441401958465576, "learning_rate": 4.4033557001631475e-08, "loss": 0.5425, "step": 7761 }, { "epoch": 0.5529080742244542, "grad_norm": 3.9757320880889893, "learning_rate": 4.366611407851662e-08, "loss": 0.5081, "step": 7762 }, { "epoch": 0.5529793069060085, "grad_norm": 3.3606112003326416, "learning_rate": 4.3300207315190026e-08, "loss": 0.8734, "step": 7763 }, { "epoch": 0.5530505395875628, "grad_norm": 2.0940868854522705, "learning_rate": 4.293583676810653e-08, "loss": 0.2429, "step": 7764 }, { "epoch": 0.5531217722691171, "grad_norm": 3.6909210681915283, "learning_rate": 4.257300249348562e-08, "loss": 0.4496, "step": 7765 }, { "epoch": 0.5531930049506714, "grad_norm": 2.5279664993286133, "learning_rate": 4.221170454730916e-08, "loss": 0.4888, "step": 7766 }, { "epoch": 0.5532642376322257, "grad_norm": 4.310198783874512, "learning_rate": 4.185194298532147e-08, "loss": 0.5567, "step": 7767 }, { "epoch": 0.5533354703137799, "grad_norm": 3.0357422828674316, "learning_rate": 4.149371786302925e-08, "loss": 0.5255, "step": 7768 }, { "epoch": 0.5534067029953342, "grad_norm": 2.655999183654785, "learning_rate": 4.113702923570384e-08, "loss": 0.4891, "step": 7769 }, { "epoch": 0.5534779356768885, "grad_norm": 3.7333805561065674, "learning_rate": 4.0781877158377894e-08, "loss": 0.4704, "step": 7770 }, { "epoch": 0.5535491683584428, "grad_norm": 2.657094717025757, "learning_rate": 4.042826168584868e-08, "loss": 0.7902, "step": 7771 }, { "epoch": 0.5536204010399971, "grad_norm": 2.9379167556762695, "learning_rate": 4.0076182872674785e-08, "loss": 0.7793, "step": 7772 }, { "epoch": 0.5536916337215515, "grad_norm": 2.556596517562866, "learning_rate": 3.972564077317831e-08, "loss": 0.1514, "step": 7773 }, { "epoch": 0.5537628664031058, "grad_norm": 2.633413791656494, "learning_rate": 3.9376635441444874e-08, "loss": 0.3537, "step": 7774 }, { "epoch": 0.5538340990846601, "grad_norm": 2.41231632232666, "learning_rate": 3.9029166931322524e-08, "loss": 0.4693, "step": 7775 }, { "epoch": 0.5539053317662144, "grad_norm": 2.194342613220215, "learning_rate": 3.86832352964206e-08, "loss": 0.439, "step": 7776 }, { "epoch": 0.5539765644477687, "grad_norm": 2.508671522140503, "learning_rate": 3.833884059011417e-08, "loss": 0.3311, "step": 7777 }, { "epoch": 0.554047797129323, "grad_norm": 1.9792808294296265, "learning_rate": 3.7995982865539624e-08, "loss": 0.2876, "step": 7778 }, { "epoch": 0.5541190298108772, "grad_norm": 2.4841830730438232, "learning_rate": 3.765466217559577e-08, "loss": 0.3568, "step": 7779 }, { "epoch": 0.5541902624924315, "grad_norm": 2.383645534515381, "learning_rate": 3.731487857294491e-08, "loss": 0.2233, "step": 7780 }, { "epoch": 0.5542614951739858, "grad_norm": 4.596088409423828, "learning_rate": 3.69766321100129e-08, "loss": 0.5559, "step": 7781 }, { "epoch": 0.5543327278555401, "grad_norm": 2.4148917198181152, "learning_rate": 3.663992283898687e-08, "loss": 0.5524, "step": 7782 }, { "epoch": 0.5544039605370944, "grad_norm": 3.621654987335205, "learning_rate": 3.630475081181861e-08, "loss": 0.3623, "step": 7783 }, { "epoch": 0.5544751932186487, "grad_norm": 2.4172565937042236, "learning_rate": 3.597111608022119e-08, "loss": 0.2663, "step": 7784 }, { "epoch": 0.554546425900203, "grad_norm": 3.4446892738342285, "learning_rate": 3.56390186956701e-08, "loss": 0.6989, "step": 7785 }, { "epoch": 0.5546176585817573, "grad_norm": 6.504136085510254, "learning_rate": 3.530845870940658e-08, "loss": 0.9221, "step": 7786 }, { "epoch": 0.5546888912633116, "grad_norm": 3.1206417083740234, "learning_rate": 3.497943617242983e-08, "loss": 0.3524, "step": 7787 }, { "epoch": 0.554760123944866, "grad_norm": 2.409424304962158, "learning_rate": 3.465195113550701e-08, "loss": 0.2781, "step": 7788 }, { "epoch": 0.5548313566264202, "grad_norm": 2.401844024658203, "learning_rate": 3.43260036491655e-08, "loss": 0.4193, "step": 7789 }, { "epoch": 0.5549025893079745, "grad_norm": 4.367741107940674, "learning_rate": 3.400159376369394e-08, "loss": 0.7396, "step": 7790 }, { "epoch": 0.5549738219895288, "grad_norm": 3.2767157554626465, "learning_rate": 3.367872152914675e-08, "loss": 0.6997, "step": 7791 }, { "epoch": 0.5550450546710831, "grad_norm": 1.8757404088974, "learning_rate": 3.335738699533964e-08, "loss": 0.3576, "step": 7792 }, { "epoch": 0.5551162873526374, "grad_norm": 3.5769405364990234, "learning_rate": 3.3037590211851823e-08, "loss": 0.6443, "step": 7793 }, { "epoch": 0.5551875200341917, "grad_norm": 4.097972869873047, "learning_rate": 3.271933122802273e-08, "loss": 0.8519, "step": 7794 }, { "epoch": 0.555258752715746, "grad_norm": 2.78297758102417, "learning_rate": 3.240261009295864e-08, "loss": 0.6457, "step": 7795 }, { "epoch": 0.5553299853973003, "grad_norm": 3.5495595932006836, "learning_rate": 3.208742685552602e-08, "loss": 0.5758, "step": 7796 }, { "epoch": 0.5554012180788546, "grad_norm": 3.646533250808716, "learning_rate": 3.1773781564352625e-08, "loss": 0.2442, "step": 7797 }, { "epoch": 0.5554724507604089, "grad_norm": 3.039987087249756, "learning_rate": 3.146167426783198e-08, "loss": 0.4021, "step": 7798 }, { "epoch": 0.5555436834419631, "grad_norm": 1.5916635990142822, "learning_rate": 3.1151105014119995e-08, "loss": 0.0667, "step": 7799 }, { "epoch": 0.5556149161235174, "grad_norm": 3.4018850326538086, "learning_rate": 3.084207385113169e-08, "loss": 0.1219, "step": 7800 }, { "epoch": 0.5556861488050717, "grad_norm": 4.36669397354126, "learning_rate": 3.053458082655003e-08, "loss": 0.5838, "step": 7801 }, { "epoch": 0.555757381486626, "grad_norm": 3.58266544342041, "learning_rate": 3.0228625987817064e-08, "loss": 0.7276, "step": 7802 }, { "epoch": 0.5558286141681804, "grad_norm": 2.8137269020080566, "learning_rate": 2.992420938213725e-08, "loss": 0.2616, "step": 7803 }, { "epoch": 0.5558998468497347, "grad_norm": 3.408853530883789, "learning_rate": 2.9621331056480796e-08, "loss": 0.7183, "step": 7804 }, { "epoch": 0.555971079531289, "grad_norm": 4.086880207061768, "learning_rate": 2.931999105757699e-08, "loss": 0.6556, "step": 7805 }, { "epoch": 0.5560423122128433, "grad_norm": 6.911553859710693, "learning_rate": 2.9020189431920865e-08, "loss": 0.7934, "step": 7806 }, { "epoch": 0.5561135448943976, "grad_norm": 1.9814187288284302, "learning_rate": 2.8721926225768748e-08, "loss": 0.2379, "step": 7807 }, { "epoch": 0.5561847775759519, "grad_norm": 3.5593101978302, "learning_rate": 2.8425201485139387e-08, "loss": 0.6608, "step": 7808 }, { "epoch": 0.5562560102575062, "grad_norm": 3.1641995906829834, "learning_rate": 2.8130015255812827e-08, "loss": 0.5672, "step": 7809 }, { "epoch": 0.5563272429390604, "grad_norm": 2.3600001335144043, "learning_rate": 2.7836367583335967e-08, "loss": 0.2408, "step": 7810 }, { "epoch": 0.5563984756206147, "grad_norm": 3.8378312587738037, "learning_rate": 2.7544258513013678e-08, "loss": 0.6825, "step": 7811 }, { "epoch": 0.556469708302169, "grad_norm": 2.5330843925476074, "learning_rate": 2.7253688089915466e-08, "loss": 0.5055, "step": 7812 }, { "epoch": 0.5565409409837233, "grad_norm": 2.7759287357330322, "learning_rate": 2.6964656358874353e-08, "loss": 0.5065, "step": 7813 }, { "epoch": 0.5566121736652776, "grad_norm": 3.4949333667755127, "learning_rate": 2.667716336448356e-08, "loss": 0.772, "step": 7814 }, { "epoch": 0.5566834063468319, "grad_norm": 3.0370240211486816, "learning_rate": 2.639120915110094e-08, "loss": 0.577, "step": 7815 }, { "epoch": 0.5567546390283862, "grad_norm": 2.607733726501465, "learning_rate": 2.6106793762847858e-08, "loss": 0.2649, "step": 7816 }, { "epoch": 0.5568258717099405, "grad_norm": 2.2729737758636475, "learning_rate": 2.5823917243603668e-08, "loss": 0.5127, "step": 7817 }, { "epoch": 0.5568971043914949, "grad_norm": 1.8517160415649414, "learning_rate": 2.5542579637015675e-08, "loss": 0.1651, "step": 7818 }, { "epoch": 0.5569683370730492, "grad_norm": 2.8363287448883057, "learning_rate": 2.5262780986491375e-08, "loss": 0.6022, "step": 7819 }, { "epoch": 0.5570395697546034, "grad_norm": 2.3952596187591553, "learning_rate": 2.4984521335198464e-08, "loss": 0.329, "step": 7820 }, { "epoch": 0.5571108024361577, "grad_norm": 2.9925477504730225, "learning_rate": 2.4707800726072594e-08, "loss": 0.4395, "step": 7821 }, { "epoch": 0.557182035117712, "grad_norm": 4.030280113220215, "learning_rate": 2.4432619201806283e-08, "loss": 0.5369, "step": 7822 }, { "epoch": 0.5572532677992663, "grad_norm": 1.9779810905456543, "learning_rate": 2.4158976804858903e-08, "loss": 0.1748, "step": 7823 }, { "epoch": 0.5573245004808206, "grad_norm": 3.6897168159484863, "learning_rate": 2.3886873577450008e-08, "loss": 0.6052, "step": 7824 }, { "epoch": 0.5573957331623749, "grad_norm": 3.8620359897613525, "learning_rate": 2.3616309561562688e-08, "loss": 0.6433, "step": 7825 }, { "epoch": 0.5574669658439292, "grad_norm": 2.70617413520813, "learning_rate": 2.3347284798941327e-08, "loss": 0.4206, "step": 7826 }, { "epoch": 0.5575381985254835, "grad_norm": 2.2870960235595703, "learning_rate": 2.3079799331094943e-08, "loss": 0.2298, "step": 7827 }, { "epoch": 0.5576094312070378, "grad_norm": 3.416171073913574, "learning_rate": 2.2813853199292745e-08, "loss": 0.5651, "step": 7828 }, { "epoch": 0.557680663888592, "grad_norm": 3.269270658493042, "learning_rate": 2.2549446444567468e-08, "loss": 0.7144, "step": 7829 }, { "epoch": 0.5577518965701463, "grad_norm": 2.19840407371521, "learning_rate": 2.2286579107716476e-08, "loss": 0.2434, "step": 7830 }, { "epoch": 0.5578231292517006, "grad_norm": 5.390160083770752, "learning_rate": 2.2025251229293997e-08, "loss": 0.6837, "step": 7831 }, { "epoch": 0.5578943619332549, "grad_norm": 2.474555015563965, "learning_rate": 2.176546284962222e-08, "loss": 0.4239, "step": 7832 }, { "epoch": 0.5579655946148093, "grad_norm": 3.7006897926330566, "learning_rate": 2.1507214008783527e-08, "loss": 0.7575, "step": 7833 }, { "epoch": 0.5580368272963636, "grad_norm": 1.6301823854446411, "learning_rate": 2.1250504746623822e-08, "loss": 0.3262, "step": 7834 }, { "epoch": 0.5581080599779179, "grad_norm": 3.109536647796631, "learning_rate": 2.0995335102749204e-08, "loss": 0.5138, "step": 7835 }, { "epoch": 0.5581792926594722, "grad_norm": 4.641711711883545, "learning_rate": 2.0741705116531507e-08, "loss": 0.6849, "step": 7836 }, { "epoch": 0.5582505253410265, "grad_norm": 2.3549723625183105, "learning_rate": 2.0489614827101656e-08, "loss": 0.4653, "step": 7837 }, { "epoch": 0.5583217580225808, "grad_norm": 4.449515342712402, "learning_rate": 2.02390642733552e-08, "loss": 0.5267, "step": 7838 }, { "epoch": 0.5583929907041351, "grad_norm": 5.935319900512695, "learning_rate": 1.9990053493949003e-08, "loss": 0.3793, "step": 7839 }, { "epoch": 0.5584642233856894, "grad_norm": 3.1920862197875977, "learning_rate": 1.9742582527303433e-08, "loss": 0.4785, "step": 7840 }, { "epoch": 0.5585354560672436, "grad_norm": 3.86711049079895, "learning_rate": 1.9496651411601285e-08, "loss": 0.5262, "step": 7841 }, { "epoch": 0.5586066887487979, "grad_norm": 4.953291416168213, "learning_rate": 1.9252260184786652e-08, "loss": 0.3534, "step": 7842 }, { "epoch": 0.5586779214303522, "grad_norm": 3.625856876373291, "learning_rate": 1.900940888456604e-08, "loss": 0.5279, "step": 7843 }, { "epoch": 0.5587491541119065, "grad_norm": 3.552368640899658, "learning_rate": 1.876809754840836e-08, "loss": 0.4651, "step": 7844 }, { "epoch": 0.5588203867934608, "grad_norm": 3.3632590770721436, "learning_rate": 1.8528326213548276e-08, "loss": 0.4801, "step": 7845 }, { "epoch": 0.5588916194750151, "grad_norm": 4.509491443634033, "learning_rate": 1.829009491697731e-08, "loss": 0.596, "step": 7846 }, { "epoch": 0.5589628521565694, "grad_norm": 3.4641363620758057, "learning_rate": 1.805340369545272e-08, "loss": 0.7419, "step": 7847 }, { "epoch": 0.5590340848381238, "grad_norm": 2.8943030834198, "learning_rate": 1.781825258549419e-08, "loss": 0.3386, "step": 7848 }, { "epoch": 0.5591053175196781, "grad_norm": 1.8215324878692627, "learning_rate": 1.7584641623381583e-08, "loss": 0.2553, "step": 7849 }, { "epoch": 0.5591765502012324, "grad_norm": 2.773716926574707, "learning_rate": 1.735257084516051e-08, "loss": 0.5131, "step": 7850 }, { "epoch": 0.5592477828827866, "grad_norm": 2.9393818378448486, "learning_rate": 1.7122040286636775e-08, "loss": 0.3916, "step": 7851 }, { "epoch": 0.5593190155643409, "grad_norm": 3.097292184829712, "learning_rate": 1.6893049983378597e-08, "loss": 0.531, "step": 7852 }, { "epoch": 0.5593902482458952, "grad_norm": 3.4396767616271973, "learning_rate": 1.6665599970715484e-08, "loss": 0.3404, "step": 7853 }, { "epoch": 0.5594614809274495, "grad_norm": 2.856501579284668, "learning_rate": 1.6439690283742704e-08, "loss": 0.185, "step": 7854 }, { "epoch": 0.5595327136090038, "grad_norm": 3.564424753189087, "learning_rate": 1.6215320957315707e-08, "loss": 0.4715, "step": 7855 }, { "epoch": 0.5596039462905581, "grad_norm": 3.418421983718872, "learning_rate": 1.5992492026050134e-08, "loss": 0.7906, "step": 7856 }, { "epoch": 0.5596751789721124, "grad_norm": 2.9778218269348145, "learning_rate": 1.5771203524328483e-08, "loss": 0.8346, "step": 7857 }, { "epoch": 0.5597464116536667, "grad_norm": 5.799298286437988, "learning_rate": 1.5551455486292333e-08, "loss": 0.7985, "step": 7858 }, { "epoch": 0.559817644335221, "grad_norm": 3.4570984840393066, "learning_rate": 1.5333247945846787e-08, "loss": 0.5656, "step": 7859 }, { "epoch": 0.5598888770167753, "grad_norm": 2.910675048828125, "learning_rate": 1.5116580936658242e-08, "loss": 0.6535, "step": 7860 }, { "epoch": 0.5599601096983295, "grad_norm": 4.337214946746826, "learning_rate": 1.4901454492157742e-08, "loss": 0.3016, "step": 7861 }, { "epoch": 0.560031342379884, "grad_norm": 2.281318187713623, "learning_rate": 1.4687868645535398e-08, "loss": 0.3213, "step": 7862 }, { "epoch": 0.5601025750614382, "grad_norm": 4.482895851135254, "learning_rate": 1.4475823429747071e-08, "loss": 0.9231, "step": 7863 }, { "epoch": 0.5601738077429925, "grad_norm": 2.260748863220215, "learning_rate": 1.4265318877507705e-08, "loss": 0.4018, "step": 7864 }, { "epoch": 0.5602450404245468, "grad_norm": 3.5733699798583984, "learning_rate": 1.4056355021295764e-08, "loss": 0.6359, "step": 7865 }, { "epoch": 0.5603162731061011, "grad_norm": 3.6480746269226074, "learning_rate": 1.3848931893353235e-08, "loss": 0.6206, "step": 7866 }, { "epoch": 0.5603875057876554, "grad_norm": 3.7058770656585693, "learning_rate": 1.3643049525683405e-08, "loss": 0.6914, "step": 7867 }, { "epoch": 0.5604587384692097, "grad_norm": 1.1684201955795288, "learning_rate": 1.3438707950051978e-08, "loss": 0.0949, "step": 7868 }, { "epoch": 0.560529971150764, "grad_norm": 2.2283449172973633, "learning_rate": 1.3235907197984843e-08, "loss": 0.457, "step": 7869 }, { "epoch": 0.5606012038323183, "grad_norm": 2.3117194175720215, "learning_rate": 1.303464730077475e-08, "loss": 0.3426, "step": 7870 }, { "epoch": 0.5606724365138726, "grad_norm": 1.8663591146469116, "learning_rate": 1.2834928289472415e-08, "loss": 0.3022, "step": 7871 }, { "epoch": 0.5607436691954268, "grad_norm": 2.9436991214752197, "learning_rate": 1.2636750194892078e-08, "loss": 0.4457, "step": 7872 }, { "epoch": 0.5608149018769811, "grad_norm": 3.2899227142333984, "learning_rate": 1.2440113047611502e-08, "loss": 0.7867, "step": 7873 }, { "epoch": 0.5608861345585354, "grad_norm": 5.238700866699219, "learning_rate": 1.224501687796975e-08, "loss": 0.2937, "step": 7874 }, { "epoch": 0.5609573672400897, "grad_norm": 2.991050958633423, "learning_rate": 1.2051461716068302e-08, "loss": 0.5194, "step": 7875 }, { "epoch": 0.561028599921644, "grad_norm": 3.103086233139038, "learning_rate": 1.1859447591769934e-08, "loss": 0.5056, "step": 7876 }, { "epoch": 0.5610998326031984, "grad_norm": 3.016963481903076, "learning_rate": 1.166897453470095e-08, "loss": 0.4907, "step": 7877 }, { "epoch": 0.5611710652847527, "grad_norm": 2.692230224609375, "learning_rate": 1.148004257424895e-08, "loss": 0.074, "step": 7878 }, { "epoch": 0.561242297966307, "grad_norm": 2.8725013732910156, "learning_rate": 1.1292651739565063e-08, "loss": 0.5574, "step": 7879 }, { "epoch": 0.5613135306478613, "grad_norm": 2.308006525039673, "learning_rate": 1.1106802059560607e-08, "loss": 0.3635, "step": 7880 }, { "epoch": 0.5613847633294156, "grad_norm": 1.4389848709106445, "learning_rate": 1.092249356291042e-08, "loss": 0.1384, "step": 7881 }, { "epoch": 0.5614559960109698, "grad_norm": 2.9365224838256836, "learning_rate": 1.0739726278052864e-08, "loss": 0.5047, "step": 7882 }, { "epoch": 0.5615272286925241, "grad_norm": 2.1850132942199707, "learning_rate": 1.0558500233186498e-08, "loss": 0.4789, "step": 7883 }, { "epoch": 0.5615984613740784, "grad_norm": 1.9816335439682007, "learning_rate": 1.0378815456271174e-08, "loss": 0.1754, "step": 7884 }, { "epoch": 0.5616696940556327, "grad_norm": 3.438931465148926, "learning_rate": 1.0200671975031384e-08, "loss": 0.5693, "step": 7885 }, { "epoch": 0.561740926737187, "grad_norm": 2.6418631076812744, "learning_rate": 1.002406981695292e-08, "loss": 0.3812, "step": 7886 }, { "epoch": 0.5618121594187413, "grad_norm": 3.3097293376922607, "learning_rate": 9.849009009285093e-09, "loss": 0.6553, "step": 7887 }, { "epoch": 0.5618833921002956, "grad_norm": 3.8921782970428467, "learning_rate": 9.675489579035191e-09, "loss": 0.2395, "step": 7888 }, { "epoch": 0.5619546247818499, "grad_norm": 2.6477859020233154, "learning_rate": 9.503511552977351e-09, "loss": 0.2186, "step": 7889 }, { "epoch": 0.5620258574634042, "grad_norm": 4.376904487609863, "learning_rate": 9.333074957644795e-09, "loss": 0.5576, "step": 7890 }, { "epoch": 0.5620970901449585, "grad_norm": 1.917253017425537, "learning_rate": 9.164179819335373e-09, "loss": 0.1342, "step": 7891 }, { "epoch": 0.5621683228265129, "grad_norm": 5.0961809158325195, "learning_rate": 8.996826164107131e-09, "loss": 0.5576, "step": 7892 }, { "epoch": 0.5622395555080671, "grad_norm": 2.557985782623291, "learning_rate": 8.831014017780526e-09, "loss": 0.4102, "step": 7893 }, { "epoch": 0.5623107881896214, "grad_norm": 3.106353759765625, "learning_rate": 8.666743405940647e-09, "loss": 0.5179, "step": 7894 }, { "epoch": 0.5623820208711757, "grad_norm": 3.5430965423583984, "learning_rate": 8.504014353930557e-09, "loss": 0.2892, "step": 7895 }, { "epoch": 0.56245325355273, "grad_norm": 2.0382144451141357, "learning_rate": 8.342826886857946e-09, "loss": 0.2433, "step": 7896 }, { "epoch": 0.5625244862342843, "grad_norm": 4.944927215576172, "learning_rate": 8.183181029594034e-09, "loss": 0.1018, "step": 7897 }, { "epoch": 0.5625957189158386, "grad_norm": 1.908676028251648, "learning_rate": 8.025076806769117e-09, "loss": 0.2144, "step": 7898 }, { "epoch": 0.5626669515973929, "grad_norm": 2.8429338932037354, "learning_rate": 7.868514242777015e-09, "loss": 0.5674, "step": 7899 }, { "epoch": 0.5627381842789472, "grad_norm": 2.6234800815582275, "learning_rate": 7.71349336177507e-09, "loss": 0.3956, "step": 7900 }, { "epoch": 0.5628094169605015, "grad_norm": 4.225670337677002, "learning_rate": 7.56001418767971e-09, "loss": 0.2642, "step": 7901 }, { "epoch": 0.5628806496420558, "grad_norm": 7.393583297729492, "learning_rate": 7.408076744171988e-09, "loss": 0.3125, "step": 7902 }, { "epoch": 0.56295188232361, "grad_norm": 1.9143962860107422, "learning_rate": 7.257681054695375e-09, "loss": 0.1578, "step": 7903 }, { "epoch": 0.5630231150051643, "grad_norm": 1.8996244668960571, "learning_rate": 7.108827142452423e-09, "loss": 0.2655, "step": 7904 }, { "epoch": 0.5630943476867186, "grad_norm": 3.7336719036102295, "learning_rate": 6.961515030410315e-09, "loss": 0.7814, "step": 7905 }, { "epoch": 0.5631655803682729, "grad_norm": 3.1664786338806152, "learning_rate": 6.8157447412975365e-09, "loss": 0.3553, "step": 7906 }, { "epoch": 0.5632368130498273, "grad_norm": 2.322354316711426, "learning_rate": 6.671516297606095e-09, "loss": 0.257, "step": 7907 }, { "epoch": 0.5633080457313816, "grad_norm": 2.5657153129577637, "learning_rate": 6.528829721588193e-09, "loss": 0.3741, "step": 7908 }, { "epoch": 0.5633792784129359, "grad_norm": 3.148228883743286, "learning_rate": 6.38768503525955e-09, "loss": 0.2655, "step": 7909 }, { "epoch": 0.5634505110944902, "grad_norm": 3.1034390926361084, "learning_rate": 6.2480822603960825e-09, "loss": 0.4473, "step": 7910 }, { "epoch": 0.5635217437760445, "grad_norm": 2.6554789543151855, "learning_rate": 6.110021418538337e-09, "loss": 0.2909, "step": 7911 }, { "epoch": 0.5635929764575988, "grad_norm": 2.842952251434326, "learning_rate": 5.973502530987052e-09, "loss": 0.0688, "step": 7912 }, { "epoch": 0.563664209139153, "grad_norm": 3.781017303466797, "learning_rate": 5.83852561880538e-09, "loss": 0.434, "step": 7913 }, { "epoch": 0.5637354418207073, "grad_norm": 3.0704665184020996, "learning_rate": 5.705090702819993e-09, "loss": 0.4731, "step": 7914 }, { "epoch": 0.5638066745022616, "grad_norm": 5.238247394561768, "learning_rate": 5.573197803616648e-09, "loss": 0.4458, "step": 7915 }, { "epoch": 0.5638779071838159, "grad_norm": 3.1659021377563477, "learning_rate": 5.442846941546842e-09, "loss": 0.4403, "step": 7916 }, { "epoch": 0.5639491398653702, "grad_norm": 7.546285629272461, "learning_rate": 5.314038136722266e-09, "loss": 0.4764, "step": 7917 }, { "epoch": 0.5640203725469245, "grad_norm": 5.385149955749512, "learning_rate": 5.1867714090148016e-09, "loss": 0.355, "step": 7918 }, { "epoch": 0.5640916052284788, "grad_norm": 6.329132080078125, "learning_rate": 5.061046778063183e-09, "loss": 0.6702, "step": 7919 }, { "epoch": 0.5641628379100331, "grad_norm": 4.595382213592529, "learning_rate": 4.936864263264119e-09, "loss": 0.629, "step": 7920 }, { "epoch": 0.5642340705915874, "grad_norm": 2.268078565597534, "learning_rate": 4.814223883776725e-09, "loss": 0.4435, "step": 7921 }, { "epoch": 0.5643053032731418, "grad_norm": 3.7182199954986572, "learning_rate": 4.693125658524755e-09, "loss": 0.4694, "step": 7922 }, { "epoch": 0.5643765359546961, "grad_norm": 2.7569587230682373, "learning_rate": 4.573569606191042e-09, "loss": 0.1745, "step": 7923 }, { "epoch": 0.5644477686362503, "grad_norm": 3.0226950645446777, "learning_rate": 4.45555574522305e-09, "loss": 0.7063, "step": 7924 }, { "epoch": 0.5645190013178046, "grad_norm": 2.9079253673553467, "learning_rate": 4.339084093828438e-09, "loss": 0.4934, "step": 7925 }, { "epoch": 0.5645902339993589, "grad_norm": 3.0796058177948, "learning_rate": 4.224154669978386e-09, "loss": 0.5715, "step": 7926 }, { "epoch": 0.5646614666809132, "grad_norm": 2.3621673583984375, "learning_rate": 4.1107674914042665e-09, "loss": 0.2975, "step": 7927 }, { "epoch": 0.5647326993624675, "grad_norm": 3.077343463897705, "learning_rate": 3.998922575600972e-09, "loss": 0.3719, "step": 7928 }, { "epoch": 0.5648039320440218, "grad_norm": 1.8794455528259277, "learning_rate": 3.8886199398247005e-09, "loss": 0.1702, "step": 7929 }, { "epoch": 0.5648751647255761, "grad_norm": 2.394895076751709, "learning_rate": 3.77985960109517e-09, "loss": 0.3956, "step": 7930 }, { "epoch": 0.5649463974071304, "grad_norm": 2.755058765411377, "learning_rate": 3.6726415761911826e-09, "loss": 0.3779, "step": 7931 }, { "epoch": 0.5650176300886847, "grad_norm": 6.295623302459717, "learning_rate": 3.5669658816572803e-09, "loss": 0.6638, "step": 7932 }, { "epoch": 0.565088862770239, "grad_norm": 2.444293975830078, "learning_rate": 3.462832533795979e-09, "loss": 0.5655, "step": 7933 }, { "epoch": 0.5651600954517932, "grad_norm": 2.496925115585327, "learning_rate": 3.360241548676646e-09, "loss": 0.5918, "step": 7934 }, { "epoch": 0.5652313281333475, "grad_norm": 1.987681269645691, "learning_rate": 3.259192942125511e-09, "loss": 0.3597, "step": 7935 }, { "epoch": 0.5653025608149018, "grad_norm": 2.9125776290893555, "learning_rate": 3.1596867297345457e-09, "loss": 0.6287, "step": 7936 }, { "epoch": 0.5653737934964562, "grad_norm": 3.272914171218872, "learning_rate": 3.0617229268570248e-09, "loss": 0.3878, "step": 7937 }, { "epoch": 0.5654450261780105, "grad_norm": 1.9060794115066528, "learning_rate": 2.9653015486064143e-09, "loss": 0.2847, "step": 7938 }, { "epoch": 0.5655162588595648, "grad_norm": 4.053839683532715, "learning_rate": 2.8704226098597023e-09, "loss": 0.5769, "step": 7939 }, { "epoch": 0.5655874915411191, "grad_norm": 2.0018322467803955, "learning_rate": 2.7770861252574e-09, "loss": 0.0534, "step": 7940 }, { "epoch": 0.5656587242226734, "grad_norm": 2.508251667022705, "learning_rate": 2.6852921091991e-09, "loss": 0.391, "step": 7941 }, { "epoch": 0.5657299569042277, "grad_norm": 2.754751205444336, "learning_rate": 2.595040575846808e-09, "loss": 0.4051, "step": 7942 }, { "epoch": 0.565801189585782, "grad_norm": 3.6243176460266113, "learning_rate": 2.5063315391271605e-09, "loss": 0.8947, "step": 7943 }, { "epoch": 0.5658724222673363, "grad_norm": 2.8534584045410156, "learning_rate": 2.4191650127269873e-09, "loss": 0.4384, "step": 7944 }, { "epoch": 0.5659436549488905, "grad_norm": 3.389200448989868, "learning_rate": 2.3335410100933096e-09, "loss": 0.184, "step": 7945 }, { "epoch": 0.5660148876304448, "grad_norm": 2.6001739501953125, "learning_rate": 2.249459544438892e-09, "loss": 0.447, "step": 7946 }, { "epoch": 0.5660861203119991, "grad_norm": 3.8022382259368896, "learning_rate": 2.1669206287355803e-09, "loss": 0.6975, "step": 7947 }, { "epoch": 0.5661573529935534, "grad_norm": 3.3764431476593018, "learning_rate": 2.0859242757187425e-09, "loss": 0.5908, "step": 7948 }, { "epoch": 0.5662285856751077, "grad_norm": 2.39042592048645, "learning_rate": 2.006470497885049e-09, "loss": 0.7021, "step": 7949 }, { "epoch": 0.566299818356662, "grad_norm": 5.746589183807373, "learning_rate": 1.9285593074935826e-09, "loss": 0.6881, "step": 7950 }, { "epoch": 0.5663710510382164, "grad_norm": 3.009629964828491, "learning_rate": 1.8521907165658382e-09, "loss": 0.3901, "step": 7951 }, { "epoch": 0.5664422837197707, "grad_norm": 2.6721503734588623, "learning_rate": 1.7773647368835023e-09, "loss": 0.616, "step": 7952 }, { "epoch": 0.566513516401325, "grad_norm": 3.161980152130127, "learning_rate": 1.7040813799917844e-09, "loss": 0.2691, "step": 7953 }, { "epoch": 0.5665847490828793, "grad_norm": 2.9419503211975098, "learning_rate": 1.6323406571983058e-09, "loss": 0.5447, "step": 7954 }, { "epoch": 0.5666559817644335, "grad_norm": 3.552130699157715, "learning_rate": 1.56214257957088e-09, "loss": 0.7426, "step": 7955 }, { "epoch": 0.5667272144459878, "grad_norm": 2.4074392318725586, "learning_rate": 1.4934871579408428e-09, "loss": 0.2586, "step": 7956 }, { "epoch": 0.5667984471275421, "grad_norm": 1.9889326095581055, "learning_rate": 1.4263744029019422e-09, "loss": 0.2684, "step": 7957 }, { "epoch": 0.5668696798090964, "grad_norm": 3.276353120803833, "learning_rate": 1.360804324807008e-09, "loss": 0.3336, "step": 7958 }, { "epoch": 0.5669409124906507, "grad_norm": 2.4460644721984863, "learning_rate": 1.2967769337746128e-09, "loss": 0.4707, "step": 7959 }, { "epoch": 0.567012145172205, "grad_norm": 3.302131175994873, "learning_rate": 1.2342922396824108e-09, "loss": 0.5154, "step": 7960 }, { "epoch": 0.5670833778537593, "grad_norm": 2.811765193939209, "learning_rate": 1.173350252171579e-09, "loss": 0.5314, "step": 7961 }, { "epoch": 0.5671546105353136, "grad_norm": 3.1619880199432373, "learning_rate": 1.113950980645706e-09, "loss": 0.6523, "step": 7962 }, { "epoch": 0.5672258432168679, "grad_norm": 2.415459394454956, "learning_rate": 1.0560944342674627e-09, "loss": 0.3605, "step": 7963 }, { "epoch": 0.5672970758984222, "grad_norm": 2.3636345863342285, "learning_rate": 9.997806219652628e-10, "loss": 0.3121, "step": 7964 }, { "epoch": 0.5673683085799764, "grad_norm": 3.455606460571289, "learning_rate": 9.450095524266012e-10, "loss": 0.4613, "step": 7965 }, { "epoch": 0.5674395412615308, "grad_norm": 5.289156913757324, "learning_rate": 8.917812341024956e-10, "loss": 0.6403, "step": 7966 }, { "epoch": 0.5675107739430851, "grad_norm": 4.195044994354248, "learning_rate": 8.400956752063761e-10, "loss": 0.6795, "step": 7967 }, { "epoch": 0.5675820066246394, "grad_norm": 2.0360312461853027, "learning_rate": 7.899528837118642e-10, "loss": 0.2818, "step": 7968 }, { "epoch": 0.5676532393061937, "grad_norm": 2.180840253829956, "learning_rate": 7.413528673549941e-10, "loss": 0.2594, "step": 7969 }, { "epoch": 0.567724471987748, "grad_norm": 1.2556729316711426, "learning_rate": 6.942956336353224e-10, "loss": 0.1083, "step": 7970 }, { "epoch": 0.5677957046693023, "grad_norm": 3.0073678493499756, "learning_rate": 6.487811898137075e-10, "loss": 0.3127, "step": 7971 }, { "epoch": 0.5678669373508566, "grad_norm": 4.258357048034668, "learning_rate": 6.048095429111999e-10, "loss": 0.6505, "step": 7972 }, { "epoch": 0.5679381700324109, "grad_norm": 4.109939098358154, "learning_rate": 5.623806997123726e-10, "loss": 0.5887, "step": 7973 }, { "epoch": 0.5680094027139652, "grad_norm": 3.1260902881622314, "learning_rate": 5.214946667642106e-10, "loss": 0.5746, "step": 7974 }, { "epoch": 0.5680806353955195, "grad_norm": 5.254842758178711, "learning_rate": 4.821514503750013e-10, "loss": 0.9055, "step": 7975 }, { "epoch": 0.5681518680770737, "grad_norm": 4.475924015045166, "learning_rate": 4.4435105661433387e-10, "loss": 0.4265, "step": 7976 }, { "epoch": 0.568223100758628, "grad_norm": 3.3674325942993164, "learning_rate": 4.0809349131420984e-10, "loss": 0.6114, "step": 7977 }, { "epoch": 0.5682943334401823, "grad_norm": 1.4041287899017334, "learning_rate": 3.7337876007015325e-10, "loss": 0.0996, "step": 7978 }, { "epoch": 0.5683655661217366, "grad_norm": 2.6119141578674316, "learning_rate": 3.4020686823788007e-10, "loss": 0.5537, "step": 7979 }, { "epoch": 0.5684367988032909, "grad_norm": 2.903928518295288, "learning_rate": 3.0857782093440813e-10, "loss": 0.4918, "step": 7980 }, { "epoch": 0.5685080314848453, "grad_norm": 3.4136440753936768, "learning_rate": 2.784916230402779e-10, "loss": 0.5079, "step": 7981 }, { "epoch": 0.5685792641663996, "grad_norm": 2.227678060531616, "learning_rate": 2.49948279198442e-10, "loss": 0.3039, "step": 7982 }, { "epoch": 0.5686504968479539, "grad_norm": 1.7017629146575928, "learning_rate": 2.2294779381204502e-10, "loss": 0.3205, "step": 7983 }, { "epoch": 0.5687217295295082, "grad_norm": 2.216078996658325, "learning_rate": 1.974901710466437e-10, "loss": 0.4131, "step": 7984 }, { "epoch": 0.5687929622110625, "grad_norm": 2.8692374229431152, "learning_rate": 1.7357541483020712e-10, "loss": 0.5823, "step": 7985 }, { "epoch": 0.5688641948926167, "grad_norm": 4.2620649337768555, "learning_rate": 1.5120352885311663e-10, "loss": 0.3984, "step": 7986 }, { "epoch": 0.568935427574171, "grad_norm": 9.487737655639648, "learning_rate": 1.3037451656705558e-10, "loss": 0.2473, "step": 7987 }, { "epoch": 0.5690066602557253, "grad_norm": 2.8039162158966064, "learning_rate": 1.1108838118500942e-10, "loss": 0.4467, "step": 7988 }, { "epoch": 0.5690778929372796, "grad_norm": 2.002211570739746, "learning_rate": 9.334512568348608e-11, "loss": 0.2458, "step": 7989 }, { "epoch": 0.5691491256188339, "grad_norm": 8.200965881347656, "learning_rate": 7.714475279918531e-11, "loss": 0.4861, "step": 7990 }, { "epoch": 0.5692203583003882, "grad_norm": 2.4076390266418457, "learning_rate": 6.248726503232938e-11, "loss": 0.1563, "step": 7991 }, { "epoch": 0.5692915909819425, "grad_norm": 2.7991442680358887, "learning_rate": 4.937266464444257e-11, "loss": 0.5219, "step": 7992 }, { "epoch": 0.5693628236634968, "grad_norm": 2.8955795764923096, "learning_rate": 3.7800953658351236e-11, "loss": 0.2338, "step": 7993 }, { "epoch": 0.5694340563450511, "grad_norm": 3.1865525245666504, "learning_rate": 2.7772133860404227e-11, "loss": 0.4248, "step": 7994 }, { "epoch": 0.5695052890266054, "grad_norm": 2.644304037094116, "learning_rate": 1.9286206797142214e-11, "loss": 0.2147, "step": 7995 }, { "epoch": 0.5695765217081598, "grad_norm": 2.9231414794921875, "learning_rate": 1.234317377862837e-11, "loss": 0.2525, "step": 7996 }, { "epoch": 0.569647754389714, "grad_norm": 6.312331199645996, "learning_rate": 6.943035875117688e-12, "loss": 0.2932, "step": 7997 }, { "epoch": 0.5697189870712683, "grad_norm": 6.431723594665527, "learning_rate": 3.0857939203876584e-12, "loss": 0.7688, "step": 7998 }, { "epoch": 0.5697902197528226, "grad_norm": 5.2601776123046875, "learning_rate": 7.714485095178248e-13, "loss": 0.6731, "step": 7999 }, { "epoch": 0.5698614524343769, "grad_norm": 4.535676002502441, "learning_rate": 0.0, "loss": 0.1902, "step": 8000 }, { "epoch": 0.5698614524343769, "eval_loss": 0.4367330074310303, "eval_runtime": 897.3473, "eval_samples_per_second": 1.663, "eval_steps_per_second": 1.663, "step": 8000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.644784301113344e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }