{ "best_metric": 0.06457369029521942, "best_model_checkpoint": "./phishing-email_sender-detection/checkpoint-2734", "epoch": 2.0, "eval_steps": 1, "global_step": 2734, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000731528895391368, "grad_norm": 2.0537707805633545, "learning_rate": 1.0000000000000002e-06, "loss": 0.7134, "step": 1 }, { "epoch": 0.001463057790782736, "grad_norm": 0.6965876221656799, "learning_rate": 2.0000000000000003e-06, "loss": 0.7055, "step": 2 }, { "epoch": 0.0021945866861741038, "grad_norm": 2.5094313621520996, "learning_rate": 3e-06, "loss": 0.6741, "step": 3 }, { "epoch": 0.002926115581565472, "grad_norm": 1.8208039999008179, "learning_rate": 4.000000000000001e-06, "loss": 0.7129, "step": 4 }, { "epoch": 0.0036576444769568397, "grad_norm": 1.0614643096923828, "learning_rate": 5e-06, "loss": 0.6947, "step": 5 }, { "epoch": 0.0043891733723482075, "grad_norm": 0.7732250094413757, "learning_rate": 6e-06, "loss": 0.6849, "step": 6 }, { "epoch": 0.005120702267739576, "grad_norm": 0.5648068189620972, "learning_rate": 7.000000000000001e-06, "loss": 0.6935, "step": 7 }, { "epoch": 0.005852231163130944, "grad_norm": 1.847030758857727, "learning_rate": 8.000000000000001e-06, "loss": 0.7088, "step": 8 }, { "epoch": 0.006583760058522311, "grad_norm": 1.1081476211547852, "learning_rate": 9e-06, "loss": 0.6985, "step": 9 }, { "epoch": 0.0073152889539136795, "grad_norm": 2.4518747329711914, "learning_rate": 1e-05, "loss": 0.6815, "step": 10 }, { "epoch": 0.008046817849305048, "grad_norm": 0.8495326638221741, "learning_rate": 1.1000000000000001e-05, "loss": 0.6913, "step": 11 }, { "epoch": 0.008778346744696415, "grad_norm": 1.3938794136047363, "learning_rate": 1.2e-05, "loss": 0.7089, "step": 12 }, { "epoch": 0.009509875640087784, "grad_norm": 1.0653553009033203, "learning_rate": 1.3000000000000001e-05, "loss": 0.6965, "step": 13 }, { "epoch": 0.010241404535479151, "grad_norm": 0.568507969379425, "learning_rate": 1.4000000000000001e-05, "loss": 0.6924, "step": 14 }, { "epoch": 0.010972933430870519, "grad_norm": 1.7980329990386963, "learning_rate": 1.5e-05, "loss": 0.7087, "step": 15 }, { "epoch": 0.011704462326261888, "grad_norm": 0.6387674808502197, "learning_rate": 1.6000000000000003e-05, "loss": 0.6923, "step": 16 }, { "epoch": 0.012435991221653255, "grad_norm": 0.9484112858772278, "learning_rate": 1.7000000000000003e-05, "loss": 0.6745, "step": 17 }, { "epoch": 0.013167520117044623, "grad_norm": 1.246181845664978, "learning_rate": 1.8e-05, "loss": 0.6774, "step": 18 }, { "epoch": 0.013899049012435992, "grad_norm": 1.0500237941741943, "learning_rate": 1.9e-05, "loss": 0.6824, "step": 19 }, { "epoch": 0.014630577907827359, "grad_norm": 1.0836827754974365, "learning_rate": 2e-05, "loss": 0.7019, "step": 20 }, { "epoch": 0.015362106803218726, "grad_norm": 0.9788469672203064, "learning_rate": 2.1e-05, "loss": 0.686, "step": 21 }, { "epoch": 0.016093635698610095, "grad_norm": 1.1970380544662476, "learning_rate": 2.2000000000000003e-05, "loss": 0.6772, "step": 22 }, { "epoch": 0.016825164594001463, "grad_norm": 1.2800047397613525, "learning_rate": 2.3000000000000003e-05, "loss": 0.6639, "step": 23 }, { "epoch": 0.01755669348939283, "grad_norm": 7.34714937210083, "learning_rate": 2.4e-05, "loss": 0.7496, "step": 24 }, { "epoch": 0.018288222384784197, "grad_norm": 2.5049424171447754, "learning_rate": 2.5e-05, "loss": 0.6495, "step": 25 }, { "epoch": 0.019019751280175568, "grad_norm": 1.583709716796875, "learning_rate": 2.6000000000000002e-05, "loss": 0.6465, "step": 26 }, { "epoch": 0.019751280175566936, "grad_norm": 3.135451316833496, "learning_rate": 2.7000000000000002e-05, "loss": 0.6697, "step": 27 }, { "epoch": 0.020482809070958303, "grad_norm": 2.220353364944458, "learning_rate": 2.8000000000000003e-05, "loss": 0.6101, "step": 28 }, { "epoch": 0.02121433796634967, "grad_norm": 5.0612640380859375, "learning_rate": 2.9e-05, "loss": 0.6399, "step": 29 }, { "epoch": 0.021945866861741038, "grad_norm": 4.573606014251709, "learning_rate": 3e-05, "loss": 0.6372, "step": 30 }, { "epoch": 0.02267739575713241, "grad_norm": 13.581506729125977, "learning_rate": 3.1e-05, "loss": 0.634, "step": 31 }, { "epoch": 0.023408924652523776, "grad_norm": 9.19709300994873, "learning_rate": 3.2000000000000005e-05, "loss": 0.5199, "step": 32 }, { "epoch": 0.024140453547915143, "grad_norm": 21.063003540039062, "learning_rate": 3.3e-05, "loss": 0.5879, "step": 33 }, { "epoch": 0.02487198244330651, "grad_norm": 30.006093978881836, "learning_rate": 3.4000000000000007e-05, "loss": 0.6509, "step": 34 }, { "epoch": 0.025603511338697878, "grad_norm": 11.763099670410156, "learning_rate": 3.5e-05, "loss": 0.5285, "step": 35 }, { "epoch": 0.026335040234089245, "grad_norm": 17.88917350769043, "learning_rate": 3.6e-05, "loss": 0.4909, "step": 36 }, { "epoch": 0.027066569129480616, "grad_norm": 12.684080123901367, "learning_rate": 3.7e-05, "loss": 0.3255, "step": 37 }, { "epoch": 0.027798098024871983, "grad_norm": 9.809749603271484, "learning_rate": 3.8e-05, "loss": 0.2877, "step": 38 }, { "epoch": 0.02852962692026335, "grad_norm": 8.171330451965332, "learning_rate": 3.9000000000000006e-05, "loss": 0.3169, "step": 39 }, { "epoch": 0.029261155815654718, "grad_norm": 11.782173156738281, "learning_rate": 4e-05, "loss": 0.4555, "step": 40 }, { "epoch": 0.029992684711046085, "grad_norm": 54.60282516479492, "learning_rate": 4.1e-05, "loss": 0.6452, "step": 41 }, { "epoch": 0.030724213606437453, "grad_norm": 12.34322738647461, "learning_rate": 4.2e-05, "loss": 0.2935, "step": 42 }, { "epoch": 0.03145574250182882, "grad_norm": 36.28764343261719, "learning_rate": 4.3e-05, "loss": 0.2555, "step": 43 }, { "epoch": 0.03218727139722019, "grad_norm": 21.328907012939453, "learning_rate": 4.4000000000000006e-05, "loss": 0.28, "step": 44 }, { "epoch": 0.03291880029261156, "grad_norm": 9.044025421142578, "learning_rate": 4.5e-05, "loss": 0.4389, "step": 45 }, { "epoch": 0.033650329188002925, "grad_norm": 10.441105842590332, "learning_rate": 4.600000000000001e-05, "loss": 0.4949, "step": 46 }, { "epoch": 0.03438185808339429, "grad_norm": 17.071596145629883, "learning_rate": 4.7e-05, "loss": 0.2833, "step": 47 }, { "epoch": 0.03511338697878566, "grad_norm": 14.12200927734375, "learning_rate": 4.8e-05, "loss": 0.2754, "step": 48 }, { "epoch": 0.03584491587417703, "grad_norm": 16.147666931152344, "learning_rate": 4.9e-05, "loss": 0.3038, "step": 49 }, { "epoch": 0.036576444769568395, "grad_norm": 19.590967178344727, "learning_rate": 5e-05, "loss": 0.2744, "step": 50 }, { "epoch": 0.03730797366495977, "grad_norm": 7.307535648345947, "learning_rate": 4.998765736855098e-05, "loss": 0.2622, "step": 51 }, { "epoch": 0.038039502560351136, "grad_norm": 24.6008243560791, "learning_rate": 4.997531473710195e-05, "loss": 0.3354, "step": 52 }, { "epoch": 0.038771031455742504, "grad_norm": 20.546419143676758, "learning_rate": 4.9962972105652926e-05, "loss": 0.3201, "step": 53 }, { "epoch": 0.03950256035113387, "grad_norm": 16.79451560974121, "learning_rate": 4.99506294742039e-05, "loss": 0.3841, "step": 54 }, { "epoch": 0.04023408924652524, "grad_norm": 10.07118034362793, "learning_rate": 4.993828684275488e-05, "loss": 0.2322, "step": 55 }, { "epoch": 0.040965618141916606, "grad_norm": 6.132495403289795, "learning_rate": 4.992594421130585e-05, "loss": 0.1937, "step": 56 }, { "epoch": 0.04169714703730797, "grad_norm": 37.36897277832031, "learning_rate": 4.9913601579856826e-05, "loss": 0.4, "step": 57 }, { "epoch": 0.04242867593269934, "grad_norm": 9.6050386428833, "learning_rate": 4.99012589484078e-05, "loss": 0.1375, "step": 58 }, { "epoch": 0.04316020482809071, "grad_norm": 44.04307556152344, "learning_rate": 4.988891631695878e-05, "loss": 0.3884, "step": 59 }, { "epoch": 0.043891733723482075, "grad_norm": 6.768269062042236, "learning_rate": 4.987657368550975e-05, "loss": 0.0424, "step": 60 }, { "epoch": 0.04462326261887344, "grad_norm": 25.444942474365234, "learning_rate": 4.986423105406073e-05, "loss": 0.1089, "step": 61 }, { "epoch": 0.04535479151426482, "grad_norm": 38.78431701660156, "learning_rate": 4.9851888422611704e-05, "loss": 0.2247, "step": 62 }, { "epoch": 0.046086320409656184, "grad_norm": 29.26559829711914, "learning_rate": 4.983954579116268e-05, "loss": 0.2094, "step": 63 }, { "epoch": 0.04681784930504755, "grad_norm": 8.007121086120605, "learning_rate": 4.982720315971365e-05, "loss": 0.257, "step": 64 }, { "epoch": 0.04754937820043892, "grad_norm": 14.513443946838379, "learning_rate": 4.981486052826463e-05, "loss": 0.1479, "step": 65 }, { "epoch": 0.048280907095830286, "grad_norm": 8.328230857849121, "learning_rate": 4.9802517896815604e-05, "loss": 0.0645, "step": 66 }, { "epoch": 0.04901243599122165, "grad_norm": 27.02349281311035, "learning_rate": 4.979017526536658e-05, "loss": 0.354, "step": 67 }, { "epoch": 0.04974396488661302, "grad_norm": 35.4229621887207, "learning_rate": 4.977783263391755e-05, "loss": 0.6023, "step": 68 }, { "epoch": 0.05047549378200439, "grad_norm": 14.835311889648438, "learning_rate": 4.976549000246853e-05, "loss": 0.4632, "step": 69 }, { "epoch": 0.051207022677395755, "grad_norm": 30.72248077392578, "learning_rate": 4.9753147371019504e-05, "loss": 0.3481, "step": 70 }, { "epoch": 0.05193855157278712, "grad_norm": 6.599341869354248, "learning_rate": 4.974080473957048e-05, "loss": 0.1833, "step": 71 }, { "epoch": 0.05267008046817849, "grad_norm": 12.653827667236328, "learning_rate": 4.972846210812145e-05, "loss": 0.1525, "step": 72 }, { "epoch": 0.053401609363569864, "grad_norm": 14.4039945602417, "learning_rate": 4.971611947667243e-05, "loss": 0.3468, "step": 73 }, { "epoch": 0.05413313825896123, "grad_norm": 8.219855308532715, "learning_rate": 4.9703776845223405e-05, "loss": 0.151, "step": 74 }, { "epoch": 0.0548646671543526, "grad_norm": 8.978411674499512, "learning_rate": 4.969143421377438e-05, "loss": 0.2622, "step": 75 }, { "epoch": 0.055596196049743966, "grad_norm": 14.019556999206543, "learning_rate": 4.967909158232535e-05, "loss": 0.3429, "step": 76 }, { "epoch": 0.056327724945135334, "grad_norm": 26.526100158691406, "learning_rate": 4.966674895087633e-05, "loss": 0.1705, "step": 77 }, { "epoch": 0.0570592538405267, "grad_norm": 16.27678871154785, "learning_rate": 4.9654406319427305e-05, "loss": 0.2471, "step": 78 }, { "epoch": 0.05779078273591807, "grad_norm": 7.492588043212891, "learning_rate": 4.964206368797828e-05, "loss": 0.1573, "step": 79 }, { "epoch": 0.058522311631309436, "grad_norm": 10.001906394958496, "learning_rate": 4.962972105652925e-05, "loss": 0.1296, "step": 80 }, { "epoch": 0.0592538405267008, "grad_norm": 25.81623077392578, "learning_rate": 4.961737842508023e-05, "loss": 0.144, "step": 81 }, { "epoch": 0.05998536942209217, "grad_norm": 10.74669361114502, "learning_rate": 4.9605035793631206e-05, "loss": 0.0786, "step": 82 }, { "epoch": 0.06071689831748354, "grad_norm": 8.346054077148438, "learning_rate": 4.959269316218218e-05, "loss": 0.1613, "step": 83 }, { "epoch": 0.061448427212874905, "grad_norm": 45.951324462890625, "learning_rate": 4.958035053073315e-05, "loss": 0.6456, "step": 84 }, { "epoch": 0.06217995610826628, "grad_norm": 14.57592487335205, "learning_rate": 4.956800789928413e-05, "loss": 0.2145, "step": 85 }, { "epoch": 0.06291148500365765, "grad_norm": 5.5634965896606445, "learning_rate": 4.9555665267835106e-05, "loss": 0.1315, "step": 86 }, { "epoch": 0.06364301389904901, "grad_norm": 82.94335174560547, "learning_rate": 4.954332263638608e-05, "loss": 0.7881, "step": 87 }, { "epoch": 0.06437454279444038, "grad_norm": 59.09476089477539, "learning_rate": 4.953098000493705e-05, "loss": 1.2367, "step": 88 }, { "epoch": 0.06510607168983175, "grad_norm": 73.15406036376953, "learning_rate": 4.951863737348803e-05, "loss": 1.117, "step": 89 }, { "epoch": 0.06583760058522312, "grad_norm": 45.18766784667969, "learning_rate": 4.9506294742039006e-05, "loss": 0.5958, "step": 90 }, { "epoch": 0.06656912948061448, "grad_norm": 6.104206085205078, "learning_rate": 4.949395211058998e-05, "loss": 0.2602, "step": 91 }, { "epoch": 0.06730065837600585, "grad_norm": 21.383394241333008, "learning_rate": 4.948160947914095e-05, "loss": 0.3353, "step": 92 }, { "epoch": 0.06803218727139722, "grad_norm": 30.0102481842041, "learning_rate": 4.946926684769193e-05, "loss": 0.5275, "step": 93 }, { "epoch": 0.06876371616678859, "grad_norm": 25.439563751220703, "learning_rate": 4.945692421624291e-05, "loss": 0.5606, "step": 94 }, { "epoch": 0.06949524506217995, "grad_norm": 9.922821044921875, "learning_rate": 4.9444581584793883e-05, "loss": 0.2766, "step": 95 }, { "epoch": 0.07022677395757132, "grad_norm": 3.7160801887512207, "learning_rate": 4.9432238953344854e-05, "loss": 0.1596, "step": 96 }, { "epoch": 0.07095830285296269, "grad_norm": 12.51370906829834, "learning_rate": 4.941989632189583e-05, "loss": 0.2305, "step": 97 }, { "epoch": 0.07168983174835405, "grad_norm": 10.646834373474121, "learning_rate": 4.940755369044681e-05, "loss": 0.1399, "step": 98 }, { "epoch": 0.07242136064374542, "grad_norm": 7.5731096267700195, "learning_rate": 4.9395211058997784e-05, "loss": 0.0881, "step": 99 }, { "epoch": 0.07315288953913679, "grad_norm": 7.635189533233643, "learning_rate": 4.9382868427548754e-05, "loss": 0.1383, "step": 100 }, { "epoch": 0.07388441843452817, "grad_norm": 27.70612144470215, "learning_rate": 4.937052579609973e-05, "loss": 0.3705, "step": 101 }, { "epoch": 0.07461594732991954, "grad_norm": 5.46030855178833, "learning_rate": 4.935818316465071e-05, "loss": 0.0391, "step": 102 }, { "epoch": 0.0753474762253109, "grad_norm": 17.61092185974121, "learning_rate": 4.9345840533201684e-05, "loss": 0.631, "step": 103 }, { "epoch": 0.07607900512070227, "grad_norm": 16.299503326416016, "learning_rate": 4.9333497901752654e-05, "loss": 0.1832, "step": 104 }, { "epoch": 0.07681053401609364, "grad_norm": 16.016836166381836, "learning_rate": 4.932115527030363e-05, "loss": 0.1731, "step": 105 }, { "epoch": 0.07754206291148501, "grad_norm": 27.381195068359375, "learning_rate": 4.930881263885461e-05, "loss": 0.205, "step": 106 }, { "epoch": 0.07827359180687637, "grad_norm": 5.96188497543335, "learning_rate": 4.9296470007405585e-05, "loss": 0.1942, "step": 107 }, { "epoch": 0.07900512070226774, "grad_norm": 9.40506649017334, "learning_rate": 4.9284127375956555e-05, "loss": 0.3032, "step": 108 }, { "epoch": 0.07973664959765911, "grad_norm": 8.216382026672363, "learning_rate": 4.927178474450753e-05, "loss": 0.274, "step": 109 }, { "epoch": 0.08046817849305048, "grad_norm": 19.86112403869629, "learning_rate": 4.925944211305851e-05, "loss": 0.366, "step": 110 }, { "epoch": 0.08119970738844184, "grad_norm": 17.23086929321289, "learning_rate": 4.9247099481609485e-05, "loss": 0.3444, "step": 111 }, { "epoch": 0.08193123628383321, "grad_norm": 10.441373825073242, "learning_rate": 4.9234756850160455e-05, "loss": 0.305, "step": 112 }, { "epoch": 0.08266276517922458, "grad_norm": 13.03131103515625, "learning_rate": 4.922241421871143e-05, "loss": 0.2535, "step": 113 }, { "epoch": 0.08339429407461595, "grad_norm": 28.761531829833984, "learning_rate": 4.921007158726241e-05, "loss": 0.4498, "step": 114 }, { "epoch": 0.08412582297000731, "grad_norm": 15.969609260559082, "learning_rate": 4.9197728955813385e-05, "loss": 0.2765, "step": 115 }, { "epoch": 0.08485735186539868, "grad_norm": 10.151286125183105, "learning_rate": 4.9185386324364355e-05, "loss": 0.2975, "step": 116 }, { "epoch": 0.08558888076079005, "grad_norm": 5.0157084465026855, "learning_rate": 4.917304369291533e-05, "loss": 0.2164, "step": 117 }, { "epoch": 0.08632040965618142, "grad_norm": 6.630679607391357, "learning_rate": 4.916070106146631e-05, "loss": 0.2373, "step": 118 }, { "epoch": 0.08705193855157278, "grad_norm": 4.453832149505615, "learning_rate": 4.9148358430017286e-05, "loss": 0.2486, "step": 119 }, { "epoch": 0.08778346744696415, "grad_norm": 5.8336944580078125, "learning_rate": 4.9136015798568256e-05, "loss": 0.2642, "step": 120 }, { "epoch": 0.08851499634235552, "grad_norm": 4.0966315269470215, "learning_rate": 4.912367316711923e-05, "loss": 0.2279, "step": 121 }, { "epoch": 0.08924652523774688, "grad_norm": 1.7203295230865479, "learning_rate": 4.911133053567021e-05, "loss": 0.0467, "step": 122 }, { "epoch": 0.08997805413313825, "grad_norm": 12.75731086730957, "learning_rate": 4.9098987904221186e-05, "loss": 0.3042, "step": 123 }, { "epoch": 0.09070958302852963, "grad_norm": 3.464693307876587, "learning_rate": 4.9086645272772156e-05, "loss": 0.0267, "step": 124 }, { "epoch": 0.091441111923921, "grad_norm": 21.937667846679688, "learning_rate": 4.907430264132313e-05, "loss": 0.2223, "step": 125 }, { "epoch": 0.09217264081931237, "grad_norm": 22.564531326293945, "learning_rate": 4.906196000987411e-05, "loss": 0.2796, "step": 126 }, { "epoch": 0.09290416971470374, "grad_norm": 9.286537170410156, "learning_rate": 4.904961737842509e-05, "loss": 0.3039, "step": 127 }, { "epoch": 0.0936356986100951, "grad_norm": 8.94360065460205, "learning_rate": 4.903727474697606e-05, "loss": 0.2151, "step": 128 }, { "epoch": 0.09436722750548647, "grad_norm": 18.137731552124023, "learning_rate": 4.9024932115527033e-05, "loss": 0.353, "step": 129 }, { "epoch": 0.09509875640087784, "grad_norm": 84.7243881225586, "learning_rate": 4.901258948407801e-05, "loss": 0.5309, "step": 130 }, { "epoch": 0.0958302852962692, "grad_norm": 36.38287353515625, "learning_rate": 4.900024685262899e-05, "loss": 0.4783, "step": 131 }, { "epoch": 0.09656181419166057, "grad_norm": 29.418577194213867, "learning_rate": 4.898790422117996e-05, "loss": 0.056, "step": 132 }, { "epoch": 0.09729334308705194, "grad_norm": 2.9273157119750977, "learning_rate": 4.8975561589730934e-05, "loss": 0.2665, "step": 133 }, { "epoch": 0.0980248719824433, "grad_norm": 20.61284828186035, "learning_rate": 4.896321895828191e-05, "loss": 0.4083, "step": 134 }, { "epoch": 0.09875640087783467, "grad_norm": 46.84558868408203, "learning_rate": 4.895087632683289e-05, "loss": 0.3958, "step": 135 }, { "epoch": 0.09948792977322604, "grad_norm": 6.488471031188965, "learning_rate": 4.893853369538386e-05, "loss": 0.4875, "step": 136 }, { "epoch": 0.10021945866861741, "grad_norm": 9.760618209838867, "learning_rate": 4.8926191063934834e-05, "loss": 0.3268, "step": 137 }, { "epoch": 0.10095098756400878, "grad_norm": 4.168111324310303, "learning_rate": 4.891384843248581e-05, "loss": 0.3074, "step": 138 }, { "epoch": 0.10168251645940014, "grad_norm": 7.366902828216553, "learning_rate": 4.890150580103678e-05, "loss": 0.2583, "step": 139 }, { "epoch": 0.10241404535479151, "grad_norm": 6.250606536865234, "learning_rate": 4.888916316958776e-05, "loss": 0.2241, "step": 140 }, { "epoch": 0.10314557425018288, "grad_norm": 29.720741271972656, "learning_rate": 4.8876820538138735e-05, "loss": 0.4047, "step": 141 }, { "epoch": 0.10387710314557425, "grad_norm": 23.65095329284668, "learning_rate": 4.886447790668971e-05, "loss": 0.517, "step": 142 }, { "epoch": 0.10460863204096561, "grad_norm": 7.486756801605225, "learning_rate": 4.885213527524068e-05, "loss": 0.1839, "step": 143 }, { "epoch": 0.10534016093635698, "grad_norm": 3.6147615909576416, "learning_rate": 4.883979264379166e-05, "loss": 0.2103, "step": 144 }, { "epoch": 0.10607168983174835, "grad_norm": 14.521058082580566, "learning_rate": 4.8827450012342635e-05, "loss": 0.4051, "step": 145 }, { "epoch": 0.10680321872713973, "grad_norm": 8.3184814453125, "learning_rate": 4.881510738089361e-05, "loss": 0.1985, "step": 146 }, { "epoch": 0.1075347476225311, "grad_norm": 1.5090686082839966, "learning_rate": 4.880276474944458e-05, "loss": 0.0346, "step": 147 }, { "epoch": 0.10826627651792246, "grad_norm": 5.222468376159668, "learning_rate": 4.879042211799556e-05, "loss": 0.0368, "step": 148 }, { "epoch": 0.10899780541331383, "grad_norm": 13.494483947753906, "learning_rate": 4.8778079486546535e-05, "loss": 0.1878, "step": 149 }, { "epoch": 0.1097293343087052, "grad_norm": 10.366374969482422, "learning_rate": 4.876573685509751e-05, "loss": 0.3375, "step": 150 }, { "epoch": 0.11046086320409657, "grad_norm": 23.43581771850586, "learning_rate": 4.875339422364848e-05, "loss": 0.346, "step": 151 }, { "epoch": 0.11119239209948793, "grad_norm": 60.233028411865234, "learning_rate": 4.874105159219946e-05, "loss": 0.4402, "step": 152 }, { "epoch": 0.1119239209948793, "grad_norm": 11.637187957763672, "learning_rate": 4.8728708960750436e-05, "loss": 0.097, "step": 153 }, { "epoch": 0.11265544989027067, "grad_norm": 38.89413070678711, "learning_rate": 4.871636632930141e-05, "loss": 0.2134, "step": 154 }, { "epoch": 0.11338697878566203, "grad_norm": 7.130136966705322, "learning_rate": 4.870402369785238e-05, "loss": 0.2345, "step": 155 }, { "epoch": 0.1141185076810534, "grad_norm": 4.393019199371338, "learning_rate": 4.869168106640336e-05, "loss": 0.4511, "step": 156 }, { "epoch": 0.11485003657644477, "grad_norm": 3.5831000804901123, "learning_rate": 4.8679338434954336e-05, "loss": 0.3546, "step": 157 }, { "epoch": 0.11558156547183614, "grad_norm": 8.565537452697754, "learning_rate": 4.866699580350531e-05, "loss": 0.2307, "step": 158 }, { "epoch": 0.1163130943672275, "grad_norm": 3.342007875442505, "learning_rate": 4.865465317205628e-05, "loss": 0.163, "step": 159 }, { "epoch": 0.11704462326261887, "grad_norm": 7.106039524078369, "learning_rate": 4.864231054060726e-05, "loss": 0.2171, "step": 160 }, { "epoch": 0.11777615215801024, "grad_norm": 2.9548141956329346, "learning_rate": 4.8629967909158237e-05, "loss": 0.1061, "step": 161 }, { "epoch": 0.1185076810534016, "grad_norm": 32.10722351074219, "learning_rate": 4.861762527770921e-05, "loss": 0.2932, "step": 162 }, { "epoch": 0.11923920994879297, "grad_norm": 18.315641403198242, "learning_rate": 4.860528264626018e-05, "loss": 0.1823, "step": 163 }, { "epoch": 0.11997073884418434, "grad_norm": 0.24524831771850586, "learning_rate": 4.859294001481116e-05, "loss": 0.0083, "step": 164 }, { "epoch": 0.12070226773957571, "grad_norm": 14.872994422912598, "learning_rate": 4.858059738336214e-05, "loss": 0.5164, "step": 165 }, { "epoch": 0.12143379663496708, "grad_norm": 20.988685607910156, "learning_rate": 4.8568254751913114e-05, "loss": 0.4102, "step": 166 }, { "epoch": 0.12216532553035844, "grad_norm": 9.56988525390625, "learning_rate": 4.8555912120464084e-05, "loss": 0.0889, "step": 167 }, { "epoch": 0.12289685442574981, "grad_norm": 33.11470413208008, "learning_rate": 4.854356948901506e-05, "loss": 0.1555, "step": 168 }, { "epoch": 0.12362838332114119, "grad_norm": 35.06438064575195, "learning_rate": 4.853122685756604e-05, "loss": 0.5702, "step": 169 }, { "epoch": 0.12435991221653256, "grad_norm": 23.100418090820312, "learning_rate": 4.8518884226117014e-05, "loss": 0.4799, "step": 170 }, { "epoch": 0.1250914411119239, "grad_norm": 31.50035858154297, "learning_rate": 4.8506541594667984e-05, "loss": 0.1517, "step": 171 }, { "epoch": 0.1258229700073153, "grad_norm": 3.558380126953125, "learning_rate": 4.849419896321896e-05, "loss": 0.0626, "step": 172 }, { "epoch": 0.12655449890270665, "grad_norm": 11.05613899230957, "learning_rate": 4.848185633176994e-05, "loss": 0.2765, "step": 173 }, { "epoch": 0.12728602779809803, "grad_norm": 3.682497501373291, "learning_rate": 4.8469513700320915e-05, "loss": 0.1095, "step": 174 }, { "epoch": 0.12801755669348938, "grad_norm": 4.236935615539551, "learning_rate": 4.8457171068871885e-05, "loss": 0.2326, "step": 175 }, { "epoch": 0.12874908558888076, "grad_norm": 11.288254737854004, "learning_rate": 4.844482843742286e-05, "loss": 0.3543, "step": 176 }, { "epoch": 0.12948061448427212, "grad_norm": 10.396478652954102, "learning_rate": 4.843248580597384e-05, "loss": 0.1368, "step": 177 }, { "epoch": 0.1302121433796635, "grad_norm": 17.125335693359375, "learning_rate": 4.8420143174524815e-05, "loss": 0.2687, "step": 178 }, { "epoch": 0.13094367227505485, "grad_norm": 8.014018058776855, "learning_rate": 4.8407800543075785e-05, "loss": 0.1766, "step": 179 }, { "epoch": 0.13167520117044623, "grad_norm": 6.8600921630859375, "learning_rate": 4.839545791162676e-05, "loss": 0.2155, "step": 180 }, { "epoch": 0.1324067300658376, "grad_norm": 15.573537826538086, "learning_rate": 4.838311528017774e-05, "loss": 0.224, "step": 181 }, { "epoch": 0.13313825896122897, "grad_norm": 4.411632061004639, "learning_rate": 4.8370772648728715e-05, "loss": 0.1445, "step": 182 }, { "epoch": 0.13386978785662035, "grad_norm": 7.362241744995117, "learning_rate": 4.8358430017279685e-05, "loss": 0.1539, "step": 183 }, { "epoch": 0.1346013167520117, "grad_norm": 2.369643211364746, "learning_rate": 4.834608738583066e-05, "loss": 0.0384, "step": 184 }, { "epoch": 0.13533284564740308, "grad_norm": 10.269309043884277, "learning_rate": 4.833374475438164e-05, "loss": 0.1107, "step": 185 }, { "epoch": 0.13606437454279444, "grad_norm": 2.7502355575561523, "learning_rate": 4.8321402122932616e-05, "loss": 0.0621, "step": 186 }, { "epoch": 0.13679590343818582, "grad_norm": 7.162024974822998, "learning_rate": 4.8309059491483586e-05, "loss": 0.144, "step": 187 }, { "epoch": 0.13752743233357717, "grad_norm": 20.229320526123047, "learning_rate": 4.829671686003456e-05, "loss": 0.2135, "step": 188 }, { "epoch": 0.13825896122896855, "grad_norm": 25.805286407470703, "learning_rate": 4.828437422858554e-05, "loss": 0.4351, "step": 189 }, { "epoch": 0.1389904901243599, "grad_norm": 21.225719451904297, "learning_rate": 4.8272031597136516e-05, "loss": 0.2038, "step": 190 }, { "epoch": 0.1397220190197513, "grad_norm": 39.45839309692383, "learning_rate": 4.8259688965687486e-05, "loss": 0.5611, "step": 191 }, { "epoch": 0.14045354791514264, "grad_norm": 3.6694536209106445, "learning_rate": 4.824734633423846e-05, "loss": 0.3287, "step": 192 }, { "epoch": 0.14118507681053402, "grad_norm": 7.285686016082764, "learning_rate": 4.823500370278944e-05, "loss": 0.2319, "step": 193 }, { "epoch": 0.14191660570592537, "grad_norm": 8.359248161315918, "learning_rate": 4.8222661071340416e-05, "loss": 0.2445, "step": 194 }, { "epoch": 0.14264813460131676, "grad_norm": 20.972332000732422, "learning_rate": 4.8210318439891386e-05, "loss": 0.5335, "step": 195 }, { "epoch": 0.1433796634967081, "grad_norm": 17.88499641418457, "learning_rate": 4.819797580844236e-05, "loss": 0.596, "step": 196 }, { "epoch": 0.1441111923920995, "grad_norm": 2.0829763412475586, "learning_rate": 4.818563317699334e-05, "loss": 0.098, "step": 197 }, { "epoch": 0.14484272128749084, "grad_norm": 2.5155184268951416, "learning_rate": 4.817329054554432e-05, "loss": 0.1969, "step": 198 }, { "epoch": 0.14557425018288223, "grad_norm": 15.860042572021484, "learning_rate": 4.816094791409529e-05, "loss": 0.2791, "step": 199 }, { "epoch": 0.14630577907827358, "grad_norm": 2.9932219982147217, "learning_rate": 4.8148605282646264e-05, "loss": 0.1191, "step": 200 }, { "epoch": 0.14703730797366496, "grad_norm": 3.041715145111084, "learning_rate": 4.813626265119724e-05, "loss": 0.0954, "step": 201 }, { "epoch": 0.14776883686905634, "grad_norm": 3.9579122066497803, "learning_rate": 4.812392001974822e-05, "loss": 0.1072, "step": 202 }, { "epoch": 0.1485003657644477, "grad_norm": 2.2798852920532227, "learning_rate": 4.811157738829919e-05, "loss": 0.0418, "step": 203 }, { "epoch": 0.14923189465983908, "grad_norm": 34.190067291259766, "learning_rate": 4.8099234756850164e-05, "loss": 0.6495, "step": 204 }, { "epoch": 0.14996342355523043, "grad_norm": 17.399703979492188, "learning_rate": 4.808689212540114e-05, "loss": 0.7027, "step": 205 }, { "epoch": 0.1506949524506218, "grad_norm": 13.492084503173828, "learning_rate": 4.807454949395212e-05, "loss": 0.4129, "step": 206 }, { "epoch": 0.15142648134601316, "grad_norm": 17.838804244995117, "learning_rate": 4.806220686250309e-05, "loss": 0.2803, "step": 207 }, { "epoch": 0.15215801024140455, "grad_norm": 2.9302186965942383, "learning_rate": 4.8049864231054064e-05, "loss": 0.0171, "step": 208 }, { "epoch": 0.1528895391367959, "grad_norm": 13.686524391174316, "learning_rate": 4.803752159960504e-05, "loss": 0.6821, "step": 209 }, { "epoch": 0.15362106803218728, "grad_norm": 27.379335403442383, "learning_rate": 4.802517896815602e-05, "loss": 0.4106, "step": 210 }, { "epoch": 0.15435259692757863, "grad_norm": 18.955272674560547, "learning_rate": 4.801283633670699e-05, "loss": 0.3756, "step": 211 }, { "epoch": 0.15508412582297001, "grad_norm": 18.004667282104492, "learning_rate": 4.8000493705257965e-05, "loss": 0.3324, "step": 212 }, { "epoch": 0.15581565471836137, "grad_norm": 9.363983154296875, "learning_rate": 4.798815107380894e-05, "loss": 0.3503, "step": 213 }, { "epoch": 0.15654718361375275, "grad_norm": 4.422360897064209, "learning_rate": 4.797580844235992e-05, "loss": 0.1719, "step": 214 }, { "epoch": 0.1572787125091441, "grad_norm": 7.835522174835205, "learning_rate": 4.796346581091089e-05, "loss": 0.1714, "step": 215 }, { "epoch": 0.15801024140453548, "grad_norm": 4.396750450134277, "learning_rate": 4.7951123179461865e-05, "loss": 0.1088, "step": 216 }, { "epoch": 0.15874177029992684, "grad_norm": 10.480301856994629, "learning_rate": 4.793878054801284e-05, "loss": 0.3243, "step": 217 }, { "epoch": 0.15947329919531822, "grad_norm": 5.426126480102539, "learning_rate": 4.792643791656382e-05, "loss": 0.2009, "step": 218 }, { "epoch": 0.16020482809070957, "grad_norm": 1.5307382345199585, "learning_rate": 4.791409528511479e-05, "loss": 0.0265, "step": 219 }, { "epoch": 0.16093635698610095, "grad_norm": 16.452421188354492, "learning_rate": 4.7901752653665766e-05, "loss": 0.1295, "step": 220 }, { "epoch": 0.1616678858814923, "grad_norm": 7.689098834991455, "learning_rate": 4.788941002221674e-05, "loss": 0.1626, "step": 221 }, { "epoch": 0.1623994147768837, "grad_norm": 6.923509120941162, "learning_rate": 4.787706739076772e-05, "loss": 0.1803, "step": 222 }, { "epoch": 0.16313094367227504, "grad_norm": 3.3413968086242676, "learning_rate": 4.786472475931869e-05, "loss": 0.2265, "step": 223 }, { "epoch": 0.16386247256766642, "grad_norm": 15.044098854064941, "learning_rate": 4.7852382127869666e-05, "loss": 0.3286, "step": 224 }, { "epoch": 0.1645940014630578, "grad_norm": 3.253849983215332, "learning_rate": 4.784003949642064e-05, "loss": 0.1032, "step": 225 }, { "epoch": 0.16532553035844916, "grad_norm": 3.2642874717712402, "learning_rate": 4.782769686497161e-05, "loss": 0.0268, "step": 226 }, { "epoch": 0.16605705925384054, "grad_norm": 10.20294189453125, "learning_rate": 4.781535423352259e-05, "loss": 0.0653, "step": 227 }, { "epoch": 0.1667885881492319, "grad_norm": 8.866772651672363, "learning_rate": 4.7803011602073566e-05, "loss": 0.1721, "step": 228 }, { "epoch": 0.16752011704462327, "grad_norm": 15.879972457885742, "learning_rate": 4.779066897062454e-05, "loss": 0.4907, "step": 229 }, { "epoch": 0.16825164594001463, "grad_norm": 7.308863162994385, "learning_rate": 4.777832633917551e-05, "loss": 0.334, "step": 230 }, { "epoch": 0.168983174835406, "grad_norm": 6.620587348937988, "learning_rate": 4.776598370772649e-05, "loss": 0.0389, "step": 231 }, { "epoch": 0.16971470373079736, "grad_norm": 9.335672378540039, "learning_rate": 4.775364107627747e-05, "loss": 0.0943, "step": 232 }, { "epoch": 0.17044623262618874, "grad_norm": 7.447492599487305, "learning_rate": 4.7741298444828444e-05, "loss": 0.2025, "step": 233 }, { "epoch": 0.1711777615215801, "grad_norm": 2.4395194053649902, "learning_rate": 4.7728955813379414e-05, "loss": 0.0273, "step": 234 }, { "epoch": 0.17190929041697148, "grad_norm": 4.0022172927856445, "learning_rate": 4.771661318193039e-05, "loss": 0.0725, "step": 235 }, { "epoch": 0.17264081931236283, "grad_norm": 13.485539436340332, "learning_rate": 4.770427055048137e-05, "loss": 0.1937, "step": 236 }, { "epoch": 0.1733723482077542, "grad_norm": 10.196976661682129, "learning_rate": 4.7691927919032344e-05, "loss": 0.3132, "step": 237 }, { "epoch": 0.17410387710314557, "grad_norm": 13.43415641784668, "learning_rate": 4.7679585287583314e-05, "loss": 0.1573, "step": 238 }, { "epoch": 0.17483540599853695, "grad_norm": 6.70851993560791, "learning_rate": 4.766724265613429e-05, "loss": 0.0876, "step": 239 }, { "epoch": 0.1755669348939283, "grad_norm": 5.514919281005859, "learning_rate": 4.765490002468527e-05, "loss": 0.1139, "step": 240 }, { "epoch": 0.17629846378931968, "grad_norm": 2.0644354820251465, "learning_rate": 4.7642557393236244e-05, "loss": 0.0412, "step": 241 }, { "epoch": 0.17702999268471104, "grad_norm": 4.40508508682251, "learning_rate": 4.7630214761787214e-05, "loss": 0.065, "step": 242 }, { "epoch": 0.17776152158010242, "grad_norm": 17.07434844970703, "learning_rate": 4.761787213033819e-05, "loss": 0.2743, "step": 243 }, { "epoch": 0.17849305047549377, "grad_norm": 29.899076461791992, "learning_rate": 4.760552949888917e-05, "loss": 0.3955, "step": 244 }, { "epoch": 0.17922457937088515, "grad_norm": 9.88703727722168, "learning_rate": 4.7593186867440145e-05, "loss": 0.1688, "step": 245 }, { "epoch": 0.1799561082662765, "grad_norm": 9.09658145904541, "learning_rate": 4.7580844235991115e-05, "loss": 0.1012, "step": 246 }, { "epoch": 0.18068763716166789, "grad_norm": 7.767172813415527, "learning_rate": 4.756850160454209e-05, "loss": 0.1575, "step": 247 }, { "epoch": 0.18141916605705927, "grad_norm": 7.3553595542907715, "learning_rate": 4.755615897309307e-05, "loss": 0.1235, "step": 248 }, { "epoch": 0.18215069495245062, "grad_norm": 16.951398849487305, "learning_rate": 4.7543816341644045e-05, "loss": 0.0887, "step": 249 }, { "epoch": 0.182882223847842, "grad_norm": 7.027613639831543, "learning_rate": 4.7531473710195015e-05, "loss": 0.2497, "step": 250 }, { "epoch": 0.18361375274323335, "grad_norm": 4.424843788146973, "learning_rate": 4.751913107874599e-05, "loss": 0.1192, "step": 251 }, { "epoch": 0.18434528163862474, "grad_norm": 15.930411338806152, "learning_rate": 4.750678844729697e-05, "loss": 0.2294, "step": 252 }, { "epoch": 0.1850768105340161, "grad_norm": 8.033312797546387, "learning_rate": 4.7494445815847946e-05, "loss": 0.2674, "step": 253 }, { "epoch": 0.18580833942940747, "grad_norm": 1.2036100625991821, "learning_rate": 4.7482103184398916e-05, "loss": 0.0639, "step": 254 }, { "epoch": 0.18653986832479882, "grad_norm": 12.488160133361816, "learning_rate": 4.746976055294989e-05, "loss": 0.1571, "step": 255 }, { "epoch": 0.1872713972201902, "grad_norm": 8.707588195800781, "learning_rate": 4.745741792150087e-05, "loss": 0.1373, "step": 256 }, { "epoch": 0.18800292611558156, "grad_norm": 12.656996726989746, "learning_rate": 4.7445075290051846e-05, "loss": 0.1986, "step": 257 }, { "epoch": 0.18873445501097294, "grad_norm": 10.386478424072266, "learning_rate": 4.7432732658602816e-05, "loss": 0.2349, "step": 258 }, { "epoch": 0.1894659839063643, "grad_norm": 4.765628814697266, "learning_rate": 4.742039002715379e-05, "loss": 0.1669, "step": 259 }, { "epoch": 0.19019751280175567, "grad_norm": 2.6097073554992676, "learning_rate": 4.740804739570477e-05, "loss": 0.064, "step": 260 }, { "epoch": 0.19092904169714703, "grad_norm": 9.988167762756348, "learning_rate": 4.7395704764255746e-05, "loss": 0.1645, "step": 261 }, { "epoch": 0.1916605705925384, "grad_norm": 27.082853317260742, "learning_rate": 4.7383362132806716e-05, "loss": 0.3467, "step": 262 }, { "epoch": 0.19239209948792976, "grad_norm": 11.774621963500977, "learning_rate": 4.737101950135769e-05, "loss": 0.1893, "step": 263 }, { "epoch": 0.19312362838332114, "grad_norm": 1.818307638168335, "learning_rate": 4.735867686990867e-05, "loss": 0.1931, "step": 264 }, { "epoch": 0.1938551572787125, "grad_norm": 3.186414957046509, "learning_rate": 4.734633423845965e-05, "loss": 0.0696, "step": 265 }, { "epoch": 0.19458668617410388, "grad_norm": 6.54341983795166, "learning_rate": 4.733399160701062e-05, "loss": 0.0789, "step": 266 }, { "epoch": 0.19531821506949523, "grad_norm": 14.461315155029297, "learning_rate": 4.7321648975561594e-05, "loss": 0.1792, "step": 267 }, { "epoch": 0.1960497439648866, "grad_norm": 30.36673355102539, "learning_rate": 4.730930634411257e-05, "loss": 0.6178, "step": 268 }, { "epoch": 0.196781272860278, "grad_norm": 9.937984466552734, "learning_rate": 4.729696371266355e-05, "loss": 0.246, "step": 269 }, { "epoch": 0.19751280175566935, "grad_norm": 0.2891613245010376, "learning_rate": 4.728462108121452e-05, "loss": 0.0082, "step": 270 }, { "epoch": 0.19824433065106073, "grad_norm": 2.421114444732666, "learning_rate": 4.7272278449765494e-05, "loss": 0.1385, "step": 271 }, { "epoch": 0.19897585954645208, "grad_norm": 3.8238348960876465, "learning_rate": 4.725993581831647e-05, "loss": 0.0433, "step": 272 }, { "epoch": 0.19970738844184346, "grad_norm": 7.166514873504639, "learning_rate": 4.724759318686745e-05, "loss": 0.0599, "step": 273 }, { "epoch": 0.20043891733723482, "grad_norm": 17.691762924194336, "learning_rate": 4.723525055541842e-05, "loss": 0.1552, "step": 274 }, { "epoch": 0.2011704462326262, "grad_norm": 2.889138698577881, "learning_rate": 4.7222907923969394e-05, "loss": 0.0744, "step": 275 }, { "epoch": 0.20190197512801755, "grad_norm": 4.187315464019775, "learning_rate": 4.721056529252037e-05, "loss": 0.0877, "step": 276 }, { "epoch": 0.20263350402340893, "grad_norm": 7.357713222503662, "learning_rate": 4.719822266107135e-05, "loss": 0.1234, "step": 277 }, { "epoch": 0.2033650329188003, "grad_norm": 3.833280086517334, "learning_rate": 4.718588002962232e-05, "loss": 0.073, "step": 278 }, { "epoch": 0.20409656181419167, "grad_norm": 3.9294726848602295, "learning_rate": 4.7173537398173295e-05, "loss": 0.183, "step": 279 }, { "epoch": 0.20482809070958302, "grad_norm": 11.629706382751465, "learning_rate": 4.716119476672427e-05, "loss": 0.0964, "step": 280 }, { "epoch": 0.2055596196049744, "grad_norm": 3.805372476577759, "learning_rate": 4.714885213527525e-05, "loss": 0.0412, "step": 281 }, { "epoch": 0.20629114850036576, "grad_norm": 13.99399471282959, "learning_rate": 4.713650950382622e-05, "loss": 0.112, "step": 282 }, { "epoch": 0.20702267739575714, "grad_norm": 6.269346714019775, "learning_rate": 4.7124166872377195e-05, "loss": 0.1348, "step": 283 }, { "epoch": 0.2077542062911485, "grad_norm": 3.353523015975952, "learning_rate": 4.711182424092817e-05, "loss": 0.1057, "step": 284 }, { "epoch": 0.20848573518653987, "grad_norm": 3.510869026184082, "learning_rate": 4.709948160947915e-05, "loss": 0.3108, "step": 285 }, { "epoch": 0.20921726408193123, "grad_norm": 3.914752960205078, "learning_rate": 4.708713897803012e-05, "loss": 0.0996, "step": 286 }, { "epoch": 0.2099487929773226, "grad_norm": 1.0317579507827759, "learning_rate": 4.7074796346581095e-05, "loss": 0.043, "step": 287 }, { "epoch": 0.21068032187271396, "grad_norm": 5.032279014587402, "learning_rate": 4.706245371513207e-05, "loss": 0.0989, "step": 288 }, { "epoch": 0.21141185076810534, "grad_norm": 8.016688346862793, "learning_rate": 4.705011108368305e-05, "loss": 0.1714, "step": 289 }, { "epoch": 0.2121433796634967, "grad_norm": 15.527971267700195, "learning_rate": 4.703776845223402e-05, "loss": 0.2575, "step": 290 }, { "epoch": 0.21287490855888808, "grad_norm": 2.810373306274414, "learning_rate": 4.7025425820784996e-05, "loss": 0.0423, "step": 291 }, { "epoch": 0.21360643745427946, "grad_norm": 9.562439918518066, "learning_rate": 4.701308318933597e-05, "loss": 0.1653, "step": 292 }, { "epoch": 0.2143379663496708, "grad_norm": 1.443484902381897, "learning_rate": 4.700074055788695e-05, "loss": 0.0166, "step": 293 }, { "epoch": 0.2150694952450622, "grad_norm": 1.6016751527786255, "learning_rate": 4.698839792643792e-05, "loss": 0.0115, "step": 294 }, { "epoch": 0.21580102414045355, "grad_norm": 19.7925968170166, "learning_rate": 4.6976055294988896e-05, "loss": 0.3584, "step": 295 }, { "epoch": 0.21653255303584493, "grad_norm": 13.747271537780762, "learning_rate": 4.696371266353987e-05, "loss": 0.1211, "step": 296 }, { "epoch": 0.21726408193123628, "grad_norm": 14.209345817565918, "learning_rate": 4.695137003209085e-05, "loss": 0.1564, "step": 297 }, { "epoch": 0.21799561082662766, "grad_norm": 11.147798538208008, "learning_rate": 4.693902740064182e-05, "loss": 0.0887, "step": 298 }, { "epoch": 0.21872713972201902, "grad_norm": 9.423097610473633, "learning_rate": 4.69266847691928e-05, "loss": 0.1077, "step": 299 }, { "epoch": 0.2194586686174104, "grad_norm": 0.5840073823928833, "learning_rate": 4.6914342137743773e-05, "loss": 0.0046, "step": 300 }, { "epoch": 0.22019019751280175, "grad_norm": 2.183845281600952, "learning_rate": 4.690199950629475e-05, "loss": 0.0091, "step": 301 }, { "epoch": 0.22092172640819313, "grad_norm": 9.541942596435547, "learning_rate": 4.688965687484572e-05, "loss": 0.2686, "step": 302 }, { "epoch": 0.22165325530358448, "grad_norm": 25.877164840698242, "learning_rate": 4.68773142433967e-05, "loss": 0.6105, "step": 303 }, { "epoch": 0.22238478419897587, "grad_norm": 19.57236671447754, "learning_rate": 4.686497161194767e-05, "loss": 0.1923, "step": 304 }, { "epoch": 0.22311631309436722, "grad_norm": 12.51420783996582, "learning_rate": 4.6852628980498644e-05, "loss": 0.2546, "step": 305 }, { "epoch": 0.2238478419897586, "grad_norm": 9.829113960266113, "learning_rate": 4.6840286349049614e-05, "loss": 0.0921, "step": 306 }, { "epoch": 0.22457937088514995, "grad_norm": 1.4385827779769897, "learning_rate": 4.682794371760059e-05, "loss": 0.0077, "step": 307 }, { "epoch": 0.22531089978054133, "grad_norm": 22.88669776916504, "learning_rate": 4.681560108615157e-05, "loss": 0.3058, "step": 308 }, { "epoch": 0.2260424286759327, "grad_norm": 2.760131359100342, "learning_rate": 4.6803258454702544e-05, "loss": 0.0121, "step": 309 }, { "epoch": 0.22677395757132407, "grad_norm": 4.101071834564209, "learning_rate": 4.6790915823253514e-05, "loss": 0.0447, "step": 310 }, { "epoch": 0.22750548646671542, "grad_norm": 3.5138494968414307, "learning_rate": 4.677857319180449e-05, "loss": 0.0196, "step": 311 }, { "epoch": 0.2282370153621068, "grad_norm": 8.76970386505127, "learning_rate": 4.676623056035547e-05, "loss": 0.122, "step": 312 }, { "epoch": 0.22896854425749816, "grad_norm": 3.9574027061462402, "learning_rate": 4.6753887928906445e-05, "loss": 0.0253, "step": 313 }, { "epoch": 0.22970007315288954, "grad_norm": 5.413010120391846, "learning_rate": 4.6741545297457415e-05, "loss": 0.091, "step": 314 }, { "epoch": 0.23043160204828092, "grad_norm": 14.828239440917969, "learning_rate": 4.672920266600839e-05, "loss": 0.0616, "step": 315 }, { "epoch": 0.23116313094367227, "grad_norm": 8.366146087646484, "learning_rate": 4.671686003455937e-05, "loss": 0.0495, "step": 316 }, { "epoch": 0.23189465983906365, "grad_norm": 12.14842700958252, "learning_rate": 4.6704517403110345e-05, "loss": 0.0528, "step": 317 }, { "epoch": 0.232626188734455, "grad_norm": 14.286412239074707, "learning_rate": 4.6692174771661315e-05, "loss": 0.2823, "step": 318 }, { "epoch": 0.2333577176298464, "grad_norm": 3.8804407119750977, "learning_rate": 4.667983214021229e-05, "loss": 0.1595, "step": 319 }, { "epoch": 0.23408924652523774, "grad_norm": 1.4678597450256348, "learning_rate": 4.666748950876327e-05, "loss": 0.0115, "step": 320 }, { "epoch": 0.23482077542062912, "grad_norm": 5.201881408691406, "learning_rate": 4.6655146877314245e-05, "loss": 0.2697, "step": 321 }, { "epoch": 0.23555230431602048, "grad_norm": 18.59732437133789, "learning_rate": 4.6642804245865215e-05, "loss": 0.1367, "step": 322 }, { "epoch": 0.23628383321141186, "grad_norm": 16.630435943603516, "learning_rate": 4.663046161441619e-05, "loss": 0.1778, "step": 323 }, { "epoch": 0.2370153621068032, "grad_norm": 10.160158157348633, "learning_rate": 4.661811898296717e-05, "loss": 0.0543, "step": 324 }, { "epoch": 0.2377468910021946, "grad_norm": 8.025282859802246, "learning_rate": 4.6605776351518146e-05, "loss": 0.0721, "step": 325 }, { "epoch": 0.23847841989758595, "grad_norm": 25.696889877319336, "learning_rate": 4.6593433720069116e-05, "loss": 0.3421, "step": 326 }, { "epoch": 0.23920994879297733, "grad_norm": 9.36668872833252, "learning_rate": 4.658109108862009e-05, "loss": 0.1015, "step": 327 }, { "epoch": 0.23994147768836868, "grad_norm": 1.5681483745574951, "learning_rate": 4.656874845717107e-05, "loss": 0.0296, "step": 328 }, { "epoch": 0.24067300658376006, "grad_norm": 14.844806671142578, "learning_rate": 4.6556405825722046e-05, "loss": 0.4227, "step": 329 }, { "epoch": 0.24140453547915142, "grad_norm": 33.07256317138672, "learning_rate": 4.6544063194273016e-05, "loss": 0.2847, "step": 330 }, { "epoch": 0.2421360643745428, "grad_norm": 7.86090612411499, "learning_rate": 4.653172056282399e-05, "loss": 0.0852, "step": 331 }, { "epoch": 0.24286759326993415, "grad_norm": 15.146425247192383, "learning_rate": 4.651937793137497e-05, "loss": 0.1584, "step": 332 }, { "epoch": 0.24359912216532553, "grad_norm": 7.614223957061768, "learning_rate": 4.6507035299925947e-05, "loss": 0.0628, "step": 333 }, { "epoch": 0.24433065106071689, "grad_norm": 11.871129035949707, "learning_rate": 4.649469266847692e-05, "loss": 0.1583, "step": 334 }, { "epoch": 0.24506217995610827, "grad_norm": 1.8296329975128174, "learning_rate": 4.6482350037027893e-05, "loss": 0.0112, "step": 335 }, { "epoch": 0.24579370885149962, "grad_norm": 17.215360641479492, "learning_rate": 4.647000740557887e-05, "loss": 0.378, "step": 336 }, { "epoch": 0.246525237746891, "grad_norm": 1.8717336654663086, "learning_rate": 4.645766477412985e-05, "loss": 0.0111, "step": 337 }, { "epoch": 0.24725676664228238, "grad_norm": 13.354321479797363, "learning_rate": 4.644532214268082e-05, "loss": 0.4637, "step": 338 }, { "epoch": 0.24798829553767374, "grad_norm": 18.055574417114258, "learning_rate": 4.6432979511231794e-05, "loss": 0.202, "step": 339 }, { "epoch": 0.24871982443306512, "grad_norm": 14.598061561584473, "learning_rate": 4.642063687978277e-05, "loss": 0.1427, "step": 340 }, { "epoch": 0.24945135332845647, "grad_norm": 8.440162658691406, "learning_rate": 4.640829424833375e-05, "loss": 0.0481, "step": 341 }, { "epoch": 0.2501828822238478, "grad_norm": 3.068803310394287, "learning_rate": 4.639595161688472e-05, "loss": 0.2184, "step": 342 }, { "epoch": 0.25091441111923923, "grad_norm": 35.451290130615234, "learning_rate": 4.6383608985435694e-05, "loss": 0.238, "step": 343 }, { "epoch": 0.2516459400146306, "grad_norm": 31.3826961517334, "learning_rate": 4.637126635398667e-05, "loss": 0.663, "step": 344 }, { "epoch": 0.25237746891002194, "grad_norm": 25.89190673828125, "learning_rate": 4.635892372253765e-05, "loss": 0.4696, "step": 345 }, { "epoch": 0.2531089978054133, "grad_norm": 50.426849365234375, "learning_rate": 4.634658109108862e-05, "loss": 0.6192, "step": 346 }, { "epoch": 0.2538405267008047, "grad_norm": 41.61827087402344, "learning_rate": 4.6334238459639595e-05, "loss": 0.4811, "step": 347 }, { "epoch": 0.25457205559619606, "grad_norm": 4.632473945617676, "learning_rate": 4.632189582819057e-05, "loss": 0.0418, "step": 348 }, { "epoch": 0.2553035844915874, "grad_norm": 7.787686824798584, "learning_rate": 4.630955319674155e-05, "loss": 0.2665, "step": 349 }, { "epoch": 0.25603511338697876, "grad_norm": 6.358901500701904, "learning_rate": 4.629721056529252e-05, "loss": 0.0876, "step": 350 }, { "epoch": 0.25676664228237017, "grad_norm": 7.633840084075928, "learning_rate": 4.6284867933843495e-05, "loss": 0.1171, "step": 351 }, { "epoch": 0.2574981711777615, "grad_norm": 2.7725157737731934, "learning_rate": 4.627252530239447e-05, "loss": 0.0375, "step": 352 }, { "epoch": 0.2582297000731529, "grad_norm": 9.562684059143066, "learning_rate": 4.626018267094545e-05, "loss": 0.2849, "step": 353 }, { "epoch": 0.25896122896854423, "grad_norm": 8.398584365844727, "learning_rate": 4.624784003949642e-05, "loss": 0.4374, "step": 354 }, { "epoch": 0.25969275786393564, "grad_norm": 10.577627182006836, "learning_rate": 4.6235497408047395e-05, "loss": 0.265, "step": 355 }, { "epoch": 0.260424286759327, "grad_norm": 3.158632278442383, "learning_rate": 4.622315477659837e-05, "loss": 0.1546, "step": 356 }, { "epoch": 0.26115581565471835, "grad_norm": 3.3129448890686035, "learning_rate": 4.621081214514935e-05, "loss": 0.1046, "step": 357 }, { "epoch": 0.2618873445501097, "grad_norm": 9.525498390197754, "learning_rate": 4.619846951370032e-05, "loss": 0.2317, "step": 358 }, { "epoch": 0.2626188734455011, "grad_norm": 3.840601682662964, "learning_rate": 4.6186126882251296e-05, "loss": 0.0898, "step": 359 }, { "epoch": 0.26335040234089246, "grad_norm": 9.692495346069336, "learning_rate": 4.617378425080227e-05, "loss": 0.1936, "step": 360 }, { "epoch": 0.2640819312362838, "grad_norm": 2.988752841949463, "learning_rate": 4.616144161935325e-05, "loss": 0.0519, "step": 361 }, { "epoch": 0.2648134601316752, "grad_norm": 3.5448288917541504, "learning_rate": 4.614909898790422e-05, "loss": 0.0925, "step": 362 }, { "epoch": 0.2655449890270666, "grad_norm": 3.1314074993133545, "learning_rate": 4.6136756356455196e-05, "loss": 0.0958, "step": 363 }, { "epoch": 0.26627651792245793, "grad_norm": 11.940053939819336, "learning_rate": 4.612441372500617e-05, "loss": 0.205, "step": 364 }, { "epoch": 0.2670080468178493, "grad_norm": 2.839132308959961, "learning_rate": 4.611207109355715e-05, "loss": 0.0419, "step": 365 }, { "epoch": 0.2677395757132407, "grad_norm": 7.8122477531433105, "learning_rate": 4.609972846210812e-05, "loss": 0.1139, "step": 366 }, { "epoch": 0.26847110460863205, "grad_norm": 2.806342363357544, "learning_rate": 4.6087385830659097e-05, "loss": 0.0196, "step": 367 }, { "epoch": 0.2692026335040234, "grad_norm": 7.119914531707764, "learning_rate": 4.607504319921007e-05, "loss": 0.0665, "step": 368 }, { "epoch": 0.26993416239941476, "grad_norm": 12.34570026397705, "learning_rate": 4.606270056776105e-05, "loss": 0.5235, "step": 369 }, { "epoch": 0.27066569129480617, "grad_norm": 3.175091505050659, "learning_rate": 4.605035793631202e-05, "loss": 0.0284, "step": 370 }, { "epoch": 0.2713972201901975, "grad_norm": 4.075298309326172, "learning_rate": 4.6038015304863e-05, "loss": 0.0225, "step": 371 }, { "epoch": 0.2721287490855889, "grad_norm": 12.988760948181152, "learning_rate": 4.6025672673413974e-05, "loss": 0.2284, "step": 372 }, { "epoch": 0.2728602779809802, "grad_norm": 1.418082356452942, "learning_rate": 4.6013330041964944e-05, "loss": 0.0081, "step": 373 }, { "epoch": 0.27359180687637163, "grad_norm": 10.288276672363281, "learning_rate": 4.600098741051592e-05, "loss": 0.0355, "step": 374 }, { "epoch": 0.274323335771763, "grad_norm": 17.57086753845215, "learning_rate": 4.59886447790669e-05, "loss": 0.3916, "step": 375 }, { "epoch": 0.27505486466715434, "grad_norm": 13.713560104370117, "learning_rate": 4.5976302147617874e-05, "loss": 0.0621, "step": 376 }, { "epoch": 0.2757863935625457, "grad_norm": 22.142488479614258, "learning_rate": 4.5963959516168844e-05, "loss": 0.1623, "step": 377 }, { "epoch": 0.2765179224579371, "grad_norm": 1.7554984092712402, "learning_rate": 4.595161688471982e-05, "loss": 0.3688, "step": 378 }, { "epoch": 0.27724945135332846, "grad_norm": 21.340072631835938, "learning_rate": 4.59392742532708e-05, "loss": 0.4538, "step": 379 }, { "epoch": 0.2779809802487198, "grad_norm": 2.1839237213134766, "learning_rate": 4.5926931621821775e-05, "loss": 0.1475, "step": 380 }, { "epoch": 0.2787125091441112, "grad_norm": 11.007948875427246, "learning_rate": 4.5914588990372745e-05, "loss": 0.4767, "step": 381 }, { "epoch": 0.2794440380395026, "grad_norm": 8.423256874084473, "learning_rate": 4.590224635892372e-05, "loss": 0.0833, "step": 382 }, { "epoch": 0.2801755669348939, "grad_norm": 9.626747131347656, "learning_rate": 4.58899037274747e-05, "loss": 0.1195, "step": 383 }, { "epoch": 0.2809070958302853, "grad_norm": 3.054462432861328, "learning_rate": 4.5877561096025675e-05, "loss": 0.0992, "step": 384 }, { "epoch": 0.2816386247256767, "grad_norm": 6.227707862854004, "learning_rate": 4.5865218464576645e-05, "loss": 0.0673, "step": 385 }, { "epoch": 0.28237015362106804, "grad_norm": 10.16669750213623, "learning_rate": 4.585287583312762e-05, "loss": 0.1402, "step": 386 }, { "epoch": 0.2831016825164594, "grad_norm": 8.189421653747559, "learning_rate": 4.58405332016786e-05, "loss": 0.0756, "step": 387 }, { "epoch": 0.28383321141185075, "grad_norm": 3.553325891494751, "learning_rate": 4.5828190570229575e-05, "loss": 0.1328, "step": 388 }, { "epoch": 0.28456474030724216, "grad_norm": 7.250787734985352, "learning_rate": 4.5815847938780545e-05, "loss": 0.1323, "step": 389 }, { "epoch": 0.2852962692026335, "grad_norm": 9.348138809204102, "learning_rate": 4.580350530733152e-05, "loss": 0.0697, "step": 390 }, { "epoch": 0.28602779809802487, "grad_norm": 2.0810070037841797, "learning_rate": 4.57911626758825e-05, "loss": 0.0484, "step": 391 }, { "epoch": 0.2867593269934162, "grad_norm": 15.590902328491211, "learning_rate": 4.5778820044433476e-05, "loss": 0.2263, "step": 392 }, { "epoch": 0.28749085588880763, "grad_norm": 3.916673421859741, "learning_rate": 4.5766477412984446e-05, "loss": 0.0805, "step": 393 }, { "epoch": 0.288222384784199, "grad_norm": 6.439150810241699, "learning_rate": 4.575413478153542e-05, "loss": 0.0818, "step": 394 }, { "epoch": 0.28895391367959034, "grad_norm": 4.075969696044922, "learning_rate": 4.57417921500864e-05, "loss": 0.0821, "step": 395 }, { "epoch": 0.2896854425749817, "grad_norm": 16.932350158691406, "learning_rate": 4.5729449518637376e-05, "loss": 0.3344, "step": 396 }, { "epoch": 0.2904169714703731, "grad_norm": 1.8576041460037231, "learning_rate": 4.5717106887188346e-05, "loss": 0.0474, "step": 397 }, { "epoch": 0.29114850036576445, "grad_norm": 9.165979385375977, "learning_rate": 4.570476425573932e-05, "loss": 0.0667, "step": 398 }, { "epoch": 0.2918800292611558, "grad_norm": 1.5089490413665771, "learning_rate": 4.56924216242903e-05, "loss": 0.026, "step": 399 }, { "epoch": 0.29261155815654716, "grad_norm": 3.0352537631988525, "learning_rate": 4.5680078992841276e-05, "loss": 0.14, "step": 400 }, { "epoch": 0.29334308705193857, "grad_norm": 0.46172085404396057, "learning_rate": 4.5667736361392246e-05, "loss": 0.0105, "step": 401 }, { "epoch": 0.2940746159473299, "grad_norm": 9.996049880981445, "learning_rate": 4.565539372994322e-05, "loss": 0.093, "step": 402 }, { "epoch": 0.2948061448427213, "grad_norm": 4.32982873916626, "learning_rate": 4.56430510984942e-05, "loss": 0.0401, "step": 403 }, { "epoch": 0.2955376737381127, "grad_norm": 9.770384788513184, "learning_rate": 4.563070846704518e-05, "loss": 0.237, "step": 404 }, { "epoch": 0.29626920263350404, "grad_norm": 0.34693005681037903, "learning_rate": 4.561836583559615e-05, "loss": 0.0053, "step": 405 }, { "epoch": 0.2970007315288954, "grad_norm": 8.954268455505371, "learning_rate": 4.5606023204147124e-05, "loss": 0.1329, "step": 406 }, { "epoch": 0.29773226042428674, "grad_norm": 9.5706148147583, "learning_rate": 4.55936805726981e-05, "loss": 0.1488, "step": 407 }, { "epoch": 0.29846378931967815, "grad_norm": 5.196387767791748, "learning_rate": 4.558133794124908e-05, "loss": 0.0431, "step": 408 }, { "epoch": 0.2991953182150695, "grad_norm": 6.316328525543213, "learning_rate": 4.556899530980005e-05, "loss": 0.2046, "step": 409 }, { "epoch": 0.29992684711046086, "grad_norm": 9.856035232543945, "learning_rate": 4.5556652678351024e-05, "loss": 0.0697, "step": 410 }, { "epoch": 0.3006583760058522, "grad_norm": 33.11104202270508, "learning_rate": 4.5544310046902e-05, "loss": 0.6189, "step": 411 }, { "epoch": 0.3013899049012436, "grad_norm": 16.26744270324707, "learning_rate": 4.553196741545298e-05, "loss": 0.2905, "step": 412 }, { "epoch": 0.302121433796635, "grad_norm": 16.105607986450195, "learning_rate": 4.551962478400395e-05, "loss": 0.4693, "step": 413 }, { "epoch": 0.30285296269202633, "grad_norm": 16.66805076599121, "learning_rate": 4.5507282152554924e-05, "loss": 0.125, "step": 414 }, { "epoch": 0.3035844915874177, "grad_norm": 7.947268486022949, "learning_rate": 4.54949395211059e-05, "loss": 0.0608, "step": 415 }, { "epoch": 0.3043160204828091, "grad_norm": 7.275471210479736, "learning_rate": 4.548259688965688e-05, "loss": 0.6203, "step": 416 }, { "epoch": 0.30504754937820044, "grad_norm": 12.733180046081543, "learning_rate": 4.547025425820785e-05, "loss": 0.2561, "step": 417 }, { "epoch": 0.3057790782735918, "grad_norm": 5.065639972686768, "learning_rate": 4.5457911626758825e-05, "loss": 0.3417, "step": 418 }, { "epoch": 0.30651060716898315, "grad_norm": 15.026095390319824, "learning_rate": 4.54455689953098e-05, "loss": 0.2149, "step": 419 }, { "epoch": 0.30724213606437456, "grad_norm": 12.774312973022461, "learning_rate": 4.543322636386078e-05, "loss": 0.2864, "step": 420 }, { "epoch": 0.3079736649597659, "grad_norm": 7.869741916656494, "learning_rate": 4.542088373241175e-05, "loss": 0.2387, "step": 421 }, { "epoch": 0.30870519385515727, "grad_norm": 1.8443080186843872, "learning_rate": 4.5408541100962725e-05, "loss": 0.0339, "step": 422 }, { "epoch": 0.3094367227505486, "grad_norm": 10.47679328918457, "learning_rate": 4.53961984695137e-05, "loss": 0.1512, "step": 423 }, { "epoch": 0.31016825164594003, "grad_norm": 5.055495738983154, "learning_rate": 4.538385583806468e-05, "loss": 0.1008, "step": 424 }, { "epoch": 0.3108997805413314, "grad_norm": 10.940006256103516, "learning_rate": 4.537151320661565e-05, "loss": 0.1645, "step": 425 }, { "epoch": 0.31163130943672274, "grad_norm": 12.164027214050293, "learning_rate": 4.5359170575166626e-05, "loss": 0.1742, "step": 426 }, { "epoch": 0.31236283833211415, "grad_norm": 13.576825141906738, "learning_rate": 4.53468279437176e-05, "loss": 0.2228, "step": 427 }, { "epoch": 0.3130943672275055, "grad_norm": 9.05417537689209, "learning_rate": 4.533448531226858e-05, "loss": 0.1428, "step": 428 }, { "epoch": 0.31382589612289685, "grad_norm": 1.611741304397583, "learning_rate": 4.532214268081955e-05, "loss": 0.0255, "step": 429 }, { "epoch": 0.3145574250182882, "grad_norm": 5.510623931884766, "learning_rate": 4.5309800049370526e-05, "loss": 0.1006, "step": 430 }, { "epoch": 0.3152889539136796, "grad_norm": 1.394653558731079, "learning_rate": 4.52974574179215e-05, "loss": 0.0307, "step": 431 }, { "epoch": 0.31602048280907097, "grad_norm": 3.096932888031006, "learning_rate": 4.528511478647248e-05, "loss": 0.0327, "step": 432 }, { "epoch": 0.3167520117044623, "grad_norm": 1.666124701499939, "learning_rate": 4.527277215502345e-05, "loss": 0.0126, "step": 433 }, { "epoch": 0.3174835405998537, "grad_norm": 2.412437915802002, "learning_rate": 4.5260429523574426e-05, "loss": 0.0148, "step": 434 }, { "epoch": 0.3182150694952451, "grad_norm": 0.6682825684547424, "learning_rate": 4.52480868921254e-05, "loss": 0.0059, "step": 435 }, { "epoch": 0.31894659839063644, "grad_norm": 1.2551891803741455, "learning_rate": 4.523574426067638e-05, "loss": 0.1772, "step": 436 }, { "epoch": 0.3196781272860278, "grad_norm": 22.184246063232422, "learning_rate": 4.522340162922735e-05, "loss": 0.3924, "step": 437 }, { "epoch": 0.32040965618141914, "grad_norm": 25.171894073486328, "learning_rate": 4.521105899777833e-05, "loss": 0.0753, "step": 438 }, { "epoch": 0.32114118507681055, "grad_norm": 18.36154556274414, "learning_rate": 4.5198716366329304e-05, "loss": 0.3197, "step": 439 }, { "epoch": 0.3218727139722019, "grad_norm": 9.47744369506836, "learning_rate": 4.518637373488028e-05, "loss": 0.0538, "step": 440 }, { "epoch": 0.32260424286759326, "grad_norm": 15.026765823364258, "learning_rate": 4.517403110343125e-05, "loss": 0.2411, "step": 441 }, { "epoch": 0.3233357717629846, "grad_norm": 22.966650009155273, "learning_rate": 4.516168847198223e-05, "loss": 0.3196, "step": 442 }, { "epoch": 0.324067300658376, "grad_norm": 16.954500198364258, "learning_rate": 4.5149345840533204e-05, "loss": 0.0929, "step": 443 }, { "epoch": 0.3247988295537674, "grad_norm": 15.828384399414062, "learning_rate": 4.513700320908418e-05, "loss": 0.0695, "step": 444 }, { "epoch": 0.32553035844915873, "grad_norm": 10.415084838867188, "learning_rate": 4.512466057763515e-05, "loss": 0.5213, "step": 445 }, { "epoch": 0.3262618873445501, "grad_norm": 8.23692798614502, "learning_rate": 4.511231794618613e-05, "loss": 0.3098, "step": 446 }, { "epoch": 0.3269934162399415, "grad_norm": 1.7196277379989624, "learning_rate": 4.5099975314737104e-05, "loss": 0.025, "step": 447 }, { "epoch": 0.32772494513533285, "grad_norm": 0.16354356706142426, "learning_rate": 4.508763268328808e-05, "loss": 0.006, "step": 448 }, { "epoch": 0.3284564740307242, "grad_norm": 2.368960380554199, "learning_rate": 4.507529005183905e-05, "loss": 0.022, "step": 449 }, { "epoch": 0.3291880029261156, "grad_norm": 0.8189841508865356, "learning_rate": 4.506294742039003e-05, "loss": 0.0162, "step": 450 }, { "epoch": 0.32991953182150696, "grad_norm": 0.4513343274593353, "learning_rate": 4.5050604788941005e-05, "loss": 0.0086, "step": 451 }, { "epoch": 0.3306510607168983, "grad_norm": 16.59419822692871, "learning_rate": 4.503826215749198e-05, "loss": 0.1809, "step": 452 }, { "epoch": 0.33138258961228967, "grad_norm": 5.402805328369141, "learning_rate": 4.502591952604295e-05, "loss": 0.0363, "step": 453 }, { "epoch": 0.3321141185076811, "grad_norm": 11.294575691223145, "learning_rate": 4.501357689459393e-05, "loss": 0.1241, "step": 454 }, { "epoch": 0.33284564740307243, "grad_norm": 8.12694263458252, "learning_rate": 4.5001234263144905e-05, "loss": 0.0962, "step": 455 }, { "epoch": 0.3335771762984638, "grad_norm": 6.511351108551025, "learning_rate": 4.498889163169588e-05, "loss": 0.1859, "step": 456 }, { "epoch": 0.33430870519385514, "grad_norm": 7.598495960235596, "learning_rate": 4.497654900024685e-05, "loss": 0.1231, "step": 457 }, { "epoch": 0.33504023408924655, "grad_norm": 6.566070079803467, "learning_rate": 4.496420636879783e-05, "loss": 0.1472, "step": 458 }, { "epoch": 0.3357717629846379, "grad_norm": 7.950456619262695, "learning_rate": 4.4951863737348806e-05, "loss": 0.1591, "step": 459 }, { "epoch": 0.33650329188002925, "grad_norm": 5.999925136566162, "learning_rate": 4.4939521105899776e-05, "loss": 0.1604, "step": 460 }, { "epoch": 0.3372348207754206, "grad_norm": 20.319589614868164, "learning_rate": 4.492717847445075e-05, "loss": 0.0977, "step": 461 }, { "epoch": 0.337966349670812, "grad_norm": 2.070016860961914, "learning_rate": 4.491483584300173e-05, "loss": 0.0132, "step": 462 }, { "epoch": 0.33869787856620337, "grad_norm": 4.265620231628418, "learning_rate": 4.4902493211552706e-05, "loss": 0.1825, "step": 463 }, { "epoch": 0.3394294074615947, "grad_norm": 11.451858520507812, "learning_rate": 4.4890150580103676e-05, "loss": 0.2299, "step": 464 }, { "epoch": 0.3401609363569861, "grad_norm": 7.441442489624023, "learning_rate": 4.487780794865465e-05, "loss": 0.1731, "step": 465 }, { "epoch": 0.3408924652523775, "grad_norm": 8.330365180969238, "learning_rate": 4.486546531720563e-05, "loss": 0.2542, "step": 466 }, { "epoch": 0.34162399414776884, "grad_norm": 1.1899523735046387, "learning_rate": 4.4853122685756606e-05, "loss": 0.0146, "step": 467 }, { "epoch": 0.3423555230431602, "grad_norm": 2.357039451599121, "learning_rate": 4.4840780054307576e-05, "loss": 0.0285, "step": 468 }, { "epoch": 0.34308705193855155, "grad_norm": 8.613577842712402, "learning_rate": 4.482843742285855e-05, "loss": 0.2645, "step": 469 }, { "epoch": 0.34381858083394295, "grad_norm": 3.0956192016601562, "learning_rate": 4.481609479140953e-05, "loss": 0.0411, "step": 470 }, { "epoch": 0.3445501097293343, "grad_norm": 10.226750373840332, "learning_rate": 4.480375215996051e-05, "loss": 0.0881, "step": 471 }, { "epoch": 0.34528163862472566, "grad_norm": 11.859057426452637, "learning_rate": 4.479140952851148e-05, "loss": 0.0753, "step": 472 }, { "epoch": 0.34601316752011707, "grad_norm": 10.647882461547852, "learning_rate": 4.4779066897062454e-05, "loss": 0.2195, "step": 473 }, { "epoch": 0.3467446964155084, "grad_norm": 3.6903796195983887, "learning_rate": 4.476672426561343e-05, "loss": 0.021, "step": 474 }, { "epoch": 0.3474762253108998, "grad_norm": 0.659842848777771, "learning_rate": 4.475438163416441e-05, "loss": 0.0117, "step": 475 }, { "epoch": 0.34820775420629113, "grad_norm": 11.622856140136719, "learning_rate": 4.474203900271538e-05, "loss": 0.1931, "step": 476 }, { "epoch": 0.34893928310168254, "grad_norm": 14.464702606201172, "learning_rate": 4.4729696371266354e-05, "loss": 0.4818, "step": 477 }, { "epoch": 0.3496708119970739, "grad_norm": 3.9912984371185303, "learning_rate": 4.471735373981733e-05, "loss": 0.1299, "step": 478 }, { "epoch": 0.35040234089246525, "grad_norm": 9.154773712158203, "learning_rate": 4.470501110836831e-05, "loss": 0.1705, "step": 479 }, { "epoch": 0.3511338697878566, "grad_norm": 3.7301886081695557, "learning_rate": 4.469266847691928e-05, "loss": 0.0985, "step": 480 }, { "epoch": 0.351865398683248, "grad_norm": 5.815854549407959, "learning_rate": 4.4680325845470254e-05, "loss": 0.032, "step": 481 }, { "epoch": 0.35259692757863936, "grad_norm": 2.5031096935272217, "learning_rate": 4.466798321402123e-05, "loss": 0.1047, "step": 482 }, { "epoch": 0.3533284564740307, "grad_norm": 0.8649278879165649, "learning_rate": 4.465564058257221e-05, "loss": 0.0139, "step": 483 }, { "epoch": 0.35405998536942207, "grad_norm": 8.683521270751953, "learning_rate": 4.464329795112318e-05, "loss": 0.1934, "step": 484 }, { "epoch": 0.3547915142648135, "grad_norm": 2.593301773071289, "learning_rate": 4.4630955319674155e-05, "loss": 0.1196, "step": 485 }, { "epoch": 0.35552304316020483, "grad_norm": 5.719599723815918, "learning_rate": 4.461861268822513e-05, "loss": 0.1311, "step": 486 }, { "epoch": 0.3562545720555962, "grad_norm": 0.9278637170791626, "learning_rate": 4.460627005677611e-05, "loss": 0.0141, "step": 487 }, { "epoch": 0.35698610095098754, "grad_norm": 3.5398714542388916, "learning_rate": 4.459392742532708e-05, "loss": 0.1077, "step": 488 }, { "epoch": 0.35771762984637895, "grad_norm": 1.9011242389678955, "learning_rate": 4.4581584793878055e-05, "loss": 0.0544, "step": 489 }, { "epoch": 0.3584491587417703, "grad_norm": 14.884604454040527, "learning_rate": 4.456924216242903e-05, "loss": 0.1844, "step": 490 }, { "epoch": 0.35918068763716166, "grad_norm": 7.201797008514404, "learning_rate": 4.455689953098001e-05, "loss": 0.0527, "step": 491 }, { "epoch": 0.359912216532553, "grad_norm": 5.878300666809082, "learning_rate": 4.454455689953098e-05, "loss": 0.1005, "step": 492 }, { "epoch": 0.3606437454279444, "grad_norm": 12.048202514648438, "learning_rate": 4.4532214268081955e-05, "loss": 0.2735, "step": 493 }, { "epoch": 0.36137527432333577, "grad_norm": 4.299004077911377, "learning_rate": 4.451987163663293e-05, "loss": 0.2464, "step": 494 }, { "epoch": 0.3621068032187271, "grad_norm": 10.206283569335938, "learning_rate": 4.450752900518391e-05, "loss": 0.1965, "step": 495 }, { "epoch": 0.36283833211411853, "grad_norm": 4.975686073303223, "learning_rate": 4.449518637373488e-05, "loss": 0.1946, "step": 496 }, { "epoch": 0.3635698610095099, "grad_norm": 4.535261154174805, "learning_rate": 4.4482843742285856e-05, "loss": 0.0507, "step": 497 }, { "epoch": 0.36430138990490124, "grad_norm": 12.854368209838867, "learning_rate": 4.447050111083683e-05, "loss": 0.2708, "step": 498 }, { "epoch": 0.3650329188002926, "grad_norm": 6.746164798736572, "learning_rate": 4.445815847938781e-05, "loss": 0.0853, "step": 499 }, { "epoch": 0.365764447695684, "grad_norm": 1.5886473655700684, "learning_rate": 4.444581584793878e-05, "loss": 0.0349, "step": 500 }, { "epoch": 0.36649597659107536, "grad_norm": 8.353938102722168, "learning_rate": 4.4433473216489756e-05, "loss": 0.1493, "step": 501 }, { "epoch": 0.3672275054864667, "grad_norm": 2.976858615875244, "learning_rate": 4.442113058504073e-05, "loss": 0.0413, "step": 502 }, { "epoch": 0.36795903438185806, "grad_norm": 5.565382480621338, "learning_rate": 4.440878795359171e-05, "loss": 0.1743, "step": 503 }, { "epoch": 0.3686905632772495, "grad_norm": 1.7665258646011353, "learning_rate": 4.439644532214268e-05, "loss": 0.036, "step": 504 }, { "epoch": 0.3694220921726408, "grad_norm": 19.89697265625, "learning_rate": 4.438410269069366e-05, "loss": 0.2309, "step": 505 }, { "epoch": 0.3701536210680322, "grad_norm": 3.514456272125244, "learning_rate": 4.4371760059244633e-05, "loss": 0.1105, "step": 506 }, { "epoch": 0.37088514996342353, "grad_norm": 15.712457656860352, "learning_rate": 4.435941742779561e-05, "loss": 0.4409, "step": 507 }, { "epoch": 0.37161667885881494, "grad_norm": 1.9639244079589844, "learning_rate": 4.434707479634658e-05, "loss": 0.0384, "step": 508 }, { "epoch": 0.3723482077542063, "grad_norm": 2.435147523880005, "learning_rate": 4.433473216489756e-05, "loss": 0.0246, "step": 509 }, { "epoch": 0.37307973664959765, "grad_norm": 2.5627830028533936, "learning_rate": 4.4322389533448534e-05, "loss": 0.0287, "step": 510 }, { "epoch": 0.373811265544989, "grad_norm": 3.3289380073547363, "learning_rate": 4.431004690199951e-05, "loss": 0.0311, "step": 511 }, { "epoch": 0.3745427944403804, "grad_norm": 2.539510726928711, "learning_rate": 4.429770427055048e-05, "loss": 0.0429, "step": 512 }, { "epoch": 0.37527432333577176, "grad_norm": 12.228981971740723, "learning_rate": 4.428536163910146e-05, "loss": 0.1211, "step": 513 }, { "epoch": 0.3760058522311631, "grad_norm": 4.044177532196045, "learning_rate": 4.4273019007652434e-05, "loss": 0.0187, "step": 514 }, { "epoch": 0.37673738112655447, "grad_norm": 0.5507501363754272, "learning_rate": 4.426067637620341e-05, "loss": 0.0077, "step": 515 }, { "epoch": 0.3774689100219459, "grad_norm": 1.1484920978546143, "learning_rate": 4.424833374475438e-05, "loss": 0.008, "step": 516 }, { "epoch": 0.37820043891733723, "grad_norm": 0.7131054997444153, "learning_rate": 4.423599111330536e-05, "loss": 0.0041, "step": 517 }, { "epoch": 0.3789319678127286, "grad_norm": 5.8445892333984375, "learning_rate": 4.4223648481856335e-05, "loss": 0.0131, "step": 518 }, { "epoch": 0.37966349670812, "grad_norm": 7.291834831237793, "learning_rate": 4.421130585040731e-05, "loss": 0.1569, "step": 519 }, { "epoch": 0.38039502560351135, "grad_norm": 30.251922607421875, "learning_rate": 4.419896321895828e-05, "loss": 0.3099, "step": 520 }, { "epoch": 0.3811265544989027, "grad_norm": 9.129247665405273, "learning_rate": 4.418662058750926e-05, "loss": 0.2026, "step": 521 }, { "epoch": 0.38185808339429406, "grad_norm": 0.9658652544021606, "learning_rate": 4.4174277956060235e-05, "loss": 0.0037, "step": 522 }, { "epoch": 0.38258961228968547, "grad_norm": 12.478177070617676, "learning_rate": 4.416193532461121e-05, "loss": 0.031, "step": 523 }, { "epoch": 0.3833211411850768, "grad_norm": 26.538421630859375, "learning_rate": 4.414959269316218e-05, "loss": 0.3514, "step": 524 }, { "epoch": 0.3840526700804682, "grad_norm": 1.3212406635284424, "learning_rate": 4.413725006171316e-05, "loss": 0.203, "step": 525 }, { "epoch": 0.3847841989758595, "grad_norm": 51.49991226196289, "learning_rate": 4.4124907430264135e-05, "loss": 0.1257, "step": 526 }, { "epoch": 0.38551572787125093, "grad_norm": 0.7703857421875, "learning_rate": 4.411256479881511e-05, "loss": 0.1784, "step": 527 }, { "epoch": 0.3862472567666423, "grad_norm": 14.708564758300781, "learning_rate": 4.410022216736608e-05, "loss": 0.2668, "step": 528 }, { "epoch": 0.38697878566203364, "grad_norm": 5.808557510375977, "learning_rate": 4.408787953591706e-05, "loss": 0.1638, "step": 529 }, { "epoch": 0.387710314557425, "grad_norm": 0.8505542874336243, "learning_rate": 4.4075536904468036e-05, "loss": 0.0073, "step": 530 }, { "epoch": 0.3884418434528164, "grad_norm": 0.11061286181211472, "learning_rate": 4.406319427301901e-05, "loss": 0.005, "step": 531 }, { "epoch": 0.38917337234820776, "grad_norm": 0.7503408789634705, "learning_rate": 4.405085164156998e-05, "loss": 0.1625, "step": 532 }, { "epoch": 0.3899049012435991, "grad_norm": 2.246086359024048, "learning_rate": 4.403850901012096e-05, "loss": 0.4061, "step": 533 }, { "epoch": 0.39063643013899046, "grad_norm": 6.411849498748779, "learning_rate": 4.4026166378671936e-05, "loss": 0.5269, "step": 534 }, { "epoch": 0.3913679590343819, "grad_norm": 8.61979866027832, "learning_rate": 4.401382374722291e-05, "loss": 0.3667, "step": 535 }, { "epoch": 0.3920994879297732, "grad_norm": 1.9316766262054443, "learning_rate": 4.400148111577388e-05, "loss": 0.2059, "step": 536 }, { "epoch": 0.3928310168251646, "grad_norm": 3.0647356510162354, "learning_rate": 4.398913848432486e-05, "loss": 0.4213, "step": 537 }, { "epoch": 0.393562545720556, "grad_norm": 1.8672873973846436, "learning_rate": 4.3976795852875837e-05, "loss": 0.1797, "step": 538 }, { "epoch": 0.39429407461594734, "grad_norm": 2.091475248336792, "learning_rate": 4.396445322142681e-05, "loss": 0.3265, "step": 539 }, { "epoch": 0.3950256035113387, "grad_norm": 4.4425153732299805, "learning_rate": 4.395211058997778e-05, "loss": 0.3095, "step": 540 }, { "epoch": 0.39575713240673005, "grad_norm": 2.684081554412842, "learning_rate": 4.393976795852876e-05, "loss": 0.162, "step": 541 }, { "epoch": 0.39648866130212146, "grad_norm": 4.997131824493408, "learning_rate": 4.392742532707974e-05, "loss": 0.2286, "step": 542 }, { "epoch": 0.3972201901975128, "grad_norm": 1.9866807460784912, "learning_rate": 4.3915082695630714e-05, "loss": 0.1451, "step": 543 }, { "epoch": 0.39795171909290417, "grad_norm": 6.190872669219971, "learning_rate": 4.3902740064181684e-05, "loss": 0.172, "step": 544 }, { "epoch": 0.3986832479882955, "grad_norm": 2.6774699687957764, "learning_rate": 4.389039743273266e-05, "loss": 0.0782, "step": 545 }, { "epoch": 0.39941477688368693, "grad_norm": 7.32177209854126, "learning_rate": 4.387805480128364e-05, "loss": 0.1327, "step": 546 }, { "epoch": 0.4001463057790783, "grad_norm": 10.950617790222168, "learning_rate": 4.386571216983461e-05, "loss": 0.2989, "step": 547 }, { "epoch": 0.40087783467446964, "grad_norm": 17.907926559448242, "learning_rate": 4.3853369538385584e-05, "loss": 0.2421, "step": 548 }, { "epoch": 0.401609363569861, "grad_norm": 11.753515243530273, "learning_rate": 4.384102690693656e-05, "loss": 0.2459, "step": 549 }, { "epoch": 0.4023408924652524, "grad_norm": 0.2575973570346832, "learning_rate": 4.382868427548754e-05, "loss": 0.011, "step": 550 }, { "epoch": 0.40307242136064375, "grad_norm": 2.6689629554748535, "learning_rate": 4.381634164403851e-05, "loss": 0.0107, "step": 551 }, { "epoch": 0.4038039502560351, "grad_norm": 14.89208698272705, "learning_rate": 4.3803999012589485e-05, "loss": 0.1624, "step": 552 }, { "epoch": 0.40453547915142646, "grad_norm": 5.7271199226379395, "learning_rate": 4.379165638114046e-05, "loss": 0.4662, "step": 553 }, { "epoch": 0.40526700804681787, "grad_norm": 2.521291732788086, "learning_rate": 4.377931374969144e-05, "loss": 0.4761, "step": 554 }, { "epoch": 0.4059985369422092, "grad_norm": 1.4226897954940796, "learning_rate": 4.376697111824241e-05, "loss": 0.2979, "step": 555 }, { "epoch": 0.4067300658376006, "grad_norm": 4.659623622894287, "learning_rate": 4.3754628486793385e-05, "loss": 0.561, "step": 556 }, { "epoch": 0.4074615947329919, "grad_norm": 9.953287124633789, "learning_rate": 4.374228585534436e-05, "loss": 0.2677, "step": 557 }, { "epoch": 0.40819312362838334, "grad_norm": 2.778878688812256, "learning_rate": 4.372994322389534e-05, "loss": 0.3978, "step": 558 }, { "epoch": 0.4089246525237747, "grad_norm": 35.572509765625, "learning_rate": 4.371760059244631e-05, "loss": 0.3995, "step": 559 }, { "epoch": 0.40965618141916604, "grad_norm": 1.0459601879119873, "learning_rate": 4.3705257960997285e-05, "loss": 0.0286, "step": 560 }, { "epoch": 0.41038771031455745, "grad_norm": 0.8999807834625244, "learning_rate": 4.369291532954826e-05, "loss": 0.0332, "step": 561 }, { "epoch": 0.4111192392099488, "grad_norm": 1.8403478860855103, "learning_rate": 4.368057269809924e-05, "loss": 0.0445, "step": 562 }, { "epoch": 0.41185076810534016, "grad_norm": 22.366085052490234, "learning_rate": 4.366823006665021e-05, "loss": 0.5474, "step": 563 }, { "epoch": 0.4125822970007315, "grad_norm": 7.149095058441162, "learning_rate": 4.3655887435201186e-05, "loss": 0.0355, "step": 564 }, { "epoch": 0.4133138258961229, "grad_norm": 14.463014602661133, "learning_rate": 4.364354480375216e-05, "loss": 0.1708, "step": 565 }, { "epoch": 0.4140453547915143, "grad_norm": 23.6442928314209, "learning_rate": 4.363120217230314e-05, "loss": 0.3781, "step": 566 }, { "epoch": 0.41477688368690563, "grad_norm": 18.11533546447754, "learning_rate": 4.361885954085411e-05, "loss": 0.245, "step": 567 }, { "epoch": 0.415508412582297, "grad_norm": 11.10037612915039, "learning_rate": 4.3606516909405086e-05, "loss": 0.1446, "step": 568 }, { "epoch": 0.4162399414776884, "grad_norm": 26.17498016357422, "learning_rate": 4.359417427795606e-05, "loss": 0.09, "step": 569 }, { "epoch": 0.41697147037307974, "grad_norm": 30.629398345947266, "learning_rate": 4.358183164650704e-05, "loss": 0.1065, "step": 570 }, { "epoch": 0.4177029992684711, "grad_norm": 11.63933277130127, "learning_rate": 4.356948901505801e-05, "loss": 0.0465, "step": 571 }, { "epoch": 0.41843452816386245, "grad_norm": 9.38221549987793, "learning_rate": 4.3557146383608987e-05, "loss": 0.2022, "step": 572 }, { "epoch": 0.41916605705925386, "grad_norm": 0.6041578650474548, "learning_rate": 4.354480375215996e-05, "loss": 0.0278, "step": 573 }, { "epoch": 0.4198975859546452, "grad_norm": 7.006832599639893, "learning_rate": 4.353246112071094e-05, "loss": 0.1208, "step": 574 }, { "epoch": 0.42062911485003657, "grad_norm": 21.108745574951172, "learning_rate": 4.352011848926191e-05, "loss": 0.1467, "step": 575 }, { "epoch": 0.4213606437454279, "grad_norm": 11.106658935546875, "learning_rate": 4.350777585781289e-05, "loss": 0.2864, "step": 576 }, { "epoch": 0.42209217264081933, "grad_norm": 17.836238861083984, "learning_rate": 4.3495433226363864e-05, "loss": 0.1764, "step": 577 }, { "epoch": 0.4228237015362107, "grad_norm": 11.10281753540039, "learning_rate": 4.348309059491484e-05, "loss": 0.0491, "step": 578 }, { "epoch": 0.42355523043160204, "grad_norm": 6.381745338439941, "learning_rate": 4.347074796346581e-05, "loss": 0.1787, "step": 579 }, { "epoch": 0.4242867593269934, "grad_norm": 13.746402740478516, "learning_rate": 4.345840533201679e-05, "loss": 0.4172, "step": 580 }, { "epoch": 0.4250182882223848, "grad_norm": 13.652838706970215, "learning_rate": 4.3446062700567764e-05, "loss": 0.3321, "step": 581 }, { "epoch": 0.42574981711777615, "grad_norm": 14.336955070495605, "learning_rate": 4.343372006911874e-05, "loss": 0.4395, "step": 582 }, { "epoch": 0.4264813460131675, "grad_norm": 5.352224349975586, "learning_rate": 4.342137743766971e-05, "loss": 0.2033, "step": 583 }, { "epoch": 0.4272128749085589, "grad_norm": 6.963387489318848, "learning_rate": 4.340903480622069e-05, "loss": 0.2703, "step": 584 }, { "epoch": 0.42794440380395027, "grad_norm": 3.3067362308502197, "learning_rate": 4.3396692174771664e-05, "loss": 0.1195, "step": 585 }, { "epoch": 0.4286759326993416, "grad_norm": 1.9241249561309814, "learning_rate": 4.338434954332264e-05, "loss": 0.2334, "step": 586 }, { "epoch": 0.429407461594733, "grad_norm": 4.043102741241455, "learning_rate": 4.337200691187361e-05, "loss": 0.1091, "step": 587 }, { "epoch": 0.4301389904901244, "grad_norm": 12.428733825683594, "learning_rate": 4.335966428042459e-05, "loss": 0.0766, "step": 588 }, { "epoch": 0.43087051938551574, "grad_norm": 6.44443941116333, "learning_rate": 4.3347321648975565e-05, "loss": 0.2105, "step": 589 }, { "epoch": 0.4316020482809071, "grad_norm": 6.943089008331299, "learning_rate": 4.333497901752654e-05, "loss": 0.0968, "step": 590 }, { "epoch": 0.43233357717629844, "grad_norm": 2.2569751739501953, "learning_rate": 4.332263638607751e-05, "loss": 0.1184, "step": 591 }, { "epoch": 0.43306510607168985, "grad_norm": 6.495257377624512, "learning_rate": 4.331029375462849e-05, "loss": 0.0956, "step": 592 }, { "epoch": 0.4337966349670812, "grad_norm": 2.2578611373901367, "learning_rate": 4.3297951123179465e-05, "loss": 0.0831, "step": 593 }, { "epoch": 0.43452816386247256, "grad_norm": 5.792292594909668, "learning_rate": 4.328560849173044e-05, "loss": 0.0973, "step": 594 }, { "epoch": 0.4352596927578639, "grad_norm": 2.944272994995117, "learning_rate": 4.327326586028141e-05, "loss": 0.1564, "step": 595 }, { "epoch": 0.4359912216532553, "grad_norm": 3.121856689453125, "learning_rate": 4.326092322883239e-05, "loss": 0.1286, "step": 596 }, { "epoch": 0.4367227505486467, "grad_norm": 4.188198566436768, "learning_rate": 4.3248580597383366e-05, "loss": 0.0746, "step": 597 }, { "epoch": 0.43745427944403803, "grad_norm": 3.11100435256958, "learning_rate": 4.323623796593434e-05, "loss": 0.125, "step": 598 }, { "epoch": 0.4381858083394294, "grad_norm": 7.424309253692627, "learning_rate": 4.322389533448531e-05, "loss": 0.2076, "step": 599 }, { "epoch": 0.4389173372348208, "grad_norm": 5.295783996582031, "learning_rate": 4.321155270303629e-05, "loss": 0.071, "step": 600 }, { "epoch": 0.43964886613021215, "grad_norm": 1.6776206493377686, "learning_rate": 4.3199210071587266e-05, "loss": 0.039, "step": 601 }, { "epoch": 0.4403803950256035, "grad_norm": 2.2487690448760986, "learning_rate": 4.318686744013824e-05, "loss": 0.0602, "step": 602 }, { "epoch": 0.44111192392099485, "grad_norm": 7.919351100921631, "learning_rate": 4.317452480868921e-05, "loss": 0.1089, "step": 603 }, { "epoch": 0.44184345281638626, "grad_norm": 5.996087551116943, "learning_rate": 4.316218217724019e-05, "loss": 0.1667, "step": 604 }, { "epoch": 0.4425749817117776, "grad_norm": 13.989508628845215, "learning_rate": 4.3149839545791166e-05, "loss": 0.3554, "step": 605 }, { "epoch": 0.44330651060716897, "grad_norm": 4.725215911865234, "learning_rate": 4.313749691434214e-05, "loss": 0.2307, "step": 606 }, { "epoch": 0.4440380395025604, "grad_norm": 3.2230350971221924, "learning_rate": 4.312515428289311e-05, "loss": 0.0548, "step": 607 }, { "epoch": 0.44476956839795173, "grad_norm": 8.356630325317383, "learning_rate": 4.311281165144409e-05, "loss": 0.1629, "step": 608 }, { "epoch": 0.4455010972933431, "grad_norm": 2.3178961277008057, "learning_rate": 4.310046901999507e-05, "loss": 0.1472, "step": 609 }, { "epoch": 0.44623262618873444, "grad_norm": 8.679362297058105, "learning_rate": 4.3088126388546044e-05, "loss": 0.0774, "step": 610 }, { "epoch": 0.44696415508412585, "grad_norm": 13.396735191345215, "learning_rate": 4.3075783757097014e-05, "loss": 0.1179, "step": 611 }, { "epoch": 0.4476956839795172, "grad_norm": 13.011494636535645, "learning_rate": 4.306344112564799e-05, "loss": 0.1385, "step": 612 }, { "epoch": 0.44842721287490855, "grad_norm": 3.9012670516967773, "learning_rate": 4.305109849419897e-05, "loss": 0.22, "step": 613 }, { "epoch": 0.4491587417702999, "grad_norm": 12.76486873626709, "learning_rate": 4.3038755862749944e-05, "loss": 0.3073, "step": 614 }, { "epoch": 0.4498902706656913, "grad_norm": 12.403929710388184, "learning_rate": 4.3026413231300914e-05, "loss": 0.2179, "step": 615 }, { "epoch": 0.45062179956108267, "grad_norm": 7.929824352264404, "learning_rate": 4.301407059985189e-05, "loss": 0.106, "step": 616 }, { "epoch": 0.451353328456474, "grad_norm": 9.39020824432373, "learning_rate": 4.300172796840287e-05, "loss": 0.177, "step": 617 }, { "epoch": 0.4520848573518654, "grad_norm": 7.313223838806152, "learning_rate": 4.2989385336953844e-05, "loss": 0.0897, "step": 618 }, { "epoch": 0.4528163862472568, "grad_norm": 1.2038553953170776, "learning_rate": 4.2977042705504814e-05, "loss": 0.0195, "step": 619 }, { "epoch": 0.45354791514264814, "grad_norm": 0.6123846769332886, "learning_rate": 4.296470007405579e-05, "loss": 0.1432, "step": 620 }, { "epoch": 0.4542794440380395, "grad_norm": 13.58670425415039, "learning_rate": 4.295235744260677e-05, "loss": 0.2656, "step": 621 }, { "epoch": 0.45501097293343085, "grad_norm": 6.364424228668213, "learning_rate": 4.2940014811157745e-05, "loss": 0.149, "step": 622 }, { "epoch": 0.45574250182882226, "grad_norm": 10.840051651000977, "learning_rate": 4.2927672179708715e-05, "loss": 0.191, "step": 623 }, { "epoch": 0.4564740307242136, "grad_norm": 8.332551956176758, "learning_rate": 4.291532954825969e-05, "loss": 0.1225, "step": 624 }, { "epoch": 0.45720555961960496, "grad_norm": 1.1605379581451416, "learning_rate": 4.290298691681067e-05, "loss": 0.03, "step": 625 }, { "epoch": 0.4579370885149963, "grad_norm": 5.372580051422119, "learning_rate": 4.2890644285361645e-05, "loss": 0.1061, "step": 626 }, { "epoch": 0.4586686174103877, "grad_norm": 3.3384809494018555, "learning_rate": 4.2878301653912615e-05, "loss": 0.0438, "step": 627 }, { "epoch": 0.4594001463057791, "grad_norm": 11.407611846923828, "learning_rate": 4.286595902246359e-05, "loss": 0.1385, "step": 628 }, { "epoch": 0.46013167520117043, "grad_norm": 8.344658851623535, "learning_rate": 4.285361639101457e-05, "loss": 0.0822, "step": 629 }, { "epoch": 0.46086320409656184, "grad_norm": 2.814397096633911, "learning_rate": 4.2841273759565546e-05, "loss": 0.1884, "step": 630 }, { "epoch": 0.4615947329919532, "grad_norm": 9.74708080291748, "learning_rate": 4.2828931128116516e-05, "loss": 0.0951, "step": 631 }, { "epoch": 0.46232626188734455, "grad_norm": 11.294310569763184, "learning_rate": 4.281658849666749e-05, "loss": 0.1134, "step": 632 }, { "epoch": 0.4630577907827359, "grad_norm": 6.855737686157227, "learning_rate": 4.280424586521847e-05, "loss": 0.1572, "step": 633 }, { "epoch": 0.4637893196781273, "grad_norm": 2.9653918743133545, "learning_rate": 4.279190323376944e-05, "loss": 0.0311, "step": 634 }, { "epoch": 0.46452084857351866, "grad_norm": 8.525978088378906, "learning_rate": 4.2779560602320416e-05, "loss": 0.3048, "step": 635 }, { "epoch": 0.46525237746891, "grad_norm": 11.054092407226562, "learning_rate": 4.276721797087139e-05, "loss": 0.0689, "step": 636 }, { "epoch": 0.46598390636430137, "grad_norm": 2.042938470840454, "learning_rate": 4.275487533942237e-05, "loss": 0.033, "step": 637 }, { "epoch": 0.4667154352596928, "grad_norm": 2.014674425125122, "learning_rate": 4.274253270797334e-05, "loss": 0.0255, "step": 638 }, { "epoch": 0.46744696415508413, "grad_norm": 6.355203628540039, "learning_rate": 4.2730190076524316e-05, "loss": 0.172, "step": 639 }, { "epoch": 0.4681784930504755, "grad_norm": 2.6938765048980713, "learning_rate": 4.271784744507529e-05, "loss": 0.1473, "step": 640 }, { "epoch": 0.46891002194586684, "grad_norm": 4.249083995819092, "learning_rate": 4.270550481362627e-05, "loss": 0.1746, "step": 641 }, { "epoch": 0.46964155084125825, "grad_norm": 6.592108726501465, "learning_rate": 4.269316218217724e-05, "loss": 0.1321, "step": 642 }, { "epoch": 0.4703730797366496, "grad_norm": 6.125608921051025, "learning_rate": 4.268081955072822e-05, "loss": 0.056, "step": 643 }, { "epoch": 0.47110460863204096, "grad_norm": 9.186792373657227, "learning_rate": 4.2668476919279194e-05, "loss": 0.0898, "step": 644 }, { "epoch": 0.4718361375274323, "grad_norm": 2.1937906742095947, "learning_rate": 4.265613428783017e-05, "loss": 0.0291, "step": 645 }, { "epoch": 0.4725676664228237, "grad_norm": 8.007266998291016, "learning_rate": 4.264379165638114e-05, "loss": 0.2746, "step": 646 }, { "epoch": 0.47329919531821507, "grad_norm": 4.2162957191467285, "learning_rate": 4.263144902493212e-05, "loss": 0.0599, "step": 647 }, { "epoch": 0.4740307242136064, "grad_norm": 3.355351209640503, "learning_rate": 4.2619106393483094e-05, "loss": 0.0446, "step": 648 }, { "epoch": 0.4747622531089978, "grad_norm": 2.482913017272949, "learning_rate": 4.260676376203407e-05, "loss": 0.0375, "step": 649 }, { "epoch": 0.4754937820043892, "grad_norm": 1.1520805358886719, "learning_rate": 4.259442113058504e-05, "loss": 0.1317, "step": 650 }, { "epoch": 0.47622531089978054, "grad_norm": 3.815011501312256, "learning_rate": 4.258207849913602e-05, "loss": 0.0523, "step": 651 }, { "epoch": 0.4769568397951719, "grad_norm": 1.0056209564208984, "learning_rate": 4.2569735867686994e-05, "loss": 0.0281, "step": 652 }, { "epoch": 0.4776883686905633, "grad_norm": 2.873462200164795, "learning_rate": 4.255739323623797e-05, "loss": 0.0303, "step": 653 }, { "epoch": 0.47841989758595466, "grad_norm": 1.7466613054275513, "learning_rate": 4.254505060478894e-05, "loss": 0.024, "step": 654 }, { "epoch": 0.479151426481346, "grad_norm": 4.801599979400635, "learning_rate": 4.253270797333992e-05, "loss": 0.0512, "step": 655 }, { "epoch": 0.47988295537673736, "grad_norm": 3.0511820316314697, "learning_rate": 4.2520365341890895e-05, "loss": 0.136, "step": 656 }, { "epoch": 0.4806144842721288, "grad_norm": 0.2912273406982422, "learning_rate": 4.250802271044187e-05, "loss": 0.0078, "step": 657 }, { "epoch": 0.4813460131675201, "grad_norm": 0.16110171377658844, "learning_rate": 4.249568007899284e-05, "loss": 0.0075, "step": 658 }, { "epoch": 0.4820775420629115, "grad_norm": 2.950793981552124, "learning_rate": 4.248333744754382e-05, "loss": 0.1291, "step": 659 }, { "epoch": 0.48280907095830283, "grad_norm": 4.6130523681640625, "learning_rate": 4.2470994816094795e-05, "loss": 0.2669, "step": 660 }, { "epoch": 0.48354059985369424, "grad_norm": 13.903064727783203, "learning_rate": 4.245865218464577e-05, "loss": 0.0972, "step": 661 }, { "epoch": 0.4842721287490856, "grad_norm": 3.334106683731079, "learning_rate": 4.244630955319674e-05, "loss": 0.0238, "step": 662 }, { "epoch": 0.48500365764447695, "grad_norm": 20.483291625976562, "learning_rate": 4.243396692174772e-05, "loss": 0.0682, "step": 663 }, { "epoch": 0.4857351865398683, "grad_norm": 5.096444606781006, "learning_rate": 4.2421624290298696e-05, "loss": 0.4044, "step": 664 }, { "epoch": 0.4864667154352597, "grad_norm": 4.751810073852539, "learning_rate": 4.240928165884967e-05, "loss": 0.2673, "step": 665 }, { "epoch": 0.48719824433065106, "grad_norm": 134.5506591796875, "learning_rate": 4.239693902740064e-05, "loss": 0.9668, "step": 666 }, { "epoch": 0.4879297732260424, "grad_norm": 45.321449279785156, "learning_rate": 4.238459639595162e-05, "loss": 0.8747, "step": 667 }, { "epoch": 0.48866130212143377, "grad_norm": 3.439634084701538, "learning_rate": 4.2372253764502596e-05, "loss": 0.5639, "step": 668 }, { "epoch": 0.4893928310168252, "grad_norm": 35.555809020996094, "learning_rate": 4.235991113305357e-05, "loss": 0.5531, "step": 669 }, { "epoch": 0.49012435991221653, "grad_norm": 30.20450782775879, "learning_rate": 4.234756850160454e-05, "loss": 0.1762, "step": 670 }, { "epoch": 0.4908558888076079, "grad_norm": 3.469167947769165, "learning_rate": 4.233522587015552e-05, "loss": 0.2136, "step": 671 }, { "epoch": 0.49158741770299924, "grad_norm": 1.7070997953414917, "learning_rate": 4.2322883238706496e-05, "loss": 0.1967, "step": 672 }, { "epoch": 0.49231894659839065, "grad_norm": 3.872246742248535, "learning_rate": 4.231054060725747e-05, "loss": 0.2449, "step": 673 }, { "epoch": 0.493050475493782, "grad_norm": 11.770777702331543, "learning_rate": 4.229819797580844e-05, "loss": 0.1813, "step": 674 }, { "epoch": 0.49378200438917336, "grad_norm": 11.4252347946167, "learning_rate": 4.228585534435942e-05, "loss": 0.2651, "step": 675 }, { "epoch": 0.49451353328456477, "grad_norm": 0.42086589336395264, "learning_rate": 4.22735127129104e-05, "loss": 0.0187, "step": 676 }, { "epoch": 0.4952450621799561, "grad_norm": 6.418496608734131, "learning_rate": 4.2261170081461373e-05, "loss": 0.0419, "step": 677 }, { "epoch": 0.4959765910753475, "grad_norm": 0.5276121497154236, "learning_rate": 4.2248827450012343e-05, "loss": 0.0228, "step": 678 }, { "epoch": 0.4967081199707388, "grad_norm": 0.8030569553375244, "learning_rate": 4.223648481856332e-05, "loss": 0.0215, "step": 679 }, { "epoch": 0.49743964886613024, "grad_norm": 0.6889498233795166, "learning_rate": 4.22241421871143e-05, "loss": 0.0235, "step": 680 }, { "epoch": 0.4981711777615216, "grad_norm": 7.080549240112305, "learning_rate": 4.2211799555665274e-05, "loss": 0.152, "step": 681 }, { "epoch": 0.49890270665691294, "grad_norm": 2.7901346683502197, "learning_rate": 4.2199456924216244e-05, "loss": 0.1194, "step": 682 }, { "epoch": 0.4996342355523043, "grad_norm": 0.30674052238464355, "learning_rate": 4.218711429276722e-05, "loss": 0.0125, "step": 683 }, { "epoch": 0.5003657644476956, "grad_norm": 10.95960521697998, "learning_rate": 4.21747716613182e-05, "loss": 0.0591, "step": 684 }, { "epoch": 0.5010972933430871, "grad_norm": 1.1131641864776611, "learning_rate": 4.2162429029869174e-05, "loss": 0.0144, "step": 685 }, { "epoch": 0.5018288222384785, "grad_norm": 0.36405983567237854, "learning_rate": 4.2150086398420144e-05, "loss": 0.0104, "step": 686 }, { "epoch": 0.5025603511338698, "grad_norm": 0.20484277606010437, "learning_rate": 4.213774376697112e-05, "loss": 0.0073, "step": 687 }, { "epoch": 0.5032918800292612, "grad_norm": 0.13230332732200623, "learning_rate": 4.21254011355221e-05, "loss": 0.0053, "step": 688 }, { "epoch": 0.5040234089246525, "grad_norm": 2.079373836517334, "learning_rate": 4.2113058504073075e-05, "loss": 0.182, "step": 689 }, { "epoch": 0.5047549378200439, "grad_norm": 16.82198143005371, "learning_rate": 4.2100715872624045e-05, "loss": 0.332, "step": 690 }, { "epoch": 0.5054864667154353, "grad_norm": 11.086824417114258, "learning_rate": 4.208837324117502e-05, "loss": 0.1179, "step": 691 }, { "epoch": 0.5062179956108266, "grad_norm": 13.113372802734375, "learning_rate": 4.2076030609726e-05, "loss": 0.0593, "step": 692 }, { "epoch": 0.506949524506218, "grad_norm": 10.513792991638184, "learning_rate": 4.2063687978276975e-05, "loss": 0.0645, "step": 693 }, { "epoch": 0.5076810534016094, "grad_norm": 7.3590288162231445, "learning_rate": 4.2051345346827945e-05, "loss": 0.0254, "step": 694 }, { "epoch": 0.5084125822970007, "grad_norm": 6.023112773895264, "learning_rate": 4.203900271537892e-05, "loss": 0.0165, "step": 695 }, { "epoch": 0.5091441111923921, "grad_norm": 6.942991733551025, "learning_rate": 4.20266600839299e-05, "loss": 0.0299, "step": 696 }, { "epoch": 0.5098756400877835, "grad_norm": 1.045620083808899, "learning_rate": 4.2014317452480875e-05, "loss": 0.0072, "step": 697 }, { "epoch": 0.5106071689831748, "grad_norm": 11.125314712524414, "learning_rate": 4.2001974821031845e-05, "loss": 0.0816, "step": 698 }, { "epoch": 0.5113386978785662, "grad_norm": 0.07034100592136383, "learning_rate": 4.198963218958282e-05, "loss": 0.0035, "step": 699 }, { "epoch": 0.5120702267739575, "grad_norm": 13.33211898803711, "learning_rate": 4.19772895581338e-05, "loss": 0.1942, "step": 700 }, { "epoch": 0.5128017556693489, "grad_norm": 10.145108222961426, "learning_rate": 4.1964946926684776e-05, "loss": 0.1361, "step": 701 }, { "epoch": 0.5135332845647403, "grad_norm": 11.358627319335938, "learning_rate": 4.1952604295235746e-05, "loss": 0.244, "step": 702 }, { "epoch": 0.5142648134601316, "grad_norm": 3.1792821884155273, "learning_rate": 4.194026166378672e-05, "loss": 0.0117, "step": 703 }, { "epoch": 0.514996342355523, "grad_norm": 5.971025466918945, "learning_rate": 4.19279190323377e-05, "loss": 0.0173, "step": 704 }, { "epoch": 0.5157278712509145, "grad_norm": 11.793340682983398, "learning_rate": 4.1915576400888676e-05, "loss": 0.0453, "step": 705 }, { "epoch": 0.5164594001463058, "grad_norm": 1.2848302125930786, "learning_rate": 4.1903233769439646e-05, "loss": 0.0062, "step": 706 }, { "epoch": 0.5171909290416972, "grad_norm": 0.11418507248163223, "learning_rate": 4.189089113799062e-05, "loss": 0.0037, "step": 707 }, { "epoch": 0.5179224579370885, "grad_norm": 1.8805304765701294, "learning_rate": 4.18785485065416e-05, "loss": 0.0071, "step": 708 }, { "epoch": 0.5186539868324799, "grad_norm": 15.428498268127441, "learning_rate": 4.186620587509258e-05, "loss": 0.0372, "step": 709 }, { "epoch": 0.5193855157278713, "grad_norm": 9.171894073486328, "learning_rate": 4.185386324364355e-05, "loss": 0.329, "step": 710 }, { "epoch": 0.5201170446232626, "grad_norm": 0.7844057679176331, "learning_rate": 4.1841520612194523e-05, "loss": 0.1923, "step": 711 }, { "epoch": 0.520848573518654, "grad_norm": 2.243504762649536, "learning_rate": 4.18291779807455e-05, "loss": 0.3619, "step": 712 }, { "epoch": 0.5215801024140454, "grad_norm": 6.4915361404418945, "learning_rate": 4.181683534929648e-05, "loss": 0.1712, "step": 713 }, { "epoch": 0.5223116313094367, "grad_norm": 8.421492576599121, "learning_rate": 4.180449271784745e-05, "loss": 0.0995, "step": 714 }, { "epoch": 0.5230431602048281, "grad_norm": 22.12891960144043, "learning_rate": 4.1792150086398424e-05, "loss": 0.1715, "step": 715 }, { "epoch": 0.5237746891002194, "grad_norm": 0.7860155701637268, "learning_rate": 4.17798074549494e-05, "loss": 0.1654, "step": 716 }, { "epoch": 0.5245062179956108, "grad_norm": 2.6723453998565674, "learning_rate": 4.176746482350038e-05, "loss": 0.1596, "step": 717 }, { "epoch": 0.5252377468910022, "grad_norm": 0.15984389185905457, "learning_rate": 4.175512219205135e-05, "loss": 0.0063, "step": 718 }, { "epoch": 0.5259692757863935, "grad_norm": 0.14592869579792023, "learning_rate": 4.1742779560602324e-05, "loss": 0.0067, "step": 719 }, { "epoch": 0.5267008046817849, "grad_norm": 10.546784400939941, "learning_rate": 4.17304369291533e-05, "loss": 0.0295, "step": 720 }, { "epoch": 0.5274323335771763, "grad_norm": 21.378711700439453, "learning_rate": 4.171809429770427e-05, "loss": 0.1462, "step": 721 }, { "epoch": 0.5281638624725676, "grad_norm": 5.249191761016846, "learning_rate": 4.170575166625525e-05, "loss": 0.0212, "step": 722 }, { "epoch": 0.528895391367959, "grad_norm": 20.340368270874023, "learning_rate": 4.1693409034806225e-05, "loss": 0.1144, "step": 723 }, { "epoch": 0.5296269202633505, "grad_norm": 0.32923728227615356, "learning_rate": 4.16810664033572e-05, "loss": 0.0114, "step": 724 }, { "epoch": 0.5303584491587418, "grad_norm": 6.3767008781433105, "learning_rate": 4.166872377190817e-05, "loss": 0.0659, "step": 725 }, { "epoch": 0.5310899780541332, "grad_norm": 5.1905388832092285, "learning_rate": 4.165638114045915e-05, "loss": 0.0281, "step": 726 }, { "epoch": 0.5318215069495245, "grad_norm": 8.403443336486816, "learning_rate": 4.1644038509010125e-05, "loss": 0.0445, "step": 727 }, { "epoch": 0.5325530358449159, "grad_norm": 8.090340614318848, "learning_rate": 4.16316958775611e-05, "loss": 0.0882, "step": 728 }, { "epoch": 0.5332845647403073, "grad_norm": 17.79546546936035, "learning_rate": 4.161935324611207e-05, "loss": 0.3322, "step": 729 }, { "epoch": 0.5340160936356986, "grad_norm": 5.76694393157959, "learning_rate": 4.160701061466305e-05, "loss": 0.1581, "step": 730 }, { "epoch": 0.53474762253109, "grad_norm": 5.839162349700928, "learning_rate": 4.1594667983214025e-05, "loss": 0.2293, "step": 731 }, { "epoch": 0.5354791514264814, "grad_norm": 5.666024208068848, "learning_rate": 4.1582325351765e-05, "loss": 0.0966, "step": 732 }, { "epoch": 0.5362106803218727, "grad_norm": 3.0791571140289307, "learning_rate": 4.156998272031597e-05, "loss": 0.0226, "step": 733 }, { "epoch": 0.5369422092172641, "grad_norm": 13.17873764038086, "learning_rate": 4.155764008886695e-05, "loss": 0.1551, "step": 734 }, { "epoch": 0.5376737381126554, "grad_norm": 3.2885055541992188, "learning_rate": 4.1545297457417926e-05, "loss": 0.0226, "step": 735 }, { "epoch": 0.5384052670080468, "grad_norm": 4.992366313934326, "learning_rate": 4.15329548259689e-05, "loss": 0.0797, "step": 736 }, { "epoch": 0.5391367959034382, "grad_norm": 0.5026952028274536, "learning_rate": 4.152061219451987e-05, "loss": 0.0167, "step": 737 }, { "epoch": 0.5398683247988295, "grad_norm": 0.569699227809906, "learning_rate": 4.150826956307085e-05, "loss": 0.0164, "step": 738 }, { "epoch": 0.5405998536942209, "grad_norm": 4.628734588623047, "learning_rate": 4.1495926931621826e-05, "loss": 0.0624, "step": 739 }, { "epoch": 0.5413313825896123, "grad_norm": 10.831014633178711, "learning_rate": 4.14835843001728e-05, "loss": 0.1731, "step": 740 }, { "epoch": 0.5420629114850036, "grad_norm": 4.297609329223633, "learning_rate": 4.147124166872377e-05, "loss": 0.0356, "step": 741 }, { "epoch": 0.542794440380395, "grad_norm": 0.20635929703712463, "learning_rate": 4.145889903727475e-05, "loss": 0.0099, "step": 742 }, { "epoch": 0.5435259692757864, "grad_norm": 4.069065570831299, "learning_rate": 4.1446556405825727e-05, "loss": 0.077, "step": 743 }, { "epoch": 0.5442574981711777, "grad_norm": 4.823711395263672, "learning_rate": 4.14342137743767e-05, "loss": 0.0334, "step": 744 }, { "epoch": 0.5449890270665692, "grad_norm": 10.716259002685547, "learning_rate": 4.142187114292767e-05, "loss": 0.1703, "step": 745 }, { "epoch": 0.5457205559619605, "grad_norm": 5.66307258605957, "learning_rate": 4.140952851147865e-05, "loss": 0.0306, "step": 746 }, { "epoch": 0.5464520848573519, "grad_norm": 5.9702324867248535, "learning_rate": 4.139718588002963e-05, "loss": 0.0279, "step": 747 }, { "epoch": 0.5471836137527433, "grad_norm": 7.778929710388184, "learning_rate": 4.1384843248580604e-05, "loss": 0.0812, "step": 748 }, { "epoch": 0.5479151426481346, "grad_norm": 16.14423942565918, "learning_rate": 4.1372500617131574e-05, "loss": 0.2387, "step": 749 }, { "epoch": 0.548646671543526, "grad_norm": 2.369166135787964, "learning_rate": 4.136015798568255e-05, "loss": 0.0253, "step": 750 }, { "epoch": 0.5493782004389174, "grad_norm": 0.8558024764060974, "learning_rate": 4.134781535423353e-05, "loss": 0.0093, "step": 751 }, { "epoch": 0.5501097293343087, "grad_norm": 6.904941558837891, "learning_rate": 4.1335472722784504e-05, "loss": 0.0937, "step": 752 }, { "epoch": 0.5508412582297001, "grad_norm": 9.970929145812988, "learning_rate": 4.1323130091335474e-05, "loss": 0.1728, "step": 753 }, { "epoch": 0.5515727871250914, "grad_norm": 1.9157273769378662, "learning_rate": 4.131078745988645e-05, "loss": 0.1411, "step": 754 }, { "epoch": 0.5523043160204828, "grad_norm": 0.10472261905670166, "learning_rate": 4.129844482843743e-05, "loss": 0.0044, "step": 755 }, { "epoch": 0.5530358449158742, "grad_norm": 2.6553354263305664, "learning_rate": 4.1286102196988405e-05, "loss": 0.0153, "step": 756 }, { "epoch": 0.5537673738112655, "grad_norm": 0.27471113204956055, "learning_rate": 4.1273759565539375e-05, "loss": 0.0061, "step": 757 }, { "epoch": 0.5544989027066569, "grad_norm": 9.496356964111328, "learning_rate": 4.126141693409035e-05, "loss": 0.0618, "step": 758 }, { "epoch": 0.5552304316020483, "grad_norm": 5.223247528076172, "learning_rate": 4.124907430264133e-05, "loss": 0.0518, "step": 759 }, { "epoch": 0.5559619604974396, "grad_norm": 10.061881065368652, "learning_rate": 4.1236731671192305e-05, "loss": 0.1864, "step": 760 }, { "epoch": 0.556693489392831, "grad_norm": 3.3330605030059814, "learning_rate": 4.1224389039743275e-05, "loss": 0.0134, "step": 761 }, { "epoch": 0.5574250182882224, "grad_norm": 6.660134315490723, "learning_rate": 4.121204640829425e-05, "loss": 0.2899, "step": 762 }, { "epoch": 0.5581565471836137, "grad_norm": 9.015495300292969, "learning_rate": 4.119970377684523e-05, "loss": 0.0373, "step": 763 }, { "epoch": 0.5588880760790051, "grad_norm": 0.16248272359371185, "learning_rate": 4.1187361145396205e-05, "loss": 0.004, "step": 764 }, { "epoch": 0.5596196049743964, "grad_norm": 0.0996611937880516, "learning_rate": 4.1175018513947175e-05, "loss": 0.004, "step": 765 }, { "epoch": 0.5603511338697879, "grad_norm": 9.446507453918457, "learning_rate": 4.116267588249815e-05, "loss": 0.026, "step": 766 }, { "epoch": 0.5610826627651793, "grad_norm": 0.3400660753250122, "learning_rate": 4.115033325104913e-05, "loss": 0.0044, "step": 767 }, { "epoch": 0.5618141916605706, "grad_norm": 0.5023971796035767, "learning_rate": 4.1137990619600106e-05, "loss": 0.0079, "step": 768 }, { "epoch": 0.562545720555962, "grad_norm": 13.001029014587402, "learning_rate": 4.1125647988151076e-05, "loss": 0.3813, "step": 769 }, { "epoch": 0.5632772494513534, "grad_norm": 6.489999771118164, "learning_rate": 4.111330535670205e-05, "loss": 0.0364, "step": 770 }, { "epoch": 0.5640087783467447, "grad_norm": 4.629186153411865, "learning_rate": 4.110096272525303e-05, "loss": 0.0294, "step": 771 }, { "epoch": 0.5647403072421361, "grad_norm": 0.3521539866924286, "learning_rate": 4.1088620093804006e-05, "loss": 0.0053, "step": 772 }, { "epoch": 0.5654718361375274, "grad_norm": 6.888949871063232, "learning_rate": 4.1076277462354976e-05, "loss": 0.1221, "step": 773 }, { "epoch": 0.5662033650329188, "grad_norm": 1.2678054571151733, "learning_rate": 4.106393483090595e-05, "loss": 0.0093, "step": 774 }, { "epoch": 0.5669348939283102, "grad_norm": 2.7566277980804443, "learning_rate": 4.105159219945693e-05, "loss": 0.0131, "step": 775 }, { "epoch": 0.5676664228237015, "grad_norm": 7.866898536682129, "learning_rate": 4.1039249568007906e-05, "loss": 0.0838, "step": 776 }, { "epoch": 0.5683979517190929, "grad_norm": 12.55176067352295, "learning_rate": 4.1026906936558876e-05, "loss": 0.0592, "step": 777 }, { "epoch": 0.5691294806144843, "grad_norm": 2.925304889678955, "learning_rate": 4.101456430510985e-05, "loss": 0.0135, "step": 778 }, { "epoch": 0.5698610095098756, "grad_norm": 8.334945678710938, "learning_rate": 4.100222167366083e-05, "loss": 0.0291, "step": 779 }, { "epoch": 0.570592538405267, "grad_norm": 1.3202673196792603, "learning_rate": 4.098987904221181e-05, "loss": 0.009, "step": 780 }, { "epoch": 0.5713240673006583, "grad_norm": 6.14036750793457, "learning_rate": 4.097753641076278e-05, "loss": 0.1411, "step": 781 }, { "epoch": 0.5720555961960497, "grad_norm": 1.0117555856704712, "learning_rate": 4.0965193779313754e-05, "loss": 0.1757, "step": 782 }, { "epoch": 0.5727871250914411, "grad_norm": 9.380324363708496, "learning_rate": 4.095285114786473e-05, "loss": 0.0496, "step": 783 }, { "epoch": 0.5735186539868324, "grad_norm": 0.24652482569217682, "learning_rate": 4.094050851641571e-05, "loss": 0.004, "step": 784 }, { "epoch": 0.5742501828822238, "grad_norm": 13.625909805297852, "learning_rate": 4.092816588496668e-05, "loss": 0.1675, "step": 785 }, { "epoch": 0.5749817117776153, "grad_norm": 16.575687408447266, "learning_rate": 4.0915823253517654e-05, "loss": 0.1715, "step": 786 }, { "epoch": 0.5757132406730066, "grad_norm": 0.638912558555603, "learning_rate": 4.090348062206863e-05, "loss": 0.0054, "step": 787 }, { "epoch": 0.576444769568398, "grad_norm": 1.5406861305236816, "learning_rate": 4.089113799061961e-05, "loss": 0.0181, "step": 788 }, { "epoch": 0.5771762984637894, "grad_norm": 13.126152038574219, "learning_rate": 4.087879535917058e-05, "loss": 0.298, "step": 789 }, { "epoch": 0.5779078273591807, "grad_norm": 1.0056079626083374, "learning_rate": 4.0866452727721554e-05, "loss": 0.0128, "step": 790 }, { "epoch": 0.5786393562545721, "grad_norm": 2.8785016536712646, "learning_rate": 4.085411009627253e-05, "loss": 0.1186, "step": 791 }, { "epoch": 0.5793708851499634, "grad_norm": 3.830570697784424, "learning_rate": 4.084176746482351e-05, "loss": 0.0954, "step": 792 }, { "epoch": 0.5801024140453548, "grad_norm": 4.1612982749938965, "learning_rate": 4.082942483337448e-05, "loss": 0.0801, "step": 793 }, { "epoch": 0.5808339429407462, "grad_norm": 2.9737050533294678, "learning_rate": 4.0817082201925455e-05, "loss": 0.0522, "step": 794 }, { "epoch": 0.5815654718361375, "grad_norm": 22.134775161743164, "learning_rate": 4.080473957047643e-05, "loss": 0.0968, "step": 795 }, { "epoch": 0.5822970007315289, "grad_norm": 22.347124099731445, "learning_rate": 4.079239693902741e-05, "loss": 0.1441, "step": 796 }, { "epoch": 0.5830285296269203, "grad_norm": 30.386415481567383, "learning_rate": 4.078005430757838e-05, "loss": 0.2162, "step": 797 }, { "epoch": 0.5837600585223116, "grad_norm": 10.874345779418945, "learning_rate": 4.0767711676129355e-05, "loss": 0.1294, "step": 798 }, { "epoch": 0.584491587417703, "grad_norm": 5.209266185760498, "learning_rate": 4.075536904468033e-05, "loss": 0.0298, "step": 799 }, { "epoch": 0.5852231163130943, "grad_norm": 0.48224905133247375, "learning_rate": 4.074302641323131e-05, "loss": 0.0077, "step": 800 }, { "epoch": 0.5859546452084857, "grad_norm": 4.427761554718018, "learning_rate": 4.073068378178228e-05, "loss": 0.0869, "step": 801 }, { "epoch": 0.5866861741038771, "grad_norm": 8.012751579284668, "learning_rate": 4.0718341150333256e-05, "loss": 0.1481, "step": 802 }, { "epoch": 0.5874177029992684, "grad_norm": 0.2811213433742523, "learning_rate": 4.070599851888423e-05, "loss": 0.0063, "step": 803 }, { "epoch": 0.5881492318946598, "grad_norm": 4.579434394836426, "learning_rate": 4.069365588743521e-05, "loss": 0.0304, "step": 804 }, { "epoch": 0.5888807607900512, "grad_norm": 2.960239887237549, "learning_rate": 4.068131325598618e-05, "loss": 0.0789, "step": 805 }, { "epoch": 0.5896122896854425, "grad_norm": 11.558305740356445, "learning_rate": 4.0668970624537156e-05, "loss": 0.1536, "step": 806 }, { "epoch": 0.590343818580834, "grad_norm": 1.5892901420593262, "learning_rate": 4.065662799308813e-05, "loss": 0.0665, "step": 807 }, { "epoch": 0.5910753474762254, "grad_norm": 0.44062989950180054, "learning_rate": 4.06442853616391e-05, "loss": 0.0076, "step": 808 }, { "epoch": 0.5918068763716167, "grad_norm": 0.5799773335456848, "learning_rate": 4.063194273019008e-05, "loss": 0.0076, "step": 809 }, { "epoch": 0.5925384052670081, "grad_norm": 1.6439162492752075, "learning_rate": 4.061960009874105e-05, "loss": 0.0313, "step": 810 }, { "epoch": 0.5932699341623994, "grad_norm": 1.178436040878296, "learning_rate": 4.0607257467292026e-05, "loss": 0.0093, "step": 811 }, { "epoch": 0.5940014630577908, "grad_norm": 3.0798380374908447, "learning_rate": 4.0594914835843e-05, "loss": 0.0166, "step": 812 }, { "epoch": 0.5947329919531822, "grad_norm": 8.261957168579102, "learning_rate": 4.058257220439397e-05, "loss": 0.0579, "step": 813 }, { "epoch": 0.5954645208485735, "grad_norm": 0.08798419684171677, "learning_rate": 4.057022957294495e-05, "loss": 0.0024, "step": 814 }, { "epoch": 0.5961960497439649, "grad_norm": 0.6028685569763184, "learning_rate": 4.055788694149593e-05, "loss": 0.0042, "step": 815 }, { "epoch": 0.5969275786393563, "grad_norm": 7.69093132019043, "learning_rate": 4.0545544310046904e-05, "loss": 0.441, "step": 816 }, { "epoch": 0.5976591075347476, "grad_norm": 15.75400161743164, "learning_rate": 4.0533201678597874e-05, "loss": 0.1739, "step": 817 }, { "epoch": 0.598390636430139, "grad_norm": 3.5482561588287354, "learning_rate": 4.052085904714885e-05, "loss": 0.0221, "step": 818 }, { "epoch": 0.5991221653255303, "grad_norm": 6.809575080871582, "learning_rate": 4.050851641569983e-05, "loss": 0.0164, "step": 819 }, { "epoch": 0.5998536942209217, "grad_norm": 0.38165053725242615, "learning_rate": 4.0496173784250804e-05, "loss": 0.0023, "step": 820 }, { "epoch": 0.6005852231163131, "grad_norm": 0.05292700603604317, "learning_rate": 4.0483831152801774e-05, "loss": 0.0022, "step": 821 }, { "epoch": 0.6013167520117044, "grad_norm": 16.812456130981445, "learning_rate": 4.047148852135275e-05, "loss": 0.0874, "step": 822 }, { "epoch": 0.6020482809070958, "grad_norm": 0.0667090117931366, "learning_rate": 4.045914588990373e-05, "loss": 0.0022, "step": 823 }, { "epoch": 0.6027798098024872, "grad_norm": 0.055737175047397614, "learning_rate": 4.0446803258454704e-05, "loss": 0.0023, "step": 824 }, { "epoch": 0.6035113386978785, "grad_norm": 9.985952377319336, "learning_rate": 4.0434460627005674e-05, "loss": 0.1411, "step": 825 }, { "epoch": 0.60424286759327, "grad_norm": 2.5101125240325928, "learning_rate": 4.042211799555665e-05, "loss": 0.0084, "step": 826 }, { "epoch": 0.6049743964886612, "grad_norm": 0.05159799009561539, "learning_rate": 4.040977536410763e-05, "loss": 0.0021, "step": 827 }, { "epoch": 0.6057059253840527, "grad_norm": 49.64204788208008, "learning_rate": 4.0397432732658605e-05, "loss": 0.3649, "step": 828 }, { "epoch": 0.6064374542794441, "grad_norm": 0.4127582907676697, "learning_rate": 4.0385090101209575e-05, "loss": 0.0019, "step": 829 }, { "epoch": 0.6071689831748354, "grad_norm": 0.036887601017951965, "learning_rate": 4.037274746976055e-05, "loss": 0.0017, "step": 830 }, { "epoch": 0.6079005120702268, "grad_norm": 0.03499932959675789, "learning_rate": 4.036040483831153e-05, "loss": 0.0016, "step": 831 }, { "epoch": 0.6086320409656182, "grad_norm": 0.03522849455475807, "learning_rate": 4.0348062206862505e-05, "loss": 0.0015, "step": 832 }, { "epoch": 0.6093635698610095, "grad_norm": 0.049857527017593384, "learning_rate": 4.0335719575413475e-05, "loss": 0.0019, "step": 833 }, { "epoch": 0.6100950987564009, "grad_norm": 4.753680229187012, "learning_rate": 4.032337694396445e-05, "loss": 0.0139, "step": 834 }, { "epoch": 0.6108266276517923, "grad_norm": 0.03313909471035004, "learning_rate": 4.031103431251543e-05, "loss": 0.0015, "step": 835 }, { "epoch": 0.6115581565471836, "grad_norm": 10.666071891784668, "learning_rate": 4.0298691681066406e-05, "loss": 0.1395, "step": 836 }, { "epoch": 0.612289685442575, "grad_norm": 19.34674644470215, "learning_rate": 4.0286349049617376e-05, "loss": 0.2653, "step": 837 }, { "epoch": 0.6130212143379663, "grad_norm": 2.78745698928833, "learning_rate": 4.027400641816835e-05, "loss": 0.2099, "step": 838 }, { "epoch": 0.6137527432333577, "grad_norm": 29.09014129638672, "learning_rate": 4.026166378671933e-05, "loss": 0.1223, "step": 839 }, { "epoch": 0.6144842721287491, "grad_norm": 0.893337607383728, "learning_rate": 4.0249321155270306e-05, "loss": 0.2058, "step": 840 }, { "epoch": 0.6152158010241404, "grad_norm": 1.2082867622375488, "learning_rate": 4.0236978523821276e-05, "loss": 0.0059, "step": 841 }, { "epoch": 0.6159473299195318, "grad_norm": 3.2194061279296875, "learning_rate": 4.022463589237225e-05, "loss": 0.0256, "step": 842 }, { "epoch": 0.6166788588149232, "grad_norm": 3.6414053440093994, "learning_rate": 4.021229326092323e-05, "loss": 0.0189, "step": 843 }, { "epoch": 0.6174103877103145, "grad_norm": 24.851337432861328, "learning_rate": 4.0199950629474206e-05, "loss": 0.0969, "step": 844 }, { "epoch": 0.6181419166057059, "grad_norm": 7.9118452072143555, "learning_rate": 4.0187607998025176e-05, "loss": 0.1954, "step": 845 }, { "epoch": 0.6188734455010972, "grad_norm": 12.826066970825195, "learning_rate": 4.017526536657615e-05, "loss": 0.0832, "step": 846 }, { "epoch": 0.6196049743964887, "grad_norm": 4.211085796356201, "learning_rate": 4.016292273512713e-05, "loss": 0.0154, "step": 847 }, { "epoch": 0.6203365032918801, "grad_norm": 8.898704528808594, "learning_rate": 4.015058010367811e-05, "loss": 0.1338, "step": 848 }, { "epoch": 0.6210680321872714, "grad_norm": 8.614721298217773, "learning_rate": 4.013823747222908e-05, "loss": 0.0367, "step": 849 }, { "epoch": 0.6217995610826628, "grad_norm": 4.080761909484863, "learning_rate": 4.0125894840780054e-05, "loss": 0.0564, "step": 850 }, { "epoch": 0.6225310899780542, "grad_norm": 14.539011001586914, "learning_rate": 4.011355220933103e-05, "loss": 0.2957, "step": 851 }, { "epoch": 0.6232626188734455, "grad_norm": 5.609674453735352, "learning_rate": 4.010120957788201e-05, "loss": 0.095, "step": 852 }, { "epoch": 0.6239941477688369, "grad_norm": 4.352258205413818, "learning_rate": 4.008886694643298e-05, "loss": 0.1396, "step": 853 }, { "epoch": 0.6247256766642283, "grad_norm": 1.6065304279327393, "learning_rate": 4.0076524314983954e-05, "loss": 0.016, "step": 854 }, { "epoch": 0.6254572055596196, "grad_norm": 3.3217954635620117, "learning_rate": 4.006418168353493e-05, "loss": 0.0419, "step": 855 }, { "epoch": 0.626188734455011, "grad_norm": 0.488718718290329, "learning_rate": 4.005183905208591e-05, "loss": 0.0111, "step": 856 }, { "epoch": 0.6269202633504023, "grad_norm": 21.064950942993164, "learning_rate": 4.003949642063688e-05, "loss": 0.099, "step": 857 }, { "epoch": 0.6276517922457937, "grad_norm": 1.5653704404830933, "learning_rate": 4.0027153789187854e-05, "loss": 0.0286, "step": 858 }, { "epoch": 0.6283833211411851, "grad_norm": 0.1769692450761795, "learning_rate": 4.001481115773883e-05, "loss": 0.0064, "step": 859 }, { "epoch": 0.6291148500365764, "grad_norm": 4.473258972167969, "learning_rate": 4.000246852628981e-05, "loss": 0.1438, "step": 860 }, { "epoch": 0.6298463789319678, "grad_norm": 13.103838920593262, "learning_rate": 3.999012589484078e-05, "loss": 0.166, "step": 861 }, { "epoch": 0.6305779078273592, "grad_norm": 5.032441139221191, "learning_rate": 3.9977783263391755e-05, "loss": 0.1663, "step": 862 }, { "epoch": 0.6313094367227505, "grad_norm": 2.475566864013672, "learning_rate": 3.996544063194273e-05, "loss": 0.1675, "step": 863 }, { "epoch": 0.6320409656181419, "grad_norm": 4.09551477432251, "learning_rate": 3.995309800049371e-05, "loss": 0.0415, "step": 864 }, { "epoch": 0.6327724945135332, "grad_norm": 1.0061204433441162, "learning_rate": 3.994075536904468e-05, "loss": 0.0122, "step": 865 }, { "epoch": 0.6335040234089246, "grad_norm": 4.730765342712402, "learning_rate": 3.9928412737595655e-05, "loss": 0.024, "step": 866 }, { "epoch": 0.634235552304316, "grad_norm": 0.7941578030586243, "learning_rate": 3.991607010614663e-05, "loss": 0.0101, "step": 867 }, { "epoch": 0.6349670811997074, "grad_norm": 3.8424909114837646, "learning_rate": 3.99037274746976e-05, "loss": 0.1254, "step": 868 }, { "epoch": 0.6356986100950988, "grad_norm": 9.678428649902344, "learning_rate": 3.989138484324858e-05, "loss": 0.0867, "step": 869 }, { "epoch": 0.6364301389904902, "grad_norm": 8.755456924438477, "learning_rate": 3.9879042211799556e-05, "loss": 0.1916, "step": 870 }, { "epoch": 0.6371616678858815, "grad_norm": 3.2315239906311035, "learning_rate": 3.986669958035053e-05, "loss": 0.0304, "step": 871 }, { "epoch": 0.6378931967812729, "grad_norm": 2.5404348373413086, "learning_rate": 3.98543569489015e-05, "loss": 0.0235, "step": 872 }, { "epoch": 0.6386247256766642, "grad_norm": 2.415790319442749, "learning_rate": 3.984201431745248e-05, "loss": 0.017, "step": 873 }, { "epoch": 0.6393562545720556, "grad_norm": 0.5679402351379395, "learning_rate": 3.9829671686003456e-05, "loss": 0.0113, "step": 874 }, { "epoch": 0.640087783467447, "grad_norm": 10.065632820129395, "learning_rate": 3.981732905455443e-05, "loss": 0.0932, "step": 875 }, { "epoch": 0.6408193123628383, "grad_norm": 10.601405143737793, "learning_rate": 3.98049864231054e-05, "loss": 0.1458, "step": 876 }, { "epoch": 0.6415508412582297, "grad_norm": 4.166922092437744, "learning_rate": 3.979264379165638e-05, "loss": 0.0233, "step": 877 }, { "epoch": 0.6422823701536211, "grad_norm": 5.005373001098633, "learning_rate": 3.9780301160207356e-05, "loss": 0.0752, "step": 878 }, { "epoch": 0.6430138990490124, "grad_norm": 5.869197845458984, "learning_rate": 3.976795852875833e-05, "loss": 0.2175, "step": 879 }, { "epoch": 0.6437454279444038, "grad_norm": 10.5758056640625, "learning_rate": 3.97556158973093e-05, "loss": 0.2727, "step": 880 }, { "epoch": 0.6444769568397952, "grad_norm": 0.4902363717556, "learning_rate": 3.974327326586028e-05, "loss": 0.0043, "step": 881 }, { "epoch": 0.6452084857351865, "grad_norm": 0.13102532923221588, "learning_rate": 3.973093063441126e-05, "loss": 0.0031, "step": 882 }, { "epoch": 0.6459400146305779, "grad_norm": 8.531027793884277, "learning_rate": 3.9718588002962233e-05, "loss": 0.0456, "step": 883 }, { "epoch": 0.6466715435259692, "grad_norm": 1.3646446466445923, "learning_rate": 3.9706245371513203e-05, "loss": 0.0121, "step": 884 }, { "epoch": 0.6474030724213606, "grad_norm": 6.305933952331543, "learning_rate": 3.969390274006418e-05, "loss": 0.1922, "step": 885 }, { "epoch": 0.648134601316752, "grad_norm": 20.66791534423828, "learning_rate": 3.968156010861516e-05, "loss": 0.1599, "step": 886 }, { "epoch": 0.6488661302121433, "grad_norm": 6.863165855407715, "learning_rate": 3.9669217477166134e-05, "loss": 0.0729, "step": 887 }, { "epoch": 0.6495976591075348, "grad_norm": 1.9629490375518799, "learning_rate": 3.9656874845717104e-05, "loss": 0.0129, "step": 888 }, { "epoch": 0.6503291880029262, "grad_norm": 6.61815881729126, "learning_rate": 3.964453221426808e-05, "loss": 0.3649, "step": 889 }, { "epoch": 0.6510607168983175, "grad_norm": 3.7577686309814453, "learning_rate": 3.963218958281906e-05, "loss": 0.0198, "step": 890 }, { "epoch": 0.6517922457937089, "grad_norm": 5.714045524597168, "learning_rate": 3.9619846951370034e-05, "loss": 0.0754, "step": 891 }, { "epoch": 0.6525237746891002, "grad_norm": 0.883346676826477, "learning_rate": 3.9607504319921004e-05, "loss": 0.0082, "step": 892 }, { "epoch": 0.6532553035844916, "grad_norm": 2.630702495574951, "learning_rate": 3.959516168847198e-05, "loss": 0.557, "step": 893 }, { "epoch": 0.653986832479883, "grad_norm": 0.8990138173103333, "learning_rate": 3.958281905702296e-05, "loss": 0.01, "step": 894 }, { "epoch": 0.6547183613752743, "grad_norm": 0.8425120115280151, "learning_rate": 3.9570476425573935e-05, "loss": 0.0074, "step": 895 }, { "epoch": 0.6554498902706657, "grad_norm": 0.33035966753959656, "learning_rate": 3.9558133794124905e-05, "loss": 0.0038, "step": 896 }, { "epoch": 0.6561814191660571, "grad_norm": 13.367514610290527, "learning_rate": 3.954579116267588e-05, "loss": 0.1039, "step": 897 }, { "epoch": 0.6569129480614484, "grad_norm": 0.37728333473205566, "learning_rate": 3.953344853122686e-05, "loss": 0.0069, "step": 898 }, { "epoch": 0.6576444769568398, "grad_norm": 8.391501426696777, "learning_rate": 3.9521105899777835e-05, "loss": 0.1719, "step": 899 }, { "epoch": 0.6583760058522312, "grad_norm": 3.0866429805755615, "learning_rate": 3.9508763268328805e-05, "loss": 0.1494, "step": 900 }, { "epoch": 0.6591075347476225, "grad_norm": 2.793360471725464, "learning_rate": 3.949642063687978e-05, "loss": 0.1037, "step": 901 }, { "epoch": 0.6598390636430139, "grad_norm": 3.639033317565918, "learning_rate": 3.948407800543076e-05, "loss": 0.0195, "step": 902 }, { "epoch": 0.6605705925384052, "grad_norm": 1.543113350868225, "learning_rate": 3.9471735373981735e-05, "loss": 0.0136, "step": 903 }, { "epoch": 0.6613021214337966, "grad_norm": 7.4816107749938965, "learning_rate": 3.9459392742532705e-05, "loss": 0.0537, "step": 904 }, { "epoch": 0.662033650329188, "grad_norm": 5.666170120239258, "learning_rate": 3.944705011108368e-05, "loss": 0.1494, "step": 905 }, { "epoch": 0.6627651792245793, "grad_norm": 0.36430931091308594, "learning_rate": 3.943470747963466e-05, "loss": 0.0092, "step": 906 }, { "epoch": 0.6634967081199707, "grad_norm": 18.60066795349121, "learning_rate": 3.9422364848185636e-05, "loss": 0.1849, "step": 907 }, { "epoch": 0.6642282370153622, "grad_norm": 29.476797103881836, "learning_rate": 3.9410022216736606e-05, "loss": 0.2296, "step": 908 }, { "epoch": 0.6649597659107535, "grad_norm": 31.422658920288086, "learning_rate": 3.939767958528758e-05, "loss": 0.3131, "step": 909 }, { "epoch": 0.6656912948061449, "grad_norm": 1.7637907266616821, "learning_rate": 3.938533695383856e-05, "loss": 0.0178, "step": 910 }, { "epoch": 0.6664228237015362, "grad_norm": 14.469127655029297, "learning_rate": 3.9372994322389536e-05, "loss": 0.0576, "step": 911 }, { "epoch": 0.6671543525969276, "grad_norm": 11.222125053405762, "learning_rate": 3.9360651690940506e-05, "loss": 0.0569, "step": 912 }, { "epoch": 0.667885881492319, "grad_norm": 7.3534321784973145, "learning_rate": 3.934830905949148e-05, "loss": 0.2263, "step": 913 }, { "epoch": 0.6686174103877103, "grad_norm": 0.1405080258846283, "learning_rate": 3.933596642804246e-05, "loss": 0.005, "step": 914 }, { "epoch": 0.6693489392831017, "grad_norm": 0.12584330141544342, "learning_rate": 3.9323623796593437e-05, "loss": 0.0041, "step": 915 }, { "epoch": 0.6700804681784931, "grad_norm": 2.234483480453491, "learning_rate": 3.931128116514441e-05, "loss": 0.0152, "step": 916 }, { "epoch": 0.6708119970738844, "grad_norm": 6.636422634124756, "learning_rate": 3.9298938533695383e-05, "loss": 0.0566, "step": 917 }, { "epoch": 0.6715435259692758, "grad_norm": 18.967309951782227, "learning_rate": 3.928659590224636e-05, "loss": 0.1812, "step": 918 }, { "epoch": 0.6722750548646672, "grad_norm": 2.520273208618164, "learning_rate": 3.927425327079734e-05, "loss": 0.1409, "step": 919 }, { "epoch": 0.6730065837600585, "grad_norm": 10.879125595092773, "learning_rate": 3.926191063934831e-05, "loss": 0.1432, "step": 920 }, { "epoch": 0.6737381126554499, "grad_norm": 5.208620548248291, "learning_rate": 3.9249568007899284e-05, "loss": 0.089, "step": 921 }, { "epoch": 0.6744696415508412, "grad_norm": 14.269115447998047, "learning_rate": 3.923722537645026e-05, "loss": 0.4383, "step": 922 }, { "epoch": 0.6752011704462326, "grad_norm": 24.433881759643555, "learning_rate": 3.922488274500124e-05, "loss": 0.4122, "step": 923 }, { "epoch": 0.675932699341624, "grad_norm": 17.217639923095703, "learning_rate": 3.921254011355221e-05, "loss": 0.143, "step": 924 }, { "epoch": 0.6766642282370153, "grad_norm": 5.019330024719238, "learning_rate": 3.9200197482103184e-05, "loss": 0.0813, "step": 925 }, { "epoch": 0.6773957571324067, "grad_norm": 2.309191942214966, "learning_rate": 3.918785485065416e-05, "loss": 0.1389, "step": 926 }, { "epoch": 0.6781272860277981, "grad_norm": 0.7261210680007935, "learning_rate": 3.917551221920514e-05, "loss": 0.011, "step": 927 }, { "epoch": 0.6788588149231894, "grad_norm": 0.71645188331604, "learning_rate": 3.916316958775611e-05, "loss": 0.0125, "step": 928 }, { "epoch": 0.6795903438185809, "grad_norm": 1.9616981744766235, "learning_rate": 3.9150826956307085e-05, "loss": 0.0269, "step": 929 }, { "epoch": 0.6803218727139722, "grad_norm": 1.2950495481491089, "learning_rate": 3.913848432485806e-05, "loss": 0.2075, "step": 930 }, { "epoch": 0.6810534016093636, "grad_norm": 4.261727333068848, "learning_rate": 3.912614169340904e-05, "loss": 0.0555, "step": 931 }, { "epoch": 0.681784930504755, "grad_norm": 8.565560340881348, "learning_rate": 3.911379906196001e-05, "loss": 0.1722, "step": 932 }, { "epoch": 0.6825164594001463, "grad_norm": 2.361720323562622, "learning_rate": 3.9101456430510985e-05, "loss": 0.0263, "step": 933 }, { "epoch": 0.6832479882955377, "grad_norm": 8.274238586425781, "learning_rate": 3.908911379906196e-05, "loss": 0.1568, "step": 934 }, { "epoch": 0.6839795171909291, "grad_norm": 7.332021236419678, "learning_rate": 3.907677116761294e-05, "loss": 0.0529, "step": 935 }, { "epoch": 0.6847110460863204, "grad_norm": 3.1175765991210938, "learning_rate": 3.906442853616391e-05, "loss": 0.0432, "step": 936 }, { "epoch": 0.6854425749817118, "grad_norm": 4.248939514160156, "learning_rate": 3.9052085904714885e-05, "loss": 0.0753, "step": 937 }, { "epoch": 0.6861741038771031, "grad_norm": 0.3015318214893341, "learning_rate": 3.903974327326586e-05, "loss": 0.0093, "step": 938 }, { "epoch": 0.6869056327724945, "grad_norm": 0.6484887003898621, "learning_rate": 3.902740064181684e-05, "loss": 0.0204, "step": 939 }, { "epoch": 0.6876371616678859, "grad_norm": 1.7893606424331665, "learning_rate": 3.901505801036781e-05, "loss": 0.0154, "step": 940 }, { "epoch": 0.6883686905632772, "grad_norm": 0.20695145428180695, "learning_rate": 3.9002715378918786e-05, "loss": 0.0081, "step": 941 }, { "epoch": 0.6891002194586686, "grad_norm": 0.6637898683547974, "learning_rate": 3.899037274746976e-05, "loss": 0.0131, "step": 942 }, { "epoch": 0.68983174835406, "grad_norm": 8.330491065979004, "learning_rate": 3.897803011602074e-05, "loss": 0.1121, "step": 943 }, { "epoch": 0.6905632772494513, "grad_norm": 12.127256393432617, "learning_rate": 3.896568748457171e-05, "loss": 0.1393, "step": 944 }, { "epoch": 0.6912948061448427, "grad_norm": 0.37415435910224915, "learning_rate": 3.8953344853122686e-05, "loss": 0.0058, "step": 945 }, { "epoch": 0.6920263350402341, "grad_norm": 7.829522132873535, "learning_rate": 3.894100222167366e-05, "loss": 0.4086, "step": 946 }, { "epoch": 0.6927578639356254, "grad_norm": 8.53035831451416, "learning_rate": 3.892865959022464e-05, "loss": 0.0824, "step": 947 }, { "epoch": 0.6934893928310168, "grad_norm": 6.320878028869629, "learning_rate": 3.891631695877561e-05, "loss": 0.3036, "step": 948 }, { "epoch": 0.6942209217264081, "grad_norm": 6.026665210723877, "learning_rate": 3.8903974327326587e-05, "loss": 0.0767, "step": 949 }, { "epoch": 0.6949524506217996, "grad_norm": 0.06525591015815735, "learning_rate": 3.889163169587756e-05, "loss": 0.0031, "step": 950 }, { "epoch": 0.695683979517191, "grad_norm": 0.05366502329707146, "learning_rate": 3.887928906442854e-05, "loss": 0.0026, "step": 951 }, { "epoch": 0.6964155084125823, "grad_norm": 0.6692391037940979, "learning_rate": 3.886694643297951e-05, "loss": 0.0063, "step": 952 }, { "epoch": 0.6971470373079737, "grad_norm": 13.039706230163574, "learning_rate": 3.885460380153049e-05, "loss": 0.0819, "step": 953 }, { "epoch": 0.6978785662033651, "grad_norm": 7.711069107055664, "learning_rate": 3.8842261170081464e-05, "loss": 0.2563, "step": 954 }, { "epoch": 0.6986100950987564, "grad_norm": 3.3118340969085693, "learning_rate": 3.8829918538632434e-05, "loss": 0.0484, "step": 955 }, { "epoch": 0.6993416239941478, "grad_norm": 0.1611553132534027, "learning_rate": 3.881757590718341e-05, "loss": 0.0029, "step": 956 }, { "epoch": 0.7000731528895391, "grad_norm": 0.22936682403087616, "learning_rate": 3.880523327573439e-05, "loss": 0.003, "step": 957 }, { "epoch": 0.7008046817849305, "grad_norm": 13.584369659423828, "learning_rate": 3.8792890644285364e-05, "loss": 0.2609, "step": 958 }, { "epoch": 0.7015362106803219, "grad_norm": 2.0444931983947754, "learning_rate": 3.8780548012836334e-05, "loss": 0.0154, "step": 959 }, { "epoch": 0.7022677395757132, "grad_norm": 2.326467990875244, "learning_rate": 3.876820538138731e-05, "loss": 0.0083, "step": 960 }, { "epoch": 0.7029992684711046, "grad_norm": 0.5622855424880981, "learning_rate": 3.875586274993829e-05, "loss": 0.0044, "step": 961 }, { "epoch": 0.703730797366496, "grad_norm": 12.245174407958984, "learning_rate": 3.8743520118489265e-05, "loss": 0.1648, "step": 962 }, { "epoch": 0.7044623262618873, "grad_norm": 0.42305460572242737, "learning_rate": 3.8731177487040235e-05, "loss": 0.0035, "step": 963 }, { "epoch": 0.7051938551572787, "grad_norm": 0.0441155731678009, "learning_rate": 3.871883485559121e-05, "loss": 0.0021, "step": 964 }, { "epoch": 0.7059253840526701, "grad_norm": 0.9511516094207764, "learning_rate": 3.870649222414219e-05, "loss": 0.0061, "step": 965 }, { "epoch": 0.7066569129480614, "grad_norm": 17.984100341796875, "learning_rate": 3.8694149592693165e-05, "loss": 0.3047, "step": 966 }, { "epoch": 0.7073884418434528, "grad_norm": 26.463150024414062, "learning_rate": 3.8681806961244135e-05, "loss": 0.4925, "step": 967 }, { "epoch": 0.7081199707388441, "grad_norm": 0.4991806745529175, "learning_rate": 3.866946432979511e-05, "loss": 0.0031, "step": 968 }, { "epoch": 0.7088514996342355, "grad_norm": 1.8873685598373413, "learning_rate": 3.865712169834609e-05, "loss": 0.0063, "step": 969 }, { "epoch": 0.709583028529627, "grad_norm": 10.461649894714355, "learning_rate": 3.8644779066897065e-05, "loss": 0.2855, "step": 970 }, { "epoch": 0.7103145574250183, "grad_norm": 0.05337952822446823, "learning_rate": 3.8632436435448035e-05, "loss": 0.0018, "step": 971 }, { "epoch": 0.7110460863204097, "grad_norm": 5.713789939880371, "learning_rate": 3.862009380399901e-05, "loss": 0.0291, "step": 972 }, { "epoch": 0.7117776152158011, "grad_norm": 14.204408645629883, "learning_rate": 3.860775117254999e-05, "loss": 0.2487, "step": 973 }, { "epoch": 0.7125091441111924, "grad_norm": 20.30039405822754, "learning_rate": 3.8595408541100966e-05, "loss": 0.0931, "step": 974 }, { "epoch": 0.7132406730065838, "grad_norm": 26.75405502319336, "learning_rate": 3.8583065909651936e-05, "loss": 0.1804, "step": 975 }, { "epoch": 0.7139722019019751, "grad_norm": 0.08599051833152771, "learning_rate": 3.857072327820291e-05, "loss": 0.0027, "step": 976 }, { "epoch": 0.7147037307973665, "grad_norm": 6.961690902709961, "learning_rate": 3.855838064675389e-05, "loss": 0.0192, "step": 977 }, { "epoch": 0.7154352596927579, "grad_norm": 8.669118881225586, "learning_rate": 3.8546038015304866e-05, "loss": 0.1492, "step": 978 }, { "epoch": 0.7161667885881492, "grad_norm": 0.33928126096725464, "learning_rate": 3.8533695383855836e-05, "loss": 0.0036, "step": 979 }, { "epoch": 0.7168983174835406, "grad_norm": 1.2685185670852661, "learning_rate": 3.852135275240681e-05, "loss": 0.0072, "step": 980 }, { "epoch": 0.717629846378932, "grad_norm": 12.974535942077637, "learning_rate": 3.850901012095779e-05, "loss": 0.0857, "step": 981 }, { "epoch": 0.7183613752743233, "grad_norm": 15.632758140563965, "learning_rate": 3.8496667489508766e-05, "loss": 0.3353, "step": 982 }, { "epoch": 0.7190929041697147, "grad_norm": 2.175177574157715, "learning_rate": 3.8484324858059736e-05, "loss": 0.1808, "step": 983 }, { "epoch": 0.719824433065106, "grad_norm": 6.464937210083008, "learning_rate": 3.847198222661071e-05, "loss": 0.0183, "step": 984 }, { "epoch": 0.7205559619604974, "grad_norm": 0.07105523347854614, "learning_rate": 3.845963959516169e-05, "loss": 0.002, "step": 985 }, { "epoch": 0.7212874908558888, "grad_norm": 10.845245361328125, "learning_rate": 3.844729696371267e-05, "loss": 0.3392, "step": 986 }, { "epoch": 0.7220190197512801, "grad_norm": 0.05880076065659523, "learning_rate": 3.843495433226364e-05, "loss": 0.0022, "step": 987 }, { "epoch": 0.7227505486466715, "grad_norm": 4.673022270202637, "learning_rate": 3.8422611700814614e-05, "loss": 0.0985, "step": 988 }, { "epoch": 0.723482077542063, "grad_norm": 0.6066250205039978, "learning_rate": 3.841026906936559e-05, "loss": 0.0066, "step": 989 }, { "epoch": 0.7242136064374542, "grad_norm": 6.7639241218566895, "learning_rate": 3.839792643791657e-05, "loss": 0.1529, "step": 990 }, { "epoch": 0.7249451353328457, "grad_norm": 1.2912427186965942, "learning_rate": 3.838558380646754e-05, "loss": 0.0139, "step": 991 }, { "epoch": 0.7256766642282371, "grad_norm": 1.0863337516784668, "learning_rate": 3.8373241175018514e-05, "loss": 0.012, "step": 992 }, { "epoch": 0.7264081931236284, "grad_norm": 0.5433011651039124, "learning_rate": 3.836089854356949e-05, "loss": 0.0074, "step": 993 }, { "epoch": 0.7271397220190198, "grad_norm": 8.452055931091309, "learning_rate": 3.834855591212047e-05, "loss": 0.0932, "step": 994 }, { "epoch": 0.7278712509144111, "grad_norm": 3.274561882019043, "learning_rate": 3.833621328067144e-05, "loss": 0.2198, "step": 995 }, { "epoch": 0.7286027798098025, "grad_norm": 0.09904321283102036, "learning_rate": 3.8323870649222414e-05, "loss": 0.0029, "step": 996 }, { "epoch": 0.7293343087051939, "grad_norm": 5.479438304901123, "learning_rate": 3.831152801777339e-05, "loss": 0.104, "step": 997 }, { "epoch": 0.7300658376005852, "grad_norm": 10.534286499023438, "learning_rate": 3.829918538632437e-05, "loss": 0.1528, "step": 998 }, { "epoch": 0.7307973664959766, "grad_norm": 0.06063681095838547, "learning_rate": 3.828684275487534e-05, "loss": 0.0022, "step": 999 }, { "epoch": 0.731528895391368, "grad_norm": 5.658867359161377, "learning_rate": 3.8274500123426315e-05, "loss": 0.0493, "step": 1000 }, { "epoch": 0.7322604242867593, "grad_norm": 4.843900203704834, "learning_rate": 3.826215749197729e-05, "loss": 0.2386, "step": 1001 }, { "epoch": 0.7329919531821507, "grad_norm": 7.207432746887207, "learning_rate": 3.824981486052827e-05, "loss": 0.0924, "step": 1002 }, { "epoch": 0.733723482077542, "grad_norm": 7.565855503082275, "learning_rate": 3.823747222907924e-05, "loss": 0.0728, "step": 1003 }, { "epoch": 0.7344550109729334, "grad_norm": 0.11197708547115326, "learning_rate": 3.8225129597630215e-05, "loss": 0.0034, "step": 1004 }, { "epoch": 0.7351865398683248, "grad_norm": 10.543878555297852, "learning_rate": 3.821278696618119e-05, "loss": 0.0422, "step": 1005 }, { "epoch": 0.7359180687637161, "grad_norm": 3.733477830886841, "learning_rate": 3.820044433473217e-05, "loss": 0.1369, "step": 1006 }, { "epoch": 0.7366495976591075, "grad_norm": 31.05219841003418, "learning_rate": 3.818810170328314e-05, "loss": 0.4071, "step": 1007 }, { "epoch": 0.737381126554499, "grad_norm": 10.512689590454102, "learning_rate": 3.8175759071834116e-05, "loss": 0.1965, "step": 1008 }, { "epoch": 0.7381126554498902, "grad_norm": 19.802833557128906, "learning_rate": 3.816341644038509e-05, "loss": 0.1951, "step": 1009 }, { "epoch": 0.7388441843452817, "grad_norm": 2.64235782623291, "learning_rate": 3.815107380893607e-05, "loss": 0.0241, "step": 1010 }, { "epoch": 0.7395757132406731, "grad_norm": 4.849562168121338, "learning_rate": 3.813873117748704e-05, "loss": 0.0378, "step": 1011 }, { "epoch": 0.7403072421360644, "grad_norm": 1.8162140846252441, "learning_rate": 3.8126388546038016e-05, "loss": 0.0188, "step": 1012 }, { "epoch": 0.7410387710314558, "grad_norm": 0.5132425427436829, "learning_rate": 3.811404591458899e-05, "loss": 0.0065, "step": 1013 }, { "epoch": 0.7417702999268471, "grad_norm": 0.3988964855670929, "learning_rate": 3.810170328313997e-05, "loss": 0.0079, "step": 1014 }, { "epoch": 0.7425018288222385, "grad_norm": 4.677145004272461, "learning_rate": 3.808936065169094e-05, "loss": 0.2081, "step": 1015 }, { "epoch": 0.7432333577176299, "grad_norm": 5.611171722412109, "learning_rate": 3.8077018020241916e-05, "loss": 0.2242, "step": 1016 }, { "epoch": 0.7439648866130212, "grad_norm": 0.11959764361381531, "learning_rate": 3.806467538879289e-05, "loss": 0.0037, "step": 1017 }, { "epoch": 0.7446964155084126, "grad_norm": 8.211718559265137, "learning_rate": 3.805233275734387e-05, "loss": 0.1293, "step": 1018 }, { "epoch": 0.745427944403804, "grad_norm": 4.393526554107666, "learning_rate": 3.803999012589484e-05, "loss": 0.0287, "step": 1019 }, { "epoch": 0.7461594732991953, "grad_norm": 4.449804306030273, "learning_rate": 3.802764749444582e-05, "loss": 0.1624, "step": 1020 }, { "epoch": 0.7468910021945867, "grad_norm": 0.17249034345149994, "learning_rate": 3.8015304862996794e-05, "loss": 0.0032, "step": 1021 }, { "epoch": 0.747622531089978, "grad_norm": 4.425702095031738, "learning_rate": 3.800296223154777e-05, "loss": 0.0268, "step": 1022 }, { "epoch": 0.7483540599853694, "grad_norm": 5.568118572235107, "learning_rate": 3.799061960009874e-05, "loss": 0.2244, "step": 1023 }, { "epoch": 0.7490855888807608, "grad_norm": 2.9736692905426025, "learning_rate": 3.797827696864972e-05, "loss": 0.0208, "step": 1024 }, { "epoch": 0.7498171177761521, "grad_norm": 1.833612322807312, "learning_rate": 3.7965934337200694e-05, "loss": 0.007, "step": 1025 }, { "epoch": 0.7505486466715435, "grad_norm": 0.1576753556728363, "learning_rate": 3.795359170575167e-05, "loss": 0.0031, "step": 1026 }, { "epoch": 0.7512801755669349, "grad_norm": 2.138373613357544, "learning_rate": 3.794124907430264e-05, "loss": 0.0155, "step": 1027 }, { "epoch": 0.7520117044623262, "grad_norm": 5.878775119781494, "learning_rate": 3.792890644285362e-05, "loss": 0.1063, "step": 1028 }, { "epoch": 0.7527432333577176, "grad_norm": 15.249051094055176, "learning_rate": 3.7916563811404594e-05, "loss": 0.3644, "step": 1029 }, { "epoch": 0.7534747622531089, "grad_norm": 39.03351974487305, "learning_rate": 3.790422117995557e-05, "loss": 0.2458, "step": 1030 }, { "epoch": 0.7542062911485004, "grad_norm": 12.34658432006836, "learning_rate": 3.789187854850654e-05, "loss": 0.0844, "step": 1031 }, { "epoch": 0.7549378200438918, "grad_norm": 2.417318344116211, "learning_rate": 3.787953591705752e-05, "loss": 0.0092, "step": 1032 }, { "epoch": 0.7556693489392831, "grad_norm": 6.1147260665893555, "learning_rate": 3.7867193285608495e-05, "loss": 0.0214, "step": 1033 }, { "epoch": 0.7564008778346745, "grad_norm": 3.1801607608795166, "learning_rate": 3.785485065415947e-05, "loss": 0.0143, "step": 1034 }, { "epoch": 0.7571324067300659, "grad_norm": 0.05284363776445389, "learning_rate": 3.784250802271044e-05, "loss": 0.0022, "step": 1035 }, { "epoch": 0.7578639356254572, "grad_norm": 1.024236798286438, "learning_rate": 3.783016539126142e-05, "loss": 0.1999, "step": 1036 }, { "epoch": 0.7585954645208486, "grad_norm": 17.735424041748047, "learning_rate": 3.7817822759812395e-05, "loss": 0.5087, "step": 1037 }, { "epoch": 0.75932699341624, "grad_norm": 23.19840431213379, "learning_rate": 3.780548012836337e-05, "loss": 0.2119, "step": 1038 }, { "epoch": 0.7600585223116313, "grad_norm": 11.74705982208252, "learning_rate": 3.779313749691434e-05, "loss": 0.063, "step": 1039 }, { "epoch": 0.7607900512070227, "grad_norm": 3.485328197479248, "learning_rate": 3.778079486546532e-05, "loss": 0.0208, "step": 1040 }, { "epoch": 0.761521580102414, "grad_norm": 13.445764541625977, "learning_rate": 3.7768452234016296e-05, "loss": 0.4179, "step": 1041 }, { "epoch": 0.7622531089978054, "grad_norm": 26.27625846862793, "learning_rate": 3.7756109602567266e-05, "loss": 0.3296, "step": 1042 }, { "epoch": 0.7629846378931968, "grad_norm": 1.7876505851745605, "learning_rate": 3.774376697111824e-05, "loss": 0.3376, "step": 1043 }, { "epoch": 0.7637161667885881, "grad_norm": 0.3369132876396179, "learning_rate": 3.773142433966922e-05, "loss": 0.0064, "step": 1044 }, { "epoch": 0.7644476956839795, "grad_norm": 4.197978973388672, "learning_rate": 3.7719081708220196e-05, "loss": 0.0305, "step": 1045 }, { "epoch": 0.7651792245793709, "grad_norm": 0.4592178761959076, "learning_rate": 3.7706739076771166e-05, "loss": 0.0112, "step": 1046 }, { "epoch": 0.7659107534747622, "grad_norm": 16.037433624267578, "learning_rate": 3.769439644532214e-05, "loss": 0.2805, "step": 1047 }, { "epoch": 0.7666422823701536, "grad_norm": 4.3848395347595215, "learning_rate": 3.768205381387312e-05, "loss": 0.0264, "step": 1048 }, { "epoch": 0.7673738112655449, "grad_norm": 17.151697158813477, "learning_rate": 3.7669711182424096e-05, "loss": 0.1921, "step": 1049 }, { "epoch": 0.7681053401609363, "grad_norm": 20.50341033935547, "learning_rate": 3.7657368550975066e-05, "loss": 0.2029, "step": 1050 }, { "epoch": 0.7688368690563278, "grad_norm": 0.7659966349601746, "learning_rate": 3.764502591952604e-05, "loss": 0.0115, "step": 1051 }, { "epoch": 0.769568397951719, "grad_norm": 5.680034637451172, "learning_rate": 3.763268328807702e-05, "loss": 0.0814, "step": 1052 }, { "epoch": 0.7702999268471105, "grad_norm": 3.599799394607544, "learning_rate": 3.7620340656628e-05, "loss": 0.1889, "step": 1053 }, { "epoch": 0.7710314557425019, "grad_norm": 2.8572914600372314, "learning_rate": 3.760799802517897e-05, "loss": 0.0226, "step": 1054 }, { "epoch": 0.7717629846378932, "grad_norm": 3.9617996215820312, "learning_rate": 3.7595655393729944e-05, "loss": 0.0863, "step": 1055 }, { "epoch": 0.7724945135332846, "grad_norm": 6.079298496246338, "learning_rate": 3.758331276228092e-05, "loss": 0.0469, "step": 1056 }, { "epoch": 0.773226042428676, "grad_norm": 0.33199143409729004, "learning_rate": 3.75709701308319e-05, "loss": 0.0098, "step": 1057 }, { "epoch": 0.7739575713240673, "grad_norm": 4.527032375335693, "learning_rate": 3.755862749938287e-05, "loss": 0.308, "step": 1058 }, { "epoch": 0.7746891002194587, "grad_norm": 4.715875625610352, "learning_rate": 3.7546284867933844e-05, "loss": 0.1346, "step": 1059 }, { "epoch": 0.77542062911485, "grad_norm": 5.458766460418701, "learning_rate": 3.753394223648482e-05, "loss": 0.1037, "step": 1060 }, { "epoch": 0.7761521580102414, "grad_norm": 3.976893424987793, "learning_rate": 3.75215996050358e-05, "loss": 0.0771, "step": 1061 }, { "epoch": 0.7768836869056328, "grad_norm": 11.672456741333008, "learning_rate": 3.750925697358677e-05, "loss": 0.084, "step": 1062 }, { "epoch": 0.7776152158010241, "grad_norm": 2.071512222290039, "learning_rate": 3.7496914342137744e-05, "loss": 0.1241, "step": 1063 }, { "epoch": 0.7783467446964155, "grad_norm": 5.296865940093994, "learning_rate": 3.748457171068872e-05, "loss": 0.09, "step": 1064 }, { "epoch": 0.7790782735918069, "grad_norm": 0.7184045910835266, "learning_rate": 3.74722290792397e-05, "loss": 0.0248, "step": 1065 }, { "epoch": 0.7798098024871982, "grad_norm": 4.133099555969238, "learning_rate": 3.745988644779067e-05, "loss": 0.2923, "step": 1066 }, { "epoch": 0.7805413313825896, "grad_norm": 4.3759541511535645, "learning_rate": 3.7447543816341645e-05, "loss": 0.085, "step": 1067 }, { "epoch": 0.7812728602779809, "grad_norm": 5.699994087219238, "learning_rate": 3.743520118489262e-05, "loss": 0.0542, "step": 1068 }, { "epoch": 0.7820043891733723, "grad_norm": 4.137044906616211, "learning_rate": 3.74228585534436e-05, "loss": 0.1238, "step": 1069 }, { "epoch": 0.7827359180687637, "grad_norm": 7.586370468139648, "learning_rate": 3.741051592199457e-05, "loss": 0.0985, "step": 1070 }, { "epoch": 0.783467446964155, "grad_norm": 1.7356756925582886, "learning_rate": 3.7398173290545545e-05, "loss": 0.0257, "step": 1071 }, { "epoch": 0.7841989758595465, "grad_norm": 1.5577689409255981, "learning_rate": 3.738583065909652e-05, "loss": 0.0329, "step": 1072 }, { "epoch": 0.7849305047549379, "grad_norm": 0.5070388317108154, "learning_rate": 3.73734880276475e-05, "loss": 0.0148, "step": 1073 }, { "epoch": 0.7856620336503292, "grad_norm": 3.483774423599243, "learning_rate": 3.736114539619847e-05, "loss": 0.0423, "step": 1074 }, { "epoch": 0.7863935625457206, "grad_norm": 0.3883195221424103, "learning_rate": 3.7348802764749445e-05, "loss": 0.0097, "step": 1075 }, { "epoch": 0.787125091441112, "grad_norm": 1.6701300144195557, "learning_rate": 3.733646013330042e-05, "loss": 0.0111, "step": 1076 }, { "epoch": 0.7878566203365033, "grad_norm": 7.759876728057861, "learning_rate": 3.73241175018514e-05, "loss": 0.1144, "step": 1077 }, { "epoch": 0.7885881492318947, "grad_norm": 0.5120604038238525, "learning_rate": 3.731177487040237e-05, "loss": 0.009, "step": 1078 }, { "epoch": 0.789319678127286, "grad_norm": 1.1746852397918701, "learning_rate": 3.7299432238953346e-05, "loss": 0.0073, "step": 1079 }, { "epoch": 0.7900512070226774, "grad_norm": 11.427943229675293, "learning_rate": 3.728708960750432e-05, "loss": 0.0682, "step": 1080 }, { "epoch": 0.7907827359180688, "grad_norm": 3.772310972213745, "learning_rate": 3.72747469760553e-05, "loss": 0.1405, "step": 1081 }, { "epoch": 0.7915142648134601, "grad_norm": 0.20535320043563843, "learning_rate": 3.726240434460627e-05, "loss": 0.0039, "step": 1082 }, { "epoch": 0.7922457937088515, "grad_norm": 21.914583206176758, "learning_rate": 3.7250061713157246e-05, "loss": 0.2123, "step": 1083 }, { "epoch": 0.7929773226042429, "grad_norm": 14.826451301574707, "learning_rate": 3.723771908170822e-05, "loss": 0.0354, "step": 1084 }, { "epoch": 0.7937088514996342, "grad_norm": 35.385921478271484, "learning_rate": 3.72253764502592e-05, "loss": 0.2646, "step": 1085 }, { "epoch": 0.7944403803950256, "grad_norm": 22.324390411376953, "learning_rate": 3.721303381881017e-05, "loss": 0.2869, "step": 1086 }, { "epoch": 0.7951719092904169, "grad_norm": 18.230485916137695, "learning_rate": 3.720069118736115e-05, "loss": 0.1224, "step": 1087 }, { "epoch": 0.7959034381858083, "grad_norm": 0.04892382398247719, "learning_rate": 3.7188348555912123e-05, "loss": 0.0022, "step": 1088 }, { "epoch": 0.7966349670811997, "grad_norm": 0.7617624402046204, "learning_rate": 3.71760059244631e-05, "loss": 0.0049, "step": 1089 }, { "epoch": 0.797366495976591, "grad_norm": 13.784245491027832, "learning_rate": 3.716366329301407e-05, "loss": 0.0483, "step": 1090 }, { "epoch": 0.7980980248719824, "grad_norm": 0.048972342163324356, "learning_rate": 3.715132066156505e-05, "loss": 0.0022, "step": 1091 }, { "epoch": 0.7988295537673739, "grad_norm": 20.11948013305664, "learning_rate": 3.7138978030116024e-05, "loss": 0.0321, "step": 1092 }, { "epoch": 0.7995610826627652, "grad_norm": 0.039313867688179016, "learning_rate": 3.7126635398667e-05, "loss": 0.0018, "step": 1093 }, { "epoch": 0.8002926115581566, "grad_norm": 0.045873384922742844, "learning_rate": 3.711429276721797e-05, "loss": 0.002, "step": 1094 }, { "epoch": 0.8010241404535479, "grad_norm": 0.7714202404022217, "learning_rate": 3.710195013576895e-05, "loss": 0.1799, "step": 1095 }, { "epoch": 0.8017556693489393, "grad_norm": 1.5761042833328247, "learning_rate": 3.7089607504319924e-05, "loss": 0.3622, "step": 1096 }, { "epoch": 0.8024871982443307, "grad_norm": 1.188443660736084, "learning_rate": 3.70772648728709e-05, "loss": 0.1732, "step": 1097 }, { "epoch": 0.803218727139722, "grad_norm": 0.22692027688026428, "learning_rate": 3.706492224142187e-05, "loss": 0.0041, "step": 1098 }, { "epoch": 0.8039502560351134, "grad_norm": 8.514050483703613, "learning_rate": 3.705257960997285e-05, "loss": 0.349, "step": 1099 }, { "epoch": 0.8046817849305048, "grad_norm": 12.092692375183105, "learning_rate": 3.7040236978523825e-05, "loss": 0.3904, "step": 1100 }, { "epoch": 0.8054133138258961, "grad_norm": 0.30475661158561707, "learning_rate": 3.70278943470748e-05, "loss": 0.0034, "step": 1101 }, { "epoch": 0.8061448427212875, "grad_norm": 1.900626540184021, "learning_rate": 3.701555171562577e-05, "loss": 0.1444, "step": 1102 }, { "epoch": 0.8068763716166789, "grad_norm": 0.11917265504598618, "learning_rate": 3.700320908417675e-05, "loss": 0.0055, "step": 1103 }, { "epoch": 0.8076079005120702, "grad_norm": 0.16247105598449707, "learning_rate": 3.6990866452727725e-05, "loss": 0.0074, "step": 1104 }, { "epoch": 0.8083394294074616, "grad_norm": 0.6768609881401062, "learning_rate": 3.69785238212787e-05, "loss": 0.1475, "step": 1105 }, { "epoch": 0.8090709583028529, "grad_norm": 0.1992805004119873, "learning_rate": 3.696618118982967e-05, "loss": 0.009, "step": 1106 }, { "epoch": 0.8098024871982443, "grad_norm": 12.276022911071777, "learning_rate": 3.695383855838065e-05, "loss": 0.0236, "step": 1107 }, { "epoch": 0.8105340160936357, "grad_norm": 31.3634090423584, "learning_rate": 3.6941495926931625e-05, "loss": 0.1733, "step": 1108 }, { "epoch": 0.811265544989027, "grad_norm": 3.626657724380493, "learning_rate": 3.69291532954826e-05, "loss": 0.0184, "step": 1109 }, { "epoch": 0.8119970738844184, "grad_norm": 0.251913845539093, "learning_rate": 3.691681066403357e-05, "loss": 0.0117, "step": 1110 }, { "epoch": 0.8127286027798098, "grad_norm": 4.771862506866455, "learning_rate": 3.690446803258455e-05, "loss": 0.1659, "step": 1111 }, { "epoch": 0.8134601316752011, "grad_norm": 17.779802322387695, "learning_rate": 3.6892125401135526e-05, "loss": 0.084, "step": 1112 }, { "epoch": 0.8141916605705926, "grad_norm": 1.6818208694458008, "learning_rate": 3.68797827696865e-05, "loss": 0.018, "step": 1113 }, { "epoch": 0.8149231894659839, "grad_norm": 0.7858742475509644, "learning_rate": 3.686744013823747e-05, "loss": 0.0173, "step": 1114 }, { "epoch": 0.8156547183613753, "grad_norm": 1.6476829051971436, "learning_rate": 3.685509750678845e-05, "loss": 0.1218, "step": 1115 }, { "epoch": 0.8163862472567667, "grad_norm": 13.304701805114746, "learning_rate": 3.6842754875339426e-05, "loss": 0.1091, "step": 1116 }, { "epoch": 0.817117776152158, "grad_norm": 0.394349604845047, "learning_rate": 3.68304122438904e-05, "loss": 0.0167, "step": 1117 }, { "epoch": 0.8178493050475494, "grad_norm": 0.40692126750946045, "learning_rate": 3.681806961244137e-05, "loss": 0.0183, "step": 1118 }, { "epoch": 0.8185808339429408, "grad_norm": 0.6504019498825073, "learning_rate": 3.680572698099235e-05, "loss": 0.0166, "step": 1119 }, { "epoch": 0.8193123628383321, "grad_norm": 0.19595515727996826, "learning_rate": 3.6793384349543327e-05, "loss": 0.0085, "step": 1120 }, { "epoch": 0.8200438917337235, "grad_norm": 2.3169803619384766, "learning_rate": 3.67810417180943e-05, "loss": 0.123, "step": 1121 }, { "epoch": 0.8207754206291149, "grad_norm": 7.032483100891113, "learning_rate": 3.676869908664527e-05, "loss": 0.1468, "step": 1122 }, { "epoch": 0.8215069495245062, "grad_norm": 5.8273749351501465, "learning_rate": 3.675635645519625e-05, "loss": 0.036, "step": 1123 }, { "epoch": 0.8222384784198976, "grad_norm": 0.2746058404445648, "learning_rate": 3.674401382374723e-05, "loss": 0.0112, "step": 1124 }, { "epoch": 0.8229700073152889, "grad_norm": 0.19737322628498077, "learning_rate": 3.6731671192298204e-05, "loss": 0.0088, "step": 1125 }, { "epoch": 0.8237015362106803, "grad_norm": 16.120397567749023, "learning_rate": 3.6719328560849174e-05, "loss": 0.0584, "step": 1126 }, { "epoch": 0.8244330651060717, "grad_norm": 15.767183303833008, "learning_rate": 3.670698592940015e-05, "loss": 0.144, "step": 1127 }, { "epoch": 0.825164594001463, "grad_norm": 0.13112042844295502, "learning_rate": 3.669464329795113e-05, "loss": 0.0051, "step": 1128 }, { "epoch": 0.8258961228968544, "grad_norm": 0.10558177530765533, "learning_rate": 3.66823006665021e-05, "loss": 0.0045, "step": 1129 }, { "epoch": 0.8266276517922458, "grad_norm": 0.17575369775295258, "learning_rate": 3.6669958035053074e-05, "loss": 0.005, "step": 1130 }, { "epoch": 0.8273591806876371, "grad_norm": 9.906211853027344, "learning_rate": 3.665761540360405e-05, "loss": 0.0931, "step": 1131 }, { "epoch": 0.8280907095830286, "grad_norm": 0.20811326801776886, "learning_rate": 3.664527277215503e-05, "loss": 0.0045, "step": 1132 }, { "epoch": 0.8288222384784198, "grad_norm": 16.610424041748047, "learning_rate": 3.6632930140706e-05, "loss": 0.0393, "step": 1133 }, { "epoch": 0.8295537673738113, "grad_norm": 0.06370551884174347, "learning_rate": 3.6620587509256975e-05, "loss": 0.0029, "step": 1134 }, { "epoch": 0.8302852962692027, "grad_norm": 0.04557839035987854, "learning_rate": 3.660824487780795e-05, "loss": 0.002, "step": 1135 }, { "epoch": 0.831016825164594, "grad_norm": 0.1478087306022644, "learning_rate": 3.659590224635893e-05, "loss": 0.0029, "step": 1136 }, { "epoch": 0.8317483540599854, "grad_norm": 0.0542423240840435, "learning_rate": 3.65835596149099e-05, "loss": 0.0023, "step": 1137 }, { "epoch": 0.8324798829553768, "grad_norm": 2.745847463607788, "learning_rate": 3.6571216983460875e-05, "loss": 0.01, "step": 1138 }, { "epoch": 0.8332114118507681, "grad_norm": 9.858214378356934, "learning_rate": 3.655887435201185e-05, "loss": 0.1455, "step": 1139 }, { "epoch": 0.8339429407461595, "grad_norm": 1.7605944871902466, "learning_rate": 3.654653172056283e-05, "loss": 0.0095, "step": 1140 }, { "epoch": 0.8346744696415508, "grad_norm": 7.674892902374268, "learning_rate": 3.65341890891138e-05, "loss": 0.0325, "step": 1141 }, { "epoch": 0.8354059985369422, "grad_norm": 7.620135307312012, "learning_rate": 3.6521846457664775e-05, "loss": 0.143, "step": 1142 }, { "epoch": 0.8361375274323336, "grad_norm": 1.800767421722412, "learning_rate": 3.650950382621575e-05, "loss": 0.1702, "step": 1143 }, { "epoch": 0.8368690563277249, "grad_norm": 0.25818291306495667, "learning_rate": 3.649716119476673e-05, "loss": 0.0028, "step": 1144 }, { "epoch": 0.8376005852231163, "grad_norm": 16.649354934692383, "learning_rate": 3.64848185633177e-05, "loss": 0.1178, "step": 1145 }, { "epoch": 0.8383321141185077, "grad_norm": 1.1298874616622925, "learning_rate": 3.6472475931868676e-05, "loss": 0.173, "step": 1146 }, { "epoch": 0.839063643013899, "grad_norm": 0.07936200499534607, "learning_rate": 3.646013330041965e-05, "loss": 0.0022, "step": 1147 }, { "epoch": 0.8397951719092904, "grad_norm": 3.351274013519287, "learning_rate": 3.644779066897063e-05, "loss": 0.3457, "step": 1148 }, { "epoch": 0.8405267008046818, "grad_norm": 10.962160110473633, "learning_rate": 3.64354480375216e-05, "loss": 0.1017, "step": 1149 }, { "epoch": 0.8412582297000731, "grad_norm": 14.838444709777832, "learning_rate": 3.6423105406072576e-05, "loss": 0.074, "step": 1150 }, { "epoch": 0.8419897585954645, "grad_norm": 23.12447738647461, "learning_rate": 3.641076277462355e-05, "loss": 0.1128, "step": 1151 }, { "epoch": 0.8427212874908558, "grad_norm": 0.07154089957475662, "learning_rate": 3.639842014317453e-05, "loss": 0.0031, "step": 1152 }, { "epoch": 0.8434528163862473, "grad_norm": 3.516690969467163, "learning_rate": 3.63860775117255e-05, "loss": 0.1277, "step": 1153 }, { "epoch": 0.8441843452816387, "grad_norm": 3.8505780696868896, "learning_rate": 3.6373734880276477e-05, "loss": 0.149, "step": 1154 }, { "epoch": 0.84491587417703, "grad_norm": 2.4613749980926514, "learning_rate": 3.636139224882745e-05, "loss": 0.0137, "step": 1155 }, { "epoch": 0.8456474030724214, "grad_norm": 6.416619777679443, "learning_rate": 3.634904961737843e-05, "loss": 0.0366, "step": 1156 }, { "epoch": 0.8463789319678128, "grad_norm": 5.611688137054443, "learning_rate": 3.63367069859294e-05, "loss": 0.0865, "step": 1157 }, { "epoch": 0.8471104608632041, "grad_norm": 15.718743324279785, "learning_rate": 3.632436435448038e-05, "loss": 0.104, "step": 1158 }, { "epoch": 0.8478419897585955, "grad_norm": 1.6273428201675415, "learning_rate": 3.6312021723031354e-05, "loss": 0.0177, "step": 1159 }, { "epoch": 0.8485735186539868, "grad_norm": 14.527206420898438, "learning_rate": 3.629967909158233e-05, "loss": 0.1434, "step": 1160 }, { "epoch": 0.8493050475493782, "grad_norm": 3.933804512023926, "learning_rate": 3.62873364601333e-05, "loss": 0.0351, "step": 1161 }, { "epoch": 0.8500365764447696, "grad_norm": 11.103996276855469, "learning_rate": 3.627499382868428e-05, "loss": 0.1196, "step": 1162 }, { "epoch": 0.8507681053401609, "grad_norm": 10.623714447021484, "learning_rate": 3.6262651197235254e-05, "loss": 0.0907, "step": 1163 }, { "epoch": 0.8514996342355523, "grad_norm": 2.2590460777282715, "learning_rate": 3.625030856578623e-05, "loss": 0.0177, "step": 1164 }, { "epoch": 0.8522311631309437, "grad_norm": 2.660825490951538, "learning_rate": 3.62379659343372e-05, "loss": 0.1173, "step": 1165 }, { "epoch": 0.852962692026335, "grad_norm": 0.1347128003835678, "learning_rate": 3.622562330288818e-05, "loss": 0.0043, "step": 1166 }, { "epoch": 0.8536942209217264, "grad_norm": 6.939416885375977, "learning_rate": 3.6213280671439154e-05, "loss": 0.1712, "step": 1167 }, { "epoch": 0.8544257498171178, "grad_norm": 2.950993299484253, "learning_rate": 3.620093803999013e-05, "loss": 0.1579, "step": 1168 }, { "epoch": 0.8551572787125091, "grad_norm": 0.8513153195381165, "learning_rate": 3.61885954085411e-05, "loss": 0.0104, "step": 1169 }, { "epoch": 0.8558888076079005, "grad_norm": 14.566949844360352, "learning_rate": 3.617625277709208e-05, "loss": 0.2654, "step": 1170 }, { "epoch": 0.8566203365032918, "grad_norm": 5.616229057312012, "learning_rate": 3.6163910145643055e-05, "loss": 0.1193, "step": 1171 }, { "epoch": 0.8573518653986832, "grad_norm": 1.7381362915039062, "learning_rate": 3.615156751419403e-05, "loss": 0.0937, "step": 1172 }, { "epoch": 0.8580833942940747, "grad_norm": 5.354511737823486, "learning_rate": 3.6139224882745e-05, "loss": 0.1098, "step": 1173 }, { "epoch": 0.858814923189466, "grad_norm": 0.6128389239311218, "learning_rate": 3.612688225129598e-05, "loss": 0.0141, "step": 1174 }, { "epoch": 0.8595464520848574, "grad_norm": 1.260972023010254, "learning_rate": 3.6114539619846955e-05, "loss": 0.0212, "step": 1175 }, { "epoch": 0.8602779809802488, "grad_norm": 3.4271926879882812, "learning_rate": 3.610219698839793e-05, "loss": 0.0188, "step": 1176 }, { "epoch": 0.8610095098756401, "grad_norm": 0.7983370423316956, "learning_rate": 3.60898543569489e-05, "loss": 0.0126, "step": 1177 }, { "epoch": 0.8617410387710315, "grad_norm": 1.2084529399871826, "learning_rate": 3.607751172549988e-05, "loss": 0.0186, "step": 1178 }, { "epoch": 0.8624725676664228, "grad_norm": 0.27804988622665405, "learning_rate": 3.6065169094050856e-05, "loss": 0.0047, "step": 1179 }, { "epoch": 0.8632040965618142, "grad_norm": 1.3906307220458984, "learning_rate": 3.605282646260183e-05, "loss": 0.0085, "step": 1180 }, { "epoch": 0.8639356254572056, "grad_norm": 0.3746333122253418, "learning_rate": 3.60404838311528e-05, "loss": 0.0037, "step": 1181 }, { "epoch": 0.8646671543525969, "grad_norm": 3.4379327297210693, "learning_rate": 3.602814119970378e-05, "loss": 0.0102, "step": 1182 }, { "epoch": 0.8653986832479883, "grad_norm": 20.274341583251953, "learning_rate": 3.6015798568254756e-05, "loss": 0.1128, "step": 1183 }, { "epoch": 0.8661302121433797, "grad_norm": 1.8224565982818604, "learning_rate": 3.600345593680573e-05, "loss": 0.172, "step": 1184 }, { "epoch": 0.866861741038771, "grad_norm": 17.517250061035156, "learning_rate": 3.59911133053567e-05, "loss": 0.1231, "step": 1185 }, { "epoch": 0.8675932699341624, "grad_norm": 1.0547069311141968, "learning_rate": 3.597877067390768e-05, "loss": 0.2505, "step": 1186 }, { "epoch": 0.8683247988295537, "grad_norm": 12.502161979675293, "learning_rate": 3.5966428042458656e-05, "loss": 0.2978, "step": 1187 }, { "epoch": 0.8690563277249451, "grad_norm": 0.06060807406902313, "learning_rate": 3.595408541100963e-05, "loss": 0.0025, "step": 1188 }, { "epoch": 0.8697878566203365, "grad_norm": 0.94465172290802, "learning_rate": 3.59417427795606e-05, "loss": 0.009, "step": 1189 }, { "epoch": 0.8705193855157278, "grad_norm": 0.08379081636667252, "learning_rate": 3.592940014811158e-05, "loss": 0.003, "step": 1190 }, { "epoch": 0.8712509144111192, "grad_norm": 5.571483612060547, "learning_rate": 3.591705751666256e-05, "loss": 0.3975, "step": 1191 }, { "epoch": 0.8719824433065106, "grad_norm": 0.08954920619726181, "learning_rate": 3.5904714885213534e-05, "loss": 0.003, "step": 1192 }, { "epoch": 0.8727139722019019, "grad_norm": 0.07338370382785797, "learning_rate": 3.5892372253764504e-05, "loss": 0.0031, "step": 1193 }, { "epoch": 0.8734455010972934, "grad_norm": 12.561967849731445, "learning_rate": 3.588002962231548e-05, "loss": 0.2037, "step": 1194 }, { "epoch": 0.8741770299926848, "grad_norm": 1.1642833948135376, "learning_rate": 3.586768699086646e-05, "loss": 0.1799, "step": 1195 }, { "epoch": 0.8749085588880761, "grad_norm": 0.260811984539032, "learning_rate": 3.5855344359417434e-05, "loss": 0.0036, "step": 1196 }, { "epoch": 0.8756400877834675, "grad_norm": 5.019128799438477, "learning_rate": 3.5843001727968404e-05, "loss": 0.1275, "step": 1197 }, { "epoch": 0.8763716166788588, "grad_norm": 0.08676271140575409, "learning_rate": 3.583065909651938e-05, "loss": 0.004, "step": 1198 }, { "epoch": 0.8771031455742502, "grad_norm": 0.13112711906433105, "learning_rate": 3.581831646507036e-05, "loss": 0.0049, "step": 1199 }, { "epoch": 0.8778346744696416, "grad_norm": 0.07610342651605606, "learning_rate": 3.5805973833621334e-05, "loss": 0.0035, "step": 1200 }, { "epoch": 0.8785662033650329, "grad_norm": 0.07200533151626587, "learning_rate": 3.5793631202172304e-05, "loss": 0.0035, "step": 1201 }, { "epoch": 0.8792977322604243, "grad_norm": 0.07944530248641968, "learning_rate": 3.578128857072328e-05, "loss": 0.0038, "step": 1202 }, { "epoch": 0.8800292611558157, "grad_norm": 5.468553066253662, "learning_rate": 3.576894593927426e-05, "loss": 0.1348, "step": 1203 }, { "epoch": 0.880760790051207, "grad_norm": 0.10427096486091614, "learning_rate": 3.5756603307825235e-05, "loss": 0.0044, "step": 1204 }, { "epoch": 0.8814923189465984, "grad_norm": 0.14841090142726898, "learning_rate": 3.5744260676376205e-05, "loss": 0.0044, "step": 1205 }, { "epoch": 0.8822238478419897, "grad_norm": 1.071906566619873, "learning_rate": 3.573191804492718e-05, "loss": 0.2015, "step": 1206 }, { "epoch": 0.8829553767373811, "grad_norm": 0.08777624368667603, "learning_rate": 3.571957541347816e-05, "loss": 0.0042, "step": 1207 }, { "epoch": 0.8836869056327725, "grad_norm": 0.2573084235191345, "learning_rate": 3.5707232782029135e-05, "loss": 0.0062, "step": 1208 }, { "epoch": 0.8844184345281638, "grad_norm": 0.3076290786266327, "learning_rate": 3.5694890150580105e-05, "loss": 0.0064, "step": 1209 }, { "epoch": 0.8851499634235552, "grad_norm": 0.11940550059080124, "learning_rate": 3.568254751913108e-05, "loss": 0.0059, "step": 1210 }, { "epoch": 0.8858814923189466, "grad_norm": 0.1304163783788681, "learning_rate": 3.567020488768206e-05, "loss": 0.0066, "step": 1211 }, { "epoch": 0.8866130212143379, "grad_norm": 0.10190224647521973, "learning_rate": 3.5657862256233036e-05, "loss": 0.0051, "step": 1212 }, { "epoch": 0.8873445501097293, "grad_norm": 1.4498507976531982, "learning_rate": 3.5645519624784006e-05, "loss": 0.0126, "step": 1213 }, { "epoch": 0.8880760790051208, "grad_norm": 6.455715656280518, "learning_rate": 3.563317699333498e-05, "loss": 0.0273, "step": 1214 }, { "epoch": 0.888807607900512, "grad_norm": 3.2564661502838135, "learning_rate": 3.562083436188596e-05, "loss": 0.2917, "step": 1215 }, { "epoch": 0.8895391367959035, "grad_norm": 2.2873740196228027, "learning_rate": 3.560849173043693e-05, "loss": 0.3173, "step": 1216 }, { "epoch": 0.8902706656912948, "grad_norm": 0.13191908597946167, "learning_rate": 3.5596149098987906e-05, "loss": 0.0063, "step": 1217 }, { "epoch": 0.8910021945866862, "grad_norm": 2.3480215072631836, "learning_rate": 3.558380646753888e-05, "loss": 0.3117, "step": 1218 }, { "epoch": 0.8917337234820776, "grad_norm": 8.037874221801758, "learning_rate": 3.557146383608986e-05, "loss": 0.0257, "step": 1219 }, { "epoch": 0.8924652523774689, "grad_norm": 4.598278999328613, "learning_rate": 3.555912120464083e-05, "loss": 0.1667, "step": 1220 }, { "epoch": 0.8931967812728603, "grad_norm": 0.7400414347648621, "learning_rate": 3.5546778573191806e-05, "loss": 0.1622, "step": 1221 }, { "epoch": 0.8939283101682517, "grad_norm": 17.120569229125977, "learning_rate": 3.553443594174278e-05, "loss": 0.047, "step": 1222 }, { "epoch": 0.894659839063643, "grad_norm": 0.2703888416290283, "learning_rate": 3.552209331029376e-05, "loss": 0.0102, "step": 1223 }, { "epoch": 0.8953913679590344, "grad_norm": 16.594266891479492, "learning_rate": 3.550975067884473e-05, "loss": 0.2078, "step": 1224 }, { "epoch": 0.8961228968544257, "grad_norm": 6.492002010345459, "learning_rate": 3.549740804739571e-05, "loss": 0.0318, "step": 1225 }, { "epoch": 0.8968544257498171, "grad_norm": 0.16398820281028748, "learning_rate": 3.5485065415946684e-05, "loss": 0.0079, "step": 1226 }, { "epoch": 0.8975859546452085, "grad_norm": 0.5179646611213684, "learning_rate": 3.547272278449766e-05, "loss": 0.0092, "step": 1227 }, { "epoch": 0.8983174835405998, "grad_norm": 7.510553359985352, "learning_rate": 3.546038015304863e-05, "loss": 0.0494, "step": 1228 }, { "epoch": 0.8990490124359912, "grad_norm": 1.9805355072021484, "learning_rate": 3.544803752159961e-05, "loss": 0.019, "step": 1229 }, { "epoch": 0.8997805413313826, "grad_norm": 0.3753361105918884, "learning_rate": 3.5435694890150584e-05, "loss": 0.0127, "step": 1230 }, { "epoch": 0.9005120702267739, "grad_norm": 1.0102137327194214, "learning_rate": 3.542335225870156e-05, "loss": 0.0158, "step": 1231 }, { "epoch": 0.9012435991221653, "grad_norm": 28.969539642333984, "learning_rate": 3.541100962725253e-05, "loss": 0.1095, "step": 1232 }, { "epoch": 0.9019751280175567, "grad_norm": 3.430152416229248, "learning_rate": 3.539866699580351e-05, "loss": 0.0203, "step": 1233 }, { "epoch": 0.902706656912948, "grad_norm": 0.3639318645000458, "learning_rate": 3.5386324364354484e-05, "loss": 0.0089, "step": 1234 }, { "epoch": 0.9034381858083395, "grad_norm": 0.6330716609954834, "learning_rate": 3.537398173290546e-05, "loss": 0.0102, "step": 1235 }, { "epoch": 0.9041697147037308, "grad_norm": 0.15021081268787384, "learning_rate": 3.536163910145643e-05, "loss": 0.0068, "step": 1236 }, { "epoch": 0.9049012435991222, "grad_norm": 3.509488105773926, "learning_rate": 3.534929647000741e-05, "loss": 0.2827, "step": 1237 }, { "epoch": 0.9056327724945136, "grad_norm": 0.12826737761497498, "learning_rate": 3.5336953838558385e-05, "loss": 0.0059, "step": 1238 }, { "epoch": 0.9063643013899049, "grad_norm": 0.14289896190166473, "learning_rate": 3.532461120710936e-05, "loss": 0.0065, "step": 1239 }, { "epoch": 0.9070958302852963, "grad_norm": 14.035099983215332, "learning_rate": 3.531226857566033e-05, "loss": 0.2056, "step": 1240 }, { "epoch": 0.9078273591806877, "grad_norm": 17.07788848876953, "learning_rate": 3.529992594421131e-05, "loss": 0.2867, "step": 1241 }, { "epoch": 0.908558888076079, "grad_norm": 6.665271759033203, "learning_rate": 3.5287583312762285e-05, "loss": 0.1047, "step": 1242 }, { "epoch": 0.9092904169714704, "grad_norm": 2.0996744632720947, "learning_rate": 3.527524068131326e-05, "loss": 0.1358, "step": 1243 }, { "epoch": 0.9100219458668617, "grad_norm": 8.831835746765137, "learning_rate": 3.526289804986423e-05, "loss": 0.1232, "step": 1244 }, { "epoch": 0.9107534747622531, "grad_norm": 0.12171490490436554, "learning_rate": 3.525055541841521e-05, "loss": 0.0054, "step": 1245 }, { "epoch": 0.9114850036576445, "grad_norm": 0.16027671098709106, "learning_rate": 3.5238212786966186e-05, "loss": 0.0057, "step": 1246 }, { "epoch": 0.9122165325530358, "grad_norm": 0.19664543867111206, "learning_rate": 3.522587015551716e-05, "loss": 0.0053, "step": 1247 }, { "epoch": 0.9129480614484272, "grad_norm": 16.46450424194336, "learning_rate": 3.521352752406813e-05, "loss": 0.0618, "step": 1248 }, { "epoch": 0.9136795903438186, "grad_norm": 1.2592145204544067, "learning_rate": 3.520118489261911e-05, "loss": 0.0119, "step": 1249 }, { "epoch": 0.9144111192392099, "grad_norm": 0.4839952886104584, "learning_rate": 3.5188842261170086e-05, "loss": 0.0074, "step": 1250 }, { "epoch": 0.9151426481346013, "grad_norm": 18.501768112182617, "learning_rate": 3.517649962972106e-05, "loss": 0.1135, "step": 1251 }, { "epoch": 0.9158741770299926, "grad_norm": 3.033693552017212, "learning_rate": 3.516415699827203e-05, "loss": 0.0074, "step": 1252 }, { "epoch": 0.916605705925384, "grad_norm": 15.249194145202637, "learning_rate": 3.515181436682301e-05, "loss": 0.1011, "step": 1253 }, { "epoch": 0.9173372348207754, "grad_norm": 0.1773417890071869, "learning_rate": 3.5139471735373986e-05, "loss": 0.0059, "step": 1254 }, { "epoch": 0.9180687637161667, "grad_norm": 0.13995128870010376, "learning_rate": 3.512712910392496e-05, "loss": 0.0045, "step": 1255 }, { "epoch": 0.9188002926115582, "grad_norm": 0.0676693394780159, "learning_rate": 3.511478647247593e-05, "loss": 0.0031, "step": 1256 }, { "epoch": 0.9195318215069496, "grad_norm": 9.10500431060791, "learning_rate": 3.510244384102691e-05, "loss": 0.0145, "step": 1257 }, { "epoch": 0.9202633504023409, "grad_norm": 0.11467210948467255, "learning_rate": 3.509010120957789e-05, "loss": 0.0033, "step": 1258 }, { "epoch": 0.9209948792977323, "grad_norm": 0.7510115504264832, "learning_rate": 3.5077758578128863e-05, "loss": 0.1652, "step": 1259 }, { "epoch": 0.9217264081931237, "grad_norm": 0.2613893747329712, "learning_rate": 3.5065415946679833e-05, "loss": 0.0049, "step": 1260 }, { "epoch": 0.922457937088515, "grad_norm": 12.89853572845459, "learning_rate": 3.505307331523081e-05, "loss": 0.0665, "step": 1261 }, { "epoch": 0.9231894659839064, "grad_norm": 0.05714341253042221, "learning_rate": 3.504073068378179e-05, "loss": 0.0026, "step": 1262 }, { "epoch": 0.9239209948792977, "grad_norm": 0.12209334969520569, "learning_rate": 3.5028388052332764e-05, "loss": 0.0041, "step": 1263 }, { "epoch": 0.9246525237746891, "grad_norm": 0.38629889488220215, "learning_rate": 3.5016045420883734e-05, "loss": 0.0037, "step": 1264 }, { "epoch": 0.9253840526700805, "grad_norm": 0.35251206159591675, "learning_rate": 3.500370278943471e-05, "loss": 0.0037, "step": 1265 }, { "epoch": 0.9261155815654718, "grad_norm": 0.08887609094381332, "learning_rate": 3.499136015798569e-05, "loss": 0.0037, "step": 1266 }, { "epoch": 0.9268471104608632, "grad_norm": 14.315370559692383, "learning_rate": 3.4979017526536664e-05, "loss": 0.0786, "step": 1267 }, { "epoch": 0.9275786393562546, "grad_norm": 9.509969711303711, "learning_rate": 3.4966674895087634e-05, "loss": 0.2147, "step": 1268 }, { "epoch": 0.9283101682516459, "grad_norm": 24.303054809570312, "learning_rate": 3.495433226363861e-05, "loss": 0.048, "step": 1269 }, { "epoch": 0.9290416971470373, "grad_norm": 1.5027357339859009, "learning_rate": 3.494198963218959e-05, "loss": 0.005, "step": 1270 }, { "epoch": 0.9297732260424286, "grad_norm": 0.10049434006214142, "learning_rate": 3.4929647000740565e-05, "loss": 0.0028, "step": 1271 }, { "epoch": 0.93050475493782, "grad_norm": 0.08551489561796188, "learning_rate": 3.4917304369291535e-05, "loss": 0.0028, "step": 1272 }, { "epoch": 0.9312362838332114, "grad_norm": 5.423097610473633, "learning_rate": 3.490496173784251e-05, "loss": 0.133, "step": 1273 }, { "epoch": 0.9319678127286027, "grad_norm": 3.29148268699646, "learning_rate": 3.489261910639349e-05, "loss": 0.1383, "step": 1274 }, { "epoch": 0.9326993416239941, "grad_norm": 0.07311111688613892, "learning_rate": 3.4880276474944465e-05, "loss": 0.0026, "step": 1275 }, { "epoch": 0.9334308705193856, "grad_norm": 0.7554091215133667, "learning_rate": 3.4867933843495435e-05, "loss": 0.173, "step": 1276 }, { "epoch": 0.9341623994147769, "grad_norm": 0.09717479348182678, "learning_rate": 3.485559121204641e-05, "loss": 0.0031, "step": 1277 }, { "epoch": 0.9348939283101683, "grad_norm": 5.173389434814453, "learning_rate": 3.484324858059739e-05, "loss": 0.1537, "step": 1278 }, { "epoch": 0.9356254572055597, "grad_norm": 0.9893867373466492, "learning_rate": 3.4830905949148365e-05, "loss": 0.0044, "step": 1279 }, { "epoch": 0.936356986100951, "grad_norm": 3.6123850345611572, "learning_rate": 3.4818563317699335e-05, "loss": 0.0995, "step": 1280 }, { "epoch": 0.9370885149963424, "grad_norm": 18.264923095703125, "learning_rate": 3.480622068625031e-05, "loss": 0.1197, "step": 1281 }, { "epoch": 0.9378200438917337, "grad_norm": 3.6301305294036865, "learning_rate": 3.479387805480129e-05, "loss": 0.2167, "step": 1282 }, { "epoch": 0.9385515727871251, "grad_norm": 11.595027923583984, "learning_rate": 3.4781535423352266e-05, "loss": 0.191, "step": 1283 }, { "epoch": 0.9392831016825165, "grad_norm": 5.308442115783691, "learning_rate": 3.4769192791903236e-05, "loss": 0.0241, "step": 1284 }, { "epoch": 0.9400146305779078, "grad_norm": 0.7865046858787537, "learning_rate": 3.475685016045421e-05, "loss": 0.0104, "step": 1285 }, { "epoch": 0.9407461594732992, "grad_norm": 1.2571054697036743, "learning_rate": 3.474450752900519e-05, "loss": 0.2197, "step": 1286 }, { "epoch": 0.9414776883686906, "grad_norm": 6.001715660095215, "learning_rate": 3.4732164897556166e-05, "loss": 0.233, "step": 1287 }, { "epoch": 0.9422092172640819, "grad_norm": 0.4065973162651062, "learning_rate": 3.4719822266107136e-05, "loss": 0.0104, "step": 1288 }, { "epoch": 0.9429407461594733, "grad_norm": 3.1361210346221924, "learning_rate": 3.470747963465811e-05, "loss": 0.0189, "step": 1289 }, { "epoch": 0.9436722750548646, "grad_norm": 0.13243088126182556, "learning_rate": 3.469513700320909e-05, "loss": 0.0057, "step": 1290 }, { "epoch": 0.944403803950256, "grad_norm": 11.493453979492188, "learning_rate": 3.4682794371760067e-05, "loss": 0.1324, "step": 1291 }, { "epoch": 0.9451353328456474, "grad_norm": 6.502859115600586, "learning_rate": 3.467045174031104e-05, "loss": 0.0836, "step": 1292 }, { "epoch": 0.9458668617410387, "grad_norm": 2.9295060634613037, "learning_rate": 3.4658109108862013e-05, "loss": 0.0116, "step": 1293 }, { "epoch": 0.9465983906364301, "grad_norm": 7.569868087768555, "learning_rate": 3.464576647741299e-05, "loss": 0.0603, "step": 1294 }, { "epoch": 0.9473299195318216, "grad_norm": 4.644510746002197, "learning_rate": 3.463342384596397e-05, "loss": 0.0744, "step": 1295 }, { "epoch": 0.9480614484272128, "grad_norm": 6.791468620300293, "learning_rate": 3.462108121451494e-05, "loss": 0.0708, "step": 1296 }, { "epoch": 0.9487929773226043, "grad_norm": 1.744657278060913, "learning_rate": 3.4608738583065914e-05, "loss": 0.1263, "step": 1297 }, { "epoch": 0.9495245062179956, "grad_norm": 5.067393779754639, "learning_rate": 3.459639595161689e-05, "loss": 0.0875, "step": 1298 }, { "epoch": 0.950256035113387, "grad_norm": 14.149504661560059, "learning_rate": 3.458405332016787e-05, "loss": 0.0954, "step": 1299 }, { "epoch": 0.9509875640087784, "grad_norm": 0.20408152043819427, "learning_rate": 3.457171068871884e-05, "loss": 0.0097, "step": 1300 }, { "epoch": 0.9517190929041697, "grad_norm": 2.09316349029541, "learning_rate": 3.4559368057269814e-05, "loss": 0.1171, "step": 1301 }, { "epoch": 0.9524506217995611, "grad_norm": 1.3302726745605469, "learning_rate": 3.454702542582079e-05, "loss": 0.0206, "step": 1302 }, { "epoch": 0.9531821506949525, "grad_norm": 2.0284738540649414, "learning_rate": 3.453468279437176e-05, "loss": 0.1644, "step": 1303 }, { "epoch": 0.9539136795903438, "grad_norm": 9.571793556213379, "learning_rate": 3.452234016292274e-05, "loss": 0.0463, "step": 1304 }, { "epoch": 0.9546452084857352, "grad_norm": 5.340742588043213, "learning_rate": 3.4509997531473715e-05, "loss": 0.0422, "step": 1305 }, { "epoch": 0.9553767373811266, "grad_norm": 0.8035084009170532, "learning_rate": 3.449765490002469e-05, "loss": 0.0187, "step": 1306 }, { "epoch": 0.9561082662765179, "grad_norm": 1.8099313974380493, "learning_rate": 3.448531226857566e-05, "loss": 0.0257, "step": 1307 }, { "epoch": 0.9568397951719093, "grad_norm": 0.33656057715415955, "learning_rate": 3.447296963712664e-05, "loss": 0.0109, "step": 1308 }, { "epoch": 0.9575713240673006, "grad_norm": 0.4568520486354828, "learning_rate": 3.4460627005677615e-05, "loss": 0.009, "step": 1309 }, { "epoch": 0.958302852962692, "grad_norm": 0.46452251076698303, "learning_rate": 3.444828437422859e-05, "loss": 0.0127, "step": 1310 }, { "epoch": 0.9590343818580834, "grad_norm": 4.556276321411133, "learning_rate": 3.443594174277956e-05, "loss": 0.1459, "step": 1311 }, { "epoch": 0.9597659107534747, "grad_norm": 0.23156856000423431, "learning_rate": 3.442359911133054e-05, "loss": 0.0086, "step": 1312 }, { "epoch": 0.9604974396488661, "grad_norm": 8.81092357635498, "learning_rate": 3.4411256479881515e-05, "loss": 0.0338, "step": 1313 }, { "epoch": 0.9612289685442575, "grad_norm": 6.639081001281738, "learning_rate": 3.439891384843249e-05, "loss": 0.0673, "step": 1314 }, { "epoch": 0.9619604974396488, "grad_norm": 0.15702371299266815, "learning_rate": 3.438657121698346e-05, "loss": 0.0057, "step": 1315 }, { "epoch": 0.9626920263350403, "grad_norm": 7.776434421539307, "learning_rate": 3.437422858553443e-05, "loss": 0.0262, "step": 1316 }, { "epoch": 0.9634235552304315, "grad_norm": 0.1506807506084442, "learning_rate": 3.436188595408541e-05, "loss": 0.0043, "step": 1317 }, { "epoch": 0.964155084125823, "grad_norm": 1.17098069190979, "learning_rate": 3.4349543322636386e-05, "loss": 0.0092, "step": 1318 }, { "epoch": 0.9648866130212144, "grad_norm": 0.3691844344139099, "learning_rate": 3.433720069118736e-05, "loss": 0.0041, "step": 1319 }, { "epoch": 0.9656181419166057, "grad_norm": 1.5316411256790161, "learning_rate": 3.432485805973833e-05, "loss": 0.0078, "step": 1320 }, { "epoch": 0.9663496708119971, "grad_norm": 0.7032435536384583, "learning_rate": 3.431251542828931e-05, "loss": 0.0046, "step": 1321 }, { "epoch": 0.9670811997073885, "grad_norm": 0.48857125639915466, "learning_rate": 3.4300172796840286e-05, "loss": 0.0058, "step": 1322 }, { "epoch": 0.9678127286027798, "grad_norm": 0.24567103385925293, "learning_rate": 3.428783016539126e-05, "loss": 0.0033, "step": 1323 }, { "epoch": 0.9685442574981712, "grad_norm": 0.3992704749107361, "learning_rate": 3.427548753394223e-05, "loss": 0.0041, "step": 1324 }, { "epoch": 0.9692757863935626, "grad_norm": 0.07489871233701706, "learning_rate": 3.426314490249321e-05, "loss": 0.0019, "step": 1325 }, { "epoch": 0.9700073152889539, "grad_norm": 0.14131614565849304, "learning_rate": 3.4250802271044187e-05, "loss": 0.0029, "step": 1326 }, { "epoch": 0.9707388441843453, "grad_norm": 26.853607177734375, "learning_rate": 3.423845963959516e-05, "loss": 0.2438, "step": 1327 }, { "epoch": 0.9714703730797366, "grad_norm": 0.0405794121325016, "learning_rate": 3.422611700814613e-05, "loss": 0.0016, "step": 1328 }, { "epoch": 0.972201901975128, "grad_norm": 17.78426742553711, "learning_rate": 3.421377437669711e-05, "loss": 0.0338, "step": 1329 }, { "epoch": 0.9729334308705194, "grad_norm": 0.03602667152881622, "learning_rate": 3.420143174524809e-05, "loss": 0.0014, "step": 1330 }, { "epoch": 0.9736649597659107, "grad_norm": 3.299311637878418, "learning_rate": 3.4189089113799064e-05, "loss": 0.202, "step": 1331 }, { "epoch": 0.9743964886613021, "grad_norm": 13.286209106445312, "learning_rate": 3.4176746482350034e-05, "loss": 0.075, "step": 1332 }, { "epoch": 0.9751280175566935, "grad_norm": 35.12629699707031, "learning_rate": 3.416440385090101e-05, "loss": 0.2472, "step": 1333 }, { "epoch": 0.9758595464520848, "grad_norm": 27.41487693786621, "learning_rate": 3.415206121945199e-05, "loss": 0.0449, "step": 1334 }, { "epoch": 0.9765910753474762, "grad_norm": 30.795169830322266, "learning_rate": 3.4139718588002964e-05, "loss": 0.0291, "step": 1335 }, { "epoch": 0.9773226042428675, "grad_norm": 2.476339340209961, "learning_rate": 3.4127375956553934e-05, "loss": 0.2017, "step": 1336 }, { "epoch": 0.978054133138259, "grad_norm": 0.028546392917633057, "learning_rate": 3.411503332510491e-05, "loss": 0.0013, "step": 1337 }, { "epoch": 0.9787856620336504, "grad_norm": 0.03754347562789917, "learning_rate": 3.410269069365589e-05, "loss": 0.0013, "step": 1338 }, { "epoch": 0.9795171909290417, "grad_norm": 7.141325950622559, "learning_rate": 3.4090348062206865e-05, "loss": 0.3399, "step": 1339 }, { "epoch": 0.9802487198244331, "grad_norm": 2.107999086380005, "learning_rate": 3.4078005430757835e-05, "loss": 0.1833, "step": 1340 }, { "epoch": 0.9809802487198245, "grad_norm": 1.6641510725021362, "learning_rate": 3.406566279930881e-05, "loss": 0.2085, "step": 1341 }, { "epoch": 0.9817117776152158, "grad_norm": 18.024417877197266, "learning_rate": 3.405332016785979e-05, "loss": 0.2938, "step": 1342 }, { "epoch": 0.9824433065106072, "grad_norm": 6.3488030433654785, "learning_rate": 3.4040977536410765e-05, "loss": 0.3224, "step": 1343 }, { "epoch": 0.9831748354059985, "grad_norm": 0.04848192259669304, "learning_rate": 3.4028634904961735e-05, "loss": 0.002, "step": 1344 }, { "epoch": 0.9839063643013899, "grad_norm": 20.976848602294922, "learning_rate": 3.401629227351271e-05, "loss": 0.0871, "step": 1345 }, { "epoch": 0.9846378931967813, "grad_norm": 1.4569735527038574, "learning_rate": 3.400394964206369e-05, "loss": 0.3727, "step": 1346 }, { "epoch": 0.9853694220921726, "grad_norm": 0.09759881347417831, "learning_rate": 3.3991607010614665e-05, "loss": 0.0039, "step": 1347 }, { "epoch": 0.986100950987564, "grad_norm": 0.09496621787548065, "learning_rate": 3.3979264379165635e-05, "loss": 0.0039, "step": 1348 }, { "epoch": 0.9868324798829554, "grad_norm": 1.4227921962738037, "learning_rate": 3.396692174771661e-05, "loss": 0.1581, "step": 1349 }, { "epoch": 0.9875640087783467, "grad_norm": 0.17230813205242157, "learning_rate": 3.395457911626759e-05, "loss": 0.0077, "step": 1350 }, { "epoch": 0.9882955376737381, "grad_norm": 0.27443447709083557, "learning_rate": 3.3942236484818566e-05, "loss": 0.0092, "step": 1351 }, { "epoch": 0.9890270665691295, "grad_norm": 0.4629533886909485, "learning_rate": 3.3929893853369536e-05, "loss": 0.0108, "step": 1352 }, { "epoch": 0.9897585954645208, "grad_norm": 0.2199287861585617, "learning_rate": 3.391755122192051e-05, "loss": 0.0091, "step": 1353 }, { "epoch": 0.9904901243599122, "grad_norm": 1.5315794944763184, "learning_rate": 3.390520859047149e-05, "loss": 0.1067, "step": 1354 }, { "epoch": 0.9912216532553035, "grad_norm": 6.000120639801025, "learning_rate": 3.3892865959022466e-05, "loss": 0.0992, "step": 1355 }, { "epoch": 0.991953182150695, "grad_norm": 2.5323264598846436, "learning_rate": 3.3880523327573436e-05, "loss": 0.1003, "step": 1356 }, { "epoch": 0.9926847110460864, "grad_norm": 0.21028047800064087, "learning_rate": 3.386818069612441e-05, "loss": 0.01, "step": 1357 }, { "epoch": 0.9934162399414777, "grad_norm": 3.440890312194824, "learning_rate": 3.385583806467539e-05, "loss": 0.0166, "step": 1358 }, { "epoch": 0.9941477688368691, "grad_norm": 1.734386920928955, "learning_rate": 3.3843495433226366e-05, "loss": 0.0172, "step": 1359 }, { "epoch": 0.9948792977322605, "grad_norm": 1.7134753465652466, "learning_rate": 3.3831152801777336e-05, "loss": 0.0304, "step": 1360 }, { "epoch": 0.9956108266276518, "grad_norm": 9.079950332641602, "learning_rate": 3.381881017032831e-05, "loss": 0.0298, "step": 1361 }, { "epoch": 0.9963423555230432, "grad_norm": 0.4615827798843384, "learning_rate": 3.380646753887929e-05, "loss": 0.0105, "step": 1362 }, { "epoch": 0.9970738844184345, "grad_norm": 0.4126068353652954, "learning_rate": 3.379412490743027e-05, "loss": 0.0108, "step": 1363 }, { "epoch": 0.9978054133138259, "grad_norm": 5.417700290679932, "learning_rate": 3.378178227598124e-05, "loss": 0.0213, "step": 1364 }, { "epoch": 0.9985369422092173, "grad_norm": 3.4001145362854004, "learning_rate": 3.3769439644532214e-05, "loss": 0.1431, "step": 1365 }, { "epoch": 0.9992684711046086, "grad_norm": 0.37163859605789185, "learning_rate": 3.375709701308319e-05, "loss": 0.0084, "step": 1366 }, { "epoch": 1.0, "grad_norm": 0.10806220769882202, "learning_rate": 3.374475438163416e-05, "loss": 0.0048, "step": 1367 }, { "epoch": 1.0, "eval_accuracy": 0.9814738575545492, "eval_loss": 0.07000496238470078, "eval_runtime": 90.7622, "eval_samples_per_second": 53.524, "eval_steps_per_second": 1.675, "step": 1367 }, { "epoch": 1.0007315288953913, "grad_norm": 0.14122112095355988, "learning_rate": 3.373241175018514e-05, "loss": 0.005, "step": 1368 }, { "epoch": 1.0014630577907828, "grad_norm": 1.8560173511505127, "learning_rate": 3.3720069118736114e-05, "loss": 0.1455, "step": 1369 }, { "epoch": 1.0021945866861741, "grad_norm": 0.11914806067943573, "learning_rate": 3.370772648728709e-05, "loss": 0.0041, "step": 1370 }, { "epoch": 1.0029261155815654, "grad_norm": 0.4728309214115143, "learning_rate": 3.369538385583806e-05, "loss": 0.0048, "step": 1371 }, { "epoch": 1.003657644476957, "grad_norm": 0.10563748329877853, "learning_rate": 3.368304122438904e-05, "loss": 0.0046, "step": 1372 }, { "epoch": 1.0043891733723482, "grad_norm": 0.068609319627285, "learning_rate": 3.3670698592940014e-05, "loss": 0.0031, "step": 1373 }, { "epoch": 1.0051207022677395, "grad_norm": 7.077928066253662, "learning_rate": 3.365835596149099e-05, "loss": 0.0939, "step": 1374 }, { "epoch": 1.005852231163131, "grad_norm": 0.0787535160779953, "learning_rate": 3.364601333004196e-05, "loss": 0.0033, "step": 1375 }, { "epoch": 1.0065837600585223, "grad_norm": 12.91358470916748, "learning_rate": 3.363367069859294e-05, "loss": 0.0178, "step": 1376 }, { "epoch": 1.0073152889539136, "grad_norm": 9.928356170654297, "learning_rate": 3.3621328067143915e-05, "loss": 0.0234, "step": 1377 }, { "epoch": 1.008046817849305, "grad_norm": 1.0486674308776855, "learning_rate": 3.360898543569489e-05, "loss": 0.18, "step": 1378 }, { "epoch": 1.0087783467446965, "grad_norm": 13.070659637451172, "learning_rate": 3.359664280424586e-05, "loss": 0.1408, "step": 1379 }, { "epoch": 1.0095098756400878, "grad_norm": 0.2253500074148178, "learning_rate": 3.358430017279684e-05, "loss": 0.0042, "step": 1380 }, { "epoch": 1.010241404535479, "grad_norm": 13.050259590148926, "learning_rate": 3.3571957541347815e-05, "loss": 0.0588, "step": 1381 }, { "epoch": 1.0109729334308706, "grad_norm": 0.07391895353794098, "learning_rate": 3.355961490989879e-05, "loss": 0.003, "step": 1382 }, { "epoch": 1.0117044623262619, "grad_norm": 0.44378185272216797, "learning_rate": 3.354727227844976e-05, "loss": 0.0041, "step": 1383 }, { "epoch": 1.0124359912216532, "grad_norm": 0.10089312493801117, "learning_rate": 3.353492964700074e-05, "loss": 0.0037, "step": 1384 }, { "epoch": 1.0131675201170447, "grad_norm": 0.07311253249645233, "learning_rate": 3.3522587015551716e-05, "loss": 0.0028, "step": 1385 }, { "epoch": 1.013899049012436, "grad_norm": 0.49573928117752075, "learning_rate": 3.351024438410269e-05, "loss": 0.0043, "step": 1386 }, { "epoch": 1.0146305779078273, "grad_norm": 7.240993976593018, "learning_rate": 3.349790175265366e-05, "loss": 0.0316, "step": 1387 }, { "epoch": 1.0153621068032188, "grad_norm": 8.932456970214844, "learning_rate": 3.348555912120464e-05, "loss": 0.1062, "step": 1388 }, { "epoch": 1.01609363569861, "grad_norm": 0.07182615995407104, "learning_rate": 3.3473216489755616e-05, "loss": 0.0031, "step": 1389 }, { "epoch": 1.0168251645940014, "grad_norm": 0.7465510368347168, "learning_rate": 3.346087385830659e-05, "loss": 0.1726, "step": 1390 }, { "epoch": 1.017556693489393, "grad_norm": 0.06261222064495087, "learning_rate": 3.344853122685756e-05, "loss": 0.0027, "step": 1391 }, { "epoch": 1.0182882223847842, "grad_norm": 9.52116870880127, "learning_rate": 3.343618859540854e-05, "loss": 0.029, "step": 1392 }, { "epoch": 1.0190197512801755, "grad_norm": 0.09411641210317612, "learning_rate": 3.3423845963959516e-05, "loss": 0.0037, "step": 1393 }, { "epoch": 1.019751280175567, "grad_norm": 6.062636375427246, "learning_rate": 3.341150333251049e-05, "loss": 0.1288, "step": 1394 }, { "epoch": 1.0204828090709583, "grad_norm": 15.433335304260254, "learning_rate": 3.339916070106146e-05, "loss": 0.147, "step": 1395 }, { "epoch": 1.0212143379663496, "grad_norm": 0.11908871680498123, "learning_rate": 3.338681806961244e-05, "loss": 0.0037, "step": 1396 }, { "epoch": 1.021945866861741, "grad_norm": 0.1538693904876709, "learning_rate": 3.337447543816342e-05, "loss": 0.0044, "step": 1397 }, { "epoch": 1.0226773957571325, "grad_norm": 15.297937393188477, "learning_rate": 3.3362132806714394e-05, "loss": 0.0261, "step": 1398 }, { "epoch": 1.0234089246525238, "grad_norm": 0.08026278018951416, "learning_rate": 3.3349790175265364e-05, "loss": 0.0033, "step": 1399 }, { "epoch": 1.024140453547915, "grad_norm": 0.05177146941423416, "learning_rate": 3.333744754381634e-05, "loss": 0.0024, "step": 1400 }, { "epoch": 1.0248719824433066, "grad_norm": 0.19196343421936035, "learning_rate": 3.332510491236732e-05, "loss": 0.0039, "step": 1401 }, { "epoch": 1.0256035113386979, "grad_norm": 1.0192885398864746, "learning_rate": 3.3312762280918294e-05, "loss": 0.21, "step": 1402 }, { "epoch": 1.0263350402340892, "grad_norm": 3.841942071914673, "learning_rate": 3.3300419649469264e-05, "loss": 0.118, "step": 1403 }, { "epoch": 1.0270665691294807, "grad_norm": 0.3954388201236725, "learning_rate": 3.328807701802024e-05, "loss": 0.0045, "step": 1404 }, { "epoch": 1.027798098024872, "grad_norm": 0.13675998151302338, "learning_rate": 3.327573438657122e-05, "loss": 0.0047, "step": 1405 }, { "epoch": 1.0285296269202633, "grad_norm": 0.10269396752119064, "learning_rate": 3.3263391755122194e-05, "loss": 0.0038, "step": 1406 }, { "epoch": 1.0292611558156548, "grad_norm": 1.173494815826416, "learning_rate": 3.3251049123673164e-05, "loss": 0.1708, "step": 1407 }, { "epoch": 1.029992684711046, "grad_norm": 4.8429856300354, "learning_rate": 3.323870649222414e-05, "loss": 0.0612, "step": 1408 }, { "epoch": 1.0307242136064374, "grad_norm": 10.792043685913086, "learning_rate": 3.322636386077512e-05, "loss": 0.2334, "step": 1409 }, { "epoch": 1.031455742501829, "grad_norm": 5.702601909637451, "learning_rate": 3.3214021229326095e-05, "loss": 0.0146, "step": 1410 }, { "epoch": 1.0321872713972202, "grad_norm": 0.44920292496681213, "learning_rate": 3.3201678597877065e-05, "loss": 0.0082, "step": 1411 }, { "epoch": 1.0329188002926115, "grad_norm": 0.8219587802886963, "learning_rate": 3.318933596642804e-05, "loss": 0.0103, "step": 1412 }, { "epoch": 1.0336503291880028, "grad_norm": 16.69764518737793, "learning_rate": 3.317699333497902e-05, "loss": 0.0763, "step": 1413 }, { "epoch": 1.0343818580833943, "grad_norm": 4.961604595184326, "learning_rate": 3.3164650703529995e-05, "loss": 0.118, "step": 1414 }, { "epoch": 1.0351133869787856, "grad_norm": 0.11345091462135315, "learning_rate": 3.3152308072080965e-05, "loss": 0.0053, "step": 1415 }, { "epoch": 1.035844915874177, "grad_norm": 0.12337178736925125, "learning_rate": 3.313996544063194e-05, "loss": 0.0051, "step": 1416 }, { "epoch": 1.0365764447695685, "grad_norm": 1.660003423690796, "learning_rate": 3.312762280918292e-05, "loss": 0.013, "step": 1417 }, { "epoch": 1.0373079736649597, "grad_norm": 3.2699437141418457, "learning_rate": 3.3115280177733896e-05, "loss": 0.0169, "step": 1418 }, { "epoch": 1.038039502560351, "grad_norm": 6.860514163970947, "learning_rate": 3.3102937546284866e-05, "loss": 0.1033, "step": 1419 }, { "epoch": 1.0387710314557426, "grad_norm": 5.467836856842041, "learning_rate": 3.309059491483584e-05, "loss": 0.0309, "step": 1420 }, { "epoch": 1.0395025603511339, "grad_norm": 2.2552571296691895, "learning_rate": 3.307825228338682e-05, "loss": 0.014, "step": 1421 }, { "epoch": 1.0402340892465252, "grad_norm": 0.12284594774246216, "learning_rate": 3.3065909651937796e-05, "loss": 0.0053, "step": 1422 }, { "epoch": 1.0409656181419167, "grad_norm": 9.828011512756348, "learning_rate": 3.3053567020488766e-05, "loss": 0.0205, "step": 1423 }, { "epoch": 1.041697147037308, "grad_norm": 0.2413022220134735, "learning_rate": 3.304122438903974e-05, "loss": 0.0059, "step": 1424 }, { "epoch": 1.0424286759326993, "grad_norm": 3.397561550140381, "learning_rate": 3.302888175759072e-05, "loss": 0.0124, "step": 1425 }, { "epoch": 1.0431602048280908, "grad_norm": 2.463991403579712, "learning_rate": 3.3016539126141696e-05, "loss": 0.0083, "step": 1426 }, { "epoch": 1.043891733723482, "grad_norm": 0.1121060699224472, "learning_rate": 3.3004196494692666e-05, "loss": 0.0047, "step": 1427 }, { "epoch": 1.0446232626188734, "grad_norm": 0.1086101233959198, "learning_rate": 3.299185386324364e-05, "loss": 0.0041, "step": 1428 }, { "epoch": 1.045354791514265, "grad_norm": 0.14224845170974731, "learning_rate": 3.297951123179462e-05, "loss": 0.0042, "step": 1429 }, { "epoch": 1.0460863204096562, "grad_norm": 0.13674986362457275, "learning_rate": 3.29671686003456e-05, "loss": 0.0037, "step": 1430 }, { "epoch": 1.0468178493050475, "grad_norm": 0.06888135522603989, "learning_rate": 3.295482596889657e-05, "loss": 0.0029, "step": 1431 }, { "epoch": 1.047549378200439, "grad_norm": 1.7888869047164917, "learning_rate": 3.2942483337447544e-05, "loss": 0.0062, "step": 1432 }, { "epoch": 1.0482809070958303, "grad_norm": 24.489238739013672, "learning_rate": 3.293014070599852e-05, "loss": 0.134, "step": 1433 }, { "epoch": 1.0490124359912216, "grad_norm": 39.44647216796875, "learning_rate": 3.29177980745495e-05, "loss": 0.0479, "step": 1434 }, { "epoch": 1.049743964886613, "grad_norm": 0.061819396913051605, "learning_rate": 3.290545544310047e-05, "loss": 0.0028, "step": 1435 }, { "epoch": 1.0504754937820044, "grad_norm": 0.04811714589595795, "learning_rate": 3.2893112811651444e-05, "loss": 0.0023, "step": 1436 }, { "epoch": 1.0512070226773957, "grad_norm": 5.214432239532471, "learning_rate": 3.288077018020242e-05, "loss": 0.135, "step": 1437 }, { "epoch": 1.051938551572787, "grad_norm": 0.9701353907585144, "learning_rate": 3.28684275487534e-05, "loss": 0.0046, "step": 1438 }, { "epoch": 1.0526700804681786, "grad_norm": 0.06117640435695648, "learning_rate": 3.285608491730437e-05, "loss": 0.0027, "step": 1439 }, { "epoch": 1.0534016093635699, "grad_norm": 0.7926205396652222, "learning_rate": 3.2843742285855344e-05, "loss": 0.1807, "step": 1440 }, { "epoch": 1.0541331382589612, "grad_norm": 0.055869486182928085, "learning_rate": 3.283139965440632e-05, "loss": 0.0023, "step": 1441 }, { "epoch": 1.0548646671543527, "grad_norm": 12.322328567504883, "learning_rate": 3.28190570229573e-05, "loss": 0.6107, "step": 1442 }, { "epoch": 1.055596196049744, "grad_norm": 0.05562188848853111, "learning_rate": 3.280671439150827e-05, "loss": 0.0022, "step": 1443 }, { "epoch": 1.0563277249451353, "grad_norm": 19.12864112854004, "learning_rate": 3.2794371760059245e-05, "loss": 0.0285, "step": 1444 }, { "epoch": 1.0570592538405268, "grad_norm": 6.648989677429199, "learning_rate": 3.278202912861022e-05, "loss": 0.104, "step": 1445 }, { "epoch": 1.057790782735918, "grad_norm": 4.608105659484863, "learning_rate": 3.27696864971612e-05, "loss": 0.1216, "step": 1446 }, { "epoch": 1.0585223116313094, "grad_norm": 0.17034007608890533, "learning_rate": 3.275734386571217e-05, "loss": 0.0032, "step": 1447 }, { "epoch": 1.059253840526701, "grad_norm": 0.12877295911312103, "learning_rate": 3.2745001234263145e-05, "loss": 0.0033, "step": 1448 }, { "epoch": 1.0599853694220922, "grad_norm": 18.68419647216797, "learning_rate": 3.273265860281412e-05, "loss": 0.0481, "step": 1449 }, { "epoch": 1.0607168983174835, "grad_norm": 0.0699402391910553, "learning_rate": 3.27203159713651e-05, "loss": 0.0027, "step": 1450 }, { "epoch": 1.0614484272128748, "grad_norm": 0.42460328340530396, "learning_rate": 3.270797333991607e-05, "loss": 0.0048, "step": 1451 }, { "epoch": 1.0621799561082663, "grad_norm": 0.5198849439620972, "learning_rate": 3.2695630708467045e-05, "loss": 0.0058, "step": 1452 }, { "epoch": 1.0629114850036576, "grad_norm": 0.053956206887960434, "learning_rate": 3.268328807701802e-05, "loss": 0.0025, "step": 1453 }, { "epoch": 1.063643013899049, "grad_norm": 0.04987230896949768, "learning_rate": 3.267094544556899e-05, "loss": 0.0023, "step": 1454 }, { "epoch": 1.0643745427944404, "grad_norm": 1.8493869304656982, "learning_rate": 3.265860281411997e-05, "loss": 0.1954, "step": 1455 }, { "epoch": 1.0651060716898317, "grad_norm": 0.16805218160152435, "learning_rate": 3.2646260182670946e-05, "loss": 0.0042, "step": 1456 }, { "epoch": 1.065837600585223, "grad_norm": 10.349654197692871, "learning_rate": 3.263391755122192e-05, "loss": 0.086, "step": 1457 }, { "epoch": 1.0665691294806146, "grad_norm": 1.000514030456543, "learning_rate": 3.262157491977289e-05, "loss": 0.0052, "step": 1458 }, { "epoch": 1.0673006583760059, "grad_norm": 0.07145128399133682, "learning_rate": 3.260923228832387e-05, "loss": 0.0028, "step": 1459 }, { "epoch": 1.0680321872713971, "grad_norm": 0.09175445884466171, "learning_rate": 3.2596889656874846e-05, "loss": 0.0034, "step": 1460 }, { "epoch": 1.0687637161667887, "grad_norm": 0.08439642935991287, "learning_rate": 3.258454702542582e-05, "loss": 0.0034, "step": 1461 }, { "epoch": 1.06949524506218, "grad_norm": 0.11184411495923996, "learning_rate": 3.257220439397679e-05, "loss": 0.0034, "step": 1462 }, { "epoch": 1.0702267739575713, "grad_norm": 0.11912410706281662, "learning_rate": 3.255986176252777e-05, "loss": 0.0029, "step": 1463 }, { "epoch": 1.0709583028529628, "grad_norm": 5.104803562164307, "learning_rate": 3.254751913107875e-05, "loss": 0.2706, "step": 1464 }, { "epoch": 1.071689831748354, "grad_norm": 0.07239337265491486, "learning_rate": 3.2535176499629723e-05, "loss": 0.0026, "step": 1465 }, { "epoch": 1.0724213606437454, "grad_norm": 10.735053062438965, "learning_rate": 3.2522833868180693e-05, "loss": 0.2412, "step": 1466 }, { "epoch": 1.073152889539137, "grad_norm": 6.872513294219971, "learning_rate": 3.251049123673167e-05, "loss": 0.3183, "step": 1467 }, { "epoch": 1.0738844184345282, "grad_norm": 0.0859685018658638, "learning_rate": 3.249814860528265e-05, "loss": 0.0027, "step": 1468 }, { "epoch": 1.0746159473299195, "grad_norm": 0.10366534441709518, "learning_rate": 3.2485805973833624e-05, "loss": 0.0029, "step": 1469 }, { "epoch": 1.075347476225311, "grad_norm": 0.04511750489473343, "learning_rate": 3.2473463342384594e-05, "loss": 0.0021, "step": 1470 }, { "epoch": 1.0760790051207023, "grad_norm": 8.41960620880127, "learning_rate": 3.246112071093557e-05, "loss": 0.2614, "step": 1471 }, { "epoch": 1.0768105340160936, "grad_norm": 1.840263843536377, "learning_rate": 3.244877807948655e-05, "loss": 0.1751, "step": 1472 }, { "epoch": 1.077542062911485, "grad_norm": 0.09714819490909576, "learning_rate": 3.2436435448037524e-05, "loss": 0.0022, "step": 1473 }, { "epoch": 1.0782735918068764, "grad_norm": 4.732848644256592, "learning_rate": 3.2424092816588494e-05, "loss": 0.1119, "step": 1474 }, { "epoch": 1.0790051207022677, "grad_norm": 4.838768005371094, "learning_rate": 3.241175018513947e-05, "loss": 0.1108, "step": 1475 }, { "epoch": 1.079736649597659, "grad_norm": 3.0129973888397217, "learning_rate": 3.239940755369045e-05, "loss": 0.1186, "step": 1476 }, { "epoch": 1.0804681784930505, "grad_norm": 5.401240348815918, "learning_rate": 3.2387064922241425e-05, "loss": 0.0234, "step": 1477 }, { "epoch": 1.0811997073884418, "grad_norm": 1.2867423295974731, "learning_rate": 3.2374722290792395e-05, "loss": 0.0108, "step": 1478 }, { "epoch": 1.0819312362838331, "grad_norm": 0.136821448802948, "learning_rate": 3.236237965934337e-05, "loss": 0.0051, "step": 1479 }, { "epoch": 1.0826627651792247, "grad_norm": 8.928567886352539, "learning_rate": 3.235003702789435e-05, "loss": 0.0248, "step": 1480 }, { "epoch": 1.083394294074616, "grad_norm": 0.2059083878993988, "learning_rate": 3.2337694396445325e-05, "loss": 0.0049, "step": 1481 }, { "epoch": 1.0841258229700073, "grad_norm": 0.4736698865890503, "learning_rate": 3.2325351764996295e-05, "loss": 0.0057, "step": 1482 }, { "epoch": 1.0848573518653988, "grad_norm": 0.9132351279258728, "learning_rate": 3.231300913354727e-05, "loss": 0.0062, "step": 1483 }, { "epoch": 1.08558888076079, "grad_norm": 6.135921478271484, "learning_rate": 3.230066650209825e-05, "loss": 0.0321, "step": 1484 }, { "epoch": 1.0863204096561814, "grad_norm": 5.88595724105835, "learning_rate": 3.2288323870649225e-05, "loss": 0.0435, "step": 1485 }, { "epoch": 1.0870519385515727, "grad_norm": 8.652595520019531, "learning_rate": 3.2275981239200195e-05, "loss": 0.2464, "step": 1486 }, { "epoch": 1.0877834674469642, "grad_norm": 0.16429723799228668, "learning_rate": 3.226363860775117e-05, "loss": 0.0058, "step": 1487 }, { "epoch": 1.0885149963423555, "grad_norm": 8.09591293334961, "learning_rate": 3.225129597630215e-05, "loss": 0.0737, "step": 1488 }, { "epoch": 1.0892465252377468, "grad_norm": 7.221816539764404, "learning_rate": 3.2238953344853126e-05, "loss": 0.1224, "step": 1489 }, { "epoch": 1.0899780541331383, "grad_norm": 0.914445698261261, "learning_rate": 3.2226610713404096e-05, "loss": 0.2019, "step": 1490 }, { "epoch": 1.0907095830285296, "grad_norm": 5.0742669105529785, "learning_rate": 3.221426808195507e-05, "loss": 0.0292, "step": 1491 }, { "epoch": 1.091441111923921, "grad_norm": 5.8741302490234375, "learning_rate": 3.220192545050605e-05, "loss": 0.0143, "step": 1492 }, { "epoch": 1.0921726408193124, "grad_norm": 1.12317955493927, "learning_rate": 3.2189582819057026e-05, "loss": 0.0106, "step": 1493 }, { "epoch": 1.0929041697147037, "grad_norm": 19.554725646972656, "learning_rate": 3.2177240187607996e-05, "loss": 0.0447, "step": 1494 }, { "epoch": 1.093635698610095, "grad_norm": 5.269433975219727, "learning_rate": 3.216489755615897e-05, "loss": 0.121, "step": 1495 }, { "epoch": 1.0943672275054865, "grad_norm": 0.20054961740970612, "learning_rate": 3.215255492470995e-05, "loss": 0.0035, "step": 1496 }, { "epoch": 1.0950987564008778, "grad_norm": 12.444910049438477, "learning_rate": 3.2140212293260927e-05, "loss": 0.0394, "step": 1497 }, { "epoch": 1.0958302852962691, "grad_norm": 12.25739574432373, "learning_rate": 3.2127869661811897e-05, "loss": 0.1361, "step": 1498 }, { "epoch": 1.0965618141916607, "grad_norm": 0.13976997137069702, "learning_rate": 3.2115527030362873e-05, "loss": 0.0049, "step": 1499 }, { "epoch": 1.097293343087052, "grad_norm": 0.08523549884557724, "learning_rate": 3.210318439891385e-05, "loss": 0.0038, "step": 1500 }, { "epoch": 1.0980248719824433, "grad_norm": 0.871315062046051, "learning_rate": 3.209084176746483e-05, "loss": 0.0083, "step": 1501 }, { "epoch": 1.0987564008778348, "grad_norm": 0.25701168179512024, "learning_rate": 3.20784991360158e-05, "loss": 0.0042, "step": 1502 }, { "epoch": 1.099487929773226, "grad_norm": 0.05790635570883751, "learning_rate": 3.2066156504566774e-05, "loss": 0.0026, "step": 1503 }, { "epoch": 1.1002194586686174, "grad_norm": 2.1417555809020996, "learning_rate": 3.205381387311775e-05, "loss": 0.1481, "step": 1504 }, { "epoch": 1.1009509875640089, "grad_norm": 0.16815879940986633, "learning_rate": 3.204147124166873e-05, "loss": 0.004, "step": 1505 }, { "epoch": 1.1016825164594002, "grad_norm": 3.009018898010254, "learning_rate": 3.20291286102197e-05, "loss": 0.0086, "step": 1506 }, { "epoch": 1.1024140453547915, "grad_norm": 13.025223731994629, "learning_rate": 3.2016785978770674e-05, "loss": 0.0658, "step": 1507 }, { "epoch": 1.1031455742501828, "grad_norm": 0.3647003769874573, "learning_rate": 3.200444334732165e-05, "loss": 0.0057, "step": 1508 }, { "epoch": 1.1038771031455743, "grad_norm": 0.07714732736349106, "learning_rate": 3.199210071587263e-05, "loss": 0.0032, "step": 1509 }, { "epoch": 1.1046086320409656, "grad_norm": 6.103455543518066, "learning_rate": 3.19797580844236e-05, "loss": 0.0999, "step": 1510 }, { "epoch": 1.105340160936357, "grad_norm": 0.22920747101306915, "learning_rate": 3.1967415452974575e-05, "loss": 0.005, "step": 1511 }, { "epoch": 1.1060716898317484, "grad_norm": 0.07083591818809509, "learning_rate": 3.195507282152555e-05, "loss": 0.0032, "step": 1512 }, { "epoch": 1.1068032187271397, "grad_norm": 3.7413618564605713, "learning_rate": 3.194273019007653e-05, "loss": 0.1602, "step": 1513 }, { "epoch": 1.107534747622531, "grad_norm": 10.588398933410645, "learning_rate": 3.19303875586275e-05, "loss": 0.0719, "step": 1514 }, { "epoch": 1.1082662765179225, "grad_norm": 0.7151607275009155, "learning_rate": 3.1918044927178475e-05, "loss": 0.006, "step": 1515 }, { "epoch": 1.1089978054133138, "grad_norm": 0.18764668703079224, "learning_rate": 3.190570229572945e-05, "loss": 0.004, "step": 1516 }, { "epoch": 1.1097293343087051, "grad_norm": 2.40268874168396, "learning_rate": 3.189335966428043e-05, "loss": 0.0126, "step": 1517 }, { "epoch": 1.1104608632040966, "grad_norm": 15.237897872924805, "learning_rate": 3.18810170328314e-05, "loss": 0.041, "step": 1518 }, { "epoch": 1.111192392099488, "grad_norm": 6.379678249359131, "learning_rate": 3.1868674401382375e-05, "loss": 0.0978, "step": 1519 }, { "epoch": 1.1119239209948792, "grad_norm": 7.113809585571289, "learning_rate": 3.185633176993335e-05, "loss": 0.1167, "step": 1520 }, { "epoch": 1.1126554498902708, "grad_norm": 0.29278329014778137, "learning_rate": 3.184398913848433e-05, "loss": 0.0046, "step": 1521 }, { "epoch": 1.113386978785662, "grad_norm": 0.03681964799761772, "learning_rate": 3.18316465070353e-05, "loss": 0.0016, "step": 1522 }, { "epoch": 1.1141185076810534, "grad_norm": 0.14353488385677338, "learning_rate": 3.1819303875586276e-05, "loss": 0.0043, "step": 1523 }, { "epoch": 1.1148500365764447, "grad_norm": 0.07111746817827225, "learning_rate": 3.180696124413725e-05, "loss": 0.0028, "step": 1524 }, { "epoch": 1.1155815654718362, "grad_norm": 6.321747303009033, "learning_rate": 3.179461861268823e-05, "loss": 0.0082, "step": 1525 }, { "epoch": 1.1163130943672275, "grad_norm": 0.0818285197019577, "learning_rate": 3.17822759812392e-05, "loss": 0.0031, "step": 1526 }, { "epoch": 1.1170446232626188, "grad_norm": 19.652467727661133, "learning_rate": 3.1769933349790176e-05, "loss": 0.0598, "step": 1527 }, { "epoch": 1.1177761521580103, "grad_norm": 0.03774288296699524, "learning_rate": 3.175759071834115e-05, "loss": 0.0017, "step": 1528 }, { "epoch": 1.1185076810534016, "grad_norm": 0.2036275863647461, "learning_rate": 3.174524808689213e-05, "loss": 0.0036, "step": 1529 }, { "epoch": 1.119239209948793, "grad_norm": 0.18142901360988617, "learning_rate": 3.17329054554431e-05, "loss": 0.0031, "step": 1530 }, { "epoch": 1.1199707388441844, "grad_norm": 0.21418355405330658, "learning_rate": 3.1720562823994077e-05, "loss": 0.0035, "step": 1531 }, { "epoch": 1.1207022677395757, "grad_norm": 0.05197026580572128, "learning_rate": 3.170822019254505e-05, "loss": 0.0019, "step": 1532 }, { "epoch": 1.121433796634967, "grad_norm": 6.9361114501953125, "learning_rate": 3.169587756109603e-05, "loss": 0.0138, "step": 1533 }, { "epoch": 1.1221653255303585, "grad_norm": 16.258949279785156, "learning_rate": 3.1683534929647e-05, "loss": 0.0414, "step": 1534 }, { "epoch": 1.1228968544257498, "grad_norm": 0.10933981835842133, "learning_rate": 3.167119229819798e-05, "loss": 0.0022, "step": 1535 }, { "epoch": 1.1236283833211411, "grad_norm": 0.2289559245109558, "learning_rate": 3.1658849666748954e-05, "loss": 0.0025, "step": 1536 }, { "epoch": 1.1243599122165326, "grad_norm": 4.00492000579834, "learning_rate": 3.1646507035299924e-05, "loss": 0.0112, "step": 1537 }, { "epoch": 1.125091441111924, "grad_norm": 0.14629590511322021, "learning_rate": 3.16341644038509e-05, "loss": 0.0022, "step": 1538 }, { "epoch": 1.1258229700073152, "grad_norm": 0.04909786209464073, "learning_rate": 3.162182177240188e-05, "loss": 0.0014, "step": 1539 }, { "epoch": 1.1265544989027068, "grad_norm": 0.04039532318711281, "learning_rate": 3.1609479140952854e-05, "loss": 0.0016, "step": 1540 }, { "epoch": 1.127286027798098, "grad_norm": 0.13166558742523193, "learning_rate": 3.1597136509503824e-05, "loss": 0.0018, "step": 1541 }, { "epoch": 1.1280175566934894, "grad_norm": 8.47961139678955, "learning_rate": 3.15847938780548e-05, "loss": 0.0977, "step": 1542 }, { "epoch": 1.1287490855888809, "grad_norm": 5.756829261779785, "learning_rate": 3.157245124660578e-05, "loss": 0.1118, "step": 1543 }, { "epoch": 1.1294806144842722, "grad_norm": 0.5578294992446899, "learning_rate": 3.1560108615156754e-05, "loss": 0.0027, "step": 1544 }, { "epoch": 1.1302121433796635, "grad_norm": 0.027394704520702362, "learning_rate": 3.1547765983707725e-05, "loss": 0.0011, "step": 1545 }, { "epoch": 1.1309436722750548, "grad_norm": 16.374839782714844, "learning_rate": 3.15354233522587e-05, "loss": 0.1479, "step": 1546 }, { "epoch": 1.1316752011704463, "grad_norm": 0.023247702047228813, "learning_rate": 3.152308072080968e-05, "loss": 0.001, "step": 1547 }, { "epoch": 1.1324067300658376, "grad_norm": 0.02205943502485752, "learning_rate": 3.1510738089360655e-05, "loss": 0.0008, "step": 1548 }, { "epoch": 1.1331382589612289, "grad_norm": 0.07589301466941833, "learning_rate": 3.1498395457911625e-05, "loss": 0.0011, "step": 1549 }, { "epoch": 1.1338697878566204, "grad_norm": 24.99009132385254, "learning_rate": 3.14860528264626e-05, "loss": 0.0183, "step": 1550 }, { "epoch": 1.1346013167520117, "grad_norm": 0.02027117647230625, "learning_rate": 3.147371019501358e-05, "loss": 0.0009, "step": 1551 }, { "epoch": 1.135332845647403, "grad_norm": 0.40076255798339844, "learning_rate": 3.1461367563564555e-05, "loss": 0.001, "step": 1552 }, { "epoch": 1.1360643745427945, "grad_norm": 12.699847221374512, "learning_rate": 3.1449024932115525e-05, "loss": 0.2822, "step": 1553 }, { "epoch": 1.1367959034381858, "grad_norm": 0.04021407663822174, "learning_rate": 3.14366823006665e-05, "loss": 0.0012, "step": 1554 }, { "epoch": 1.1375274323335771, "grad_norm": 0.02318812906742096, "learning_rate": 3.142433966921748e-05, "loss": 0.001, "step": 1555 }, { "epoch": 1.1382589612289686, "grad_norm": 0.023945214226841927, "learning_rate": 3.1411997037768456e-05, "loss": 0.001, "step": 1556 }, { "epoch": 1.13899049012436, "grad_norm": 0.02693130075931549, "learning_rate": 3.1399654406319426e-05, "loss": 0.001, "step": 1557 }, { "epoch": 1.1397220190197512, "grad_norm": 0.7980631589889526, "learning_rate": 3.13873117748704e-05, "loss": 0.2129, "step": 1558 }, { "epoch": 1.1404535479151425, "grad_norm": 0.020025407895445824, "learning_rate": 3.137496914342138e-05, "loss": 0.0009, "step": 1559 }, { "epoch": 1.141185076810534, "grad_norm": 0.06492270529270172, "learning_rate": 3.1362626511972356e-05, "loss": 0.0013, "step": 1560 }, { "epoch": 1.1419166057059253, "grad_norm": 0.13198503851890564, "learning_rate": 3.1350283880523326e-05, "loss": 0.0021, "step": 1561 }, { "epoch": 1.1426481346013166, "grad_norm": 0.05959869921207428, "learning_rate": 3.13379412490743e-05, "loss": 0.0015, "step": 1562 }, { "epoch": 1.1433796634967082, "grad_norm": 3.2337894439697266, "learning_rate": 3.132559861762528e-05, "loss": 0.1574, "step": 1563 }, { "epoch": 1.1441111923920995, "grad_norm": 1.8628816604614258, "learning_rate": 3.1313255986176256e-05, "loss": 0.008, "step": 1564 }, { "epoch": 1.1448427212874908, "grad_norm": 0.030986124649643898, "learning_rate": 3.1300913354727226e-05, "loss": 0.0013, "step": 1565 }, { "epoch": 1.1455742501828823, "grad_norm": 14.777390480041504, "learning_rate": 3.12885707232782e-05, "loss": 0.2169, "step": 1566 }, { "epoch": 1.1463057790782736, "grad_norm": 46.37224197387695, "learning_rate": 3.127622809182918e-05, "loss": 0.232, "step": 1567 }, { "epoch": 1.1470373079736649, "grad_norm": 0.040138136595487595, "learning_rate": 3.126388546038016e-05, "loss": 0.0015, "step": 1568 }, { "epoch": 1.1477688368690564, "grad_norm": 32.73142623901367, "learning_rate": 3.125154282893113e-05, "loss": 0.0662, "step": 1569 }, { "epoch": 1.1485003657644477, "grad_norm": 1.5155822038650513, "learning_rate": 3.1239200197482104e-05, "loss": 0.0084, "step": 1570 }, { "epoch": 1.149231894659839, "grad_norm": 0.032126687467098236, "learning_rate": 3.122685756603308e-05, "loss": 0.0014, "step": 1571 }, { "epoch": 1.1499634235552305, "grad_norm": 19.138383865356445, "learning_rate": 3.121451493458406e-05, "loss": 0.0392, "step": 1572 }, { "epoch": 1.1506949524506218, "grad_norm": 0.05278700962662697, "learning_rate": 3.120217230313503e-05, "loss": 0.0015, "step": 1573 }, { "epoch": 1.151426481346013, "grad_norm": 14.587158203125, "learning_rate": 3.1189829671686004e-05, "loss": 0.0308, "step": 1574 }, { "epoch": 1.1521580102414046, "grad_norm": 8.158071517944336, "learning_rate": 3.117748704023698e-05, "loss": 0.4835, "step": 1575 }, { "epoch": 1.152889539136796, "grad_norm": 5.949375629425049, "learning_rate": 3.116514440878796e-05, "loss": 0.0731, "step": 1576 }, { "epoch": 1.1536210680321872, "grad_norm": 5.124086380004883, "learning_rate": 3.115280177733893e-05, "loss": 0.0798, "step": 1577 }, { "epoch": 1.1543525969275787, "grad_norm": 13.480937957763672, "learning_rate": 3.1140459145889904e-05, "loss": 0.1894, "step": 1578 }, { "epoch": 1.15508412582297, "grad_norm": 17.983821868896484, "learning_rate": 3.112811651444088e-05, "loss": 0.0439, "step": 1579 }, { "epoch": 1.1558156547183613, "grad_norm": 4.515225410461426, "learning_rate": 3.111577388299186e-05, "loss": 0.0112, "step": 1580 }, { "epoch": 1.1565471836137529, "grad_norm": 10.896466255187988, "learning_rate": 3.110343125154283e-05, "loss": 0.0459, "step": 1581 }, { "epoch": 1.1572787125091442, "grad_norm": 0.7364944815635681, "learning_rate": 3.1091088620093805e-05, "loss": 0.011, "step": 1582 }, { "epoch": 1.1580102414045355, "grad_norm": 0.4527937173843384, "learning_rate": 3.107874598864478e-05, "loss": 0.0061, "step": 1583 }, { "epoch": 1.1587417702999268, "grad_norm": 0.5609968900680542, "learning_rate": 3.106640335719576e-05, "loss": 0.0074, "step": 1584 }, { "epoch": 1.1594732991953183, "grad_norm": 0.12457460910081863, "learning_rate": 3.105406072574673e-05, "loss": 0.0023, "step": 1585 }, { "epoch": 1.1602048280907096, "grad_norm": 2.023766279220581, "learning_rate": 3.1041718094297705e-05, "loss": 0.0104, "step": 1586 }, { "epoch": 1.1609363569861009, "grad_norm": 0.8874574899673462, "learning_rate": 3.102937546284868e-05, "loss": 0.0099, "step": 1587 }, { "epoch": 1.1616678858814924, "grad_norm": 2.2955386638641357, "learning_rate": 3.101703283139966e-05, "loss": 0.0098, "step": 1588 }, { "epoch": 1.1623994147768837, "grad_norm": 1.1736894845962524, "learning_rate": 3.100469019995063e-05, "loss": 0.0081, "step": 1589 }, { "epoch": 1.163130943672275, "grad_norm": 0.6517477631568909, "learning_rate": 3.0992347568501606e-05, "loss": 0.0068, "step": 1590 }, { "epoch": 1.1638624725676665, "grad_norm": 10.709287643432617, "learning_rate": 3.098000493705258e-05, "loss": 0.0751, "step": 1591 }, { "epoch": 1.1645940014630578, "grad_norm": 0.18679720163345337, "learning_rate": 3.096766230560356e-05, "loss": 0.0023, "step": 1592 }, { "epoch": 1.165325530358449, "grad_norm": 0.035226333886384964, "learning_rate": 3.095531967415453e-05, "loss": 0.0014, "step": 1593 }, { "epoch": 1.1660570592538406, "grad_norm": 13.105180740356445, "learning_rate": 3.0942977042705506e-05, "loss": 0.0668, "step": 1594 }, { "epoch": 1.166788588149232, "grad_norm": 0.2702266275882721, "learning_rate": 3.093063441125648e-05, "loss": 0.0027, "step": 1595 }, { "epoch": 1.1675201170446232, "grad_norm": 2.926208972930908, "learning_rate": 3.091829177980746e-05, "loss": 0.4726, "step": 1596 }, { "epoch": 1.1682516459400145, "grad_norm": 3.0077438354492188, "learning_rate": 3.090594914835843e-05, "loss": 0.2418, "step": 1597 }, { "epoch": 1.168983174835406, "grad_norm": 0.11135796457529068, "learning_rate": 3.0893606516909406e-05, "loss": 0.0015, "step": 1598 }, { "epoch": 1.1697147037307973, "grad_norm": 0.20386475324630737, "learning_rate": 3.088126388546038e-05, "loss": 0.0019, "step": 1599 }, { "epoch": 1.1704462326261886, "grad_norm": 31.039785385131836, "learning_rate": 3.086892125401136e-05, "loss": 0.0361, "step": 1600 }, { "epoch": 1.1711777615215802, "grad_norm": 0.04131462052464485, "learning_rate": 3.085657862256233e-05, "loss": 0.0015, "step": 1601 }, { "epoch": 1.1719092904169714, "grad_norm": 4.675927639007568, "learning_rate": 3.084423599111331e-05, "loss": 0.1297, "step": 1602 }, { "epoch": 1.1726408193123627, "grad_norm": 6.788532733917236, "learning_rate": 3.0831893359664284e-05, "loss": 0.1433, "step": 1603 }, { "epoch": 1.1733723482077543, "grad_norm": 5.031426906585693, "learning_rate": 3.081955072821526e-05, "loss": 0.0747, "step": 1604 }, { "epoch": 1.1741038771031456, "grad_norm": 0.033932894468307495, "learning_rate": 3.080720809676623e-05, "loss": 0.0016, "step": 1605 }, { "epoch": 1.1748354059985369, "grad_norm": 0.09379448741674423, "learning_rate": 3.079486546531721e-05, "loss": 0.0023, "step": 1606 }, { "epoch": 1.1755669348939284, "grad_norm": 0.11432037502527237, "learning_rate": 3.0782522833868184e-05, "loss": 0.0021, "step": 1607 }, { "epoch": 1.1762984637893197, "grad_norm": 7.291565418243408, "learning_rate": 3.077018020241916e-05, "loss": 0.1113, "step": 1608 }, { "epoch": 1.177029992684711, "grad_norm": 10.157113075256348, "learning_rate": 3.075783757097013e-05, "loss": 0.1923, "step": 1609 }, { "epoch": 1.1777615215801025, "grad_norm": 0.9367175698280334, "learning_rate": 3.074549493952111e-05, "loss": 0.0036, "step": 1610 }, { "epoch": 1.1784930504754938, "grad_norm": 6.477607727050781, "learning_rate": 3.0733152308072084e-05, "loss": 0.0153, "step": 1611 }, { "epoch": 1.179224579370885, "grad_norm": 0.04034687578678131, "learning_rate": 3.072080967662306e-05, "loss": 0.0017, "step": 1612 }, { "epoch": 1.1799561082662766, "grad_norm": 0.05804121866822243, "learning_rate": 3.070846704517403e-05, "loss": 0.0019, "step": 1613 }, { "epoch": 1.180687637161668, "grad_norm": 0.061619073152542114, "learning_rate": 3.069612441372501e-05, "loss": 0.0023, "step": 1614 }, { "epoch": 1.1814191660570592, "grad_norm": 0.8596300482749939, "learning_rate": 3.0683781782275985e-05, "loss": 0.0052, "step": 1615 }, { "epoch": 1.1821506949524507, "grad_norm": 0.03209119290113449, "learning_rate": 3.067143915082696e-05, "loss": 0.0015, "step": 1616 }, { "epoch": 1.182882223847842, "grad_norm": 0.22427067160606384, "learning_rate": 3.065909651937793e-05, "loss": 0.0029, "step": 1617 }, { "epoch": 1.1836137527432333, "grad_norm": 0.11086845397949219, "learning_rate": 3.064675388792891e-05, "loss": 0.0024, "step": 1618 }, { "epoch": 1.1843452816386248, "grad_norm": 0.032187022268772125, "learning_rate": 3.0634411256479885e-05, "loss": 0.0015, "step": 1619 }, { "epoch": 1.1850768105340161, "grad_norm": 0.18627429008483887, "learning_rate": 3.062206862503086e-05, "loss": 0.003, "step": 1620 }, { "epoch": 1.1858083394294074, "grad_norm": 28.166486740112305, "learning_rate": 3.060972599358183e-05, "loss": 0.1902, "step": 1621 }, { "epoch": 1.1865398683247987, "grad_norm": 0.0785357654094696, "learning_rate": 3.059738336213281e-05, "loss": 0.0021, "step": 1622 }, { "epoch": 1.1872713972201903, "grad_norm": 0.7987716794013977, "learning_rate": 3.0585040730683786e-05, "loss": 0.1969, "step": 1623 }, { "epoch": 1.1880029261155816, "grad_norm": 0.9492126107215881, "learning_rate": 3.0572698099234756e-05, "loss": 0.2053, "step": 1624 }, { "epoch": 1.1887344550109729, "grad_norm": 0.1755373626947403, "learning_rate": 3.056035546778573e-05, "loss": 0.0026, "step": 1625 }, { "epoch": 1.1894659839063644, "grad_norm": 0.06498007476329803, "learning_rate": 3.054801283633671e-05, "loss": 0.0024, "step": 1626 }, { "epoch": 1.1901975128017557, "grad_norm": 12.660638809204102, "learning_rate": 3.0535670204887686e-05, "loss": 0.2544, "step": 1627 }, { "epoch": 1.190929041697147, "grad_norm": 5.980586528778076, "learning_rate": 3.0523327573438656e-05, "loss": 0.1247, "step": 1628 }, { "epoch": 1.1916605705925385, "grad_norm": 1.2927130460739136, "learning_rate": 3.0510984941989633e-05, "loss": 0.0066, "step": 1629 }, { "epoch": 1.1923920994879298, "grad_norm": 0.8427209854125977, "learning_rate": 3.049864231054061e-05, "loss": 0.0069, "step": 1630 }, { "epoch": 1.193123628383321, "grad_norm": 1.7835006713867188, "learning_rate": 3.0486299679091583e-05, "loss": 0.0054, "step": 1631 }, { "epoch": 1.1938551572787124, "grad_norm": 5.662770748138428, "learning_rate": 3.047395704764256e-05, "loss": 0.121, "step": 1632 }, { "epoch": 1.194586686174104, "grad_norm": 0.10386084020137787, "learning_rate": 3.0461614416193533e-05, "loss": 0.0032, "step": 1633 }, { "epoch": 1.1953182150694952, "grad_norm": 0.1313236951828003, "learning_rate": 3.044927178474451e-05, "loss": 0.004, "step": 1634 }, { "epoch": 1.1960497439648865, "grad_norm": 0.07431826740503311, "learning_rate": 3.0436929153295483e-05, "loss": 0.003, "step": 1635 }, { "epoch": 1.196781272860278, "grad_norm": 3.341285228729248, "learning_rate": 3.042458652184646e-05, "loss": 0.0137, "step": 1636 }, { "epoch": 1.1975128017556693, "grad_norm": 1.1832809448242188, "learning_rate": 3.0412243890397434e-05, "loss": 0.1753, "step": 1637 }, { "epoch": 1.1982443306510606, "grad_norm": 0.34316128492355347, "learning_rate": 3.039990125894841e-05, "loss": 0.0049, "step": 1638 }, { "epoch": 1.1989758595464521, "grad_norm": 1.1339105367660522, "learning_rate": 3.0387558627499384e-05, "loss": 0.0079, "step": 1639 }, { "epoch": 1.1997073884418434, "grad_norm": 0.23538678884506226, "learning_rate": 3.037521599605036e-05, "loss": 0.0063, "step": 1640 }, { "epoch": 1.2004389173372347, "grad_norm": 9.485014915466309, "learning_rate": 3.0362873364601334e-05, "loss": 0.0664, "step": 1641 }, { "epoch": 1.2011704462326263, "grad_norm": 4.493890762329102, "learning_rate": 3.035053073315231e-05, "loss": 0.0164, "step": 1642 }, { "epoch": 1.2019019751280176, "grad_norm": 13.590413093566895, "learning_rate": 3.0338188101703284e-05, "loss": 0.1695, "step": 1643 }, { "epoch": 1.2026335040234089, "grad_norm": 2.727867364883423, "learning_rate": 3.032584547025426e-05, "loss": 0.0353, "step": 1644 }, { "epoch": 1.2033650329188004, "grad_norm": 0.48500943183898926, "learning_rate": 3.0313502838805234e-05, "loss": 0.0073, "step": 1645 }, { "epoch": 1.2040965618141917, "grad_norm": 17.705387115478516, "learning_rate": 3.030116020735621e-05, "loss": 0.0373, "step": 1646 }, { "epoch": 1.204828090709583, "grad_norm": 0.2087496966123581, "learning_rate": 3.0288817575907184e-05, "loss": 0.0063, "step": 1647 }, { "epoch": 1.2055596196049745, "grad_norm": 1.3934000730514526, "learning_rate": 3.027647494445816e-05, "loss": 0.0072, "step": 1648 }, { "epoch": 1.2062911485003658, "grad_norm": 0.12359736114740372, "learning_rate": 3.0264132313009135e-05, "loss": 0.0035, "step": 1649 }, { "epoch": 1.207022677395757, "grad_norm": 15.654494285583496, "learning_rate": 3.025178968156011e-05, "loss": 0.0156, "step": 1650 }, { "epoch": 1.2077542062911486, "grad_norm": 0.08845069259405136, "learning_rate": 3.0239447050111085e-05, "loss": 0.0037, "step": 1651 }, { "epoch": 1.20848573518654, "grad_norm": 0.20510566234588623, "learning_rate": 3.022710441866206e-05, "loss": 0.0053, "step": 1652 }, { "epoch": 1.2092172640819312, "grad_norm": 19.49769401550293, "learning_rate": 3.0214761787213035e-05, "loss": 0.1457, "step": 1653 }, { "epoch": 1.2099487929773227, "grad_norm": 0.0907098650932312, "learning_rate": 3.0202419155764012e-05, "loss": 0.0038, "step": 1654 }, { "epoch": 1.210680321872714, "grad_norm": 0.08256623148918152, "learning_rate": 3.0190076524314985e-05, "loss": 0.0036, "step": 1655 }, { "epoch": 1.2114118507681053, "grad_norm": 4.112320423126221, "learning_rate": 3.0177733892865962e-05, "loss": 0.0979, "step": 1656 }, { "epoch": 1.2121433796634966, "grad_norm": 14.926678657531738, "learning_rate": 3.0165391261416935e-05, "loss": 0.0759, "step": 1657 }, { "epoch": 1.2128749085588881, "grad_norm": 8.192556381225586, "learning_rate": 3.0153048629967912e-05, "loss": 0.0923, "step": 1658 }, { "epoch": 1.2136064374542794, "grad_norm": 31.163604736328125, "learning_rate": 3.0140705998518886e-05, "loss": 0.2345, "step": 1659 }, { "epoch": 1.2143379663496707, "grad_norm": 0.05771348997950554, "learning_rate": 3.0128363367069862e-05, "loss": 0.0026, "step": 1660 }, { "epoch": 1.2150694952450622, "grad_norm": 0.05896678566932678, "learning_rate": 3.0116020735620836e-05, "loss": 0.0026, "step": 1661 }, { "epoch": 1.2158010241404535, "grad_norm": 0.2986612915992737, "learning_rate": 3.0103678104171813e-05, "loss": 0.0034, "step": 1662 }, { "epoch": 1.2165325530358448, "grad_norm": 2.853342294692993, "learning_rate": 3.0091335472722786e-05, "loss": 0.1099, "step": 1663 }, { "epoch": 1.2172640819312364, "grad_norm": 12.048116683959961, "learning_rate": 3.0078992841273763e-05, "loss": 0.1175, "step": 1664 }, { "epoch": 1.2179956108266277, "grad_norm": 9.544814109802246, "learning_rate": 3.0066650209824736e-05, "loss": 0.2629, "step": 1665 }, { "epoch": 1.218727139722019, "grad_norm": 0.10915399342775345, "learning_rate": 3.0054307578375713e-05, "loss": 0.003, "step": 1666 }, { "epoch": 1.2194586686174105, "grad_norm": 0.1691514104604721, "learning_rate": 3.0041964946926686e-05, "loss": 0.0037, "step": 1667 }, { "epoch": 1.2201901975128018, "grad_norm": 0.24703381955623627, "learning_rate": 3.0029622315477663e-05, "loss": 0.0031, "step": 1668 }, { "epoch": 1.220921726408193, "grad_norm": 0.14907842874526978, "learning_rate": 3.0017279684028637e-05, "loss": 0.0028, "step": 1669 }, { "epoch": 1.2216532553035844, "grad_norm": 0.16693954169750214, "learning_rate": 3.0004937052579613e-05, "loss": 0.004, "step": 1670 }, { "epoch": 1.222384784198976, "grad_norm": 22.986526489257812, "learning_rate": 2.9992594421130587e-05, "loss": 0.0824, "step": 1671 }, { "epoch": 1.2231163130943672, "grad_norm": 0.5855056643486023, "learning_rate": 2.9980251789681564e-05, "loss": 0.005, "step": 1672 }, { "epoch": 1.2238478419897585, "grad_norm": 7.997975826263428, "learning_rate": 2.9967909158232537e-05, "loss": 0.0416, "step": 1673 }, { "epoch": 1.22457937088515, "grad_norm": 10.622967720031738, "learning_rate": 2.9955566526783514e-05, "loss": 0.1265, "step": 1674 }, { "epoch": 1.2253108997805413, "grad_norm": 1.1911790370941162, "learning_rate": 2.9943223895334487e-05, "loss": 0.0053, "step": 1675 }, { "epoch": 1.2260424286759326, "grad_norm": 0.0458611436188221, "learning_rate": 2.9930881263885464e-05, "loss": 0.0019, "step": 1676 }, { "epoch": 1.2267739575713241, "grad_norm": 0.08522952347993851, "learning_rate": 2.9918538632436437e-05, "loss": 0.0026, "step": 1677 }, { "epoch": 1.2275054864667154, "grad_norm": 0.16872771084308624, "learning_rate": 2.9906196000987414e-05, "loss": 0.0027, "step": 1678 }, { "epoch": 1.2282370153621067, "grad_norm": 4.932519435882568, "learning_rate": 2.9893853369538388e-05, "loss": 0.0146, "step": 1679 }, { "epoch": 1.2289685442574982, "grad_norm": 0.3464765250682831, "learning_rate": 2.9881510738089364e-05, "loss": 0.0031, "step": 1680 }, { "epoch": 1.2297000731528895, "grad_norm": 12.634867668151855, "learning_rate": 2.9869168106640338e-05, "loss": 0.1531, "step": 1681 }, { "epoch": 1.2304316020482808, "grad_norm": 0.20505546033382416, "learning_rate": 2.9856825475191315e-05, "loss": 0.0024, "step": 1682 }, { "epoch": 1.2311631309436724, "grad_norm": 0.04827539250254631, "learning_rate": 2.9844482843742288e-05, "loss": 0.0017, "step": 1683 }, { "epoch": 1.2318946598390637, "grad_norm": 0.07938750833272934, "learning_rate": 2.9832140212293265e-05, "loss": 0.0019, "step": 1684 }, { "epoch": 1.232626188734455, "grad_norm": 0.05551330745220184, "learning_rate": 2.9819797580844238e-05, "loss": 0.0016, "step": 1685 }, { "epoch": 1.2333577176298465, "grad_norm": 0.04772556573152542, "learning_rate": 2.9807454949395215e-05, "loss": 0.0017, "step": 1686 }, { "epoch": 1.2340892465252378, "grad_norm": 6.570145130157471, "learning_rate": 2.979511231794619e-05, "loss": 0.0076, "step": 1687 }, { "epoch": 1.234820775420629, "grad_norm": 0.03483905643224716, "learning_rate": 2.9782769686497165e-05, "loss": 0.0015, "step": 1688 }, { "epoch": 1.2355523043160206, "grad_norm": 2.9136552810668945, "learning_rate": 2.977042705504814e-05, "loss": 0.1651, "step": 1689 }, { "epoch": 1.2362838332114119, "grad_norm": 6.632139682769775, "learning_rate": 2.9758084423599115e-05, "loss": 0.1368, "step": 1690 }, { "epoch": 1.2370153621068032, "grad_norm": 1.0885772705078125, "learning_rate": 2.974574179215009e-05, "loss": 0.0063, "step": 1691 }, { "epoch": 1.2377468910021947, "grad_norm": 6.3700175285339355, "learning_rate": 2.9733399160701066e-05, "loss": 0.0135, "step": 1692 }, { "epoch": 1.238478419897586, "grad_norm": 17.181623458862305, "learning_rate": 2.972105652925204e-05, "loss": 0.1157, "step": 1693 }, { "epoch": 1.2392099487929773, "grad_norm": 10.06486701965332, "learning_rate": 2.9708713897803016e-05, "loss": 0.1141, "step": 1694 }, { "epoch": 1.2399414776883686, "grad_norm": 0.11908221244812012, "learning_rate": 2.969637126635399e-05, "loss": 0.0021, "step": 1695 }, { "epoch": 1.2406730065837601, "grad_norm": 1.0589749813079834, "learning_rate": 2.9684028634904966e-05, "loss": 0.0068, "step": 1696 }, { "epoch": 1.2414045354791514, "grad_norm": 24.13688850402832, "learning_rate": 2.967168600345594e-05, "loss": 0.0888, "step": 1697 }, { "epoch": 1.2421360643745427, "grad_norm": 0.037972912192344666, "learning_rate": 2.9659343372006916e-05, "loss": 0.0016, "step": 1698 }, { "epoch": 1.2428675932699342, "grad_norm": 0.07380838692188263, "learning_rate": 2.964700074055789e-05, "loss": 0.0021, "step": 1699 }, { "epoch": 1.2435991221653255, "grad_norm": 2.009352922439575, "learning_rate": 2.9634658109108866e-05, "loss": 0.0048, "step": 1700 }, { "epoch": 1.2443306510607168, "grad_norm": 13.190328598022461, "learning_rate": 2.962231547765984e-05, "loss": 0.2819, "step": 1701 }, { "epoch": 1.2450621799561084, "grad_norm": 5.920238018035889, "learning_rate": 2.9609972846210817e-05, "loss": 0.0201, "step": 1702 }, { "epoch": 1.2457937088514996, "grad_norm": 2.4342942237854004, "learning_rate": 2.959763021476179e-05, "loss": 0.1907, "step": 1703 }, { "epoch": 1.246525237746891, "grad_norm": 7.005025386810303, "learning_rate": 2.9585287583312767e-05, "loss": 0.073, "step": 1704 }, { "epoch": 1.2472567666422825, "grad_norm": 10.000044822692871, "learning_rate": 2.957294495186374e-05, "loss": 0.0945, "step": 1705 }, { "epoch": 1.2479882955376738, "grad_norm": 0.29097482562065125, "learning_rate": 2.9560602320414717e-05, "loss": 0.0038, "step": 1706 }, { "epoch": 1.248719824433065, "grad_norm": 0.03079846315085888, "learning_rate": 2.954825968896569e-05, "loss": 0.0012, "step": 1707 }, { "epoch": 1.2494513533284564, "grad_norm": 7.011050701141357, "learning_rate": 2.9535917057516667e-05, "loss": 0.1862, "step": 1708 }, { "epoch": 1.2501828822238479, "grad_norm": 0.18550288677215576, "learning_rate": 2.952357442606764e-05, "loss": 0.0023, "step": 1709 }, { "epoch": 1.2509144111192392, "grad_norm": 0.1431731879711151, "learning_rate": 2.9511231794618614e-05, "loss": 0.0032, "step": 1710 }, { "epoch": 1.2516459400146305, "grad_norm": 0.24383807182312012, "learning_rate": 2.949888916316959e-05, "loss": 0.0042, "step": 1711 }, { "epoch": 1.252377468910022, "grad_norm": 0.5750638842582703, "learning_rate": 2.9486546531720564e-05, "loss": 0.0105, "step": 1712 }, { "epoch": 1.2531089978054133, "grad_norm": 1.4272632598876953, "learning_rate": 2.947420390027154e-05, "loss": 0.0192, "step": 1713 }, { "epoch": 1.2538405267008046, "grad_norm": 0.8683145642280579, "learning_rate": 2.9461861268822514e-05, "loss": 0.0077, "step": 1714 }, { "epoch": 1.2545720555961961, "grad_norm": 0.21079601347446442, "learning_rate": 2.944951863737349e-05, "loss": 0.003, "step": 1715 }, { "epoch": 1.2553035844915874, "grad_norm": 4.934976100921631, "learning_rate": 2.9437176005924465e-05, "loss": 0.0149, "step": 1716 }, { "epoch": 1.2560351133869787, "grad_norm": 6.738858222961426, "learning_rate": 2.942483337447544e-05, "loss": 0.2653, "step": 1717 }, { "epoch": 1.2567666422823702, "grad_norm": 4.889317512512207, "learning_rate": 2.9412490743026415e-05, "loss": 0.1818, "step": 1718 }, { "epoch": 1.2574981711777615, "grad_norm": 3.7046451568603516, "learning_rate": 2.940014811157739e-05, "loss": 0.0135, "step": 1719 }, { "epoch": 1.2582297000731528, "grad_norm": 15.436553001403809, "learning_rate": 2.9387805480128365e-05, "loss": 0.1049, "step": 1720 }, { "epoch": 1.2589612289685443, "grad_norm": 0.06980207562446594, "learning_rate": 2.9375462848679342e-05, "loss": 0.0015, "step": 1721 }, { "epoch": 1.2596927578639356, "grad_norm": 0.1953294724225998, "learning_rate": 2.9363120217230315e-05, "loss": 0.0029, "step": 1722 }, { "epoch": 1.260424286759327, "grad_norm": 0.22735042870044708, "learning_rate": 2.9350777585781292e-05, "loss": 0.0027, "step": 1723 }, { "epoch": 1.2611558156547185, "grad_norm": 0.13602055609226227, "learning_rate": 2.9338434954332265e-05, "loss": 0.0022, "step": 1724 }, { "epoch": 1.2618873445501098, "grad_norm": 8.833062171936035, "learning_rate": 2.9326092322883242e-05, "loss": 0.1335, "step": 1725 }, { "epoch": 1.262618873445501, "grad_norm": 0.12505896389484406, "learning_rate": 2.9313749691434216e-05, "loss": 0.0017, "step": 1726 }, { "epoch": 1.2633504023408926, "grad_norm": 0.3321450352668762, "learning_rate": 2.9301407059985192e-05, "loss": 0.0038, "step": 1727 }, { "epoch": 1.2640819312362839, "grad_norm": 5.421343803405762, "learning_rate": 2.9289064428536166e-05, "loss": 0.0166, "step": 1728 }, { "epoch": 1.2648134601316752, "grad_norm": 11.461223602294922, "learning_rate": 2.9276721797087143e-05, "loss": 0.0791, "step": 1729 }, { "epoch": 1.2655449890270667, "grad_norm": 0.10768299549818039, "learning_rate": 2.9264379165638116e-05, "loss": 0.0017, "step": 1730 }, { "epoch": 1.266276517922458, "grad_norm": 0.3361212909221649, "learning_rate": 2.9252036534189093e-05, "loss": 0.004, "step": 1731 }, { "epoch": 1.2670080468178493, "grad_norm": 0.16215574741363525, "learning_rate": 2.9239693902740066e-05, "loss": 0.0018, "step": 1732 }, { "epoch": 1.2677395757132408, "grad_norm": 0.10245111584663391, "learning_rate": 2.9227351271291043e-05, "loss": 0.0026, "step": 1733 }, { "epoch": 1.268471104608632, "grad_norm": 0.028203364461660385, "learning_rate": 2.9215008639842016e-05, "loss": 0.0011, "step": 1734 }, { "epoch": 1.2692026335040234, "grad_norm": 6.584430694580078, "learning_rate": 2.9202666008392993e-05, "loss": 0.0686, "step": 1735 }, { "epoch": 1.2699341623994147, "grad_norm": 7.199831962585449, "learning_rate": 2.9190323376943966e-05, "loss": 0.1157, "step": 1736 }, { "epoch": 1.2706656912948062, "grad_norm": 0.11032611131668091, "learning_rate": 2.9177980745494943e-05, "loss": 0.0021, "step": 1737 }, { "epoch": 1.2713972201901975, "grad_norm": 2.9857046604156494, "learning_rate": 2.9165638114045917e-05, "loss": 0.0058, "step": 1738 }, { "epoch": 1.2721287490855888, "grad_norm": 4.30684232711792, "learning_rate": 2.9153295482596893e-05, "loss": 0.1747, "step": 1739 }, { "epoch": 1.2728602779809801, "grad_norm": 10.451104164123535, "learning_rate": 2.9140952851147867e-05, "loss": 0.0135, "step": 1740 }, { "epoch": 1.2735918068763716, "grad_norm": 0.23021431267261505, "learning_rate": 2.9128610219698844e-05, "loss": 0.0024, "step": 1741 }, { "epoch": 1.274323335771763, "grad_norm": 0.24207520484924316, "learning_rate": 2.9116267588249817e-05, "loss": 0.0024, "step": 1742 }, { "epoch": 1.2750548646671542, "grad_norm": 0.20254820585250854, "learning_rate": 2.9103924956800794e-05, "loss": 0.0019, "step": 1743 }, { "epoch": 1.2757863935625458, "grad_norm": 0.2636348307132721, "learning_rate": 2.9091582325351767e-05, "loss": 0.0024, "step": 1744 }, { "epoch": 1.276517922457937, "grad_norm": 2.9102933406829834, "learning_rate": 2.9079239693902744e-05, "loss": 0.0083, "step": 1745 }, { "epoch": 1.2772494513533283, "grad_norm": 0.1519523561000824, "learning_rate": 2.9066897062453717e-05, "loss": 0.0015, "step": 1746 }, { "epoch": 1.2779809802487199, "grad_norm": 0.10689350217580795, "learning_rate": 2.9054554431004694e-05, "loss": 0.0013, "step": 1747 }, { "epoch": 1.2787125091441112, "grad_norm": 9.286120414733887, "learning_rate": 2.9042211799555668e-05, "loss": 0.0679, "step": 1748 }, { "epoch": 1.2794440380395025, "grad_norm": 0.04809322580695152, "learning_rate": 2.9029869168106644e-05, "loss": 0.0012, "step": 1749 }, { "epoch": 1.280175566934894, "grad_norm": 0.05622591823339462, "learning_rate": 2.9017526536657618e-05, "loss": 0.0013, "step": 1750 }, { "epoch": 1.2809070958302853, "grad_norm": 5.328927516937256, "learning_rate": 2.9005183905208595e-05, "loss": 0.2634, "step": 1751 }, { "epoch": 1.2816386247256766, "grad_norm": 3.8786399364471436, "learning_rate": 2.8992841273759568e-05, "loss": 0.0093, "step": 1752 }, { "epoch": 1.282370153621068, "grad_norm": 0.1426449865102768, "learning_rate": 2.8980498642310545e-05, "loss": 0.0015, "step": 1753 }, { "epoch": 1.2831016825164594, "grad_norm": 15.98990249633789, "learning_rate": 2.8968156010861518e-05, "loss": 0.0697, "step": 1754 }, { "epoch": 1.2838332114118507, "grad_norm": 1.2392244338989258, "learning_rate": 2.8955813379412495e-05, "loss": 0.0062, "step": 1755 }, { "epoch": 1.2845647403072422, "grad_norm": 0.1232057511806488, "learning_rate": 2.894347074796347e-05, "loss": 0.002, "step": 1756 }, { "epoch": 1.2852962692026335, "grad_norm": 1.9946963787078857, "learning_rate": 2.8931128116514445e-05, "loss": 0.0083, "step": 1757 }, { "epoch": 1.2860277980980248, "grad_norm": 0.41285473108291626, "learning_rate": 2.891878548506542e-05, "loss": 0.0025, "step": 1758 }, { "epoch": 1.2867593269934163, "grad_norm": 0.09127204865217209, "learning_rate": 2.8906442853616395e-05, "loss": 0.0018, "step": 1759 }, { "epoch": 1.2874908558888076, "grad_norm": 0.055202458053827286, "learning_rate": 2.889410022216737e-05, "loss": 0.0011, "step": 1760 }, { "epoch": 1.288222384784199, "grad_norm": 12.214336395263672, "learning_rate": 2.8881757590718346e-05, "loss": 0.0541, "step": 1761 }, { "epoch": 1.2889539136795904, "grad_norm": 24.29200553894043, "learning_rate": 2.886941495926932e-05, "loss": 0.1358, "step": 1762 }, { "epoch": 1.2896854425749817, "grad_norm": 0.11344381421804428, "learning_rate": 2.8857072327820296e-05, "loss": 0.0014, "step": 1763 }, { "epoch": 1.290416971470373, "grad_norm": 7.158273696899414, "learning_rate": 2.884472969637127e-05, "loss": 0.1966, "step": 1764 }, { "epoch": 1.2911485003657646, "grad_norm": 0.07925455272197723, "learning_rate": 2.8832387064922246e-05, "loss": 0.0012, "step": 1765 }, { "epoch": 1.2918800292611559, "grad_norm": 0.08947025239467621, "learning_rate": 2.882004443347322e-05, "loss": 0.0016, "step": 1766 }, { "epoch": 1.2926115581565472, "grad_norm": 2.7473320960998535, "learning_rate": 2.8807701802024196e-05, "loss": 0.0098, "step": 1767 }, { "epoch": 1.2933430870519387, "grad_norm": 16.689218521118164, "learning_rate": 2.879535917057517e-05, "loss": 0.0437, "step": 1768 }, { "epoch": 1.29407461594733, "grad_norm": 5.78226375579834, "learning_rate": 2.8783016539126146e-05, "loss": 0.0083, "step": 1769 }, { "epoch": 1.2948061448427213, "grad_norm": 6.549846172332764, "learning_rate": 2.877067390767712e-05, "loss": 0.0115, "step": 1770 }, { "epoch": 1.2955376737381128, "grad_norm": 1.443947672843933, "learning_rate": 2.8758331276228097e-05, "loss": 0.0037, "step": 1771 }, { "epoch": 1.296269202633504, "grad_norm": 0.7236607074737549, "learning_rate": 2.874598864477907e-05, "loss": 0.0044, "step": 1772 }, { "epoch": 1.2970007315288954, "grad_norm": 0.06648309528827667, "learning_rate": 2.8733646013330047e-05, "loss": 0.0013, "step": 1773 }, { "epoch": 1.2977322604242867, "grad_norm": 0.1442808210849762, "learning_rate": 2.872130338188102e-05, "loss": 0.0014, "step": 1774 }, { "epoch": 1.2984637893196782, "grad_norm": 0.2124911993741989, "learning_rate": 2.8708960750431997e-05, "loss": 0.0018, "step": 1775 }, { "epoch": 1.2991953182150695, "grad_norm": 0.1275920420885086, "learning_rate": 2.869661811898297e-05, "loss": 0.0013, "step": 1776 }, { "epoch": 1.2999268471104608, "grad_norm": 0.12637832760810852, "learning_rate": 2.8684275487533947e-05, "loss": 0.0012, "step": 1777 }, { "epoch": 1.300658376005852, "grad_norm": 12.046690940856934, "learning_rate": 2.867193285608492e-05, "loss": 0.0165, "step": 1778 }, { "epoch": 1.3013899049012436, "grad_norm": 0.8406500816345215, "learning_rate": 2.8659590224635897e-05, "loss": 0.0028, "step": 1779 }, { "epoch": 1.302121433796635, "grad_norm": 14.542376518249512, "learning_rate": 2.864724759318687e-05, "loss": 0.1849, "step": 1780 }, { "epoch": 1.3028529626920262, "grad_norm": 16.356403350830078, "learning_rate": 2.8634904961737848e-05, "loss": 0.2988, "step": 1781 }, { "epoch": 1.3035844915874177, "grad_norm": 0.17808149755001068, "learning_rate": 2.862256233028882e-05, "loss": 0.0012, "step": 1782 }, { "epoch": 1.304316020482809, "grad_norm": 0.05183994024991989, "learning_rate": 2.8610219698839798e-05, "loss": 0.001, "step": 1783 }, { "epoch": 1.3050475493782003, "grad_norm": 1.9269253015518188, "learning_rate": 2.859787706739077e-05, "loss": 0.1846, "step": 1784 }, { "epoch": 1.3057790782735919, "grad_norm": 1.6317641735076904, "learning_rate": 2.8585534435941748e-05, "loss": 0.0035, "step": 1785 }, { "epoch": 1.3065106071689832, "grad_norm": 0.0653458759188652, "learning_rate": 2.857319180449272e-05, "loss": 0.0011, "step": 1786 }, { "epoch": 1.3072421360643744, "grad_norm": 2.3851935863494873, "learning_rate": 2.8560849173043698e-05, "loss": 0.2072, "step": 1787 }, { "epoch": 1.307973664959766, "grad_norm": 21.60157585144043, "learning_rate": 2.854850654159467e-05, "loss": 0.0322, "step": 1788 }, { "epoch": 1.3087051938551573, "grad_norm": 0.048195261508226395, "learning_rate": 2.853616391014565e-05, "loss": 0.0014, "step": 1789 }, { "epoch": 1.3094367227505486, "grad_norm": 2.5669822692871094, "learning_rate": 2.8523821278696622e-05, "loss": 0.0064, "step": 1790 }, { "epoch": 1.31016825164594, "grad_norm": 0.45758163928985596, "learning_rate": 2.85114786472476e-05, "loss": 0.0025, "step": 1791 }, { "epoch": 1.3108997805413314, "grad_norm": 0.24777279794216156, "learning_rate": 2.8499136015798572e-05, "loss": 0.0024, "step": 1792 }, { "epoch": 1.3116313094367227, "grad_norm": 0.050745632499456406, "learning_rate": 2.848679338434955e-05, "loss": 0.0014, "step": 1793 }, { "epoch": 1.3123628383321142, "grad_norm": 0.07784255594015121, "learning_rate": 2.8474450752900522e-05, "loss": 0.0018, "step": 1794 }, { "epoch": 1.3130943672275055, "grad_norm": 6.690812587738037, "learning_rate": 2.84621081214515e-05, "loss": 0.2669, "step": 1795 }, { "epoch": 1.3138258961228968, "grad_norm": 0.07176528126001358, "learning_rate": 2.8449765490002472e-05, "loss": 0.0014, "step": 1796 }, { "epoch": 1.3145574250182883, "grad_norm": 0.09396917372941971, "learning_rate": 2.8437422858553446e-05, "loss": 0.0022, "step": 1797 }, { "epoch": 1.3152889539136796, "grad_norm": 2.0198380947113037, "learning_rate": 2.8425080227104423e-05, "loss": 0.1405, "step": 1798 }, { "epoch": 1.316020482809071, "grad_norm": 0.12232647836208344, "learning_rate": 2.8412737595655396e-05, "loss": 0.0018, "step": 1799 }, { "epoch": 1.3167520117044624, "grad_norm": 0.043935712426900864, "learning_rate": 2.8400394964206373e-05, "loss": 0.0013, "step": 1800 }, { "epoch": 1.3174835405998537, "grad_norm": 7.246397972106934, "learning_rate": 2.8388052332757346e-05, "loss": 0.0158, "step": 1801 }, { "epoch": 1.318215069495245, "grad_norm": 0.060723353177309036, "learning_rate": 2.8375709701308323e-05, "loss": 0.0014, "step": 1802 }, { "epoch": 1.3189465983906365, "grad_norm": 10.260765075683594, "learning_rate": 2.8363367069859296e-05, "loss": 0.028, "step": 1803 }, { "epoch": 1.3196781272860278, "grad_norm": 2.2107884883880615, "learning_rate": 2.8351024438410273e-05, "loss": 0.008, "step": 1804 }, { "epoch": 1.3204096561814191, "grad_norm": 0.04980083927512169, "learning_rate": 2.8338681806961247e-05, "loss": 0.0014, "step": 1805 }, { "epoch": 1.3211411850768107, "grad_norm": 1.4493277072906494, "learning_rate": 2.8326339175512223e-05, "loss": 0.005, "step": 1806 }, { "epoch": 1.321872713972202, "grad_norm": 0.8558061122894287, "learning_rate": 2.8313996544063197e-05, "loss": 0.0034, "step": 1807 }, { "epoch": 1.3226042428675933, "grad_norm": 4.188941478729248, "learning_rate": 2.8301653912614174e-05, "loss": 0.1032, "step": 1808 }, { "epoch": 1.3233357717629846, "grad_norm": 0.03073713183403015, "learning_rate": 2.8289311281165147e-05, "loss": 0.0009, "step": 1809 }, { "epoch": 1.324067300658376, "grad_norm": 12.643792152404785, "learning_rate": 2.8276968649716124e-05, "loss": 0.2509, "step": 1810 }, { "epoch": 1.3247988295537674, "grad_norm": 5.317176818847656, "learning_rate": 2.8264626018267097e-05, "loss": 0.0112, "step": 1811 }, { "epoch": 1.3255303584491587, "grad_norm": 0.04628003388643265, "learning_rate": 2.8252283386818074e-05, "loss": 0.0014, "step": 1812 }, { "epoch": 1.32626188734455, "grad_norm": 0.14215944707393646, "learning_rate": 2.8239940755369047e-05, "loss": 0.0025, "step": 1813 }, { "epoch": 1.3269934162399415, "grad_norm": 0.32109466195106506, "learning_rate": 2.8227598123920024e-05, "loss": 0.0045, "step": 1814 }, { "epoch": 1.3277249451353328, "grad_norm": 0.27894678711891174, "learning_rate": 2.8215255492470998e-05, "loss": 0.0031, "step": 1815 }, { "epoch": 1.328456474030724, "grad_norm": 0.22320519387722015, "learning_rate": 2.8202912861021974e-05, "loss": 0.0029, "step": 1816 }, { "epoch": 1.3291880029261156, "grad_norm": 2.853996753692627, "learning_rate": 2.8190570229572948e-05, "loss": 0.0105, "step": 1817 }, { "epoch": 1.329919531821507, "grad_norm": 0.20187124609947205, "learning_rate": 2.8178227598123925e-05, "loss": 0.0023, "step": 1818 }, { "epoch": 1.3306510607168982, "grad_norm": 1.0648127794265747, "learning_rate": 2.8165884966674898e-05, "loss": 0.0086, "step": 1819 }, { "epoch": 1.3313825896122897, "grad_norm": 0.3492770791053772, "learning_rate": 2.8153542335225875e-05, "loss": 0.0035, "step": 1820 }, { "epoch": 1.332114118507681, "grad_norm": 0.7084671854972839, "learning_rate": 2.8141199703776848e-05, "loss": 0.0033, "step": 1821 }, { "epoch": 1.3328456474030723, "grad_norm": 0.1272716224193573, "learning_rate": 2.8128857072327825e-05, "loss": 0.0016, "step": 1822 }, { "epoch": 1.3335771762984638, "grad_norm": 13.195730209350586, "learning_rate": 2.8116514440878795e-05, "loss": 0.3114, "step": 1823 }, { "epoch": 1.3343087051938551, "grad_norm": 0.9591020345687866, "learning_rate": 2.810417180942977e-05, "loss": 0.2511, "step": 1824 }, { "epoch": 1.3350402340892464, "grad_norm": 0.12171340733766556, "learning_rate": 2.8091829177980745e-05, "loss": 0.0016, "step": 1825 }, { "epoch": 1.335771762984638, "grad_norm": 12.044936180114746, "learning_rate": 2.807948654653172e-05, "loss": 0.0126, "step": 1826 }, { "epoch": 1.3365032918800293, "grad_norm": 3.128566026687622, "learning_rate": 2.8067143915082695e-05, "loss": 0.2039, "step": 1827 }, { "epoch": 1.3372348207754206, "grad_norm": 0.03416173532605171, "learning_rate": 2.805480128363367e-05, "loss": 0.0009, "step": 1828 }, { "epoch": 1.337966349670812, "grad_norm": 0.03100651316344738, "learning_rate": 2.8042458652184646e-05, "loss": 0.001, "step": 1829 }, { "epoch": 1.3386978785662034, "grad_norm": 5.396693229675293, "learning_rate": 2.803011602073562e-05, "loss": 0.1981, "step": 1830 }, { "epoch": 1.3394294074615947, "grad_norm": 0.8990545868873596, "learning_rate": 2.8017773389286596e-05, "loss": 0.208, "step": 1831 }, { "epoch": 1.3401609363569862, "grad_norm": 3.901585578918457, "learning_rate": 2.800543075783757e-05, "loss": 0.1499, "step": 1832 }, { "epoch": 1.3408924652523775, "grad_norm": 1.1778756380081177, "learning_rate": 2.7993088126388546e-05, "loss": 0.2152, "step": 1833 }, { "epoch": 1.3416239941477688, "grad_norm": 3.331019401550293, "learning_rate": 2.798074549493952e-05, "loss": 0.1917, "step": 1834 }, { "epoch": 1.3423555230431603, "grad_norm": 68.6905517578125, "learning_rate": 2.7968402863490496e-05, "loss": 0.5845, "step": 1835 }, { "epoch": 1.3430870519385516, "grad_norm": 22.617740631103516, "learning_rate": 2.795606023204147e-05, "loss": 0.3213, "step": 1836 }, { "epoch": 1.343818580833943, "grad_norm": 4.543148517608643, "learning_rate": 2.7943717600592446e-05, "loss": 0.0957, "step": 1837 }, { "epoch": 1.3445501097293344, "grad_norm": 3.749443292617798, "learning_rate": 2.793137496914342e-05, "loss": 0.1098, "step": 1838 }, { "epoch": 1.3452816386247257, "grad_norm": 0.15112486481666565, "learning_rate": 2.7919032337694396e-05, "loss": 0.0052, "step": 1839 }, { "epoch": 1.346013167520117, "grad_norm": 9.077709197998047, "learning_rate": 2.790668970624537e-05, "loss": 0.0215, "step": 1840 }, { "epoch": 1.3467446964155085, "grad_norm": 0.21458321809768677, "learning_rate": 2.7894347074796347e-05, "loss": 0.0081, "step": 1841 }, { "epoch": 1.3474762253108998, "grad_norm": 11.09518051147461, "learning_rate": 2.788200444334732e-05, "loss": 0.0389, "step": 1842 }, { "epoch": 1.3482077542062911, "grad_norm": 3.68719482421875, "learning_rate": 2.7869661811898297e-05, "loss": 0.1365, "step": 1843 }, { "epoch": 1.3489392831016827, "grad_norm": 17.889047622680664, "learning_rate": 2.785731918044927e-05, "loss": 0.0784, "step": 1844 }, { "epoch": 1.349670811997074, "grad_norm": 0.4776712954044342, "learning_rate": 2.7844976549000247e-05, "loss": 0.0148, "step": 1845 }, { "epoch": 1.3504023408924652, "grad_norm": 28.027448654174805, "learning_rate": 2.783263391755122e-05, "loss": 0.2002, "step": 1846 }, { "epoch": 1.3511338697878565, "grad_norm": 3.170456647872925, "learning_rate": 2.7820291286102197e-05, "loss": 0.1335, "step": 1847 }, { "epoch": 1.351865398683248, "grad_norm": 6.830007076263428, "learning_rate": 2.780794865465317e-05, "loss": 0.0318, "step": 1848 }, { "epoch": 1.3525969275786394, "grad_norm": 15.843450546264648, "learning_rate": 2.7795606023204147e-05, "loss": 0.1593, "step": 1849 }, { "epoch": 1.3533284564740307, "grad_norm": 0.31081652641296387, "learning_rate": 2.778326339175512e-05, "loss": 0.0101, "step": 1850 }, { "epoch": 1.354059985369422, "grad_norm": 1.1095820665359497, "learning_rate": 2.7770920760306098e-05, "loss": 0.0116, "step": 1851 }, { "epoch": 1.3547915142648135, "grad_norm": 0.24018685519695282, "learning_rate": 2.775857812885707e-05, "loss": 0.0108, "step": 1852 }, { "epoch": 1.3555230431602048, "grad_norm": 0.1752191036939621, "learning_rate": 2.7746235497408048e-05, "loss": 0.0072, "step": 1853 }, { "epoch": 1.356254572055596, "grad_norm": 0.2222859263420105, "learning_rate": 2.773389286595902e-05, "loss": 0.0099, "step": 1854 }, { "epoch": 1.3569861009509876, "grad_norm": 12.656047821044922, "learning_rate": 2.7721550234509998e-05, "loss": 0.1552, "step": 1855 }, { "epoch": 1.357717629846379, "grad_norm": 0.111142098903656, "learning_rate": 2.770920760306097e-05, "loss": 0.0052, "step": 1856 }, { "epoch": 1.3584491587417702, "grad_norm": 15.149170875549316, "learning_rate": 2.7696864971611948e-05, "loss": 0.0667, "step": 1857 }, { "epoch": 1.3591806876371617, "grad_norm": 0.9991952776908875, "learning_rate": 2.768452234016292e-05, "loss": 0.0094, "step": 1858 }, { "epoch": 1.359912216532553, "grad_norm": 6.887957572937012, "learning_rate": 2.76721797087139e-05, "loss": 0.2715, "step": 1859 }, { "epoch": 1.3606437454279443, "grad_norm": 5.423961639404297, "learning_rate": 2.7659837077264872e-05, "loss": 0.1371, "step": 1860 }, { "epoch": 1.3613752743233358, "grad_norm": 5.058558940887451, "learning_rate": 2.7647494445815845e-05, "loss": 0.0955, "step": 1861 }, { "epoch": 1.3621068032187271, "grad_norm": 0.8887763023376465, "learning_rate": 2.7635151814366822e-05, "loss": 0.0081, "step": 1862 }, { "epoch": 1.3628383321141184, "grad_norm": 1.3155757188796997, "learning_rate": 2.7622809182917795e-05, "loss": 0.1618, "step": 1863 }, { "epoch": 1.36356986100951, "grad_norm": 1.3501636981964111, "learning_rate": 2.7610466551468772e-05, "loss": 0.0077, "step": 1864 }, { "epoch": 1.3643013899049012, "grad_norm": 6.575420379638672, "learning_rate": 2.7598123920019746e-05, "loss": 0.1077, "step": 1865 }, { "epoch": 1.3650329188002925, "grad_norm": 3.900825262069702, "learning_rate": 2.7585781288570722e-05, "loss": 0.1476, "step": 1866 }, { "epoch": 1.365764447695684, "grad_norm": 7.671548366546631, "learning_rate": 2.7573438657121696e-05, "loss": 0.0841, "step": 1867 }, { "epoch": 1.3664959765910754, "grad_norm": 0.4088749587535858, "learning_rate": 2.7561096025672673e-05, "loss": 0.0063, "step": 1868 }, { "epoch": 1.3672275054864667, "grad_norm": 3.9428651332855225, "learning_rate": 2.7548753394223646e-05, "loss": 0.1071, "step": 1869 }, { "epoch": 1.3679590343818582, "grad_norm": 19.405229568481445, "learning_rate": 2.7536410762774623e-05, "loss": 0.1647, "step": 1870 }, { "epoch": 1.3686905632772495, "grad_norm": 0.15099018812179565, "learning_rate": 2.7524068131325596e-05, "loss": 0.0047, "step": 1871 }, { "epoch": 1.3694220921726408, "grad_norm": 0.09660230576992035, "learning_rate": 2.7511725499876573e-05, "loss": 0.0039, "step": 1872 }, { "epoch": 1.3701536210680323, "grad_norm": 1.953984260559082, "learning_rate": 2.7499382868427546e-05, "loss": 0.009, "step": 1873 }, { "epoch": 1.3708851499634236, "grad_norm": 0.13164840638637543, "learning_rate": 2.7487040236978523e-05, "loss": 0.0042, "step": 1874 }, { "epoch": 1.3716166788588149, "grad_norm": 1.7017769813537598, "learning_rate": 2.7474697605529497e-05, "loss": 0.1226, "step": 1875 }, { "epoch": 1.3723482077542064, "grad_norm": 16.974620819091797, "learning_rate": 2.7462354974080473e-05, "loss": 0.076, "step": 1876 }, { "epoch": 1.3730797366495977, "grad_norm": 0.12400826066732407, "learning_rate": 2.7450012342631447e-05, "loss": 0.0052, "step": 1877 }, { "epoch": 1.373811265544989, "grad_norm": 0.7306184768676758, "learning_rate": 2.7437669711182424e-05, "loss": 0.0082, "step": 1878 }, { "epoch": 1.3745427944403805, "grad_norm": 0.20663860440254211, "learning_rate": 2.7425327079733397e-05, "loss": 0.0054, "step": 1879 }, { "epoch": 1.3752743233357718, "grad_norm": 0.15679652988910675, "learning_rate": 2.7412984448284374e-05, "loss": 0.0037, "step": 1880 }, { "epoch": 1.3760058522311631, "grad_norm": 0.0565185509622097, "learning_rate": 2.7400641816835347e-05, "loss": 0.0025, "step": 1881 }, { "epoch": 1.3767373811265544, "grad_norm": 0.09534664452075958, "learning_rate": 2.7388299185386324e-05, "loss": 0.0034, "step": 1882 }, { "epoch": 1.377468910021946, "grad_norm": 0.4799453318119049, "learning_rate": 2.7375956553937297e-05, "loss": 0.0047, "step": 1883 }, { "epoch": 1.3782004389173372, "grad_norm": 7.367191791534424, "learning_rate": 2.7363613922488274e-05, "loss": 0.0861, "step": 1884 }, { "epoch": 1.3789319678127285, "grad_norm": 0.1105622947216034, "learning_rate": 2.7351271291039248e-05, "loss": 0.0034, "step": 1885 }, { "epoch": 1.37966349670812, "grad_norm": 0.3214864432811737, "learning_rate": 2.7338928659590224e-05, "loss": 0.0043, "step": 1886 }, { "epoch": 1.3803950256035113, "grad_norm": 0.11503909528255463, "learning_rate": 2.7326586028141198e-05, "loss": 0.0038, "step": 1887 }, { "epoch": 1.3811265544989026, "grad_norm": 23.636587142944336, "learning_rate": 2.7314243396692175e-05, "loss": 0.2334, "step": 1888 }, { "epoch": 1.381858083394294, "grad_norm": 7.382476329803467, "learning_rate": 2.7301900765243148e-05, "loss": 0.0239, "step": 1889 }, { "epoch": 1.3825896122896855, "grad_norm": 0.1507202535867691, "learning_rate": 2.7289558133794125e-05, "loss": 0.0041, "step": 1890 }, { "epoch": 1.3833211411850768, "grad_norm": 0.12308833003044128, "learning_rate": 2.7277215502345098e-05, "loss": 0.0028, "step": 1891 }, { "epoch": 1.384052670080468, "grad_norm": 0.18975494801998138, "learning_rate": 2.7264872870896075e-05, "loss": 0.0034, "step": 1892 }, { "epoch": 1.3847841989758596, "grad_norm": 0.06159798055887222, "learning_rate": 2.725253023944705e-05, "loss": 0.0018, "step": 1893 }, { "epoch": 1.3855157278712509, "grad_norm": 0.3732939660549164, "learning_rate": 2.7240187607998025e-05, "loss": 0.0032, "step": 1894 }, { "epoch": 1.3862472567666422, "grad_norm": 24.957801818847656, "learning_rate": 2.7227844976549e-05, "loss": 0.0548, "step": 1895 }, { "epoch": 1.3869787856620337, "grad_norm": 0.23713409900665283, "learning_rate": 2.7215502345099975e-05, "loss": 0.0025, "step": 1896 }, { "epoch": 1.387710314557425, "grad_norm": 0.04666120558977127, "learning_rate": 2.720315971365095e-05, "loss": 0.0018, "step": 1897 }, { "epoch": 1.3884418434528163, "grad_norm": 0.04518498107790947, "learning_rate": 2.7190817082201926e-05, "loss": 0.0016, "step": 1898 }, { "epoch": 1.3891733723482078, "grad_norm": 0.03895093500614166, "learning_rate": 2.71784744507529e-05, "loss": 0.0017, "step": 1899 }, { "epoch": 1.3899049012435991, "grad_norm": 0.18201333284378052, "learning_rate": 2.7166131819303876e-05, "loss": 0.0019, "step": 1900 }, { "epoch": 1.3906364301389904, "grad_norm": 0.03502019867300987, "learning_rate": 2.715378918785485e-05, "loss": 0.0015, "step": 1901 }, { "epoch": 1.391367959034382, "grad_norm": 9.174792289733887, "learning_rate": 2.7141446556405826e-05, "loss": 0.1207, "step": 1902 }, { "epoch": 1.3920994879297732, "grad_norm": 2.2482147216796875, "learning_rate": 2.71291039249568e-05, "loss": 0.1625, "step": 1903 }, { "epoch": 1.3928310168251645, "grad_norm": 25.96994972229004, "learning_rate": 2.7116761293507776e-05, "loss": 0.5245, "step": 1904 }, { "epoch": 1.393562545720556, "grad_norm": 0.13429957628250122, "learning_rate": 2.710441866205875e-05, "loss": 0.0016, "step": 1905 }, { "epoch": 1.3942940746159473, "grad_norm": 0.04306050017476082, "learning_rate": 2.7092076030609726e-05, "loss": 0.0014, "step": 1906 }, { "epoch": 1.3950256035113386, "grad_norm": 0.02853712998330593, "learning_rate": 2.70797333991607e-05, "loss": 0.0013, "step": 1907 }, { "epoch": 1.3957571324067302, "grad_norm": 0.024068374186754227, "learning_rate": 2.7067390767711677e-05, "loss": 0.001, "step": 1908 }, { "epoch": 1.3964886613021215, "grad_norm": 23.008901596069336, "learning_rate": 2.705504813626265e-05, "loss": 0.1422, "step": 1909 }, { "epoch": 1.3972201901975128, "grad_norm": 0.035917263478040695, "learning_rate": 2.7042705504813627e-05, "loss": 0.0012, "step": 1910 }, { "epoch": 1.3979517190929043, "grad_norm": 1.1120612621307373, "learning_rate": 2.70303628733646e-05, "loss": 0.0049, "step": 1911 }, { "epoch": 1.3986832479882956, "grad_norm": 0.08372478187084198, "learning_rate": 2.7018020241915577e-05, "loss": 0.002, "step": 1912 }, { "epoch": 1.3994147768836869, "grad_norm": 0.06383096426725388, "learning_rate": 2.700567761046655e-05, "loss": 0.0015, "step": 1913 }, { "epoch": 1.4001463057790784, "grad_norm": 1.0622469186782837, "learning_rate": 2.6993334979017527e-05, "loss": 0.0034, "step": 1914 }, { "epoch": 1.4008778346744697, "grad_norm": 0.19346275925636292, "learning_rate": 2.69809923475685e-05, "loss": 0.0028, "step": 1915 }, { "epoch": 1.401609363569861, "grad_norm": 0.29795992374420166, "learning_rate": 2.6968649716119477e-05, "loss": 0.003, "step": 1916 }, { "epoch": 1.4023408924652525, "grad_norm": 0.5166987180709839, "learning_rate": 2.695630708467045e-05, "loss": 0.0031, "step": 1917 }, { "epoch": 1.4030724213606438, "grad_norm": 0.4176037609577179, "learning_rate": 2.6943964453221428e-05, "loss": 0.0024, "step": 1918 }, { "epoch": 1.403803950256035, "grad_norm": 0.03540739044547081, "learning_rate": 2.69316218217724e-05, "loss": 0.0013, "step": 1919 }, { "epoch": 1.4045354791514264, "grad_norm": 19.65083885192871, "learning_rate": 2.6919279190323378e-05, "loss": 0.0623, "step": 1920 }, { "epoch": 1.405267008046818, "grad_norm": 0.03842616081237793, "learning_rate": 2.690693655887435e-05, "loss": 0.0012, "step": 1921 }, { "epoch": 1.4059985369422092, "grad_norm": 0.0355440117418766, "learning_rate": 2.6894593927425328e-05, "loss": 0.0012, "step": 1922 }, { "epoch": 1.4067300658376005, "grad_norm": 0.058585405349731445, "learning_rate": 2.68822512959763e-05, "loss": 0.0015, "step": 1923 }, { "epoch": 1.4074615947329918, "grad_norm": 0.03709329292178154, "learning_rate": 2.6869908664527278e-05, "loss": 0.0012, "step": 1924 }, { "epoch": 1.4081931236283833, "grad_norm": 9.315570831298828, "learning_rate": 2.685756603307825e-05, "loss": 0.1474, "step": 1925 }, { "epoch": 1.4089246525237746, "grad_norm": 2.823421001434326, "learning_rate": 2.684522340162923e-05, "loss": 0.1721, "step": 1926 }, { "epoch": 1.409656181419166, "grad_norm": 0.02676502987742424, "learning_rate": 2.6832880770180202e-05, "loss": 0.0011, "step": 1927 }, { "epoch": 1.4103877103145575, "grad_norm": 9.508275032043457, "learning_rate": 2.682053813873118e-05, "loss": 0.1135, "step": 1928 }, { "epoch": 1.4111192392099488, "grad_norm": 0.5816958546638489, "learning_rate": 2.6808195507282152e-05, "loss": 0.0028, "step": 1929 }, { "epoch": 1.41185076810534, "grad_norm": 0.11964984238147736, "learning_rate": 2.679585287583313e-05, "loss": 0.0017, "step": 1930 }, { "epoch": 1.4125822970007316, "grad_norm": 0.0669301450252533, "learning_rate": 2.6783510244384102e-05, "loss": 0.0013, "step": 1931 }, { "epoch": 1.4133138258961229, "grad_norm": 0.1584072858095169, "learning_rate": 2.677116761293508e-05, "loss": 0.0022, "step": 1932 }, { "epoch": 1.4140453547915142, "grad_norm": 0.05313430726528168, "learning_rate": 2.6758824981486052e-05, "loss": 0.0014, "step": 1933 }, { "epoch": 1.4147768836869057, "grad_norm": 0.020158834755420685, "learning_rate": 2.674648235003703e-05, "loss": 0.0009, "step": 1934 }, { "epoch": 1.415508412582297, "grad_norm": 0.09021595120429993, "learning_rate": 2.6734139718588002e-05, "loss": 0.0021, "step": 1935 }, { "epoch": 1.4162399414776883, "grad_norm": 0.047552209347486496, "learning_rate": 2.672179708713898e-05, "loss": 0.0015, "step": 1936 }, { "epoch": 1.4169714703730798, "grad_norm": 0.19564680755138397, "learning_rate": 2.6709454455689953e-05, "loss": 0.002, "step": 1937 }, { "epoch": 1.417702999268471, "grad_norm": 5.186217308044434, "learning_rate": 2.669711182424093e-05, "loss": 0.1221, "step": 1938 }, { "epoch": 1.4184345281638624, "grad_norm": 0.08323316276073456, "learning_rate": 2.6684769192791903e-05, "loss": 0.0015, "step": 1939 }, { "epoch": 1.419166057059254, "grad_norm": 0.0786207914352417, "learning_rate": 2.667242656134288e-05, "loss": 0.0014, "step": 1940 }, { "epoch": 1.4198975859546452, "grad_norm": 0.05360596999526024, "learning_rate": 2.6660083929893853e-05, "loss": 0.0011, "step": 1941 }, { "epoch": 1.4206291148500365, "grad_norm": 14.971633911132812, "learning_rate": 2.664774129844483e-05, "loss": 0.0142, "step": 1942 }, { "epoch": 1.421360643745428, "grad_norm": 0.0863538309931755, "learning_rate": 2.6635398666995803e-05, "loss": 0.0017, "step": 1943 }, { "epoch": 1.4220921726408193, "grad_norm": 0.8601455092430115, "learning_rate": 2.662305603554678e-05, "loss": 0.2524, "step": 1944 }, { "epoch": 1.4228237015362106, "grad_norm": 0.13717709481716156, "learning_rate": 2.6610713404097753e-05, "loss": 0.0016, "step": 1945 }, { "epoch": 1.4235552304316021, "grad_norm": 0.06864059716463089, "learning_rate": 2.6598370772648727e-05, "loss": 0.0016, "step": 1946 }, { "epoch": 1.4242867593269934, "grad_norm": 0.11807791888713837, "learning_rate": 2.6586028141199704e-05, "loss": 0.0021, "step": 1947 }, { "epoch": 1.4250182882223847, "grad_norm": 0.03255591541528702, "learning_rate": 2.6573685509750677e-05, "loss": 0.0013, "step": 1948 }, { "epoch": 1.4257498171177763, "grad_norm": 0.04108911007642746, "learning_rate": 2.6561342878301654e-05, "loss": 0.0012, "step": 1949 }, { "epoch": 1.4264813460131676, "grad_norm": 6.554471969604492, "learning_rate": 2.6549000246852627e-05, "loss": 0.1055, "step": 1950 }, { "epoch": 1.4272128749085589, "grad_norm": 25.037179946899414, "learning_rate": 2.6536657615403604e-05, "loss": 0.0388, "step": 1951 }, { "epoch": 1.4279444038039504, "grad_norm": 3.0384576320648193, "learning_rate": 2.6524314983954577e-05, "loss": 0.0123, "step": 1952 }, { "epoch": 1.4286759326993417, "grad_norm": 0.21539872884750366, "learning_rate": 2.6511972352505554e-05, "loss": 0.0025, "step": 1953 }, { "epoch": 1.429407461594733, "grad_norm": 0.13712063431739807, "learning_rate": 2.6499629721056528e-05, "loss": 0.0015, "step": 1954 }, { "epoch": 1.4301389904901245, "grad_norm": 0.030307583510875702, "learning_rate": 2.6487287089607504e-05, "loss": 0.0011, "step": 1955 }, { "epoch": 1.4308705193855158, "grad_norm": 0.05411555618047714, "learning_rate": 2.6474944458158478e-05, "loss": 0.0014, "step": 1956 }, { "epoch": 1.431602048280907, "grad_norm": 5.8378987312316895, "learning_rate": 2.6462601826709455e-05, "loss": 0.1395, "step": 1957 }, { "epoch": 1.4323335771762984, "grad_norm": 0.13799643516540527, "learning_rate": 2.6450259195260428e-05, "loss": 0.0017, "step": 1958 }, { "epoch": 1.43306510607169, "grad_norm": 0.04097986966371536, "learning_rate": 2.6437916563811405e-05, "loss": 0.0014, "step": 1959 }, { "epoch": 1.4337966349670812, "grad_norm": 0.7332201600074768, "learning_rate": 2.6425573932362378e-05, "loss": 0.0041, "step": 1960 }, { "epoch": 1.4345281638624725, "grad_norm": 0.044218629598617554, "learning_rate": 2.6413231300913355e-05, "loss": 0.0012, "step": 1961 }, { "epoch": 1.4352596927578638, "grad_norm": 0.026521675288677216, "learning_rate": 2.640088866946433e-05, "loss": 0.0011, "step": 1962 }, { "epoch": 1.4359912216532553, "grad_norm": 0.02946970798075199, "learning_rate": 2.6388546038015305e-05, "loss": 0.0011, "step": 1963 }, { "epoch": 1.4367227505486466, "grad_norm": 2.846529722213745, "learning_rate": 2.637620340656628e-05, "loss": 0.1862, "step": 1964 }, { "epoch": 1.437454279444038, "grad_norm": 0.24511314928531647, "learning_rate": 2.6363860775117255e-05, "loss": 0.002, "step": 1965 }, { "epoch": 1.4381858083394294, "grad_norm": 0.03718717023730278, "learning_rate": 2.635151814366823e-05, "loss": 0.0012, "step": 1966 }, { "epoch": 1.4389173372348207, "grad_norm": 0.041254136711359024, "learning_rate": 2.6339175512219206e-05, "loss": 0.0013, "step": 1967 }, { "epoch": 1.439648866130212, "grad_norm": 21.570533752441406, "learning_rate": 2.632683288077018e-05, "loss": 0.2721, "step": 1968 }, { "epoch": 1.4403803950256036, "grad_norm": 0.8306796550750732, "learning_rate": 2.6314490249321156e-05, "loss": 0.0039, "step": 1969 }, { "epoch": 1.4411119239209949, "grad_norm": 2.239537477493286, "learning_rate": 2.630214761787213e-05, "loss": 0.0047, "step": 1970 }, { "epoch": 1.4418434528163862, "grad_norm": 0.038632832467556, "learning_rate": 2.6289804986423106e-05, "loss": 0.0013, "step": 1971 }, { "epoch": 1.4425749817117777, "grad_norm": 0.8731203079223633, "learning_rate": 2.627746235497408e-05, "loss": 0.2173, "step": 1972 }, { "epoch": 1.443306510607169, "grad_norm": 0.06816928833723068, "learning_rate": 2.6265119723525056e-05, "loss": 0.0016, "step": 1973 }, { "epoch": 1.4440380395025603, "grad_norm": 0.02997785620391369, "learning_rate": 2.625277709207603e-05, "loss": 0.0011, "step": 1974 }, { "epoch": 1.4447695683979518, "grad_norm": 0.08018834888935089, "learning_rate": 2.6240434460627006e-05, "loss": 0.0017, "step": 1975 }, { "epoch": 1.445501097293343, "grad_norm": 0.03949261084198952, "learning_rate": 2.622809182917798e-05, "loss": 0.0016, "step": 1976 }, { "epoch": 1.4462326261887344, "grad_norm": 0.03708605840802193, "learning_rate": 2.6215749197728957e-05, "loss": 0.0015, "step": 1977 }, { "epoch": 1.446964155084126, "grad_norm": 0.03449865058064461, "learning_rate": 2.620340656627993e-05, "loss": 0.0014, "step": 1978 }, { "epoch": 1.4476956839795172, "grad_norm": 7.810178756713867, "learning_rate": 2.6191063934830907e-05, "loss": 0.1208, "step": 1979 }, { "epoch": 1.4484272128749085, "grad_norm": 25.579721450805664, "learning_rate": 2.617872130338188e-05, "loss": 0.0578, "step": 1980 }, { "epoch": 1.4491587417703, "grad_norm": 4.636063098907471, "learning_rate": 2.6166378671932857e-05, "loss": 0.1344, "step": 1981 }, { "epoch": 1.4498902706656913, "grad_norm": 4.028949737548828, "learning_rate": 2.615403604048383e-05, "loss": 0.1297, "step": 1982 }, { "epoch": 1.4506217995610826, "grad_norm": 0.0466492734849453, "learning_rate": 2.6141693409034807e-05, "loss": 0.0016, "step": 1983 }, { "epoch": 1.4513533284564741, "grad_norm": 13.238863945007324, "learning_rate": 2.612935077758578e-05, "loss": 0.0221, "step": 1984 }, { "epoch": 1.4520848573518654, "grad_norm": 0.06563273817300797, "learning_rate": 2.6117008146136757e-05, "loss": 0.0025, "step": 1985 }, { "epoch": 1.4528163862472567, "grad_norm": 0.08094670623540878, "learning_rate": 2.610466551468773e-05, "loss": 0.0022, "step": 1986 }, { "epoch": 1.4535479151426483, "grad_norm": 0.046696510165929794, "learning_rate": 2.6092322883238708e-05, "loss": 0.0015, "step": 1987 }, { "epoch": 1.4542794440380395, "grad_norm": 0.10272881388664246, "learning_rate": 2.607998025178968e-05, "loss": 0.0032, "step": 1988 }, { "epoch": 1.4550109729334308, "grad_norm": 12.404629707336426, "learning_rate": 2.6067637620340658e-05, "loss": 0.0203, "step": 1989 }, { "epoch": 1.4557425018288224, "grad_norm": 0.32123270630836487, "learning_rate": 2.605529498889163e-05, "loss": 0.0048, "step": 1990 }, { "epoch": 1.4564740307242137, "grad_norm": 0.05419204384088516, "learning_rate": 2.6042952357442608e-05, "loss": 0.0016, "step": 1991 }, { "epoch": 1.457205559619605, "grad_norm": 8.075190544128418, "learning_rate": 2.603060972599358e-05, "loss": 0.084, "step": 1992 }, { "epoch": 1.4579370885149963, "grad_norm": 0.07712900638580322, "learning_rate": 2.6018267094544558e-05, "loss": 0.0027, "step": 1993 }, { "epoch": 1.4586686174103878, "grad_norm": 0.0794815868139267, "learning_rate": 2.600592446309553e-05, "loss": 0.002, "step": 1994 }, { "epoch": 1.459400146305779, "grad_norm": 0.040698979049921036, "learning_rate": 2.599358183164651e-05, "loss": 0.0016, "step": 1995 }, { "epoch": 1.4601316752011704, "grad_norm": 0.06088996306061745, "learning_rate": 2.5981239200197482e-05, "loss": 0.0017, "step": 1996 }, { "epoch": 1.460863204096562, "grad_norm": 0.5008528828620911, "learning_rate": 2.596889656874846e-05, "loss": 0.0052, "step": 1997 }, { "epoch": 1.4615947329919532, "grad_norm": 0.02558470517396927, "learning_rate": 2.5956553937299432e-05, "loss": 0.0012, "step": 1998 }, { "epoch": 1.4623262618873445, "grad_norm": 3.1339831352233887, "learning_rate": 2.594421130585041e-05, "loss": 0.0889, "step": 1999 }, { "epoch": 1.4630577907827358, "grad_norm": 0.25751927495002747, "learning_rate": 2.5931868674401382e-05, "loss": 0.0031, "step": 2000 }, { "epoch": 1.4637893196781273, "grad_norm": 0.029645945876836777, "learning_rate": 2.591952604295236e-05, "loss": 0.0012, "step": 2001 }, { "epoch": 1.4645208485735186, "grad_norm": 0.03329916298389435, "learning_rate": 2.5907183411503332e-05, "loss": 0.0012, "step": 2002 }, { "epoch": 1.46525237746891, "grad_norm": 0.02891611121594906, "learning_rate": 2.589484078005431e-05, "loss": 0.0013, "step": 2003 }, { "epoch": 1.4659839063643014, "grad_norm": 4.984777927398682, "learning_rate": 2.5882498148605283e-05, "loss": 0.1311, "step": 2004 }, { "epoch": 1.4667154352596927, "grad_norm": 0.12564000487327576, "learning_rate": 2.587015551715626e-05, "loss": 0.002, "step": 2005 }, { "epoch": 1.467446964155084, "grad_norm": 0.3840942084789276, "learning_rate": 2.5857812885707233e-05, "loss": 0.0039, "step": 2006 }, { "epoch": 1.4681784930504755, "grad_norm": 0.4351533055305481, "learning_rate": 2.584547025425821e-05, "loss": 0.0029, "step": 2007 }, { "epoch": 1.4689100219458668, "grad_norm": 6.025134563446045, "learning_rate": 2.5833127622809183e-05, "loss": 0.1342, "step": 2008 }, { "epoch": 1.4696415508412581, "grad_norm": 0.0451236255466938, "learning_rate": 2.582078499136016e-05, "loss": 0.0014, "step": 2009 }, { "epoch": 1.4703730797366497, "grad_norm": 0.0830737054347992, "learning_rate": 2.5808442359911133e-05, "loss": 0.0013, "step": 2010 }, { "epoch": 1.471104608632041, "grad_norm": 8.053807258605957, "learning_rate": 2.579609972846211e-05, "loss": 0.0225, "step": 2011 }, { "epoch": 1.4718361375274323, "grad_norm": 0.04312034323811531, "learning_rate": 2.5783757097013083e-05, "loss": 0.0014, "step": 2012 }, { "epoch": 1.4725676664228238, "grad_norm": 0.9501346349716187, "learning_rate": 2.577141446556406e-05, "loss": 0.0064, "step": 2013 }, { "epoch": 1.473299195318215, "grad_norm": 0.031026503071188927, "learning_rate": 2.5759071834115034e-05, "loss": 0.0011, "step": 2014 }, { "epoch": 1.4740307242136064, "grad_norm": 0.17020274698734283, "learning_rate": 2.574672920266601e-05, "loss": 0.0018, "step": 2015 }, { "epoch": 1.474762253108998, "grad_norm": 0.043253958225250244, "learning_rate": 2.5734386571216984e-05, "loss": 0.0013, "step": 2016 }, { "epoch": 1.4754937820043892, "grad_norm": 3.8486411571502686, "learning_rate": 2.572204393976796e-05, "loss": 0.2096, "step": 2017 }, { "epoch": 1.4762253108997805, "grad_norm": 0.02920745126903057, "learning_rate": 2.5709701308318934e-05, "loss": 0.0012, "step": 2018 }, { "epoch": 1.476956839795172, "grad_norm": 0.07786602526903152, "learning_rate": 2.569735867686991e-05, "loss": 0.0012, "step": 2019 }, { "epoch": 1.4776883686905633, "grad_norm": 13.07553768157959, "learning_rate": 2.5685016045420884e-05, "loss": 0.2143, "step": 2020 }, { "epoch": 1.4784198975859546, "grad_norm": 0.11074522882699966, "learning_rate": 2.567267341397186e-05, "loss": 0.0016, "step": 2021 }, { "epoch": 1.4791514264813461, "grad_norm": 0.03949734568595886, "learning_rate": 2.5660330782522834e-05, "loss": 0.0011, "step": 2022 }, { "epoch": 1.4798829553767374, "grad_norm": 0.041300442069768906, "learning_rate": 2.564798815107381e-05, "loss": 0.0012, "step": 2023 }, { "epoch": 1.4806144842721287, "grad_norm": 5.278043270111084, "learning_rate": 2.5635645519624785e-05, "loss": 0.1039, "step": 2024 }, { "epoch": 1.4813460131675202, "grad_norm": 0.0973777249455452, "learning_rate": 2.562330288817576e-05, "loss": 0.0014, "step": 2025 }, { "epoch": 1.4820775420629115, "grad_norm": 7.051520824432373, "learning_rate": 2.5610960256726735e-05, "loss": 0.1263, "step": 2026 }, { "epoch": 1.4828090709583028, "grad_norm": 0.39372581243515015, "learning_rate": 2.559861762527771e-05, "loss": 0.0033, "step": 2027 }, { "epoch": 1.4835405998536944, "grad_norm": 0.19568459689617157, "learning_rate": 2.5586274993828685e-05, "loss": 0.0028, "step": 2028 }, { "epoch": 1.4842721287490857, "grad_norm": 0.09231149405241013, "learning_rate": 2.557393236237966e-05, "loss": 0.0021, "step": 2029 }, { "epoch": 1.485003657644477, "grad_norm": 0.06367408484220505, "learning_rate": 2.5561589730930635e-05, "loss": 0.0013, "step": 2030 }, { "epoch": 1.4857351865398682, "grad_norm": 0.09873741865158081, "learning_rate": 2.5549247099481612e-05, "loss": 0.0018, "step": 2031 }, { "epoch": 1.4864667154352598, "grad_norm": 0.0337274968624115, "learning_rate": 2.5536904468032585e-05, "loss": 0.0011, "step": 2032 }, { "epoch": 1.487198244330651, "grad_norm": 0.3948884904384613, "learning_rate": 2.552456183658356e-05, "loss": 0.0037, "step": 2033 }, { "epoch": 1.4879297732260424, "grad_norm": 0.4969221353530884, "learning_rate": 2.5512219205134535e-05, "loss": 0.0047, "step": 2034 }, { "epoch": 1.4886613021214337, "grad_norm": 0.24492789804935455, "learning_rate": 2.549987657368551e-05, "loss": 0.0019, "step": 2035 }, { "epoch": 1.4893928310168252, "grad_norm": 0.06790785491466522, "learning_rate": 2.5487533942236486e-05, "loss": 0.0012, "step": 2036 }, { "epoch": 1.4901243599122165, "grad_norm": 12.477131843566895, "learning_rate": 2.547519131078746e-05, "loss": 0.0292, "step": 2037 }, { "epoch": 1.4908558888076078, "grad_norm": 12.20120620727539, "learning_rate": 2.5462848679338436e-05, "loss": 0.0292, "step": 2038 }, { "epoch": 1.4915874177029993, "grad_norm": 0.037982892245054245, "learning_rate": 2.545050604788941e-05, "loss": 0.0009, "step": 2039 }, { "epoch": 1.4923189465983906, "grad_norm": 0.10873787850141525, "learning_rate": 2.5438163416440386e-05, "loss": 0.0021, "step": 2040 }, { "epoch": 1.493050475493782, "grad_norm": 0.01581929437816143, "learning_rate": 2.542582078499136e-05, "loss": 0.0007, "step": 2041 }, { "epoch": 1.4937820043891734, "grad_norm": 0.037799131125211716, "learning_rate": 2.5413478153542336e-05, "loss": 0.0009, "step": 2042 }, { "epoch": 1.4945135332845647, "grad_norm": 0.4331555962562561, "learning_rate": 2.540113552209331e-05, "loss": 0.0022, "step": 2043 }, { "epoch": 1.495245062179956, "grad_norm": 16.956090927124023, "learning_rate": 2.5388792890644286e-05, "loss": 0.0462, "step": 2044 }, { "epoch": 1.4959765910753475, "grad_norm": 13.278166770935059, "learning_rate": 2.537645025919526e-05, "loss": 0.0835, "step": 2045 }, { "epoch": 1.4967081199707388, "grad_norm": 9.940723419189453, "learning_rate": 2.5364107627746237e-05, "loss": 0.1856, "step": 2046 }, { "epoch": 1.4974396488661301, "grad_norm": 0.07437563687562943, "learning_rate": 2.535176499629721e-05, "loss": 0.001, "step": 2047 }, { "epoch": 1.4981711777615216, "grad_norm": 0.09298329800367355, "learning_rate": 2.5339422364848187e-05, "loss": 0.0013, "step": 2048 }, { "epoch": 1.498902706656913, "grad_norm": 0.027558503672480583, "learning_rate": 2.532707973339916e-05, "loss": 0.0007, "step": 2049 }, { "epoch": 1.4996342355523042, "grad_norm": 0.05275377258658409, "learning_rate": 2.5314737101950137e-05, "loss": 0.0011, "step": 2050 }, { "epoch": 1.5003657644476958, "grad_norm": 0.27894043922424316, "learning_rate": 2.530239447050111e-05, "loss": 0.0019, "step": 2051 }, { "epoch": 1.501097293343087, "grad_norm": 0.3751441240310669, "learning_rate": 2.5290051839052087e-05, "loss": 0.0013, "step": 2052 }, { "epoch": 1.5018288222384784, "grad_norm": 0.01617000810801983, "learning_rate": 2.527770920760306e-05, "loss": 0.0006, "step": 2053 }, { "epoch": 1.5025603511338699, "grad_norm": 0.5398091077804565, "learning_rate": 2.5265366576154037e-05, "loss": 0.0013, "step": 2054 }, { "epoch": 1.5032918800292612, "grad_norm": 0.09681782126426697, "learning_rate": 2.525302394470501e-05, "loss": 0.001, "step": 2055 }, { "epoch": 1.5040234089246525, "grad_norm": 0.113011933863163, "learning_rate": 2.5240681313255988e-05, "loss": 0.0012, "step": 2056 }, { "epoch": 1.504754937820044, "grad_norm": 6.285932540893555, "learning_rate": 2.522833868180696e-05, "loss": 0.1377, "step": 2057 }, { "epoch": 1.5054864667154353, "grad_norm": 0.24208928644657135, "learning_rate": 2.5215996050357938e-05, "loss": 0.0015, "step": 2058 }, { "epoch": 1.5062179956108266, "grad_norm": 11.929615020751953, "learning_rate": 2.520365341890891e-05, "loss": 0.0116, "step": 2059 }, { "epoch": 1.506949524506218, "grad_norm": 0.016163159161806107, "learning_rate": 2.5191310787459888e-05, "loss": 0.0006, "step": 2060 }, { "epoch": 1.5076810534016094, "grad_norm": 14.400252342224121, "learning_rate": 2.517896815601086e-05, "loss": 0.1822, "step": 2061 }, { "epoch": 1.5084125822970007, "grad_norm": 0.10265344381332397, "learning_rate": 2.5166625524561838e-05, "loss": 0.0011, "step": 2062 }, { "epoch": 1.5091441111923922, "grad_norm": 0.012980208732187748, "learning_rate": 2.515428289311281e-05, "loss": 0.0006, "step": 2063 }, { "epoch": 1.5098756400877835, "grad_norm": 7.571216106414795, "learning_rate": 2.514194026166379e-05, "loss": 0.3319, "step": 2064 }, { "epoch": 1.5106071689831748, "grad_norm": 9.08866024017334, "learning_rate": 2.5129597630214762e-05, "loss": 0.1016, "step": 2065 }, { "epoch": 1.5113386978785663, "grad_norm": 9.717842102050781, "learning_rate": 2.511725499876574e-05, "loss": 0.0124, "step": 2066 }, { "epoch": 1.5120702267739574, "grad_norm": 0.22429661452770233, "learning_rate": 2.5104912367316712e-05, "loss": 0.0014, "step": 2067 }, { "epoch": 1.512801755669349, "grad_norm": 9.517912864685059, "learning_rate": 2.509256973586769e-05, "loss": 0.1422, "step": 2068 }, { "epoch": 1.5135332845647405, "grad_norm": 3.52030611038208, "learning_rate": 2.5080227104418662e-05, "loss": 0.0075, "step": 2069 }, { "epoch": 1.5142648134601315, "grad_norm": 0.05821005254983902, "learning_rate": 2.506788447296964e-05, "loss": 0.0007, "step": 2070 }, { "epoch": 1.514996342355523, "grad_norm": 21.80573081970215, "learning_rate": 2.5055541841520612e-05, "loss": 0.1189, "step": 2071 }, { "epoch": 1.5157278712509146, "grad_norm": 0.04134508594870567, "learning_rate": 2.504319921007159e-05, "loss": 0.001, "step": 2072 }, { "epoch": 1.5164594001463056, "grad_norm": 0.20870333909988403, "learning_rate": 2.5030856578622563e-05, "loss": 0.0008, "step": 2073 }, { "epoch": 1.5171909290416972, "grad_norm": 21.334503173828125, "learning_rate": 2.501851394717354e-05, "loss": 0.0499, "step": 2074 }, { "epoch": 1.5179224579370885, "grad_norm": 0.48392313718795776, "learning_rate": 2.5006171315724513e-05, "loss": 0.0029, "step": 2075 }, { "epoch": 1.5186539868324798, "grad_norm": 9.143455505371094, "learning_rate": 2.499382868427549e-05, "loss": 0.1255, "step": 2076 }, { "epoch": 1.5193855157278713, "grad_norm": 0.9720236659049988, "learning_rate": 2.4981486052826463e-05, "loss": 0.0046, "step": 2077 }, { "epoch": 1.5201170446232626, "grad_norm": 8.028404235839844, "learning_rate": 2.496914342137744e-05, "loss": 0.2383, "step": 2078 }, { "epoch": 1.5208485735186539, "grad_norm": 16.800121307373047, "learning_rate": 2.4956800789928413e-05, "loss": 0.1611, "step": 2079 }, { "epoch": 1.5215801024140454, "grad_norm": 4.422765731811523, "learning_rate": 2.494445815847939e-05, "loss": 0.2162, "step": 2080 }, { "epoch": 1.5223116313094367, "grad_norm": 1.0840510129928589, "learning_rate": 2.4932115527030363e-05, "loss": 0.0045, "step": 2081 }, { "epoch": 1.523043160204828, "grad_norm": 11.630794525146484, "learning_rate": 2.491977289558134e-05, "loss": 0.0784, "step": 2082 }, { "epoch": 1.5237746891002195, "grad_norm": 0.41257283091545105, "learning_rate": 2.4907430264132314e-05, "loss": 0.0045, "step": 2083 }, { "epoch": 1.5245062179956108, "grad_norm": 0.04903878644108772, "learning_rate": 2.489508763268329e-05, "loss": 0.0011, "step": 2084 }, { "epoch": 1.525237746891002, "grad_norm": 0.13361847400665283, "learning_rate": 2.4882745001234264e-05, "loss": 0.0018, "step": 2085 }, { "epoch": 1.5259692757863936, "grad_norm": 0.024118226021528244, "learning_rate": 2.487040236978524e-05, "loss": 0.0007, "step": 2086 }, { "epoch": 1.526700804681785, "grad_norm": 6.467494964599609, "learning_rate": 2.4858059738336214e-05, "loss": 0.2822, "step": 2087 }, { "epoch": 1.5274323335771762, "grad_norm": 0.4925498962402344, "learning_rate": 2.484571710688719e-05, "loss": 0.0047, "step": 2088 }, { "epoch": 1.5281638624725677, "grad_norm": 3.4251532554626465, "learning_rate": 2.4833374475438164e-05, "loss": 0.4143, "step": 2089 }, { "epoch": 1.528895391367959, "grad_norm": 0.1255357265472412, "learning_rate": 2.482103184398914e-05, "loss": 0.002, "step": 2090 }, { "epoch": 1.5296269202633503, "grad_norm": 0.19863352179527283, "learning_rate": 2.4808689212540114e-05, "loss": 0.0027, "step": 2091 }, { "epoch": 1.5303584491587419, "grad_norm": 1.028275728225708, "learning_rate": 2.479634658109109e-05, "loss": 0.0076, "step": 2092 }, { "epoch": 1.5310899780541332, "grad_norm": 0.9456101059913635, "learning_rate": 2.4784003949642065e-05, "loss": 0.0053, "step": 2093 }, { "epoch": 1.5318215069495245, "grad_norm": 0.21153861284255981, "learning_rate": 2.477166131819304e-05, "loss": 0.0036, "step": 2094 }, { "epoch": 1.532553035844916, "grad_norm": 0.029161468148231506, "learning_rate": 2.4759318686744015e-05, "loss": 0.0011, "step": 2095 }, { "epoch": 1.5332845647403073, "grad_norm": 8.403043746948242, "learning_rate": 2.474697605529499e-05, "loss": 0.133, "step": 2096 }, { "epoch": 1.5340160936356986, "grad_norm": 0.1335066705942154, "learning_rate": 2.4734633423845965e-05, "loss": 0.0028, "step": 2097 }, { "epoch": 1.53474762253109, "grad_norm": 0.34632954001426697, "learning_rate": 2.4722290792396942e-05, "loss": 0.0035, "step": 2098 }, { "epoch": 1.5354791514264814, "grad_norm": 0.054209865629673004, "learning_rate": 2.4709948160947915e-05, "loss": 0.0015, "step": 2099 }, { "epoch": 1.5362106803218727, "grad_norm": 7.724459171295166, "learning_rate": 2.4697605529498892e-05, "loss": 0.1965, "step": 2100 }, { "epoch": 1.5369422092172642, "grad_norm": 0.3506838083267212, "learning_rate": 2.4685262898049865e-05, "loss": 0.0021, "step": 2101 }, { "epoch": 1.5376737381126553, "grad_norm": 0.2718561291694641, "learning_rate": 2.4672920266600842e-05, "loss": 0.0035, "step": 2102 }, { "epoch": 1.5384052670080468, "grad_norm": 3.537734270095825, "learning_rate": 2.4660577635151816e-05, "loss": 0.174, "step": 2103 }, { "epoch": 1.5391367959034383, "grad_norm": 0.39597460627555847, "learning_rate": 2.4648235003702792e-05, "loss": 0.0057, "step": 2104 }, { "epoch": 1.5398683247988294, "grad_norm": 0.3976641297340393, "learning_rate": 2.4635892372253766e-05, "loss": 0.0055, "step": 2105 }, { "epoch": 1.540599853694221, "grad_norm": 7.167951583862305, "learning_rate": 2.4623549740804743e-05, "loss": 0.2589, "step": 2106 }, { "epoch": 1.5413313825896124, "grad_norm": 0.11426830291748047, "learning_rate": 2.4611207109355716e-05, "loss": 0.0016, "step": 2107 }, { "epoch": 1.5420629114850035, "grad_norm": 0.2973451018333435, "learning_rate": 2.4598864477906693e-05, "loss": 0.004, "step": 2108 }, { "epoch": 1.542794440380395, "grad_norm": 0.1464003324508667, "learning_rate": 2.4586521846457666e-05, "loss": 0.0032, "step": 2109 }, { "epoch": 1.5435259692757866, "grad_norm": 0.11458850651979446, "learning_rate": 2.4574179215008643e-05, "loss": 0.0018, "step": 2110 }, { "epoch": 1.5442574981711776, "grad_norm": 16.706283569335938, "learning_rate": 2.4561836583559616e-05, "loss": 0.04, "step": 2111 }, { "epoch": 1.5449890270665692, "grad_norm": 6.102662563323975, "learning_rate": 2.4549493952110593e-05, "loss": 0.1019, "step": 2112 }, { "epoch": 1.5457205559619605, "grad_norm": 0.054222527891397476, "learning_rate": 2.4537151320661567e-05, "loss": 0.0013, "step": 2113 }, { "epoch": 1.5464520848573517, "grad_norm": 5.313819885253906, "learning_rate": 2.4524808689212543e-05, "loss": 0.0106, "step": 2114 }, { "epoch": 1.5471836137527433, "grad_norm": 0.3333180844783783, "learning_rate": 2.4512466057763517e-05, "loss": 0.0023, "step": 2115 }, { "epoch": 1.5479151426481346, "grad_norm": 3.9451229572296143, "learning_rate": 2.4500123426314494e-05, "loss": 0.0122, "step": 2116 }, { "epoch": 1.5486466715435259, "grad_norm": 0.052427515387535095, "learning_rate": 2.4487780794865467e-05, "loss": 0.0016, "step": 2117 }, { "epoch": 1.5493782004389174, "grad_norm": 0.03281524404883385, "learning_rate": 2.4475438163416444e-05, "loss": 0.0012, "step": 2118 }, { "epoch": 1.5501097293343087, "grad_norm": 0.03999284282326698, "learning_rate": 2.4463095531967417e-05, "loss": 0.0012, "step": 2119 }, { "epoch": 1.5508412582297, "grad_norm": 0.055051762610673904, "learning_rate": 2.445075290051839e-05, "loss": 0.0014, "step": 2120 }, { "epoch": 1.5515727871250915, "grad_norm": 0.07059404253959656, "learning_rate": 2.4438410269069367e-05, "loss": 0.002, "step": 2121 }, { "epoch": 1.5523043160204828, "grad_norm": 0.8079754710197449, "learning_rate": 2.442606763762034e-05, "loss": 0.0039, "step": 2122 }, { "epoch": 1.553035844915874, "grad_norm": 0.0407632440328598, "learning_rate": 2.4413725006171317e-05, "loss": 0.0009, "step": 2123 }, { "epoch": 1.5537673738112656, "grad_norm": 0.05133582279086113, "learning_rate": 2.440138237472229e-05, "loss": 0.0014, "step": 2124 }, { "epoch": 1.554498902706657, "grad_norm": 0.7259159088134766, "learning_rate": 2.4389039743273268e-05, "loss": 0.0028, "step": 2125 }, { "epoch": 1.5552304316020482, "grad_norm": 1.587578535079956, "learning_rate": 2.437669711182424e-05, "loss": 0.0044, "step": 2126 }, { "epoch": 1.5559619604974397, "grad_norm": 1.2375259399414062, "learning_rate": 2.4364354480375218e-05, "loss": 0.0034, "step": 2127 }, { "epoch": 1.556693489392831, "grad_norm": 0.11425071954727173, "learning_rate": 2.435201184892619e-05, "loss": 0.0013, "step": 2128 }, { "epoch": 1.5574250182882223, "grad_norm": 0.027797192335128784, "learning_rate": 2.4339669217477168e-05, "loss": 0.0008, "step": 2129 }, { "epoch": 1.5581565471836138, "grad_norm": 0.20051851868629456, "learning_rate": 2.432732658602814e-05, "loss": 0.0014, "step": 2130 }, { "epoch": 1.5588880760790051, "grad_norm": 0.03466016799211502, "learning_rate": 2.4314983954579118e-05, "loss": 0.0009, "step": 2131 }, { "epoch": 1.5596196049743964, "grad_norm": 0.3084155023097992, "learning_rate": 2.430264132313009e-05, "loss": 0.0022, "step": 2132 }, { "epoch": 1.560351133869788, "grad_norm": 10.639766693115234, "learning_rate": 2.429029869168107e-05, "loss": 0.1312, "step": 2133 }, { "epoch": 1.5610826627651793, "grad_norm": 5.684080123901367, "learning_rate": 2.4277956060232042e-05, "loss": 0.3033, "step": 2134 }, { "epoch": 1.5618141916605706, "grad_norm": 6.03232479095459, "learning_rate": 2.426561342878302e-05, "loss": 0.1763, "step": 2135 }, { "epoch": 1.562545720555962, "grad_norm": 24.24619483947754, "learning_rate": 2.4253270797333992e-05, "loss": 0.0228, "step": 2136 }, { "epoch": 1.5632772494513534, "grad_norm": 0.014190349727869034, "learning_rate": 2.424092816588497e-05, "loss": 0.0005, "step": 2137 }, { "epoch": 1.5640087783467447, "grad_norm": 20.298490524291992, "learning_rate": 2.4228585534435942e-05, "loss": 0.0393, "step": 2138 }, { "epoch": 1.5647403072421362, "grad_norm": 0.2679484188556671, "learning_rate": 2.421624290298692e-05, "loss": 0.002, "step": 2139 }, { "epoch": 1.5654718361375273, "grad_norm": 0.2261887788772583, "learning_rate": 2.4203900271537892e-05, "loss": 0.0014, "step": 2140 }, { "epoch": 1.5662033650329188, "grad_norm": 0.040463149547576904, "learning_rate": 2.419155764008887e-05, "loss": 0.0011, "step": 2141 }, { "epoch": 1.5669348939283103, "grad_norm": 2.6676025390625, "learning_rate": 2.4179215008639843e-05, "loss": 0.2242, "step": 2142 }, { "epoch": 1.5676664228237014, "grad_norm": 0.08802150189876556, "learning_rate": 2.416687237719082e-05, "loss": 0.0018, "step": 2143 }, { "epoch": 1.568397951719093, "grad_norm": 0.027073252946138382, "learning_rate": 2.4154529745741793e-05, "loss": 0.0008, "step": 2144 }, { "epoch": 1.5691294806144844, "grad_norm": 41.01927185058594, "learning_rate": 2.414218711429277e-05, "loss": 0.0568, "step": 2145 }, { "epoch": 1.5698610095098755, "grad_norm": 0.37385767698287964, "learning_rate": 2.4129844482843743e-05, "loss": 0.0031, "step": 2146 }, { "epoch": 1.570592538405267, "grad_norm": 0.6795276999473572, "learning_rate": 2.411750185139472e-05, "loss": 0.0028, "step": 2147 }, { "epoch": 1.5713240673006583, "grad_norm": 0.029087908565998077, "learning_rate": 2.4105159219945693e-05, "loss": 0.0009, "step": 2148 }, { "epoch": 1.5720555961960496, "grad_norm": 0.2608378827571869, "learning_rate": 2.409281658849667e-05, "loss": 0.002, "step": 2149 }, { "epoch": 1.5727871250914411, "grad_norm": 4.495846271514893, "learning_rate": 2.4080473957047643e-05, "loss": 0.1224, "step": 2150 }, { "epoch": 1.5735186539868324, "grad_norm": 0.025128819048404694, "learning_rate": 2.406813132559862e-05, "loss": 0.0008, "step": 2151 }, { "epoch": 1.5742501828822237, "grad_norm": 0.03189104422926903, "learning_rate": 2.4055788694149594e-05, "loss": 0.001, "step": 2152 }, { "epoch": 1.5749817117776153, "grad_norm": 0.05835723876953125, "learning_rate": 2.404344606270057e-05, "loss": 0.0013, "step": 2153 }, { "epoch": 1.5757132406730066, "grad_norm": 0.021284818649291992, "learning_rate": 2.4031103431251544e-05, "loss": 0.0007, "step": 2154 }, { "epoch": 1.5764447695683979, "grad_norm": 0.057024721056222916, "learning_rate": 2.401876079980252e-05, "loss": 0.0012, "step": 2155 }, { "epoch": 1.5771762984637894, "grad_norm": 0.03328898176550865, "learning_rate": 2.4006418168353494e-05, "loss": 0.001, "step": 2156 }, { "epoch": 1.5779078273591807, "grad_norm": 3.247894763946533, "learning_rate": 2.399407553690447e-05, "loss": 0.1204, "step": 2157 }, { "epoch": 1.578639356254572, "grad_norm": 0.05724049359560013, "learning_rate": 2.3981732905455444e-05, "loss": 0.0009, "step": 2158 }, { "epoch": 1.5793708851499635, "grad_norm": 11.957796096801758, "learning_rate": 2.396939027400642e-05, "loss": 0.0225, "step": 2159 }, { "epoch": 1.5801024140453548, "grad_norm": 0.05104149132966995, "learning_rate": 2.3957047642557394e-05, "loss": 0.0011, "step": 2160 }, { "epoch": 1.580833942940746, "grad_norm": 34.98112869262695, "learning_rate": 2.394470501110837e-05, "loss": 0.1164, "step": 2161 }, { "epoch": 1.5815654718361376, "grad_norm": 5.5984063148498535, "learning_rate": 2.3932362379659345e-05, "loss": 0.0971, "step": 2162 }, { "epoch": 1.582297000731529, "grad_norm": 0.7531339526176453, "learning_rate": 2.392001974821032e-05, "loss": 0.0031, "step": 2163 }, { "epoch": 1.5830285296269202, "grad_norm": 0.1534377485513687, "learning_rate": 2.3907677116761295e-05, "loss": 0.0027, "step": 2164 }, { "epoch": 1.5837600585223117, "grad_norm": 0.07887899875640869, "learning_rate": 2.389533448531227e-05, "loss": 0.0016, "step": 2165 }, { "epoch": 1.584491587417703, "grad_norm": 0.02782522141933441, "learning_rate": 2.3882991853863245e-05, "loss": 0.0007, "step": 2166 }, { "epoch": 1.5852231163130943, "grad_norm": 0.22543983161449432, "learning_rate": 2.3870649222414222e-05, "loss": 0.0025, "step": 2167 }, { "epoch": 1.5859546452084858, "grad_norm": 0.49229252338409424, "learning_rate": 2.3858306590965195e-05, "loss": 0.0022, "step": 2168 }, { "epoch": 1.5866861741038771, "grad_norm": 0.046476349234580994, "learning_rate": 2.3845963959516172e-05, "loss": 0.0012, "step": 2169 }, { "epoch": 1.5874177029992684, "grad_norm": 0.020438242703676224, "learning_rate": 2.3833621328067145e-05, "loss": 0.0007, "step": 2170 }, { "epoch": 1.58814923189466, "grad_norm": 25.77008819580078, "learning_rate": 2.3821278696618122e-05, "loss": 0.0364, "step": 2171 }, { "epoch": 1.5888807607900512, "grad_norm": 13.585511207580566, "learning_rate": 2.3808936065169096e-05, "loss": 0.0627, "step": 2172 }, { "epoch": 1.5896122896854425, "grad_norm": 0.054018739610910416, "learning_rate": 2.3796593433720072e-05, "loss": 0.0014, "step": 2173 }, { "epoch": 1.590343818580834, "grad_norm": 4.799535274505615, "learning_rate": 2.3784250802271046e-05, "loss": 0.1417, "step": 2174 }, { "epoch": 1.5910753474762254, "grad_norm": 0.04381488263607025, "learning_rate": 2.3771908170822023e-05, "loss": 0.001, "step": 2175 }, { "epoch": 1.5918068763716167, "grad_norm": 0.01695006713271141, "learning_rate": 2.3759565539372996e-05, "loss": 0.0007, "step": 2176 }, { "epoch": 1.5925384052670082, "grad_norm": 0.12022367864847183, "learning_rate": 2.3747222907923973e-05, "loss": 0.0026, "step": 2177 }, { "epoch": 1.5932699341623993, "grad_norm": 0.23695920407772064, "learning_rate": 2.3734880276474946e-05, "loss": 0.0021, "step": 2178 }, { "epoch": 1.5940014630577908, "grad_norm": 0.036392949521541595, "learning_rate": 2.3722537645025923e-05, "loss": 0.0009, "step": 2179 }, { "epoch": 1.5947329919531823, "grad_norm": 0.16978906095027924, "learning_rate": 2.3710195013576896e-05, "loss": 0.0022, "step": 2180 }, { "epoch": 1.5954645208485734, "grad_norm": 0.07295698672533035, "learning_rate": 2.3697852382127873e-05, "loss": 0.0015, "step": 2181 }, { "epoch": 1.596196049743965, "grad_norm": 0.29714617133140564, "learning_rate": 2.3685509750678847e-05, "loss": 0.0024, "step": 2182 }, { "epoch": 1.5969275786393564, "grad_norm": 31.912202835083008, "learning_rate": 2.3673167119229823e-05, "loss": 0.1003, "step": 2183 }, { "epoch": 1.5976591075347475, "grad_norm": 8.119739532470703, "learning_rate": 2.3660824487780797e-05, "loss": 0.1362, "step": 2184 }, { "epoch": 1.598390636430139, "grad_norm": 0.0398697704076767, "learning_rate": 2.3648481856331774e-05, "loss": 0.0008, "step": 2185 }, { "epoch": 1.5991221653255303, "grad_norm": 0.4417366683483124, "learning_rate": 2.3636139224882747e-05, "loss": 0.0019, "step": 2186 }, { "epoch": 1.5998536942209216, "grad_norm": 0.13984644412994385, "learning_rate": 2.3623796593433724e-05, "loss": 0.0013, "step": 2187 }, { "epoch": 1.6005852231163131, "grad_norm": 0.039299897849559784, "learning_rate": 2.3611453961984697e-05, "loss": 0.0006, "step": 2188 }, { "epoch": 1.6013167520117044, "grad_norm": 0.03039645217359066, "learning_rate": 2.3599111330535674e-05, "loss": 0.0008, "step": 2189 }, { "epoch": 1.6020482809070957, "grad_norm": 8.1900053024292, "learning_rate": 2.3586768699086647e-05, "loss": 0.0285, "step": 2190 }, { "epoch": 1.6027798098024872, "grad_norm": 0.20503687858581543, "learning_rate": 2.3574426067637624e-05, "loss": 0.0021, "step": 2191 }, { "epoch": 1.6035113386978785, "grad_norm": 0.037508275359869, "learning_rate": 2.3562083436188598e-05, "loss": 0.0007, "step": 2192 }, { "epoch": 1.6042428675932698, "grad_norm": 31.13962173461914, "learning_rate": 2.3549740804739574e-05, "loss": 0.0338, "step": 2193 }, { "epoch": 1.6049743964886614, "grad_norm": 5.151216983795166, "learning_rate": 2.3537398173290548e-05, "loss": 0.0075, "step": 2194 }, { "epoch": 1.6057059253840527, "grad_norm": 7.51584005355835, "learning_rate": 2.3525055541841525e-05, "loss": 0.1407, "step": 2195 }, { "epoch": 1.606437454279444, "grad_norm": 0.10378946363925934, "learning_rate": 2.3512712910392498e-05, "loss": 0.0012, "step": 2196 }, { "epoch": 1.6071689831748355, "grad_norm": 10.608672142028809, "learning_rate": 2.3500370278943475e-05, "loss": 0.0979, "step": 2197 }, { "epoch": 1.6079005120702268, "grad_norm": 0.315543532371521, "learning_rate": 2.3488027647494448e-05, "loss": 0.0013, "step": 2198 }, { "epoch": 1.608632040965618, "grad_norm": 7.049254417419434, "learning_rate": 2.3475685016045425e-05, "loss": 0.1374, "step": 2199 }, { "epoch": 1.6093635698610096, "grad_norm": 1.315214991569519, "learning_rate": 2.34633423845964e-05, "loss": 0.0025, "step": 2200 }, { "epoch": 1.6100950987564009, "grad_norm": 4.453960418701172, "learning_rate": 2.3450999753147375e-05, "loss": 0.0081, "step": 2201 }, { "epoch": 1.6108266276517922, "grad_norm": 0.030046818777918816, "learning_rate": 2.343865712169835e-05, "loss": 0.0005, "step": 2202 }, { "epoch": 1.6115581565471837, "grad_norm": 0.08305227011442184, "learning_rate": 2.3426314490249322e-05, "loss": 0.0009, "step": 2203 }, { "epoch": 1.612289685442575, "grad_norm": 30.578914642333984, "learning_rate": 2.3413971858800295e-05, "loss": 0.0477, "step": 2204 }, { "epoch": 1.6130212143379663, "grad_norm": 0.28381866216659546, "learning_rate": 2.3401629227351272e-05, "loss": 0.0021, "step": 2205 }, { "epoch": 1.6137527432333578, "grad_norm": 0.11982627213001251, "learning_rate": 2.3389286595902246e-05, "loss": 0.0014, "step": 2206 }, { "epoch": 1.6144842721287491, "grad_norm": 0.19102312624454498, "learning_rate": 2.3376943964453222e-05, "loss": 0.0018, "step": 2207 }, { "epoch": 1.6152158010241404, "grad_norm": 0.010723567567765713, "learning_rate": 2.3364601333004196e-05, "loss": 0.0004, "step": 2208 }, { "epoch": 1.615947329919532, "grad_norm": 2.155540704727173, "learning_rate": 2.3352258701555173e-05, "loss": 0.0048, "step": 2209 }, { "epoch": 1.6166788588149232, "grad_norm": 0.40661853551864624, "learning_rate": 2.3339916070106146e-05, "loss": 0.002, "step": 2210 }, { "epoch": 1.6174103877103145, "grad_norm": 27.163410186767578, "learning_rate": 2.3327573438657123e-05, "loss": 0.1794, "step": 2211 }, { "epoch": 1.618141916605706, "grad_norm": 0.009299539029598236, "learning_rate": 2.3315230807208096e-05, "loss": 0.0004, "step": 2212 }, { "epoch": 1.6188734455010971, "grad_norm": 0.34174078702926636, "learning_rate": 2.3302888175759073e-05, "loss": 0.0026, "step": 2213 }, { "epoch": 1.6196049743964887, "grad_norm": 25.971460342407227, "learning_rate": 2.3290545544310046e-05, "loss": 0.2873, "step": 2214 }, { "epoch": 1.6203365032918802, "grad_norm": 11.645547866821289, "learning_rate": 2.3278202912861023e-05, "loss": 0.1266, "step": 2215 }, { "epoch": 1.6210680321872712, "grad_norm": 0.040238212794065475, "learning_rate": 2.3265860281411997e-05, "loss": 0.0007, "step": 2216 }, { "epoch": 1.6217995610826628, "grad_norm": 0.008379988372325897, "learning_rate": 2.3253517649962973e-05, "loss": 0.0004, "step": 2217 }, { "epoch": 1.6225310899780543, "grad_norm": 0.050803616642951965, "learning_rate": 2.3241175018513947e-05, "loss": 0.0006, "step": 2218 }, { "epoch": 1.6232626188734454, "grad_norm": 0.06048891693353653, "learning_rate": 2.3228832387064924e-05, "loss": 0.0008, "step": 2219 }, { "epoch": 1.6239941477688369, "grad_norm": 0.04873788729310036, "learning_rate": 2.3216489755615897e-05, "loss": 0.0005, "step": 2220 }, { "epoch": 1.6247256766642284, "grad_norm": 1.2642399072647095, "learning_rate": 2.3204147124166874e-05, "loss": 0.2577, "step": 2221 }, { "epoch": 1.6254572055596195, "grad_norm": 0.25182831287384033, "learning_rate": 2.3191804492717847e-05, "loss": 0.0015, "step": 2222 }, { "epoch": 1.626188734455011, "grad_norm": 0.00987474899739027, "learning_rate": 2.3179461861268824e-05, "loss": 0.0004, "step": 2223 }, { "epoch": 1.6269202633504023, "grad_norm": 0.015665089711546898, "learning_rate": 2.3167119229819797e-05, "loss": 0.0005, "step": 2224 }, { "epoch": 1.6276517922457936, "grad_norm": 5.6231818199157715, "learning_rate": 2.3154776598370774e-05, "loss": 0.131, "step": 2225 }, { "epoch": 1.6283833211411851, "grad_norm": 7.336514949798584, "learning_rate": 2.3142433966921747e-05, "loss": 0.1625, "step": 2226 }, { "epoch": 1.6291148500365764, "grad_norm": 0.025976670905947685, "learning_rate": 2.3130091335472724e-05, "loss": 0.0006, "step": 2227 }, { "epoch": 1.6298463789319677, "grad_norm": 1.2648016214370728, "learning_rate": 2.3117748704023698e-05, "loss": 0.0039, "step": 2228 }, { "epoch": 1.6305779078273592, "grad_norm": 0.032456304877996445, "learning_rate": 2.3105406072574674e-05, "loss": 0.0009, "step": 2229 }, { "epoch": 1.6313094367227505, "grad_norm": 9.895618438720703, "learning_rate": 2.3093063441125648e-05, "loss": 0.2246, "step": 2230 }, { "epoch": 1.6320409656181418, "grad_norm": 4.728765964508057, "learning_rate": 2.3080720809676625e-05, "loss": 0.1038, "step": 2231 }, { "epoch": 1.6327724945135333, "grad_norm": 0.034923773258924484, "learning_rate": 2.3068378178227598e-05, "loss": 0.001, "step": 2232 }, { "epoch": 1.6335040234089246, "grad_norm": 1.3621269464492798, "learning_rate": 2.3056035546778575e-05, "loss": 0.004, "step": 2233 }, { "epoch": 1.634235552304316, "grad_norm": 0.018116099759936333, "learning_rate": 2.3043692915329548e-05, "loss": 0.0007, "step": 2234 }, { "epoch": 1.6349670811997075, "grad_norm": 6.6421098709106445, "learning_rate": 2.3031350283880525e-05, "loss": 0.0771, "step": 2235 }, { "epoch": 1.6356986100950988, "grad_norm": 0.24568428099155426, "learning_rate": 2.30190076524315e-05, "loss": 0.0033, "step": 2236 }, { "epoch": 1.63643013899049, "grad_norm": 0.0442315936088562, "learning_rate": 2.3006665020982472e-05, "loss": 0.0011, "step": 2237 }, { "epoch": 1.6371616678858816, "grad_norm": 0.11773396283388138, "learning_rate": 2.299432238953345e-05, "loss": 0.0019, "step": 2238 }, { "epoch": 1.6378931967812729, "grad_norm": 0.138062983751297, "learning_rate": 2.2981979758084422e-05, "loss": 0.0031, "step": 2239 }, { "epoch": 1.6386247256766642, "grad_norm": 20.49701690673828, "learning_rate": 2.29696371266354e-05, "loss": 0.1938, "step": 2240 }, { "epoch": 1.6393562545720557, "grad_norm": 13.783907890319824, "learning_rate": 2.2957294495186372e-05, "loss": 0.1131, "step": 2241 }, { "epoch": 1.640087783467447, "grad_norm": 0.09121634066104889, "learning_rate": 2.294495186373735e-05, "loss": 0.0019, "step": 2242 }, { "epoch": 1.6408193123628383, "grad_norm": 2.772355794906616, "learning_rate": 2.2932609232288322e-05, "loss": 0.4263, "step": 2243 }, { "epoch": 1.6415508412582298, "grad_norm": 0.11473455280065536, "learning_rate": 2.29202666008393e-05, "loss": 0.0023, "step": 2244 }, { "epoch": 1.642282370153621, "grad_norm": 22.820354461669922, "learning_rate": 2.2907923969390273e-05, "loss": 0.0709, "step": 2245 }, { "epoch": 1.6430138990490124, "grad_norm": 0.0684506744146347, "learning_rate": 2.289558133794125e-05, "loss": 0.0018, "step": 2246 }, { "epoch": 1.643745427944404, "grad_norm": 16.149044036865234, "learning_rate": 2.2883238706492223e-05, "loss": 0.0512, "step": 2247 }, { "epoch": 1.6444769568397952, "grad_norm": 0.07152494788169861, "learning_rate": 2.28708960750432e-05, "loss": 0.0018, "step": 2248 }, { "epoch": 1.6452084857351865, "grad_norm": 0.04818805307149887, "learning_rate": 2.2858553443594173e-05, "loss": 0.0016, "step": 2249 }, { "epoch": 1.645940014630578, "grad_norm": 12.248247146606445, "learning_rate": 2.284621081214515e-05, "loss": 0.0194, "step": 2250 }, { "epoch": 1.6466715435259691, "grad_norm": 0.0452595092356205, "learning_rate": 2.2833868180696123e-05, "loss": 0.0011, "step": 2251 }, { "epoch": 1.6474030724213606, "grad_norm": 0.035047635436058044, "learning_rate": 2.28215255492471e-05, "loss": 0.0012, "step": 2252 }, { "epoch": 1.6481346013167522, "grad_norm": 0.07714125514030457, "learning_rate": 2.2809182917798073e-05, "loss": 0.0014, "step": 2253 }, { "epoch": 1.6488661302121432, "grad_norm": 0.053574491292238235, "learning_rate": 2.279684028634905e-05, "loss": 0.0017, "step": 2254 }, { "epoch": 1.6495976591075348, "grad_norm": 0.06629578024148941, "learning_rate": 2.2784497654900024e-05, "loss": 0.0013, "step": 2255 }, { "epoch": 1.6503291880029263, "grad_norm": 0.2832648456096649, "learning_rate": 2.2772155023451e-05, "loss": 0.0027, "step": 2256 }, { "epoch": 1.6510607168983173, "grad_norm": 1.9250409603118896, "learning_rate": 2.2759812392001974e-05, "loss": 0.1292, "step": 2257 }, { "epoch": 1.6517922457937089, "grad_norm": 2.0994551181793213, "learning_rate": 2.274746976055295e-05, "loss": 0.007, "step": 2258 }, { "epoch": 1.6525237746891002, "grad_norm": 1.506807804107666, "learning_rate": 2.2735127129103924e-05, "loss": 0.0058, "step": 2259 }, { "epoch": 1.6532553035844915, "grad_norm": 2.5949926376342773, "learning_rate": 2.27227844976549e-05, "loss": 0.1987, "step": 2260 }, { "epoch": 1.653986832479883, "grad_norm": 0.04559239745140076, "learning_rate": 2.2710441866205874e-05, "loss": 0.0016, "step": 2261 }, { "epoch": 1.6547183613752743, "grad_norm": 0.2547034025192261, "learning_rate": 2.269809923475685e-05, "loss": 0.0019, "step": 2262 }, { "epoch": 1.6554498902706656, "grad_norm": 0.0764126181602478, "learning_rate": 2.2685756603307824e-05, "loss": 0.0016, "step": 2263 }, { "epoch": 1.656181419166057, "grad_norm": 0.04680866375565529, "learning_rate": 2.26734139718588e-05, "loss": 0.0015, "step": 2264 }, { "epoch": 1.6569129480614484, "grad_norm": 0.08688395470380783, "learning_rate": 2.2661071340409775e-05, "loss": 0.003, "step": 2265 }, { "epoch": 1.6576444769568397, "grad_norm": 1.9589481353759766, "learning_rate": 2.264872870896075e-05, "loss": 0.1021, "step": 2266 }, { "epoch": 1.6583760058522312, "grad_norm": 0.08699989318847656, "learning_rate": 2.2636386077511725e-05, "loss": 0.0022, "step": 2267 }, { "epoch": 1.6591075347476225, "grad_norm": 7.249971389770508, "learning_rate": 2.26240434460627e-05, "loss": 0.0975, "step": 2268 }, { "epoch": 1.6598390636430138, "grad_norm": 0.09586846083402634, "learning_rate": 2.2611700814613675e-05, "loss": 0.0027, "step": 2269 }, { "epoch": 1.6605705925384053, "grad_norm": 0.1377706676721573, "learning_rate": 2.2599358183164652e-05, "loss": 0.0033, "step": 2270 }, { "epoch": 1.6613021214337966, "grad_norm": 0.05991235747933388, "learning_rate": 2.2587015551715625e-05, "loss": 0.0015, "step": 2271 }, { "epoch": 1.662033650329188, "grad_norm": 0.12259474396705627, "learning_rate": 2.2574672920266602e-05, "loss": 0.0025, "step": 2272 }, { "epoch": 1.6627651792245794, "grad_norm": 0.3148459196090698, "learning_rate": 2.2562330288817575e-05, "loss": 0.004, "step": 2273 }, { "epoch": 1.6634967081199707, "grad_norm": 3.433805465698242, "learning_rate": 2.2549987657368552e-05, "loss": 0.15, "step": 2274 }, { "epoch": 1.664228237015362, "grad_norm": 0.14644554257392883, "learning_rate": 2.2537645025919526e-05, "loss": 0.0023, "step": 2275 }, { "epoch": 1.6649597659107536, "grad_norm": 0.1313914954662323, "learning_rate": 2.2525302394470502e-05, "loss": 0.0029, "step": 2276 }, { "epoch": 1.6656912948061449, "grad_norm": 0.09136944264173508, "learning_rate": 2.2512959763021476e-05, "loss": 0.0019, "step": 2277 }, { "epoch": 1.6664228237015362, "grad_norm": 0.1144004687666893, "learning_rate": 2.2500617131572453e-05, "loss": 0.0023, "step": 2278 }, { "epoch": 1.6671543525969277, "grad_norm": 0.052429310977458954, "learning_rate": 2.2488274500123426e-05, "loss": 0.0012, "step": 2279 }, { "epoch": 1.667885881492319, "grad_norm": 5.593842029571533, "learning_rate": 2.2475931868674403e-05, "loss": 0.0725, "step": 2280 }, { "epoch": 1.6686174103877103, "grad_norm": 0.04641842469573021, "learning_rate": 2.2463589237225376e-05, "loss": 0.001, "step": 2281 }, { "epoch": 1.6693489392831018, "grad_norm": 0.14323067665100098, "learning_rate": 2.2451246605776353e-05, "loss": 0.0015, "step": 2282 }, { "epoch": 1.670080468178493, "grad_norm": 0.07737108319997787, "learning_rate": 2.2438903974327326e-05, "loss": 0.0014, "step": 2283 }, { "epoch": 1.6708119970738844, "grad_norm": 0.017364829778671265, "learning_rate": 2.2426561342878303e-05, "loss": 0.0007, "step": 2284 }, { "epoch": 1.671543525969276, "grad_norm": 0.14144662022590637, "learning_rate": 2.2414218711429277e-05, "loss": 0.0023, "step": 2285 }, { "epoch": 1.6722750548646672, "grad_norm": 14.01285171508789, "learning_rate": 2.2401876079980253e-05, "loss": 0.0876, "step": 2286 }, { "epoch": 1.6730065837600585, "grad_norm": 0.1433812379837036, "learning_rate": 2.2389533448531227e-05, "loss": 0.0015, "step": 2287 }, { "epoch": 1.67373811265545, "grad_norm": 3.826963424682617, "learning_rate": 2.2377190817082204e-05, "loss": 0.1473, "step": 2288 }, { "epoch": 1.674469641550841, "grad_norm": 10.176545143127441, "learning_rate": 2.2364848185633177e-05, "loss": 0.1579, "step": 2289 }, { "epoch": 1.6752011704462326, "grad_norm": 0.09285955131053925, "learning_rate": 2.2352505554184154e-05, "loss": 0.0011, "step": 2290 }, { "epoch": 1.6759326993416241, "grad_norm": 0.21285665035247803, "learning_rate": 2.2340162922735127e-05, "loss": 0.0023, "step": 2291 }, { "epoch": 1.6766642282370152, "grad_norm": 1.556618332862854, "learning_rate": 2.2327820291286104e-05, "loss": 0.006, "step": 2292 }, { "epoch": 1.6773957571324067, "grad_norm": 0.01336937677115202, "learning_rate": 2.2315477659837077e-05, "loss": 0.0006, "step": 2293 }, { "epoch": 1.6781272860277983, "grad_norm": 0.01512098778039217, "learning_rate": 2.2303135028388054e-05, "loss": 0.0006, "step": 2294 }, { "epoch": 1.6788588149231893, "grad_norm": 0.025907419621944427, "learning_rate": 2.2290792396939028e-05, "loss": 0.0005, "step": 2295 }, { "epoch": 1.6795903438185809, "grad_norm": 0.07790995389223099, "learning_rate": 2.2278449765490004e-05, "loss": 0.0008, "step": 2296 }, { "epoch": 1.6803218727139722, "grad_norm": 3.764277219772339, "learning_rate": 2.2266107134040978e-05, "loss": 0.1311, "step": 2297 }, { "epoch": 1.6810534016093635, "grad_norm": 13.498695373535156, "learning_rate": 2.2253764502591955e-05, "loss": 0.0329, "step": 2298 }, { "epoch": 1.681784930504755, "grad_norm": 11.373310089111328, "learning_rate": 2.2241421871142928e-05, "loss": 0.0253, "step": 2299 }, { "epoch": 1.6825164594001463, "grad_norm": 0.2247302234172821, "learning_rate": 2.2229079239693905e-05, "loss": 0.0023, "step": 2300 }, { "epoch": 1.6832479882955376, "grad_norm": 7.610408782958984, "learning_rate": 2.2216736608244878e-05, "loss": 0.1719, "step": 2301 }, { "epoch": 1.683979517190929, "grad_norm": 0.39131617546081543, "learning_rate": 2.2204393976795855e-05, "loss": 0.0033, "step": 2302 }, { "epoch": 1.6847110460863204, "grad_norm": 0.14075259864330292, "learning_rate": 2.219205134534683e-05, "loss": 0.001, "step": 2303 }, { "epoch": 1.6854425749817117, "grad_norm": 0.08654505759477615, "learning_rate": 2.2179708713897805e-05, "loss": 0.0016, "step": 2304 }, { "epoch": 1.6861741038771032, "grad_norm": 0.08323052525520325, "learning_rate": 2.216736608244878e-05, "loss": 0.002, "step": 2305 }, { "epoch": 1.6869056327724945, "grad_norm": 4.487375259399414, "learning_rate": 2.2155023450999755e-05, "loss": 0.1363, "step": 2306 }, { "epoch": 1.6876371616678858, "grad_norm": 0.015972113236784935, "learning_rate": 2.214268081955073e-05, "loss": 0.0005, "step": 2307 }, { "epoch": 1.6883686905632773, "grad_norm": 0.28204819560050964, "learning_rate": 2.2130338188101706e-05, "loss": 0.0049, "step": 2308 }, { "epoch": 1.6891002194586686, "grad_norm": 0.11053875088691711, "learning_rate": 2.211799555665268e-05, "loss": 0.0012, "step": 2309 }, { "epoch": 1.68983174835406, "grad_norm": 0.26565560698509216, "learning_rate": 2.2105652925203656e-05, "loss": 0.0017, "step": 2310 }, { "epoch": 1.6905632772494514, "grad_norm": 4.371658802032471, "learning_rate": 2.209331029375463e-05, "loss": 0.0119, "step": 2311 }, { "epoch": 1.6912948061448427, "grad_norm": 0.11865457147359848, "learning_rate": 2.2080967662305606e-05, "loss": 0.0021, "step": 2312 }, { "epoch": 1.692026335040234, "grad_norm": 0.11048805713653564, "learning_rate": 2.206862503085658e-05, "loss": 0.0009, "step": 2313 }, { "epoch": 1.6927578639356256, "grad_norm": 6.746449947357178, "learning_rate": 2.2056282399407556e-05, "loss": 0.1453, "step": 2314 }, { "epoch": 1.6934893928310168, "grad_norm": 0.14679460227489471, "learning_rate": 2.204393976795853e-05, "loss": 0.002, "step": 2315 }, { "epoch": 1.6942209217264081, "grad_norm": 13.584836959838867, "learning_rate": 2.2031597136509506e-05, "loss": 0.1257, "step": 2316 }, { "epoch": 1.6949524506217997, "grad_norm": 1.4520162343978882, "learning_rate": 2.201925450506048e-05, "loss": 0.0056, "step": 2317 }, { "epoch": 1.695683979517191, "grad_norm": 0.08210594952106476, "learning_rate": 2.2006911873611456e-05, "loss": 0.001, "step": 2318 }, { "epoch": 1.6964155084125823, "grad_norm": 3.2946865558624268, "learning_rate": 2.199456924216243e-05, "loss": 0.1074, "step": 2319 }, { "epoch": 1.6971470373079738, "grad_norm": 9.211634635925293, "learning_rate": 2.1982226610713407e-05, "loss": 0.0349, "step": 2320 }, { "epoch": 1.697878566203365, "grad_norm": 0.01327104214578867, "learning_rate": 2.196988397926438e-05, "loss": 0.0005, "step": 2321 }, { "epoch": 1.6986100950987564, "grad_norm": 0.051337048411369324, "learning_rate": 2.1957541347815357e-05, "loss": 0.0009, "step": 2322 }, { "epoch": 1.699341623994148, "grad_norm": 2.0545194149017334, "learning_rate": 2.194519871636633e-05, "loss": 0.132, "step": 2323 }, { "epoch": 1.700073152889539, "grad_norm": 4.263407230377197, "learning_rate": 2.1932856084917304e-05, "loss": 0.0111, "step": 2324 }, { "epoch": 1.7008046817849305, "grad_norm": 0.2291024774312973, "learning_rate": 2.192051345346828e-05, "loss": 0.0047, "step": 2325 }, { "epoch": 1.701536210680322, "grad_norm": 0.09996841847896576, "learning_rate": 2.1908170822019254e-05, "loss": 0.0015, "step": 2326 }, { "epoch": 1.702267739575713, "grad_norm": 6.782588958740234, "learning_rate": 2.189582819057023e-05, "loss": 0.1951, "step": 2327 }, { "epoch": 1.7029992684711046, "grad_norm": 0.8093364238739014, "learning_rate": 2.1883485559121204e-05, "loss": 0.0078, "step": 2328 }, { "epoch": 1.7037307973664961, "grad_norm": 0.7757140398025513, "learning_rate": 2.187114292767218e-05, "loss": 0.0077, "step": 2329 }, { "epoch": 1.7044623262618872, "grad_norm": 0.20712971687316895, "learning_rate": 2.1858800296223154e-05, "loss": 0.0023, "step": 2330 }, { "epoch": 1.7051938551572787, "grad_norm": 0.036026157438755035, "learning_rate": 2.184645766477413e-05, "loss": 0.0007, "step": 2331 }, { "epoch": 1.7059253840526702, "grad_norm": 3.8834893703460693, "learning_rate": 2.1834115033325104e-05, "loss": 0.0986, "step": 2332 }, { "epoch": 1.7066569129480613, "grad_norm": 0.10873401165008545, "learning_rate": 2.182177240187608e-05, "loss": 0.002, "step": 2333 }, { "epoch": 1.7073884418434528, "grad_norm": 0.150182843208313, "learning_rate": 2.1809429770427055e-05, "loss": 0.002, "step": 2334 }, { "epoch": 1.7081199707388441, "grad_norm": 0.11591543257236481, "learning_rate": 2.179708713897803e-05, "loss": 0.0022, "step": 2335 }, { "epoch": 1.7088514996342354, "grad_norm": 13.240229606628418, "learning_rate": 2.1784744507529005e-05, "loss": 0.0513, "step": 2336 }, { "epoch": 1.709583028529627, "grad_norm": 3.624253034591675, "learning_rate": 2.177240187607998e-05, "loss": 0.2281, "step": 2337 }, { "epoch": 1.7103145574250183, "grad_norm": 0.13604213297367096, "learning_rate": 2.1760059244630955e-05, "loss": 0.0008, "step": 2338 }, { "epoch": 1.7110460863204096, "grad_norm": 7.290753364562988, "learning_rate": 2.1747716613181932e-05, "loss": 0.0173, "step": 2339 }, { "epoch": 1.711777615215801, "grad_norm": 1.935315728187561, "learning_rate": 2.1735373981732905e-05, "loss": 0.0063, "step": 2340 }, { "epoch": 1.7125091441111924, "grad_norm": 0.07988929003477097, "learning_rate": 2.1723031350283882e-05, "loss": 0.0011, "step": 2341 }, { "epoch": 1.7132406730065837, "grad_norm": 2.2615768909454346, "learning_rate": 2.1710688718834855e-05, "loss": 0.2377, "step": 2342 }, { "epoch": 1.7139722019019752, "grad_norm": 8.589245796203613, "learning_rate": 2.1698346087385832e-05, "loss": 0.0476, "step": 2343 }, { "epoch": 1.7147037307973665, "grad_norm": 3.0397090911865234, "learning_rate": 2.1686003455936806e-05, "loss": 0.1168, "step": 2344 }, { "epoch": 1.7154352596927578, "grad_norm": 0.2660904824733734, "learning_rate": 2.1673660824487782e-05, "loss": 0.0024, "step": 2345 }, { "epoch": 1.7161667885881493, "grad_norm": 0.04162253439426422, "learning_rate": 2.1661318193038756e-05, "loss": 0.0007, "step": 2346 }, { "epoch": 1.7168983174835406, "grad_norm": 0.3829459249973297, "learning_rate": 2.1648975561589733e-05, "loss": 0.0038, "step": 2347 }, { "epoch": 1.717629846378932, "grad_norm": 0.19045627117156982, "learning_rate": 2.1636632930140706e-05, "loss": 0.0014, "step": 2348 }, { "epoch": 1.7183613752743234, "grad_norm": 0.10053291916847229, "learning_rate": 2.1624290298691683e-05, "loss": 0.0023, "step": 2349 }, { "epoch": 1.7190929041697147, "grad_norm": 1.4116891622543335, "learning_rate": 2.1611947667242656e-05, "loss": 0.2238, "step": 2350 }, { "epoch": 1.719824433065106, "grad_norm": 0.18169376254081726, "learning_rate": 2.1599605035793633e-05, "loss": 0.0014, "step": 2351 }, { "epoch": 1.7205559619604975, "grad_norm": 6.29320764541626, "learning_rate": 2.1587262404344606e-05, "loss": 0.1064, "step": 2352 }, { "epoch": 1.7212874908558888, "grad_norm": 4.813685894012451, "learning_rate": 2.1574919772895583e-05, "loss": 0.3017, "step": 2353 }, { "epoch": 1.7220190197512801, "grad_norm": 0.2909710109233856, "learning_rate": 2.1562577141446557e-05, "loss": 0.006, "step": 2354 }, { "epoch": 1.7227505486466717, "grad_norm": 0.5953356623649597, "learning_rate": 2.1550234509997533e-05, "loss": 0.0075, "step": 2355 }, { "epoch": 1.723482077542063, "grad_norm": 0.2877521216869354, "learning_rate": 2.1537891878548507e-05, "loss": 0.0072, "step": 2356 }, { "epoch": 1.7242136064374542, "grad_norm": 0.18724781274795532, "learning_rate": 2.1525549247099484e-05, "loss": 0.0044, "step": 2357 }, { "epoch": 1.7249451353328458, "grad_norm": 1.222082495689392, "learning_rate": 2.1513206615650457e-05, "loss": 0.0083, "step": 2358 }, { "epoch": 1.725676664228237, "grad_norm": 1.777861475944519, "learning_rate": 2.1500863984201434e-05, "loss": 0.1764, "step": 2359 }, { "epoch": 1.7264081931236284, "grad_norm": 1.8463268280029297, "learning_rate": 2.1488521352752407e-05, "loss": 0.0145, "step": 2360 }, { "epoch": 1.7271397220190199, "grad_norm": 0.25989803671836853, "learning_rate": 2.1476178721303384e-05, "loss": 0.006, "step": 2361 }, { "epoch": 1.727871250914411, "grad_norm": 0.11300887167453766, "learning_rate": 2.1463836089854357e-05, "loss": 0.0029, "step": 2362 }, { "epoch": 1.7286027798098025, "grad_norm": 0.1524154543876648, "learning_rate": 2.1451493458405334e-05, "loss": 0.0041, "step": 2363 }, { "epoch": 1.729334308705194, "grad_norm": 0.42335274815559387, "learning_rate": 2.1439150826956308e-05, "loss": 0.0043, "step": 2364 }, { "epoch": 1.730065837600585, "grad_norm": 2.2801806926727295, "learning_rate": 2.1426808195507284e-05, "loss": 0.128, "step": 2365 }, { "epoch": 1.7307973664959766, "grad_norm": 0.16857869923114777, "learning_rate": 2.1414465564058258e-05, "loss": 0.0046, "step": 2366 }, { "epoch": 1.7315288953913681, "grad_norm": 0.12602409720420837, "learning_rate": 2.1402122932609235e-05, "loss": 0.0038, "step": 2367 }, { "epoch": 1.7322604242867592, "grad_norm": 0.5265858173370361, "learning_rate": 2.1389780301160208e-05, "loss": 0.0058, "step": 2368 }, { "epoch": 1.7329919531821507, "grad_norm": 0.2188519835472107, "learning_rate": 2.1377437669711185e-05, "loss": 0.0048, "step": 2369 }, { "epoch": 1.733723482077542, "grad_norm": 0.13245682418346405, "learning_rate": 2.1365095038262158e-05, "loss": 0.0033, "step": 2370 }, { "epoch": 1.7344550109729333, "grad_norm": 0.10940945148468018, "learning_rate": 2.1352752406813135e-05, "loss": 0.0027, "step": 2371 }, { "epoch": 1.7351865398683248, "grad_norm": 3.2020256519317627, "learning_rate": 2.134040977536411e-05, "loss": 0.095, "step": 2372 }, { "epoch": 1.7359180687637161, "grad_norm": 0.0659165158867836, "learning_rate": 2.1328067143915085e-05, "loss": 0.0018, "step": 2373 }, { "epoch": 1.7366495976591074, "grad_norm": 0.13518394529819489, "learning_rate": 2.131572451246606e-05, "loss": 0.0026, "step": 2374 }, { "epoch": 1.737381126554499, "grad_norm": 0.05081988498568535, "learning_rate": 2.1303381881017035e-05, "loss": 0.0017, "step": 2375 }, { "epoch": 1.7381126554498902, "grad_norm": 0.2533520758152008, "learning_rate": 2.129103924956801e-05, "loss": 0.0037, "step": 2376 }, { "epoch": 1.7388441843452815, "grad_norm": 0.04590132087469101, "learning_rate": 2.1278696618118986e-05, "loss": 0.0015, "step": 2377 }, { "epoch": 1.739575713240673, "grad_norm": 0.07273361831903458, "learning_rate": 2.126635398666996e-05, "loss": 0.0017, "step": 2378 }, { "epoch": 1.7403072421360644, "grad_norm": 0.05246380716562271, "learning_rate": 2.1254011355220936e-05, "loss": 0.0014, "step": 2379 }, { "epoch": 1.7410387710314557, "grad_norm": 0.11540526896715164, "learning_rate": 2.124166872377191e-05, "loss": 0.0021, "step": 2380 }, { "epoch": 1.7417702999268472, "grad_norm": 0.05148240923881531, "learning_rate": 2.1229326092322886e-05, "loss": 0.0013, "step": 2381 }, { "epoch": 1.7425018288222385, "grad_norm": 7.783102512359619, "learning_rate": 2.121698346087386e-05, "loss": 0.3076, "step": 2382 }, { "epoch": 1.7432333577176298, "grad_norm": 0.01984223537147045, "learning_rate": 2.1204640829424836e-05, "loss": 0.0009, "step": 2383 }, { "epoch": 1.7439648866130213, "grad_norm": 0.08271722495555878, "learning_rate": 2.119229819797581e-05, "loss": 0.0013, "step": 2384 }, { "epoch": 1.7446964155084126, "grad_norm": 0.13465842604637146, "learning_rate": 2.1179955566526786e-05, "loss": 0.0029, "step": 2385 }, { "epoch": 1.7454279444038039, "grad_norm": 10.96595287322998, "learning_rate": 2.116761293507776e-05, "loss": 0.011, "step": 2386 }, { "epoch": 1.7461594732991954, "grad_norm": 2.2041077613830566, "learning_rate": 2.1155270303628737e-05, "loss": 0.0849, "step": 2387 }, { "epoch": 1.7468910021945867, "grad_norm": 0.11844433099031448, "learning_rate": 2.114292767217971e-05, "loss": 0.0011, "step": 2388 }, { "epoch": 1.747622531089978, "grad_norm": 0.14837191998958588, "learning_rate": 2.1130585040730687e-05, "loss": 0.0027, "step": 2389 }, { "epoch": 1.7483540599853695, "grad_norm": 0.1275898665189743, "learning_rate": 2.111824240928166e-05, "loss": 0.0016, "step": 2390 }, { "epoch": 1.7490855888807608, "grad_norm": 3.533932685852051, "learning_rate": 2.1105899777832637e-05, "loss": 0.0896, "step": 2391 }, { "epoch": 1.7498171177761521, "grad_norm": 0.3357201814651489, "learning_rate": 2.109355714638361e-05, "loss": 0.0029, "step": 2392 }, { "epoch": 1.7505486466715436, "grad_norm": 0.021157125011086464, "learning_rate": 2.1081214514934587e-05, "loss": 0.0008, "step": 2393 }, { "epoch": 1.751280175566935, "grad_norm": 0.1538439691066742, "learning_rate": 2.106887188348556e-05, "loss": 0.0015, "step": 2394 }, { "epoch": 1.7520117044623262, "grad_norm": 6.8321404457092285, "learning_rate": 2.1056529252036537e-05, "loss": 0.0077, "step": 2395 }, { "epoch": 1.7527432333577178, "grad_norm": 0.12986694276332855, "learning_rate": 2.104418662058751e-05, "loss": 0.0019, "step": 2396 }, { "epoch": 1.7534747622531088, "grad_norm": 0.15291981399059296, "learning_rate": 2.1031843989138488e-05, "loss": 0.0012, "step": 2397 }, { "epoch": 1.7542062911485004, "grad_norm": 0.022247515618801117, "learning_rate": 2.101950135768946e-05, "loss": 0.0006, "step": 2398 }, { "epoch": 1.7549378200438919, "grad_norm": 0.10534368455410004, "learning_rate": 2.1007158726240438e-05, "loss": 0.0016, "step": 2399 }, { "epoch": 1.755669348939283, "grad_norm": 17.345226287841797, "learning_rate": 2.099481609479141e-05, "loss": 0.0639, "step": 2400 }, { "epoch": 1.7564008778346745, "grad_norm": 36.7955207824707, "learning_rate": 2.0982473463342388e-05, "loss": 0.1014, "step": 2401 }, { "epoch": 1.757132406730066, "grad_norm": 0.10442593693733215, "learning_rate": 2.097013083189336e-05, "loss": 0.0016, "step": 2402 }, { "epoch": 1.757863935625457, "grad_norm": 0.03405054286122322, "learning_rate": 2.0957788200444338e-05, "loss": 0.0007, "step": 2403 }, { "epoch": 1.7585954645208486, "grad_norm": 0.07728569954633713, "learning_rate": 2.094544556899531e-05, "loss": 0.0011, "step": 2404 }, { "epoch": 1.75932699341624, "grad_norm": 5.07532262802124, "learning_rate": 2.093310293754629e-05, "loss": 0.2319, "step": 2405 }, { "epoch": 1.7600585223116312, "grad_norm": 0.01771933026611805, "learning_rate": 2.0920760306097262e-05, "loss": 0.0007, "step": 2406 }, { "epoch": 1.7607900512070227, "grad_norm": 0.021156560629606247, "learning_rate": 2.090841767464824e-05, "loss": 0.0006, "step": 2407 }, { "epoch": 1.761521580102414, "grad_norm": 0.014005905948579311, "learning_rate": 2.0896075043199212e-05, "loss": 0.0005, "step": 2408 }, { "epoch": 1.7622531089978053, "grad_norm": 0.01852523908019066, "learning_rate": 2.088373241175019e-05, "loss": 0.0007, "step": 2409 }, { "epoch": 1.7629846378931968, "grad_norm": 0.15647262334823608, "learning_rate": 2.0871389780301162e-05, "loss": 0.0016, "step": 2410 }, { "epoch": 1.7637161667885881, "grad_norm": 28.650230407714844, "learning_rate": 2.0859047148852136e-05, "loss": 0.0524, "step": 2411 }, { "epoch": 1.7644476956839794, "grad_norm": 3.0595903396606445, "learning_rate": 2.0846704517403112e-05, "loss": 0.0065, "step": 2412 }, { "epoch": 1.765179224579371, "grad_norm": 0.1382024735212326, "learning_rate": 2.0834361885954086e-05, "loss": 0.0013, "step": 2413 }, { "epoch": 1.7659107534747622, "grad_norm": 0.01552814431488514, "learning_rate": 2.0822019254505062e-05, "loss": 0.0006, "step": 2414 }, { "epoch": 1.7666422823701535, "grad_norm": 27.015085220336914, "learning_rate": 2.0809676623056036e-05, "loss": 0.0804, "step": 2415 }, { "epoch": 1.767373811265545, "grad_norm": 0.013392698019742966, "learning_rate": 2.0797333991607013e-05, "loss": 0.0006, "step": 2416 }, { "epoch": 1.7681053401609363, "grad_norm": 0.027036651968955994, "learning_rate": 2.0784991360157986e-05, "loss": 0.0007, "step": 2417 }, { "epoch": 1.7688368690563276, "grad_norm": 0.016258254647254944, "learning_rate": 2.0772648728708963e-05, "loss": 0.0007, "step": 2418 }, { "epoch": 1.7695683979517192, "grad_norm": 0.025817571207880974, "learning_rate": 2.0760306097259936e-05, "loss": 0.0007, "step": 2419 }, { "epoch": 1.7702999268471105, "grad_norm": 3.588778018951416, "learning_rate": 2.0747963465810913e-05, "loss": 0.209, "step": 2420 }, { "epoch": 1.7710314557425018, "grad_norm": 0.11585452407598495, "learning_rate": 2.0735620834361886e-05, "loss": 0.0013, "step": 2421 }, { "epoch": 1.7717629846378933, "grad_norm": 0.012264923192560673, "learning_rate": 2.0723278202912863e-05, "loss": 0.0005, "step": 2422 }, { "epoch": 1.7724945135332846, "grad_norm": 0.1716909259557724, "learning_rate": 2.0710935571463837e-05, "loss": 0.0015, "step": 2423 }, { "epoch": 1.7732260424286759, "grad_norm": 19.62703514099121, "learning_rate": 2.0698592940014813e-05, "loss": 0.0782, "step": 2424 }, { "epoch": 1.7739575713240674, "grad_norm": 0.02360418252646923, "learning_rate": 2.0686250308565787e-05, "loss": 0.0007, "step": 2425 }, { "epoch": 1.7746891002194587, "grad_norm": 5.396293640136719, "learning_rate": 2.0673907677116764e-05, "loss": 0.1395, "step": 2426 }, { "epoch": 1.77542062911485, "grad_norm": 6.141932010650635, "learning_rate": 2.0661565045667737e-05, "loss": 0.0212, "step": 2427 }, { "epoch": 1.7761521580102415, "grad_norm": 0.41874566674232483, "learning_rate": 2.0649222414218714e-05, "loss": 0.0019, "step": 2428 }, { "epoch": 1.7768836869056328, "grad_norm": 0.015757646411657333, "learning_rate": 2.0636879782769687e-05, "loss": 0.0007, "step": 2429 }, { "epoch": 1.777615215801024, "grad_norm": 0.02668982930481434, "learning_rate": 2.0624537151320664e-05, "loss": 0.0008, "step": 2430 }, { "epoch": 1.7783467446964156, "grad_norm": 0.47453370690345764, "learning_rate": 2.0612194519871637e-05, "loss": 0.0031, "step": 2431 }, { "epoch": 1.779078273591807, "grad_norm": 3.883572816848755, "learning_rate": 2.0599851888422614e-05, "loss": 0.0149, "step": 2432 }, { "epoch": 1.7798098024871982, "grad_norm": 1.5277211666107178, "learning_rate": 2.0587509256973588e-05, "loss": 0.2446, "step": 2433 }, { "epoch": 1.7805413313825897, "grad_norm": 0.12637507915496826, "learning_rate": 2.0575166625524564e-05, "loss": 0.001, "step": 2434 }, { "epoch": 1.7812728602779808, "grad_norm": 0.06384716928005219, "learning_rate": 2.0562823994075538e-05, "loss": 0.0011, "step": 2435 }, { "epoch": 1.7820043891733723, "grad_norm": 0.7974087595939636, "learning_rate": 2.0550481362626515e-05, "loss": 0.0026, "step": 2436 }, { "epoch": 1.7827359180687639, "grad_norm": 0.28784728050231934, "learning_rate": 2.0538138731177488e-05, "loss": 0.0023, "step": 2437 }, { "epoch": 1.783467446964155, "grad_norm": 8.794281005859375, "learning_rate": 2.0525796099728465e-05, "loss": 0.1495, "step": 2438 }, { "epoch": 1.7841989758595465, "grad_norm": 3.7402029037475586, "learning_rate": 2.0513453468279438e-05, "loss": 0.2148, "step": 2439 }, { "epoch": 1.784930504754938, "grad_norm": 0.08829863369464874, "learning_rate": 2.0501110836830415e-05, "loss": 0.0021, "step": 2440 }, { "epoch": 1.785662033650329, "grad_norm": 0.07885099947452545, "learning_rate": 2.048876820538139e-05, "loss": 0.0013, "step": 2441 }, { "epoch": 1.7863935625457206, "grad_norm": 0.720705509185791, "learning_rate": 2.0476425573932365e-05, "loss": 0.0037, "step": 2442 }, { "epoch": 1.787125091441112, "grad_norm": 1.6815015077590942, "learning_rate": 2.046408294248334e-05, "loss": 0.0041, "step": 2443 }, { "epoch": 1.7878566203365032, "grad_norm": 0.11127791553735733, "learning_rate": 2.0451740311034315e-05, "loss": 0.0015, "step": 2444 }, { "epoch": 1.7885881492318947, "grad_norm": 0.2930208444595337, "learning_rate": 2.043939767958529e-05, "loss": 0.0034, "step": 2445 }, { "epoch": 1.789319678127286, "grad_norm": 0.44455498456954956, "learning_rate": 2.0427055048136266e-05, "loss": 0.0036, "step": 2446 }, { "epoch": 1.7900512070226773, "grad_norm": 0.12104513496160507, "learning_rate": 2.041471241668724e-05, "loss": 0.002, "step": 2447 }, { "epoch": 1.7907827359180688, "grad_norm": 0.1595410257577896, "learning_rate": 2.0402369785238216e-05, "loss": 0.0013, "step": 2448 }, { "epoch": 1.79151426481346, "grad_norm": 0.033234603703022, "learning_rate": 2.039002715378919e-05, "loss": 0.0013, "step": 2449 }, { "epoch": 1.7922457937088514, "grad_norm": 23.20171546936035, "learning_rate": 2.0377684522340166e-05, "loss": 0.2221, "step": 2450 }, { "epoch": 1.792977322604243, "grad_norm": 0.07747198641300201, "learning_rate": 2.036534189089114e-05, "loss": 0.0016, "step": 2451 }, { "epoch": 1.7937088514996342, "grad_norm": 0.027267921715974808, "learning_rate": 2.0352999259442116e-05, "loss": 0.0011, "step": 2452 }, { "epoch": 1.7944403803950255, "grad_norm": 0.08308949321508408, "learning_rate": 2.034065662799309e-05, "loss": 0.0015, "step": 2453 }, { "epoch": 1.795171909290417, "grad_norm": 1.6515629291534424, "learning_rate": 2.0328313996544066e-05, "loss": 0.064, "step": 2454 }, { "epoch": 1.7959034381858083, "grad_norm": 0.04160116985440254, "learning_rate": 2.031597136509504e-05, "loss": 0.0013, "step": 2455 }, { "epoch": 1.7966349670811996, "grad_norm": 0.06489795446395874, "learning_rate": 2.0303628733646013e-05, "loss": 0.0015, "step": 2456 }, { "epoch": 1.7973664959765911, "grad_norm": 0.024168439209461212, "learning_rate": 2.0291286102196987e-05, "loss": 0.0009, "step": 2457 }, { "epoch": 1.7980980248719824, "grad_norm": 2.903688430786133, "learning_rate": 2.0278943470747963e-05, "loss": 0.2026, "step": 2458 }, { "epoch": 1.7988295537673737, "grad_norm": 2.390001058578491, "learning_rate": 2.0266600839298937e-05, "loss": 0.0052, "step": 2459 }, { "epoch": 1.7995610826627653, "grad_norm": 0.3007499575614929, "learning_rate": 2.0254258207849914e-05, "loss": 0.0025, "step": 2460 }, { "epoch": 1.8002926115581566, "grad_norm": 0.2605971097946167, "learning_rate": 2.0241915576400887e-05, "loss": 0.0029, "step": 2461 }, { "epoch": 1.8010241404535479, "grad_norm": 0.26053935289382935, "learning_rate": 2.0229572944951864e-05, "loss": 0.0019, "step": 2462 }, { "epoch": 1.8017556693489394, "grad_norm": 6.9659318923950195, "learning_rate": 2.0217230313502837e-05, "loss": 0.1888, "step": 2463 }, { "epoch": 1.8024871982443307, "grad_norm": 0.25490379333496094, "learning_rate": 2.0204887682053814e-05, "loss": 0.0023, "step": 2464 }, { "epoch": 1.803218727139722, "grad_norm": 0.13503926992416382, "learning_rate": 2.0192545050604787e-05, "loss": 0.0019, "step": 2465 }, { "epoch": 1.8039502560351135, "grad_norm": 0.23069633543491364, "learning_rate": 2.0180202419155764e-05, "loss": 0.0033, "step": 2466 }, { "epoch": 1.8046817849305048, "grad_norm": 0.12999579310417175, "learning_rate": 2.0167859787706738e-05, "loss": 0.002, "step": 2467 }, { "epoch": 1.805413313825896, "grad_norm": 0.27508044242858887, "learning_rate": 2.0155517156257714e-05, "loss": 0.0023, "step": 2468 }, { "epoch": 1.8061448427212876, "grad_norm": 4.581474304199219, "learning_rate": 2.0143174524808688e-05, "loss": 0.0076, "step": 2469 }, { "epoch": 1.806876371616679, "grad_norm": 0.03829294443130493, "learning_rate": 2.0130831893359665e-05, "loss": 0.0013, "step": 2470 }, { "epoch": 1.8076079005120702, "grad_norm": 19.690839767456055, "learning_rate": 2.0118489261910638e-05, "loss": 0.0361, "step": 2471 }, { "epoch": 1.8083394294074617, "grad_norm": 0.03386066481471062, "learning_rate": 2.0106146630461615e-05, "loss": 0.001, "step": 2472 }, { "epoch": 1.8090709583028528, "grad_norm": 1.8203538656234741, "learning_rate": 2.0093803999012588e-05, "loss": 0.2173, "step": 2473 }, { "epoch": 1.8098024871982443, "grad_norm": 0.07978606224060059, "learning_rate": 2.0081461367563565e-05, "loss": 0.0018, "step": 2474 }, { "epoch": 1.8105340160936358, "grad_norm": 2.9346745014190674, "learning_rate": 2.006911873611454e-05, "loss": 0.1988, "step": 2475 }, { "epoch": 1.811265544989027, "grad_norm": 0.027776435017585754, "learning_rate": 2.0056776104665515e-05, "loss": 0.0011, "step": 2476 }, { "epoch": 1.8119970738844184, "grad_norm": 5.142239093780518, "learning_rate": 2.004443347321649e-05, "loss": 0.2807, "step": 2477 }, { "epoch": 1.81272860277981, "grad_norm": 0.14309969544410706, "learning_rate": 2.0032090841767465e-05, "loss": 0.002, "step": 2478 }, { "epoch": 1.813460131675201, "grad_norm": 4.966380596160889, "learning_rate": 2.001974821031844e-05, "loss": 0.1818, "step": 2479 }, { "epoch": 1.8141916605705926, "grad_norm": 51.828277587890625, "learning_rate": 2.0007405578869416e-05, "loss": 0.0616, "step": 2480 }, { "epoch": 1.8149231894659839, "grad_norm": 26.612857818603516, "learning_rate": 1.999506294742039e-05, "loss": 0.1787, "step": 2481 }, { "epoch": 1.8156547183613752, "grad_norm": 0.16203776001930237, "learning_rate": 1.9982720315971366e-05, "loss": 0.0027, "step": 2482 }, { "epoch": 1.8163862472567667, "grad_norm": 1.467050552368164, "learning_rate": 1.997037768452234e-05, "loss": 0.0052, "step": 2483 }, { "epoch": 1.817117776152158, "grad_norm": 0.37250280380249023, "learning_rate": 1.9958035053073316e-05, "loss": 0.0057, "step": 2484 }, { "epoch": 1.8178493050475493, "grad_norm": 0.04357880353927612, "learning_rate": 1.994569242162429e-05, "loss": 0.0017, "step": 2485 }, { "epoch": 1.8185808339429408, "grad_norm": 0.06878070533275604, "learning_rate": 1.9933349790175266e-05, "loss": 0.0024, "step": 2486 }, { "epoch": 1.819312362838332, "grad_norm": 3.123441696166992, "learning_rate": 1.992100715872624e-05, "loss": 0.2061, "step": 2487 }, { "epoch": 1.8200438917337234, "grad_norm": 0.08753643184900284, "learning_rate": 1.9908664527277216e-05, "loss": 0.0025, "step": 2488 }, { "epoch": 1.820775420629115, "grad_norm": 0.0715978816151619, "learning_rate": 1.989632189582819e-05, "loss": 0.0026, "step": 2489 }, { "epoch": 1.8215069495245062, "grad_norm": 4.100738048553467, "learning_rate": 1.9883979264379167e-05, "loss": 0.1261, "step": 2490 }, { "epoch": 1.8222384784198975, "grad_norm": 5.160264015197754, "learning_rate": 1.987163663293014e-05, "loss": 0.0861, "step": 2491 }, { "epoch": 1.822970007315289, "grad_norm": 0.2204744666814804, "learning_rate": 1.9859294001481117e-05, "loss": 0.0049, "step": 2492 }, { "epoch": 1.8237015362106803, "grad_norm": 0.10323380678892136, "learning_rate": 1.984695137003209e-05, "loss": 0.0038, "step": 2493 }, { "epoch": 1.8244330651060716, "grad_norm": 0.13321919739246368, "learning_rate": 1.9834608738583067e-05, "loss": 0.004, "step": 2494 }, { "epoch": 1.8251645940014631, "grad_norm": 12.85378646850586, "learning_rate": 1.982226610713404e-05, "loss": 0.0759, "step": 2495 }, { "epoch": 1.8258961228968544, "grad_norm": 1.4340112209320068, "learning_rate": 1.9809923475685017e-05, "loss": 0.0065, "step": 2496 }, { "epoch": 1.8266276517922457, "grad_norm": 3.2953367233276367, "learning_rate": 1.979758084423599e-05, "loss": 0.0801, "step": 2497 }, { "epoch": 1.8273591806876373, "grad_norm": 0.1395678073167801, "learning_rate": 1.9785238212786967e-05, "loss": 0.0041, "step": 2498 }, { "epoch": 1.8280907095830286, "grad_norm": 0.1289362609386444, "learning_rate": 1.977289558133794e-05, "loss": 0.0034, "step": 2499 }, { "epoch": 1.8288222384784198, "grad_norm": 0.2011074274778366, "learning_rate": 1.9760552949888918e-05, "loss": 0.0055, "step": 2500 }, { "epoch": 1.8295537673738114, "grad_norm": 17.362144470214844, "learning_rate": 1.974821031843989e-05, "loss": 0.1081, "step": 2501 }, { "epoch": 1.8302852962692027, "grad_norm": 0.2194979339838028, "learning_rate": 1.9735867686990868e-05, "loss": 0.0044, "step": 2502 }, { "epoch": 1.831016825164594, "grad_norm": 3.3290793895721436, "learning_rate": 1.972352505554184e-05, "loss": 0.0921, "step": 2503 }, { "epoch": 1.8317483540599855, "grad_norm": 0.1328219175338745, "learning_rate": 1.9711182424092818e-05, "loss": 0.0028, "step": 2504 }, { "epoch": 1.8324798829553768, "grad_norm": 0.08437929302453995, "learning_rate": 1.969883979264379e-05, "loss": 0.0023, "step": 2505 }, { "epoch": 1.833211411850768, "grad_norm": 0.16009491682052612, "learning_rate": 1.9686497161194768e-05, "loss": 0.0032, "step": 2506 }, { "epoch": 1.8339429407461596, "grad_norm": 4.491008758544922, "learning_rate": 1.967415452974574e-05, "loss": 0.178, "step": 2507 }, { "epoch": 1.8346744696415507, "grad_norm": 3.7169528007507324, "learning_rate": 1.9661811898296718e-05, "loss": 0.0985, "step": 2508 }, { "epoch": 1.8354059985369422, "grad_norm": 0.09280610829591751, "learning_rate": 1.9649469266847692e-05, "loss": 0.0017, "step": 2509 }, { "epoch": 1.8361375274323337, "grad_norm": 0.3636914789676666, "learning_rate": 1.963712663539867e-05, "loss": 0.0043, "step": 2510 }, { "epoch": 1.8368690563277248, "grad_norm": 0.13578708469867706, "learning_rate": 1.9624784003949642e-05, "loss": 0.0025, "step": 2511 }, { "epoch": 1.8376005852231163, "grad_norm": 0.19403411448001862, "learning_rate": 1.961244137250062e-05, "loss": 0.0024, "step": 2512 }, { "epoch": 1.8383321141185078, "grad_norm": 0.1997678130865097, "learning_rate": 1.9600098741051592e-05, "loss": 0.0029, "step": 2513 }, { "epoch": 1.839063643013899, "grad_norm": 0.13161225616931915, "learning_rate": 1.958775610960257e-05, "loss": 0.0021, "step": 2514 }, { "epoch": 1.8397951719092904, "grad_norm": 3.404381513595581, "learning_rate": 1.9575413478153542e-05, "loss": 0.01, "step": 2515 }, { "epoch": 1.840526700804682, "grad_norm": 0.13564403355121613, "learning_rate": 1.956307084670452e-05, "loss": 0.0017, "step": 2516 }, { "epoch": 1.841258229700073, "grad_norm": 36.41548156738281, "learning_rate": 1.9550728215255492e-05, "loss": 0.1194, "step": 2517 }, { "epoch": 1.8419897585954645, "grad_norm": 0.061731234192848206, "learning_rate": 1.953838558380647e-05, "loss": 0.0013, "step": 2518 }, { "epoch": 1.8427212874908558, "grad_norm": 0.34271761775016785, "learning_rate": 1.9526042952357443e-05, "loss": 0.0032, "step": 2519 }, { "epoch": 1.8434528163862471, "grad_norm": 0.02119174413383007, "learning_rate": 1.951370032090842e-05, "loss": 0.0009, "step": 2520 }, { "epoch": 1.8441843452816387, "grad_norm": 0.020579583942890167, "learning_rate": 1.9501357689459393e-05, "loss": 0.0008, "step": 2521 }, { "epoch": 1.84491587417703, "grad_norm": 6.814996719360352, "learning_rate": 1.948901505801037e-05, "loss": 0.1876, "step": 2522 }, { "epoch": 1.8456474030724213, "grad_norm": 0.06264827400445938, "learning_rate": 1.9476672426561343e-05, "loss": 0.0016, "step": 2523 }, { "epoch": 1.8463789319678128, "grad_norm": 0.02351187728345394, "learning_rate": 1.946432979511232e-05, "loss": 0.0009, "step": 2524 }, { "epoch": 1.847110460863204, "grad_norm": 0.24239660799503326, "learning_rate": 1.9451987163663293e-05, "loss": 0.0016, "step": 2525 }, { "epoch": 1.8478419897585954, "grad_norm": 0.02069927006959915, "learning_rate": 1.943964453221427e-05, "loss": 0.0008, "step": 2526 }, { "epoch": 1.848573518653987, "grad_norm": 0.2952725887298584, "learning_rate": 1.9427301900765243e-05, "loss": 0.0024, "step": 2527 }, { "epoch": 1.8493050475493782, "grad_norm": 0.023910703137516975, "learning_rate": 1.9414959269316217e-05, "loss": 0.001, "step": 2528 }, { "epoch": 1.8500365764447695, "grad_norm": 0.020827772095799446, "learning_rate": 1.9402616637867194e-05, "loss": 0.0008, "step": 2529 }, { "epoch": 1.850768105340161, "grad_norm": 0.5139136910438538, "learning_rate": 1.9390274006418167e-05, "loss": 0.0027, "step": 2530 }, { "epoch": 1.8514996342355523, "grad_norm": 7.0299072265625, "learning_rate": 1.9377931374969144e-05, "loss": 0.1283, "step": 2531 }, { "epoch": 1.8522311631309436, "grad_norm": 0.12572336196899414, "learning_rate": 1.9365588743520117e-05, "loss": 0.0012, "step": 2532 }, { "epoch": 1.8529626920263351, "grad_norm": 9.254599571228027, "learning_rate": 1.9353246112071094e-05, "loss": 0.1679, "step": 2533 }, { "epoch": 1.8536942209217264, "grad_norm": 8.129678726196289, "learning_rate": 1.9340903480622067e-05, "loss": 0.0119, "step": 2534 }, { "epoch": 1.8544257498171177, "grad_norm": 0.23846332728862762, "learning_rate": 1.9328560849173044e-05, "loss": 0.0014, "step": 2535 }, { "epoch": 1.8551572787125092, "grad_norm": 0.02896236814558506, "learning_rate": 1.9316218217724018e-05, "loss": 0.0008, "step": 2536 }, { "epoch": 1.8558888076079005, "grad_norm": 28.919944763183594, "learning_rate": 1.9303875586274994e-05, "loss": 0.068, "step": 2537 }, { "epoch": 1.8566203365032918, "grad_norm": 0.015373148955404758, "learning_rate": 1.9291532954825968e-05, "loss": 0.0007, "step": 2538 }, { "epoch": 1.8573518653986834, "grad_norm": 0.018723851069808006, "learning_rate": 1.9279190323376945e-05, "loss": 0.0008, "step": 2539 }, { "epoch": 1.8580833942940747, "grad_norm": 7.595161437988281, "learning_rate": 1.9266847691927918e-05, "loss": 0.2104, "step": 2540 }, { "epoch": 1.858814923189466, "grad_norm": 0.03407429903745651, "learning_rate": 1.9254505060478895e-05, "loss": 0.0008, "step": 2541 }, { "epoch": 1.8595464520848575, "grad_norm": 1.698326587677002, "learning_rate": 1.9242162429029868e-05, "loss": 0.227, "step": 2542 }, { "epoch": 1.8602779809802488, "grad_norm": 5.355527400970459, "learning_rate": 1.9229819797580845e-05, "loss": 0.1851, "step": 2543 }, { "epoch": 1.86100950987564, "grad_norm": 4.304741382598877, "learning_rate": 1.921747716613182e-05, "loss": 0.0726, "step": 2544 }, { "epoch": 1.8617410387710316, "grad_norm": 8.781900405883789, "learning_rate": 1.9205134534682795e-05, "loss": 0.0162, "step": 2545 }, { "epoch": 1.8624725676664227, "grad_norm": 0.02100248634815216, "learning_rate": 1.919279190323377e-05, "loss": 0.0008, "step": 2546 }, { "epoch": 1.8632040965618142, "grad_norm": 0.030219977721571922, "learning_rate": 1.9180449271784745e-05, "loss": 0.001, "step": 2547 }, { "epoch": 1.8639356254572057, "grad_norm": 0.031416185200214386, "learning_rate": 1.916810664033572e-05, "loss": 0.0011, "step": 2548 }, { "epoch": 1.8646671543525968, "grad_norm": 12.306939125061035, "learning_rate": 1.9155764008886696e-05, "loss": 0.1507, "step": 2549 }, { "epoch": 1.8653986832479883, "grad_norm": 10.565922737121582, "learning_rate": 1.914342137743767e-05, "loss": 0.1013, "step": 2550 }, { "epoch": 1.8661302121433798, "grad_norm": 0.15429005026817322, "learning_rate": 1.9131078745988646e-05, "loss": 0.0026, "step": 2551 }, { "epoch": 1.866861741038771, "grad_norm": 0.578327476978302, "learning_rate": 1.911873611453962e-05, "loss": 0.0058, "step": 2552 }, { "epoch": 1.8675932699341624, "grad_norm": 0.4360547363758087, "learning_rate": 1.9106393483090596e-05, "loss": 0.0034, "step": 2553 }, { "epoch": 1.8683247988295537, "grad_norm": 0.8424513339996338, "learning_rate": 1.909405085164157e-05, "loss": 0.0044, "step": 2554 }, { "epoch": 1.869056327724945, "grad_norm": 1.7115949392318726, "learning_rate": 1.9081708220192546e-05, "loss": 0.0044, "step": 2555 }, { "epoch": 1.8697878566203365, "grad_norm": 0.0995204970240593, "learning_rate": 1.906936558874352e-05, "loss": 0.0018, "step": 2556 }, { "epoch": 1.8705193855157278, "grad_norm": 0.2038443237543106, "learning_rate": 1.9057022957294496e-05, "loss": 0.0029, "step": 2557 }, { "epoch": 1.8712509144111191, "grad_norm": 0.49115920066833496, "learning_rate": 1.904468032584547e-05, "loss": 0.0051, "step": 2558 }, { "epoch": 1.8719824433065106, "grad_norm": 0.14648817479610443, "learning_rate": 1.9032337694396447e-05, "loss": 0.0025, "step": 2559 }, { "epoch": 1.872713972201902, "grad_norm": 0.17543631792068481, "learning_rate": 1.901999506294742e-05, "loss": 0.0021, "step": 2560 }, { "epoch": 1.8734455010972932, "grad_norm": 0.5516785383224487, "learning_rate": 1.9007652431498397e-05, "loss": 0.0043, "step": 2561 }, { "epoch": 1.8741770299926848, "grad_norm": 0.06248515844345093, "learning_rate": 1.899530980004937e-05, "loss": 0.0011, "step": 2562 }, { "epoch": 1.874908558888076, "grad_norm": 0.028036508709192276, "learning_rate": 1.8982967168600347e-05, "loss": 0.001, "step": 2563 }, { "epoch": 1.8756400877834674, "grad_norm": 0.21057982742786407, "learning_rate": 1.897062453715132e-05, "loss": 0.0025, "step": 2564 }, { "epoch": 1.8763716166788589, "grad_norm": 0.03410279378294945, "learning_rate": 1.8958281905702297e-05, "loss": 0.0009, "step": 2565 }, { "epoch": 1.8771031455742502, "grad_norm": 0.026795854791998863, "learning_rate": 1.894593927425327e-05, "loss": 0.0009, "step": 2566 }, { "epoch": 1.8778346744696415, "grad_norm": 0.023115502670407295, "learning_rate": 1.8933596642804247e-05, "loss": 0.0009, "step": 2567 }, { "epoch": 1.878566203365033, "grad_norm": 21.484134674072266, "learning_rate": 1.892125401135522e-05, "loss": 0.2379, "step": 2568 }, { "epoch": 1.8792977322604243, "grad_norm": 0.08459746837615967, "learning_rate": 1.8908911379906198e-05, "loss": 0.0011, "step": 2569 }, { "epoch": 1.8800292611558156, "grad_norm": 0.19181212782859802, "learning_rate": 1.889656874845717e-05, "loss": 0.0011, "step": 2570 }, { "epoch": 1.880760790051207, "grad_norm": 0.030497752130031586, "learning_rate": 1.8884226117008148e-05, "loss": 0.0009, "step": 2571 }, { "epoch": 1.8814923189465984, "grad_norm": 4.278834342956543, "learning_rate": 1.887188348555912e-05, "loss": 0.1741, "step": 2572 }, { "epoch": 1.8822238478419897, "grad_norm": 0.22166159749031067, "learning_rate": 1.8859540854110098e-05, "loss": 0.0012, "step": 2573 }, { "epoch": 1.8829553767373812, "grad_norm": 0.023537660017609596, "learning_rate": 1.884719822266107e-05, "loss": 0.0008, "step": 2574 }, { "epoch": 1.8836869056327725, "grad_norm": 8.258960723876953, "learning_rate": 1.8834855591212048e-05, "loss": 0.1054, "step": 2575 }, { "epoch": 1.8844184345281638, "grad_norm": 0.02186509221792221, "learning_rate": 1.882251295976302e-05, "loss": 0.0008, "step": 2576 }, { "epoch": 1.8851499634235553, "grad_norm": 0.0193721242249012, "learning_rate": 1.8810170328314e-05, "loss": 0.0007, "step": 2577 }, { "epoch": 1.8858814923189466, "grad_norm": 0.021835656836628914, "learning_rate": 1.8797827696864972e-05, "loss": 0.0007, "step": 2578 }, { "epoch": 1.886613021214338, "grad_norm": 0.09610843658447266, "learning_rate": 1.878548506541595e-05, "loss": 0.0011, "step": 2579 }, { "epoch": 1.8873445501097295, "grad_norm": 0.041423946619033813, "learning_rate": 1.8773142433966922e-05, "loss": 0.001, "step": 2580 }, { "epoch": 1.8880760790051208, "grad_norm": 0.018519727513194084, "learning_rate": 1.87607998025179e-05, "loss": 0.0006, "step": 2581 }, { "epoch": 1.888807607900512, "grad_norm": 2.0970866680145264, "learning_rate": 1.8748457171068872e-05, "loss": 0.203, "step": 2582 }, { "epoch": 1.8895391367959036, "grad_norm": 0.03156367316842079, "learning_rate": 1.873611453961985e-05, "loss": 0.0008, "step": 2583 }, { "epoch": 1.8902706656912946, "grad_norm": 0.04034655541181564, "learning_rate": 1.8723771908170822e-05, "loss": 0.001, "step": 2584 }, { "epoch": 1.8910021945866862, "grad_norm": 0.026097426190972328, "learning_rate": 1.87114292767218e-05, "loss": 0.0008, "step": 2585 }, { "epoch": 1.8917337234820777, "grad_norm": 1.6389245986938477, "learning_rate": 1.8699086645272773e-05, "loss": 0.009, "step": 2586 }, { "epoch": 1.8924652523774688, "grad_norm": 0.03902347385883331, "learning_rate": 1.868674401382375e-05, "loss": 0.0009, "step": 2587 }, { "epoch": 1.8931967812728603, "grad_norm": 19.462390899658203, "learning_rate": 1.8674401382374723e-05, "loss": 0.0227, "step": 2588 }, { "epoch": 1.8939283101682518, "grad_norm": 1.259191870689392, "learning_rate": 1.86620587509257e-05, "loss": 0.2456, "step": 2589 }, { "epoch": 1.8946598390636429, "grad_norm": 0.08909304440021515, "learning_rate": 1.8649716119476673e-05, "loss": 0.0013, "step": 2590 }, { "epoch": 1.8953913679590344, "grad_norm": 15.46009635925293, "learning_rate": 1.863737348802765e-05, "loss": 0.1306, "step": 2591 }, { "epoch": 1.8961228968544257, "grad_norm": 0.3969930112361908, "learning_rate": 1.8625030856578623e-05, "loss": 0.0023, "step": 2592 }, { "epoch": 1.896854425749817, "grad_norm": 0.14260287582874298, "learning_rate": 1.86126882251296e-05, "loss": 0.0021, "step": 2593 }, { "epoch": 1.8975859546452085, "grad_norm": 0.0663200244307518, "learning_rate": 1.8600345593680573e-05, "loss": 0.0014, "step": 2594 }, { "epoch": 1.8983174835405998, "grad_norm": 0.11273681372404099, "learning_rate": 1.858800296223155e-05, "loss": 0.0015, "step": 2595 }, { "epoch": 1.8990490124359911, "grad_norm": 0.0330280140042305, "learning_rate": 1.8575660330782524e-05, "loss": 0.0011, "step": 2596 }, { "epoch": 1.8997805413313826, "grad_norm": 0.19789855182170868, "learning_rate": 1.85633176993335e-05, "loss": 0.0019, "step": 2597 }, { "epoch": 1.900512070226774, "grad_norm": 9.655193328857422, "learning_rate": 1.8550975067884474e-05, "loss": 0.1101, "step": 2598 }, { "epoch": 1.9012435991221652, "grad_norm": 0.16055859625339508, "learning_rate": 1.853863243643545e-05, "loss": 0.0019, "step": 2599 }, { "epoch": 1.9019751280175567, "grad_norm": 0.08000541478395462, "learning_rate": 1.8526289804986424e-05, "loss": 0.0017, "step": 2600 }, { "epoch": 1.902706656912948, "grad_norm": 8.844040870666504, "learning_rate": 1.85139471735374e-05, "loss": 0.2594, "step": 2601 }, { "epoch": 1.9034381858083393, "grad_norm": 17.571969985961914, "learning_rate": 1.8501604542088374e-05, "loss": 0.048, "step": 2602 }, { "epoch": 1.9041697147037309, "grad_norm": 0.05622512474656105, "learning_rate": 1.848926191063935e-05, "loss": 0.0014, "step": 2603 }, { "epoch": 1.9049012435991222, "grad_norm": 0.12799020111560822, "learning_rate": 1.8476919279190324e-05, "loss": 0.0017, "step": 2604 }, { "epoch": 1.9056327724945135, "grad_norm": 13.248217582702637, "learning_rate": 1.84645766477413e-05, "loss": 0.0435, "step": 2605 }, { "epoch": 1.906364301389905, "grad_norm": 0.05110577121376991, "learning_rate": 1.8452234016292274e-05, "loss": 0.0013, "step": 2606 }, { "epoch": 1.9070958302852963, "grad_norm": 0.1761082261800766, "learning_rate": 1.843989138484325e-05, "loss": 0.0026, "step": 2607 }, { "epoch": 1.9078273591806876, "grad_norm": 0.07197260111570358, "learning_rate": 1.8427548753394225e-05, "loss": 0.002, "step": 2608 }, { "epoch": 1.908558888076079, "grad_norm": 0.06340989470481873, "learning_rate": 1.84152061219452e-05, "loss": 0.0016, "step": 2609 }, { "epoch": 1.9092904169714704, "grad_norm": 2.0907909870147705, "learning_rate": 1.8402863490496175e-05, "loss": 0.15, "step": 2610 }, { "epoch": 1.9100219458668617, "grad_norm": 0.04362792521715164, "learning_rate": 1.839052085904715e-05, "loss": 0.0012, "step": 2611 }, { "epoch": 1.9107534747622532, "grad_norm": 0.12479276955127716, "learning_rate": 1.8378178227598125e-05, "loss": 0.002, "step": 2612 }, { "epoch": 1.9114850036576445, "grad_norm": 1.0186687707901, "learning_rate": 1.8365835596149102e-05, "loss": 0.0037, "step": 2613 }, { "epoch": 1.9122165325530358, "grad_norm": 2.6812210083007812, "learning_rate": 1.8353492964700075e-05, "loss": 0.1762, "step": 2614 }, { "epoch": 1.9129480614484273, "grad_norm": 0.8225247263908386, "learning_rate": 1.834115033325105e-05, "loss": 0.0041, "step": 2615 }, { "epoch": 1.9136795903438186, "grad_norm": 0.08185499161481857, "learning_rate": 1.8328807701802025e-05, "loss": 0.0015, "step": 2616 }, { "epoch": 1.91441111923921, "grad_norm": 0.20082294940948486, "learning_rate": 1.8316465070353e-05, "loss": 0.0031, "step": 2617 }, { "epoch": 1.9151426481346014, "grad_norm": 0.09795019775629044, "learning_rate": 1.8304122438903976e-05, "loss": 0.0025, "step": 2618 }, { "epoch": 1.9158741770299925, "grad_norm": 0.7624795436859131, "learning_rate": 1.829177980745495e-05, "loss": 0.0037, "step": 2619 }, { "epoch": 1.916605705925384, "grad_norm": 2.829026460647583, "learning_rate": 1.8279437176005926e-05, "loss": 0.1936, "step": 2620 }, { "epoch": 1.9173372348207756, "grad_norm": 0.04527934640645981, "learning_rate": 1.82670945445569e-05, "loss": 0.0017, "step": 2621 }, { "epoch": 1.9180687637161666, "grad_norm": 4.79710054397583, "learning_rate": 1.8254751913107876e-05, "loss": 0.1685, "step": 2622 }, { "epoch": 1.9188002926115582, "grad_norm": 0.10455581545829773, "learning_rate": 1.824240928165885e-05, "loss": 0.0024, "step": 2623 }, { "epoch": 1.9195318215069497, "grad_norm": 0.5302028656005859, "learning_rate": 1.8230066650209826e-05, "loss": 0.0033, "step": 2624 }, { "epoch": 1.9202633504023408, "grad_norm": 0.1996065229177475, "learning_rate": 1.82177240187608e-05, "loss": 0.0032, "step": 2625 }, { "epoch": 1.9209948792977323, "grad_norm": 0.04724473878741264, "learning_rate": 1.8205381387311776e-05, "loss": 0.0016, "step": 2626 }, { "epoch": 1.9217264081931238, "grad_norm": 0.6795578598976135, "learning_rate": 1.819303875586275e-05, "loss": 0.0052, "step": 2627 }, { "epoch": 1.9224579370885149, "grad_norm": 31.200828552246094, "learning_rate": 1.8180696124413727e-05, "loss": 0.035, "step": 2628 }, { "epoch": 1.9231894659839064, "grad_norm": 0.08736789226531982, "learning_rate": 1.81683534929647e-05, "loss": 0.0016, "step": 2629 }, { "epoch": 1.9239209948792977, "grad_norm": 0.23821480572223663, "learning_rate": 1.8156010861515677e-05, "loss": 0.0021, "step": 2630 }, { "epoch": 1.924652523774689, "grad_norm": 12.37523078918457, "learning_rate": 1.814366823006665e-05, "loss": 0.1479, "step": 2631 }, { "epoch": 1.9253840526700805, "grad_norm": 0.06179583817720413, "learning_rate": 1.8131325598617627e-05, "loss": 0.0018, "step": 2632 }, { "epoch": 1.9261155815654718, "grad_norm": 10.899209022521973, "learning_rate": 1.81189829671686e-05, "loss": 0.1008, "step": 2633 }, { "epoch": 1.926847110460863, "grad_norm": 10.065530776977539, "learning_rate": 1.8106640335719577e-05, "loss": 0.1393, "step": 2634 }, { "epoch": 1.9275786393562546, "grad_norm": 2.2473480701446533, "learning_rate": 1.809429770427055e-05, "loss": 0.1603, "step": 2635 }, { "epoch": 1.928310168251646, "grad_norm": 0.15838001668453217, "learning_rate": 1.8081955072821527e-05, "loss": 0.0022, "step": 2636 }, { "epoch": 1.9290416971470372, "grad_norm": 0.37669509649276733, "learning_rate": 1.80696124413725e-05, "loss": 0.0035, "step": 2637 }, { "epoch": 1.9297732260424287, "grad_norm": 3.2868640422821045, "learning_rate": 1.8057269809923478e-05, "loss": 0.1956, "step": 2638 }, { "epoch": 1.93050475493782, "grad_norm": 0.1922222226858139, "learning_rate": 1.804492717847445e-05, "loss": 0.0035, "step": 2639 }, { "epoch": 1.9312362838332113, "grad_norm": 0.08244549483060837, "learning_rate": 1.8032584547025428e-05, "loss": 0.0019, "step": 2640 }, { "epoch": 1.9319678127286029, "grad_norm": 0.05496051907539368, "learning_rate": 1.80202419155764e-05, "loss": 0.0018, "step": 2641 }, { "epoch": 1.9326993416239941, "grad_norm": 4.022226333618164, "learning_rate": 1.8007899284127378e-05, "loss": 0.1494, "step": 2642 }, { "epoch": 1.9334308705193854, "grad_norm": 0.7293280959129333, "learning_rate": 1.799555665267835e-05, "loss": 0.0052, "step": 2643 }, { "epoch": 1.934162399414777, "grad_norm": 0.1236305683851242, "learning_rate": 1.7983214021229328e-05, "loss": 0.0023, "step": 2644 }, { "epoch": 1.9348939283101683, "grad_norm": 0.09505993127822876, "learning_rate": 1.79708713897803e-05, "loss": 0.002, "step": 2645 }, { "epoch": 1.9356254572055596, "grad_norm": 0.152711883187294, "learning_rate": 1.795852875833128e-05, "loss": 0.0028, "step": 2646 }, { "epoch": 1.936356986100951, "grad_norm": 0.10286157578229904, "learning_rate": 1.7946186126882252e-05, "loss": 0.0027, "step": 2647 }, { "epoch": 1.9370885149963424, "grad_norm": 0.09204534441232681, "learning_rate": 1.793384349543323e-05, "loss": 0.0022, "step": 2648 }, { "epoch": 1.9378200438917337, "grad_norm": 0.14120905101299286, "learning_rate": 1.7921500863984202e-05, "loss": 0.0023, "step": 2649 }, { "epoch": 1.9385515727871252, "grad_norm": 0.0680990219116211, "learning_rate": 1.790915823253518e-05, "loss": 0.0016, "step": 2650 }, { "epoch": 1.9392831016825165, "grad_norm": 0.5259526371955872, "learning_rate": 1.7896815601086152e-05, "loss": 0.0063, "step": 2651 }, { "epoch": 1.9400146305779078, "grad_norm": 14.158636093139648, "learning_rate": 1.788447296963713e-05, "loss": 0.0225, "step": 2652 }, { "epoch": 1.9407461594732993, "grad_norm": 5.124611854553223, "learning_rate": 1.7872130338188102e-05, "loss": 0.1156, "step": 2653 }, { "epoch": 1.9414776883686906, "grad_norm": 3.5084011554718018, "learning_rate": 1.785978770673908e-05, "loss": 0.1116, "step": 2654 }, { "epoch": 1.942209217264082, "grad_norm": 10.71434497833252, "learning_rate": 1.7847445075290053e-05, "loss": 0.0593, "step": 2655 }, { "epoch": 1.9429407461594734, "grad_norm": 0.05127580836415291, "learning_rate": 1.783510244384103e-05, "loss": 0.0015, "step": 2656 }, { "epoch": 1.9436722750548645, "grad_norm": 0.06134875491261482, "learning_rate": 1.7822759812392003e-05, "loss": 0.0017, "step": 2657 }, { "epoch": 1.944403803950256, "grad_norm": 35.017723083496094, "learning_rate": 1.781041718094298e-05, "loss": 0.1085, "step": 2658 }, { "epoch": 1.9451353328456475, "grad_norm": 0.07899966835975647, "learning_rate": 1.7798074549493953e-05, "loss": 0.0023, "step": 2659 }, { "epoch": 1.9458668617410386, "grad_norm": 0.06608499586582184, "learning_rate": 1.778573191804493e-05, "loss": 0.0018, "step": 2660 }, { "epoch": 1.9465983906364301, "grad_norm": 2.3034887313842773, "learning_rate": 1.7773389286595903e-05, "loss": 0.006, "step": 2661 }, { "epoch": 1.9473299195318217, "grad_norm": 3.0021955966949463, "learning_rate": 1.776104665514688e-05, "loss": 0.1797, "step": 2662 }, { "epoch": 1.9480614484272127, "grad_norm": 0.07948583364486694, "learning_rate": 1.7748704023697853e-05, "loss": 0.0021, "step": 2663 }, { "epoch": 1.9487929773226043, "grad_norm": 0.9376155734062195, "learning_rate": 1.773636139224883e-05, "loss": 0.0043, "step": 2664 }, { "epoch": 1.9495245062179956, "grad_norm": 0.12796591222286224, "learning_rate": 1.7724018760799804e-05, "loss": 0.0024, "step": 2665 }, { "epoch": 1.9502560351133869, "grad_norm": 2.432555913925171, "learning_rate": 1.771167612935078e-05, "loss": 0.1176, "step": 2666 }, { "epoch": 1.9509875640087784, "grad_norm": 0.1423056572675705, "learning_rate": 1.7699333497901754e-05, "loss": 0.0022, "step": 2667 }, { "epoch": 1.9517190929041697, "grad_norm": 0.2039918750524521, "learning_rate": 1.768699086645273e-05, "loss": 0.0033, "step": 2668 }, { "epoch": 1.952450621799561, "grad_norm": 0.5135772824287415, "learning_rate": 1.7674648235003704e-05, "loss": 0.0064, "step": 2669 }, { "epoch": 1.9531821506949525, "grad_norm": 4.255556106567383, "learning_rate": 1.766230560355468e-05, "loss": 0.0085, "step": 2670 }, { "epoch": 1.9539136795903438, "grad_norm": 0.5214841961860657, "learning_rate": 1.7649962972105654e-05, "loss": 0.0037, "step": 2671 }, { "epoch": 1.954645208485735, "grad_norm": 0.10665778070688248, "learning_rate": 1.763762034065663e-05, "loss": 0.0029, "step": 2672 }, { "epoch": 1.9553767373811266, "grad_norm": 0.22752274572849274, "learning_rate": 1.7625277709207604e-05, "loss": 0.0039, "step": 2673 }, { "epoch": 1.956108266276518, "grad_norm": 2.536933183670044, "learning_rate": 1.761293507775858e-05, "loss": 0.1117, "step": 2674 }, { "epoch": 1.9568397951719092, "grad_norm": 0.0683784931898117, "learning_rate": 1.7600592446309555e-05, "loss": 0.0021, "step": 2675 }, { "epoch": 1.9575713240673007, "grad_norm": 0.0878123864531517, "learning_rate": 1.758824981486053e-05, "loss": 0.0013, "step": 2676 }, { "epoch": 1.958302852962692, "grad_norm": 2.2828235626220703, "learning_rate": 1.7575907183411505e-05, "loss": 0.0035, "step": 2677 }, { "epoch": 1.9590343818580833, "grad_norm": 14.134050369262695, "learning_rate": 1.756356455196248e-05, "loss": 0.064, "step": 2678 }, { "epoch": 1.9597659107534748, "grad_norm": 0.13648343086242676, "learning_rate": 1.7551221920513455e-05, "loss": 0.002, "step": 2679 }, { "epoch": 1.9604974396488661, "grad_norm": 0.1889023631811142, "learning_rate": 1.7538879289064432e-05, "loss": 0.002, "step": 2680 }, { "epoch": 1.9612289685442574, "grad_norm": 0.49090149998664856, "learning_rate": 1.7526536657615405e-05, "loss": 0.0039, "step": 2681 }, { "epoch": 1.961960497439649, "grad_norm": 0.07212325930595398, "learning_rate": 1.7514194026166382e-05, "loss": 0.0011, "step": 2682 }, { "epoch": 1.9626920263350403, "grad_norm": 3.4600865840911865, "learning_rate": 1.7501851394717355e-05, "loss": 0.1206, "step": 2683 }, { "epoch": 1.9634235552304315, "grad_norm": 0.3792521357536316, "learning_rate": 1.7489508763268332e-05, "loss": 0.0039, "step": 2684 }, { "epoch": 1.964155084125823, "grad_norm": 3.583441972732544, "learning_rate": 1.7477166131819306e-05, "loss": 0.0978, "step": 2685 }, { "epoch": 1.9648866130212144, "grad_norm": 5.6917524337768555, "learning_rate": 1.7464823500370282e-05, "loss": 0.2897, "step": 2686 }, { "epoch": 1.9656181419166057, "grad_norm": 0.2864365577697754, "learning_rate": 1.7452480868921256e-05, "loss": 0.0038, "step": 2687 }, { "epoch": 1.9663496708119972, "grad_norm": 0.14693520963191986, "learning_rate": 1.7440138237472233e-05, "loss": 0.0034, "step": 2688 }, { "epoch": 1.9670811997073885, "grad_norm": 0.16222581267356873, "learning_rate": 1.7427795606023206e-05, "loss": 0.0024, "step": 2689 }, { "epoch": 1.9678127286027798, "grad_norm": 6.833108901977539, "learning_rate": 1.7415452974574183e-05, "loss": 0.0836, "step": 2690 }, { "epoch": 1.9685442574981713, "grad_norm": 13.287453651428223, "learning_rate": 1.7403110343125156e-05, "loss": 0.0275, "step": 2691 }, { "epoch": 1.9692757863935626, "grad_norm": 0.32137542963027954, "learning_rate": 1.7390767711676133e-05, "loss": 0.0028, "step": 2692 }, { "epoch": 1.970007315288954, "grad_norm": 2.3656108379364014, "learning_rate": 1.7378425080227106e-05, "loss": 0.0058, "step": 2693 }, { "epoch": 1.9707388441843454, "grad_norm": 5.038529872894287, "learning_rate": 1.7366082448778083e-05, "loss": 0.1295, "step": 2694 }, { "epoch": 1.9714703730797365, "grad_norm": 10.650069236755371, "learning_rate": 1.7353739817329057e-05, "loss": 0.0445, "step": 2695 }, { "epoch": 1.972201901975128, "grad_norm": 0.6268818974494934, "learning_rate": 1.7341397185880033e-05, "loss": 0.0043, "step": 2696 }, { "epoch": 1.9729334308705195, "grad_norm": 0.060378167778253555, "learning_rate": 1.7329054554431007e-05, "loss": 0.0008, "step": 2697 }, { "epoch": 1.9736649597659106, "grad_norm": 0.2666529715061188, "learning_rate": 1.7316711922981983e-05, "loss": 0.0036, "step": 2698 }, { "epoch": 1.9743964886613021, "grad_norm": 13.228370666503906, "learning_rate": 1.7304369291532957e-05, "loss": 0.0494, "step": 2699 }, { "epoch": 1.9751280175566936, "grad_norm": 0.02247418463230133, "learning_rate": 1.7292026660083934e-05, "loss": 0.0006, "step": 2700 }, { "epoch": 1.9758595464520847, "grad_norm": 0.47171756625175476, "learning_rate": 1.7279684028634907e-05, "loss": 0.0029, "step": 2701 }, { "epoch": 1.9765910753474762, "grad_norm": 9.725468635559082, "learning_rate": 1.726734139718588e-05, "loss": 0.1629, "step": 2702 }, { "epoch": 1.9773226042428675, "grad_norm": 11.189971923828125, "learning_rate": 1.7254998765736857e-05, "loss": 0.0234, "step": 2703 }, { "epoch": 1.9780541331382588, "grad_norm": 0.026783039793372154, "learning_rate": 1.724265613428783e-05, "loss": 0.0006, "step": 2704 }, { "epoch": 1.9787856620336504, "grad_norm": 0.22489884495735168, "learning_rate": 1.7230313502838807e-05, "loss": 0.0015, "step": 2705 }, { "epoch": 1.9795171909290417, "grad_norm": 0.5902420282363892, "learning_rate": 1.721797087138978e-05, "loss": 0.0052, "step": 2706 }, { "epoch": 1.980248719824433, "grad_norm": 0.026676347479224205, "learning_rate": 1.7205628239940758e-05, "loss": 0.0006, "step": 2707 }, { "epoch": 1.9809802487198245, "grad_norm": 1.7209968566894531, "learning_rate": 1.719328560849173e-05, "loss": 0.0134, "step": 2708 }, { "epoch": 1.9817117776152158, "grad_norm": 0.2501102685928345, "learning_rate": 1.7180942977042704e-05, "loss": 0.0029, "step": 2709 }, { "epoch": 1.982443306510607, "grad_norm": 24.07085418701172, "learning_rate": 1.716860034559368e-05, "loss": 0.2385, "step": 2710 }, { "epoch": 1.9831748354059986, "grad_norm": 8.85936450958252, "learning_rate": 1.7156257714144655e-05, "loss": 0.102, "step": 2711 }, { "epoch": 1.98390636430139, "grad_norm": 7.508352279663086, "learning_rate": 1.714391508269563e-05, "loss": 0.0139, "step": 2712 }, { "epoch": 1.9846378931967812, "grad_norm": 0.682174563407898, "learning_rate": 1.7131572451246605e-05, "loss": 0.0039, "step": 2713 }, { "epoch": 1.9853694220921727, "grad_norm": 0.04828614741563797, "learning_rate": 1.711922981979758e-05, "loss": 0.001, "step": 2714 }, { "epoch": 1.986100950987564, "grad_norm": 0.032948944717645645, "learning_rate": 1.7106887188348555e-05, "loss": 0.0007, "step": 2715 }, { "epoch": 1.9868324798829553, "grad_norm": 0.056806761771440506, "learning_rate": 1.7094544556899532e-05, "loss": 0.0007, "step": 2716 }, { "epoch": 1.9875640087783468, "grad_norm": 0.050789326429367065, "learning_rate": 1.7082201925450505e-05, "loss": 0.0006, "step": 2717 }, { "epoch": 1.9882955376737381, "grad_norm": 0.03428562358021736, "learning_rate": 1.7069859294001482e-05, "loss": 0.0006, "step": 2718 }, { "epoch": 1.9890270665691294, "grad_norm": 2.553959608078003, "learning_rate": 1.7057516662552455e-05, "loss": 0.2262, "step": 2719 }, { "epoch": 1.989758595464521, "grad_norm": 0.32384589314460754, "learning_rate": 1.7045174031103432e-05, "loss": 0.002, "step": 2720 }, { "epoch": 1.9904901243599122, "grad_norm": 0.010760030709207058, "learning_rate": 1.7032831399654406e-05, "loss": 0.0005, "step": 2721 }, { "epoch": 1.9912216532553035, "grad_norm": 0.1429067850112915, "learning_rate": 1.7020488768205382e-05, "loss": 0.0019, "step": 2722 }, { "epoch": 1.991953182150695, "grad_norm": 0.03739142790436745, "learning_rate": 1.7008146136756356e-05, "loss": 0.0009, "step": 2723 }, { "epoch": 1.9926847110460864, "grad_norm": 13.472880363464355, "learning_rate": 1.6995803505307333e-05, "loss": 0.1542, "step": 2724 }, { "epoch": 1.9934162399414777, "grad_norm": 0.22376300394535065, "learning_rate": 1.6983460873858306e-05, "loss": 0.0021, "step": 2725 }, { "epoch": 1.9941477688368692, "grad_norm": 0.16766194999217987, "learning_rate": 1.6971118242409283e-05, "loss": 0.0028, "step": 2726 }, { "epoch": 1.9948792977322605, "grad_norm": 12.475022315979004, "learning_rate": 1.6958775610960256e-05, "loss": 0.162, "step": 2727 }, { "epoch": 1.9956108266276518, "grad_norm": 9.911751747131348, "learning_rate": 1.6946432979511233e-05, "loss": 0.0056, "step": 2728 }, { "epoch": 1.9963423555230433, "grad_norm": 0.031300242990255356, "learning_rate": 1.6934090348062206e-05, "loss": 0.0005, "step": 2729 }, { "epoch": 1.9970738844184344, "grad_norm": 0.0185474194586277, "learning_rate": 1.6921747716613183e-05, "loss": 0.0006, "step": 2730 }, { "epoch": 1.9978054133138259, "grad_norm": 0.19381113350391388, "learning_rate": 1.6909405085164157e-05, "loss": 0.0019, "step": 2731 }, { "epoch": 1.9985369422092174, "grad_norm": 1.6192822456359863, "learning_rate": 1.6897062453715133e-05, "loss": 0.0042, "step": 2732 }, { "epoch": 1.9992684711046085, "grad_norm": 0.06355704367160797, "learning_rate": 1.6884719822266107e-05, "loss": 0.0013, "step": 2733 }, { "epoch": 2.0, "grad_norm": 0.08131127059459686, "learning_rate": 1.687237719081708e-05, "loss": 0.0011, "step": 2734 }, { "epoch": 2.0, "eval_accuracy": 0.9857966241251543, "eval_loss": 0.06457369029521942, "eval_runtime": 90.9077, "eval_samples_per_second": 53.439, "eval_steps_per_second": 1.672, "step": 2734 } ], "logging_steps": 1, "max_steps": 4101, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.300617579184352e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }