{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.05055779729409, "eval_steps": 500, "global_step": 4266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009494422027059103, "grad_norm": 5.364119052886963, "learning_rate": 6.329113924050633e-07, "loss": 2.9616, "step": 1 }, { "epoch": 0.0018988844054118206, "grad_norm": 5.426063060760498, "learning_rate": 1.2658227848101265e-06, "loss": 2.9664, "step": 2 }, { "epoch": 0.0028483266081177306, "grad_norm": 5.507386684417725, "learning_rate": 1.8987341772151901e-06, "loss": 2.96, "step": 3 }, { "epoch": 0.0037977688108236413, "grad_norm": 5.467552185058594, "learning_rate": 2.531645569620253e-06, "loss": 2.975, "step": 4 }, { "epoch": 0.004747211013529551, "grad_norm": 5.386384963989258, "learning_rate": 3.1645569620253167e-06, "loss": 2.9473, "step": 5 }, { "epoch": 0.005696653216235461, "grad_norm": 4.9166951179504395, "learning_rate": 3.7974683544303802e-06, "loss": 2.8597, "step": 6 }, { "epoch": 0.006646095418941372, "grad_norm": 5.469020843505859, "learning_rate": 4.430379746835443e-06, "loss": 2.9633, "step": 7 }, { "epoch": 0.0075955376216472826, "grad_norm": 5.380453586578369, "learning_rate": 5.063291139240506e-06, "loss": 2.8931, "step": 8 }, { "epoch": 0.008544979824353193, "grad_norm": 4.922253131866455, "learning_rate": 5.69620253164557e-06, "loss": 2.7452, "step": 9 }, { "epoch": 0.009494422027059102, "grad_norm": 5.517508029937744, "learning_rate": 6.329113924050633e-06, "loss": 2.8326, "step": 10 }, { "epoch": 0.010443864229765013, "grad_norm": 5.810976982116699, "learning_rate": 6.9620253164556965e-06, "loss": 2.7854, "step": 11 }, { "epoch": 0.011393306432470923, "grad_norm": 5.690661430358887, "learning_rate": 7.5949367088607605e-06, "loss": 2.7069, "step": 12 }, { "epoch": 0.012342748635176834, "grad_norm": 5.994122505187988, "learning_rate": 8.227848101265822e-06, "loss": 2.5705, "step": 13 }, { "epoch": 0.013292190837882745, "grad_norm": 5.86803674697876, "learning_rate": 8.860759493670886e-06, "loss": 2.4461, "step": 14 }, { "epoch": 0.014241633040588654, "grad_norm": 5.448781490325928, "learning_rate": 9.49367088607595e-06, "loss": 2.2408, "step": 15 }, { "epoch": 0.015191075243294565, "grad_norm": 6.4004902839660645, "learning_rate": 1.0126582278481012e-05, "loss": 2.1205, "step": 16 }, { "epoch": 0.016140517446000476, "grad_norm": 6.970590591430664, "learning_rate": 1.0759493670886076e-05, "loss": 1.9474, "step": 17 }, { "epoch": 0.017089959648706386, "grad_norm": 7.423785209655762, "learning_rate": 1.139240506329114e-05, "loss": 1.7348, "step": 18 }, { "epoch": 0.018039401851412295, "grad_norm": 7.429481029510498, "learning_rate": 1.2025316455696203e-05, "loss": 1.4835, "step": 19 }, { "epoch": 0.018988844054118204, "grad_norm": 6.7193284034729, "learning_rate": 1.2658227848101267e-05, "loss": 1.2373, "step": 20 }, { "epoch": 0.019938286256824117, "grad_norm": 4.46099853515625, "learning_rate": 1.3291139240506329e-05, "loss": 1.1095, "step": 21 }, { "epoch": 0.020887728459530026, "grad_norm": 3.001573085784912, "learning_rate": 1.3924050632911393e-05, "loss": 0.8642, "step": 22 }, { "epoch": 0.021837170662235936, "grad_norm": 2.197000026702881, "learning_rate": 1.4556962025316457e-05, "loss": 0.7734, "step": 23 }, { "epoch": 0.022786612864941845, "grad_norm": 1.8113943338394165, "learning_rate": 1.5189873417721521e-05, "loss": 0.7341, "step": 24 }, { "epoch": 0.023736055067647758, "grad_norm": 1.7461305856704712, "learning_rate": 1.5822784810126583e-05, "loss": 0.6743, "step": 25 }, { "epoch": 0.024685497270353667, "grad_norm": 1.3315849304199219, "learning_rate": 1.6455696202531644e-05, "loss": 0.5975, "step": 26 }, { "epoch": 0.025634939473059577, "grad_norm": 0.726314127445221, "learning_rate": 1.7088607594936708e-05, "loss": 0.5659, "step": 27 }, { "epoch": 0.02658438167576549, "grad_norm": 0.6269010901451111, "learning_rate": 1.7721518987341772e-05, "loss": 0.5226, "step": 28 }, { "epoch": 0.0275338238784714, "grad_norm": 0.5819966197013855, "learning_rate": 1.8354430379746836e-05, "loss": 0.6132, "step": 29 }, { "epoch": 0.028483266081177308, "grad_norm": 0.6247850060462952, "learning_rate": 1.89873417721519e-05, "loss": 0.5, "step": 30 }, { "epoch": 0.029432708283883217, "grad_norm": 0.702621579170227, "learning_rate": 1.962025316455696e-05, "loss": 0.4958, "step": 31 }, { "epoch": 0.03038215048658913, "grad_norm": 0.6045309901237488, "learning_rate": 2.0253164556962025e-05, "loss": 0.4405, "step": 32 }, { "epoch": 0.031331592689295036, "grad_norm": 0.5436626076698303, "learning_rate": 2.088607594936709e-05, "loss": 0.5607, "step": 33 }, { "epoch": 0.03228103489200095, "grad_norm": 0.43146297335624695, "learning_rate": 2.1518987341772153e-05, "loss": 0.3987, "step": 34 }, { "epoch": 0.03323047709470686, "grad_norm": 0.5124548673629761, "learning_rate": 2.2151898734177217e-05, "loss": 0.5084, "step": 35 }, { "epoch": 0.03417991929741277, "grad_norm": 0.4466649293899536, "learning_rate": 2.278481012658228e-05, "loss": 0.3761, "step": 36 }, { "epoch": 0.03512936150011868, "grad_norm": 0.41221529245376587, "learning_rate": 2.341772151898734e-05, "loss": 0.3859, "step": 37 }, { "epoch": 0.03607880370282459, "grad_norm": 0.3802257180213928, "learning_rate": 2.4050632911392405e-05, "loss": 0.3447, "step": 38 }, { "epoch": 0.0370282459055305, "grad_norm": 0.47727710008621216, "learning_rate": 2.468354430379747e-05, "loss": 0.3914, "step": 39 }, { "epoch": 0.03797768810823641, "grad_norm": 0.41048529744148254, "learning_rate": 2.5316455696202533e-05, "loss": 0.2988, "step": 40 }, { "epoch": 0.038927130310942325, "grad_norm": 0.5019667744636536, "learning_rate": 2.5949367088607597e-05, "loss": 0.2938, "step": 41 }, { "epoch": 0.039876572513648234, "grad_norm": 0.42121732234954834, "learning_rate": 2.6582278481012658e-05, "loss": 0.2579, "step": 42 }, { "epoch": 0.04082601471635414, "grad_norm": 0.4193897247314453, "learning_rate": 2.7215189873417722e-05, "loss": 0.3262, "step": 43 }, { "epoch": 0.04177545691906005, "grad_norm": 0.2978931665420532, "learning_rate": 2.7848101265822786e-05, "loss": 0.2365, "step": 44 }, { "epoch": 0.04272489912176596, "grad_norm": 0.34771448373794556, "learning_rate": 2.848101265822785e-05, "loss": 0.2364, "step": 45 }, { "epoch": 0.04367434132447187, "grad_norm": 0.3881576955318451, "learning_rate": 2.9113924050632914e-05, "loss": 0.286, "step": 46 }, { "epoch": 0.04462378352717778, "grad_norm": 0.33863797783851624, "learning_rate": 2.9746835443037974e-05, "loss": 0.2739, "step": 47 }, { "epoch": 0.04557322572988369, "grad_norm": 0.2894616723060608, "learning_rate": 3.0379746835443042e-05, "loss": 0.2587, "step": 48 }, { "epoch": 0.046522667932589606, "grad_norm": 0.22292694449424744, "learning_rate": 3.10126582278481e-05, "loss": 0.1861, "step": 49 }, { "epoch": 0.047472110135295516, "grad_norm": 0.21907460689544678, "learning_rate": 3.1645569620253167e-05, "loss": 0.1755, "step": 50 }, { "epoch": 0.048421552338001425, "grad_norm": 0.29593944549560547, "learning_rate": 3.227848101265823e-05, "loss": 0.1856, "step": 51 }, { "epoch": 0.049370994540707334, "grad_norm": 0.23055657744407654, "learning_rate": 3.291139240506329e-05, "loss": 0.2102, "step": 52 }, { "epoch": 0.050320436743413244, "grad_norm": 0.18929323554039001, "learning_rate": 3.354430379746836e-05, "loss": 0.1909, "step": 53 }, { "epoch": 0.05126987894611915, "grad_norm": 0.15004883706569672, "learning_rate": 3.4177215189873416e-05, "loss": 0.1619, "step": 54 }, { "epoch": 0.05221932114882506, "grad_norm": 0.15621644258499146, "learning_rate": 3.4810126582278487e-05, "loss": 0.1759, "step": 55 }, { "epoch": 0.05316876335153098, "grad_norm": 0.16266578435897827, "learning_rate": 3.5443037974683544e-05, "loss": 0.1657, "step": 56 }, { "epoch": 0.05411820555423689, "grad_norm": 0.14417718350887299, "learning_rate": 3.607594936708861e-05, "loss": 0.1698, "step": 57 }, { "epoch": 0.0550676477569428, "grad_norm": 0.21402889490127563, "learning_rate": 3.670886075949367e-05, "loss": 0.2185, "step": 58 }, { "epoch": 0.05601708995964871, "grad_norm": 0.1997889280319214, "learning_rate": 3.7341772151898736e-05, "loss": 0.2143, "step": 59 }, { "epoch": 0.056966532162354616, "grad_norm": 0.13755086064338684, "learning_rate": 3.79746835443038e-05, "loss": 0.1677, "step": 60 }, { "epoch": 0.057915974365060525, "grad_norm": 0.19304363429546356, "learning_rate": 3.8607594936708864e-05, "loss": 0.2113, "step": 61 }, { "epoch": 0.058865416567766435, "grad_norm": 0.14066031575202942, "learning_rate": 3.924050632911392e-05, "loss": 0.1612, "step": 62 }, { "epoch": 0.059814858770472344, "grad_norm": 0.13375213742256165, "learning_rate": 3.987341772151899e-05, "loss": 0.164, "step": 63 }, { "epoch": 0.06076430097317826, "grad_norm": 0.15216922760009766, "learning_rate": 4.050632911392405e-05, "loss": 0.16, "step": 64 }, { "epoch": 0.06171374317588417, "grad_norm": 0.16130389273166656, "learning_rate": 4.113924050632912e-05, "loss": 0.1957, "step": 65 }, { "epoch": 0.06266318537859007, "grad_norm": 0.1791229248046875, "learning_rate": 4.177215189873418e-05, "loss": 0.1993, "step": 66 }, { "epoch": 0.06361262758129599, "grad_norm": 0.11038907617330551, "learning_rate": 4.240506329113924e-05, "loss": 0.1517, "step": 67 }, { "epoch": 0.0645620697840019, "grad_norm": 0.13327902555465698, "learning_rate": 4.3037974683544305e-05, "loss": 0.1501, "step": 68 }, { "epoch": 0.06551151198670781, "grad_norm": 0.13731731474399567, "learning_rate": 4.367088607594937e-05, "loss": 0.1596, "step": 69 }, { "epoch": 0.06646095418941372, "grad_norm": 0.13924308121204376, "learning_rate": 4.430379746835443e-05, "loss": 0.152, "step": 70 }, { "epoch": 0.06741039639211963, "grad_norm": 0.1482289433479309, "learning_rate": 4.49367088607595e-05, "loss": 0.1536, "step": 71 }, { "epoch": 0.06835983859482554, "grad_norm": 0.10759364813566208, "learning_rate": 4.556962025316456e-05, "loss": 0.1543, "step": 72 }, { "epoch": 0.06930928079753144, "grad_norm": 0.12899678945541382, "learning_rate": 4.6202531645569625e-05, "loss": 0.165, "step": 73 }, { "epoch": 0.07025872300023736, "grad_norm": 0.11689919233322144, "learning_rate": 4.683544303797468e-05, "loss": 0.1564, "step": 74 }, { "epoch": 0.07120816520294328, "grad_norm": 0.12697139382362366, "learning_rate": 4.7468354430379746e-05, "loss": 0.162, "step": 75 }, { "epoch": 0.07215760740564918, "grad_norm": 0.12069376558065414, "learning_rate": 4.810126582278481e-05, "loss": 0.1467, "step": 76 }, { "epoch": 0.0731070496083551, "grad_norm": 0.10199815034866333, "learning_rate": 4.8734177215189874e-05, "loss": 0.1528, "step": 77 }, { "epoch": 0.074056491811061, "grad_norm": 0.1142750009894371, "learning_rate": 4.936708860759494e-05, "loss": 0.1574, "step": 78 }, { "epoch": 0.07500593401376691, "grad_norm": 0.11019093543291092, "learning_rate": 5e-05, "loss": 0.1512, "step": 79 }, { "epoch": 0.07595537621647282, "grad_norm": 0.09426973015069962, "learning_rate": 5.0632911392405066e-05, "loss": 0.1481, "step": 80 }, { "epoch": 0.07690481841917873, "grad_norm": 0.09757663309574127, "learning_rate": 5.1265822784810124e-05, "loss": 0.1484, "step": 81 }, { "epoch": 0.07785426062188465, "grad_norm": 0.10646392405033112, "learning_rate": 5.1898734177215194e-05, "loss": 0.1549, "step": 82 }, { "epoch": 0.07880370282459055, "grad_norm": 0.12109784036874771, "learning_rate": 5.253164556962026e-05, "loss": 0.1448, "step": 83 }, { "epoch": 0.07975314502729647, "grad_norm": 0.12039211392402649, "learning_rate": 5.3164556962025316e-05, "loss": 0.1538, "step": 84 }, { "epoch": 0.08070258723000237, "grad_norm": 0.16873961687088013, "learning_rate": 5.379746835443038e-05, "loss": 0.1971, "step": 85 }, { "epoch": 0.08165202943270829, "grad_norm": 0.12140022218227386, "learning_rate": 5.4430379746835444e-05, "loss": 0.1497, "step": 86 }, { "epoch": 0.08260147163541419, "grad_norm": 0.14637599885463715, "learning_rate": 5.5063291139240514e-05, "loss": 0.1958, "step": 87 }, { "epoch": 0.0835509138381201, "grad_norm": 0.1141396313905716, "learning_rate": 5.569620253164557e-05, "loss": 0.1457, "step": 88 }, { "epoch": 0.08450035604082601, "grad_norm": 0.2128390371799469, "learning_rate": 5.6329113924050636e-05, "loss": 0.2339, "step": 89 }, { "epoch": 0.08544979824353192, "grad_norm": 0.18838858604431152, "learning_rate": 5.69620253164557e-05, "loss": 0.2029, "step": 90 }, { "epoch": 0.08639924044623784, "grad_norm": 0.19592566788196564, "learning_rate": 5.759493670886076e-05, "loss": 0.2276, "step": 91 }, { "epoch": 0.08734868264894374, "grad_norm": 0.14753012359142303, "learning_rate": 5.822784810126583e-05, "loss": 0.1916, "step": 92 }, { "epoch": 0.08829812485164966, "grad_norm": 0.1494351178407669, "learning_rate": 5.886075949367089e-05, "loss": 0.1913, "step": 93 }, { "epoch": 0.08924756705435556, "grad_norm": 0.1173478439450264, "learning_rate": 5.949367088607595e-05, "loss": 0.1438, "step": 94 }, { "epoch": 0.09019700925706148, "grad_norm": 0.12023188918828964, "learning_rate": 6.012658227848101e-05, "loss": 0.1516, "step": 95 }, { "epoch": 0.09114645145976738, "grad_norm": 0.1275833547115326, "learning_rate": 6.0759493670886084e-05, "loss": 0.1492, "step": 96 }, { "epoch": 0.0920958936624733, "grad_norm": 0.1360282599925995, "learning_rate": 6.139240506329115e-05, "loss": 0.1507, "step": 97 }, { "epoch": 0.09304533586517921, "grad_norm": 0.1586841195821762, "learning_rate": 6.20253164556962e-05, "loss": 0.1956, "step": 98 }, { "epoch": 0.09399477806788512, "grad_norm": 0.14281995594501495, "learning_rate": 6.265822784810128e-05, "loss": 0.1774, "step": 99 }, { "epoch": 0.09494422027059103, "grad_norm": 0.12553077936172485, "learning_rate": 6.329113924050633e-05, "loss": 0.148, "step": 100 }, { "epoch": 0.09589366247329693, "grad_norm": 0.1117570698261261, "learning_rate": 6.392405063291139e-05, "loss": 0.16, "step": 101 }, { "epoch": 0.09684310467600285, "grad_norm": 0.13955281674861908, "learning_rate": 6.455696202531646e-05, "loss": 0.1464, "step": 102 }, { "epoch": 0.09779254687870875, "grad_norm": 0.10990285873413086, "learning_rate": 6.518987341772153e-05, "loss": 0.147, "step": 103 }, { "epoch": 0.09874198908141467, "grad_norm": 0.10545991361141205, "learning_rate": 6.582278481012658e-05, "loss": 0.1436, "step": 104 }, { "epoch": 0.09969143128412059, "grad_norm": 0.1717437207698822, "learning_rate": 6.645569620253165e-05, "loss": 0.2278, "step": 105 }, { "epoch": 0.10064087348682649, "grad_norm": 0.10950994491577148, "learning_rate": 6.708860759493672e-05, "loss": 0.1493, "step": 106 }, { "epoch": 0.1015903156895324, "grad_norm": 0.11200258880853653, "learning_rate": 6.772151898734177e-05, "loss": 0.1536, "step": 107 }, { "epoch": 0.1025397578922383, "grad_norm": 0.10955105721950531, "learning_rate": 6.835443037974683e-05, "loss": 0.1483, "step": 108 }, { "epoch": 0.10348920009494422, "grad_norm": 0.11920775473117828, "learning_rate": 6.89873417721519e-05, "loss": 0.1492, "step": 109 }, { "epoch": 0.10443864229765012, "grad_norm": 0.1390092819929123, "learning_rate": 6.962025316455697e-05, "loss": 0.1849, "step": 110 }, { "epoch": 0.10538808450035604, "grad_norm": 0.1363140493631363, "learning_rate": 7.025316455696203e-05, "loss": 0.1849, "step": 111 }, { "epoch": 0.10633752670306196, "grad_norm": 0.09190025180578232, "learning_rate": 7.088607594936709e-05, "loss": 0.1587, "step": 112 }, { "epoch": 0.10728696890576786, "grad_norm": 0.09020426124334335, "learning_rate": 7.151898734177216e-05, "loss": 0.1377, "step": 113 }, { "epoch": 0.10823641110847378, "grad_norm": 0.10544883459806442, "learning_rate": 7.215189873417722e-05, "loss": 0.1516, "step": 114 }, { "epoch": 0.10918585331117968, "grad_norm": 0.12401281297206879, "learning_rate": 7.278481012658229e-05, "loss": 0.154, "step": 115 }, { "epoch": 0.1101352955138856, "grad_norm": 0.1008707657456398, "learning_rate": 7.341772151898734e-05, "loss": 0.1448, "step": 116 }, { "epoch": 0.1110847377165915, "grad_norm": 0.10302747040987015, "learning_rate": 7.40506329113924e-05, "loss": 0.1451, "step": 117 }, { "epoch": 0.11203417991929741, "grad_norm": 0.12748293578624725, "learning_rate": 7.468354430379747e-05, "loss": 0.1829, "step": 118 }, { "epoch": 0.11298362212200333, "grad_norm": 0.10413361340761185, "learning_rate": 7.531645569620254e-05, "loss": 0.1371, "step": 119 }, { "epoch": 0.11393306432470923, "grad_norm": 0.1243433803319931, "learning_rate": 7.59493670886076e-05, "loss": 0.1409, "step": 120 }, { "epoch": 0.11488250652741515, "grad_norm": 0.11630933731794357, "learning_rate": 7.658227848101266e-05, "loss": 0.1372, "step": 121 }, { "epoch": 0.11583194873012105, "grad_norm": 0.17981529235839844, "learning_rate": 7.721518987341773e-05, "loss": 0.2257, "step": 122 }, { "epoch": 0.11678139093282697, "grad_norm": 0.14063452184200287, "learning_rate": 7.78481012658228e-05, "loss": 0.1841, "step": 123 }, { "epoch": 0.11773083313553287, "grad_norm": 0.1264188438653946, "learning_rate": 7.848101265822784e-05, "loss": 0.1471, "step": 124 }, { "epoch": 0.11868027533823879, "grad_norm": 0.12827955186367035, "learning_rate": 7.911392405063291e-05, "loss": 0.1493, "step": 125 }, { "epoch": 0.11962971754094469, "grad_norm": 0.09800329059362411, "learning_rate": 7.974683544303798e-05, "loss": 0.1414, "step": 126 }, { "epoch": 0.1205791597436506, "grad_norm": 0.09902197122573853, "learning_rate": 8.037974683544304e-05, "loss": 0.1462, "step": 127 }, { "epoch": 0.12152860194635652, "grad_norm": 0.09450504928827286, "learning_rate": 8.10126582278481e-05, "loss": 0.1484, "step": 128 }, { "epoch": 0.12247804414906242, "grad_norm": 0.11012883484363556, "learning_rate": 8.164556962025317e-05, "loss": 0.1437, "step": 129 }, { "epoch": 0.12342748635176834, "grad_norm": 0.11717642843723297, "learning_rate": 8.227848101265824e-05, "loss": 0.1478, "step": 130 }, { "epoch": 0.12437692855447424, "grad_norm": 0.08754123747348785, "learning_rate": 8.29113924050633e-05, "loss": 0.1408, "step": 131 }, { "epoch": 0.12532637075718014, "grad_norm": 0.10017862170934677, "learning_rate": 8.354430379746835e-05, "loss": 0.1476, "step": 132 }, { "epoch": 0.12627581295988607, "grad_norm": 0.08994068205356598, "learning_rate": 8.417721518987342e-05, "loss": 0.1478, "step": 133 }, { "epoch": 0.12722525516259198, "grad_norm": 0.09894968569278717, "learning_rate": 8.481012658227848e-05, "loss": 0.1309, "step": 134 }, { "epoch": 0.12817469736529788, "grad_norm": 0.10028701275587082, "learning_rate": 8.544303797468355e-05, "loss": 0.1433, "step": 135 }, { "epoch": 0.1291241395680038, "grad_norm": 0.0897536426782608, "learning_rate": 8.607594936708861e-05, "loss": 0.1459, "step": 136 }, { "epoch": 0.1300735817707097, "grad_norm": 0.10435349494218826, "learning_rate": 8.670886075949367e-05, "loss": 0.1434, "step": 137 }, { "epoch": 0.13102302397341561, "grad_norm": 0.11718117445707321, "learning_rate": 8.734177215189874e-05, "loss": 0.1509, "step": 138 }, { "epoch": 0.13197246617612152, "grad_norm": 0.14426474273204803, "learning_rate": 8.797468354430381e-05, "loss": 0.1373, "step": 139 }, { "epoch": 0.13292190837882745, "grad_norm": 0.13101965188980103, "learning_rate": 8.860759493670887e-05, "loss": 0.1358, "step": 140 }, { "epoch": 0.13387135058153335, "grad_norm": 0.11235956102609634, "learning_rate": 8.924050632911392e-05, "loss": 0.1394, "step": 141 }, { "epoch": 0.13482079278423925, "grad_norm": 0.11327100545167923, "learning_rate": 8.9873417721519e-05, "loss": 0.1443, "step": 142 }, { "epoch": 0.13577023498694518, "grad_norm": 0.10912016034126282, "learning_rate": 9.050632911392407e-05, "loss": 0.1698, "step": 143 }, { "epoch": 0.13671967718965108, "grad_norm": 0.16535617411136627, "learning_rate": 9.113924050632912e-05, "loss": 0.2255, "step": 144 }, { "epoch": 0.137669119392357, "grad_norm": 0.10184327512979507, "learning_rate": 9.177215189873418e-05, "loss": 0.1371, "step": 145 }, { "epoch": 0.1386185615950629, "grad_norm": 0.10998040437698364, "learning_rate": 9.240506329113925e-05, "loss": 0.1794, "step": 146 }, { "epoch": 0.13956800379776882, "grad_norm": 0.08974044770002365, "learning_rate": 9.303797468354431e-05, "loss": 0.144, "step": 147 }, { "epoch": 0.14051744600047472, "grad_norm": 0.12724193930625916, "learning_rate": 9.367088607594936e-05, "loss": 0.1794, "step": 148 }, { "epoch": 0.14146688820318062, "grad_norm": 0.1079091802239418, "learning_rate": 9.430379746835444e-05, "loss": 0.1399, "step": 149 }, { "epoch": 0.14241633040588655, "grad_norm": 0.09480807185173035, "learning_rate": 9.493670886075949e-05, "loss": 0.1395, "step": 150 }, { "epoch": 0.14336577260859246, "grad_norm": 0.08620745688676834, "learning_rate": 9.556962025316456e-05, "loss": 0.1415, "step": 151 }, { "epoch": 0.14431521481129836, "grad_norm": 0.10517002642154694, "learning_rate": 9.620253164556962e-05, "loss": 0.1723, "step": 152 }, { "epoch": 0.14526465701400426, "grad_norm": 0.0956311896443367, "learning_rate": 9.683544303797469e-05, "loss": 0.1515, "step": 153 }, { "epoch": 0.1462140992167102, "grad_norm": 0.08050324022769928, "learning_rate": 9.746835443037975e-05, "loss": 0.1322, "step": 154 }, { "epoch": 0.1471635414194161, "grad_norm": 0.0853201299905777, "learning_rate": 9.810126582278482e-05, "loss": 0.142, "step": 155 }, { "epoch": 0.148112983622122, "grad_norm": 0.09991180151700974, "learning_rate": 9.873417721518988e-05, "loss": 0.1348, "step": 156 }, { "epoch": 0.14906242582482793, "grad_norm": 0.08640603721141815, "learning_rate": 9.936708860759493e-05, "loss": 0.1397, "step": 157 }, { "epoch": 0.15001186802753383, "grad_norm": 0.09057717025279999, "learning_rate": 0.0001, "loss": 0.1381, "step": 158 }, { "epoch": 0.15096131023023973, "grad_norm": 0.09916041046380997, "learning_rate": 0.00010063291139240508, "loss": 0.1509, "step": 159 }, { "epoch": 0.15191075243294563, "grad_norm": 0.09434045851230621, "learning_rate": 0.00010126582278481013, "loss": 0.1388, "step": 160 }, { "epoch": 0.15286019463565156, "grad_norm": 0.1273377537727356, "learning_rate": 0.0001018987341772152, "loss": 0.1401, "step": 161 }, { "epoch": 0.15380963683835747, "grad_norm": 0.1297912299633026, "learning_rate": 0.00010253164556962025, "loss": 0.1852, "step": 162 }, { "epoch": 0.15475907904106337, "grad_norm": 0.1151595488190651, "learning_rate": 0.00010316455696202532, "loss": 0.1848, "step": 163 }, { "epoch": 0.1557085212437693, "grad_norm": 0.13381290435791016, "learning_rate": 0.00010379746835443039, "loss": 0.1438, "step": 164 }, { "epoch": 0.1566579634464752, "grad_norm": 0.07880119979381561, "learning_rate": 0.00010443037974683545, "loss": 0.1327, "step": 165 }, { "epoch": 0.1576074056491811, "grad_norm": 0.0843740776181221, "learning_rate": 0.00010506329113924052, "loss": 0.1398, "step": 166 }, { "epoch": 0.158556847851887, "grad_norm": 0.0981813594698906, "learning_rate": 0.00010569620253164559, "loss": 0.1409, "step": 167 }, { "epoch": 0.15950629005459294, "grad_norm": 0.10005304962396622, "learning_rate": 0.00010632911392405063, "loss": 0.1783, "step": 168 }, { "epoch": 0.16045573225729884, "grad_norm": 0.08365727961063385, "learning_rate": 0.00010696202531645569, "loss": 0.1275, "step": 169 }, { "epoch": 0.16140517446000474, "grad_norm": 0.1017635315656662, "learning_rate": 0.00010759493670886076, "loss": 0.1792, "step": 170 }, { "epoch": 0.16235461666271067, "grad_norm": 0.07007888704538345, "learning_rate": 0.00010822784810126583, "loss": 0.1473, "step": 171 }, { "epoch": 0.16330405886541657, "grad_norm": 0.07718679308891296, "learning_rate": 0.00010886075949367089, "loss": 0.1396, "step": 172 }, { "epoch": 0.16425350106812248, "grad_norm": 0.07228100299835205, "learning_rate": 0.00010949367088607596, "loss": 0.1398, "step": 173 }, { "epoch": 0.16520294327082838, "grad_norm": 0.07955378293991089, "learning_rate": 0.00011012658227848103, "loss": 0.1402, "step": 174 }, { "epoch": 0.1661523854735343, "grad_norm": 0.0816427692770958, "learning_rate": 0.00011075949367088607, "loss": 0.1345, "step": 175 }, { "epoch": 0.1671018276762402, "grad_norm": 0.07641757279634476, "learning_rate": 0.00011139240506329114, "loss": 0.1373, "step": 176 }, { "epoch": 0.1680512698789461, "grad_norm": 0.07354450225830078, "learning_rate": 0.0001120253164556962, "loss": 0.1394, "step": 177 }, { "epoch": 0.16900071208165202, "grad_norm": 0.08322398364543915, "learning_rate": 0.00011265822784810127, "loss": 0.138, "step": 178 }, { "epoch": 0.16995015428435795, "grad_norm": 0.13528607785701752, "learning_rate": 0.00011329113924050634, "loss": 0.2188, "step": 179 }, { "epoch": 0.17089959648706385, "grad_norm": 0.10803692042827606, "learning_rate": 0.0001139240506329114, "loss": 0.1782, "step": 180 }, { "epoch": 0.17184903868976975, "grad_norm": 0.08404573053121567, "learning_rate": 0.00011455696202531647, "loss": 0.1394, "step": 181 }, { "epoch": 0.17279848089247568, "grad_norm": 0.12790893018245697, "learning_rate": 0.00011518987341772151, "loss": 0.2157, "step": 182 }, { "epoch": 0.17374792309518158, "grad_norm": 0.09879907220602036, "learning_rate": 0.00011582278481012658, "loss": 0.1693, "step": 183 }, { "epoch": 0.17469736529788749, "grad_norm": 0.08092228323221207, "learning_rate": 0.00011645569620253166, "loss": 0.136, "step": 184 }, { "epoch": 0.1756468075005934, "grad_norm": 0.07660632580518723, "learning_rate": 0.00011708860759493671, "loss": 0.1332, "step": 185 }, { "epoch": 0.17659624970329932, "grad_norm": 0.07474201172590256, "learning_rate": 0.00011772151898734178, "loss": 0.1301, "step": 186 }, { "epoch": 0.17754569190600522, "grad_norm": 0.09162931889295578, "learning_rate": 0.00011835443037974685, "loss": 0.1407, "step": 187 }, { "epoch": 0.17849513410871112, "grad_norm": 0.08646775782108307, "learning_rate": 0.0001189873417721519, "loss": 0.139, "step": 188 }, { "epoch": 0.17944457631141705, "grad_norm": 0.0759253203868866, "learning_rate": 0.00011962025316455696, "loss": 0.1342, "step": 189 }, { "epoch": 0.18039401851412296, "grad_norm": 0.08292865008115768, "learning_rate": 0.00012025316455696203, "loss": 0.1389, "step": 190 }, { "epoch": 0.18134346071682886, "grad_norm": 0.12379574030637741, "learning_rate": 0.0001208860759493671, "loss": 0.1795, "step": 191 }, { "epoch": 0.18229290291953476, "grad_norm": 0.10240278393030167, "learning_rate": 0.00012151898734177217, "loss": 0.1721, "step": 192 }, { "epoch": 0.1832423451222407, "grad_norm": 0.09666036069393158, "learning_rate": 0.00012215189873417722, "loss": 0.1783, "step": 193 }, { "epoch": 0.1841917873249466, "grad_norm": 0.08314768224954605, "learning_rate": 0.0001227848101265823, "loss": 0.1429, "step": 194 }, { "epoch": 0.1851412295276525, "grad_norm": 0.07590368390083313, "learning_rate": 0.00012341772151898734, "loss": 0.1393, "step": 195 }, { "epoch": 0.18609067173035843, "grad_norm": 0.10585250705480576, "learning_rate": 0.0001240506329113924, "loss": 0.2155, "step": 196 }, { "epoch": 0.18704011393306433, "grad_norm": 0.06995555013418198, "learning_rate": 0.00012468354430379748, "loss": 0.1374, "step": 197 }, { "epoch": 0.18798955613577023, "grad_norm": 0.07370735704898834, "learning_rate": 0.00012531645569620255, "loss": 0.1367, "step": 198 }, { "epoch": 0.18893899833847613, "grad_norm": 0.07194443792104721, "learning_rate": 0.0001259493670886076, "loss": 0.1437, "step": 199 }, { "epoch": 0.18988844054118206, "grad_norm": 0.06982647627592087, "learning_rate": 0.00012658227848101267, "loss": 0.1358, "step": 200 }, { "epoch": 0.19083788274388797, "grad_norm": 0.06538347154855728, "learning_rate": 0.0001272151898734177, "loss": 0.1354, "step": 201 }, { "epoch": 0.19178732494659387, "grad_norm": 0.07789324969053268, "learning_rate": 0.00012784810126582278, "loss": 0.178, "step": 202 }, { "epoch": 0.1927367671492998, "grad_norm": 0.07376820594072342, "learning_rate": 0.00012848101265822785, "loss": 0.1621, "step": 203 }, { "epoch": 0.1936862093520057, "grad_norm": 0.0720745250582695, "learning_rate": 0.00012911392405063292, "loss": 0.132, "step": 204 }, { "epoch": 0.1946356515547116, "grad_norm": 0.06211116537451744, "learning_rate": 0.000129746835443038, "loss": 0.1387, "step": 205 }, { "epoch": 0.1955850937574175, "grad_norm": 0.06701771914958954, "learning_rate": 0.00013037974683544306, "loss": 0.14, "step": 206 }, { "epoch": 0.19653453596012344, "grad_norm": 0.07692532986402512, "learning_rate": 0.0001310126582278481, "loss": 0.1322, "step": 207 }, { "epoch": 0.19748397816282934, "grad_norm": 0.07763269543647766, "learning_rate": 0.00013164556962025315, "loss": 0.1393, "step": 208 }, { "epoch": 0.19843342036553524, "grad_norm": 0.08769022673368454, "learning_rate": 0.00013227848101265822, "loss": 0.1489, "step": 209 }, { "epoch": 0.19938286256824117, "grad_norm": 0.08881859481334686, "learning_rate": 0.0001329113924050633, "loss": 0.1765, "step": 210 }, { "epoch": 0.20033230477094707, "grad_norm": 0.06811822950839996, "learning_rate": 0.00013354430379746836, "loss": 0.1332, "step": 211 }, { "epoch": 0.20128174697365298, "grad_norm": 0.06390922516584396, "learning_rate": 0.00013417721518987343, "loss": 0.1343, "step": 212 }, { "epoch": 0.20223118917635888, "grad_norm": 0.06630406528711319, "learning_rate": 0.0001348101265822785, "loss": 0.1329, "step": 213 }, { "epoch": 0.2031806313790648, "grad_norm": 0.0730772465467453, "learning_rate": 0.00013544303797468355, "loss": 0.1354, "step": 214 }, { "epoch": 0.2041300735817707, "grad_norm": 0.06487323343753815, "learning_rate": 0.00013607594936708862, "loss": 0.1297, "step": 215 }, { "epoch": 0.2050795157844766, "grad_norm": 0.06967955082654953, "learning_rate": 0.00013670886075949366, "loss": 0.1398, "step": 216 }, { "epoch": 0.20602895798718254, "grad_norm": 0.08531820774078369, "learning_rate": 0.00013734177215189873, "loss": 0.1336, "step": 217 }, { "epoch": 0.20697840018988845, "grad_norm": 0.0757659375667572, "learning_rate": 0.0001379746835443038, "loss": 0.1606, "step": 218 }, { "epoch": 0.20792784239259435, "grad_norm": 0.060206469148397446, "learning_rate": 0.00013860759493670888, "loss": 0.1337, "step": 219 }, { "epoch": 0.20887728459530025, "grad_norm": 0.07996556162834167, "learning_rate": 0.00013924050632911395, "loss": 0.1308, "step": 220 }, { "epoch": 0.20982672679800618, "grad_norm": 0.06206861138343811, "learning_rate": 0.000139873417721519, "loss": 0.1347, "step": 221 }, { "epoch": 0.21077616900071208, "grad_norm": 0.08736416697502136, "learning_rate": 0.00014050632911392406, "loss": 0.1768, "step": 222 }, { "epoch": 0.21172561120341798, "grad_norm": 0.06427916139364243, "learning_rate": 0.00014113924050632913, "loss": 0.1374, "step": 223 }, { "epoch": 0.21267505340612392, "grad_norm": 0.10996536910533905, "learning_rate": 0.00014177215189873418, "loss": 0.222, "step": 224 }, { "epoch": 0.21362449560882982, "grad_norm": 0.08439125120639801, "learning_rate": 0.00014240506329113925, "loss": 0.1854, "step": 225 }, { "epoch": 0.21457393781153572, "grad_norm": 0.06892693787813187, "learning_rate": 0.00014303797468354432, "loss": 0.139, "step": 226 }, { "epoch": 0.21552338001424162, "grad_norm": 0.08241122961044312, "learning_rate": 0.0001436708860759494, "loss": 0.173, "step": 227 }, { "epoch": 0.21647282221694755, "grad_norm": 0.07911046594381332, "learning_rate": 0.00014430379746835443, "loss": 0.1418, "step": 228 }, { "epoch": 0.21742226441965345, "grad_norm": 0.06346064805984497, "learning_rate": 0.0001449367088607595, "loss": 0.1406, "step": 229 }, { "epoch": 0.21837170662235936, "grad_norm": 0.060393668711185455, "learning_rate": 0.00014556962025316457, "loss": 0.1417, "step": 230 }, { "epoch": 0.2193211488250653, "grad_norm": 0.05912507325410843, "learning_rate": 0.00014620253164556962, "loss": 0.1298, "step": 231 }, { "epoch": 0.2202705910277712, "grad_norm": 0.07730337232351303, "learning_rate": 0.0001468354430379747, "loss": 0.1769, "step": 232 }, { "epoch": 0.2212200332304771, "grad_norm": 0.07612381875514984, "learning_rate": 0.00014746835443037976, "loss": 0.1338, "step": 233 }, { "epoch": 0.222169475433183, "grad_norm": 0.055311791598796844, "learning_rate": 0.0001481012658227848, "loss": 0.1313, "step": 234 }, { "epoch": 0.22311891763588892, "grad_norm": 0.08492033183574677, "learning_rate": 0.00014873417721518987, "loss": 0.1367, "step": 235 }, { "epoch": 0.22406835983859483, "grad_norm": 0.07133237272500992, "learning_rate": 0.00014936708860759494, "loss": 0.1308, "step": 236 }, { "epoch": 0.22501780204130073, "grad_norm": 0.07148605585098267, "learning_rate": 0.00015000000000000001, "loss": 0.133, "step": 237 }, { "epoch": 0.22596724424400666, "grad_norm": 0.06900472939014435, "learning_rate": 0.00015063291139240508, "loss": 0.138, "step": 238 }, { "epoch": 0.22691668644671256, "grad_norm": 0.062325432896614075, "learning_rate": 0.00015126582278481013, "loss": 0.1338, "step": 239 }, { "epoch": 0.22786612864941846, "grad_norm": 0.06719667464494705, "learning_rate": 0.0001518987341772152, "loss": 0.1316, "step": 240 }, { "epoch": 0.22881557085212437, "grad_norm": 0.07456009089946747, "learning_rate": 0.00015253164556962024, "loss": 0.1412, "step": 241 }, { "epoch": 0.2297650130548303, "grad_norm": 0.05619575083255768, "learning_rate": 0.00015316455696202531, "loss": 0.1342, "step": 242 }, { "epoch": 0.2307144552575362, "grad_norm": 0.06157098710536957, "learning_rate": 0.00015379746835443038, "loss": 0.1329, "step": 243 }, { "epoch": 0.2316638974602421, "grad_norm": 0.06759827584028244, "learning_rate": 0.00015443037974683546, "loss": 0.1411, "step": 244 }, { "epoch": 0.232613339662948, "grad_norm": 0.06892479956150055, "learning_rate": 0.00015506329113924053, "loss": 0.1484, "step": 245 }, { "epoch": 0.23356278186565393, "grad_norm": 0.08536699414253235, "learning_rate": 0.0001556962025316456, "loss": 0.1855, "step": 246 }, { "epoch": 0.23451222406835984, "grad_norm": 0.06800314784049988, "learning_rate": 0.00015632911392405064, "loss": 0.1379, "step": 247 }, { "epoch": 0.23546166627106574, "grad_norm": 0.0625622496008873, "learning_rate": 0.00015696202531645568, "loss": 0.1344, "step": 248 }, { "epoch": 0.23641110847377167, "grad_norm": 0.06030593812465668, "learning_rate": 0.00015759493670886075, "loss": 0.1254, "step": 249 }, { "epoch": 0.23736055067647757, "grad_norm": 0.06694353371858597, "learning_rate": 0.00015822784810126583, "loss": 0.1413, "step": 250 }, { "epoch": 0.23830999287918347, "grad_norm": 0.06594134122133255, "learning_rate": 0.0001588607594936709, "loss": 0.1394, "step": 251 }, { "epoch": 0.23925943508188938, "grad_norm": 0.09062930941581726, "learning_rate": 0.00015949367088607597, "loss": 0.1883, "step": 252 }, { "epoch": 0.2402088772845953, "grad_norm": 0.06029089167714119, "learning_rate": 0.00016012658227848104, "loss": 0.1271, "step": 253 }, { "epoch": 0.2411583194873012, "grad_norm": 0.08471622318029404, "learning_rate": 0.00016075949367088608, "loss": 0.172, "step": 254 }, { "epoch": 0.2421077616900071, "grad_norm": 0.061710160225629807, "learning_rate": 0.00016139240506329115, "loss": 0.1348, "step": 255 }, { "epoch": 0.24305720389271304, "grad_norm": 0.0812671035528183, "learning_rate": 0.0001620253164556962, "loss": 0.1312, "step": 256 }, { "epoch": 0.24400664609541894, "grad_norm": 0.06917005032300949, "learning_rate": 0.00016265822784810127, "loss": 0.1464, "step": 257 }, { "epoch": 0.24495608829812485, "grad_norm": 0.0905887708067894, "learning_rate": 0.00016329113924050634, "loss": 0.1759, "step": 258 }, { "epoch": 0.24590553050083075, "grad_norm": 0.05976787209510803, "learning_rate": 0.0001639240506329114, "loss": 0.1404, "step": 259 }, { "epoch": 0.24685497270353668, "grad_norm": 0.07545675337314606, "learning_rate": 0.00016455696202531648, "loss": 0.1322, "step": 260 }, { "epoch": 0.24780441490624258, "grad_norm": 0.07035024464130402, "learning_rate": 0.00016518987341772152, "loss": 0.1378, "step": 261 }, { "epoch": 0.24875385710894848, "grad_norm": 0.07665737718343735, "learning_rate": 0.0001658227848101266, "loss": 0.1827, "step": 262 }, { "epoch": 0.24970329931165441, "grad_norm": 0.06619013845920563, "learning_rate": 0.00016645569620253166, "loss": 0.1284, "step": 263 }, { "epoch": 0.2506527415143603, "grad_norm": 0.0647001713514328, "learning_rate": 0.0001670886075949367, "loss": 0.133, "step": 264 }, { "epoch": 0.2516021837170662, "grad_norm": 0.060702718794345856, "learning_rate": 0.00016772151898734178, "loss": 0.1335, "step": 265 }, { "epoch": 0.25255162591977215, "grad_norm": 0.0508468896150589, "learning_rate": 0.00016835443037974685, "loss": 0.1333, "step": 266 }, { "epoch": 0.253501068122478, "grad_norm": 0.09877864271402359, "learning_rate": 0.0001689873417721519, "loss": 0.2031, "step": 267 }, { "epoch": 0.25445051032518395, "grad_norm": 0.06673337519168854, "learning_rate": 0.00016962025316455696, "loss": 0.1356, "step": 268 }, { "epoch": 0.2553999525278899, "grad_norm": 0.10604165494441986, "learning_rate": 0.00017025316455696204, "loss": 0.2517, "step": 269 }, { "epoch": 0.25634939473059576, "grad_norm": 0.07689858227968216, "learning_rate": 0.0001708860759493671, "loss": 0.1761, "step": 270 }, { "epoch": 0.2572988369333017, "grad_norm": 0.05482449755072594, "learning_rate": 0.00017151898734177218, "loss": 0.131, "step": 271 }, { "epoch": 0.2582482791360076, "grad_norm": 0.08622145652770996, "learning_rate": 0.00017215189873417722, "loss": 0.1335, "step": 272 }, { "epoch": 0.2591977213387135, "grad_norm": 0.0748213455080986, "learning_rate": 0.0001727848101265823, "loss": 0.176, "step": 273 }, { "epoch": 0.2601471635414194, "grad_norm": 0.06163305416703224, "learning_rate": 0.00017341772151898733, "loss": 0.1381, "step": 274 }, { "epoch": 0.26109660574412535, "grad_norm": 0.06141841039061546, "learning_rate": 0.0001740506329113924, "loss": 0.1353, "step": 275 }, { "epoch": 0.26204604794683123, "grad_norm": 0.07326913625001907, "learning_rate": 0.00017468354430379748, "loss": 0.1441, "step": 276 }, { "epoch": 0.26299549014953716, "grad_norm": 0.05951124057173729, "learning_rate": 0.00017531645569620255, "loss": 0.1321, "step": 277 }, { "epoch": 0.26394493235224303, "grad_norm": 0.08364073932170868, "learning_rate": 0.00017594936708860762, "loss": 0.187, "step": 278 }, { "epoch": 0.26489437455494896, "grad_norm": 0.05849132314324379, "learning_rate": 0.00017658227848101266, "loss": 0.1393, "step": 279 }, { "epoch": 0.2658438167576549, "grad_norm": 0.05452360957860947, "learning_rate": 0.00017721518987341773, "loss": 0.1342, "step": 280 }, { "epoch": 0.26679325896036077, "grad_norm": 0.04878188297152519, "learning_rate": 0.00017784810126582278, "loss": 0.1445, "step": 281 }, { "epoch": 0.2677427011630667, "grad_norm": 0.06066753342747688, "learning_rate": 0.00017848101265822785, "loss": 0.1423, "step": 282 }, { "epoch": 0.26869214336577263, "grad_norm": 0.04918207973241806, "learning_rate": 0.00017911392405063292, "loss": 0.1316, "step": 283 }, { "epoch": 0.2696415855684785, "grad_norm": 0.05103525519371033, "learning_rate": 0.000179746835443038, "loss": 0.1313, "step": 284 }, { "epoch": 0.27059102777118443, "grad_norm": 0.05667628347873688, "learning_rate": 0.00018037974683544306, "loss": 0.1434, "step": 285 }, { "epoch": 0.27154046997389036, "grad_norm": 0.06226016581058502, "learning_rate": 0.00018101265822784813, "loss": 0.1357, "step": 286 }, { "epoch": 0.27248991217659624, "grad_norm": 0.04695293679833412, "learning_rate": 0.00018164556962025317, "loss": 0.1314, "step": 287 }, { "epoch": 0.27343935437930217, "grad_norm": 0.05762844532728195, "learning_rate": 0.00018227848101265824, "loss": 0.1349, "step": 288 }, { "epoch": 0.27438879658200804, "grad_norm": 0.05454534292221069, "learning_rate": 0.0001829113924050633, "loss": 0.1432, "step": 289 }, { "epoch": 0.275338238784714, "grad_norm": 0.050270579755306244, "learning_rate": 0.00018354430379746836, "loss": 0.1272, "step": 290 }, { "epoch": 0.2762876809874199, "grad_norm": 0.0688452497124672, "learning_rate": 0.00018417721518987343, "loss": 0.1708, "step": 291 }, { "epoch": 0.2772371231901258, "grad_norm": 0.06213200092315674, "learning_rate": 0.0001848101265822785, "loss": 0.1674, "step": 292 }, { "epoch": 0.2781865653928317, "grad_norm": 0.059717319905757904, "learning_rate": 0.00018544303797468354, "loss": 0.169, "step": 293 }, { "epoch": 0.27913600759553764, "grad_norm": 0.06223325803875923, "learning_rate": 0.00018607594936708861, "loss": 0.1369, "step": 294 }, { "epoch": 0.2800854497982435, "grad_norm": 0.053163208067417145, "learning_rate": 0.00018670886075949369, "loss": 0.133, "step": 295 }, { "epoch": 0.28103489200094944, "grad_norm": 0.06647945195436478, "learning_rate": 0.00018734177215189873, "loss": 0.1438, "step": 296 }, { "epoch": 0.2819843342036554, "grad_norm": 0.0588272288441658, "learning_rate": 0.0001879746835443038, "loss": 0.1338, "step": 297 }, { "epoch": 0.28293377640636125, "grad_norm": 0.05841274932026863, "learning_rate": 0.00018860759493670887, "loss": 0.1329, "step": 298 }, { "epoch": 0.2838832186090672, "grad_norm": 0.09033369272947311, "learning_rate": 0.00018924050632911394, "loss": 0.1747, "step": 299 }, { "epoch": 0.2848326608117731, "grad_norm": 0.052215326577425, "learning_rate": 0.00018987341772151899, "loss": 0.1296, "step": 300 }, { "epoch": 0.285782103014479, "grad_norm": 0.05880101025104523, "learning_rate": 0.00019050632911392406, "loss": 0.1287, "step": 301 }, { "epoch": 0.2867315452171849, "grad_norm": 0.0691700354218483, "learning_rate": 0.00019113924050632913, "loss": 0.1676, "step": 302 }, { "epoch": 0.2876809874198908, "grad_norm": 0.057025909423828125, "learning_rate": 0.0001917721518987342, "loss": 0.1346, "step": 303 }, { "epoch": 0.2886304296225967, "grad_norm": 0.04936329275369644, "learning_rate": 0.00019240506329113924, "loss": 0.1354, "step": 304 }, { "epoch": 0.28957987182530265, "grad_norm": 0.0680055245757103, "learning_rate": 0.0001930379746835443, "loss": 0.1344, "step": 305 }, { "epoch": 0.2905293140280085, "grad_norm": 0.07374466210603714, "learning_rate": 0.00019367088607594938, "loss": 0.1428, "step": 306 }, { "epoch": 0.29147875623071445, "grad_norm": 0.061204761266708374, "learning_rate": 0.00019430379746835443, "loss": 0.1246, "step": 307 }, { "epoch": 0.2924281984334204, "grad_norm": 0.053467705845832825, "learning_rate": 0.0001949367088607595, "loss": 0.1342, "step": 308 }, { "epoch": 0.29337764063612626, "grad_norm": 0.057525087147951126, "learning_rate": 0.00019556962025316457, "loss": 0.1377, "step": 309 }, { "epoch": 0.2943270828388322, "grad_norm": 0.07857844978570938, "learning_rate": 0.00019620253164556964, "loss": 0.2076, "step": 310 }, { "epoch": 0.2952765250415381, "grad_norm": 0.05250545218586922, "learning_rate": 0.0001968354430379747, "loss": 0.1432, "step": 311 }, { "epoch": 0.296225967244244, "grad_norm": 0.07495012134313583, "learning_rate": 0.00019746835443037975, "loss": 0.1766, "step": 312 }, { "epoch": 0.2971754094469499, "grad_norm": 0.04692578688263893, "learning_rate": 0.0001981012658227848, "loss": 0.1408, "step": 313 }, { "epoch": 0.29812485164965585, "grad_norm": 0.055666085332632065, "learning_rate": 0.00019873417721518987, "loss": 0.1391, "step": 314 }, { "epoch": 0.29907429385236173, "grad_norm": 0.050465911626815796, "learning_rate": 0.00019936708860759494, "loss": 0.1415, "step": 315 }, { "epoch": 0.30002373605506766, "grad_norm": 0.051260240375995636, "learning_rate": 0.0002, "loss": 0.1423, "step": 316 }, { "epoch": 0.30097317825777353, "grad_norm": 0.0503215529024601, "learning_rate": 0.000199999938945738, "loss": 0.1348, "step": 317 }, { "epoch": 0.30192262046047946, "grad_norm": 0.04917483776807785, "learning_rate": 0.0001999997557830265, "loss": 0.1342, "step": 318 }, { "epoch": 0.3028720626631854, "grad_norm": 0.06354209035634995, "learning_rate": 0.00019999945051208916, "loss": 0.1365, "step": 319 }, { "epoch": 0.30382150486589127, "grad_norm": 0.04878314957022667, "learning_rate": 0.0001999990231332988, "loss": 0.13, "step": 320 }, { "epoch": 0.3047709470685972, "grad_norm": 0.07046223431825638, "learning_rate": 0.0001999984736471772, "loss": 0.1394, "step": 321 }, { "epoch": 0.30572038927130313, "grad_norm": 0.04456232488155365, "learning_rate": 0.00019999780205439538, "loss": 0.1278, "step": 322 }, { "epoch": 0.306669831474009, "grad_norm": 0.06280628591775894, "learning_rate": 0.00019999700835577342, "loss": 0.1715, "step": 323 }, { "epoch": 0.30761927367671493, "grad_norm": 0.07462131977081299, "learning_rate": 0.00019999609255228046, "loss": 0.1772, "step": 324 }, { "epoch": 0.30856871587942086, "grad_norm": 0.059642352163791656, "learning_rate": 0.00019999505464503482, "loss": 0.1294, "step": 325 }, { "epoch": 0.30951815808212674, "grad_norm": 0.06458820402622223, "learning_rate": 0.00019999389463530383, "loss": 0.173, "step": 326 }, { "epoch": 0.31046760028483267, "grad_norm": 0.05901939421892166, "learning_rate": 0.00019999261252450396, "loss": 0.1419, "step": 327 }, { "epoch": 0.3114170424875386, "grad_norm": 0.055540215224027634, "learning_rate": 0.00019999120831420083, "loss": 0.1314, "step": 328 }, { "epoch": 0.3123664846902445, "grad_norm": 0.0546739287674427, "learning_rate": 0.00019998968200610903, "loss": 0.1354, "step": 329 }, { "epoch": 0.3133159268929504, "grad_norm": 0.0689477026462555, "learning_rate": 0.00019998803360209234, "loss": 0.132, "step": 330 }, { "epoch": 0.3142653690956563, "grad_norm": 0.05279696360230446, "learning_rate": 0.00019998626310416365, "loss": 0.1424, "step": 331 }, { "epoch": 0.3152148112983622, "grad_norm": 0.055384278297424316, "learning_rate": 0.00019998437051448482, "loss": 0.141, "step": 332 }, { "epoch": 0.31616425350106814, "grad_norm": 0.04636182263493538, "learning_rate": 0.0001999823558353669, "loss": 0.1414, "step": 333 }, { "epoch": 0.317113695703774, "grad_norm": 0.04795726016163826, "learning_rate": 0.00019998021906926993, "loss": 0.1255, "step": 334 }, { "epoch": 0.31806313790647994, "grad_norm": 0.05326540395617485, "learning_rate": 0.00019997796021880318, "loss": 0.1309, "step": 335 }, { "epoch": 0.3190125801091859, "grad_norm": 0.0684736892580986, "learning_rate": 0.00019997557928672484, "loss": 0.1825, "step": 336 }, { "epoch": 0.31996202231189175, "grad_norm": 0.042282164096832275, "learning_rate": 0.0001999730762759422, "loss": 0.12, "step": 337 }, { "epoch": 0.3209114645145977, "grad_norm": 0.05297423154115677, "learning_rate": 0.00019997045118951175, "loss": 0.1309, "step": 338 }, { "epoch": 0.3218609067173036, "grad_norm": 0.080621138215065, "learning_rate": 0.00019996770403063883, "loss": 0.2134, "step": 339 }, { "epoch": 0.3228103489200095, "grad_norm": 0.05552308261394501, "learning_rate": 0.00019996483480267803, "loss": 0.1361, "step": 340 }, { "epoch": 0.3237597911227154, "grad_norm": 0.05070111155509949, "learning_rate": 0.00019996184350913287, "loss": 0.1314, "step": 341 }, { "epoch": 0.32470923332542134, "grad_norm": 0.04412266984581947, "learning_rate": 0.00019995873015365601, "loss": 0.1299, "step": 342 }, { "epoch": 0.3256586755281272, "grad_norm": 0.0445338599383831, "learning_rate": 0.00019995549474004917, "loss": 0.1313, "step": 343 }, { "epoch": 0.32660811773083315, "grad_norm": 0.08224980533123016, "learning_rate": 0.000199952137272263, "loss": 0.1844, "step": 344 }, { "epoch": 0.327557559933539, "grad_norm": 0.04331446811556816, "learning_rate": 0.0001999486577543972, "loss": 0.133, "step": 345 }, { "epoch": 0.32850700213624495, "grad_norm": 0.049314577132463455, "learning_rate": 0.00019994505619070068, "loss": 0.1351, "step": 346 }, { "epoch": 0.3294564443389509, "grad_norm": 0.0697011798620224, "learning_rate": 0.00019994133258557117, "loss": 0.1709, "step": 347 }, { "epoch": 0.33040588654165676, "grad_norm": 0.0510990135371685, "learning_rate": 0.00019993748694355557, "loss": 0.1365, "step": 348 }, { "epoch": 0.3313553287443627, "grad_norm": 0.05100785568356514, "learning_rate": 0.00019993351926934967, "loss": 0.1302, "step": 349 }, { "epoch": 0.3323047709470686, "grad_norm": 0.08001980185508728, "learning_rate": 0.00019992942956779838, "loss": 0.1736, "step": 350 }, { "epoch": 0.3332542131497745, "grad_norm": 0.05298507958650589, "learning_rate": 0.00019992521784389559, "loss": 0.159, "step": 351 }, { "epoch": 0.3342036553524804, "grad_norm": 0.04655485600233078, "learning_rate": 0.00019992088410278414, "loss": 0.1401, "step": 352 }, { "epoch": 0.33515309755518635, "grad_norm": 0.047509439289569855, "learning_rate": 0.00019991642834975594, "loss": 0.1369, "step": 353 }, { "epoch": 0.3361025397578922, "grad_norm": 0.046006906777620316, "learning_rate": 0.0001999118505902518, "loss": 0.1384, "step": 354 }, { "epoch": 0.33705198196059816, "grad_norm": 0.07522892951965332, "learning_rate": 0.00019990715082986155, "loss": 0.2254, "step": 355 }, { "epoch": 0.33800142416330403, "grad_norm": 0.048646144568920135, "learning_rate": 0.00019990232907432404, "loss": 0.1355, "step": 356 }, { "epoch": 0.33895086636600996, "grad_norm": 0.03941798582673073, "learning_rate": 0.000199897385329527, "loss": 0.1242, "step": 357 }, { "epoch": 0.3399003085687159, "grad_norm": 0.04582727700471878, "learning_rate": 0.0001998923196015072, "loss": 0.1347, "step": 358 }, { "epoch": 0.34084975077142177, "grad_norm": 0.05890033766627312, "learning_rate": 0.00019988713189645027, "loss": 0.1356, "step": 359 }, { "epoch": 0.3417991929741277, "grad_norm": 0.050398606806993484, "learning_rate": 0.00019988182222069093, "loss": 0.1379, "step": 360 }, { "epoch": 0.3427486351768336, "grad_norm": 0.053657352924346924, "learning_rate": 0.00019987639058071267, "loss": 0.1417, "step": 361 }, { "epoch": 0.3436980773795395, "grad_norm": 0.04928993433713913, "learning_rate": 0.00019987083698314804, "loss": 0.1269, "step": 362 }, { "epoch": 0.34464751958224543, "grad_norm": 0.04932550713419914, "learning_rate": 0.0001998651614347784, "loss": 0.1429, "step": 363 }, { "epoch": 0.34559696178495136, "grad_norm": 0.0531768873333931, "learning_rate": 0.00019985936394253413, "loss": 0.1367, "step": 364 }, { "epoch": 0.34654640398765724, "grad_norm": 0.05342009291052818, "learning_rate": 0.00019985344451349443, "loss": 0.1365, "step": 365 }, { "epoch": 0.34749584619036317, "grad_norm": 0.04960772022604942, "learning_rate": 0.00019984740315488742, "loss": 0.133, "step": 366 }, { "epoch": 0.3484452883930691, "grad_norm": 0.04490765556693077, "learning_rate": 0.00019984123987409013, "loss": 0.1347, "step": 367 }, { "epoch": 0.34939473059577497, "grad_norm": 0.05546121671795845, "learning_rate": 0.0001998349546786285, "loss": 0.169, "step": 368 }, { "epoch": 0.3503441727984809, "grad_norm": 0.04962169751524925, "learning_rate": 0.0001998285475761772, "loss": 0.1325, "step": 369 }, { "epoch": 0.3512936150011868, "grad_norm": 0.0451858825981617, "learning_rate": 0.00019982201857455988, "loss": 0.1291, "step": 370 }, { "epoch": 0.3522430572038927, "grad_norm": 0.07738906145095825, "learning_rate": 0.00019981536768174903, "loss": 0.1841, "step": 371 }, { "epoch": 0.35319249940659864, "grad_norm": 0.05104148015379906, "learning_rate": 0.000199808594905866, "loss": 0.1375, "step": 372 }, { "epoch": 0.3541419416093045, "grad_norm": 0.04850155860185623, "learning_rate": 0.00019980170025518082, "loss": 0.1335, "step": 373 }, { "epoch": 0.35509138381201044, "grad_norm": 0.050271324813365936, "learning_rate": 0.00019979468373811248, "loss": 0.1394, "step": 374 }, { "epoch": 0.35604082601471637, "grad_norm": 0.050799645483493805, "learning_rate": 0.0001997875453632288, "loss": 0.135, "step": 375 }, { "epoch": 0.35699026821742225, "grad_norm": 0.05703526735305786, "learning_rate": 0.00019978028513924627, "loss": 0.1371, "step": 376 }, { "epoch": 0.3579397104201282, "grad_norm": 0.06665853410959244, "learning_rate": 0.00019977290307503028, "loss": 0.1837, "step": 377 }, { "epoch": 0.3588891526228341, "grad_norm": 0.04639972746372223, "learning_rate": 0.000199765399179595, "loss": 0.1315, "step": 378 }, { "epoch": 0.35983859482554, "grad_norm": 0.07625308632850647, "learning_rate": 0.00019975777346210326, "loss": 0.2064, "step": 379 }, { "epoch": 0.3607880370282459, "grad_norm": 0.048770248889923096, "learning_rate": 0.00019975002593186674, "loss": 0.1363, "step": 380 }, { "epoch": 0.36173747923095184, "grad_norm": 0.04932136833667755, "learning_rate": 0.00019974215659834582, "loss": 0.1374, "step": 381 }, { "epoch": 0.3626869214336577, "grad_norm": 0.03848756104707718, "learning_rate": 0.00019973416547114964, "loss": 0.1333, "step": 382 }, { "epoch": 0.36363636363636365, "grad_norm": 0.04468891769647598, "learning_rate": 0.00019972605256003605, "loss": 0.129, "step": 383 }, { "epoch": 0.3645858058390695, "grad_norm": 0.048413511365652084, "learning_rate": 0.0001997178178749116, "loss": 0.1314, "step": 384 }, { "epoch": 0.36553524804177545, "grad_norm": 0.045054856687784195, "learning_rate": 0.00019970946142583155, "loss": 0.1323, "step": 385 }, { "epoch": 0.3664846902444814, "grad_norm": 0.05541200935840607, "learning_rate": 0.00019970098322299982, "loss": 0.1342, "step": 386 }, { "epoch": 0.36743413244718726, "grad_norm": 0.06861472874879837, "learning_rate": 0.00019969238327676906, "loss": 0.1347, "step": 387 }, { "epoch": 0.3683835746498932, "grad_norm": 0.043996453285217285, "learning_rate": 0.00019968366159764047, "loss": 0.132, "step": 388 }, { "epoch": 0.3693330168525991, "grad_norm": 0.06562239676713943, "learning_rate": 0.000199674818196264, "loss": 0.1759, "step": 389 }, { "epoch": 0.370282459055305, "grad_norm": 0.04714899882674217, "learning_rate": 0.00019966585308343822, "loss": 0.1274, "step": 390 }, { "epoch": 0.3712319012580109, "grad_norm": 0.04736959934234619, "learning_rate": 0.00019965676627011026, "loss": 0.1265, "step": 391 }, { "epoch": 0.37218134346071685, "grad_norm": 0.056829433888196945, "learning_rate": 0.0001996475577673759, "loss": 0.1402, "step": 392 }, { "epoch": 0.3731307856634227, "grad_norm": 0.0426231250166893, "learning_rate": 0.00019963822758647953, "loss": 0.1364, "step": 393 }, { "epoch": 0.37408022786612866, "grad_norm": 0.07376877963542938, "learning_rate": 0.00019962877573881404, "loss": 0.2042, "step": 394 }, { "epoch": 0.3750296700688346, "grad_norm": 0.043273668736219406, "learning_rate": 0.00019961920223592104, "loss": 0.132, "step": 395 }, { "epoch": 0.37597911227154046, "grad_norm": 0.044406965374946594, "learning_rate": 0.00019960950708949052, "loss": 0.1344, "step": 396 }, { "epoch": 0.3769285544742464, "grad_norm": 0.040342606604099274, "learning_rate": 0.00019959969031136106, "loss": 0.1214, "step": 397 }, { "epoch": 0.37787799667695227, "grad_norm": 0.05118388682603836, "learning_rate": 0.00019958975191351983, "loss": 0.14, "step": 398 }, { "epoch": 0.3788274388796582, "grad_norm": 0.045876793563365936, "learning_rate": 0.00019957969190810245, "loss": 0.1335, "step": 399 }, { "epoch": 0.3797768810823641, "grad_norm": 0.0645332932472229, "learning_rate": 0.00019956951030739308, "loss": 0.1702, "step": 400 }, { "epoch": 0.38072632328507, "grad_norm": 0.05039132386445999, "learning_rate": 0.00019955920712382423, "loss": 0.136, "step": 401 }, { "epoch": 0.38167576548777593, "grad_norm": 0.052004653960466385, "learning_rate": 0.00019954878236997704, "loss": 0.1386, "step": 402 }, { "epoch": 0.38262520769048186, "grad_norm": 0.05021458491683006, "learning_rate": 0.00019953823605858105, "loss": 0.1378, "step": 403 }, { "epoch": 0.38357464989318774, "grad_norm": 0.058653559535741806, "learning_rate": 0.0001995275682025141, "loss": 0.1437, "step": 404 }, { "epoch": 0.38452409209589367, "grad_norm": 0.04466673359274864, "learning_rate": 0.00019951677881480264, "loss": 0.1334, "step": 405 }, { "epoch": 0.3854735342985996, "grad_norm": 0.06119415909051895, "learning_rate": 0.00019950586790862138, "loss": 0.1296, "step": 406 }, { "epoch": 0.38642297650130547, "grad_norm": 0.04749077931046486, "learning_rate": 0.0001994948354972935, "loss": 0.1341, "step": 407 }, { "epoch": 0.3873724187040114, "grad_norm": 0.037752799689769745, "learning_rate": 0.00019948368159429053, "loss": 0.134, "step": 408 }, { "epoch": 0.38832186090671733, "grad_norm": 0.08903038501739502, "learning_rate": 0.00019947240621323226, "loss": 0.2155, "step": 409 }, { "epoch": 0.3892713031094232, "grad_norm": 0.03878140076994896, "learning_rate": 0.00019946100936788698, "loss": 0.1176, "step": 410 }, { "epoch": 0.39022074531212914, "grad_norm": 0.04927309602499008, "learning_rate": 0.00019944949107217113, "loss": 0.1344, "step": 411 }, { "epoch": 0.391170187514835, "grad_norm": 0.04933890327811241, "learning_rate": 0.00019943785134014962, "loss": 0.1315, "step": 412 }, { "epoch": 0.39211962971754094, "grad_norm": 0.06702516227960587, "learning_rate": 0.0001994260901860355, "loss": 0.1826, "step": 413 }, { "epoch": 0.39306907192024687, "grad_norm": 0.048132237046957016, "learning_rate": 0.00019941420762419014, "loss": 0.1436, "step": 414 }, { "epoch": 0.39401851412295275, "grad_norm": 0.07756894826889038, "learning_rate": 0.00019940220366912318, "loss": 0.2162, "step": 415 }, { "epoch": 0.3949679563256587, "grad_norm": 0.04789011925458908, "learning_rate": 0.00019939007833549242, "loss": 0.1295, "step": 416 }, { "epoch": 0.3959173985283646, "grad_norm": 0.04369444027543068, "learning_rate": 0.000199377831638104, "loss": 0.1322, "step": 417 }, { "epoch": 0.3968668407310705, "grad_norm": 0.05376122146844864, "learning_rate": 0.00019936546359191216, "loss": 0.1743, "step": 418 }, { "epoch": 0.3978162829337764, "grad_norm": 0.045930229127407074, "learning_rate": 0.0001993529742120193, "loss": 0.1336, "step": 419 }, { "epoch": 0.39876572513648234, "grad_norm": 0.039980966597795486, "learning_rate": 0.00019934036351367606, "loss": 0.1349, "step": 420 }, { "epoch": 0.3997151673391882, "grad_norm": 0.03797341510653496, "learning_rate": 0.00019932763151228115, "loss": 0.1256, "step": 421 }, { "epoch": 0.40066460954189415, "grad_norm": 0.04779914394021034, "learning_rate": 0.00019931477822338146, "loss": 0.1411, "step": 422 }, { "epoch": 0.4016140517446, "grad_norm": 0.040458668023347855, "learning_rate": 0.00019930180366267193, "loss": 0.126, "step": 423 }, { "epoch": 0.40256349394730595, "grad_norm": 0.04114462807774544, "learning_rate": 0.0001992887078459956, "loss": 0.127, "step": 424 }, { "epoch": 0.4035129361500119, "grad_norm": 0.048119012266397476, "learning_rate": 0.00019927549078934358, "loss": 0.1346, "step": 425 }, { "epoch": 0.40446237835271776, "grad_norm": 0.0545562319457531, "learning_rate": 0.00019926215250885504, "loss": 0.1387, "step": 426 }, { "epoch": 0.4054118205554237, "grad_norm": 0.052092909812927246, "learning_rate": 0.00019924869302081715, "loss": 0.1389, "step": 427 }, { "epoch": 0.4063612627581296, "grad_norm": 0.03847799077630043, "learning_rate": 0.0001992351123416651, "loss": 0.1234, "step": 428 }, { "epoch": 0.4073107049608355, "grad_norm": 0.0436912477016449, "learning_rate": 0.000199221410487982, "loss": 0.1362, "step": 429 }, { "epoch": 0.4082601471635414, "grad_norm": 0.04420888423919678, "learning_rate": 0.00019920758747649908, "loss": 0.1243, "step": 430 }, { "epoch": 0.40920958936624735, "grad_norm": 0.037297070026397705, "learning_rate": 0.00019919364332409535, "loss": 0.1331, "step": 431 }, { "epoch": 0.4101590315689532, "grad_norm": 0.03854360058903694, "learning_rate": 0.00019917957804779782, "loss": 0.1266, "step": 432 }, { "epoch": 0.41110847377165916, "grad_norm": 0.04071418195962906, "learning_rate": 0.00019916539166478137, "loss": 0.1292, "step": 433 }, { "epoch": 0.4120579159743651, "grad_norm": 0.04560808837413788, "learning_rate": 0.00019915108419236882, "loss": 0.1381, "step": 434 }, { "epoch": 0.41300735817707096, "grad_norm": 0.06313233822584152, "learning_rate": 0.00019913665564803078, "loss": 0.2031, "step": 435 }, { "epoch": 0.4139568003797769, "grad_norm": 0.04507524147629738, "learning_rate": 0.00019912210604938578, "loss": 0.1277, "step": 436 }, { "epoch": 0.41490624258248276, "grad_norm": 0.05048058554530144, "learning_rate": 0.00019910743541420007, "loss": 0.1315, "step": 437 }, { "epoch": 0.4158556847851887, "grad_norm": 0.04872648045420647, "learning_rate": 0.0001990926437603878, "loss": 0.1292, "step": 438 }, { "epoch": 0.4168051269878946, "grad_norm": 0.04400710016489029, "learning_rate": 0.00019907773110601075, "loss": 0.1236, "step": 439 }, { "epoch": 0.4177545691906005, "grad_norm": 0.051591627299785614, "learning_rate": 0.00019906269746927863, "loss": 0.1358, "step": 440 }, { "epoch": 0.41870401139330643, "grad_norm": 0.04288725182414055, "learning_rate": 0.00019904754286854877, "loss": 0.126, "step": 441 }, { "epoch": 0.41965345359601236, "grad_norm": 0.04984726384282112, "learning_rate": 0.00019903226732232622, "loss": 0.1326, "step": 442 }, { "epoch": 0.42060289579871823, "grad_norm": 0.041585132479667664, "learning_rate": 0.00019901687084926373, "loss": 0.136, "step": 443 }, { "epoch": 0.42155233800142417, "grad_norm": 0.05849035084247589, "learning_rate": 0.0001990013534681617, "loss": 0.1727, "step": 444 }, { "epoch": 0.4225017802041301, "grad_norm": 0.043387994170188904, "learning_rate": 0.00019898571519796817, "loss": 0.1393, "step": 445 }, { "epoch": 0.42345122240683597, "grad_norm": 0.05867496132850647, "learning_rate": 0.0001989699560577788, "loss": 0.1664, "step": 446 }, { "epoch": 0.4244006646095419, "grad_norm": 0.07019232958555222, "learning_rate": 0.00019895407606683685, "loss": 0.1653, "step": 447 }, { "epoch": 0.42535010681224783, "grad_norm": 0.04676515609025955, "learning_rate": 0.00019893807524453314, "loss": 0.1368, "step": 448 }, { "epoch": 0.4262995490149537, "grad_norm": 0.06640240550041199, "learning_rate": 0.00019892195361040607, "loss": 0.2089, "step": 449 }, { "epoch": 0.42724899121765963, "grad_norm": 0.044658735394477844, "learning_rate": 0.00019890571118414148, "loss": 0.1298, "step": 450 }, { "epoch": 0.4281984334203655, "grad_norm": 0.04810122773051262, "learning_rate": 0.00019888934798557278, "loss": 0.1288, "step": 451 }, { "epoch": 0.42914787562307144, "grad_norm": 0.0425436794757843, "learning_rate": 0.0001988728640346808, "loss": 0.1354, "step": 452 }, { "epoch": 0.43009731782577737, "grad_norm": 0.04513363912701607, "learning_rate": 0.0001988562593515939, "loss": 0.1346, "step": 453 }, { "epoch": 0.43104676002848324, "grad_norm": 0.052022870630025864, "learning_rate": 0.0001988395339565878, "loss": 0.1302, "step": 454 }, { "epoch": 0.4319962022311892, "grad_norm": 0.04852641373872757, "learning_rate": 0.0001988226878700856, "loss": 0.1388, "step": 455 }, { "epoch": 0.4329456444338951, "grad_norm": 0.04990584775805473, "learning_rate": 0.00019880572111265785, "loss": 0.1552, "step": 456 }, { "epoch": 0.433895086636601, "grad_norm": 0.052271679043769836, "learning_rate": 0.00019878863370502238, "loss": 0.1404, "step": 457 }, { "epoch": 0.4348445288393069, "grad_norm": 0.04795520752668381, "learning_rate": 0.00019877142566804436, "loss": 0.1341, "step": 458 }, { "epoch": 0.43579397104201284, "grad_norm": 0.048165664076805115, "learning_rate": 0.00019875409702273632, "loss": 0.1343, "step": 459 }, { "epoch": 0.4367434132447187, "grad_norm": 0.04213611036539078, "learning_rate": 0.000198736647790258, "loss": 0.1369, "step": 460 }, { "epoch": 0.43769285544742464, "grad_norm": 0.05819966271519661, "learning_rate": 0.00019871907799191632, "loss": 0.1615, "step": 461 }, { "epoch": 0.4386422976501306, "grad_norm": 0.057378821074962616, "learning_rate": 0.00019870138764916558, "loss": 0.175, "step": 462 }, { "epoch": 0.43959173985283645, "grad_norm": 0.0432853177189827, "learning_rate": 0.00019868357678360724, "loss": 0.1371, "step": 463 }, { "epoch": 0.4405411820555424, "grad_norm": 0.03890872746706009, "learning_rate": 0.0001986656454169898, "loss": 0.1332, "step": 464 }, { "epoch": 0.44149062425824825, "grad_norm": 0.04006613418459892, "learning_rate": 0.00019864759357120896, "loss": 0.1342, "step": 465 }, { "epoch": 0.4424400664609542, "grad_norm": 0.049053166061639786, "learning_rate": 0.00019862942126830767, "loss": 0.1756, "step": 466 }, { "epoch": 0.4433895086636601, "grad_norm": 0.03966079652309418, "learning_rate": 0.00019861112853047577, "loss": 0.1303, "step": 467 }, { "epoch": 0.444338950866366, "grad_norm": 0.04506433755159378, "learning_rate": 0.0001985927153800503, "loss": 0.136, "step": 468 }, { "epoch": 0.4452883930690719, "grad_norm": 0.04392915591597557, "learning_rate": 0.00019857418183951526, "loss": 0.1397, "step": 469 }, { "epoch": 0.44623783527177785, "grad_norm": 0.038007620722055435, "learning_rate": 0.0001985555279315017, "loss": 0.1246, "step": 470 }, { "epoch": 0.4471872774744837, "grad_norm": 0.048948097974061966, "learning_rate": 0.00019853675367878764, "loss": 0.1329, "step": 471 }, { "epoch": 0.44813671967718965, "grad_norm": 0.04174380376935005, "learning_rate": 0.00019851785910429806, "loss": 0.13, "step": 472 }, { "epoch": 0.4490861618798956, "grad_norm": 0.048575468361377716, "learning_rate": 0.00019849884423110478, "loss": 0.1385, "step": 473 }, { "epoch": 0.45003560408260146, "grad_norm": 0.05167670175433159, "learning_rate": 0.00019847970908242664, "loss": 0.1684, "step": 474 }, { "epoch": 0.4509850462853074, "grad_norm": 0.06849198788404465, "learning_rate": 0.00019846045368162923, "loss": 0.1795, "step": 475 }, { "epoch": 0.4519344884880133, "grad_norm": 0.044273603707551956, "learning_rate": 0.0001984410780522251, "loss": 0.1246, "step": 476 }, { "epoch": 0.4528839306907192, "grad_norm": 0.048194363713264465, "learning_rate": 0.00019842158221787353, "loss": 0.1366, "step": 477 }, { "epoch": 0.4538333728934251, "grad_norm": 0.033906418830156326, "learning_rate": 0.00019840196620238057, "loss": 0.1235, "step": 478 }, { "epoch": 0.454782815096131, "grad_norm": 0.043933141976594925, "learning_rate": 0.00019838223002969905, "loss": 0.1195, "step": 479 }, { "epoch": 0.45573225729883693, "grad_norm": 0.056823644787073135, "learning_rate": 0.00019836237372392854, "loss": 0.1757, "step": 480 }, { "epoch": 0.45668169950154286, "grad_norm": 0.07587820291519165, "learning_rate": 0.00019834239730931526, "loss": 0.1784, "step": 481 }, { "epoch": 0.45763114170424873, "grad_norm": 0.04008018970489502, "learning_rate": 0.0001983223008102521, "loss": 0.1306, "step": 482 }, { "epoch": 0.45858058390695466, "grad_norm": 0.05180038511753082, "learning_rate": 0.00019830208425127867, "loss": 0.1485, "step": 483 }, { "epoch": 0.4595300261096606, "grad_norm": 0.0691617876291275, "learning_rate": 0.00019828174765708104, "loss": 0.1249, "step": 484 }, { "epoch": 0.46047946831236647, "grad_norm": 0.0565367266535759, "learning_rate": 0.00019826129105249195, "loss": 0.1744, "step": 485 }, { "epoch": 0.4614289105150724, "grad_norm": 0.044927019625902176, "learning_rate": 0.00019824071446249072, "loss": 0.1341, "step": 486 }, { "epoch": 0.46237835271777833, "grad_norm": 0.04481721669435501, "learning_rate": 0.00019822001791220298, "loss": 0.1354, "step": 487 }, { "epoch": 0.4633277949204842, "grad_norm": 0.05233500525355339, "learning_rate": 0.0001981992014269011, "loss": 0.1501, "step": 488 }, { "epoch": 0.46427723712319013, "grad_norm": 0.044350553303956985, "learning_rate": 0.00019817826503200372, "loss": 0.1335, "step": 489 }, { "epoch": 0.465226679325896, "grad_norm": 0.03551819548010826, "learning_rate": 0.000198157208753076, "loss": 0.1322, "step": 490 }, { "epoch": 0.46617612152860194, "grad_norm": 0.04409592226147652, "learning_rate": 0.00019813603261582943, "loss": 0.1561, "step": 491 }, { "epoch": 0.46712556373130787, "grad_norm": 0.04842127487063408, "learning_rate": 0.0001981147366461219, "loss": 0.1296, "step": 492 }, { "epoch": 0.46807500593401374, "grad_norm": 0.04349881038069725, "learning_rate": 0.00019809332086995757, "loss": 0.1319, "step": 493 }, { "epoch": 0.4690244481367197, "grad_norm": 0.04413028433918953, "learning_rate": 0.00019807178531348698, "loss": 0.1321, "step": 494 }, { "epoch": 0.4699738903394256, "grad_norm": 0.03972313553094864, "learning_rate": 0.00019805013000300683, "loss": 0.1358, "step": 495 }, { "epoch": 0.4709233325421315, "grad_norm": 0.052269116044044495, "learning_rate": 0.00019802835496496012, "loss": 0.1389, "step": 496 }, { "epoch": 0.4718727747448374, "grad_norm": 0.0379653237760067, "learning_rate": 0.00019800646022593603, "loss": 0.1283, "step": 497 }, { "epoch": 0.47282221694754334, "grad_norm": 0.04370688647031784, "learning_rate": 0.0001979844458126699, "loss": 0.1278, "step": 498 }, { "epoch": 0.4737716591502492, "grad_norm": 0.03912369906902313, "learning_rate": 0.0001979623117520432, "loss": 0.1257, "step": 499 }, { "epoch": 0.47472110135295514, "grad_norm": 0.039594005793333054, "learning_rate": 0.00019794005807108352, "loss": 0.1375, "step": 500 }, { "epoch": 0.4756705435556611, "grad_norm": 0.03889892250299454, "learning_rate": 0.00019791768479696448, "loss": 0.13, "step": 501 }, { "epoch": 0.47661998575836695, "grad_norm": 0.03966660797595978, "learning_rate": 0.00019789519195700578, "loss": 0.1268, "step": 502 }, { "epoch": 0.4775694279610729, "grad_norm": 0.04501716047525406, "learning_rate": 0.00019787257957867306, "loss": 0.1423, "step": 503 }, { "epoch": 0.47851887016377875, "grad_norm": 0.06255436688661575, "learning_rate": 0.000197849847689578, "loss": 0.1799, "step": 504 }, { "epoch": 0.4794683123664847, "grad_norm": 0.050308458507061005, "learning_rate": 0.00019782699631747813, "loss": 0.1733, "step": 505 }, { "epoch": 0.4804177545691906, "grad_norm": 0.0357963964343071, "learning_rate": 0.00019780402549027698, "loss": 0.1268, "step": 506 }, { "epoch": 0.4813671967718965, "grad_norm": 0.03651968017220497, "learning_rate": 0.00019778093523602384, "loss": 0.1267, "step": 507 }, { "epoch": 0.4823166389746024, "grad_norm": 0.043042074888944626, "learning_rate": 0.0001977577255829139, "loss": 0.1256, "step": 508 }, { "epoch": 0.48326608117730835, "grad_norm": 0.07031014561653137, "learning_rate": 0.00019773439655928815, "loss": 0.1796, "step": 509 }, { "epoch": 0.4842155233800142, "grad_norm": 0.04429268836975098, "learning_rate": 0.00019771094819363326, "loss": 0.1298, "step": 510 }, { "epoch": 0.48516496558272015, "grad_norm": 0.0373898483812809, "learning_rate": 0.00019768738051458172, "loss": 0.1232, "step": 511 }, { "epoch": 0.4861144077854261, "grad_norm": 0.05853155627846718, "learning_rate": 0.00019766369355091166, "loss": 0.1694, "step": 512 }, { "epoch": 0.48706384998813196, "grad_norm": 0.05050895735621452, "learning_rate": 0.00019763988733154686, "loss": 0.1665, "step": 513 }, { "epoch": 0.4880132921908379, "grad_norm": 0.04074448347091675, "learning_rate": 0.0001976159618855568, "loss": 0.1336, "step": 514 }, { "epoch": 0.4889627343935438, "grad_norm": 0.03826110064983368, "learning_rate": 0.00019759191724215644, "loss": 0.132, "step": 515 }, { "epoch": 0.4899121765962497, "grad_norm": 0.04392875358462334, "learning_rate": 0.0001975677534307064, "loss": 0.1204, "step": 516 }, { "epoch": 0.4908616187989556, "grad_norm": 0.04615531116724014, "learning_rate": 0.0001975434704807127, "loss": 0.1358, "step": 517 }, { "epoch": 0.4918110610016615, "grad_norm": 0.053060565143823624, "learning_rate": 0.00019751906842182688, "loss": 0.1299, "step": 518 }, { "epoch": 0.49276050320436743, "grad_norm": 0.04905511438846588, "learning_rate": 0.00019749454728384594, "loss": 0.1284, "step": 519 }, { "epoch": 0.49370994540707336, "grad_norm": 0.04257996007800102, "learning_rate": 0.00019746990709671234, "loss": 0.1353, "step": 520 }, { "epoch": 0.49465938760977923, "grad_norm": 0.05581909418106079, "learning_rate": 0.0001974451478905138, "loss": 0.1594, "step": 521 }, { "epoch": 0.49560882981248516, "grad_norm": 0.04603990167379379, "learning_rate": 0.00019742026969548338, "loss": 0.1383, "step": 522 }, { "epoch": 0.4965582720151911, "grad_norm": 0.058511972427368164, "learning_rate": 0.00019739527254199958, "loss": 0.1725, "step": 523 }, { "epoch": 0.49750771421789697, "grad_norm": 0.03875808045268059, "learning_rate": 0.000197370156460586, "loss": 0.1405, "step": 524 }, { "epoch": 0.4984571564206029, "grad_norm": 0.040860000997781754, "learning_rate": 0.00019734492148191151, "loss": 0.139, "step": 525 }, { "epoch": 0.49940659862330883, "grad_norm": 0.06110459193587303, "learning_rate": 0.00019731956763679014, "loss": 0.223, "step": 526 }, { "epoch": 0.5003560408260147, "grad_norm": 0.05238598585128784, "learning_rate": 0.00019729409495618117, "loss": 0.1681, "step": 527 }, { "epoch": 0.5013054830287206, "grad_norm": 0.05180145800113678, "learning_rate": 0.00019726850347118885, "loss": 0.1743, "step": 528 }, { "epoch": 0.5022549252314266, "grad_norm": 0.05066410079598427, "learning_rate": 0.00019724279321306262, "loss": 0.1634, "step": 529 }, { "epoch": 0.5032043674341324, "grad_norm": 0.06856084614992142, "learning_rate": 0.00019721696421319684, "loss": 0.1685, "step": 530 }, { "epoch": 0.5041538096368383, "grad_norm": 0.045972324907779694, "learning_rate": 0.00019719101650313096, "loss": 0.1245, "step": 531 }, { "epoch": 0.5051032518395443, "grad_norm": 0.04522623121738434, "learning_rate": 0.00019716495011454934, "loss": 0.1367, "step": 532 }, { "epoch": 0.5060526940422502, "grad_norm": 0.0780516117811203, "learning_rate": 0.00019713876507928126, "loss": 0.1351, "step": 533 }, { "epoch": 0.507002136244956, "grad_norm": 0.04264210909605026, "learning_rate": 0.00019711246142930088, "loss": 0.1312, "step": 534 }, { "epoch": 0.507951578447662, "grad_norm": 0.059501100331544876, "learning_rate": 0.00019708603919672718, "loss": 0.1698, "step": 535 }, { "epoch": 0.5089010206503679, "grad_norm": 0.060105033218860626, "learning_rate": 0.00019705949841382396, "loss": 0.1303, "step": 536 }, { "epoch": 0.5098504628530738, "grad_norm": 0.04733967408537865, "learning_rate": 0.00019703283911299982, "loss": 0.1245, "step": 537 }, { "epoch": 0.5107999050557798, "grad_norm": 0.04254663735628128, "learning_rate": 0.00019700606132680798, "loss": 0.1343, "step": 538 }, { "epoch": 0.5117493472584856, "grad_norm": 0.06302463263273239, "learning_rate": 0.00019697916508794645, "loss": 0.1831, "step": 539 }, { "epoch": 0.5126987894611915, "grad_norm": 0.05301344394683838, "learning_rate": 0.0001969521504292578, "loss": 0.1316, "step": 540 }, { "epoch": 0.5136482316638975, "grad_norm": 0.04151083528995514, "learning_rate": 0.00019692501738372922, "loss": 0.1335, "step": 541 }, { "epoch": 0.5145976738666034, "grad_norm": 0.05647062510251999, "learning_rate": 0.00019689776598449257, "loss": 0.1688, "step": 542 }, { "epoch": 0.5155471160693093, "grad_norm": 0.037060294300317764, "learning_rate": 0.000196870396264824, "loss": 0.1339, "step": 543 }, { "epoch": 0.5164965582720152, "grad_norm": 0.04036247730255127, "learning_rate": 0.0001968429082581443, "loss": 0.1361, "step": 544 }, { "epoch": 0.5174460004747211, "grad_norm": 0.040889665484428406, "learning_rate": 0.00019681530199801875, "loss": 0.1356, "step": 545 }, { "epoch": 0.518395442677427, "grad_norm": 0.0538480207324028, "learning_rate": 0.00019678757751815686, "loss": 0.1689, "step": 546 }, { "epoch": 0.519344884880133, "grad_norm": 0.04074794426560402, "learning_rate": 0.0001967597348524126, "loss": 0.1329, "step": 547 }, { "epoch": 0.5202943270828388, "grad_norm": 0.03896891698241234, "learning_rate": 0.00019673177403478428, "loss": 0.1356, "step": 548 }, { "epoch": 0.5212437692855447, "grad_norm": 0.04619259387254715, "learning_rate": 0.00019670369509941442, "loss": 0.163, "step": 549 }, { "epoch": 0.5221932114882507, "grad_norm": 0.035968657582998276, "learning_rate": 0.00019667549808058976, "loss": 0.1242, "step": 550 }, { "epoch": 0.5231426536909566, "grad_norm": 0.04564007744193077, "learning_rate": 0.0001966471830127413, "loss": 0.1364, "step": 551 }, { "epoch": 0.5240920958936625, "grad_norm": 0.03991610184311867, "learning_rate": 0.00019661874993044415, "loss": 0.1312, "step": 552 }, { "epoch": 0.5250415380963683, "grad_norm": 0.037240512669086456, "learning_rate": 0.00019659019886841752, "loss": 0.1279, "step": 553 }, { "epoch": 0.5259909802990743, "grad_norm": 0.06598762422800064, "learning_rate": 0.00019656152986152468, "loss": 0.2165, "step": 554 }, { "epoch": 0.5269404225017802, "grad_norm": 0.03867746889591217, "learning_rate": 0.00019653274294477292, "loss": 0.1233, "step": 555 }, { "epoch": 0.5278898647044861, "grad_norm": 0.051915477961301804, "learning_rate": 0.00019650383815331357, "loss": 0.168, "step": 556 }, { "epoch": 0.528839306907192, "grad_norm": 0.054896485060453415, "learning_rate": 0.00019647481552244182, "loss": 0.1678, "step": 557 }, { "epoch": 0.5297887491098979, "grad_norm": 0.05439051240682602, "learning_rate": 0.00019644567508759675, "loss": 0.1607, "step": 558 }, { "epoch": 0.5307381913126038, "grad_norm": 0.03601578250527382, "learning_rate": 0.00019641641688436135, "loss": 0.1271, "step": 559 }, { "epoch": 0.5316876335153098, "grad_norm": 0.06025104597210884, "learning_rate": 0.00019638704094846236, "loss": 0.176, "step": 560 }, { "epoch": 0.5326370757180157, "grad_norm": 0.04126368835568428, "learning_rate": 0.00019635754731577032, "loss": 0.1319, "step": 561 }, { "epoch": 0.5335865179207215, "grad_norm": 0.05305393040180206, "learning_rate": 0.00019632793602229943, "loss": 0.1699, "step": 562 }, { "epoch": 0.5345359601234275, "grad_norm": 0.03538331016898155, "learning_rate": 0.00019629820710420764, "loss": 0.124, "step": 563 }, { "epoch": 0.5354854023261334, "grad_norm": 0.05861300975084305, "learning_rate": 0.0001962683605977965, "loss": 0.1688, "step": 564 }, { "epoch": 0.5364348445288393, "grad_norm": 0.040226079523563385, "learning_rate": 0.0001962383965395111, "loss": 0.1334, "step": 565 }, { "epoch": 0.5373842867315453, "grad_norm": 0.035788875073194504, "learning_rate": 0.00019620831496594017, "loss": 0.1281, "step": 566 }, { "epoch": 0.5383337289342511, "grad_norm": 0.0334162712097168, "learning_rate": 0.0001961781159138158, "loss": 0.1317, "step": 567 }, { "epoch": 0.539283171136957, "grad_norm": 0.03352081775665283, "learning_rate": 0.00019614779942001364, "loss": 0.1334, "step": 568 }, { "epoch": 0.540232613339663, "grad_norm": 0.03684060648083687, "learning_rate": 0.00019611736552155274, "loss": 0.1349, "step": 569 }, { "epoch": 0.5411820555423689, "grad_norm": 0.03640671446919441, "learning_rate": 0.00019608681425559542, "loss": 0.1278, "step": 570 }, { "epoch": 0.5421314977450747, "grad_norm": 0.04167250171303749, "learning_rate": 0.00019605614565944748, "loss": 0.1384, "step": 571 }, { "epoch": 0.5430809399477807, "grad_norm": 0.0416824147105217, "learning_rate": 0.00019602535977055778, "loss": 0.1319, "step": 572 }, { "epoch": 0.5440303821504866, "grad_norm": 0.03897137567400932, "learning_rate": 0.00019599445662651861, "loss": 0.1389, "step": 573 }, { "epoch": 0.5449798243531925, "grad_norm": 0.03894896060228348, "learning_rate": 0.00019596343626506526, "loss": 0.1341, "step": 574 }, { "epoch": 0.5459292665558985, "grad_norm": 0.04211690276861191, "learning_rate": 0.00019593229872407627, "loss": 0.1377, "step": 575 }, { "epoch": 0.5468787087586043, "grad_norm": 0.04308454692363739, "learning_rate": 0.00019590104404157327, "loss": 0.1268, "step": 576 }, { "epoch": 0.5478281509613102, "grad_norm": 0.0525001622736454, "learning_rate": 0.00019586967225572086, "loss": 0.1775, "step": 577 }, { "epoch": 0.5487775931640161, "grad_norm": 0.056315965950489044, "learning_rate": 0.00019583818340482664, "loss": 0.1688, "step": 578 }, { "epoch": 0.5497270353667221, "grad_norm": 0.03801283985376358, "learning_rate": 0.0001958065775273412, "loss": 0.1309, "step": 579 }, { "epoch": 0.550676477569428, "grad_norm": 0.03738854080438614, "learning_rate": 0.00019577485466185804, "loss": 0.137, "step": 580 }, { "epoch": 0.5516259197721338, "grad_norm": 0.03772661089897156, "learning_rate": 0.0001957430148471134, "loss": 0.1276, "step": 581 }, { "epoch": 0.5525753619748398, "grad_norm": 0.039842378348112106, "learning_rate": 0.00019571105812198652, "loss": 0.1329, "step": 582 }, { "epoch": 0.5535248041775457, "grad_norm": 0.033689334988594055, "learning_rate": 0.0001956789845254992, "loss": 0.1265, "step": 583 }, { "epoch": 0.5544742463802516, "grad_norm": 0.046588387340307236, "learning_rate": 0.00019564679409681608, "loss": 0.1645, "step": 584 }, { "epoch": 0.5554236885829575, "grad_norm": 0.03861064463853836, "learning_rate": 0.0001956144868752444, "loss": 0.1267, "step": 585 }, { "epoch": 0.5563731307856634, "grad_norm": 0.03467525169253349, "learning_rate": 0.000195582062900234, "loss": 0.1299, "step": 586 }, { "epoch": 0.5573225729883693, "grad_norm": 0.03659389913082123, "learning_rate": 0.0001955495222113774, "loss": 0.1286, "step": 587 }, { "epoch": 0.5582720151910753, "grad_norm": 0.03826770931482315, "learning_rate": 0.0001955168648484095, "loss": 0.1313, "step": 588 }, { "epoch": 0.5592214573937812, "grad_norm": 0.038110729306936264, "learning_rate": 0.00019548409085120772, "loss": 0.137, "step": 589 }, { "epoch": 0.560170899596487, "grad_norm": 0.03989555314183235, "learning_rate": 0.0001954512002597919, "loss": 0.132, "step": 590 }, { "epoch": 0.561120341799193, "grad_norm": 0.05395180359482765, "learning_rate": 0.00019541819311432427, "loss": 0.1401, "step": 591 }, { "epoch": 0.5620697840018989, "grad_norm": 0.05007918179035187, "learning_rate": 0.00019538506945510938, "loss": 0.1584, "step": 592 }, { "epoch": 0.5630192262046048, "grad_norm": 0.047849785536527634, "learning_rate": 0.00019535182932259404, "loss": 0.1265, "step": 593 }, { "epoch": 0.5639686684073107, "grad_norm": 0.04303041473031044, "learning_rate": 0.00019531847275736726, "loss": 0.1245, "step": 594 }, { "epoch": 0.5649181106100166, "grad_norm": 0.04128289222717285, "learning_rate": 0.00019528499980016025, "loss": 0.1317, "step": 595 }, { "epoch": 0.5658675528127225, "grad_norm": 0.04311414808034897, "learning_rate": 0.00019525141049184637, "loss": 0.1364, "step": 596 }, { "epoch": 0.5668169950154285, "grad_norm": 0.03765838220715523, "learning_rate": 0.00019521770487344103, "loss": 0.1268, "step": 597 }, { "epoch": 0.5677664372181344, "grad_norm": 0.03674585744738579, "learning_rate": 0.00019518388298610164, "loss": 0.1297, "step": 598 }, { "epoch": 0.5687158794208402, "grad_norm": 0.036937762051820755, "learning_rate": 0.0001951499448711276, "loss": 0.1303, "step": 599 }, { "epoch": 0.5696653216235462, "grad_norm": 0.03748161345720291, "learning_rate": 0.0001951158905699603, "loss": 0.1328, "step": 600 }, { "epoch": 0.5706147638262521, "grad_norm": 0.04011257737874985, "learning_rate": 0.00019508172012418283, "loss": 0.1346, "step": 601 }, { "epoch": 0.571564206028958, "grad_norm": 0.03853931650519371, "learning_rate": 0.00019504743357552035, "loss": 0.1279, "step": 602 }, { "epoch": 0.572513648231664, "grad_norm": 0.03750459849834442, "learning_rate": 0.0001950130309658396, "loss": 0.1227, "step": 603 }, { "epoch": 0.5734630904343698, "grad_norm": 0.05542079731822014, "learning_rate": 0.00019497851233714908, "loss": 0.1647, "step": 604 }, { "epoch": 0.5744125326370757, "grad_norm": 0.04472218081355095, "learning_rate": 0.00019494387773159898, "loss": 0.1416, "step": 605 }, { "epoch": 0.5753619748397816, "grad_norm": 0.052323974668979645, "learning_rate": 0.00019490912719148114, "loss": 0.1367, "step": 606 }, { "epoch": 0.5763114170424876, "grad_norm": 0.037580832839012146, "learning_rate": 0.00019487426075922893, "loss": 0.131, "step": 607 }, { "epoch": 0.5772608592451934, "grad_norm": 0.03929577395319939, "learning_rate": 0.0001948392784774172, "loss": 0.128, "step": 608 }, { "epoch": 0.5782103014478993, "grad_norm": 0.03706606104969978, "learning_rate": 0.0001948041803887623, "loss": 0.1316, "step": 609 }, { "epoch": 0.5791597436506053, "grad_norm": 0.038938358426094055, "learning_rate": 0.00019476896653612203, "loss": 0.1275, "step": 610 }, { "epoch": 0.5801091858533112, "grad_norm": 0.04818068817257881, "learning_rate": 0.00019473363696249546, "loss": 0.1662, "step": 611 }, { "epoch": 0.581058628056017, "grad_norm": 0.03735940158367157, "learning_rate": 0.00019469819171102304, "loss": 0.1361, "step": 612 }, { "epoch": 0.582008070258723, "grad_norm": 0.03568827733397484, "learning_rate": 0.00019466263082498645, "loss": 0.1216, "step": 613 }, { "epoch": 0.5829575124614289, "grad_norm": 0.03913251310586929, "learning_rate": 0.0001946269543478085, "loss": 0.1321, "step": 614 }, { "epoch": 0.5839069546641348, "grad_norm": 0.062009479850530624, "learning_rate": 0.0001945911623230533, "loss": 0.1778, "step": 615 }, { "epoch": 0.5848563968668408, "grad_norm": 0.039088111370801926, "learning_rate": 0.0001945552547944259, "loss": 0.1352, "step": 616 }, { "epoch": 0.5858058390695466, "grad_norm": 0.041976600885391235, "learning_rate": 0.0001945192318057725, "loss": 0.1394, "step": 617 }, { "epoch": 0.5867552812722525, "grad_norm": 0.03723563253879547, "learning_rate": 0.00019448309340108018, "loss": 0.1246, "step": 618 }, { "epoch": 0.5877047234749585, "grad_norm": 0.0382399819791317, "learning_rate": 0.00019444683962447707, "loss": 0.1232, "step": 619 }, { "epoch": 0.5886541656776644, "grad_norm": 0.03758077695965767, "learning_rate": 0.0001944104705202321, "loss": 0.1417, "step": 620 }, { "epoch": 0.5896036078803703, "grad_norm": 0.034823786467313766, "learning_rate": 0.000194373986132755, "loss": 0.1304, "step": 621 }, { "epoch": 0.5905530500830762, "grad_norm": 0.03755120187997818, "learning_rate": 0.00019433738650659641, "loss": 0.133, "step": 622 }, { "epoch": 0.5915024922857821, "grad_norm": 0.03759913146495819, "learning_rate": 0.00019430067168644754, "loss": 0.1222, "step": 623 }, { "epoch": 0.592451934488488, "grad_norm": 0.06232694163918495, "learning_rate": 0.0001942638417171403, "loss": 0.1778, "step": 624 }, { "epoch": 0.593401376691194, "grad_norm": 0.05642306059598923, "learning_rate": 0.00019422689664364725, "loss": 0.1706, "step": 625 }, { "epoch": 0.5943508188938998, "grad_norm": 0.0827709287405014, "learning_rate": 0.00019418983651108148, "loss": 0.2371, "step": 626 }, { "epoch": 0.5953002610966057, "grad_norm": 0.03614366054534912, "learning_rate": 0.00019415266136469652, "loss": 0.1225, "step": 627 }, { "epoch": 0.5962497032993117, "grad_norm": 0.042416494339704514, "learning_rate": 0.00019411537124988643, "loss": 0.1239, "step": 628 }, { "epoch": 0.5971991455020176, "grad_norm": 0.037246908992528915, "learning_rate": 0.00019407796621218566, "loss": 0.1292, "step": 629 }, { "epoch": 0.5981485877047235, "grad_norm": 0.05374092981219292, "learning_rate": 0.00019404044629726887, "loss": 0.1782, "step": 630 }, { "epoch": 0.5990980299074293, "grad_norm": 0.052854426205158234, "learning_rate": 0.00019400281155095112, "loss": 0.1711, "step": 631 }, { "epoch": 0.6000474721101353, "grad_norm": 0.038800131529569626, "learning_rate": 0.00019396506201918765, "loss": 0.1285, "step": 632 }, { "epoch": 0.6009969143128412, "grad_norm": 0.040118250995874405, "learning_rate": 0.0001939271977480738, "loss": 0.1335, "step": 633 }, { "epoch": 0.6019463565155471, "grad_norm": 0.07007341086864471, "learning_rate": 0.00019388921878384517, "loss": 0.2115, "step": 634 }, { "epoch": 0.602895798718253, "grad_norm": 0.03245210647583008, "learning_rate": 0.0001938511251728772, "loss": 0.1304, "step": 635 }, { "epoch": 0.6038452409209589, "grad_norm": 0.03384733200073242, "learning_rate": 0.00019381291696168553, "loss": 0.1297, "step": 636 }, { "epoch": 0.6047946831236648, "grad_norm": 0.04325825348496437, "learning_rate": 0.0001937745941969256, "loss": 0.1337, "step": 637 }, { "epoch": 0.6057441253263708, "grad_norm": 0.046986173838377, "learning_rate": 0.00019373615692539275, "loss": 0.1385, "step": 638 }, { "epoch": 0.6066935675290767, "grad_norm": 0.03726234659552574, "learning_rate": 0.0001936976051940222, "loss": 0.1393, "step": 639 }, { "epoch": 0.6076430097317825, "grad_norm": 0.05574486404657364, "learning_rate": 0.0001936589390498889, "loss": 0.1698, "step": 640 }, { "epoch": 0.6085924519344885, "grad_norm": 0.052818477153778076, "learning_rate": 0.0001936201585402075, "loss": 0.1722, "step": 641 }, { "epoch": 0.6095418941371944, "grad_norm": 0.03535636141896248, "learning_rate": 0.00019358126371233231, "loss": 0.129, "step": 642 }, { "epoch": 0.6104913363399003, "grad_norm": 0.03453061729669571, "learning_rate": 0.00019354225461375724, "loss": 0.1313, "step": 643 }, { "epoch": 0.6114407785426063, "grad_norm": 0.030467770993709564, "learning_rate": 0.0001935031312921157, "loss": 0.1241, "step": 644 }, { "epoch": 0.6123902207453121, "grad_norm": 0.03996508568525314, "learning_rate": 0.0001934638937951806, "loss": 0.1265, "step": 645 }, { "epoch": 0.613339662948018, "grad_norm": 0.034416794776916504, "learning_rate": 0.00019342454217086429, "loss": 0.1341, "step": 646 }, { "epoch": 0.614289105150724, "grad_norm": 0.03674698621034622, "learning_rate": 0.00019338507646721845, "loss": 0.1399, "step": 647 }, { "epoch": 0.6152385473534299, "grad_norm": 0.037850040942430496, "learning_rate": 0.0001933454967324341, "loss": 0.1295, "step": 648 }, { "epoch": 0.6161879895561357, "grad_norm": 0.037829235196113586, "learning_rate": 0.0001933058030148414, "loss": 0.1302, "step": 649 }, { "epoch": 0.6171374317588417, "grad_norm": 0.03579702973365784, "learning_rate": 0.00019326599536290983, "loss": 0.1352, "step": 650 }, { "epoch": 0.6180868739615476, "grad_norm": 0.052539851516485214, "learning_rate": 0.00019322607382524785, "loss": 0.1744, "step": 651 }, { "epoch": 0.6190363161642535, "grad_norm": 0.03814668953418732, "learning_rate": 0.0001931860384506031, "loss": 0.1303, "step": 652 }, { "epoch": 0.6199857583669595, "grad_norm": 0.03730069473385811, "learning_rate": 0.00019314588928786224, "loss": 0.1236, "step": 653 }, { "epoch": 0.6209352005696653, "grad_norm": 0.04081875458359718, "learning_rate": 0.00019310562638605078, "loss": 0.1328, "step": 654 }, { "epoch": 0.6218846427723712, "grad_norm": 0.03532617911696434, "learning_rate": 0.00019306524979433308, "loss": 0.1238, "step": 655 }, { "epoch": 0.6228340849750772, "grad_norm": 0.035857025533914566, "learning_rate": 0.00019302475956201254, "loss": 0.1244, "step": 656 }, { "epoch": 0.6237835271777831, "grad_norm": 0.036031339317560196, "learning_rate": 0.0001929841557385311, "loss": 0.1251, "step": 657 }, { "epoch": 0.624732969380489, "grad_norm": 0.037832874804735184, "learning_rate": 0.00019294343837346944, "loss": 0.1262, "step": 658 }, { "epoch": 0.6256824115831948, "grad_norm": 0.03651989623904228, "learning_rate": 0.00019290260751654706, "loss": 0.1239, "step": 659 }, { "epoch": 0.6266318537859008, "grad_norm": 0.03595907241106033, "learning_rate": 0.00019286166321762184, "loss": 0.1342, "step": 660 }, { "epoch": 0.6275812959886067, "grad_norm": 0.04714696854352951, "learning_rate": 0.00019282060552669025, "loss": 0.1712, "step": 661 }, { "epoch": 0.6285307381913126, "grad_norm": 0.0448799654841423, "learning_rate": 0.00019277943449388726, "loss": 0.1601, "step": 662 }, { "epoch": 0.6294801803940185, "grad_norm": 0.03133920207619667, "learning_rate": 0.0001927381501694862, "loss": 0.127, "step": 663 }, { "epoch": 0.6304296225967244, "grad_norm": 0.051593225449323654, "learning_rate": 0.00019269675260389876, "loss": 0.1659, "step": 664 }, { "epoch": 0.6313790647994303, "grad_norm": 0.03713349625468254, "learning_rate": 0.0001926552418476749, "loss": 0.1294, "step": 665 }, { "epoch": 0.6323285070021363, "grad_norm": 0.03420734405517578, "learning_rate": 0.00019261361795150275, "loss": 0.1376, "step": 666 }, { "epoch": 0.6332779492048421, "grad_norm": 0.04476429522037506, "learning_rate": 0.00019257188096620867, "loss": 0.1595, "step": 667 }, { "epoch": 0.634227391407548, "grad_norm": 0.05289504677057266, "learning_rate": 0.00019253003094275707, "loss": 0.1589, "step": 668 }, { "epoch": 0.635176833610254, "grad_norm": 0.042022526264190674, "learning_rate": 0.0001924880679322504, "loss": 0.1316, "step": 669 }, { "epoch": 0.6361262758129599, "grad_norm": 0.0408223457634449, "learning_rate": 0.00019244599198592907, "loss": 0.1386, "step": 670 }, { "epoch": 0.6370757180156658, "grad_norm": 0.03941584751009941, "learning_rate": 0.00019240380315517142, "loss": 0.1325, "step": 671 }, { "epoch": 0.6380251602183717, "grad_norm": 0.03860325738787651, "learning_rate": 0.00019236150149149357, "loss": 0.1215, "step": 672 }, { "epoch": 0.6389746024210776, "grad_norm": 0.0342581607401371, "learning_rate": 0.00019231908704654948, "loss": 0.1247, "step": 673 }, { "epoch": 0.6399240446237835, "grad_norm": 0.04099750518798828, "learning_rate": 0.00019227655987213077, "loss": 0.1335, "step": 674 }, { "epoch": 0.6408734868264895, "grad_norm": 0.031005796045064926, "learning_rate": 0.00019223392002016678, "loss": 0.1297, "step": 675 }, { "epoch": 0.6418229290291954, "grad_norm": 0.05248212069272995, "learning_rate": 0.0001921911675427244, "loss": 0.1737, "step": 676 }, { "epoch": 0.6427723712319012, "grad_norm": 0.04168983921408653, "learning_rate": 0.00019214830249200806, "loss": 0.1373, "step": 677 }, { "epoch": 0.6437218134346072, "grad_norm": 0.03659060224890709, "learning_rate": 0.0001921053249203596, "loss": 0.1263, "step": 678 }, { "epoch": 0.6446712556373131, "grad_norm": 0.042256928980350494, "learning_rate": 0.00019206223488025834, "loss": 0.1646, "step": 679 }, { "epoch": 0.645620697840019, "grad_norm": 0.04420709237456322, "learning_rate": 0.00019201903242432086, "loss": 0.1577, "step": 680 }, { "epoch": 0.646570140042725, "grad_norm": 0.03781798109412193, "learning_rate": 0.00019197571760530107, "loss": 0.1253, "step": 681 }, { "epoch": 0.6475195822454308, "grad_norm": 0.03728644549846649, "learning_rate": 0.00019193229047609003, "loss": 0.1423, "step": 682 }, { "epoch": 0.6484690244481367, "grad_norm": 0.05171523615717888, "learning_rate": 0.00019188875108971598, "loss": 0.177, "step": 683 }, { "epoch": 0.6494184666508427, "grad_norm": 0.05022161453962326, "learning_rate": 0.0001918450994993442, "loss": 0.1616, "step": 684 }, { "epoch": 0.6503679088535486, "grad_norm": 0.037774864584207535, "learning_rate": 0.00019180133575827707, "loss": 0.1257, "step": 685 }, { "epoch": 0.6513173510562544, "grad_norm": 0.056198425590991974, "learning_rate": 0.00019175745991995377, "loss": 0.1751, "step": 686 }, { "epoch": 0.6522667932589603, "grad_norm": 0.05259314179420471, "learning_rate": 0.0001917134720379505, "loss": 0.1655, "step": 687 }, { "epoch": 0.6532162354616663, "grad_norm": 0.04018954187631607, "learning_rate": 0.00019166937216598013, "loss": 0.1178, "step": 688 }, { "epoch": 0.6541656776643722, "grad_norm": 0.057170454412698746, "learning_rate": 0.00019162516035789247, "loss": 0.1744, "step": 689 }, { "epoch": 0.655115119867078, "grad_norm": 0.04647281393408775, "learning_rate": 0.00019158083666767381, "loss": 0.1343, "step": 690 }, { "epoch": 0.656064562069784, "grad_norm": 0.056390274316072464, "learning_rate": 0.00019153640114944723, "loss": 0.2029, "step": 691 }, { "epoch": 0.6570140042724899, "grad_norm": 0.03656432405114174, "learning_rate": 0.00019149185385747224, "loss": 0.1249, "step": 692 }, { "epoch": 0.6579634464751958, "grad_norm": 0.031422629952430725, "learning_rate": 0.0001914471948461449, "loss": 0.1232, "step": 693 }, { "epoch": 0.6589128886779018, "grad_norm": 0.0463186614215374, "learning_rate": 0.00019140242416999765, "loss": 0.1675, "step": 694 }, { "epoch": 0.6598623308806076, "grad_norm": 0.03907819464802742, "learning_rate": 0.0001913575418836993, "loss": 0.1307, "step": 695 }, { "epoch": 0.6608117730833135, "grad_norm": 0.04354274645447731, "learning_rate": 0.00019131254804205498, "loss": 0.1381, "step": 696 }, { "epoch": 0.6617612152860195, "grad_norm": 0.0355788990855217, "learning_rate": 0.00019126744270000598, "loss": 0.1273, "step": 697 }, { "epoch": 0.6627106574887254, "grad_norm": 0.0382835678756237, "learning_rate": 0.0001912222259126298, "loss": 0.1184, "step": 698 }, { "epoch": 0.6636600996914312, "grad_norm": 0.05007009580731392, "learning_rate": 0.00019117689773513993, "loss": 0.1751, "step": 699 }, { "epoch": 0.6646095418941372, "grad_norm": 0.05426732823252678, "learning_rate": 0.000191131458222886, "loss": 0.175, "step": 700 }, { "epoch": 0.6655589840968431, "grad_norm": 0.033966466784477234, "learning_rate": 0.00019108590743135352, "loss": 0.123, "step": 701 }, { "epoch": 0.666508426299549, "grad_norm": 0.04007060080766678, "learning_rate": 0.00019104024541616386, "loss": 0.1386, "step": 702 }, { "epoch": 0.667457868502255, "grad_norm": 0.05075724050402641, "learning_rate": 0.00019099447223307423, "loss": 0.1698, "step": 703 }, { "epoch": 0.6684073107049608, "grad_norm": 0.04677930474281311, "learning_rate": 0.00019094858793797757, "loss": 0.1633, "step": 704 }, { "epoch": 0.6693567529076667, "grad_norm": 0.04063379392027855, "learning_rate": 0.00019090259258690263, "loss": 0.1414, "step": 705 }, { "epoch": 0.6703061951103727, "grad_norm": 0.039291396737098694, "learning_rate": 0.00019085648623601352, "loss": 0.1273, "step": 706 }, { "epoch": 0.6712556373130786, "grad_norm": 0.04960642755031586, "learning_rate": 0.00019081026894161008, "loss": 0.1512, "step": 707 }, { "epoch": 0.6722050795157845, "grad_norm": 0.04266348108649254, "learning_rate": 0.00019076394076012756, "loss": 0.1352, "step": 708 }, { "epoch": 0.6731545217184904, "grad_norm": 0.03943296894431114, "learning_rate": 0.00019071750174813663, "loss": 0.1332, "step": 709 }, { "epoch": 0.6741039639211963, "grad_norm": 0.04927997291088104, "learning_rate": 0.0001906709519623433, "loss": 0.1645, "step": 710 }, { "epoch": 0.6750534061239022, "grad_norm": 0.0418451763689518, "learning_rate": 0.00019062429145958877, "loss": 0.1279, "step": 711 }, { "epoch": 0.6760028483266081, "grad_norm": 0.04283139482140541, "learning_rate": 0.0001905775202968495, "loss": 0.1388, "step": 712 }, { "epoch": 0.676952290529314, "grad_norm": 0.05674710497260094, "learning_rate": 0.00019053063853123714, "loss": 0.171, "step": 713 }, { "epoch": 0.6779017327320199, "grad_norm": 0.03568726405501366, "learning_rate": 0.00019048364621999825, "loss": 0.1329, "step": 714 }, { "epoch": 0.6788511749347258, "grad_norm": 0.03796301409602165, "learning_rate": 0.00019043654342051447, "loss": 0.1352, "step": 715 }, { "epoch": 0.6798006171374318, "grad_norm": 0.03538963943719864, "learning_rate": 0.00019038933019030233, "loss": 0.1328, "step": 716 }, { "epoch": 0.6807500593401377, "grad_norm": 0.05234035104513168, "learning_rate": 0.00019034200658701322, "loss": 0.1649, "step": 717 }, { "epoch": 0.6816995015428435, "grad_norm": 0.03719701990485191, "learning_rate": 0.00019029457266843327, "loss": 0.1295, "step": 718 }, { "epoch": 0.6826489437455495, "grad_norm": 0.03594352304935455, "learning_rate": 0.00019024702849248335, "loss": 0.128, "step": 719 }, { "epoch": 0.6835983859482554, "grad_norm": 0.04097168892621994, "learning_rate": 0.00019019937411721895, "loss": 0.1331, "step": 720 }, { "epoch": 0.6845478281509613, "grad_norm": 0.03943239524960518, "learning_rate": 0.00019015160960083013, "loss": 0.1337, "step": 721 }, { "epoch": 0.6854972703536673, "grad_norm": 0.0411958172917366, "learning_rate": 0.00019010373500164145, "loss": 0.1603, "step": 722 }, { "epoch": 0.6864467125563731, "grad_norm": 0.05295250564813614, "learning_rate": 0.00019005575037811184, "loss": 0.1644, "step": 723 }, { "epoch": 0.687396154759079, "grad_norm": 0.03916552662849426, "learning_rate": 0.00019000765578883465, "loss": 0.135, "step": 724 }, { "epoch": 0.688345596961785, "grad_norm": 0.03871094062924385, "learning_rate": 0.00018995945129253745, "loss": 0.1276, "step": 725 }, { "epoch": 0.6892950391644909, "grad_norm": 0.03405594825744629, "learning_rate": 0.00018991113694808204, "loss": 0.1327, "step": 726 }, { "epoch": 0.6902444813671967, "grad_norm": 0.03824371099472046, "learning_rate": 0.00018986271281446436, "loss": 0.1357, "step": 727 }, { "epoch": 0.6911939235699027, "grad_norm": 0.03813684731721878, "learning_rate": 0.0001898141789508144, "loss": 0.1341, "step": 728 }, { "epoch": 0.6921433657726086, "grad_norm": 0.03283112868666649, "learning_rate": 0.0001897655354163962, "loss": 0.1299, "step": 729 }, { "epoch": 0.6930928079753145, "grad_norm": 0.03226768597960472, "learning_rate": 0.00018971678227060757, "loss": 0.1272, "step": 730 }, { "epoch": 0.6940422501780205, "grad_norm": 0.037317484617233276, "learning_rate": 0.0001896679195729803, "loss": 0.1339, "step": 731 }, { "epoch": 0.6949916923807263, "grad_norm": 0.05428892746567726, "learning_rate": 0.0001896189473831799, "loss": 0.1667, "step": 732 }, { "epoch": 0.6959411345834322, "grad_norm": 0.04177982360124588, "learning_rate": 0.0001895698657610056, "loss": 0.1337, "step": 733 }, { "epoch": 0.6968905767861382, "grad_norm": 0.041572730988264084, "learning_rate": 0.00018952067476639024, "loss": 0.1332, "step": 734 }, { "epoch": 0.6978400189888441, "grad_norm": 0.03430505469441414, "learning_rate": 0.00018947137445940023, "loss": 0.1265, "step": 735 }, { "epoch": 0.6987894611915499, "grad_norm": 0.03863980621099472, "learning_rate": 0.00018942196490023542, "loss": 0.1337, "step": 736 }, { "epoch": 0.6997389033942559, "grad_norm": 0.06445252895355225, "learning_rate": 0.00018937244614922912, "loss": 0.2032, "step": 737 }, { "epoch": 0.7006883455969618, "grad_norm": 0.03358490392565727, "learning_rate": 0.00018932281826684793, "loss": 0.127, "step": 738 }, { "epoch": 0.7016377877996677, "grad_norm": 0.034341324120759964, "learning_rate": 0.00018927308131369173, "loss": 0.1303, "step": 739 }, { "epoch": 0.7025872300023736, "grad_norm": 0.035848621279001236, "learning_rate": 0.00018922323535049354, "loss": 0.1272, "step": 740 }, { "epoch": 0.7035366722050795, "grad_norm": 0.03865866735577583, "learning_rate": 0.0001891732804381196, "loss": 0.136, "step": 741 }, { "epoch": 0.7044861144077854, "grad_norm": 0.045944251120090485, "learning_rate": 0.0001891232166375691, "loss": 0.1741, "step": 742 }, { "epoch": 0.7054355566104913, "grad_norm": 0.04418769106268883, "learning_rate": 0.00018907304400997418, "loss": 0.1504, "step": 743 }, { "epoch": 0.7063849988131973, "grad_norm": 0.062257930636405945, "learning_rate": 0.0001890227626165999, "loss": 0.1786, "step": 744 }, { "epoch": 0.7073344410159031, "grad_norm": 0.037457846105098724, "learning_rate": 0.00018897237251884415, "loss": 0.1389, "step": 745 }, { "epoch": 0.708283883218609, "grad_norm": 0.039091672748327255, "learning_rate": 0.0001889218737782375, "loss": 0.1264, "step": 746 }, { "epoch": 0.709233325421315, "grad_norm": 0.035011596977710724, "learning_rate": 0.00018887126645644324, "loss": 0.1363, "step": 747 }, { "epoch": 0.7101827676240209, "grad_norm": 0.104104183614254, "learning_rate": 0.00018882055061525722, "loss": 0.1588, "step": 748 }, { "epoch": 0.7111322098267268, "grad_norm": 0.03222833201289177, "learning_rate": 0.0001887697263166078, "loss": 0.1259, "step": 749 }, { "epoch": 0.7120816520294327, "grad_norm": 0.049904145300388336, "learning_rate": 0.0001887187936225558, "loss": 0.1676, "step": 750 }, { "epoch": 0.7130310942321386, "grad_norm": 0.15150390565395355, "learning_rate": 0.00018866775259529435, "loss": 0.1369, "step": 751 }, { "epoch": 0.7139805364348445, "grad_norm": 0.03994397446513176, "learning_rate": 0.0001886166032971489, "loss": 0.1294, "step": 752 }, { "epoch": 0.7149299786375505, "grad_norm": 0.06274881213903427, "learning_rate": 0.00018856534579057713, "loss": 0.1659, "step": 753 }, { "epoch": 0.7158794208402564, "grad_norm": 0.04001612216234207, "learning_rate": 0.00018851398013816883, "loss": 0.1305, "step": 754 }, { "epoch": 0.7168288630429622, "grad_norm": 0.03961142525076866, "learning_rate": 0.0001884625064026458, "loss": 0.1265, "step": 755 }, { "epoch": 0.7177783052456682, "grad_norm": 0.033916253596544266, "learning_rate": 0.00018841092464686186, "loss": 0.1336, "step": 756 }, { "epoch": 0.7187277474483741, "grad_norm": 0.040992431342601776, "learning_rate": 0.00018835923493380278, "loss": 0.1403, "step": 757 }, { "epoch": 0.71967718965108, "grad_norm": 0.03410341590642929, "learning_rate": 0.00018830743732658608, "loss": 0.1233, "step": 758 }, { "epoch": 0.720626631853786, "grad_norm": 0.05984083190560341, "learning_rate": 0.000188255531888461, "loss": 0.1417, "step": 759 }, { "epoch": 0.7215760740564918, "grad_norm": 0.03874243050813675, "learning_rate": 0.00018820351868280858, "loss": 0.1366, "step": 760 }, { "epoch": 0.7225255162591977, "grad_norm": 0.05256400629878044, "learning_rate": 0.00018815139777314136, "loss": 0.172, "step": 761 }, { "epoch": 0.7234749584619037, "grad_norm": 0.039005253463983536, "learning_rate": 0.0001880991692231034, "loss": 0.1312, "step": 762 }, { "epoch": 0.7244244006646096, "grad_norm": 0.04029637575149536, "learning_rate": 0.0001880468330964702, "loss": 0.1327, "step": 763 }, { "epoch": 0.7253738428673154, "grad_norm": 0.04493672773241997, "learning_rate": 0.00018799438945714866, "loss": 0.1555, "step": 764 }, { "epoch": 0.7263232850700213, "grad_norm": 0.03862634301185608, "learning_rate": 0.0001879418383691769, "loss": 0.133, "step": 765 }, { "epoch": 0.7272727272727273, "grad_norm": 0.03904002159833908, "learning_rate": 0.00018788917989672434, "loss": 0.1259, "step": 766 }, { "epoch": 0.7282221694754332, "grad_norm": 0.037936531007289886, "learning_rate": 0.0001878364141040914, "loss": 0.1263, "step": 767 }, { "epoch": 0.729171611678139, "grad_norm": 0.03802201896905899, "learning_rate": 0.0001877835410557096, "loss": 0.134, "step": 768 }, { "epoch": 0.730121053880845, "grad_norm": 0.03759211301803589, "learning_rate": 0.00018773056081614154, "loss": 0.1383, "step": 769 }, { "epoch": 0.7310704960835509, "grad_norm": 0.0498163104057312, "learning_rate": 0.0001876774734500805, "loss": 0.1573, "step": 770 }, { "epoch": 0.7320199382862568, "grad_norm": 0.036126043647527695, "learning_rate": 0.00018762427902235072, "loss": 0.1274, "step": 771 }, { "epoch": 0.7329693804889628, "grad_norm": 0.044809550046920776, "learning_rate": 0.0001875709775979071, "loss": 0.1703, "step": 772 }, { "epoch": 0.7339188226916686, "grad_norm": 0.050454337149858475, "learning_rate": 0.0001875175692418353, "loss": 0.1699, "step": 773 }, { "epoch": 0.7348682648943745, "grad_norm": 0.06160600110888481, "learning_rate": 0.00018746405401935142, "loss": 0.1806, "step": 774 }, { "epoch": 0.7358177070970805, "grad_norm": 0.05408332124352455, "learning_rate": 0.0001874104319958021, "loss": 0.1681, "step": 775 }, { "epoch": 0.7367671492997864, "grad_norm": 0.03859655559062958, "learning_rate": 0.00018735670323666442, "loss": 0.1297, "step": 776 }, { "epoch": 0.7377165915024922, "grad_norm": 0.05268474668264389, "learning_rate": 0.00018730286780754577, "loss": 0.1658, "step": 777 }, { "epoch": 0.7386660337051982, "grad_norm": 0.06406822055578232, "learning_rate": 0.00018724892577418381, "loss": 0.199, "step": 778 }, { "epoch": 0.7396154759079041, "grad_norm": 0.05488892272114754, "learning_rate": 0.00018719487720244638, "loss": 0.1669, "step": 779 }, { "epoch": 0.74056491811061, "grad_norm": 0.03732241317629814, "learning_rate": 0.00018714072215833132, "loss": 0.1337, "step": 780 }, { "epoch": 0.741514360313316, "grad_norm": 0.05548230558633804, "learning_rate": 0.00018708646070796664, "loss": 0.1652, "step": 781 }, { "epoch": 0.7424638025160218, "grad_norm": 0.06930623203516006, "learning_rate": 0.0001870320929176101, "loss": 0.1647, "step": 782 }, { "epoch": 0.7434132447187277, "grad_norm": 0.05485931411385536, "learning_rate": 0.0001869776188536495, "loss": 0.2149, "step": 783 }, { "epoch": 0.7443626869214337, "grad_norm": 0.03739183023571968, "learning_rate": 0.00018692303858260228, "loss": 0.1257, "step": 784 }, { "epoch": 0.7453121291241396, "grad_norm": 0.03913332521915436, "learning_rate": 0.00018686835217111557, "loss": 0.1293, "step": 785 }, { "epoch": 0.7462615713268455, "grad_norm": 0.03580600768327713, "learning_rate": 0.0001868135596859662, "loss": 0.1278, "step": 786 }, { "epoch": 0.7472110135295514, "grad_norm": 0.03586685657501221, "learning_rate": 0.00018675866119406042, "loss": 0.128, "step": 787 }, { "epoch": 0.7481604557322573, "grad_norm": 0.04061829298734665, "learning_rate": 0.00018670365676243397, "loss": 0.1256, "step": 788 }, { "epoch": 0.7491098979349632, "grad_norm": 0.03580275923013687, "learning_rate": 0.000186648546458252, "loss": 0.1265, "step": 789 }, { "epoch": 0.7500593401376692, "grad_norm": 0.04277309030294418, "learning_rate": 0.00018659333034880884, "loss": 0.1678, "step": 790 }, { "epoch": 0.751008782340375, "grad_norm": 0.03997024893760681, "learning_rate": 0.00018653800850152808, "loss": 0.1251, "step": 791 }, { "epoch": 0.7519582245430809, "grad_norm": 0.03809446841478348, "learning_rate": 0.0001864825809839624, "loss": 0.1354, "step": 792 }, { "epoch": 0.7529076667457868, "grad_norm": 0.05002079904079437, "learning_rate": 0.00018642704786379354, "loss": 0.1492, "step": 793 }, { "epoch": 0.7538571089484928, "grad_norm": 0.03734049201011658, "learning_rate": 0.00018637140920883217, "loss": 0.1328, "step": 794 }, { "epoch": 0.7548065511511987, "grad_norm": 0.034287337213754654, "learning_rate": 0.00018631566508701784, "loss": 0.1261, "step": 795 }, { "epoch": 0.7557559933539045, "grad_norm": 0.0322953499853611, "learning_rate": 0.00018625981556641882, "loss": 0.1251, "step": 796 }, { "epoch": 0.7567054355566105, "grad_norm": 0.03397887200117111, "learning_rate": 0.00018620386071523218, "loss": 0.1226, "step": 797 }, { "epoch": 0.7576548777593164, "grad_norm": 0.048685140907764435, "learning_rate": 0.0001861478006017836, "loss": 0.1677, "step": 798 }, { "epoch": 0.7586043199620223, "grad_norm": 0.06330600380897522, "learning_rate": 0.00018609163529452723, "loss": 0.2012, "step": 799 }, { "epoch": 0.7595537621647283, "grad_norm": 0.04262509569525719, "learning_rate": 0.00018603536486204564, "loss": 0.1271, "step": 800 }, { "epoch": 0.7605032043674341, "grad_norm": 0.04021213576197624, "learning_rate": 0.00018597898937304988, "loss": 0.1426, "step": 801 }, { "epoch": 0.76145264657014, "grad_norm": 0.05070256441831589, "learning_rate": 0.0001859225088963792, "loss": 0.209, "step": 802 }, { "epoch": 0.762402088772846, "grad_norm": 0.05344654247164726, "learning_rate": 0.00018586592350100113, "loss": 0.2093, "step": 803 }, { "epoch": 0.7633515309755519, "grad_norm": 0.03695262596011162, "learning_rate": 0.0001858092332560112, "loss": 0.1264, "step": 804 }, { "epoch": 0.7643009731782577, "grad_norm": 0.041282836347818375, "learning_rate": 0.00018575243823063306, "loss": 0.1275, "step": 805 }, { "epoch": 0.7652504153809637, "grad_norm": 0.038663093000650406, "learning_rate": 0.00018569553849421828, "loss": 0.1285, "step": 806 }, { "epoch": 0.7661998575836696, "grad_norm": 0.05324345454573631, "learning_rate": 0.00018563853411624628, "loss": 0.1691, "step": 807 }, { "epoch": 0.7671492997863755, "grad_norm": 0.0382021889090538, "learning_rate": 0.00018558142516632425, "loss": 0.1299, "step": 808 }, { "epoch": 0.7680987419890815, "grad_norm": 0.05059641972184181, "learning_rate": 0.00018552421171418712, "loss": 0.1685, "step": 809 }, { "epoch": 0.7690481841917873, "grad_norm": 0.041547179222106934, "learning_rate": 0.00018546689382969737, "loss": 0.1322, "step": 810 }, { "epoch": 0.7699976263944932, "grad_norm": 0.047367729246616364, "learning_rate": 0.00018540947158284503, "loss": 0.1662, "step": 811 }, { "epoch": 0.7709470685971992, "grad_norm": 0.07076044380664825, "learning_rate": 0.00018535194504374754, "loss": 0.1749, "step": 812 }, { "epoch": 0.7718965107999051, "grad_norm": 0.05194571986794472, "learning_rate": 0.00018529431428264973, "loss": 0.1595, "step": 813 }, { "epoch": 0.7728459530026109, "grad_norm": 0.034832440316677094, "learning_rate": 0.00018523657936992367, "loss": 0.1279, "step": 814 }, { "epoch": 0.7737953952053169, "grad_norm": 0.03709466755390167, "learning_rate": 0.00018517874037606862, "loss": 0.1161, "step": 815 }, { "epoch": 0.7747448374080228, "grad_norm": 0.03341936320066452, "learning_rate": 0.00018512079737171086, "loss": 0.1277, "step": 816 }, { "epoch": 0.7756942796107287, "grad_norm": 0.0411679781973362, "learning_rate": 0.00018506275042760382, "loss": 0.1284, "step": 817 }, { "epoch": 0.7766437218134347, "grad_norm": 0.04416754096746445, "learning_rate": 0.00018500459961462773, "loss": 0.1647, "step": 818 }, { "epoch": 0.7775931640161405, "grad_norm": 0.03680622950196266, "learning_rate": 0.00018494634500378966, "loss": 0.1371, "step": 819 }, { "epoch": 0.7785426062188464, "grad_norm": 0.037342917174100876, "learning_rate": 0.0001848879866662235, "loss": 0.1308, "step": 820 }, { "epoch": 0.7794920484215523, "grad_norm": 0.04237838089466095, "learning_rate": 0.00018482952467318976, "loss": 0.1623, "step": 821 }, { "epoch": 0.7804414906242583, "grad_norm": 0.04467133805155754, "learning_rate": 0.00018477095909607546, "loss": 0.1651, "step": 822 }, { "epoch": 0.7813909328269641, "grad_norm": 0.04672664403915405, "learning_rate": 0.00018471229000639424, "loss": 0.1735, "step": 823 }, { "epoch": 0.78234037502967, "grad_norm": 0.03545104339718819, "learning_rate": 0.00018465351747578597, "loss": 0.1342, "step": 824 }, { "epoch": 0.783289817232376, "grad_norm": 0.04771837964653969, "learning_rate": 0.000184594641576017, "loss": 0.175, "step": 825 }, { "epoch": 0.7842392594350819, "grad_norm": 0.03531822934746742, "learning_rate": 0.00018453566237897976, "loss": 0.1321, "step": 826 }, { "epoch": 0.7851887016377878, "grad_norm": 0.04098953306674957, "learning_rate": 0.00018447657995669295, "loss": 0.1372, "step": 827 }, { "epoch": 0.7861381438404937, "grad_norm": 0.053972020745277405, "learning_rate": 0.00018441739438130114, "loss": 0.1673, "step": 828 }, { "epoch": 0.7870875860431996, "grad_norm": 0.03818265721201897, "learning_rate": 0.00018435810572507507, "loss": 0.1322, "step": 829 }, { "epoch": 0.7880370282459055, "grad_norm": 0.033827316015958786, "learning_rate": 0.0001842987140604112, "loss": 0.1445, "step": 830 }, { "epoch": 0.7889864704486115, "grad_norm": 0.041385356336832047, "learning_rate": 0.00018423921945983179, "loss": 0.1332, "step": 831 }, { "epoch": 0.7899359126513174, "grad_norm": 0.03948013484477997, "learning_rate": 0.00018417962199598483, "loss": 0.1412, "step": 832 }, { "epoch": 0.7908853548540232, "grad_norm": 0.044912584125995636, "learning_rate": 0.00018411992174164393, "loss": 0.1684, "step": 833 }, { "epoch": 0.7918347970567292, "grad_norm": 0.03675195202231407, "learning_rate": 0.0001840601187697082, "loss": 0.1334, "step": 834 }, { "epoch": 0.7927842392594351, "grad_norm": 0.0349728949368, "learning_rate": 0.0001840002131532021, "loss": 0.1323, "step": 835 }, { "epoch": 0.793733681462141, "grad_norm": 0.03763123229146004, "learning_rate": 0.0001839402049652755, "loss": 0.1283, "step": 836 }, { "epoch": 0.794683123664847, "grad_norm": 0.036798711866140366, "learning_rate": 0.00018388009427920362, "loss": 0.1272, "step": 837 }, { "epoch": 0.7956325658675528, "grad_norm": 0.036771487444639206, "learning_rate": 0.00018381988116838663, "loss": 0.126, "step": 838 }, { "epoch": 0.7965820080702587, "grad_norm": 0.060571007430553436, "learning_rate": 0.00018375956570634987, "loss": 0.1736, "step": 839 }, { "epoch": 0.7975314502729647, "grad_norm": 0.0332857109606266, "learning_rate": 0.00018369914796674373, "loss": 0.1301, "step": 840 }, { "epoch": 0.7984808924756706, "grad_norm": 0.045279379934072495, "learning_rate": 0.00018363862802334334, "loss": 0.1602, "step": 841 }, { "epoch": 0.7994303346783764, "grad_norm": 0.03676297515630722, "learning_rate": 0.00018357800595004877, "loss": 0.1299, "step": 842 }, { "epoch": 0.8003797768810824, "grad_norm": 0.05098710209131241, "learning_rate": 0.0001835172818208847, "loss": 0.1289, "step": 843 }, { "epoch": 0.8013292190837883, "grad_norm": 0.047296855598688126, "learning_rate": 0.00018345645571000052, "loss": 0.1716, "step": 844 }, { "epoch": 0.8022786612864942, "grad_norm": 0.03570317476987839, "learning_rate": 0.00018339552769167003, "loss": 0.1337, "step": 845 }, { "epoch": 0.8032281034892, "grad_norm": 0.03380590304732323, "learning_rate": 0.00018333449784029156, "loss": 0.1218, "step": 846 }, { "epoch": 0.804177545691906, "grad_norm": 0.0340820774435997, "learning_rate": 0.00018327336623038778, "loss": 0.1324, "step": 847 }, { "epoch": 0.8051269878946119, "grad_norm": 0.03311248868703842, "learning_rate": 0.00018321213293660558, "loss": 0.1308, "step": 848 }, { "epoch": 0.8060764300973178, "grad_norm": 0.035102471709251404, "learning_rate": 0.00018315079803371605, "loss": 0.1345, "step": 849 }, { "epoch": 0.8070258723000238, "grad_norm": 0.03358345478773117, "learning_rate": 0.0001830893615966143, "loss": 0.1341, "step": 850 }, { "epoch": 0.8079753145027296, "grad_norm": 0.06460444629192352, "learning_rate": 0.00018302782370031948, "loss": 0.2051, "step": 851 }, { "epoch": 0.8089247567054355, "grad_norm": 0.033203575760126114, "learning_rate": 0.0001829661844199746, "loss": 0.1357, "step": 852 }, { "epoch": 0.8098741989081415, "grad_norm": 0.03588509559631348, "learning_rate": 0.0001829044438308465, "loss": 0.1335, "step": 853 }, { "epoch": 0.8108236411108474, "grad_norm": 0.04263895004987717, "learning_rate": 0.00018284260200832563, "loss": 0.1739, "step": 854 }, { "epoch": 0.8117730833135532, "grad_norm": 0.04004021733999252, "learning_rate": 0.00018278065902792618, "loss": 0.131, "step": 855 }, { "epoch": 0.8127225255162592, "grad_norm": 0.035174135118722916, "learning_rate": 0.00018271861496528584, "loss": 0.1248, "step": 856 }, { "epoch": 0.8136719677189651, "grad_norm": 0.03610173240303993, "learning_rate": 0.00018265646989616566, "loss": 0.1287, "step": 857 }, { "epoch": 0.814621409921671, "grad_norm": 0.035818714648485184, "learning_rate": 0.00018259422389645008, "loss": 0.1335, "step": 858 }, { "epoch": 0.815570852124377, "grad_norm": 0.03248162940144539, "learning_rate": 0.00018253187704214672, "loss": 0.1308, "step": 859 }, { "epoch": 0.8165202943270828, "grad_norm": 0.031658854335546494, "learning_rate": 0.00018246942940938646, "loss": 0.1339, "step": 860 }, { "epoch": 0.8174697365297887, "grad_norm": 0.035879503935575485, "learning_rate": 0.0001824068810744232, "loss": 0.1222, "step": 861 }, { "epoch": 0.8184191787324947, "grad_norm": 0.05258049815893173, "learning_rate": 0.0001823442321136337, "loss": 0.1937, "step": 862 }, { "epoch": 0.8193686209352006, "grad_norm": 0.03625549003481865, "learning_rate": 0.0001822814826035178, "loss": 0.1268, "step": 863 }, { "epoch": 0.8203180631379065, "grad_norm": 0.04990942031145096, "learning_rate": 0.00018221863262069793, "loss": 0.1661, "step": 864 }, { "epoch": 0.8212675053406124, "grad_norm": 0.0631263256072998, "learning_rate": 0.00018215568224191927, "loss": 0.2126, "step": 865 }, { "epoch": 0.8222169475433183, "grad_norm": 0.03726550564169884, "learning_rate": 0.00018209263154404958, "loss": 0.1334, "step": 866 }, { "epoch": 0.8231663897460242, "grad_norm": 0.040383536368608475, "learning_rate": 0.0001820294806040792, "loss": 0.1619, "step": 867 }, { "epoch": 0.8241158319487302, "grad_norm": 0.03525468334555626, "learning_rate": 0.00018196622949912078, "loss": 0.1263, "step": 868 }, { "epoch": 0.825065274151436, "grad_norm": 0.03585941344499588, "learning_rate": 0.00018190287830640933, "loss": 0.1245, "step": 869 }, { "epoch": 0.8260147163541419, "grad_norm": 0.03207286074757576, "learning_rate": 0.00018183942710330202, "loss": 0.1262, "step": 870 }, { "epoch": 0.8269641585568479, "grad_norm": 0.04638965427875519, "learning_rate": 0.00018177587596727822, "loss": 0.1653, "step": 871 }, { "epoch": 0.8279136007595538, "grad_norm": 0.030705489218235016, "learning_rate": 0.00018171222497593922, "loss": 0.1276, "step": 872 }, { "epoch": 0.8288630429622597, "grad_norm": 0.03139735013246536, "learning_rate": 0.00018164847420700837, "loss": 0.1344, "step": 873 }, { "epoch": 0.8298124851649655, "grad_norm": 0.039802953600883484, "learning_rate": 0.00018158462373833078, "loss": 0.1373, "step": 874 }, { "epoch": 0.8307619273676715, "grad_norm": 0.03284341096878052, "learning_rate": 0.00018152067364787325, "loss": 0.1236, "step": 875 }, { "epoch": 0.8317113695703774, "grad_norm": 0.056572429835796356, "learning_rate": 0.0001814566240137244, "loss": 0.1665, "step": 876 }, { "epoch": 0.8326608117730833, "grad_norm": 0.03471997380256653, "learning_rate": 0.00018139247491409424, "loss": 0.13, "step": 877 }, { "epoch": 0.8336102539757893, "grad_norm": 0.03601829707622528, "learning_rate": 0.00018132822642731426, "loss": 0.127, "step": 878 }, { "epoch": 0.8345596961784951, "grad_norm": 0.032708846032619476, "learning_rate": 0.00018126387863183737, "loss": 0.1264, "step": 879 }, { "epoch": 0.835509138381201, "grad_norm": 0.035340629518032074, "learning_rate": 0.00018119943160623773, "loss": 0.1334, "step": 880 }, { "epoch": 0.836458580583907, "grad_norm": 0.030397990718483925, "learning_rate": 0.00018113488542921061, "loss": 0.1254, "step": 881 }, { "epoch": 0.8374080227866129, "grad_norm": 0.03871999308466911, "learning_rate": 0.00018107024017957244, "loss": 0.132, "step": 882 }, { "epoch": 0.8383574649893187, "grad_norm": 0.04331507533788681, "learning_rate": 0.00018100549593626052, "loss": 0.1354, "step": 883 }, { "epoch": 0.8393069071920247, "grad_norm": 0.03445984423160553, "learning_rate": 0.00018094065277833314, "loss": 0.129, "step": 884 }, { "epoch": 0.8402563493947306, "grad_norm": 0.03362146392464638, "learning_rate": 0.0001808757107849693, "loss": 0.125, "step": 885 }, { "epoch": 0.8412057915974365, "grad_norm": 0.041491370648145676, "learning_rate": 0.00018081067003546876, "loss": 0.1314, "step": 886 }, { "epoch": 0.8421552338001425, "grad_norm": 0.034560974687337875, "learning_rate": 0.00018074553060925175, "loss": 0.126, "step": 887 }, { "epoch": 0.8431046760028483, "grad_norm": 0.049931105226278305, "learning_rate": 0.0001806802925858591, "loss": 0.1709, "step": 888 }, { "epoch": 0.8440541182055542, "grad_norm": 0.035841234028339386, "learning_rate": 0.00018061495604495195, "loss": 0.1396, "step": 889 }, { "epoch": 0.8450035604082602, "grad_norm": 0.03359563648700714, "learning_rate": 0.00018054952106631188, "loss": 0.1323, "step": 890 }, { "epoch": 0.8459530026109661, "grad_norm": 0.03390706703066826, "learning_rate": 0.00018048398772984046, "loss": 0.1287, "step": 891 }, { "epoch": 0.8469024448136719, "grad_norm": 0.0474267303943634, "learning_rate": 0.00018041835611555957, "loss": 0.1693, "step": 892 }, { "epoch": 0.8478518870163779, "grad_norm": 0.0334562286734581, "learning_rate": 0.00018035262630361097, "loss": 0.1295, "step": 893 }, { "epoch": 0.8488013292190838, "grad_norm": 0.03383705019950867, "learning_rate": 0.00018028679837425634, "loss": 0.1259, "step": 894 }, { "epoch": 0.8497507714217897, "grad_norm": 0.03384934738278389, "learning_rate": 0.00018022087240787728, "loss": 0.1218, "step": 895 }, { "epoch": 0.8507002136244957, "grad_norm": 0.04088185727596283, "learning_rate": 0.0001801548484849749, "loss": 0.1343, "step": 896 }, { "epoch": 0.8516496558272015, "grad_norm": 0.05273745581507683, "learning_rate": 0.00018008872668617013, "loss": 0.1688, "step": 897 }, { "epoch": 0.8525990980299074, "grad_norm": 0.03253067284822464, "learning_rate": 0.00018002250709220325, "loss": 0.1333, "step": 898 }, { "epoch": 0.8535485402326133, "grad_norm": 0.03033488616347313, "learning_rate": 0.0001799561897839341, "loss": 0.1292, "step": 899 }, { "epoch": 0.8544979824353193, "grad_norm": 0.033945854753255844, "learning_rate": 0.00017988977484234174, "loss": 0.1415, "step": 900 }, { "epoch": 0.8554474246380251, "grad_norm": 0.04456301033496857, "learning_rate": 0.0001798232623485244, "loss": 0.1762, "step": 901 }, { "epoch": 0.856396866840731, "grad_norm": 0.03912430256605148, "learning_rate": 0.00017975665238369962, "loss": 0.142, "step": 902 }, { "epoch": 0.857346309043437, "grad_norm": 0.032741378992795944, "learning_rate": 0.0001796899450292038, "loss": 0.1212, "step": 903 }, { "epoch": 0.8582957512461429, "grad_norm": 0.047262486070394516, "learning_rate": 0.0001796231403664923, "loss": 0.1762, "step": 904 }, { "epoch": 0.8592451934488488, "grad_norm": 0.03242664784193039, "learning_rate": 0.00017955623847713928, "loss": 0.1323, "step": 905 }, { "epoch": 0.8601946356515547, "grad_norm": 0.030855266377329826, "learning_rate": 0.0001794892394428377, "loss": 0.1258, "step": 906 }, { "epoch": 0.8611440778542606, "grad_norm": 0.03360726311802864, "learning_rate": 0.00017942214334539907, "loss": 0.1325, "step": 907 }, { "epoch": 0.8620935200569665, "grad_norm": 0.032459285110235214, "learning_rate": 0.00017935495026675345, "loss": 0.1267, "step": 908 }, { "epoch": 0.8630429622596725, "grad_norm": 0.04160567373037338, "learning_rate": 0.00017928766028894928, "loss": 0.1255, "step": 909 }, { "epoch": 0.8639924044623783, "grad_norm": 0.03851740434765816, "learning_rate": 0.0001792202734941534, "loss": 0.1212, "step": 910 }, { "epoch": 0.8649418466650842, "grad_norm": 0.03414515405893326, "learning_rate": 0.00017915278996465084, "loss": 0.1239, "step": 911 }, { "epoch": 0.8658912888677902, "grad_norm": 0.17817381024360657, "learning_rate": 0.0001790852097828447, "loss": 0.1336, "step": 912 }, { "epoch": 0.8668407310704961, "grad_norm": 0.03545542433857918, "learning_rate": 0.0001790175330312562, "loss": 0.1353, "step": 913 }, { "epoch": 0.867790173273202, "grad_norm": 0.03207210451364517, "learning_rate": 0.00017894975979252436, "loss": 0.1243, "step": 914 }, { "epoch": 0.868739615475908, "grad_norm": 0.046145763248205185, "learning_rate": 0.0001788818901494061, "loss": 0.1668, "step": 915 }, { "epoch": 0.8696890576786138, "grad_norm": 0.03051767125725746, "learning_rate": 0.00017881392418477607, "loss": 0.1311, "step": 916 }, { "epoch": 0.8706384998813197, "grad_norm": 0.03918071463704109, "learning_rate": 0.00017874586198162647, "loss": 0.1692, "step": 917 }, { "epoch": 0.8715879420840257, "grad_norm": 0.03229302540421486, "learning_rate": 0.0001786777036230671, "loss": 0.1276, "step": 918 }, { "epoch": 0.8725373842867316, "grad_norm": 0.032113853842020035, "learning_rate": 0.00017860944919232503, "loss": 0.1256, "step": 919 }, { "epoch": 0.8734868264894374, "grad_norm": 0.03725959360599518, "learning_rate": 0.00017854109877274484, "loss": 0.1363, "step": 920 }, { "epoch": 0.8744362686921434, "grad_norm": 0.02805374562740326, "learning_rate": 0.00017847265244778817, "loss": 0.1259, "step": 921 }, { "epoch": 0.8753857108948493, "grad_norm": 0.03541216999292374, "learning_rate": 0.00017840411030103383, "loss": 0.1288, "step": 922 }, { "epoch": 0.8763351530975552, "grad_norm": 0.04267534613609314, "learning_rate": 0.0001783354724161776, "loss": 0.1601, "step": 923 }, { "epoch": 0.8772845953002611, "grad_norm": 0.04881501942873001, "learning_rate": 0.00017826673887703223, "loss": 0.1686, "step": 924 }, { "epoch": 0.878234037502967, "grad_norm": 0.0337185375392437, "learning_rate": 0.00017819790976752718, "loss": 0.131, "step": 925 }, { "epoch": 0.8791834797056729, "grad_norm": 0.033597834408283234, "learning_rate": 0.00017812898517170872, "loss": 0.1365, "step": 926 }, { "epoch": 0.8801329219083788, "grad_norm": 0.047949645668268204, "learning_rate": 0.00017805996517373962, "loss": 0.178, "step": 927 }, { "epoch": 0.8810823641110848, "grad_norm": 0.03533579409122467, "learning_rate": 0.00017799084985789916, "loss": 0.1281, "step": 928 }, { "epoch": 0.8820318063137906, "grad_norm": 0.03638564050197601, "learning_rate": 0.0001779216393085831, "loss": 0.136, "step": 929 }, { "epoch": 0.8829812485164965, "grad_norm": 0.034585777670145035, "learning_rate": 0.00017785233361030333, "loss": 0.1221, "step": 930 }, { "epoch": 0.8839306907192025, "grad_norm": 0.03344082459807396, "learning_rate": 0.00017778293284768807, "loss": 0.1335, "step": 931 }, { "epoch": 0.8848801329219084, "grad_norm": 0.029832901433110237, "learning_rate": 0.00017771343710548155, "loss": 0.131, "step": 932 }, { "epoch": 0.8858295751246142, "grad_norm": 0.030377686023712158, "learning_rate": 0.00017764384646854405, "loss": 0.1216, "step": 933 }, { "epoch": 0.8867790173273202, "grad_norm": 0.036345310509204865, "learning_rate": 0.0001775741610218516, "loss": 0.1289, "step": 934 }, { "epoch": 0.8877284595300261, "grad_norm": 0.04609441012144089, "learning_rate": 0.00017750438085049606, "loss": 0.1598, "step": 935 }, { "epoch": 0.888677901732732, "grad_norm": 0.03439109027385712, "learning_rate": 0.00017743450603968506, "loss": 0.1316, "step": 936 }, { "epoch": 0.889627343935438, "grad_norm": 0.07119124382734299, "learning_rate": 0.0001773645366747416, "loss": 0.1664, "step": 937 }, { "epoch": 0.8905767861381438, "grad_norm": 0.03385334461927414, "learning_rate": 0.0001772944728411043, "loss": 0.1294, "step": 938 }, { "epoch": 0.8915262283408497, "grad_norm": 0.033481206744909286, "learning_rate": 0.00017722431462432705, "loss": 0.1218, "step": 939 }, { "epoch": 0.8924756705435557, "grad_norm": 0.03365306556224823, "learning_rate": 0.00017715406211007902, "loss": 0.1295, "step": 940 }, { "epoch": 0.8934251127462616, "grad_norm": 0.03675035014748573, "learning_rate": 0.0001770837153841445, "loss": 0.1237, "step": 941 }, { "epoch": 0.8943745549489674, "grad_norm": 0.03245026618242264, "learning_rate": 0.00017701327453242284, "loss": 0.1304, "step": 942 }, { "epoch": 0.8953239971516734, "grad_norm": 0.03346354141831398, "learning_rate": 0.00017694273964092837, "loss": 0.1274, "step": 943 }, { "epoch": 0.8962734393543793, "grad_norm": 0.048563096672296524, "learning_rate": 0.00017687211079579017, "loss": 0.1719, "step": 944 }, { "epoch": 0.8972228815570852, "grad_norm": 0.04709222912788391, "learning_rate": 0.0001768013880832521, "loss": 0.1281, "step": 945 }, { "epoch": 0.8981723237597912, "grad_norm": 0.030402177944779396, "learning_rate": 0.00017673057158967254, "loss": 0.1229, "step": 946 }, { "epoch": 0.899121765962497, "grad_norm": 0.03577994927763939, "learning_rate": 0.00017665966140152458, "loss": 0.1255, "step": 947 }, { "epoch": 0.9000712081652029, "grad_norm": 0.04566454887390137, "learning_rate": 0.00017658865760539552, "loss": 0.1617, "step": 948 }, { "epoch": 0.9010206503679089, "grad_norm": 0.04077988117933273, "learning_rate": 0.00017651756028798713, "loss": 0.1619, "step": 949 }, { "epoch": 0.9019700925706148, "grad_norm": 0.045764826238155365, "learning_rate": 0.00017644636953611522, "loss": 0.1608, "step": 950 }, { "epoch": 0.9029195347733207, "grad_norm": 0.035656195133924484, "learning_rate": 0.0001763750854367098, "loss": 0.1288, "step": 951 }, { "epoch": 0.9038689769760266, "grad_norm": 0.04220154508948326, "learning_rate": 0.0001763037080768148, "loss": 0.1688, "step": 952 }, { "epoch": 0.9048184191787325, "grad_norm": 0.03406943380832672, "learning_rate": 0.0001762322375435881, "loss": 0.1314, "step": 953 }, { "epoch": 0.9057678613814384, "grad_norm": 0.037942539900541306, "learning_rate": 0.00017616067392430126, "loss": 0.1342, "step": 954 }, { "epoch": 0.9067173035841443, "grad_norm": 0.06412187963724136, "learning_rate": 0.00017608901730633964, "loss": 0.2207, "step": 955 }, { "epoch": 0.9076667457868502, "grad_norm": 0.0313476026058197, "learning_rate": 0.00017601726777720202, "loss": 0.1249, "step": 956 }, { "epoch": 0.9086161879895561, "grad_norm": 0.0276046060025692, "learning_rate": 0.00017594542542450072, "loss": 0.1212, "step": 957 }, { "epoch": 0.909565630192262, "grad_norm": 0.032439909875392914, "learning_rate": 0.00017587349033596134, "loss": 0.1277, "step": 958 }, { "epoch": 0.910515072394968, "grad_norm": 0.039732351899147034, "learning_rate": 0.00017580146259942278, "loss": 0.1222, "step": 959 }, { "epoch": 0.9114645145976739, "grad_norm": 0.033820103853940964, "learning_rate": 0.00017572934230283707, "loss": 0.1246, "step": 960 }, { "epoch": 0.9124139568003797, "grad_norm": 0.03361973166465759, "learning_rate": 0.00017565712953426918, "loss": 0.1328, "step": 961 }, { "epoch": 0.9133633990030857, "grad_norm": 0.0338444709777832, "learning_rate": 0.00017558482438189712, "loss": 0.1306, "step": 962 }, { "epoch": 0.9143128412057916, "grad_norm": 0.04851710423827171, "learning_rate": 0.0001755124269340116, "loss": 0.1765, "step": 963 }, { "epoch": 0.9152622834084975, "grad_norm": 0.03290700539946556, "learning_rate": 0.0001754399372790161, "loss": 0.1386, "step": 964 }, { "epoch": 0.9162117256112035, "grad_norm": 0.034565720707178116, "learning_rate": 0.00017536735550542661, "loss": 0.1212, "step": 965 }, { "epoch": 0.9171611678139093, "grad_norm": 0.04606771841645241, "learning_rate": 0.00017529468170187176, "loss": 0.1567, "step": 966 }, { "epoch": 0.9181106100166152, "grad_norm": 0.03279464691877365, "learning_rate": 0.00017522191595709238, "loss": 0.1214, "step": 967 }, { "epoch": 0.9190600522193212, "grad_norm": 0.036700885742902756, "learning_rate": 0.00017514905835994168, "loss": 0.1314, "step": 968 }, { "epoch": 0.9200094944220271, "grad_norm": 0.04098424315452576, "learning_rate": 0.00017507610899938501, "loss": 0.164, "step": 969 }, { "epoch": 0.9209589366247329, "grad_norm": 0.033782679587602615, "learning_rate": 0.0001750030679644997, "loss": 0.1376, "step": 970 }, { "epoch": 0.9219083788274389, "grad_norm": 0.03304159641265869, "learning_rate": 0.00017492993534447515, "loss": 0.1244, "step": 971 }, { "epoch": 0.9228578210301448, "grad_norm": 0.03158386051654816, "learning_rate": 0.0001748567112286125, "loss": 0.1345, "step": 972 }, { "epoch": 0.9238072632328507, "grad_norm": 0.03615015745162964, "learning_rate": 0.00017478339570632458, "loss": 0.1434, "step": 973 }, { "epoch": 0.9247567054355567, "grad_norm": 0.033553823828697205, "learning_rate": 0.00017470998886713596, "loss": 0.1292, "step": 974 }, { "epoch": 0.9257061476382625, "grad_norm": 0.03953874111175537, "learning_rate": 0.00017463649080068266, "loss": 0.1621, "step": 975 }, { "epoch": 0.9266555898409684, "grad_norm": 0.03288433700799942, "learning_rate": 0.00017456290159671202, "loss": 0.1357, "step": 976 }, { "epoch": 0.9276050320436744, "grad_norm": 0.03154657408595085, "learning_rate": 0.00017448922134508275, "loss": 0.1322, "step": 977 }, { "epoch": 0.9285544742463803, "grad_norm": 0.05669796094298363, "learning_rate": 0.00017441545013576477, "loss": 0.1761, "step": 978 }, { "epoch": 0.9295039164490861, "grad_norm": 0.026679178699851036, "learning_rate": 0.00017434158805883896, "loss": 0.1295, "step": 979 }, { "epoch": 0.930453358651792, "grad_norm": 0.03597673401236534, "learning_rate": 0.00017426763520449721, "loss": 0.1265, "step": 980 }, { "epoch": 0.931402800854498, "grad_norm": 0.03097674809396267, "learning_rate": 0.0001741935916630423, "loss": 0.1339, "step": 981 }, { "epoch": 0.9323522430572039, "grad_norm": 0.030252935364842415, "learning_rate": 0.00017411945752488766, "loss": 0.1247, "step": 982 }, { "epoch": 0.9333016852599098, "grad_norm": 0.03460918739438057, "learning_rate": 0.00017404523288055743, "loss": 0.1323, "step": 983 }, { "epoch": 0.9342511274626157, "grad_norm": 0.035575591027736664, "learning_rate": 0.00017397091782068622, "loss": 0.1258, "step": 984 }, { "epoch": 0.9352005696653216, "grad_norm": 0.05128021538257599, "learning_rate": 0.00017389651243601904, "loss": 0.1364, "step": 985 }, { "epoch": 0.9361500118680275, "grad_norm": 0.04355672374367714, "learning_rate": 0.00017382201681741122, "loss": 0.1656, "step": 986 }, { "epoch": 0.9370994540707335, "grad_norm": 0.03357682749629021, "learning_rate": 0.0001737474310558282, "loss": 0.1285, "step": 987 }, { "epoch": 0.9380488962734393, "grad_norm": 0.10623644292354584, "learning_rate": 0.00017367275524234565, "loss": 0.1726, "step": 988 }, { "epoch": 0.9389983384761452, "grad_norm": 0.03605256229639053, "learning_rate": 0.00017359798946814907, "loss": 0.1358, "step": 989 }, { "epoch": 0.9399477806788512, "grad_norm": 0.039663393050432205, "learning_rate": 0.00017352313382453378, "loss": 0.1299, "step": 990 }, { "epoch": 0.9408972228815571, "grad_norm": 0.12416961044073105, "learning_rate": 0.000173448188402905, "loss": 0.1666, "step": 991 }, { "epoch": 0.941846665084263, "grad_norm": 0.045010216534137726, "learning_rate": 0.00017337315329477742, "loss": 0.1733, "step": 992 }, { "epoch": 0.9427961072869689, "grad_norm": 0.03456486761569977, "learning_rate": 0.0001732980285917753, "loss": 0.1312, "step": 993 }, { "epoch": 0.9437455494896748, "grad_norm": 0.039561979472637177, "learning_rate": 0.00017322281438563234, "loss": 0.1354, "step": 994 }, { "epoch": 0.9446949916923807, "grad_norm": 0.043275121599435806, "learning_rate": 0.00017314751076819146, "loss": 0.1651, "step": 995 }, { "epoch": 0.9456444338950867, "grad_norm": 0.0392397940158844, "learning_rate": 0.00017307211783140482, "loss": 0.1647, "step": 996 }, { "epoch": 0.9465938760977926, "grad_norm": 0.03428703919053078, "learning_rate": 0.0001729966356673336, "loss": 0.128, "step": 997 }, { "epoch": 0.9475433183004984, "grad_norm": 0.03511650487780571, "learning_rate": 0.000172921064368148, "loss": 0.1297, "step": 998 }, { "epoch": 0.9484927605032044, "grad_norm": 0.030319994315505028, "learning_rate": 0.00017284540402612696, "loss": 0.1269, "step": 999 }, { "epoch": 0.9494422027059103, "grad_norm": 0.03071141429245472, "learning_rate": 0.00017276965473365827, "loss": 0.1224, "step": 1000 }, { "epoch": 0.9503916449086162, "grad_norm": 0.04097789525985718, "learning_rate": 0.00017269381658323822, "loss": 0.1597, "step": 1001 }, { "epoch": 0.9513410871113221, "grad_norm": 0.03407077491283417, "learning_rate": 0.00017261788966747168, "loss": 0.1268, "step": 1002 }, { "epoch": 0.952290529314028, "grad_norm": 0.035802800208330154, "learning_rate": 0.00017254187407907189, "loss": 0.1338, "step": 1003 }, { "epoch": 0.9532399715167339, "grad_norm": 0.030097633600234985, "learning_rate": 0.00017246576991086034, "loss": 0.1222, "step": 1004 }, { "epoch": 0.9541894137194399, "grad_norm": 0.047994308173656464, "learning_rate": 0.0001723895772557667, "loss": 0.1632, "step": 1005 }, { "epoch": 0.9551388559221458, "grad_norm": 0.03451845049858093, "learning_rate": 0.00017231329620682876, "loss": 0.1278, "step": 1006 }, { "epoch": 0.9560882981248516, "grad_norm": 0.036820750683546066, "learning_rate": 0.00017223692685719213, "loss": 0.1355, "step": 1007 }, { "epoch": 0.9570377403275575, "grad_norm": 0.03521284461021423, "learning_rate": 0.0001721604693001103, "loss": 0.1383, "step": 1008 }, { "epoch": 0.9579871825302635, "grad_norm": 0.036953702569007874, "learning_rate": 0.00017208392362894447, "loss": 0.1352, "step": 1009 }, { "epoch": 0.9589366247329694, "grad_norm": 0.031185979023575783, "learning_rate": 0.00017200728993716345, "loss": 0.1262, "step": 1010 }, { "epoch": 0.9598860669356752, "grad_norm": 0.030822455883026123, "learning_rate": 0.00017193056831834346, "loss": 0.1211, "step": 1011 }, { "epoch": 0.9608355091383812, "grad_norm": 0.031467005610466, "learning_rate": 0.0001718537588661682, "loss": 0.1271, "step": 1012 }, { "epoch": 0.9617849513410871, "grad_norm": 0.03788928687572479, "learning_rate": 0.0001717768616744285, "loss": 0.1413, "step": 1013 }, { "epoch": 0.962734393543793, "grad_norm": 0.03359632566571236, "learning_rate": 0.00017169987683702243, "loss": 0.1276, "step": 1014 }, { "epoch": 0.963683835746499, "grad_norm": 0.03274601325392723, "learning_rate": 0.000171622804447955, "loss": 0.1308, "step": 1015 }, { "epoch": 0.9646332779492048, "grad_norm": 0.03634633496403694, "learning_rate": 0.0001715456446013382, "loss": 0.1384, "step": 1016 }, { "epoch": 0.9655827201519107, "grad_norm": 0.02978476695716381, "learning_rate": 0.00017146839739139077, "loss": 0.1301, "step": 1017 }, { "epoch": 0.9665321623546167, "grad_norm": 0.03389682248234749, "learning_rate": 0.0001713910629124381, "loss": 0.1264, "step": 1018 }, { "epoch": 0.9674816045573226, "grad_norm": 0.03452256694436073, "learning_rate": 0.00017131364125891224, "loss": 0.1317, "step": 1019 }, { "epoch": 0.9684310467600284, "grad_norm": 0.03967840224504471, "learning_rate": 0.00017123613252535163, "loss": 0.1308, "step": 1020 }, { "epoch": 0.9693804889627344, "grad_norm": 0.04021480306982994, "learning_rate": 0.00017115853680640098, "loss": 0.1637, "step": 1021 }, { "epoch": 0.9703299311654403, "grad_norm": 0.02766057476401329, "learning_rate": 0.00017108085419681132, "loss": 0.1239, "step": 1022 }, { "epoch": 0.9712793733681462, "grad_norm": 0.029945319518446922, "learning_rate": 0.00017100308479143974, "loss": 0.1236, "step": 1023 }, { "epoch": 0.9722288155708522, "grad_norm": 0.03135136887431145, "learning_rate": 0.00017092522868524928, "loss": 0.1203, "step": 1024 }, { "epoch": 0.973178257773558, "grad_norm": 0.04876153543591499, "learning_rate": 0.00017084728597330893, "loss": 0.1802, "step": 1025 }, { "epoch": 0.9741276999762639, "grad_norm": 0.042958084493875504, "learning_rate": 0.00017076925675079335, "loss": 0.1656, "step": 1026 }, { "epoch": 0.9750771421789699, "grad_norm": 0.04739035665988922, "learning_rate": 0.00017069114111298287, "loss": 0.167, "step": 1027 }, { "epoch": 0.9760265843816758, "grad_norm": 0.042968571186065674, "learning_rate": 0.00017061293915526335, "loss": 0.173, "step": 1028 }, { "epoch": 0.9769760265843817, "grad_norm": 0.07628528028726578, "learning_rate": 0.00017053465097312606, "loss": 0.1351, "step": 1029 }, { "epoch": 0.9779254687870876, "grad_norm": 0.032479528337717056, "learning_rate": 0.00017045627666216755, "loss": 0.1294, "step": 1030 }, { "epoch": 0.9788749109897935, "grad_norm": 0.029842333868145943, "learning_rate": 0.0001703778163180895, "loss": 0.1264, "step": 1031 }, { "epoch": 0.9798243531924994, "grad_norm": 0.03622937202453613, "learning_rate": 0.00017029927003669868, "loss": 0.1287, "step": 1032 }, { "epoch": 0.9807737953952054, "grad_norm": 0.05245399475097656, "learning_rate": 0.00017022063791390684, "loss": 0.1923, "step": 1033 }, { "epoch": 0.9817232375979112, "grad_norm": 0.03335704281926155, "learning_rate": 0.00017014192004573047, "loss": 0.1241, "step": 1034 }, { "epoch": 0.9826726798006171, "grad_norm": 0.03645642474293709, "learning_rate": 0.0001700631165282908, "loss": 0.1321, "step": 1035 }, { "epoch": 0.983622122003323, "grad_norm": 0.05331774801015854, "learning_rate": 0.00016998422745781363, "loss": 0.169, "step": 1036 }, { "epoch": 0.984571564206029, "grad_norm": 0.04615236446261406, "learning_rate": 0.00016990525293062927, "loss": 0.1623, "step": 1037 }, { "epoch": 0.9855210064087349, "grad_norm": 0.047434594482183456, "learning_rate": 0.00016982619304317233, "loss": 0.1303, "step": 1038 }, { "epoch": 0.9864704486114407, "grad_norm": 0.03144746273756027, "learning_rate": 0.00016974704789198168, "loss": 0.1203, "step": 1039 }, { "epoch": 0.9874198908141467, "grad_norm": 0.04035501554608345, "learning_rate": 0.00016966781757370028, "loss": 0.1246, "step": 1040 }, { "epoch": 0.9883693330168526, "grad_norm": 0.03864790499210358, "learning_rate": 0.0001695885021850751, "loss": 0.1305, "step": 1041 }, { "epoch": 0.9893187752195585, "grad_norm": 0.03547806292772293, "learning_rate": 0.00016950910182295705, "loss": 0.1319, "step": 1042 }, { "epoch": 0.9902682174222645, "grad_norm": 0.03442002460360527, "learning_rate": 0.0001694296165843007, "loss": 0.1344, "step": 1043 }, { "epoch": 0.9912176596249703, "grad_norm": 0.0333750881254673, "learning_rate": 0.00016935004656616425, "loss": 0.1278, "step": 1044 }, { "epoch": 0.9921671018276762, "grad_norm": 0.03143637254834175, "learning_rate": 0.00016927039186570954, "loss": 0.1237, "step": 1045 }, { "epoch": 0.9931165440303822, "grad_norm": 0.03841651603579521, "learning_rate": 0.0001691906525802017, "loss": 0.1395, "step": 1046 }, { "epoch": 0.9940659862330881, "grad_norm": 0.03443494066596031, "learning_rate": 0.00016911082880700926, "loss": 0.1422, "step": 1047 }, { "epoch": 0.9950154284357939, "grad_norm": 0.027661804109811783, "learning_rate": 0.0001690309206436038, "loss": 0.1251, "step": 1048 }, { "epoch": 0.9959648706384999, "grad_norm": 0.036862559616565704, "learning_rate": 0.00016895092818756006, "loss": 0.1337, "step": 1049 }, { "epoch": 0.9969143128412058, "grad_norm": 0.035234466195106506, "learning_rate": 0.00016887085153655554, "loss": 0.1267, "step": 1050 }, { "epoch": 0.9978637550439117, "grad_norm": 0.032372791320085526, "learning_rate": 0.00016879069078837075, "loss": 0.1254, "step": 1051 }, { "epoch": 0.9988131972466177, "grad_norm": 0.037299785763025284, "learning_rate": 0.00016871044604088877, "loss": 0.1324, "step": 1052 }, { "epoch": 0.9997626394493235, "grad_norm": 0.03843718767166138, "learning_rate": 0.00016863011739209527, "loss": 0.1328, "step": 1053 }, { "epoch": 1.0007120816520294, "grad_norm": 0.03160862624645233, "learning_rate": 0.00016854970494007836, "loss": 0.1284, "step": 1054 }, { "epoch": 1.0016615238547353, "grad_norm": 0.05188068002462387, "learning_rate": 0.00016846920878302852, "loss": 0.1775, "step": 1055 }, { "epoch": 1.0026109660574412, "grad_norm": 0.04362662881612778, "learning_rate": 0.00016838862901923842, "loss": 0.1577, "step": 1056 }, { "epoch": 1.0035604082601473, "grad_norm": 0.033426132053136826, "learning_rate": 0.00016830796574710284, "loss": 0.1252, "step": 1057 }, { "epoch": 1.0045098504628531, "grad_norm": 0.06085265800356865, "learning_rate": 0.00016822721906511844, "loss": 0.1769, "step": 1058 }, { "epoch": 1.005459292665559, "grad_norm": 0.03222273662686348, "learning_rate": 0.00016814638907188388, "loss": 0.1239, "step": 1059 }, { "epoch": 1.0064087348682649, "grad_norm": 0.032014038413763046, "learning_rate": 0.00016806547586609947, "loss": 0.1191, "step": 1060 }, { "epoch": 1.0073581770709708, "grad_norm": 0.03323471546173096, "learning_rate": 0.00016798447954656707, "loss": 0.1334, "step": 1061 }, { "epoch": 1.0083076192736766, "grad_norm": 0.04325219243764877, "learning_rate": 0.0001679034002121901, "loss": 0.1623, "step": 1062 }, { "epoch": 1.0092570614763827, "grad_norm": 0.029746338725090027, "learning_rate": 0.0001678222379619734, "loss": 0.1292, "step": 1063 }, { "epoch": 1.0102065036790886, "grad_norm": 0.03265037387609482, "learning_rate": 0.00016774099289502297, "loss": 0.1271, "step": 1064 }, { "epoch": 1.0111559458817945, "grad_norm": 0.04023383557796478, "learning_rate": 0.0001676596651105459, "loss": 0.1537, "step": 1065 }, { "epoch": 1.0121053880845003, "grad_norm": 0.036106862127780914, "learning_rate": 0.00016757825470785042, "loss": 0.1237, "step": 1066 }, { "epoch": 1.0130548302872062, "grad_norm": 0.04061293974518776, "learning_rate": 0.00016749676178634556, "loss": 0.1324, "step": 1067 }, { "epoch": 1.014004272489912, "grad_norm": 0.050820399075746536, "learning_rate": 0.0001674151864455411, "loss": 0.1705, "step": 1068 }, { "epoch": 1.0149537146926182, "grad_norm": 0.037347592413425446, "learning_rate": 0.00016733352878504752, "loss": 0.1248, "step": 1069 }, { "epoch": 1.015903156895324, "grad_norm": 0.04108656942844391, "learning_rate": 0.00016725178890457571, "loss": 0.1201, "step": 1070 }, { "epoch": 1.01685259909803, "grad_norm": 0.051215577870607376, "learning_rate": 0.00016716996690393715, "loss": 0.1705, "step": 1071 }, { "epoch": 1.0178020413007358, "grad_norm": 0.05008477717638016, "learning_rate": 0.00016708806288304336, "loss": 0.1606, "step": 1072 }, { "epoch": 1.0187514835034417, "grad_norm": 0.05916628614068031, "learning_rate": 0.00016700607694190617, "loss": 0.1824, "step": 1073 }, { "epoch": 1.0197009257061476, "grad_norm": 0.03331366181373596, "learning_rate": 0.00016692400918063744, "loss": 0.1256, "step": 1074 }, { "epoch": 1.0206503679088534, "grad_norm": 0.03364944830536842, "learning_rate": 0.00016684185969944885, "loss": 0.1273, "step": 1075 }, { "epoch": 1.0215998101115595, "grad_norm": 0.02990981563925743, "learning_rate": 0.000166759628598652, "loss": 0.1284, "step": 1076 }, { "epoch": 1.0225492523142654, "grad_norm": 0.03323819115757942, "learning_rate": 0.00016667731597865796, "loss": 0.1258, "step": 1077 }, { "epoch": 1.0234986945169713, "grad_norm": 0.03008713200688362, "learning_rate": 0.0001665949219399775, "loss": 0.1244, "step": 1078 }, { "epoch": 1.0244481367196772, "grad_norm": 0.04623178020119667, "learning_rate": 0.00016651244658322085, "loss": 0.1537, "step": 1079 }, { "epoch": 1.025397578922383, "grad_norm": 0.034869614988565445, "learning_rate": 0.00016642989000909732, "loss": 0.1272, "step": 1080 }, { "epoch": 1.026347021125089, "grad_norm": 0.03236447647213936, "learning_rate": 0.0001663472523184156, "loss": 0.1299, "step": 1081 }, { "epoch": 1.027296463327795, "grad_norm": 0.02806561440229416, "learning_rate": 0.00016626453361208335, "loss": 0.1198, "step": 1082 }, { "epoch": 1.0282459055305009, "grad_norm": 0.04762514680624008, "learning_rate": 0.0001661817339911071, "loss": 0.1695, "step": 1083 }, { "epoch": 1.0291953477332068, "grad_norm": 0.039319079369306564, "learning_rate": 0.00016609885355659234, "loss": 0.1612, "step": 1084 }, { "epoch": 1.0301447899359126, "grad_norm": 0.030540715903043747, "learning_rate": 0.0001660158924097431, "loss": 0.1251, "step": 1085 }, { "epoch": 1.0310942321386185, "grad_norm": 0.029828663915395737, "learning_rate": 0.000165932850651862, "loss": 0.1287, "step": 1086 }, { "epoch": 1.0320436743413244, "grad_norm": 0.030012918636202812, "learning_rate": 0.0001658497283843501, "loss": 0.132, "step": 1087 }, { "epoch": 1.0329931165440305, "grad_norm": 0.03255194425582886, "learning_rate": 0.0001657665257087068, "loss": 0.1257, "step": 1088 }, { "epoch": 1.0339425587467364, "grad_norm": 0.040951523929834366, "learning_rate": 0.00016568324272652965, "loss": 0.1507, "step": 1089 }, { "epoch": 1.0348920009494422, "grad_norm": 0.027678990736603737, "learning_rate": 0.00016559987953951427, "loss": 0.1217, "step": 1090 }, { "epoch": 1.035841443152148, "grad_norm": 0.03241724148392677, "learning_rate": 0.0001655164362494542, "loss": 0.1298, "step": 1091 }, { "epoch": 1.036790885354854, "grad_norm": 0.038156237453222275, "learning_rate": 0.00016543291295824085, "loss": 0.1252, "step": 1092 }, { "epoch": 1.0377403275575598, "grad_norm": 0.029806343838572502, "learning_rate": 0.00016534930976786323, "loss": 0.1265, "step": 1093 }, { "epoch": 1.038689769760266, "grad_norm": 0.035036977380514145, "learning_rate": 0.00016526562678040804, "loss": 0.1247, "step": 1094 }, { "epoch": 1.0396392119629718, "grad_norm": 0.032223109155893326, "learning_rate": 0.00016518186409805922, "loss": 0.1326, "step": 1095 }, { "epoch": 1.0405886541656777, "grad_norm": 0.03192323073744774, "learning_rate": 0.0001650980218230982, "loss": 0.1186, "step": 1096 }, { "epoch": 1.0415380963683836, "grad_norm": 0.031004801392555237, "learning_rate": 0.00016501410005790362, "loss": 0.1292, "step": 1097 }, { "epoch": 1.0424875385710894, "grad_norm": 0.03421878442168236, "learning_rate": 0.00016493009890495102, "loss": 0.1362, "step": 1098 }, { "epoch": 1.0434369807737953, "grad_norm": 0.03153158724308014, "learning_rate": 0.00016484601846681297, "loss": 0.1182, "step": 1099 }, { "epoch": 1.0443864229765012, "grad_norm": 0.03977439925074577, "learning_rate": 0.0001647618588461589, "loss": 0.1327, "step": 1100 }, { "epoch": 1.0453358651792073, "grad_norm": 0.03982316702604294, "learning_rate": 0.00016467762014575485, "loss": 0.1582, "step": 1101 }, { "epoch": 1.0462853073819132, "grad_norm": 0.034796085208654404, "learning_rate": 0.00016459330246846348, "loss": 0.1258, "step": 1102 }, { "epoch": 1.047234749584619, "grad_norm": 0.039261046797037125, "learning_rate": 0.0001645089059172438, "loss": 0.1321, "step": 1103 }, { "epoch": 1.048184191787325, "grad_norm": 0.04305882379412651, "learning_rate": 0.00016442443059515126, "loss": 0.1406, "step": 1104 }, { "epoch": 1.0491336339900308, "grad_norm": 0.03491320461034775, "learning_rate": 0.00016433987660533742, "loss": 0.1312, "step": 1105 }, { "epoch": 1.0500830761927367, "grad_norm": 0.04404641315340996, "learning_rate": 0.00016425524405104986, "loss": 0.1267, "step": 1106 }, { "epoch": 1.0510325183954428, "grad_norm": 0.034407854080200195, "learning_rate": 0.0001641705330356322, "loss": 0.1268, "step": 1107 }, { "epoch": 1.0519819605981486, "grad_norm": 0.04843935742974281, "learning_rate": 0.00016408574366252374, "loss": 0.1601, "step": 1108 }, { "epoch": 1.0529314028008545, "grad_norm": 0.03394000977277756, "learning_rate": 0.0001640008760352596, "loss": 0.13, "step": 1109 }, { "epoch": 1.0538808450035604, "grad_norm": 0.027672087773680687, "learning_rate": 0.00016391593025747038, "loss": 0.1202, "step": 1110 }, { "epoch": 1.0548302872062663, "grad_norm": 0.03761329874396324, "learning_rate": 0.0001638309064328821, "loss": 0.1562, "step": 1111 }, { "epoch": 1.0557797294089721, "grad_norm": 0.048850156366825104, "learning_rate": 0.0001637458046653161, "loss": 0.1707, "step": 1112 }, { "epoch": 1.0567291716116782, "grad_norm": 0.027066387236118317, "learning_rate": 0.00016366062505868888, "loss": 0.1204, "step": 1113 }, { "epoch": 1.057678613814384, "grad_norm": 0.034062668681144714, "learning_rate": 0.00016357536771701198, "loss": 0.1378, "step": 1114 }, { "epoch": 1.05862805601709, "grad_norm": 0.0422850139439106, "learning_rate": 0.00016349003274439194, "loss": 0.1583, "step": 1115 }, { "epoch": 1.0595774982197959, "grad_norm": 0.0334283784031868, "learning_rate": 0.00016340462024503, "loss": 0.1276, "step": 1116 }, { "epoch": 1.0605269404225017, "grad_norm": 0.03338415175676346, "learning_rate": 0.00016331913032322212, "loss": 0.1229, "step": 1117 }, { "epoch": 1.0614763826252076, "grad_norm": 0.03128555044531822, "learning_rate": 0.00016323356308335876, "loss": 0.1167, "step": 1118 }, { "epoch": 1.0624258248279137, "grad_norm": 0.033790841698646545, "learning_rate": 0.00016314791862992486, "loss": 0.1236, "step": 1119 }, { "epoch": 1.0633752670306196, "grad_norm": 0.03544427454471588, "learning_rate": 0.00016306219706749953, "loss": 0.1319, "step": 1120 }, { "epoch": 1.0643247092333254, "grad_norm": 0.03969413787126541, "learning_rate": 0.0001629763985007561, "loss": 0.1612, "step": 1121 }, { "epoch": 1.0652741514360313, "grad_norm": 0.042924992740154266, "learning_rate": 0.00016289052303446202, "loss": 0.1659, "step": 1122 }, { "epoch": 1.0662235936387372, "grad_norm": 0.04624541476368904, "learning_rate": 0.00016280457077347848, "loss": 0.1617, "step": 1123 }, { "epoch": 1.067173035841443, "grad_norm": 0.034341566264629364, "learning_rate": 0.00016271854182276058, "loss": 0.1188, "step": 1124 }, { "epoch": 1.068122478044149, "grad_norm": 0.03228682279586792, "learning_rate": 0.00016263243628735695, "loss": 0.129, "step": 1125 }, { "epoch": 1.069071920246855, "grad_norm": 0.036037541925907135, "learning_rate": 0.00016254625427240978, "loss": 0.1309, "step": 1126 }, { "epoch": 1.070021362449561, "grad_norm": 0.027421532198786736, "learning_rate": 0.0001624599958831547, "loss": 0.1176, "step": 1127 }, { "epoch": 1.0709708046522668, "grad_norm": 0.030262261629104614, "learning_rate": 0.00016237366122492052, "loss": 0.1195, "step": 1128 }, { "epoch": 1.0719202468549727, "grad_norm": 0.041230421513319016, "learning_rate": 0.00016228725040312925, "loss": 0.1562, "step": 1129 }, { "epoch": 1.0728696890576785, "grad_norm": 0.03141395375132561, "learning_rate": 0.00016220076352329582, "loss": 0.13, "step": 1130 }, { "epoch": 1.0738191312603846, "grad_norm": 0.0343187153339386, "learning_rate": 0.00016211420069102815, "loss": 0.134, "step": 1131 }, { "epoch": 1.0747685734630905, "grad_norm": 0.04862483590841293, "learning_rate": 0.0001620275620120268, "loss": 0.1574, "step": 1132 }, { "epoch": 1.0757180156657964, "grad_norm": 0.04204042628407478, "learning_rate": 0.00016194084759208494, "loss": 0.162, "step": 1133 }, { "epoch": 1.0766674578685023, "grad_norm": 0.03309663385152817, "learning_rate": 0.00016185405753708833, "loss": 0.1251, "step": 1134 }, { "epoch": 1.0776169000712081, "grad_norm": 0.03191671893000603, "learning_rate": 0.00016176719195301503, "loss": 0.125, "step": 1135 }, { "epoch": 1.078566342273914, "grad_norm": 0.036822427064180374, "learning_rate": 0.0001616802509459353, "loss": 0.1484, "step": 1136 }, { "epoch": 1.0795157844766199, "grad_norm": 0.029125772416591644, "learning_rate": 0.00016159323462201149, "loss": 0.1192, "step": 1137 }, { "epoch": 1.080465226679326, "grad_norm": 0.034059688448905945, "learning_rate": 0.000161506143087498, "loss": 0.1309, "step": 1138 }, { "epoch": 1.0814146688820319, "grad_norm": 0.03434915095567703, "learning_rate": 0.00016141897644874096, "loss": 0.1336, "step": 1139 }, { "epoch": 1.0823641110847377, "grad_norm": 0.0348944216966629, "learning_rate": 0.00016133173481217833, "loss": 0.1317, "step": 1140 }, { "epoch": 1.0833135532874436, "grad_norm": 0.033639729022979736, "learning_rate": 0.00016124441828433957, "loss": 0.1187, "step": 1141 }, { "epoch": 1.0842629954901495, "grad_norm": 0.03063533827662468, "learning_rate": 0.00016115702697184556, "loss": 0.1332, "step": 1142 }, { "epoch": 1.0852124376928554, "grad_norm": 0.03273540362715721, "learning_rate": 0.00016106956098140858, "loss": 0.1284, "step": 1143 }, { "epoch": 1.0861618798955615, "grad_norm": 0.05293993651866913, "learning_rate": 0.00016098202041983206, "loss": 0.1687, "step": 1144 }, { "epoch": 1.0871113220982673, "grad_norm": 0.03251373767852783, "learning_rate": 0.00016089440539401046, "loss": 0.1252, "step": 1145 }, { "epoch": 1.0880607643009732, "grad_norm": 0.0367170013487339, "learning_rate": 0.00016080671601092922, "loss": 0.1419, "step": 1146 }, { "epoch": 1.089010206503679, "grad_norm": 0.030752060934901237, "learning_rate": 0.00016071895237766457, "loss": 0.1257, "step": 1147 }, { "epoch": 1.089959648706385, "grad_norm": 0.035168685019016266, "learning_rate": 0.00016063111460138334, "loss": 0.1385, "step": 1148 }, { "epoch": 1.0909090909090908, "grad_norm": 0.03252134099602699, "learning_rate": 0.00016054320278934296, "loss": 0.1232, "step": 1149 }, { "epoch": 1.0918585331117967, "grad_norm": 0.028666459023952484, "learning_rate": 0.00016045521704889128, "loss": 0.1242, "step": 1150 }, { "epoch": 1.0928079753145028, "grad_norm": 0.047707218676805496, "learning_rate": 0.00016036715748746634, "loss": 0.1643, "step": 1151 }, { "epoch": 1.0937574175172087, "grad_norm": 0.035980336368083954, "learning_rate": 0.00016027902421259638, "loss": 0.1329, "step": 1152 }, { "epoch": 1.0947068597199145, "grad_norm": 0.04506576433777809, "learning_rate": 0.00016019081733189967, "loss": 0.1631, "step": 1153 }, { "epoch": 1.0956563019226204, "grad_norm": 0.030268298462033272, "learning_rate": 0.0001601025369530843, "loss": 0.1319, "step": 1154 }, { "epoch": 1.0966057441253263, "grad_norm": 0.056095585227012634, "learning_rate": 0.00016001418318394817, "loss": 0.1529, "step": 1155 }, { "epoch": 1.0975551863280324, "grad_norm": 0.029666630551218987, "learning_rate": 0.0001599257561323787, "loss": 0.126, "step": 1156 }, { "epoch": 1.0985046285307383, "grad_norm": 0.03648681938648224, "learning_rate": 0.00015983725590635293, "loss": 0.1378, "step": 1157 }, { "epoch": 1.0994540707334441, "grad_norm": 0.03170529007911682, "learning_rate": 0.00015974868261393714, "loss": 0.1238, "step": 1158 }, { "epoch": 1.10040351293615, "grad_norm": 0.032316990196704865, "learning_rate": 0.0001596600363632869, "loss": 0.1305, "step": 1159 }, { "epoch": 1.101352955138856, "grad_norm": 0.03148328512907028, "learning_rate": 0.00015957131726264677, "loss": 0.1303, "step": 1160 }, { "epoch": 1.1023023973415618, "grad_norm": 0.03739064186811447, "learning_rate": 0.00015948252542035042, "loss": 0.16, "step": 1161 }, { "epoch": 1.1032518395442676, "grad_norm": 0.034856993705034256, "learning_rate": 0.00015939366094482025, "loss": 0.1273, "step": 1162 }, { "epoch": 1.1042012817469737, "grad_norm": 0.03102080523967743, "learning_rate": 0.0001593047239445673, "loss": 0.1331, "step": 1163 }, { "epoch": 1.1051507239496796, "grad_norm": 0.026448125019669533, "learning_rate": 0.00015921571452819127, "loss": 0.1241, "step": 1164 }, { "epoch": 1.1061001661523855, "grad_norm": 0.034301795065402985, "learning_rate": 0.0001591266328043802, "loss": 0.1283, "step": 1165 }, { "epoch": 1.1070496083550914, "grad_norm": 0.03346949443221092, "learning_rate": 0.00015903747888191053, "loss": 0.1355, "step": 1166 }, { "epoch": 1.1079990505577972, "grad_norm": 0.0324571467936039, "learning_rate": 0.00015894825286964675, "loss": 0.1354, "step": 1167 }, { "epoch": 1.1089484927605031, "grad_norm": 0.05366596579551697, "learning_rate": 0.00015885895487654147, "loss": 0.2099, "step": 1168 }, { "epoch": 1.1098979349632092, "grad_norm": 0.04298697039484978, "learning_rate": 0.00015876958501163512, "loss": 0.1709, "step": 1169 }, { "epoch": 1.110847377165915, "grad_norm": 0.02922794409096241, "learning_rate": 0.00015868014338405592, "loss": 0.1201, "step": 1170 }, { "epoch": 1.111796819368621, "grad_norm": 0.02963380515575409, "learning_rate": 0.00015859063010301974, "loss": 0.119, "step": 1171 }, { "epoch": 1.1127462615713268, "grad_norm": 0.05272309482097626, "learning_rate": 0.0001585010452778299, "loss": 0.1785, "step": 1172 }, { "epoch": 1.1136957037740327, "grad_norm": 0.031892240047454834, "learning_rate": 0.00015841138901787714, "loss": 0.1292, "step": 1173 }, { "epoch": 1.1146451459767386, "grad_norm": 0.02971399575471878, "learning_rate": 0.0001583216614326394, "loss": 0.124, "step": 1174 }, { "epoch": 1.1155945881794447, "grad_norm": 0.03037869744002819, "learning_rate": 0.00015823186263168169, "loss": 0.1303, "step": 1175 }, { "epoch": 1.1165440303821506, "grad_norm": 0.02748207375407219, "learning_rate": 0.000158141992724656, "loss": 0.1243, "step": 1176 }, { "epoch": 1.1174934725848564, "grad_norm": 0.033940836787223816, "learning_rate": 0.00015805205182130113, "loss": 0.1254, "step": 1177 }, { "epoch": 1.1184429147875623, "grad_norm": 0.03295721858739853, "learning_rate": 0.00015796204003144264, "loss": 0.1235, "step": 1178 }, { "epoch": 1.1193923569902682, "grad_norm": 0.031388405710458755, "learning_rate": 0.00015787195746499254, "loss": 0.1221, "step": 1179 }, { "epoch": 1.120341799192974, "grad_norm": 0.033072203397750854, "learning_rate": 0.00015778180423194936, "loss": 0.1284, "step": 1180 }, { "epoch": 1.1212912413956801, "grad_norm": 0.03310628607869148, "learning_rate": 0.00015769158044239787, "loss": 0.136, "step": 1181 }, { "epoch": 1.122240683598386, "grad_norm": 0.030244866386055946, "learning_rate": 0.000157601286206509, "loss": 0.1255, "step": 1182 }, { "epoch": 1.123190125801092, "grad_norm": 0.04400714859366417, "learning_rate": 0.0001575109216345397, "loss": 0.1706, "step": 1183 }, { "epoch": 1.1241395680037978, "grad_norm": 0.03094104304909706, "learning_rate": 0.00015742048683683288, "loss": 0.1261, "step": 1184 }, { "epoch": 1.1250890102065036, "grad_norm": 0.03327153995633125, "learning_rate": 0.00015732998192381707, "loss": 0.1334, "step": 1185 }, { "epoch": 1.1260384524092095, "grad_norm": 0.03229563683271408, "learning_rate": 0.0001572394070060065, "loss": 0.1168, "step": 1186 }, { "epoch": 1.1269878946119154, "grad_norm": 0.03267960995435715, "learning_rate": 0.0001571487621940009, "loss": 0.1331, "step": 1187 }, { "epoch": 1.1279373368146215, "grad_norm": 0.02902175299823284, "learning_rate": 0.00015705804759848523, "loss": 0.1302, "step": 1188 }, { "epoch": 1.1288867790173274, "grad_norm": 0.030303161591291428, "learning_rate": 0.00015696726333022984, "loss": 0.1267, "step": 1189 }, { "epoch": 1.1298362212200332, "grad_norm": 0.031242702156305313, "learning_rate": 0.00015687640950009, "loss": 0.125, "step": 1190 }, { "epoch": 1.1307856634227391, "grad_norm": 0.02697862684726715, "learning_rate": 0.00015678548621900597, "loss": 0.1207, "step": 1191 }, { "epoch": 1.131735105625445, "grad_norm": 0.03510352969169617, "learning_rate": 0.0001566944935980029, "loss": 0.132, "step": 1192 }, { "epoch": 1.1326845478281509, "grad_norm": 0.03274201229214668, "learning_rate": 0.00015660343174819045, "loss": 0.1262, "step": 1193 }, { "epoch": 1.133633990030857, "grad_norm": 0.03378736600279808, "learning_rate": 0.00015651230078076296, "loss": 0.1318, "step": 1194 }, { "epoch": 1.1345834322335628, "grad_norm": 0.032314665615558624, "learning_rate": 0.00015642110080699907, "loss": 0.117, "step": 1195 }, { "epoch": 1.1355328744362687, "grad_norm": 0.04177004471421242, "learning_rate": 0.00015632983193826174, "loss": 0.1527, "step": 1196 }, { "epoch": 1.1364823166389746, "grad_norm": 0.05647768825292587, "learning_rate": 0.00015623849428599804, "loss": 0.2019, "step": 1197 }, { "epoch": 1.1374317588416805, "grad_norm": 0.03124004229903221, "learning_rate": 0.00015614708796173906, "loss": 0.1228, "step": 1198 }, { "epoch": 1.1383812010443863, "grad_norm": 0.029155779629945755, "learning_rate": 0.00015605561307709964, "loss": 0.126, "step": 1199 }, { "epoch": 1.1393306432470922, "grad_norm": 0.030367044731974602, "learning_rate": 0.0001559640697437785, "loss": 0.1296, "step": 1200 }, { "epoch": 1.1402800854497983, "grad_norm": 0.06225927174091339, "learning_rate": 0.00015587245807355778, "loss": 0.2258, "step": 1201 }, { "epoch": 1.1412295276525042, "grad_norm": 0.03878935053944588, "learning_rate": 0.00015578077817830313, "loss": 0.1322, "step": 1202 }, { "epoch": 1.14217896985521, "grad_norm": 0.030811108648777008, "learning_rate": 0.0001556890301699636, "loss": 0.127, "step": 1203 }, { "epoch": 1.143128412057916, "grad_norm": 0.0561293326318264, "learning_rate": 0.00015559721416057127, "loss": 0.1689, "step": 1204 }, { "epoch": 1.1440778542606218, "grad_norm": 0.05256973206996918, "learning_rate": 0.0001555053302622413, "loss": 0.1735, "step": 1205 }, { "epoch": 1.145027296463328, "grad_norm": 0.037241023033857346, "learning_rate": 0.0001554133785871718, "loss": 0.1304, "step": 1206 }, { "epoch": 1.1459767386660338, "grad_norm": 0.04502008110284805, "learning_rate": 0.00015532135924764358, "loss": 0.1594, "step": 1207 }, { "epoch": 1.1469261808687397, "grad_norm": 0.053607337176799774, "learning_rate": 0.00015522927235602014, "loss": 0.1683, "step": 1208 }, { "epoch": 1.1478756230714455, "grad_norm": 0.029219908639788628, "learning_rate": 0.00015513711802474735, "loss": 0.1267, "step": 1209 }, { "epoch": 1.1488250652741514, "grad_norm": 0.03088328242301941, "learning_rate": 0.0001550448963663536, "loss": 0.1318, "step": 1210 }, { "epoch": 1.1497745074768573, "grad_norm": 0.03560802713036537, "learning_rate": 0.00015495260749344932, "loss": 0.1433, "step": 1211 }, { "epoch": 1.1507239496795632, "grad_norm": 0.033212240785360336, "learning_rate": 0.00015486025151872706, "loss": 0.1222, "step": 1212 }, { "epoch": 1.1516733918822692, "grad_norm": 0.032883357256650925, "learning_rate": 0.00015476782855496145, "loss": 0.1317, "step": 1213 }, { "epoch": 1.1526228340849751, "grad_norm": 0.028118513524532318, "learning_rate": 0.0001546753387150087, "loss": 0.1306, "step": 1214 }, { "epoch": 1.153572276287681, "grad_norm": 0.040216926485300064, "learning_rate": 0.00015458278211180688, "loss": 0.1482, "step": 1215 }, { "epoch": 1.1545217184903869, "grad_norm": 0.029117384925484657, "learning_rate": 0.00015449015885837542, "loss": 0.1287, "step": 1216 }, { "epoch": 1.1554711606930927, "grad_norm": 0.08133453875780106, "learning_rate": 0.00015439746906781524, "loss": 0.2109, "step": 1217 }, { "epoch": 1.1564206028957986, "grad_norm": 0.03554106503725052, "learning_rate": 0.00015430471285330846, "loss": 0.1297, "step": 1218 }, { "epoch": 1.1573700450985047, "grad_norm": 0.037721745669841766, "learning_rate": 0.00015421189032811835, "loss": 0.1364, "step": 1219 }, { "epoch": 1.1583194873012106, "grad_norm": 0.025993864983320236, "learning_rate": 0.00015411900160558912, "loss": 0.1198, "step": 1220 }, { "epoch": 1.1592689295039165, "grad_norm": 0.03330320492386818, "learning_rate": 0.00015402604679914575, "loss": 0.1253, "step": 1221 }, { "epoch": 1.1602183717066223, "grad_norm": 0.03463476151227951, "learning_rate": 0.00015393302602229408, "loss": 0.1235, "step": 1222 }, { "epoch": 1.1611678139093282, "grad_norm": 0.03458210453391075, "learning_rate": 0.00015383993938862037, "loss": 0.1238, "step": 1223 }, { "epoch": 1.162117256112034, "grad_norm": 0.03344335779547691, "learning_rate": 0.00015374678701179134, "loss": 0.1267, "step": 1224 }, { "epoch": 1.16306669831474, "grad_norm": 0.027585169300436974, "learning_rate": 0.00015365356900555395, "loss": 0.1192, "step": 1225 }, { "epoch": 1.164016140517446, "grad_norm": 0.03154926374554634, "learning_rate": 0.00015356028548373538, "loss": 0.1288, "step": 1226 }, { "epoch": 1.164965582720152, "grad_norm": 0.030677665024995804, "learning_rate": 0.00015346693656024271, "loss": 0.1292, "step": 1227 }, { "epoch": 1.1659150249228578, "grad_norm": 0.030145341530442238, "learning_rate": 0.00015337352234906298, "loss": 0.1331, "step": 1228 }, { "epoch": 1.1668644671255637, "grad_norm": 0.03683342784643173, "learning_rate": 0.00015328004296426287, "loss": 0.125, "step": 1229 }, { "epoch": 1.1678139093282696, "grad_norm": 0.029177436605095863, "learning_rate": 0.0001531864985199887, "loss": 0.1313, "step": 1230 }, { "epoch": 1.1687633515309757, "grad_norm": 0.0415952131152153, "learning_rate": 0.0001530928891304662, "loss": 0.1642, "step": 1231 }, { "epoch": 1.1697127937336815, "grad_norm": 0.027380244806408882, "learning_rate": 0.00015299921491000043, "loss": 0.1254, "step": 1232 }, { "epoch": 1.1706622359363874, "grad_norm": 0.03130066767334938, "learning_rate": 0.00015290547597297555, "loss": 0.1291, "step": 1233 }, { "epoch": 1.1716116781390933, "grad_norm": 0.04524728283286095, "learning_rate": 0.00015281167243385484, "loss": 0.1627, "step": 1234 }, { "epoch": 1.1725611203417992, "grad_norm": 0.04698526859283447, "learning_rate": 0.0001527178044071804, "loss": 0.1892, "step": 1235 }, { "epoch": 1.173510562544505, "grad_norm": 0.041941914707422256, "learning_rate": 0.00015262387200757314, "loss": 0.1603, "step": 1236 }, { "epoch": 1.174460004747211, "grad_norm": 0.03519544377923012, "learning_rate": 0.0001525298753497324, "loss": 0.1306, "step": 1237 }, { "epoch": 1.175409446949917, "grad_norm": 0.036025673151016235, "learning_rate": 0.00015243581454843624, "loss": 0.1315, "step": 1238 }, { "epoch": 1.1763588891526229, "grad_norm": 0.05133717134594917, "learning_rate": 0.0001523416897185409, "loss": 0.1661, "step": 1239 }, { "epoch": 1.1773083313553288, "grad_norm": 0.03367958217859268, "learning_rate": 0.00015224750097498073, "loss": 0.1311, "step": 1240 }, { "epoch": 1.1782577735580346, "grad_norm": 0.035372741520404816, "learning_rate": 0.0001521532484327683, "loss": 0.1324, "step": 1241 }, { "epoch": 1.1792072157607405, "grad_norm": 0.048746585845947266, "learning_rate": 0.000152058932206994, "loss": 0.1756, "step": 1242 }, { "epoch": 1.1801566579634466, "grad_norm": 0.03578799590468407, "learning_rate": 0.00015196455241282592, "loss": 0.1344, "step": 1243 }, { "epoch": 1.1811061001661525, "grad_norm": 0.030654437839984894, "learning_rate": 0.00015187010916550988, "loss": 0.1268, "step": 1244 }, { "epoch": 1.1820555423688583, "grad_norm": 0.02881826087832451, "learning_rate": 0.0001517756025803691, "loss": 0.1149, "step": 1245 }, { "epoch": 1.1830049845715642, "grad_norm": 0.031242484226822853, "learning_rate": 0.00015168103277280422, "loss": 0.1338, "step": 1246 }, { "epoch": 1.18395442677427, "grad_norm": 0.028528152033686638, "learning_rate": 0.000151586399858293, "loss": 0.1203, "step": 1247 }, { "epoch": 1.184903868976976, "grad_norm": 0.028410421684384346, "learning_rate": 0.00015149170395239035, "loss": 0.1296, "step": 1248 }, { "epoch": 1.1858533111796818, "grad_norm": 0.029383866116404533, "learning_rate": 0.00015139694517072796, "loss": 0.1284, "step": 1249 }, { "epoch": 1.186802753382388, "grad_norm": 0.031084850430488586, "learning_rate": 0.00015130212362901447, "loss": 0.1272, "step": 1250 }, { "epoch": 1.1877521955850938, "grad_norm": 0.035449109971523285, "learning_rate": 0.00015120723944303497, "loss": 0.1293, "step": 1251 }, { "epoch": 1.1887016377877997, "grad_norm": 0.030890950933098793, "learning_rate": 0.0001511122927286512, "loss": 0.1221, "step": 1252 }, { "epoch": 1.1896510799905056, "grad_norm": 0.06967780739068985, "learning_rate": 0.0001510172836018012, "loss": 0.1277, "step": 1253 }, { "epoch": 1.1906005221932114, "grad_norm": 0.03075585328042507, "learning_rate": 0.00015092221217849917, "loss": 0.1278, "step": 1254 }, { "epoch": 1.1915499643959173, "grad_norm": 0.040119290351867676, "learning_rate": 0.00015082707857483544, "loss": 0.1546, "step": 1255 }, { "epoch": 1.1924994065986234, "grad_norm": 0.03376394882798195, "learning_rate": 0.0001507318829069763, "loss": 0.1325, "step": 1256 }, { "epoch": 1.1934488488013293, "grad_norm": 0.034935321658849716, "learning_rate": 0.00015063662529116368, "loss": 0.1361, "step": 1257 }, { "epoch": 1.1943982910040352, "grad_norm": 0.043972812592983246, "learning_rate": 0.00015054130584371528, "loss": 0.1292, "step": 1258 }, { "epoch": 1.195347733206741, "grad_norm": 0.06039128825068474, "learning_rate": 0.0001504459246810243, "loss": 0.1958, "step": 1259 }, { "epoch": 1.196297175409447, "grad_norm": 0.030998772010207176, "learning_rate": 0.00015035048191955927, "loss": 0.1166, "step": 1260 }, { "epoch": 1.1972466176121528, "grad_norm": 0.0286384467035532, "learning_rate": 0.00015025497767586393, "loss": 0.1225, "step": 1261 }, { "epoch": 1.1981960598148587, "grad_norm": 0.03200898319482803, "learning_rate": 0.0001501594120665571, "loss": 0.1244, "step": 1262 }, { "epoch": 1.1991455020175648, "grad_norm": 0.032870370894670486, "learning_rate": 0.00015006378520833252, "loss": 0.126, "step": 1263 }, { "epoch": 1.2000949442202706, "grad_norm": 0.034849826246500015, "learning_rate": 0.00014996809721795872, "loss": 0.1263, "step": 1264 }, { "epoch": 1.2010443864229765, "grad_norm": 0.045324552804231644, "learning_rate": 0.00014987234821227898, "loss": 0.1668, "step": 1265 }, { "epoch": 1.2019938286256824, "grad_norm": 0.036612797528505325, "learning_rate": 0.0001497765383082109, "loss": 0.1595, "step": 1266 }, { "epoch": 1.2029432708283883, "grad_norm": 0.03746375814080238, "learning_rate": 0.00014968066762274657, "loss": 0.1644, "step": 1267 }, { "epoch": 1.2038927130310944, "grad_norm": 0.03137432038784027, "learning_rate": 0.0001495847362729523, "loss": 0.1239, "step": 1268 }, { "epoch": 1.2048421552338002, "grad_norm": 0.0314825214445591, "learning_rate": 0.0001494887443759684, "loss": 0.1258, "step": 1269 }, { "epoch": 1.205791597436506, "grad_norm": 0.032157186418771744, "learning_rate": 0.00014939269204900917, "loss": 0.1233, "step": 1270 }, { "epoch": 1.206741039639212, "grad_norm": 0.0410330593585968, "learning_rate": 0.0001492965794093627, "loss": 0.153, "step": 1271 }, { "epoch": 1.2076904818419179, "grad_norm": 0.0325077585875988, "learning_rate": 0.0001492004065743907, "loss": 0.1241, "step": 1272 }, { "epoch": 1.2086399240446237, "grad_norm": 0.033166393637657166, "learning_rate": 0.00014910417366152844, "loss": 0.1292, "step": 1273 }, { "epoch": 1.2095893662473296, "grad_norm": 0.02926860749721527, "learning_rate": 0.0001490078807882845, "loss": 0.1242, "step": 1274 }, { "epoch": 1.2105388084500357, "grad_norm": 0.04637501388788223, "learning_rate": 0.00014891152807224066, "loss": 0.1404, "step": 1275 }, { "epoch": 1.2114882506527416, "grad_norm": 0.035617321729660034, "learning_rate": 0.0001488151156310518, "loss": 0.1292, "step": 1276 }, { "epoch": 1.2124376928554474, "grad_norm": 0.036330446600914, "learning_rate": 0.00014871864358244574, "loss": 0.1326, "step": 1277 }, { "epoch": 1.2133871350581533, "grad_norm": 0.034302353858947754, "learning_rate": 0.00014862211204422305, "loss": 0.1296, "step": 1278 }, { "epoch": 1.2143365772608592, "grad_norm": 0.027070587500929832, "learning_rate": 0.00014852552113425702, "loss": 0.1227, "step": 1279 }, { "epoch": 1.215286019463565, "grad_norm": 0.029872050508856773, "learning_rate": 0.00014842887097049333, "loss": 0.1333, "step": 1280 }, { "epoch": 1.2162354616662712, "grad_norm": 0.0336853489279747, "learning_rate": 0.0001483321616709501, "loss": 0.1264, "step": 1281 }, { "epoch": 1.217184903868977, "grad_norm": 0.03892628103494644, "learning_rate": 0.00014823539335371763, "loss": 0.1516, "step": 1282 }, { "epoch": 1.218134346071683, "grad_norm": 0.03041486255824566, "learning_rate": 0.00014813856613695825, "loss": 0.1303, "step": 1283 }, { "epoch": 1.2190837882743888, "grad_norm": 0.042988792061805725, "learning_rate": 0.00014804168013890628, "loss": 0.1697, "step": 1284 }, { "epoch": 1.2200332304770947, "grad_norm": 0.03176325559616089, "learning_rate": 0.00014794473547786777, "loss": 0.1309, "step": 1285 }, { "epoch": 1.2209826726798005, "grad_norm": 0.032539233565330505, "learning_rate": 0.00014784773227222042, "loss": 0.1336, "step": 1286 }, { "epoch": 1.2219321148825064, "grad_norm": 0.03153330832719803, "learning_rate": 0.00014775067064041341, "loss": 0.1244, "step": 1287 }, { "epoch": 1.2228815570852125, "grad_norm": 0.03229093924164772, "learning_rate": 0.00014765355070096728, "loss": 0.1331, "step": 1288 }, { "epoch": 1.2238309992879184, "grad_norm": 0.03116600587964058, "learning_rate": 0.0001475563725724737, "loss": 0.1263, "step": 1289 }, { "epoch": 1.2247804414906243, "grad_norm": 0.027543194591999054, "learning_rate": 0.0001474591363735955, "loss": 0.1319, "step": 1290 }, { "epoch": 1.2257298836933301, "grad_norm": 0.031299810856580734, "learning_rate": 0.00014736184222306637, "loss": 0.1235, "step": 1291 }, { "epoch": 1.226679325896036, "grad_norm": 0.030574094504117966, "learning_rate": 0.00014726449023969073, "loss": 0.1337, "step": 1292 }, { "epoch": 1.227628768098742, "grad_norm": 0.04210914671421051, "learning_rate": 0.0001471670805423437, "loss": 0.1439, "step": 1293 }, { "epoch": 1.228578210301448, "grad_norm": 0.03234979510307312, "learning_rate": 0.00014706961324997077, "loss": 0.1339, "step": 1294 }, { "epoch": 1.2295276525041539, "grad_norm": 0.028707873076200485, "learning_rate": 0.00014697208848158782, "loss": 0.1271, "step": 1295 }, { "epoch": 1.2304770947068597, "grad_norm": 0.03162172809243202, "learning_rate": 0.0001468745063562809, "loss": 0.1291, "step": 1296 }, { "epoch": 1.2314265369095656, "grad_norm": 0.032575272023677826, "learning_rate": 0.00014677686699320614, "loss": 0.1345, "step": 1297 }, { "epoch": 1.2323759791122715, "grad_norm": 0.02751932106912136, "learning_rate": 0.0001466791705115895, "loss": 0.12, "step": 1298 }, { "epoch": 1.2333254213149774, "grad_norm": 0.026365652680397034, "learning_rate": 0.00014658141703072675, "loss": 0.1147, "step": 1299 }, { "epoch": 1.2342748635176835, "grad_norm": 0.03227808326482773, "learning_rate": 0.00014648360666998314, "loss": 0.1332, "step": 1300 }, { "epoch": 1.2352243057203893, "grad_norm": 0.057433344423770905, "learning_rate": 0.00014638573954879356, "loss": 0.2349, "step": 1301 }, { "epoch": 1.2361737479230952, "grad_norm": 0.0376310870051384, "learning_rate": 0.000146287815786662, "loss": 0.1588, "step": 1302 }, { "epoch": 1.237123190125801, "grad_norm": 0.04079489782452583, "learning_rate": 0.00014618983550316182, "loss": 0.1625, "step": 1303 }, { "epoch": 1.238072632328507, "grad_norm": 0.03367112949490547, "learning_rate": 0.00014609179881793524, "loss": 0.1266, "step": 1304 }, { "epoch": 1.2390220745312128, "grad_norm": 0.030257422477006912, "learning_rate": 0.0001459937058506934, "loss": 0.132, "step": 1305 }, { "epoch": 1.239971516733919, "grad_norm": 0.03830450400710106, "learning_rate": 0.00014589555672121622, "loss": 0.1349, "step": 1306 }, { "epoch": 1.2409209589366248, "grad_norm": 0.030318403616547585, "learning_rate": 0.00014579735154935213, "loss": 0.1346, "step": 1307 }, { "epoch": 1.2418704011393307, "grad_norm": 0.033780913800001144, "learning_rate": 0.000145699090455018, "loss": 0.1373, "step": 1308 }, { "epoch": 1.2428198433420365, "grad_norm": 0.03642027825117111, "learning_rate": 0.00014560077355819904, "loss": 0.1279, "step": 1309 }, { "epoch": 1.2437692855447424, "grad_norm": 0.027665693312883377, "learning_rate": 0.00014550240097894852, "loss": 0.1177, "step": 1310 }, { "epoch": 1.2447187277474483, "grad_norm": 0.030852187424898148, "learning_rate": 0.00014540397283738777, "loss": 0.1373, "step": 1311 }, { "epoch": 1.2456681699501542, "grad_norm": 0.03137564659118652, "learning_rate": 0.00014530548925370594, "loss": 0.129, "step": 1312 }, { "epoch": 1.2466176121528603, "grad_norm": 0.032402969896793365, "learning_rate": 0.0001452069503481599, "loss": 0.1394, "step": 1313 }, { "epoch": 1.2475670543555661, "grad_norm": 0.049039825797080994, "learning_rate": 0.00014510835624107396, "loss": 0.1593, "step": 1314 }, { "epoch": 1.248516496558272, "grad_norm": 0.0339655838906765, "learning_rate": 0.00014500970705284006, "loss": 0.1367, "step": 1315 }, { "epoch": 1.249465938760978, "grad_norm": 0.040319304913282394, "learning_rate": 0.00014491100290391716, "loss": 0.1679, "step": 1316 }, { "epoch": 1.2504153809636838, "grad_norm": 0.024075858294963837, "learning_rate": 0.00014481224391483152, "loss": 0.1273, "step": 1317 }, { "epoch": 1.2513648231663899, "grad_norm": 0.02570619434118271, "learning_rate": 0.00014471343020617625, "loss": 0.1277, "step": 1318 }, { "epoch": 1.2523142653690957, "grad_norm": 0.028086170554161072, "learning_rate": 0.00014461456189861132, "loss": 0.1246, "step": 1319 }, { "epoch": 1.2532637075718016, "grad_norm": 0.02902062050998211, "learning_rate": 0.0001445156391128633, "loss": 0.1305, "step": 1320 }, { "epoch": 1.2542131497745075, "grad_norm": 0.030352482572197914, "learning_rate": 0.00014441666196972542, "loss": 0.1329, "step": 1321 }, { "epoch": 1.2551625919772134, "grad_norm": 0.029998816549777985, "learning_rate": 0.00014431763059005718, "loss": 0.131, "step": 1322 }, { "epoch": 1.2561120341799192, "grad_norm": 0.028111204504966736, "learning_rate": 0.00014421854509478435, "loss": 0.124, "step": 1323 }, { "epoch": 1.257061476382625, "grad_norm": 0.02926759235560894, "learning_rate": 0.00014411940560489877, "loss": 0.1215, "step": 1324 }, { "epoch": 1.258010918585331, "grad_norm": 0.03321680426597595, "learning_rate": 0.00014402021224145815, "loss": 0.1216, "step": 1325 }, { "epoch": 1.258960360788037, "grad_norm": 0.03386010602116585, "learning_rate": 0.00014392096512558613, "loss": 0.1335, "step": 1326 }, { "epoch": 1.259909802990743, "grad_norm": 0.03989921137690544, "learning_rate": 0.0001438216643784718, "loss": 0.1481, "step": 1327 }, { "epoch": 1.2608592451934488, "grad_norm": 0.030915161594748497, "learning_rate": 0.00014372231012136995, "loss": 0.1254, "step": 1328 }, { "epoch": 1.2618086873961547, "grad_norm": 0.04395739734172821, "learning_rate": 0.00014362290247560053, "loss": 0.1537, "step": 1329 }, { "epoch": 1.2627581295988608, "grad_norm": 0.02942941151559353, "learning_rate": 0.00014352344156254873, "loss": 0.1248, "step": 1330 }, { "epoch": 1.2637075718015667, "grad_norm": 0.02858722023665905, "learning_rate": 0.00014342392750366485, "loss": 0.1236, "step": 1331 }, { "epoch": 1.2646570140042726, "grad_norm": 0.029218707233667374, "learning_rate": 0.000143324360420464, "loss": 0.1227, "step": 1332 }, { "epoch": 1.2656064562069784, "grad_norm": 0.03079938143491745, "learning_rate": 0.0001432247404345261, "loss": 0.1251, "step": 1333 }, { "epoch": 1.2665558984096843, "grad_norm": 0.03626713901758194, "learning_rate": 0.00014312506766749563, "loss": 0.1407, "step": 1334 }, { "epoch": 1.2675053406123902, "grad_norm": 0.026556458324193954, "learning_rate": 0.00014302534224108152, "loss": 0.1235, "step": 1335 }, { "epoch": 1.268454782815096, "grad_norm": 0.033421531319618225, "learning_rate": 0.00014292556427705706, "loss": 0.1324, "step": 1336 }, { "epoch": 1.269404225017802, "grad_norm": 0.0425841398537159, "learning_rate": 0.00014282573389725966, "loss": 0.1674, "step": 1337 }, { "epoch": 1.270353667220508, "grad_norm": 0.03258546441793442, "learning_rate": 0.00014272585122359068, "loss": 0.131, "step": 1338 }, { "epoch": 1.271303109423214, "grad_norm": 0.03566194325685501, "learning_rate": 0.00014262591637801536, "loss": 0.128, "step": 1339 }, { "epoch": 1.2722525516259198, "grad_norm": 0.03155380114912987, "learning_rate": 0.0001425259294825627, "loss": 0.1277, "step": 1340 }, { "epoch": 1.2732019938286256, "grad_norm": 0.04435742273926735, "learning_rate": 0.00014242589065932524, "loss": 0.1594, "step": 1341 }, { "epoch": 1.2741514360313315, "grad_norm": 0.03839895501732826, "learning_rate": 0.0001423258000304589, "loss": 0.1598, "step": 1342 }, { "epoch": 1.2751008782340376, "grad_norm": 0.037625472992658615, "learning_rate": 0.00014222565771818282, "loss": 0.1276, "step": 1343 }, { "epoch": 1.2760503204367435, "grad_norm": 0.029852135106921196, "learning_rate": 0.00014212546384477934, "loss": 0.1272, "step": 1344 }, { "epoch": 1.2769997626394494, "grad_norm": 0.04018719121813774, "learning_rate": 0.00014202521853259368, "loss": 0.153, "step": 1345 }, { "epoch": 1.2779492048421552, "grad_norm": 0.028893720358610153, "learning_rate": 0.00014192492190403402, "loss": 0.1245, "step": 1346 }, { "epoch": 1.2788986470448611, "grad_norm": 0.029052307829260826, "learning_rate": 0.000141824574081571, "loss": 0.1272, "step": 1347 }, { "epoch": 1.279848089247567, "grad_norm": 0.03023959882557392, "learning_rate": 0.00014172417518773788, "loss": 0.1259, "step": 1348 }, { "epoch": 1.2807975314502729, "grad_norm": 0.06486006826162338, "learning_rate": 0.00014162372534513027, "loss": 0.2279, "step": 1349 }, { "epoch": 1.2817469736529787, "grad_norm": 0.034129489213228226, "learning_rate": 0.00014152322467640599, "loss": 0.138, "step": 1350 }, { "epoch": 1.2826964158556848, "grad_norm": 0.034559451043605804, "learning_rate": 0.0001414226733042849, "loss": 0.1321, "step": 1351 }, { "epoch": 1.2836458580583907, "grad_norm": 0.03269064798951149, "learning_rate": 0.0001413220713515489, "loss": 0.1297, "step": 1352 }, { "epoch": 1.2845953002610966, "grad_norm": 0.030022764578461647, "learning_rate": 0.0001412214189410414, "loss": 0.1278, "step": 1353 }, { "epoch": 1.2855447424638025, "grad_norm": 0.05017710104584694, "learning_rate": 0.00014112071619566766, "loss": 0.1572, "step": 1354 }, { "epoch": 1.2864941846665086, "grad_norm": 0.035493746399879456, "learning_rate": 0.00014101996323839433, "loss": 0.1277, "step": 1355 }, { "epoch": 1.2874436268692144, "grad_norm": 0.03152285888791084, "learning_rate": 0.0001409191601922493, "loss": 0.1321, "step": 1356 }, { "epoch": 1.2883930690719203, "grad_norm": 0.029683001339435577, "learning_rate": 0.00014081830718032175, "loss": 0.1224, "step": 1357 }, { "epoch": 1.2893425112746262, "grad_norm": 0.03202647715806961, "learning_rate": 0.0001407174043257617, "loss": 0.13, "step": 1358 }, { "epoch": 1.290291953477332, "grad_norm": 0.026956327259540558, "learning_rate": 0.00014061645175178025, "loss": 0.1225, "step": 1359 }, { "epoch": 1.291241395680038, "grad_norm": 0.03426060825586319, "learning_rate": 0.00014051544958164903, "loss": 0.1345, "step": 1360 }, { "epoch": 1.2921908378827438, "grad_norm": 0.031120451167225838, "learning_rate": 0.00014041439793870036, "loss": 0.1246, "step": 1361 }, { "epoch": 1.2931402800854497, "grad_norm": 0.02656574547290802, "learning_rate": 0.00014031329694632683, "loss": 0.1297, "step": 1362 }, { "epoch": 1.2940897222881558, "grad_norm": 0.02752675488591194, "learning_rate": 0.00014021214672798143, "loss": 0.1294, "step": 1363 }, { "epoch": 1.2950391644908616, "grad_norm": 0.02884535677731037, "learning_rate": 0.00014011094740717714, "loss": 0.126, "step": 1364 }, { "epoch": 1.2959886066935675, "grad_norm": 0.03029620461165905, "learning_rate": 0.00014000969910748704, "loss": 0.1338, "step": 1365 }, { "epoch": 1.2969380488962734, "grad_norm": 0.04302069917321205, "learning_rate": 0.00013990840195254384, "loss": 0.1653, "step": 1366 }, { "epoch": 1.2978874910989793, "grad_norm": 0.048259928822517395, "learning_rate": 0.00013980705606604011, "loss": 0.1269, "step": 1367 }, { "epoch": 1.2988369333016854, "grad_norm": 0.029876641929149628, "learning_rate": 0.00013970566157172774, "loss": 0.1282, "step": 1368 }, { "epoch": 1.2997863755043912, "grad_norm": 0.35130763053894043, "learning_rate": 0.00013960421859341804, "loss": 0.1434, "step": 1369 }, { "epoch": 1.3007358177070971, "grad_norm": 0.23888662457466125, "learning_rate": 0.00013950272725498156, "loss": 0.186, "step": 1370 }, { "epoch": 1.301685259909803, "grad_norm": 0.24867364764213562, "learning_rate": 0.00013940118768034792, "loss": 0.1585, "step": 1371 }, { "epoch": 1.3026347021125089, "grad_norm": 0.10958488285541534, "learning_rate": 0.0001392995999935055, "loss": 0.1448, "step": 1372 }, { "epoch": 1.3035841443152147, "grad_norm": 0.05493846908211708, "learning_rate": 0.0001391979643185016, "loss": 0.1292, "step": 1373 }, { "epoch": 1.3045335865179206, "grad_norm": 0.0429663360118866, "learning_rate": 0.000139096280779442, "loss": 0.1243, "step": 1374 }, { "epoch": 1.3054830287206267, "grad_norm": 0.02995472215116024, "learning_rate": 0.000138994549500491, "loss": 0.1216, "step": 1375 }, { "epoch": 1.3064324709233326, "grad_norm": 0.04113904386758804, "learning_rate": 0.00013889277060587119, "loss": 0.1586, "step": 1376 }, { "epoch": 1.3073819131260385, "grad_norm": 0.030523164197802544, "learning_rate": 0.0001387909442198632, "loss": 0.1212, "step": 1377 }, { "epoch": 1.3083313553287443, "grad_norm": 0.04093822091817856, "learning_rate": 0.00013868907046680576, "loss": 0.1254, "step": 1378 }, { "epoch": 1.3092807975314502, "grad_norm": 0.04895343258976936, "learning_rate": 0.0001385871494710954, "loss": 0.1636, "step": 1379 }, { "epoch": 1.3102302397341563, "grad_norm": 0.09062381833791733, "learning_rate": 0.0001384851813571864, "loss": 0.167, "step": 1380 }, { "epoch": 1.3111796819368622, "grad_norm": 0.034514930099248886, "learning_rate": 0.00013838316624959044, "loss": 0.1186, "step": 1381 }, { "epoch": 1.312129124139568, "grad_norm": 0.052746132016181946, "learning_rate": 0.0001382811042728767, "loss": 0.1289, "step": 1382 }, { "epoch": 1.313078566342274, "grad_norm": 0.06090299040079117, "learning_rate": 0.00013817899555167154, "loss": 0.1599, "step": 1383 }, { "epoch": 1.3140280085449798, "grad_norm": 0.036167677491903305, "learning_rate": 0.00013807684021065842, "loss": 0.1294, "step": 1384 }, { "epoch": 1.3149774507476857, "grad_norm": 0.05916972458362579, "learning_rate": 0.00013797463837457775, "loss": 0.1263, "step": 1385 }, { "epoch": 1.3159268929503916, "grad_norm": 0.03488500416278839, "learning_rate": 0.00013787239016822662, "loss": 0.1347, "step": 1386 }, { "epoch": 1.3168763351530974, "grad_norm": 0.038088779896497726, "learning_rate": 0.00013777009571645885, "loss": 0.1302, "step": 1387 }, { "epoch": 1.3178257773558035, "grad_norm": 0.05069038271903992, "learning_rate": 0.00013766775514418469, "loss": 0.1553, "step": 1388 }, { "epoch": 1.3187752195585094, "grad_norm": 0.06475794315338135, "learning_rate": 0.00013756536857637065, "loss": 0.2018, "step": 1389 }, { "epoch": 1.3197246617612153, "grad_norm": 0.03393110632896423, "learning_rate": 0.00013746293613803952, "loss": 0.1238, "step": 1390 }, { "epoch": 1.3206741039639212, "grad_norm": 0.04623769596219063, "learning_rate": 0.00013736045795427002, "loss": 0.1603, "step": 1391 }, { "epoch": 1.321623546166627, "grad_norm": 0.03696979209780693, "learning_rate": 0.0001372579341501967, "loss": 0.1291, "step": 1392 }, { "epoch": 1.3225729883693331, "grad_norm": 0.041281502693891525, "learning_rate": 0.00013715536485100994, "loss": 0.1344, "step": 1393 }, { "epoch": 1.323522430572039, "grad_norm": 0.03636416420340538, "learning_rate": 0.00013705275018195557, "loss": 0.1334, "step": 1394 }, { "epoch": 1.3244718727747449, "grad_norm": 0.037941355258226395, "learning_rate": 0.0001369500902683348, "loss": 0.1256, "step": 1395 }, { "epoch": 1.3254213149774507, "grad_norm": 0.04853476956486702, "learning_rate": 0.0001368473852355042, "loss": 0.1604, "step": 1396 }, { "epoch": 1.3263707571801566, "grad_norm": 0.034059878438711166, "learning_rate": 0.00013674463520887533, "loss": 0.1308, "step": 1397 }, { "epoch": 1.3273201993828625, "grad_norm": 0.03482759743928909, "learning_rate": 0.00013664184031391473, "loss": 0.128, "step": 1398 }, { "epoch": 1.3282696415855684, "grad_norm": 0.032961416989564896, "learning_rate": 0.00013653900067614377, "loss": 0.1276, "step": 1399 }, { "epoch": 1.3292190837882745, "grad_norm": 0.03538922220468521, "learning_rate": 0.00013643611642113842, "loss": 0.1215, "step": 1400 }, { "epoch": 1.3301685259909803, "grad_norm": 0.032544538378715515, "learning_rate": 0.00013633318767452903, "loss": 0.1235, "step": 1401 }, { "epoch": 1.3311179681936862, "grad_norm": 0.04236935079097748, "learning_rate": 0.00013623021456200048, "loss": 0.1663, "step": 1402 }, { "epoch": 1.332067410396392, "grad_norm": 0.04283679649233818, "learning_rate": 0.00013612719720929164, "loss": 0.1622, "step": 1403 }, { "epoch": 1.333016852599098, "grad_norm": 0.03691123425960541, "learning_rate": 0.00013602413574219553, "loss": 0.1247, "step": 1404 }, { "epoch": 1.333966294801804, "grad_norm": 0.028608130291104317, "learning_rate": 0.000135921030286559, "loss": 0.1153, "step": 1405 }, { "epoch": 1.33491573700451, "grad_norm": 0.03310587257146835, "learning_rate": 0.00013581788096828253, "loss": 0.1305, "step": 1406 }, { "epoch": 1.3358651792072158, "grad_norm": 0.03368659317493439, "learning_rate": 0.00013571468791332024, "loss": 0.128, "step": 1407 }, { "epoch": 1.3368146214099217, "grad_norm": 0.04785076901316643, "learning_rate": 0.00013561145124767968, "loss": 0.1715, "step": 1408 }, { "epoch": 1.3377640636126276, "grad_norm": 0.03625485301017761, "learning_rate": 0.0001355081710974217, "loss": 0.1305, "step": 1409 }, { "epoch": 1.3387135058153334, "grad_norm": 0.03318242356181145, "learning_rate": 0.00013540484758866, "loss": 0.1244, "step": 1410 }, { "epoch": 1.3396629480180393, "grad_norm": 0.03148429095745087, "learning_rate": 0.0001353014808475615, "loss": 0.1311, "step": 1411 }, { "epoch": 1.3406123902207452, "grad_norm": 0.03518190607428551, "learning_rate": 0.00013519807100034577, "loss": 0.1276, "step": 1412 }, { "epoch": 1.3415618324234513, "grad_norm": 0.031286850571632385, "learning_rate": 0.00013509461817328507, "loss": 0.1252, "step": 1413 }, { "epoch": 1.3425112746261572, "grad_norm": 0.04668812453746796, "learning_rate": 0.00013499112249270407, "loss": 0.1639, "step": 1414 }, { "epoch": 1.343460716828863, "grad_norm": 0.03220203518867493, "learning_rate": 0.00013488758408497988, "loss": 0.1254, "step": 1415 }, { "epoch": 1.344410159031569, "grad_norm": 0.03599967062473297, "learning_rate": 0.0001347840030765417, "loss": 0.1307, "step": 1416 }, { "epoch": 1.3453596012342748, "grad_norm": 0.03225992992520332, "learning_rate": 0.00013468037959387075, "loss": 0.12, "step": 1417 }, { "epoch": 1.3463090434369809, "grad_norm": 0.0338221937417984, "learning_rate": 0.00013457671376350012, "loss": 0.1199, "step": 1418 }, { "epoch": 1.3472584856396868, "grad_norm": 0.046623844653367996, "learning_rate": 0.00013447300571201468, "loss": 0.1695, "step": 1419 }, { "epoch": 1.3482079278423926, "grad_norm": 0.029645482078194618, "learning_rate": 0.00013436925556605078, "loss": 0.127, "step": 1420 }, { "epoch": 1.3491573700450985, "grad_norm": 0.03167693316936493, "learning_rate": 0.00013426546345229618, "loss": 0.1268, "step": 1421 }, { "epoch": 1.3501068122478044, "grad_norm": 0.04903126880526543, "learning_rate": 0.0001341616294974899, "loss": 0.1616, "step": 1422 }, { "epoch": 1.3510562544505103, "grad_norm": 0.03323996067047119, "learning_rate": 0.00013405775382842206, "loss": 0.1345, "step": 1423 }, { "epoch": 1.3520056966532161, "grad_norm": 0.03047449141740799, "learning_rate": 0.0001339538365719337, "loss": 0.1286, "step": 1424 }, { "epoch": 1.3529551388559222, "grad_norm": 0.03170877322554588, "learning_rate": 0.00013384987785491665, "loss": 0.1264, "step": 1425 }, { "epoch": 1.353904581058628, "grad_norm": 0.03209366276860237, "learning_rate": 0.00013374587780431337, "loss": 0.1256, "step": 1426 }, { "epoch": 1.354854023261334, "grad_norm": 0.04508209601044655, "learning_rate": 0.00013364183654711678, "loss": 0.1675, "step": 1427 }, { "epoch": 1.3558034654640398, "grad_norm": 0.0423690564930439, "learning_rate": 0.00013353775421037008, "loss": 0.1578, "step": 1428 }, { "epoch": 1.3567529076667457, "grad_norm": 0.03641896694898605, "learning_rate": 0.0001334336309211668, "loss": 0.1321, "step": 1429 }, { "epoch": 1.3577023498694518, "grad_norm": 0.03118027374148369, "learning_rate": 0.00013332946680665023, "loss": 0.1284, "step": 1430 }, { "epoch": 1.3586517920721577, "grad_norm": 0.029232513159513474, "learning_rate": 0.00013322526199401367, "loss": 0.1251, "step": 1431 }, { "epoch": 1.3596012342748636, "grad_norm": 0.028796685859560966, "learning_rate": 0.00013312101661050007, "loss": 0.1266, "step": 1432 }, { "epoch": 1.3605506764775694, "grad_norm": 0.04318710416555405, "learning_rate": 0.00013301673078340196, "loss": 0.162, "step": 1433 }, { "epoch": 1.3615001186802753, "grad_norm": 0.0319884791970253, "learning_rate": 0.00013291240464006118, "loss": 0.1248, "step": 1434 }, { "epoch": 1.3624495608829812, "grad_norm": 0.034977950155735016, "learning_rate": 0.0001328080383078689, "loss": 0.1325, "step": 1435 }, { "epoch": 1.363399003085687, "grad_norm": 0.026079673320055008, "learning_rate": 0.00013270363191426524, "loss": 0.1266, "step": 1436 }, { "epoch": 1.364348445288393, "grad_norm": 0.03158127889037132, "learning_rate": 0.0001325991855867394, "loss": 0.1326, "step": 1437 }, { "epoch": 1.365297887491099, "grad_norm": 0.030608203262090683, "learning_rate": 0.00013249469945282916, "loss": 0.1358, "step": 1438 }, { "epoch": 1.366247329693805, "grad_norm": 0.028404321521520615, "learning_rate": 0.00013239017364012105, "loss": 0.1273, "step": 1439 }, { "epoch": 1.3671967718965108, "grad_norm": 0.03177287429571152, "learning_rate": 0.00013228560827624995, "loss": 0.1241, "step": 1440 }, { "epoch": 1.3681462140992167, "grad_norm": 0.030803440138697624, "learning_rate": 0.00013218100348889912, "loss": 0.1271, "step": 1441 }, { "epoch": 1.3690956563019228, "grad_norm": 0.038757532835006714, "learning_rate": 0.0001320763594057999, "loss": 0.1302, "step": 1442 }, { "epoch": 1.3700450985046286, "grad_norm": 0.030192391946911812, "learning_rate": 0.00013197167615473164, "loss": 0.1246, "step": 1443 }, { "epoch": 1.3709945407073345, "grad_norm": 0.056465089321136475, "learning_rate": 0.00013186695386352158, "loss": 0.186, "step": 1444 }, { "epoch": 1.3719439829100404, "grad_norm": 0.029975950717926025, "learning_rate": 0.00013176219266004442, "loss": 0.1238, "step": 1445 }, { "epoch": 1.3728934251127463, "grad_norm": 0.03526061028242111, "learning_rate": 0.00013165739267222262, "loss": 0.1198, "step": 1446 }, { "epoch": 1.3738428673154521, "grad_norm": 0.031407009810209274, "learning_rate": 0.0001315525540280259, "loss": 0.1283, "step": 1447 }, { "epoch": 1.374792309518158, "grad_norm": 0.05062803998589516, "learning_rate": 0.0001314476768554712, "loss": 0.1598, "step": 1448 }, { "epoch": 1.3757417517208639, "grad_norm": 0.05837153270840645, "learning_rate": 0.0001313427612826224, "loss": 0.1706, "step": 1449 }, { "epoch": 1.37669119392357, "grad_norm": 0.032810915261507034, "learning_rate": 0.0001312378074375904, "loss": 0.1228, "step": 1450 }, { "epoch": 1.3776406361262759, "grad_norm": 0.052471403032541275, "learning_rate": 0.0001311328154485328, "loss": 0.2028, "step": 1451 }, { "epoch": 1.3785900783289817, "grad_norm": 0.062009330838918686, "learning_rate": 0.00013102778544365378, "loss": 0.1659, "step": 1452 }, { "epoch": 1.3795395205316876, "grad_norm": 0.03157106414437294, "learning_rate": 0.00013092271755120392, "loss": 0.1103, "step": 1453 }, { "epoch": 1.3804889627343935, "grad_norm": 0.03269607201218605, "learning_rate": 0.00013081761189948006, "loss": 0.1274, "step": 1454 }, { "epoch": 1.3814384049370996, "grad_norm": 0.032491281628608704, "learning_rate": 0.00013071246861682515, "loss": 0.1299, "step": 1455 }, { "epoch": 1.3823878471398054, "grad_norm": 0.04476369544863701, "learning_rate": 0.00013060728783162814, "loss": 0.1543, "step": 1456 }, { "epoch": 1.3833372893425113, "grad_norm": 0.03646747022867203, "learning_rate": 0.0001305020696723237, "loss": 0.1401, "step": 1457 }, { "epoch": 1.3842867315452172, "grad_norm": 0.03056688979268074, "learning_rate": 0.0001303968142673922, "loss": 0.1318, "step": 1458 }, { "epoch": 1.385236173747923, "grad_norm": 0.0505138523876667, "learning_rate": 0.00013029152174535942, "loss": 0.1702, "step": 1459 }, { "epoch": 1.386185615950629, "grad_norm": 0.03478003665804863, "learning_rate": 0.00013018619223479654, "loss": 0.136, "step": 1460 }, { "epoch": 1.3871350581533348, "grad_norm": 0.03196396306157112, "learning_rate": 0.00013008082586431983, "loss": 0.1235, "step": 1461 }, { "epoch": 1.3880845003560407, "grad_norm": 0.030580265447497368, "learning_rate": 0.0001299754227625907, "loss": 0.1274, "step": 1462 }, { "epoch": 1.3890339425587468, "grad_norm": 0.043844275176525116, "learning_rate": 0.00012986998305831524, "loss": 0.172, "step": 1463 }, { "epoch": 1.3899833847614527, "grad_norm": 0.031638097018003464, "learning_rate": 0.00012976450688024433, "loss": 0.1221, "step": 1464 }, { "epoch": 1.3909328269641585, "grad_norm": 0.030004551634192467, "learning_rate": 0.00012965899435717337, "loss": 0.1321, "step": 1465 }, { "epoch": 1.3918822691668644, "grad_norm": 0.031170252710580826, "learning_rate": 0.00012955344561794218, "loss": 0.1327, "step": 1466 }, { "epoch": 1.3928317113695705, "grad_norm": 0.02902391366660595, "learning_rate": 0.00012944786079143472, "loss": 0.1266, "step": 1467 }, { "epoch": 1.3937811535722764, "grad_norm": 0.031269557774066925, "learning_rate": 0.00012934224000657913, "loss": 0.1237, "step": 1468 }, { "epoch": 1.3947305957749823, "grad_norm": 0.04232865571975708, "learning_rate": 0.0001292365833923473, "loss": 0.1529, "step": 1469 }, { "epoch": 1.3956800379776881, "grad_norm": 0.03645455837249756, "learning_rate": 0.00012913089107775502, "loss": 0.1594, "step": 1470 }, { "epoch": 1.396629480180394, "grad_norm": 0.029367268085479736, "learning_rate": 0.00012902516319186161, "loss": 0.121, "step": 1471 }, { "epoch": 1.3975789223830999, "grad_norm": 0.03407928720116615, "learning_rate": 0.00012891939986376985, "loss": 0.1289, "step": 1472 }, { "epoch": 1.3985283645858058, "grad_norm": 0.034839022904634476, "learning_rate": 0.00012881360122262575, "loss": 0.1321, "step": 1473 }, { "epoch": 1.3994778067885116, "grad_norm": 0.04005248472094536, "learning_rate": 0.00012870776739761847, "loss": 0.17, "step": 1474 }, { "epoch": 1.4004272489912177, "grad_norm": 0.045347243547439575, "learning_rate": 0.00012860189851798012, "loss": 0.16, "step": 1475 }, { "epoch": 1.4013766911939236, "grad_norm": 0.031093263998627663, "learning_rate": 0.00012849599471298565, "loss": 0.1268, "step": 1476 }, { "epoch": 1.4023261333966295, "grad_norm": 0.025807669386267662, "learning_rate": 0.00012839005611195269, "loss": 0.1155, "step": 1477 }, { "epoch": 1.4032755755993354, "grad_norm": 0.04496198520064354, "learning_rate": 0.00012828408284424117, "loss": 0.1696, "step": 1478 }, { "epoch": 1.4042250178020412, "grad_norm": 0.028986521065235138, "learning_rate": 0.00012817807503925357, "loss": 0.1244, "step": 1479 }, { "epoch": 1.4051744600047473, "grad_norm": 0.043852079659700394, "learning_rate": 0.00012807203282643443, "loss": 0.1562, "step": 1480 }, { "epoch": 1.4061239022074532, "grad_norm": 0.028449110686779022, "learning_rate": 0.00012796595633527032, "loss": 0.1276, "step": 1481 }, { "epoch": 1.407073344410159, "grad_norm": 0.03772464022040367, "learning_rate": 0.00012785984569528975, "loss": 0.163, "step": 1482 }, { "epoch": 1.408022786612865, "grad_norm": 0.03227540850639343, "learning_rate": 0.00012775370103606276, "loss": 0.1272, "step": 1483 }, { "epoch": 1.4089722288155708, "grad_norm": 0.03001963160932064, "learning_rate": 0.0001276475224872011, "loss": 0.1256, "step": 1484 }, { "epoch": 1.4099216710182767, "grad_norm": 0.0357728935778141, "learning_rate": 0.00012754131017835777, "loss": 0.1355, "step": 1485 }, { "epoch": 1.4108711132209826, "grad_norm": 0.03219794109463692, "learning_rate": 0.0001274350642392271, "loss": 0.1304, "step": 1486 }, { "epoch": 1.4118205554236885, "grad_norm": 0.04606242850422859, "learning_rate": 0.00012732878479954445, "loss": 0.1629, "step": 1487 }, { "epoch": 1.4127699976263945, "grad_norm": 0.04288827255368233, "learning_rate": 0.000127222471989086, "loss": 0.1667, "step": 1488 }, { "epoch": 1.4137194398291004, "grad_norm": 0.031533095985651016, "learning_rate": 0.0001271161259376688, "loss": 0.1266, "step": 1489 }, { "epoch": 1.4146688820318063, "grad_norm": 0.03418329730629921, "learning_rate": 0.00012700974677515046, "loss": 0.1441, "step": 1490 }, { "epoch": 1.4156183242345122, "grad_norm": 0.028918685391545296, "learning_rate": 0.00012690333463142897, "loss": 0.117, "step": 1491 }, { "epoch": 1.4165677664372183, "grad_norm": 0.04630662500858307, "learning_rate": 0.00012679688963644265, "loss": 0.1694, "step": 1492 }, { "epoch": 1.4175172086399241, "grad_norm": 0.028670761734247208, "learning_rate": 0.00012669041192016993, "loss": 0.1218, "step": 1493 }, { "epoch": 1.41846665084263, "grad_norm": 0.03250902146100998, "learning_rate": 0.0001265839016126291, "loss": 0.1353, "step": 1494 }, { "epoch": 1.419416093045336, "grad_norm": 0.03904202580451965, "learning_rate": 0.00012647735884387842, "loss": 0.1566, "step": 1495 }, { "epoch": 1.4203655352480418, "grad_norm": 0.030610278248786926, "learning_rate": 0.00012637078374401568, "loss": 0.1248, "step": 1496 }, { "epoch": 1.4213149774507476, "grad_norm": 0.0320439413189888, "learning_rate": 0.00012626417644317808, "loss": 0.1341, "step": 1497 }, { "epoch": 1.4222644196534535, "grad_norm": 0.03740748390555382, "learning_rate": 0.0001261575370715423, "loss": 0.1374, "step": 1498 }, { "epoch": 1.4232138618561594, "grad_norm": 0.041164278984069824, "learning_rate": 0.00012605086575932407, "loss": 0.1242, "step": 1499 }, { "epoch": 1.4241633040588655, "grad_norm": 0.02908271551132202, "learning_rate": 0.00012594416263677816, "loss": 0.1224, "step": 1500 }, { "epoch": 1.4251127462615714, "grad_norm": 0.030539128929376602, "learning_rate": 0.0001258374278341982, "loss": 0.1236, "step": 1501 }, { "epoch": 1.4260621884642772, "grad_norm": 0.027197500690817833, "learning_rate": 0.00012573066148191647, "loss": 0.1254, "step": 1502 }, { "epoch": 1.427011630666983, "grad_norm": 0.031813718378543854, "learning_rate": 0.00012562386371030377, "loss": 0.1294, "step": 1503 }, { "epoch": 1.427961072869689, "grad_norm": 0.04184641316533089, "learning_rate": 0.00012551703464976928, "loss": 0.1615, "step": 1504 }, { "epoch": 1.428910515072395, "grad_norm": 0.03790717199444771, "learning_rate": 0.00012541017443076042, "loss": 0.1638, "step": 1505 }, { "epoch": 1.429859957275101, "grad_norm": 0.03084125556051731, "learning_rate": 0.00012530328318376258, "loss": 0.1292, "step": 1506 }, { "epoch": 1.4308093994778068, "grad_norm": 0.042278289794921875, "learning_rate": 0.00012519636103929912, "loss": 0.1691, "step": 1507 }, { "epoch": 1.4317588416805127, "grad_norm": 0.02734595723450184, "learning_rate": 0.0001250894081279311, "loss": 0.1248, "step": 1508 }, { "epoch": 1.4327082838832186, "grad_norm": 0.02997264452278614, "learning_rate": 0.00012498242458025712, "loss": 0.124, "step": 1509 }, { "epoch": 1.4336577260859245, "grad_norm": 0.031008126214146614, "learning_rate": 0.00012487541052691323, "loss": 0.1335, "step": 1510 }, { "epoch": 1.4346071682886303, "grad_norm": 0.042471520602703094, "learning_rate": 0.0001247683660985727, "loss": 0.1589, "step": 1511 }, { "epoch": 1.4355566104913362, "grad_norm": 0.027912134304642677, "learning_rate": 0.00012466129142594588, "loss": 0.1208, "step": 1512 }, { "epoch": 1.4365060526940423, "grad_norm": 0.03753120079636574, "learning_rate": 0.0001245541866397801, "loss": 0.1626, "step": 1513 }, { "epoch": 1.4374554948967482, "grad_norm": 0.02756452187895775, "learning_rate": 0.0001244470518708594, "loss": 0.1215, "step": 1514 }, { "epoch": 1.438404937099454, "grad_norm": 0.03357706964015961, "learning_rate": 0.0001243398872500045, "loss": 0.1376, "step": 1515 }, { "epoch": 1.43935437930216, "grad_norm": 0.032955266535282135, "learning_rate": 0.00012423269290807258, "loss": 0.1364, "step": 1516 }, { "epoch": 1.440303821504866, "grad_norm": 0.03405732661485672, "learning_rate": 0.000124125468975957, "loss": 0.126, "step": 1517 }, { "epoch": 1.441253263707572, "grad_norm": 0.03659766912460327, "learning_rate": 0.00012401821558458728, "loss": 0.1561, "step": 1518 }, { "epoch": 1.4422027059102778, "grad_norm": 0.030413653701543808, "learning_rate": 0.00012391093286492905, "loss": 0.1253, "step": 1519 }, { "epoch": 1.4431521481129836, "grad_norm": 0.027322586625814438, "learning_rate": 0.00012380362094798362, "loss": 0.1217, "step": 1520 }, { "epoch": 1.4441015903156895, "grad_norm": 0.037558842450380325, "learning_rate": 0.00012369627996478797, "loss": 0.1348, "step": 1521 }, { "epoch": 1.4450510325183954, "grad_norm": 0.028927726671099663, "learning_rate": 0.0001235889100464146, "loss": 0.1184, "step": 1522 }, { "epoch": 1.4460004747211013, "grad_norm": 0.028258686885237694, "learning_rate": 0.00012348151132397133, "loss": 0.1276, "step": 1523 }, { "epoch": 1.4469499169238071, "grad_norm": 0.027749182656407356, "learning_rate": 0.00012337408392860117, "loss": 0.1235, "step": 1524 }, { "epoch": 1.4478993591265132, "grad_norm": 0.03684193268418312, "learning_rate": 0.0001232666279914821, "loss": 0.1421, "step": 1525 }, { "epoch": 1.4488488013292191, "grad_norm": 0.033882539719343185, "learning_rate": 0.00012315914364382705, "loss": 0.1236, "step": 1526 }, { "epoch": 1.449798243531925, "grad_norm": 0.03675851225852966, "learning_rate": 0.00012305163101688352, "loss": 0.1618, "step": 1527 }, { "epoch": 1.4507476857346309, "grad_norm": 0.030739063397049904, "learning_rate": 0.00012294409024193355, "loss": 0.127, "step": 1528 }, { "epoch": 1.4516971279373367, "grad_norm": 0.026678606867790222, "learning_rate": 0.00012283652145029362, "loss": 0.1191, "step": 1529 }, { "epoch": 1.4526465701400428, "grad_norm": 0.028862981125712395, "learning_rate": 0.0001227289247733144, "loss": 0.1255, "step": 1530 }, { "epoch": 1.4535960123427487, "grad_norm": 0.02993926964700222, "learning_rate": 0.0001226213003423807, "loss": 0.1235, "step": 1531 }, { "epoch": 1.4545454545454546, "grad_norm": 0.02765621617436409, "learning_rate": 0.00012251364828891093, "loss": 0.127, "step": 1532 }, { "epoch": 1.4554948967481605, "grad_norm": 0.03139231353998184, "learning_rate": 0.00012240596874435756, "loss": 0.1225, "step": 1533 }, { "epoch": 1.4564443389508663, "grad_norm": 0.030977755784988403, "learning_rate": 0.00012229826184020649, "loss": 0.1249, "step": 1534 }, { "epoch": 1.4573937811535722, "grad_norm": 0.03158799931406975, "learning_rate": 0.000122190527707977, "loss": 0.1254, "step": 1535 }, { "epoch": 1.458343223356278, "grad_norm": 0.03206062689423561, "learning_rate": 0.00012208276647922162, "loss": 0.1332, "step": 1536 }, { "epoch": 1.459292665558984, "grad_norm": 0.028509238734841347, "learning_rate": 0.00012197497828552601, "loss": 0.1196, "step": 1537 }, { "epoch": 1.46024210776169, "grad_norm": 0.03207945078611374, "learning_rate": 0.0001218671632585088, "loss": 0.1241, "step": 1538 }, { "epoch": 1.461191549964396, "grad_norm": 0.026839956641197205, "learning_rate": 0.00012175932152982125, "loss": 0.1246, "step": 1539 }, { "epoch": 1.4621409921671018, "grad_norm": 0.03129103407263756, "learning_rate": 0.0001216514532311474, "loss": 0.1192, "step": 1540 }, { "epoch": 1.4630904343698077, "grad_norm": 0.027821926400065422, "learning_rate": 0.00012154355849420353, "loss": 0.1221, "step": 1541 }, { "epoch": 1.4640398765725138, "grad_norm": 0.028374498710036278, "learning_rate": 0.00012143563745073836, "loss": 0.1253, "step": 1542 }, { "epoch": 1.4649893187752197, "grad_norm": 0.05225376784801483, "learning_rate": 0.0001213276902325327, "loss": 0.1194, "step": 1543 }, { "epoch": 1.4659387609779255, "grad_norm": 0.04315062612295151, "learning_rate": 0.00012121971697139926, "loss": 0.1613, "step": 1544 }, { "epoch": 1.4668882031806314, "grad_norm": 0.027962563559412956, "learning_rate": 0.00012111171779918264, "loss": 0.1284, "step": 1545 }, { "epoch": 1.4678376453833373, "grad_norm": 0.0343998521566391, "learning_rate": 0.000121003692847759, "loss": 0.1208, "step": 1546 }, { "epoch": 1.4687870875860431, "grad_norm": 0.031139155849814415, "learning_rate": 0.00012089564224903607, "loss": 0.1323, "step": 1547 }, { "epoch": 1.469736529788749, "grad_norm": 0.028015103191137314, "learning_rate": 0.00012078756613495277, "loss": 0.1266, "step": 1548 }, { "epoch": 1.470685971991455, "grad_norm": 0.04571033641695976, "learning_rate": 0.00012067946463747928, "loss": 0.1561, "step": 1549 }, { "epoch": 1.471635414194161, "grad_norm": 0.027564501389861107, "learning_rate": 0.00012057133788861677, "loss": 0.1214, "step": 1550 }, { "epoch": 1.4725848563968669, "grad_norm": 0.027498599141836166, "learning_rate": 0.00012046318602039717, "loss": 0.1233, "step": 1551 }, { "epoch": 1.4735342985995727, "grad_norm": 0.040304671972990036, "learning_rate": 0.00012035500916488316, "loss": 0.1636, "step": 1552 }, { "epoch": 1.4744837408022786, "grad_norm": 0.045689020305871964, "learning_rate": 0.00012024680745416787, "loss": 0.1696, "step": 1553 }, { "epoch": 1.4754331830049845, "grad_norm": 0.02754673734307289, "learning_rate": 0.00012013858102037485, "loss": 0.1265, "step": 1554 }, { "epoch": 1.4763826252076906, "grad_norm": 0.04088296741247177, "learning_rate": 0.0001200303299956578, "loss": 0.1675, "step": 1555 }, { "epoch": 1.4773320674103965, "grad_norm": 0.03440406173467636, "learning_rate": 0.0001199220545122004, "loss": 0.1245, "step": 1556 }, { "epoch": 1.4782815096131023, "grad_norm": 0.04637245833873749, "learning_rate": 0.00011981375470221628, "loss": 0.1668, "step": 1557 }, { "epoch": 1.4792309518158082, "grad_norm": 0.03825761005282402, "learning_rate": 0.00011970543069794875, "loss": 0.1599, "step": 1558 }, { "epoch": 1.480180394018514, "grad_norm": 0.031823255121707916, "learning_rate": 0.00011959708263167067, "loss": 0.1232, "step": 1559 }, { "epoch": 1.48112983622122, "grad_norm": 0.032104745507240295, "learning_rate": 0.00011948871063568419, "loss": 0.1237, "step": 1560 }, { "epoch": 1.4820792784239258, "grad_norm": 0.05664534121751785, "learning_rate": 0.00011938031484232079, "loss": 0.1617, "step": 1561 }, { "epoch": 1.483028720626632, "grad_norm": 0.02819441631436348, "learning_rate": 0.00011927189538394101, "loss": 0.1198, "step": 1562 }, { "epoch": 1.4839781628293378, "grad_norm": 0.030618941411376, "learning_rate": 0.00011916345239293423, "loss": 0.1291, "step": 1563 }, { "epoch": 1.4849276050320437, "grad_norm": 0.027746165171265602, "learning_rate": 0.00011905498600171859, "loss": 0.1287, "step": 1564 }, { "epoch": 1.4858770472347496, "grad_norm": 0.05675269663333893, "learning_rate": 0.00011894649634274075, "loss": 0.1988, "step": 1565 }, { "epoch": 1.4868264894374554, "grad_norm": 0.044594231992959976, "learning_rate": 0.00011883798354847589, "loss": 0.1611, "step": 1566 }, { "epoch": 1.4877759316401615, "grad_norm": 0.050554268062114716, "learning_rate": 0.00011872944775142736, "loss": 0.1762, "step": 1567 }, { "epoch": 1.4887253738428674, "grad_norm": 0.04185627028346062, "learning_rate": 0.0001186208890841266, "loss": 0.1593, "step": 1568 }, { "epoch": 1.4896748160455733, "grad_norm": 0.030089320614933968, "learning_rate": 0.00011851230767913303, "loss": 0.1217, "step": 1569 }, { "epoch": 1.4906242582482792, "grad_norm": 0.033343639224767685, "learning_rate": 0.00011840370366903382, "loss": 0.1284, "step": 1570 }, { "epoch": 1.491573700450985, "grad_norm": 0.030400337651371956, "learning_rate": 0.00011829507718644366, "loss": 0.1315, "step": 1571 }, { "epoch": 1.492523142653691, "grad_norm": 0.026966162025928497, "learning_rate": 0.00011818642836400477, "loss": 0.1206, "step": 1572 }, { "epoch": 1.4934725848563968, "grad_norm": 0.030626913532614708, "learning_rate": 0.00011807775733438664, "loss": 0.1228, "step": 1573 }, { "epoch": 1.4944220270591027, "grad_norm": 0.03993997722864151, "learning_rate": 0.00011796906423028588, "loss": 0.1488, "step": 1574 }, { "epoch": 1.4953714692618088, "grad_norm": 0.030942171812057495, "learning_rate": 0.00011786034918442596, "loss": 0.1284, "step": 1575 }, { "epoch": 1.4963209114645146, "grad_norm": 0.027237005531787872, "learning_rate": 0.00011775161232955729, "loss": 0.1278, "step": 1576 }, { "epoch": 1.4972703536672205, "grad_norm": 0.030200913548469543, "learning_rate": 0.0001176428537984568, "loss": 0.1264, "step": 1577 }, { "epoch": 1.4982197958699264, "grad_norm": 0.03261629864573479, "learning_rate": 0.00011753407372392795, "loss": 0.128, "step": 1578 }, { "epoch": 1.4991692380726322, "grad_norm": 0.033451877534389496, "learning_rate": 0.0001174252722388005, "loss": 0.1345, "step": 1579 }, { "epoch": 1.5001186802753383, "grad_norm": 0.045393262058496475, "learning_rate": 0.00011731644947593026, "loss": 0.1627, "step": 1580 }, { "epoch": 1.5010681224780442, "grad_norm": 0.05379907041788101, "learning_rate": 0.00011720760556819916, "loss": 0.1867, "step": 1581 }, { "epoch": 1.50201756468075, "grad_norm": 0.025756366550922394, "learning_rate": 0.00011709874064851487, "loss": 0.1229, "step": 1582 }, { "epoch": 1.502967006883456, "grad_norm": 0.03149167448282242, "learning_rate": 0.00011698985484981077, "loss": 0.124, "step": 1583 }, { "epoch": 1.5039164490861618, "grad_norm": 0.044556062668561935, "learning_rate": 0.00011688094830504566, "loss": 0.1522, "step": 1584 }, { "epoch": 1.5048658912888677, "grad_norm": 0.029473567381501198, "learning_rate": 0.00011677202114720374, "loss": 0.1231, "step": 1585 }, { "epoch": 1.5058153334915736, "grad_norm": 0.02735304646193981, "learning_rate": 0.00011666307350929435, "loss": 0.1242, "step": 1586 }, { "epoch": 1.5067647756942795, "grad_norm": 0.04760657250881195, "learning_rate": 0.00011655410552435184, "loss": 0.1574, "step": 1587 }, { "epoch": 1.5077142178969856, "grad_norm": 0.029798056930303574, "learning_rate": 0.00011644511732543542, "loss": 0.1289, "step": 1588 }, { "epoch": 1.5086636600996914, "grad_norm": 0.0329173281788826, "learning_rate": 0.00011633610904562892, "loss": 0.1251, "step": 1589 }, { "epoch": 1.5096131023023973, "grad_norm": 0.030921783298254013, "learning_rate": 0.00011622708081804081, "loss": 0.1257, "step": 1590 }, { "epoch": 1.5105625445051034, "grad_norm": 0.031381431967020035, "learning_rate": 0.0001161180327758038, "loss": 0.1258, "step": 1591 }, { "epoch": 1.5115119867078093, "grad_norm": 0.050078343600034714, "learning_rate": 0.00011600896505207488, "loss": 0.1764, "step": 1592 }, { "epoch": 1.5124614289105152, "grad_norm": 0.04285876452922821, "learning_rate": 0.00011589987778003501, "loss": 0.1607, "step": 1593 }, { "epoch": 1.513410871113221, "grad_norm": 0.02959441766142845, "learning_rate": 0.00011579077109288907, "loss": 0.1266, "step": 1594 }, { "epoch": 1.514360313315927, "grad_norm": 0.03024190105497837, "learning_rate": 0.00011568164512386559, "loss": 0.1306, "step": 1595 }, { "epoch": 1.5153097555186328, "grad_norm": 0.031076421961188316, "learning_rate": 0.00011557250000621674, "loss": 0.1326, "step": 1596 }, { "epoch": 1.5162591977213387, "grad_norm": 0.10080650448799133, "learning_rate": 0.00011546333587321795, "loss": 0.1596, "step": 1597 }, { "epoch": 1.5172086399240445, "grad_norm": 0.029658254235982895, "learning_rate": 0.000115354152858168, "loss": 0.1306, "step": 1598 }, { "epoch": 1.5181580821267504, "grad_norm": 0.025450505316257477, "learning_rate": 0.00011524495109438857, "loss": 0.1199, "step": 1599 }, { "epoch": 1.5191075243294563, "grad_norm": 0.0450344942510128, "learning_rate": 0.00011513573071522439, "loss": 0.1671, "step": 1600 }, { "epoch": 1.5200569665321624, "grad_norm": 0.02928159199655056, "learning_rate": 0.00011502649185404281, "loss": 0.1247, "step": 1601 }, { "epoch": 1.5210064087348683, "grad_norm": 0.04587600752711296, "learning_rate": 0.00011491723464423385, "loss": 0.1746, "step": 1602 }, { "epoch": 1.5219558509375741, "grad_norm": 0.03615221753716469, "learning_rate": 0.00011480795921920984, "loss": 0.153, "step": 1603 }, { "epoch": 1.5229052931402802, "grad_norm": 0.026470355689525604, "learning_rate": 0.00011469866571240535, "loss": 0.1204, "step": 1604 }, { "epoch": 1.523854735342986, "grad_norm": 0.0324716791510582, "learning_rate": 0.00011458935425727713, "loss": 0.127, "step": 1605 }, { "epoch": 1.524804177545692, "grad_norm": 0.028570136055350304, "learning_rate": 0.00011448002498730375, "loss": 0.1244, "step": 1606 }, { "epoch": 1.5257536197483978, "grad_norm": 0.03165844827890396, "learning_rate": 0.00011437067803598558, "loss": 0.1286, "step": 1607 }, { "epoch": 1.5267030619511037, "grad_norm": 0.029820239171385765, "learning_rate": 0.00011426131353684457, "loss": 0.1232, "step": 1608 }, { "epoch": 1.5276525041538096, "grad_norm": 0.03704296052455902, "learning_rate": 0.00011415193162342407, "loss": 0.1262, "step": 1609 }, { "epoch": 1.5286019463565155, "grad_norm": 0.029848331585526466, "learning_rate": 0.00011404253242928877, "loss": 0.1345, "step": 1610 }, { "epoch": 1.5295513885592213, "grad_norm": 0.02494947612285614, "learning_rate": 0.00011393311608802437, "loss": 0.1247, "step": 1611 }, { "epoch": 1.5305008307619272, "grad_norm": 0.024150602519512177, "learning_rate": 0.0001138236827332376, "loss": 0.1195, "step": 1612 }, { "epoch": 1.5314502729646333, "grad_norm": 0.028073903173208237, "learning_rate": 0.00011371423249855584, "loss": 0.1298, "step": 1613 }, { "epoch": 1.5323997151673392, "grad_norm": 0.03696022182703018, "learning_rate": 0.0001136047655176272, "loss": 0.1658, "step": 1614 }, { "epoch": 1.533349157370045, "grad_norm": 0.026254741474986076, "learning_rate": 0.00011349528192412018, "loss": 0.1209, "step": 1615 }, { "epoch": 1.5342985995727512, "grad_norm": 0.04131542891263962, "learning_rate": 0.0001133857818517236, "loss": 0.1566, "step": 1616 }, { "epoch": 1.535248041775457, "grad_norm": 0.028996463865041733, "learning_rate": 0.00011327626543414636, "loss": 0.1287, "step": 1617 }, { "epoch": 1.536197483978163, "grad_norm": 0.029789695516228676, "learning_rate": 0.00011316673280511738, "loss": 0.1303, "step": 1618 }, { "epoch": 1.5371469261808688, "grad_norm": 0.026316309347748756, "learning_rate": 0.00011305718409838528, "loss": 0.1224, "step": 1619 }, { "epoch": 1.5380963683835747, "grad_norm": 0.029919691383838654, "learning_rate": 0.0001129476194477184, "loss": 0.1297, "step": 1620 }, { "epoch": 1.5390458105862805, "grad_norm": 0.029496189206838608, "learning_rate": 0.0001128380389869045, "loss": 0.1259, "step": 1621 }, { "epoch": 1.5399952527889864, "grad_norm": 0.0289089847356081, "learning_rate": 0.00011272844284975072, "loss": 0.1138, "step": 1622 }, { "epoch": 1.5409446949916923, "grad_norm": 0.03120460920035839, "learning_rate": 0.00011261883117008321, "loss": 0.1255, "step": 1623 }, { "epoch": 1.5418941371943982, "grad_norm": 0.030355585739016533, "learning_rate": 0.0001125092040817472, "loss": 0.1234, "step": 1624 }, { "epoch": 1.542843579397104, "grad_norm": 0.026462506502866745, "learning_rate": 0.00011239956171860675, "loss": 0.1194, "step": 1625 }, { "epoch": 1.5437930215998101, "grad_norm": 0.04788212105631828, "learning_rate": 0.00011228990421454449, "loss": 0.1721, "step": 1626 }, { "epoch": 1.544742463802516, "grad_norm": 0.04839539900422096, "learning_rate": 0.00011218023170346159, "loss": 0.1616, "step": 1627 }, { "epoch": 1.5456919060052219, "grad_norm": 0.029102666303515434, "learning_rate": 0.00011207054431927752, "loss": 0.1244, "step": 1628 }, { "epoch": 1.546641348207928, "grad_norm": 0.030667860060930252, "learning_rate": 0.00011196084219592994, "loss": 0.1211, "step": 1629 }, { "epoch": 1.5475907904106339, "grad_norm": 0.029911190271377563, "learning_rate": 0.00011185112546737451, "loss": 0.1209, "step": 1630 }, { "epoch": 1.5485402326133397, "grad_norm": 0.026976125314831734, "learning_rate": 0.00011174139426758466, "loss": 0.1201, "step": 1631 }, { "epoch": 1.5494896748160456, "grad_norm": 0.04256618767976761, "learning_rate": 0.00011163164873055158, "loss": 0.1716, "step": 1632 }, { "epoch": 1.5504391170187515, "grad_norm": 0.04431037977337837, "learning_rate": 0.00011152188899028393, "loss": 0.1535, "step": 1633 }, { "epoch": 1.5513885592214574, "grad_norm": 0.040324628353118896, "learning_rate": 0.00011141211518080768, "loss": 0.1559, "step": 1634 }, { "epoch": 1.5523380014241632, "grad_norm": 0.02631363458931446, "learning_rate": 0.00011130232743616602, "loss": 0.1254, "step": 1635 }, { "epoch": 1.553287443626869, "grad_norm": 0.031095106154680252, "learning_rate": 0.00011119252589041917, "loss": 0.1296, "step": 1636 }, { "epoch": 1.554236885829575, "grad_norm": 0.029540112242102623, "learning_rate": 0.00011108271067764413, "loss": 0.1237, "step": 1637 }, { "epoch": 1.555186328032281, "grad_norm": 0.032384589314460754, "learning_rate": 0.00011097288193193465, "loss": 0.122, "step": 1638 }, { "epoch": 1.556135770234987, "grad_norm": 0.041656699031591415, "learning_rate": 0.00011086303978740102, "loss": 0.1627, "step": 1639 }, { "epoch": 1.5570852124376928, "grad_norm": 0.02969949133694172, "learning_rate": 0.00011075318437816981, "loss": 0.1278, "step": 1640 }, { "epoch": 1.558034654640399, "grad_norm": 0.029392141848802567, "learning_rate": 0.00011064331583838389, "loss": 0.1222, "step": 1641 }, { "epoch": 1.5589840968431048, "grad_norm": 0.03221196308732033, "learning_rate": 0.0001105334343022021, "loss": 0.1301, "step": 1642 }, { "epoch": 1.5599335390458107, "grad_norm": 0.04113316535949707, "learning_rate": 0.00011042353990379917, "loss": 0.1545, "step": 1643 }, { "epoch": 1.5608829812485165, "grad_norm": 0.04606552794575691, "learning_rate": 0.00011031363277736546, "loss": 0.1582, "step": 1644 }, { "epoch": 1.5618324234512224, "grad_norm": 0.05224507302045822, "learning_rate": 0.00011020371305710701, "loss": 0.1702, "step": 1645 }, { "epoch": 1.5627818656539283, "grad_norm": 0.03016183339059353, "learning_rate": 0.00011009378087724518, "loss": 0.1299, "step": 1646 }, { "epoch": 1.5637313078566342, "grad_norm": 0.02981068380177021, "learning_rate": 0.00010998383637201648, "loss": 0.1258, "step": 1647 }, { "epoch": 1.56468075005934, "grad_norm": 0.027657071128487587, "learning_rate": 0.00010987387967567252, "loss": 0.1338, "step": 1648 }, { "epoch": 1.565630192262046, "grad_norm": 0.030992772430181503, "learning_rate": 0.00010976391092247986, "loss": 0.1249, "step": 1649 }, { "epoch": 1.566579634464752, "grad_norm": 0.039394572377204895, "learning_rate": 0.00010965393024671966, "loss": 0.1598, "step": 1650 }, { "epoch": 1.5675290766674579, "grad_norm": 0.042735736817121506, "learning_rate": 0.00010954393778268777, "loss": 0.1566, "step": 1651 }, { "epoch": 1.5684785188701638, "grad_norm": 0.03833623602986336, "learning_rate": 0.00010943393366469427, "loss": 0.1557, "step": 1652 }, { "epoch": 1.5694279610728696, "grad_norm": 0.028264719992876053, "learning_rate": 0.00010932391802706363, "loss": 0.1295, "step": 1653 }, { "epoch": 1.5703774032755757, "grad_norm": 0.030619991943240166, "learning_rate": 0.00010921389100413428, "loss": 0.1294, "step": 1654 }, { "epoch": 1.5713268454782816, "grad_norm": 0.0320441797375679, "learning_rate": 0.00010910385273025865, "loss": 0.1266, "step": 1655 }, { "epoch": 1.5722762876809875, "grad_norm": 0.028138084337115288, "learning_rate": 0.00010899380333980282, "loss": 0.1177, "step": 1656 }, { "epoch": 1.5732257298836934, "grad_norm": 0.030276020988821983, "learning_rate": 0.00010888374296714644, "loss": 0.1258, "step": 1657 }, { "epoch": 1.5741751720863992, "grad_norm": 0.04478145390748978, "learning_rate": 0.00010877367174668269, "loss": 0.1555, "step": 1658 }, { "epoch": 1.575124614289105, "grad_norm": 0.025850724428892136, "learning_rate": 0.00010866358981281783, "loss": 0.1186, "step": 1659 }, { "epoch": 1.576074056491811, "grad_norm": 0.03069223277270794, "learning_rate": 0.00010855349729997135, "loss": 0.1314, "step": 1660 }, { "epoch": 1.5770234986945169, "grad_norm": 0.050107646733522415, "learning_rate": 0.00010844339434257558, "loss": 0.1632, "step": 1661 }, { "epoch": 1.5779729408972227, "grad_norm": 0.04577158764004707, "learning_rate": 0.00010833328107507556, "loss": 0.1777, "step": 1662 }, { "epoch": 1.5789223830999288, "grad_norm": 0.030337292701005936, "learning_rate": 0.00010822315763192903, "loss": 0.1277, "step": 1663 }, { "epoch": 1.5798718253026347, "grad_norm": 0.05210689827799797, "learning_rate": 0.00010811302414760609, "loss": 0.1583, "step": 1664 }, { "epoch": 1.5808212675053406, "grad_norm": 0.056502800434827805, "learning_rate": 0.00010800288075658911, "loss": 0.1533, "step": 1665 }, { "epoch": 1.5817707097080467, "grad_norm": 0.03072887845337391, "learning_rate": 0.00010789272759337257, "loss": 0.1349, "step": 1666 }, { "epoch": 1.5827201519107525, "grad_norm": 0.030606022104620934, "learning_rate": 0.00010778256479246283, "loss": 0.1288, "step": 1667 }, { "epoch": 1.5836695941134584, "grad_norm": 0.027298327535390854, "learning_rate": 0.00010767239248837811, "loss": 0.1256, "step": 1668 }, { "epoch": 1.5846190363161643, "grad_norm": 0.044606730341911316, "learning_rate": 0.00010756221081564813, "loss": 0.1673, "step": 1669 }, { "epoch": 1.5855684785188702, "grad_norm": 0.04783207178115845, "learning_rate": 0.00010745201990881417, "loss": 0.1539, "step": 1670 }, { "epoch": 1.586517920721576, "grad_norm": 0.03072645701467991, "learning_rate": 0.00010734181990242868, "loss": 0.1255, "step": 1671 }, { "epoch": 1.587467362924282, "grad_norm": 0.028861412778496742, "learning_rate": 0.00010723161093105527, "loss": 0.1252, "step": 1672 }, { "epoch": 1.5884168051269878, "grad_norm": 0.06253904849290848, "learning_rate": 0.0001071213931292685, "loss": 0.208, "step": 1673 }, { "epoch": 1.5893662473296937, "grad_norm": 0.02638799510896206, "learning_rate": 0.00010701116663165368, "loss": 0.1307, "step": 1674 }, { "epoch": 1.5903156895323998, "grad_norm": 0.02984490990638733, "learning_rate": 0.00010690093157280678, "loss": 0.1305, "step": 1675 }, { "epoch": 1.5912651317351056, "grad_norm": 0.029151547700166702, "learning_rate": 0.0001067906880873342, "loss": 0.1278, "step": 1676 }, { "epoch": 1.5922145739378115, "grad_norm": 0.03524734824895859, "learning_rate": 0.00010668043630985259, "loss": 0.1426, "step": 1677 }, { "epoch": 1.5931640161405174, "grad_norm": 0.03428010269999504, "learning_rate": 0.00010657017637498881, "loss": 0.125, "step": 1678 }, { "epoch": 1.5941134583432235, "grad_norm": 0.05122271180152893, "learning_rate": 0.00010645990841737965, "loss": 0.1569, "step": 1679 }, { "epoch": 1.5950629005459294, "grad_norm": 0.035184647887945175, "learning_rate": 0.00010634963257167167, "loss": 0.1358, "step": 1680 }, { "epoch": 1.5960123427486352, "grad_norm": 0.03079393319785595, "learning_rate": 0.00010623934897252106, "loss": 0.1268, "step": 1681 }, { "epoch": 1.5969617849513411, "grad_norm": 0.03128993511199951, "learning_rate": 0.00010612905775459349, "loss": 0.1223, "step": 1682 }, { "epoch": 1.597911227154047, "grad_norm": 0.02892274223268032, "learning_rate": 0.00010601875905256398, "loss": 0.1293, "step": 1683 }, { "epoch": 1.5988606693567529, "grad_norm": 0.030298085883259773, "learning_rate": 0.00010590845300111663, "loss": 0.1293, "step": 1684 }, { "epoch": 1.5998101115594587, "grad_norm": 0.028357641771435738, "learning_rate": 0.00010579813973494454, "loss": 0.1269, "step": 1685 }, { "epoch": 1.6007595537621646, "grad_norm": 0.027196258306503296, "learning_rate": 0.00010568781938874959, "loss": 0.1296, "step": 1686 }, { "epoch": 1.6017089959648705, "grad_norm": 0.05107175186276436, "learning_rate": 0.00010557749209724233, "loss": 0.1604, "step": 1687 }, { "epoch": 1.6026584381675766, "grad_norm": 0.050482697784900665, "learning_rate": 0.00010546715799514178, "loss": 0.1902, "step": 1688 }, { "epoch": 1.6036078803702825, "grad_norm": 0.026253553107380867, "learning_rate": 0.00010535681721717529, "loss": 0.1226, "step": 1689 }, { "epoch": 1.6045573225729883, "grad_norm": 0.03308340907096863, "learning_rate": 0.0001052464698980784, "loss": 0.1384, "step": 1690 }, { "epoch": 1.6055067647756944, "grad_norm": 0.03511514514684677, "learning_rate": 0.00010513611617259454, "loss": 0.1577, "step": 1691 }, { "epoch": 1.6064562069784003, "grad_norm": 0.0325862281024456, "learning_rate": 0.00010502575617547501, "loss": 0.1343, "step": 1692 }, { "epoch": 1.6074056491811062, "grad_norm": 0.029174668714404106, "learning_rate": 0.00010491539004147879, "loss": 0.1222, "step": 1693 }, { "epoch": 1.608355091383812, "grad_norm": 0.028774891048669815, "learning_rate": 0.00010480501790537236, "loss": 0.1237, "step": 1694 }, { "epoch": 1.609304533586518, "grad_norm": 0.030504655092954636, "learning_rate": 0.00010469463990192947, "loss": 0.1228, "step": 1695 }, { "epoch": 1.6102539757892238, "grad_norm": 0.02889893390238285, "learning_rate": 0.0001045842561659311, "loss": 0.1208, "step": 1696 }, { "epoch": 1.6112034179919297, "grad_norm": 0.028366010636091232, "learning_rate": 0.00010447386683216518, "loss": 0.1193, "step": 1697 }, { "epoch": 1.6121528601946356, "grad_norm": 0.028841307386755943, "learning_rate": 0.0001043634720354265, "loss": 0.1287, "step": 1698 }, { "epoch": 1.6131023023973414, "grad_norm": 0.03739466145634651, "learning_rate": 0.00010425307191051654, "loss": 0.1369, "step": 1699 }, { "epoch": 1.6140517446000475, "grad_norm": 0.031514909118413925, "learning_rate": 0.00010414266659224323, "loss": 0.1301, "step": 1700 }, { "epoch": 1.6150011868027534, "grad_norm": 0.04335467517375946, "learning_rate": 0.00010403225621542089, "loss": 0.1543, "step": 1701 }, { "epoch": 1.6159506290054593, "grad_norm": 0.026403702795505524, "learning_rate": 0.00010392184091487, "loss": 0.1229, "step": 1702 }, { "epoch": 1.6169000712081654, "grad_norm": 0.06492079049348831, "learning_rate": 0.00010381142082541706, "loss": 0.1526, "step": 1703 }, { "epoch": 1.6178495134108712, "grad_norm": 0.031183555722236633, "learning_rate": 0.00010370099608189439, "loss": 0.1255, "step": 1704 }, { "epoch": 1.6187989556135771, "grad_norm": 0.02694527618587017, "learning_rate": 0.00010359056681914006, "loss": 0.1178, "step": 1705 }, { "epoch": 1.619748397816283, "grad_norm": 0.031590498983860016, "learning_rate": 0.00010348013317199756, "loss": 0.1311, "step": 1706 }, { "epoch": 1.6206978400189889, "grad_norm": 0.034355372190475464, "learning_rate": 0.00010336969527531577, "loss": 0.1363, "step": 1707 }, { "epoch": 1.6216472822216947, "grad_norm": 0.04307783022522926, "learning_rate": 0.00010325925326394886, "loss": 0.1659, "step": 1708 }, { "epoch": 1.6225967244244006, "grad_norm": 0.05346130579710007, "learning_rate": 0.00010314880727275591, "loss": 0.2022, "step": 1709 }, { "epoch": 1.6235461666271065, "grad_norm": 0.028053171932697296, "learning_rate": 0.00010303835743660086, "loss": 0.1251, "step": 1710 }, { "epoch": 1.6244956088298124, "grad_norm": 0.03092275932431221, "learning_rate": 0.00010292790389035239, "loss": 0.1299, "step": 1711 }, { "epoch": 1.6254450510325182, "grad_norm": 0.03486338630318642, "learning_rate": 0.00010281744676888368, "loss": 0.1374, "step": 1712 }, { "epoch": 1.6263944932352243, "grad_norm": 0.030160879716277122, "learning_rate": 0.00010270698620707231, "loss": 0.1381, "step": 1713 }, { "epoch": 1.6273439354379302, "grad_norm": 0.03339090943336487, "learning_rate": 0.00010259652233980007, "loss": 0.1316, "step": 1714 }, { "epoch": 1.628293377640636, "grad_norm": 0.027464497834444046, "learning_rate": 0.00010248605530195268, "loss": 0.1204, "step": 1715 }, { "epoch": 1.6292428198433422, "grad_norm": 0.02733561024069786, "learning_rate": 0.00010237558522841985, "loss": 0.1259, "step": 1716 }, { "epoch": 1.630192262046048, "grad_norm": 0.029772773385047913, "learning_rate": 0.00010226511225409499, "loss": 0.1252, "step": 1717 }, { "epoch": 1.631141704248754, "grad_norm": 0.04465902969241142, "learning_rate": 0.00010215463651387499, "loss": 0.151, "step": 1718 }, { "epoch": 1.6320911464514598, "grad_norm": 0.028140738606452942, "learning_rate": 0.0001020441581426601, "loss": 0.1221, "step": 1719 }, { "epoch": 1.6330405886541657, "grad_norm": 0.026517389342188835, "learning_rate": 0.00010193367727535392, "loss": 0.1253, "step": 1720 }, { "epoch": 1.6339900308568716, "grad_norm": 0.06271334737539291, "learning_rate": 0.00010182319404686293, "loss": 0.2072, "step": 1721 }, { "epoch": 1.6349394730595774, "grad_norm": 0.051109135150909424, "learning_rate": 0.00010171270859209662, "loss": 0.1923, "step": 1722 }, { "epoch": 1.6358889152622833, "grad_norm": 0.02858104184269905, "learning_rate": 0.00010160222104596716, "loss": 0.1317, "step": 1723 }, { "epoch": 1.6368383574649892, "grad_norm": 0.031905338168144226, "learning_rate": 0.00010149173154338917, "loss": 0.1334, "step": 1724 }, { "epoch": 1.6377877996676953, "grad_norm": 0.035984478890895844, "learning_rate": 0.00010138124021927984, "loss": 0.1308, "step": 1725 }, { "epoch": 1.6387372418704012, "grad_norm": 0.029515955597162247, "learning_rate": 0.00010127074720855845, "loss": 0.1232, "step": 1726 }, { "epoch": 1.639686684073107, "grad_norm": 0.03353870287537575, "learning_rate": 0.0001011602526461464, "loss": 0.1342, "step": 1727 }, { "epoch": 1.6406361262758131, "grad_norm": 0.02787208929657936, "learning_rate": 0.00010104975666696697, "loss": 0.1216, "step": 1728 }, { "epoch": 1.641585568478519, "grad_norm": 0.061213839799165726, "learning_rate": 0.0001009392594059452, "loss": 0.2093, "step": 1729 }, { "epoch": 1.6425350106812249, "grad_norm": 0.034235142171382904, "learning_rate": 0.0001008287609980076, "loss": 0.1329, "step": 1730 }, { "epoch": 1.6434844528839307, "grad_norm": 0.026360424235463142, "learning_rate": 0.00010071826157808217, "loss": 0.1239, "step": 1731 }, { "epoch": 1.6444338950866366, "grad_norm": 0.026264041662216187, "learning_rate": 0.00010060776128109812, "loss": 0.12, "step": 1732 }, { "epoch": 1.6453833372893425, "grad_norm": 0.02740940824151039, "learning_rate": 0.00010049726024198578, "loss": 0.1314, "step": 1733 }, { "epoch": 1.6463327794920484, "grad_norm": 0.04096614569425583, "learning_rate": 0.00010038675859567628, "loss": 0.1681, "step": 1734 }, { "epoch": 1.6472822216947542, "grad_norm": 0.04552573338150978, "learning_rate": 0.00010027625647710155, "loss": 0.16, "step": 1735 }, { "epoch": 1.6482316638974601, "grad_norm": 0.034032173454761505, "learning_rate": 0.00010016575402119413, "loss": 0.1326, "step": 1736 }, { "epoch": 1.649181106100166, "grad_norm": 0.03644052520394325, "learning_rate": 0.00010005525136288692, "loss": 0.146, "step": 1737 }, { "epoch": 1.650130548302872, "grad_norm": 0.04277161881327629, "learning_rate": 9.994474863711311e-05, "loss": 0.1719, "step": 1738 }, { "epoch": 1.651079990505578, "grad_norm": 0.027901561930775642, "learning_rate": 9.98342459788059e-05, "loss": 0.1238, "step": 1739 }, { "epoch": 1.6520294327082838, "grad_norm": 0.030957000330090523, "learning_rate": 9.972374352289848e-05, "loss": 0.1315, "step": 1740 }, { "epoch": 1.65297887491099, "grad_norm": 0.029299341142177582, "learning_rate": 9.961324140432376e-05, "loss": 0.1247, "step": 1741 }, { "epoch": 1.6539283171136958, "grad_norm": 0.0292718093842268, "learning_rate": 9.950273975801424e-05, "loss": 0.1296, "step": 1742 }, { "epoch": 1.6548777593164017, "grad_norm": 0.03113977424800396, "learning_rate": 9.93922387189019e-05, "loss": 0.1294, "step": 1743 }, { "epoch": 1.6558272015191076, "grad_norm": 0.05127384141087532, "learning_rate": 9.928173842191786e-05, "loss": 0.1623, "step": 1744 }, { "epoch": 1.6567766437218134, "grad_norm": 0.03058856725692749, "learning_rate": 9.917123900199245e-05, "loss": 0.1251, "step": 1745 }, { "epoch": 1.6577260859245193, "grad_norm": 0.043525367975234985, "learning_rate": 9.906074059405486e-05, "loss": 0.1584, "step": 1746 }, { "epoch": 1.6586755281272252, "grad_norm": 0.02724611759185791, "learning_rate": 9.895024333303305e-05, "loss": 0.1273, "step": 1747 }, { "epoch": 1.659624970329931, "grad_norm": 0.026182804256677628, "learning_rate": 9.883974735385361e-05, "loss": 0.1165, "step": 1748 }, { "epoch": 1.660574412532637, "grad_norm": 0.030495688319206238, "learning_rate": 9.87292527914416e-05, "loss": 0.1251, "step": 1749 }, { "epoch": 1.661523854735343, "grad_norm": 0.03013971447944641, "learning_rate": 9.861875978072017e-05, "loss": 0.1278, "step": 1750 }, { "epoch": 1.662473296938049, "grad_norm": 0.03999912738800049, "learning_rate": 9.850826845661082e-05, "loss": 0.1519, "step": 1751 }, { "epoch": 1.6634227391407548, "grad_norm": 0.029559755697846413, "learning_rate": 9.839777895403287e-05, "loss": 0.1293, "step": 1752 }, { "epoch": 1.6643721813434609, "grad_norm": 0.04213762283325195, "learning_rate": 9.828729140790337e-05, "loss": 0.1696, "step": 1753 }, { "epoch": 1.6653216235461668, "grad_norm": 0.029974251985549927, "learning_rate": 9.817680595313705e-05, "loss": 0.1182, "step": 1754 }, { "epoch": 1.6662710657488726, "grad_norm": 0.03835977986454964, "learning_rate": 9.806632272464607e-05, "loss": 0.1467, "step": 1755 }, { "epoch": 1.6672205079515785, "grad_norm": 0.031473349779844284, "learning_rate": 9.795584185733988e-05, "loss": 0.1305, "step": 1756 }, { "epoch": 1.6681699501542844, "grad_norm": 0.02675897814333439, "learning_rate": 9.784536348612504e-05, "loss": 0.1196, "step": 1757 }, { "epoch": 1.6691193923569903, "grad_norm": 0.0419435016810894, "learning_rate": 9.773488774590504e-05, "loss": 0.1558, "step": 1758 }, { "epoch": 1.6700688345596961, "grad_norm": 0.027311773970723152, "learning_rate": 9.762441477158016e-05, "loss": 0.1236, "step": 1759 }, { "epoch": 1.671018276762402, "grad_norm": 0.05605548992753029, "learning_rate": 9.751394469804734e-05, "loss": 0.1787, "step": 1760 }, { "epoch": 1.6719677189651079, "grad_norm": 0.025175364688038826, "learning_rate": 9.740347766019997e-05, "loss": 0.1275, "step": 1761 }, { "epoch": 1.6729171611678137, "grad_norm": 0.04951293021440506, "learning_rate": 9.729301379292773e-05, "loss": 0.2023, "step": 1762 }, { "epoch": 1.6738666033705198, "grad_norm": 0.02842806465923786, "learning_rate": 9.718255323111635e-05, "loss": 0.1238, "step": 1763 }, { "epoch": 1.6748160455732257, "grad_norm": 0.029241712763905525, "learning_rate": 9.707209610964765e-05, "loss": 0.121, "step": 1764 }, { "epoch": 1.6757654877759316, "grad_norm": 0.03337705507874489, "learning_rate": 9.696164256339917e-05, "loss": 0.1354, "step": 1765 }, { "epoch": 1.6767149299786377, "grad_norm": 0.030520187690854073, "learning_rate": 9.685119272724411e-05, "loss": 0.1256, "step": 1766 }, { "epoch": 1.6776643721813436, "grad_norm": 0.0318867489695549, "learning_rate": 9.674074673605115e-05, "loss": 0.1286, "step": 1767 }, { "epoch": 1.6786138143840494, "grad_norm": 0.02671106904745102, "learning_rate": 9.663030472468424e-05, "loss": 0.1297, "step": 1768 }, { "epoch": 1.6795632565867553, "grad_norm": 0.041651055216789246, "learning_rate": 9.651986682800249e-05, "loss": 0.1618, "step": 1769 }, { "epoch": 1.6805126987894612, "grad_norm": 0.03195889666676521, "learning_rate": 9.640943318085999e-05, "loss": 0.1279, "step": 1770 }, { "epoch": 1.681462140992167, "grad_norm": 0.04086165875196457, "learning_rate": 9.629900391810563e-05, "loss": 0.1678, "step": 1771 }, { "epoch": 1.682411583194873, "grad_norm": 0.025256391614675522, "learning_rate": 9.618857917458298e-05, "loss": 0.1197, "step": 1772 }, { "epoch": 1.6833610253975788, "grad_norm": 0.03352576494216919, "learning_rate": 9.607815908513005e-05, "loss": 0.1345, "step": 1773 }, { "epoch": 1.6843104676002847, "grad_norm": 0.06082432344555855, "learning_rate": 9.596774378457916e-05, "loss": 0.1639, "step": 1774 }, { "epoch": 1.6852599098029908, "grad_norm": 0.029191186651587486, "learning_rate": 9.585733340775677e-05, "loss": 0.1305, "step": 1775 }, { "epoch": 1.6862093520056967, "grad_norm": 0.029343895614147186, "learning_rate": 9.574692808948348e-05, "loss": 0.1265, "step": 1776 }, { "epoch": 1.6871587942084025, "grad_norm": 0.02953837811946869, "learning_rate": 9.56365279645735e-05, "loss": 0.1281, "step": 1777 }, { "epoch": 1.6881082364111086, "grad_norm": 0.028798846527934074, "learning_rate": 9.552613316783483e-05, "loss": 0.1257, "step": 1778 }, { "epoch": 1.6890576786138145, "grad_norm": 0.02905990555882454, "learning_rate": 9.54157438340689e-05, "loss": 0.1308, "step": 1779 }, { "epoch": 1.6900071208165204, "grad_norm": 0.02965502068400383, "learning_rate": 9.530536009807053e-05, "loss": 0.1296, "step": 1780 }, { "epoch": 1.6909565630192263, "grad_norm": 0.029197504743933678, "learning_rate": 9.519498209462766e-05, "loss": 0.1204, "step": 1781 }, { "epoch": 1.6919060052219321, "grad_norm": 0.04930657148361206, "learning_rate": 9.508460995852122e-05, "loss": 0.1522, "step": 1782 }, { "epoch": 1.692855447424638, "grad_norm": 0.027312377467751503, "learning_rate": 9.497424382452501e-05, "loss": 0.1203, "step": 1783 }, { "epoch": 1.6938048896273439, "grad_norm": 0.03260885551571846, "learning_rate": 9.486388382740548e-05, "loss": 0.1334, "step": 1784 }, { "epoch": 1.6947543318300498, "grad_norm": 0.052055153995752335, "learning_rate": 9.475353010192162e-05, "loss": 0.2113, "step": 1785 }, { "epoch": 1.6957037740327556, "grad_norm": 0.038476429879665375, "learning_rate": 9.464318278282472e-05, "loss": 0.1669, "step": 1786 }, { "epoch": 1.6966532162354615, "grad_norm": 0.03111964277923107, "learning_rate": 9.453284200485825e-05, "loss": 0.1255, "step": 1787 }, { "epoch": 1.6976026584381676, "grad_norm": 0.02803085930645466, "learning_rate": 9.44225079027577e-05, "loss": 0.1297, "step": 1788 }, { "epoch": 1.6985521006408735, "grad_norm": 0.029160544276237488, "learning_rate": 9.431218061125044e-05, "loss": 0.1263, "step": 1789 }, { "epoch": 1.6995015428435793, "grad_norm": 0.03121958300471306, "learning_rate": 9.420186026505548e-05, "loss": 0.1277, "step": 1790 }, { "epoch": 1.7004509850462854, "grad_norm": 0.027002684772014618, "learning_rate": 9.40915469988834e-05, "loss": 0.1236, "step": 1791 }, { "epoch": 1.7014004272489913, "grad_norm": 0.027172109112143517, "learning_rate": 9.398124094743604e-05, "loss": 0.1241, "step": 1792 }, { "epoch": 1.7023498694516972, "grad_norm": 0.02658463642001152, "learning_rate": 9.387094224540653e-05, "loss": 0.1268, "step": 1793 }, { "epoch": 1.703299311654403, "grad_norm": 0.028862452134490013, "learning_rate": 9.376065102747898e-05, "loss": 0.1317, "step": 1794 }, { "epoch": 1.704248753857109, "grad_norm": 0.025097506120800972, "learning_rate": 9.365036742832838e-05, "loss": 0.1164, "step": 1795 }, { "epoch": 1.7051981960598148, "grad_norm": 0.04557095095515251, "learning_rate": 9.354009158262038e-05, "loss": 0.1522, "step": 1796 }, { "epoch": 1.7061476382625207, "grad_norm": 0.03487172722816467, "learning_rate": 9.342982362501123e-05, "loss": 0.1398, "step": 1797 }, { "epoch": 1.7070970804652266, "grad_norm": 0.02509194053709507, "learning_rate": 9.331956369014746e-05, "loss": 0.1166, "step": 1798 }, { "epoch": 1.7080465226679324, "grad_norm": 0.030363403260707855, "learning_rate": 9.320931191266587e-05, "loss": 0.1191, "step": 1799 }, { "epoch": 1.7089959648706385, "grad_norm": 0.08008571714162827, "learning_rate": 9.309906842719323e-05, "loss": 0.1494, "step": 1800 }, { "epoch": 1.7099454070733444, "grad_norm": 0.027857676148414612, "learning_rate": 9.298883336834633e-05, "loss": 0.1307, "step": 1801 }, { "epoch": 1.7108948492760503, "grad_norm": 0.02744341269135475, "learning_rate": 9.28786068707315e-05, "loss": 0.1217, "step": 1802 }, { "epoch": 1.7118442914787564, "grad_norm": 0.0324774868786335, "learning_rate": 9.276838906894472e-05, "loss": 0.1311, "step": 1803 }, { "epoch": 1.7127937336814623, "grad_norm": 0.028761887922883034, "learning_rate": 9.265818009757132e-05, "loss": 0.1275, "step": 1804 }, { "epoch": 1.7137431758841681, "grad_norm": 0.02950756810605526, "learning_rate": 9.254798009118584e-05, "loss": 0.1262, "step": 1805 }, { "epoch": 1.714692618086874, "grad_norm": 0.027881214395165443, "learning_rate": 9.243778918435187e-05, "loss": 0.1266, "step": 1806 }, { "epoch": 1.7156420602895799, "grad_norm": 0.05155957117676735, "learning_rate": 9.232760751162193e-05, "loss": 0.1936, "step": 1807 }, { "epoch": 1.7165915024922858, "grad_norm": 0.029041916131973267, "learning_rate": 9.221743520753719e-05, "loss": 0.1204, "step": 1808 }, { "epoch": 1.7175409446949916, "grad_norm": 0.030144108459353447, "learning_rate": 9.210727240662747e-05, "loss": 0.1285, "step": 1809 }, { "epoch": 1.7184903868976975, "grad_norm": 0.028103960677981377, "learning_rate": 9.199711924341093e-05, "loss": 0.125, "step": 1810 }, { "epoch": 1.7194398291004034, "grad_norm": 0.03844073414802551, "learning_rate": 9.188697585239394e-05, "loss": 0.1525, "step": 1811 }, { "epoch": 1.7203892713031093, "grad_norm": 0.04454744979739189, "learning_rate": 9.177684236807099e-05, "loss": 0.1616, "step": 1812 }, { "epoch": 1.7213387135058154, "grad_norm": 0.027989163994789124, "learning_rate": 9.166671892492446e-05, "loss": 0.1265, "step": 1813 }, { "epoch": 1.7222881557085212, "grad_norm": 0.04422546178102493, "learning_rate": 9.155660565742444e-05, "loss": 0.159, "step": 1814 }, { "epoch": 1.723237597911227, "grad_norm": 0.027917252853512764, "learning_rate": 9.144650270002866e-05, "loss": 0.1229, "step": 1815 }, { "epoch": 1.7241870401139332, "grad_norm": 0.05252804979681969, "learning_rate": 9.133641018718217e-05, "loss": 0.1955, "step": 1816 }, { "epoch": 1.725136482316639, "grad_norm": 0.029228439554572105, "learning_rate": 9.122632825331733e-05, "loss": 0.1197, "step": 1817 }, { "epoch": 1.726085924519345, "grad_norm": 0.02810599096119404, "learning_rate": 9.111625703285356e-05, "loss": 0.1284, "step": 1818 }, { "epoch": 1.7270353667220508, "grad_norm": 0.02618074230849743, "learning_rate": 9.10061966601972e-05, "loss": 0.1239, "step": 1819 }, { "epoch": 1.7279848089247567, "grad_norm": 0.026649268344044685, "learning_rate": 9.089614726974137e-05, "loss": 0.1218, "step": 1820 }, { "epoch": 1.7289342511274626, "grad_norm": 0.02857782505452633, "learning_rate": 9.078610899586575e-05, "loss": 0.133, "step": 1821 }, { "epoch": 1.7298836933301684, "grad_norm": 0.026309454813599586, "learning_rate": 9.067608197293642e-05, "loss": 0.1175, "step": 1822 }, { "epoch": 1.7308331355328743, "grad_norm": 0.02791914902627468, "learning_rate": 9.056606633530578e-05, "loss": 0.12, "step": 1823 }, { "epoch": 1.7317825777355802, "grad_norm": 0.030874181538820267, "learning_rate": 9.045606221731229e-05, "loss": 0.1307, "step": 1824 }, { "epoch": 1.7327320199382863, "grad_norm": 0.030806539580225945, "learning_rate": 9.034606975328033e-05, "loss": 0.1188, "step": 1825 }, { "epoch": 1.7336814621409922, "grad_norm": 0.028665419667959213, "learning_rate": 9.023608907752015e-05, "loss": 0.1311, "step": 1826 }, { "epoch": 1.734630904343698, "grad_norm": 0.029301505535840988, "learning_rate": 9.012612032432747e-05, "loss": 0.1325, "step": 1827 }, { "epoch": 1.7355803465464041, "grad_norm": 0.0394410640001297, "learning_rate": 9.001616362798353e-05, "loss": 0.164, "step": 1828 }, { "epoch": 1.73652978874911, "grad_norm": 0.02754109725356102, "learning_rate": 8.990621912275484e-05, "loss": 0.12, "step": 1829 }, { "epoch": 1.737479230951816, "grad_norm": 0.02719545178115368, "learning_rate": 8.9796286942893e-05, "loss": 0.1203, "step": 1830 }, { "epoch": 1.7384286731545218, "grad_norm": 0.02480783686041832, "learning_rate": 8.968636722263455e-05, "loss": 0.1225, "step": 1831 }, { "epoch": 1.7393781153572276, "grad_norm": 0.025418803095817566, "learning_rate": 8.957646009620085e-05, "loss": 0.125, "step": 1832 }, { "epoch": 1.7403275575599335, "grad_norm": 0.024165470153093338, "learning_rate": 8.94665656977979e-05, "loss": 0.125, "step": 1833 }, { "epoch": 1.7412769997626394, "grad_norm": 0.03011813573539257, "learning_rate": 8.935668416161612e-05, "loss": 0.1337, "step": 1834 }, { "epoch": 1.7422264419653453, "grad_norm": 0.038413502275943756, "learning_rate": 8.92468156218302e-05, "loss": 0.1715, "step": 1835 }, { "epoch": 1.7431758841680511, "grad_norm": 0.023849591612815857, "learning_rate": 8.9136960212599e-05, "loss": 0.1148, "step": 1836 }, { "epoch": 1.7441253263707572, "grad_norm": 0.027159664779901505, "learning_rate": 8.902711806806536e-05, "loss": 0.1255, "step": 1837 }, { "epoch": 1.745074768573463, "grad_norm": 0.030395383015275, "learning_rate": 8.89172893223559e-05, "loss": 0.1267, "step": 1838 }, { "epoch": 1.746024210776169, "grad_norm": 0.025772254914045334, "learning_rate": 8.880747410958085e-05, "loss": 0.1212, "step": 1839 }, { "epoch": 1.7469736529788749, "grad_norm": 0.04073212668299675, "learning_rate": 8.8697672563834e-05, "loss": 0.1229, "step": 1840 }, { "epoch": 1.747923095181581, "grad_norm": 0.048602957278490067, "learning_rate": 8.858788481919235e-05, "loss": 0.1587, "step": 1841 }, { "epoch": 1.7488725373842868, "grad_norm": 0.030672159045934677, "learning_rate": 8.84781110097161e-05, "loss": 0.1276, "step": 1842 }, { "epoch": 1.7498219795869927, "grad_norm": 0.029867272824048996, "learning_rate": 8.836835126944843e-05, "loss": 0.1316, "step": 1843 }, { "epoch": 1.7507714217896986, "grad_norm": 0.03122364915907383, "learning_rate": 8.825860573241535e-05, "loss": 0.1276, "step": 1844 }, { "epoch": 1.7517208639924045, "grad_norm": 0.03530842810869217, "learning_rate": 8.814887453262555e-05, "loss": 0.1272, "step": 1845 }, { "epoch": 1.7526703061951103, "grad_norm": 0.028104711323976517, "learning_rate": 8.803915780407009e-05, "loss": 0.1277, "step": 1846 }, { "epoch": 1.7536197483978162, "grad_norm": 0.02434263750910759, "learning_rate": 8.792945568072252e-05, "loss": 0.1136, "step": 1847 }, { "epoch": 1.754569190600522, "grad_norm": 0.027843188494443893, "learning_rate": 8.781976829653846e-05, "loss": 0.1199, "step": 1848 }, { "epoch": 1.755518632803228, "grad_norm": 0.03688850998878479, "learning_rate": 8.771009578545553e-05, "loss": 0.1345, "step": 1849 }, { "epoch": 1.756468075005934, "grad_norm": 0.027186516672372818, "learning_rate": 8.760043828139325e-05, "loss": 0.1149, "step": 1850 }, { "epoch": 1.75741751720864, "grad_norm": 0.04049715772271156, "learning_rate": 8.749079591825278e-05, "loss": 0.1585, "step": 1851 }, { "epoch": 1.7583669594113458, "grad_norm": 0.02956775203347206, "learning_rate": 8.738116882991679e-05, "loss": 0.1303, "step": 1852 }, { "epoch": 1.759316401614052, "grad_norm": 0.026160147041082382, "learning_rate": 8.72715571502493e-05, "loss": 0.1237, "step": 1853 }, { "epoch": 1.7602658438167578, "grad_norm": 0.023719090968370438, "learning_rate": 8.71619610130955e-05, "loss": 0.1172, "step": 1854 }, { "epoch": 1.7612152860194636, "grad_norm": 0.024884294718503952, "learning_rate": 8.705238055228161e-05, "loss": 0.123, "step": 1855 }, { "epoch": 1.7621647282221695, "grad_norm": 0.028241394087672234, "learning_rate": 8.694281590161474e-05, "loss": 0.129, "step": 1856 }, { "epoch": 1.7631141704248754, "grad_norm": 0.028791090473532677, "learning_rate": 8.683326719488263e-05, "loss": 0.121, "step": 1857 }, { "epoch": 1.7640636126275813, "grad_norm": 0.046369921416044235, "learning_rate": 8.672373456585365e-05, "loss": 0.1666, "step": 1858 }, { "epoch": 1.7650130548302871, "grad_norm": 0.025271739810705185, "learning_rate": 8.661421814827641e-05, "loss": 0.1196, "step": 1859 }, { "epoch": 1.765962497032993, "grad_norm": 0.02569795772433281, "learning_rate": 8.650471807587983e-05, "loss": 0.1235, "step": 1860 }, { "epoch": 1.766911939235699, "grad_norm": 0.03638843819499016, "learning_rate": 8.639523448237282e-05, "loss": 0.1523, "step": 1861 }, { "epoch": 1.767861381438405, "grad_norm": 0.03260574862360954, "learning_rate": 8.628576750144419e-05, "loss": 0.1328, "step": 1862 }, { "epoch": 1.7688108236411109, "grad_norm": 0.02770201303064823, "learning_rate": 8.617631726676243e-05, "loss": 0.1256, "step": 1863 }, { "epoch": 1.7697602658438167, "grad_norm": 0.02869422361254692, "learning_rate": 8.606688391197564e-05, "loss": 0.1261, "step": 1864 }, { "epoch": 1.7707097080465226, "grad_norm": 0.02792002633213997, "learning_rate": 8.595746757071125e-05, "loss": 0.1277, "step": 1865 }, { "epoch": 1.7716591502492287, "grad_norm": 0.025662843137979507, "learning_rate": 8.584806837657594e-05, "loss": 0.1163, "step": 1866 }, { "epoch": 1.7726085924519346, "grad_norm": 0.027771448716521263, "learning_rate": 8.573868646315546e-05, "loss": 0.1273, "step": 1867 }, { "epoch": 1.7735580346546405, "grad_norm": 0.026355689391493797, "learning_rate": 8.562932196401444e-05, "loss": 0.1241, "step": 1868 }, { "epoch": 1.7745074768573463, "grad_norm": 0.028244782239198685, "learning_rate": 8.551997501269629e-05, "loss": 0.1319, "step": 1869 }, { "epoch": 1.7754569190600522, "grad_norm": 0.027661755681037903, "learning_rate": 8.541064574272292e-05, "loss": 0.134, "step": 1870 }, { "epoch": 1.776406361262758, "grad_norm": 0.026287924498319626, "learning_rate": 8.530133428759468e-05, "loss": 0.1215, "step": 1871 }, { "epoch": 1.777355803465464, "grad_norm": 0.040449049323797226, "learning_rate": 8.519204078079021e-05, "loss": 0.179, "step": 1872 }, { "epoch": 1.7783052456681698, "grad_norm": 0.022792836651206017, "learning_rate": 8.508276535576619e-05, "loss": 0.1208, "step": 1873 }, { "epoch": 1.7792546878708757, "grad_norm": 0.042618922889232635, "learning_rate": 8.497350814595721e-05, "loss": 0.1666, "step": 1874 }, { "epoch": 1.7802041300735818, "grad_norm": 0.0521213673055172, "learning_rate": 8.486426928477561e-05, "loss": 0.1858, "step": 1875 }, { "epoch": 1.7811535722762877, "grad_norm": 0.04682205617427826, "learning_rate": 8.475504890561142e-05, "loss": 0.2037, "step": 1876 }, { "epoch": 1.7821030144789936, "grad_norm": 0.041265182197093964, "learning_rate": 8.464584714183204e-05, "loss": 0.1775, "step": 1877 }, { "epoch": 1.7830524566816996, "grad_norm": 0.027602121233940125, "learning_rate": 8.453666412678206e-05, "loss": 0.1186, "step": 1878 }, { "epoch": 1.7840018988844055, "grad_norm": 0.025885846465826035, "learning_rate": 8.442749999378327e-05, "loss": 0.1275, "step": 1879 }, { "epoch": 1.7849513410871114, "grad_norm": 0.05111463740468025, "learning_rate": 8.43183548761344e-05, "loss": 0.1535, "step": 1880 }, { "epoch": 1.7859007832898173, "grad_norm": 0.026447484269738197, "learning_rate": 8.420922890711094e-05, "loss": 0.1244, "step": 1881 }, { "epoch": 1.7868502254925231, "grad_norm": 0.046114444732666016, "learning_rate": 8.410012221996502e-05, "loss": 0.1549, "step": 1882 }, { "epoch": 1.787799667695229, "grad_norm": 0.027883663773536682, "learning_rate": 8.399103494792514e-05, "loss": 0.1186, "step": 1883 }, { "epoch": 1.788749109897935, "grad_norm": 0.02667239122092724, "learning_rate": 8.388196722419621e-05, "loss": 0.1367, "step": 1884 }, { "epoch": 1.7896985521006408, "grad_norm": 0.028317047283053398, "learning_rate": 8.377291918195922e-05, "loss": 0.1293, "step": 1885 }, { "epoch": 1.7906479943033466, "grad_norm": 0.044224657118320465, "learning_rate": 8.36638909543711e-05, "loss": 0.1722, "step": 1886 }, { "epoch": 1.7915974365060527, "grad_norm": 0.029220551252365112, "learning_rate": 8.35548826745646e-05, "loss": 0.1315, "step": 1887 }, { "epoch": 1.7925468787087586, "grad_norm": 0.028357302770018578, "learning_rate": 8.344589447564818e-05, "loss": 0.1271, "step": 1888 }, { "epoch": 1.7934963209114645, "grad_norm": 0.027882913127541542, "learning_rate": 8.333692649070568e-05, "loss": 0.1311, "step": 1889 }, { "epoch": 1.7944457631141706, "grad_norm": 0.029344897717237473, "learning_rate": 8.322797885279627e-05, "loss": 0.1231, "step": 1890 }, { "epoch": 1.7953952053168765, "grad_norm": 0.039409589022397995, "learning_rate": 8.311905169495435e-05, "loss": 0.1651, "step": 1891 }, { "epoch": 1.7963446475195823, "grad_norm": 0.025551313534379005, "learning_rate": 8.301014515018925e-05, "loss": 0.1162, "step": 1892 }, { "epoch": 1.7972940897222882, "grad_norm": 0.027775781229138374, "learning_rate": 8.290125935148516e-05, "loss": 0.1254, "step": 1893 }, { "epoch": 1.798243531924994, "grad_norm": 0.025555282831192017, "learning_rate": 8.279239443180088e-05, "loss": 0.1173, "step": 1894 }, { "epoch": 1.7991929741277, "grad_norm": 0.027120131999254227, "learning_rate": 8.268355052406978e-05, "loss": 0.123, "step": 1895 }, { "epoch": 1.8001424163304058, "grad_norm": 0.027624819427728653, "learning_rate": 8.257472776119957e-05, "loss": 0.1313, "step": 1896 }, { "epoch": 1.8010918585331117, "grad_norm": 0.04341182857751846, "learning_rate": 8.246592627607208e-05, "loss": 0.1695, "step": 1897 }, { "epoch": 1.8020413007358176, "grad_norm": 0.049969110637903214, "learning_rate": 8.235714620154323e-05, "loss": 0.1609, "step": 1898 }, { "epoch": 1.8029907429385235, "grad_norm": 0.033960528671741486, "learning_rate": 8.224838767044275e-05, "loss": 0.1299, "step": 1899 }, { "epoch": 1.8039401851412296, "grad_norm": 0.045656926929950714, "learning_rate": 8.213965081557402e-05, "loss": 0.1613, "step": 1900 }, { "epoch": 1.8048896273439354, "grad_norm": 0.028559250757098198, "learning_rate": 8.203093576971414e-05, "loss": 0.1222, "step": 1901 }, { "epoch": 1.8058390695466413, "grad_norm": 0.026110464707016945, "learning_rate": 8.192224266561336e-05, "loss": 0.1204, "step": 1902 }, { "epoch": 1.8067885117493474, "grad_norm": 0.023995952680706978, "learning_rate": 8.181357163599522e-05, "loss": 0.1164, "step": 1903 }, { "epoch": 1.8077379539520533, "grad_norm": 0.054921507835388184, "learning_rate": 8.170492281355635e-05, "loss": 0.1535, "step": 1904 }, { "epoch": 1.8086873961547592, "grad_norm": 0.03065885417163372, "learning_rate": 8.159629633096619e-05, "loss": 0.1325, "step": 1905 }, { "epoch": 1.809636838357465, "grad_norm": 0.02722746506333351, "learning_rate": 8.148769232086698e-05, "loss": 0.1201, "step": 1906 }, { "epoch": 1.810586280560171, "grad_norm": 0.026619885116815567, "learning_rate": 8.13791109158734e-05, "loss": 0.125, "step": 1907 }, { "epoch": 1.8115357227628768, "grad_norm": 0.02867223508656025, "learning_rate": 8.127055224857266e-05, "loss": 0.1279, "step": 1908 }, { "epoch": 1.8124851649655827, "grad_norm": 0.029073316603899002, "learning_rate": 8.116201645152412e-05, "loss": 0.1195, "step": 1909 }, { "epoch": 1.8134346071682885, "grad_norm": 0.04955434426665306, "learning_rate": 8.105350365725926e-05, "loss": 0.1657, "step": 1910 }, { "epoch": 1.8143840493709944, "grad_norm": 0.03038748912513256, "learning_rate": 8.094501399828143e-05, "loss": 0.1244, "step": 1911 }, { "epoch": 1.8153334915737005, "grad_norm": 0.08984460681676865, "learning_rate": 8.08365476070658e-05, "loss": 0.1602, "step": 1912 }, { "epoch": 1.8162829337764064, "grad_norm": 0.028402511030435562, "learning_rate": 8.0728104616059e-05, "loss": 0.1162, "step": 1913 }, { "epoch": 1.8172323759791122, "grad_norm": 0.03234838321805, "learning_rate": 8.061968515767922e-05, "loss": 0.1271, "step": 1914 }, { "epoch": 1.8181818181818183, "grad_norm": 0.031242968514561653, "learning_rate": 8.051128936431584e-05, "loss": 0.1266, "step": 1915 }, { "epoch": 1.8191312603845242, "grad_norm": 0.030700111761689186, "learning_rate": 8.040291736832937e-05, "loss": 0.1294, "step": 1916 }, { "epoch": 1.82008070258723, "grad_norm": 0.031359221786260605, "learning_rate": 8.029456930205128e-05, "loss": 0.1316, "step": 1917 }, { "epoch": 1.821030144789936, "grad_norm": 0.03048795275390148, "learning_rate": 8.018624529778375e-05, "loss": 0.127, "step": 1918 }, { "epoch": 1.8219795869926418, "grad_norm": 0.03642559424042702, "learning_rate": 8.007794548779964e-05, "loss": 0.1577, "step": 1919 }, { "epoch": 1.8229290291953477, "grad_norm": 0.029665078967809677, "learning_rate": 7.996967000434224e-05, "loss": 0.1266, "step": 1920 }, { "epoch": 1.8238784713980536, "grad_norm": 0.027418775483965874, "learning_rate": 7.986141897962518e-05, "loss": 0.1319, "step": 1921 }, { "epoch": 1.8248279136007595, "grad_norm": 0.03040868602693081, "learning_rate": 7.975319254583216e-05, "loss": 0.1263, "step": 1922 }, { "epoch": 1.8257773558034653, "grad_norm": 0.030304348096251488, "learning_rate": 7.96449908351169e-05, "loss": 0.1357, "step": 1923 }, { "epoch": 1.8267267980061712, "grad_norm": 0.029055261984467506, "learning_rate": 7.953681397960287e-05, "loss": 0.1293, "step": 1924 }, { "epoch": 1.8276762402088773, "grad_norm": 0.030226033180952072, "learning_rate": 7.942866211138324e-05, "loss": 0.1335, "step": 1925 }, { "epoch": 1.8286256824115832, "grad_norm": 0.02857894077897072, "learning_rate": 7.93205353625207e-05, "loss": 0.1307, "step": 1926 }, { "epoch": 1.829575124614289, "grad_norm": 0.031932733952999115, "learning_rate": 7.921243386504723e-05, "loss": 0.1345, "step": 1927 }, { "epoch": 1.8305245668169952, "grad_norm": 0.030729882419109344, "learning_rate": 7.910435775096394e-05, "loss": 0.1256, "step": 1928 }, { "epoch": 1.831474009019701, "grad_norm": 0.056101903319358826, "learning_rate": 7.899630715224098e-05, "loss": 0.1858, "step": 1929 }, { "epoch": 1.832423451222407, "grad_norm": 0.02785342186689377, "learning_rate": 7.888828220081738e-05, "loss": 0.1244, "step": 1930 }, { "epoch": 1.8333728934251128, "grad_norm": 0.031731851398944855, "learning_rate": 7.878028302860076e-05, "loss": 0.1339, "step": 1931 }, { "epoch": 1.8343223356278187, "grad_norm": 0.02918057143688202, "learning_rate": 7.867230976746733e-05, "loss": 0.1309, "step": 1932 }, { "epoch": 1.8352717778305245, "grad_norm": 0.029841450974345207, "learning_rate": 7.856436254926165e-05, "loss": 0.1244, "step": 1933 }, { "epoch": 1.8362212200332304, "grad_norm": 0.02823001891374588, "learning_rate": 7.845644150579649e-05, "loss": 0.1253, "step": 1934 }, { "epoch": 1.8371706622359363, "grad_norm": 0.027422424405813217, "learning_rate": 7.834854676885262e-05, "loss": 0.1141, "step": 1935 }, { "epoch": 1.8381201044386422, "grad_norm": 0.02671034075319767, "learning_rate": 7.824067847017876e-05, "loss": 0.1204, "step": 1936 }, { "epoch": 1.8390695466413483, "grad_norm": 0.02999771386384964, "learning_rate": 7.813283674149123e-05, "loss": 0.1258, "step": 1937 }, { "epoch": 1.8400189888440541, "grad_norm": 0.026322634890675545, "learning_rate": 7.8025021714474e-05, "loss": 0.117, "step": 1938 }, { "epoch": 1.84096843104676, "grad_norm": 0.02665727399289608, "learning_rate": 7.791723352077842e-05, "loss": 0.1168, "step": 1939 }, { "epoch": 1.841917873249466, "grad_norm": 0.043415650725364685, "learning_rate": 7.780947229202305e-05, "loss": 0.1666, "step": 1940 }, { "epoch": 1.842867315452172, "grad_norm": 0.03161248564720154, "learning_rate": 7.770173815979356e-05, "loss": 0.1307, "step": 1941 }, { "epoch": 1.8438167576548778, "grad_norm": 0.028511585667729378, "learning_rate": 7.759403125564246e-05, "loss": 0.1329, "step": 1942 }, { "epoch": 1.8447661998575837, "grad_norm": 0.03340164199471474, "learning_rate": 7.74863517110891e-05, "loss": 0.1233, "step": 1943 }, { "epoch": 1.8457156420602896, "grad_norm": 0.027353493496775627, "learning_rate": 7.737869965761937e-05, "loss": 0.1227, "step": 1944 }, { "epoch": 1.8466650842629955, "grad_norm": 0.04435974359512329, "learning_rate": 7.72710752266856e-05, "loss": 0.1586, "step": 1945 }, { "epoch": 1.8476145264657013, "grad_norm": 0.03443425893783569, "learning_rate": 7.716347854970642e-05, "loss": 0.1285, "step": 1946 }, { "epoch": 1.8485639686684072, "grad_norm": 0.02941983938217163, "learning_rate": 7.705590975806652e-05, "loss": 0.1227, "step": 1947 }, { "epoch": 1.849513410871113, "grad_norm": 0.031541094183921814, "learning_rate": 7.694836898311654e-05, "loss": 0.1307, "step": 1948 }, { "epoch": 1.850462853073819, "grad_norm": 0.030199352651834488, "learning_rate": 7.684085635617297e-05, "loss": 0.126, "step": 1949 }, { "epoch": 1.851412295276525, "grad_norm": 0.024474412202835083, "learning_rate": 7.673337200851787e-05, "loss": 0.1222, "step": 1950 }, { "epoch": 1.852361737479231, "grad_norm": 0.02779853343963623, "learning_rate": 7.662591607139882e-05, "loss": 0.1242, "step": 1951 }, { "epoch": 1.8533111796819368, "grad_norm": 0.036591142416000366, "learning_rate": 7.651848867602867e-05, "loss": 0.1593, "step": 1952 }, { "epoch": 1.854260621884643, "grad_norm": 0.029311561957001686, "learning_rate": 7.641108995358542e-05, "loss": 0.1237, "step": 1953 }, { "epoch": 1.8552100640873488, "grad_norm": 0.029321955516934395, "learning_rate": 7.630372003521204e-05, "loss": 0.1289, "step": 1954 }, { "epoch": 1.8561595062900547, "grad_norm": 0.034341856837272644, "learning_rate": 7.619637905201642e-05, "loss": 0.1323, "step": 1955 }, { "epoch": 1.8571089484927605, "grad_norm": 0.02771337330341339, "learning_rate": 7.608906713507098e-05, "loss": 0.133, "step": 1956 }, { "epoch": 1.8580583906954664, "grad_norm": 0.03146693855524063, "learning_rate": 7.598178441541274e-05, "loss": 0.1346, "step": 1957 }, { "epoch": 1.8590078328981723, "grad_norm": 0.028372354805469513, "learning_rate": 7.587453102404306e-05, "loss": 0.1194, "step": 1958 }, { "epoch": 1.8599572751008782, "grad_norm": 0.027766333892941475, "learning_rate": 7.576730709192744e-05, "loss": 0.1241, "step": 1959 }, { "epoch": 1.860906717303584, "grad_norm": 0.026262789964675903, "learning_rate": 7.566011274999549e-05, "loss": 0.1191, "step": 1960 }, { "epoch": 1.86185615950629, "grad_norm": 0.028471313416957855, "learning_rate": 7.555294812914061e-05, "loss": 0.1208, "step": 1961 }, { "epoch": 1.862805601708996, "grad_norm": 0.0403280183672905, "learning_rate": 7.544581336021994e-05, "loss": 0.169, "step": 1962 }, { "epoch": 1.8637550439117019, "grad_norm": 0.029336489737033844, "learning_rate": 7.533870857405414e-05, "loss": 0.1275, "step": 1963 }, { "epoch": 1.8647044861144078, "grad_norm": 0.05861514061689377, "learning_rate": 7.523163390142732e-05, "loss": 0.1984, "step": 1964 }, { "epoch": 1.8656539283171139, "grad_norm": 0.026019204407930374, "learning_rate": 7.51245894730868e-05, "loss": 0.1194, "step": 1965 }, { "epoch": 1.8666033705198197, "grad_norm": 0.043394673615694046, "learning_rate": 7.501757541974289e-05, "loss": 0.1598, "step": 1966 }, { "epoch": 1.8675528127225256, "grad_norm": 0.025404971092939377, "learning_rate": 7.49105918720689e-05, "loss": 0.1159, "step": 1967 }, { "epoch": 1.8685022549252315, "grad_norm": 0.028354499489068985, "learning_rate": 7.480363896070089e-05, "loss": 0.1216, "step": 1968 }, { "epoch": 1.8694516971279374, "grad_norm": 0.06748262792825699, "learning_rate": 7.469671681623742e-05, "loss": 0.1888, "step": 1969 }, { "epoch": 1.8704011393306432, "grad_norm": 0.027621906250715256, "learning_rate": 7.458982556923963e-05, "loss": 0.1196, "step": 1970 }, { "epoch": 1.871350581533349, "grad_norm": 0.031015096232295036, "learning_rate": 7.448296535023077e-05, "loss": 0.1266, "step": 1971 }, { "epoch": 1.872300023736055, "grad_norm": 0.02895331382751465, "learning_rate": 7.437613628969627e-05, "loss": 0.1284, "step": 1972 }, { "epoch": 1.8732494659387608, "grad_norm": 0.04110453650355339, "learning_rate": 7.426933851808355e-05, "loss": 0.1545, "step": 1973 }, { "epoch": 1.8741989081414667, "grad_norm": 0.030046746134757996, "learning_rate": 7.416257216580181e-05, "loss": 0.1269, "step": 1974 }, { "epoch": 1.8751483503441728, "grad_norm": 0.04101106524467468, "learning_rate": 7.405583736322182e-05, "loss": 0.1621, "step": 1975 }, { "epoch": 1.8760977925468787, "grad_norm": 0.04438061639666557, "learning_rate": 7.394913424067591e-05, "loss": 0.1693, "step": 1976 }, { "epoch": 1.8770472347495846, "grad_norm": 0.028554782271385193, "learning_rate": 7.38424629284577e-05, "loss": 0.1265, "step": 1977 }, { "epoch": 1.8779966769522907, "grad_norm": 0.028575632721185684, "learning_rate": 7.373582355682191e-05, "loss": 0.12, "step": 1978 }, { "epoch": 1.8789461191549965, "grad_norm": 0.027870802208781242, "learning_rate": 7.362921625598436e-05, "loss": 0.1301, "step": 1979 }, { "epoch": 1.8798955613577024, "grad_norm": 0.03442731872200966, "learning_rate": 7.352264115612158e-05, "loss": 0.1537, "step": 1980 }, { "epoch": 1.8808450035604083, "grad_norm": 0.03244437277317047, "learning_rate": 7.341609838737089e-05, "loss": 0.1356, "step": 1981 }, { "epoch": 1.8817944457631142, "grad_norm": 0.0298260897397995, "learning_rate": 7.330958807983011e-05, "loss": 0.1238, "step": 1982 }, { "epoch": 1.88274388796582, "grad_norm": 0.02521882764995098, "learning_rate": 7.320311036355736e-05, "loss": 0.1185, "step": 1983 }, { "epoch": 1.883693330168526, "grad_norm": 0.02781338430941105, "learning_rate": 7.309666536857106e-05, "loss": 0.1214, "step": 1984 }, { "epoch": 1.8846427723712318, "grad_norm": 0.05946779251098633, "learning_rate": 7.299025322484958e-05, "loss": 0.1897, "step": 1985 }, { "epoch": 1.8855922145739377, "grad_norm": 0.028507012873888016, "learning_rate": 7.288387406233122e-05, "loss": 0.1282, "step": 1986 }, { "epoch": 1.8865416567766438, "grad_norm": 0.029610810801386833, "learning_rate": 7.277752801091404e-05, "loss": 0.1302, "step": 1987 }, { "epoch": 1.8874910989793496, "grad_norm": 0.030304808169603348, "learning_rate": 7.267121520045558e-05, "loss": 0.132, "step": 1988 }, { "epoch": 1.8884405411820555, "grad_norm": 0.028647607192397118, "learning_rate": 7.256493576077292e-05, "loss": 0.1309, "step": 1989 }, { "epoch": 1.8893899833847616, "grad_norm": 0.04364948347210884, "learning_rate": 7.245868982164226e-05, "loss": 0.1628, "step": 1990 }, { "epoch": 1.8903394255874675, "grad_norm": 0.029457390308380127, "learning_rate": 7.235247751279893e-05, "loss": 0.1163, "step": 1991 }, { "epoch": 1.8912888677901734, "grad_norm": 0.059156183153390884, "learning_rate": 7.224629896393726e-05, "loss": 0.2033, "step": 1992 }, { "epoch": 1.8922383099928792, "grad_norm": 0.02888781949877739, "learning_rate": 7.214015430471028e-05, "loss": 0.1242, "step": 1993 }, { "epoch": 1.893187752195585, "grad_norm": 0.03040069155395031, "learning_rate": 7.20340436647297e-05, "loss": 0.1211, "step": 1994 }, { "epoch": 1.894137194398291, "grad_norm": 0.028204258531332016, "learning_rate": 7.192796717356562e-05, "loss": 0.1267, "step": 1995 }, { "epoch": 1.8950866366009969, "grad_norm": 0.030367571860551834, "learning_rate": 7.182192496074648e-05, "loss": 0.1232, "step": 1996 }, { "epoch": 1.8960360788037027, "grad_norm": 0.02587362751364708, "learning_rate": 7.171591715575888e-05, "loss": 0.1261, "step": 1997 }, { "epoch": 1.8969855210064086, "grad_norm": 0.028903882950544357, "learning_rate": 7.160994388804736e-05, "loss": 0.1318, "step": 1998 }, { "epoch": 1.8979349632091147, "grad_norm": 0.025526562705636024, "learning_rate": 7.150400528701436e-05, "loss": 0.1205, "step": 1999 }, { "epoch": 1.8988844054118206, "grad_norm": 0.029438691213726997, "learning_rate": 7.139810148201987e-05, "loss": 0.131, "step": 2000 }, { "epoch": 1.8998338476145264, "grad_norm": 0.026545461267232895, "learning_rate": 7.129223260238154e-05, "loss": 0.1219, "step": 2001 }, { "epoch": 1.9007832898172323, "grad_norm": 0.03137153759598732, "learning_rate": 7.118639877737425e-05, "loss": 0.1389, "step": 2002 }, { "epoch": 1.9017327320199384, "grad_norm": 0.03655494004487991, "learning_rate": 7.108060013623017e-05, "loss": 0.1592, "step": 2003 }, { "epoch": 1.9026821742226443, "grad_norm": 0.0271841399371624, "learning_rate": 7.09748368081384e-05, "loss": 0.1309, "step": 2004 }, { "epoch": 1.9036316164253502, "grad_norm": 0.028577405959367752, "learning_rate": 7.086910892224499e-05, "loss": 0.1261, "step": 2005 }, { "epoch": 1.904581058628056, "grad_norm": 0.028025876730680466, "learning_rate": 7.076341660765271e-05, "loss": 0.1323, "step": 2006 }, { "epoch": 1.905530500830762, "grad_norm": 0.03332342579960823, "learning_rate": 7.065775999342091e-05, "loss": 0.1327, "step": 2007 }, { "epoch": 1.9064799430334678, "grad_norm": 0.043180011212825775, "learning_rate": 7.055213920856529e-05, "loss": 0.1613, "step": 2008 }, { "epoch": 1.9074293852361737, "grad_norm": 0.04228482022881508, "learning_rate": 7.044655438205785e-05, "loss": 0.1594, "step": 2009 }, { "epoch": 1.9083788274388795, "grad_norm": 0.029172802343964577, "learning_rate": 7.034100564282664e-05, "loss": 0.1258, "step": 2010 }, { "epoch": 1.9093282696415854, "grad_norm": 0.0426810160279274, "learning_rate": 7.02354931197557e-05, "loss": 0.16, "step": 2011 }, { "epoch": 1.9102777118442915, "grad_norm": 0.025085503235459328, "learning_rate": 7.013001694168478e-05, "loss": 0.1233, "step": 2012 }, { "epoch": 1.9112271540469974, "grad_norm": 0.0266293715685606, "learning_rate": 7.002457723740934e-05, "loss": 0.1214, "step": 2013 }, { "epoch": 1.9121765962497033, "grad_norm": 0.03064984828233719, "learning_rate": 6.991917413568017e-05, "loss": 0.1186, "step": 2014 }, { "epoch": 1.9131260384524094, "grad_norm": 0.026003271341323853, "learning_rate": 6.981380776520348e-05, "loss": 0.1228, "step": 2015 }, { "epoch": 1.9140754806551152, "grad_norm": 0.045436155050992966, "learning_rate": 6.970847825464059e-05, "loss": 0.174, "step": 2016 }, { "epoch": 1.915024922857821, "grad_norm": 0.029938362538814545, "learning_rate": 6.960318573260783e-05, "loss": 0.1201, "step": 2017 }, { "epoch": 1.915974365060527, "grad_norm": 0.026935014873743057, "learning_rate": 6.949793032767634e-05, "loss": 0.1165, "step": 2018 }, { "epoch": 1.9169238072632329, "grad_norm": 0.02809876948595047, "learning_rate": 6.93927121683719e-05, "loss": 0.1248, "step": 2019 }, { "epoch": 1.9178732494659387, "grad_norm": 0.03932083770632744, "learning_rate": 6.928753138317488e-05, "loss": 0.1607, "step": 2020 }, { "epoch": 1.9188226916686446, "grad_norm": 0.029043098911643028, "learning_rate": 6.918238810051999e-05, "loss": 0.1292, "step": 2021 }, { "epoch": 1.9197721338713505, "grad_norm": 0.03849990293383598, "learning_rate": 6.907728244879611e-05, "loss": 0.1611, "step": 2022 }, { "epoch": 1.9207215760740564, "grad_norm": 0.028439447283744812, "learning_rate": 6.897221455634624e-05, "loss": 0.1265, "step": 2023 }, { "epoch": 1.9216710182767625, "grad_norm": 0.028611112385988235, "learning_rate": 6.886718455146724e-05, "loss": 0.1312, "step": 2024 }, { "epoch": 1.9226204604794683, "grad_norm": 0.02605103701353073, "learning_rate": 6.87621925624096e-05, "loss": 0.1241, "step": 2025 }, { "epoch": 1.9235699026821742, "grad_norm": 0.06604333966970444, "learning_rate": 6.865723871737762e-05, "loss": 0.2016, "step": 2026 }, { "epoch": 1.92451934488488, "grad_norm": 0.044974714517593384, "learning_rate": 6.855232314452884e-05, "loss": 0.1778, "step": 2027 }, { "epoch": 1.9254687870875862, "grad_norm": 0.03168616443872452, "learning_rate": 6.844744597197409e-05, "loss": 0.1327, "step": 2028 }, { "epoch": 1.926418229290292, "grad_norm": 0.029546428471803665, "learning_rate": 6.834260732777736e-05, "loss": 0.1302, "step": 2029 }, { "epoch": 1.927367671492998, "grad_norm": 0.05021713301539421, "learning_rate": 6.823780733995557e-05, "loss": 0.1863, "step": 2030 }, { "epoch": 1.9283171136957038, "grad_norm": 0.030026502907276154, "learning_rate": 6.813304613647845e-05, "loss": 0.1349, "step": 2031 }, { "epoch": 1.9292665558984097, "grad_norm": 0.03538592904806137, "learning_rate": 6.802832384526836e-05, "loss": 0.1374, "step": 2032 }, { "epoch": 1.9302159981011155, "grad_norm": 0.027488164603710175, "learning_rate": 6.792364059420012e-05, "loss": 0.1237, "step": 2033 }, { "epoch": 1.9311654403038214, "grad_norm": 0.03553836792707443, "learning_rate": 6.781899651110091e-05, "loss": 0.1522, "step": 2034 }, { "epoch": 1.9321148825065273, "grad_norm": 0.029753949493169785, "learning_rate": 6.771439172375007e-05, "loss": 0.1222, "step": 2035 }, { "epoch": 1.9330643247092332, "grad_norm": 0.03108718991279602, "learning_rate": 6.760982635987899e-05, "loss": 0.1186, "step": 2036 }, { "epoch": 1.9340137669119393, "grad_norm": 0.02662482298910618, "learning_rate": 6.750530054717088e-05, "loss": 0.1189, "step": 2037 }, { "epoch": 1.9349632091146451, "grad_norm": 0.029288165271282196, "learning_rate": 6.740081441326062e-05, "loss": 0.1179, "step": 2038 }, { "epoch": 1.935912651317351, "grad_norm": 0.03478897735476494, "learning_rate": 6.729636808573476e-05, "loss": 0.1249, "step": 2039 }, { "epoch": 1.9368620935200571, "grad_norm": 0.03969739004969597, "learning_rate": 6.719196169213114e-05, "loss": 0.1579, "step": 2040 }, { "epoch": 1.937811535722763, "grad_norm": 0.030195200815796852, "learning_rate": 6.708759535993884e-05, "loss": 0.115, "step": 2041 }, { "epoch": 1.9387609779254689, "grad_norm": 0.03426138311624527, "learning_rate": 6.698326921659808e-05, "loss": 0.1266, "step": 2042 }, { "epoch": 1.9397104201281747, "grad_norm": 0.05202037841081619, "learning_rate": 6.687898338949998e-05, "loss": 0.193, "step": 2043 }, { "epoch": 1.9406598623308806, "grad_norm": 0.027649085968732834, "learning_rate": 6.67747380059864e-05, "loss": 0.1222, "step": 2044 }, { "epoch": 1.9416093045335865, "grad_norm": 0.026928169652819633, "learning_rate": 6.667053319334982e-05, "loss": 0.1204, "step": 2045 }, { "epoch": 1.9425587467362924, "grad_norm": 0.056547269225120544, "learning_rate": 6.656636907883325e-05, "loss": 0.1602, "step": 2046 }, { "epoch": 1.9435081889389982, "grad_norm": 0.026589645072817802, "learning_rate": 6.646224578962993e-05, "loss": 0.1214, "step": 2047 }, { "epoch": 1.9444576311417041, "grad_norm": 0.02858765795826912, "learning_rate": 6.635816345288329e-05, "loss": 0.1242, "step": 2048 }, { "epoch": 1.9454070733444102, "grad_norm": 0.04160701856017113, "learning_rate": 6.625412219568668e-05, "loss": 0.1606, "step": 2049 }, { "epoch": 1.946356515547116, "grad_norm": 0.03329680487513542, "learning_rate": 6.615012214508336e-05, "loss": 0.1346, "step": 2050 }, { "epoch": 1.947305957749822, "grad_norm": 0.041767850518226624, "learning_rate": 6.604616342806632e-05, "loss": 0.1566, "step": 2051 }, { "epoch": 1.948255399952528, "grad_norm": 0.027340400964021683, "learning_rate": 6.594224617157795e-05, "loss": 0.1253, "step": 2052 }, { "epoch": 1.949204842155234, "grad_norm": 0.06383645534515381, "learning_rate": 6.583837050251012e-05, "loss": 0.1518, "step": 2053 }, { "epoch": 1.9501542843579398, "grad_norm": 0.051231034100055695, "learning_rate": 6.573453654770383e-05, "loss": 0.1565, "step": 2054 }, { "epoch": 1.9501542843579398, "eval_loss": 0.37301480770111084, "eval_runtime": 38.0432, "eval_samples_per_second": 2.261, "eval_steps_per_second": 2.261, "step": 2054 }, { "epoch": 1.9513410871113221, "grad_norm": 0.4340180456638336, "learning_rate": 0.00013378114170405474, "loss": 0.3769, "step": 2055 }, { "epoch": 1.952290529314028, "grad_norm": 0.2337399125099182, "learning_rate": 0.00013372497405242763, "loss": 0.3148, "step": 2056 }, { "epoch": 1.953239971516734, "grad_norm": 0.15890651941299438, "learning_rate": 0.00013366879439324493, "loss": 0.3167, "step": 2057 }, { "epoch": 1.95418941371944, "grad_norm": 0.14962317049503326, "learning_rate": 0.00013361260274650906, "loss": 0.3146, "step": 2058 }, { "epoch": 1.9551388559221459, "grad_norm": 5.296742916107178, "learning_rate": 0.00013355639913222668, "loss": 0.4622, "step": 2059 }, { "epoch": 1.9560882981248517, "grad_norm": 7.226221084594727, "learning_rate": 0.0001335001835704087, "loss": 1.4115, "step": 2060 }, { "epoch": 1.9570377403275576, "grad_norm": 3.240274667739868, "learning_rate": 0.00013344395608107031, "loss": 1.2552, "step": 2061 }, { "epoch": 1.9579871825302635, "grad_norm": 2.299501657485962, "learning_rate": 0.00013338771668423095, "loss": 0.3784, "step": 2062 }, { "epoch": 1.9589366247329694, "grad_norm": 2.4971210956573486, "learning_rate": 0.00013333146539991431, "loss": 0.6146, "step": 2063 }, { "epoch": 1.9598860669356752, "grad_norm": 1.7239331007003784, "learning_rate": 0.00013327520224814822, "loss": 0.4257, "step": 2064 }, { "epoch": 1.9608355091383811, "grad_norm": 0.29740026593208313, "learning_rate": 0.00013321892724896484, "loss": 0.3187, "step": 2065 }, { "epoch": 1.961784951341087, "grad_norm": 8.102334022521973, "learning_rate": 0.0001331626404224005, "loss": 0.481, "step": 2066 }, { "epoch": 1.9627343935437929, "grad_norm": 0.29957181215286255, "learning_rate": 0.0001331063417884958, "loss": 0.3117, "step": 2067 }, { "epoch": 1.963683835746499, "grad_norm": 2.238389730453491, "learning_rate": 0.00013305003136729552, "loss": 0.3736, "step": 2068 }, { "epoch": 1.9646332779492048, "grad_norm": 0.37475112080574036, "learning_rate": 0.0001329937091788485, "loss": 0.3093, "step": 2069 }, { "epoch": 1.9655827201519107, "grad_norm": 1.1860514879226685, "learning_rate": 0.00013293737524320797, "loss": 0.3951, "step": 2070 }, { "epoch": 1.9665321623546168, "grad_norm": 0.2817871868610382, "learning_rate": 0.00013288102958043126, "loss": 0.3127, "step": 2071 }, { "epoch": 1.9674816045573227, "grad_norm": 0.3843158483505249, "learning_rate": 0.00013282467221057984, "loss": 0.2984, "step": 2072 }, { "epoch": 1.9684310467600286, "grad_norm": 0.40091991424560547, "learning_rate": 0.0001327683031537194, "loss": 0.2966, "step": 2073 }, { "epoch": 1.9693804889627344, "grad_norm": 0.34588465094566345, "learning_rate": 0.00013271192242991976, "loss": 0.3163, "step": 2074 }, { "epoch": 1.9703299311654403, "grad_norm": 0.19917060434818268, "learning_rate": 0.00013265553005925492, "loss": 0.3001, "step": 2075 }, { "epoch": 1.9712793733681462, "grad_norm": 0.17843176424503326, "learning_rate": 0.00013259912606180301, "loss": 0.3018, "step": 2076 }, { "epoch": 1.972228815570852, "grad_norm": 0.10518278181552887, "learning_rate": 0.00013254271045764636, "loss": 0.2883, "step": 2077 }, { "epoch": 1.973178257773558, "grad_norm": 0.16444529592990875, "learning_rate": 0.00013248628326687124, "loss": 0.3041, "step": 2078 }, { "epoch": 1.9741276999762638, "grad_norm": 0.1926691085100174, "learning_rate": 0.00013242984450956828, "loss": 0.2763, "step": 2079 }, { "epoch": 1.97507714217897, "grad_norm": 0.24896161258220673, "learning_rate": 0.00013237339420583212, "loss": 0.2895, "step": 2080 }, { "epoch": 1.9760265843816758, "grad_norm": 0.23915739357471466, "learning_rate": 0.00013231693237576148, "loss": 0.2901, "step": 2081 }, { "epoch": 1.9769760265843817, "grad_norm": 0.08436968922615051, "learning_rate": 0.00013226045903945926, "loss": 0.278, "step": 2082 }, { "epoch": 1.9779254687870877, "grad_norm": 0.9301303625106812, "learning_rate": 0.00013220397421703247, "loss": 0.316, "step": 2083 }, { "epoch": 1.9788749109897936, "grad_norm": 1.2519832849502563, "learning_rate": 0.00013214747792859201, "loss": 0.2931, "step": 2084 }, { "epoch": 1.9798243531924995, "grad_norm": 0.4805239140987396, "learning_rate": 0.00013209097019425316, "loss": 0.4146, "step": 2085 }, { "epoch": 1.9807737953952054, "grad_norm": 0.21979232132434845, "learning_rate": 0.00013203445103413507, "loss": 0.3, "step": 2086 }, { "epoch": 1.9817232375979112, "grad_norm": 0.1640891432762146, "learning_rate": 0.000131977920468361, "loss": 0.2969, "step": 2087 }, { "epoch": 1.9826726798006171, "grad_norm": 0.17019522190093994, "learning_rate": 0.0001319213785170583, "loss": 0.2914, "step": 2088 }, { "epoch": 1.983622122003323, "grad_norm": 0.09475825726985931, "learning_rate": 0.00013186482520035839, "loss": 0.297, "step": 2089 }, { "epoch": 1.9845715642060289, "grad_norm": 0.09213607013225555, "learning_rate": 0.00013180826053839668, "loss": 0.288, "step": 2090 }, { "epoch": 1.9855210064087347, "grad_norm": 0.11374935507774353, "learning_rate": 0.00013175168455131263, "loss": 0.2796, "step": 2091 }, { "epoch": 1.9864704486114406, "grad_norm": 0.10812429338693619, "learning_rate": 0.0001316950972592498, "loss": 0.3057, "step": 2092 }, { "epoch": 1.9874198908141467, "grad_norm": 0.07910951226949692, "learning_rate": 0.00013163849868235564, "loss": 0.2877, "step": 2093 }, { "epoch": 1.9883693330168526, "grad_norm": 0.09240693598985672, "learning_rate": 0.00013158188884078182, "loss": 0.2906, "step": 2094 }, { "epoch": 1.9893187752195585, "grad_norm": 0.097608283162117, "learning_rate": 0.00013152526775468378, "loss": 0.2906, "step": 2095 }, { "epoch": 1.9902682174222646, "grad_norm": 0.2190292775630951, "learning_rate": 0.00013146863544422118, "loss": 0.2835, "step": 2096 }, { "epoch": 1.9912176596249704, "grad_norm": 0.07066213339567184, "learning_rate": 0.00013141199192955751, "loss": 0.2856, "step": 2097 }, { "epoch": 1.9921671018276763, "grad_norm": 0.10716898739337921, "learning_rate": 0.0001313553372308604, "loss": 0.3012, "step": 2098 }, { "epoch": 1.9931165440303822, "grad_norm": 0.07971798628568649, "learning_rate": 0.00013129867136830127, "loss": 0.2678, "step": 2099 }, { "epoch": 1.994065986233088, "grad_norm": 0.11225918680429459, "learning_rate": 0.00013124199436205576, "loss": 0.2799, "step": 2100 }, { "epoch": 1.995015428435794, "grad_norm": 0.08741844445466995, "learning_rate": 0.00013118530623230327, "loss": 0.284, "step": 2101 }, { "epoch": 1.9959648706384998, "grad_norm": 0.07644308358430862, "learning_rate": 0.00013112860699922722, "loss": 0.2988, "step": 2102 }, { "epoch": 1.9969143128412057, "grad_norm": 0.07610399276018143, "learning_rate": 0.00013107189668301508, "loss": 0.2813, "step": 2103 }, { "epoch": 1.9978637550439116, "grad_norm": 0.1364275962114334, "learning_rate": 0.0001310151753038581, "loss": 0.3006, "step": 2104 }, { "epoch": 1.9988131972466177, "grad_norm": 0.06598393619060516, "learning_rate": 0.0001309584428819516, "loss": 0.2822, "step": 2105 }, { "epoch": 1.9997626394493235, "grad_norm": 0.08182472735643387, "learning_rate": 0.00013090169943749476, "loss": 0.2757, "step": 2106 }, { "epoch": 2.0007120816520296, "grad_norm": 0.06756250560283661, "learning_rate": 0.0001308449449906907, "loss": 0.2619, "step": 2107 }, { "epoch": 2.0016615238547355, "grad_norm": 0.05981763079762459, "learning_rate": 0.00013078817956174656, "loss": 0.2856, "step": 2108 }, { "epoch": 2.0026109660574414, "grad_norm": 0.0790615975856781, "learning_rate": 0.0001307314031708732, "loss": 0.2875, "step": 2109 }, { "epoch": 2.0035604082601473, "grad_norm": 0.06421328336000443, "learning_rate": 0.00013067461583828553, "loss": 0.2683, "step": 2110 }, { "epoch": 2.004509850462853, "grad_norm": 0.06607569754123688, "learning_rate": 0.0001306178175842023, "loss": 0.271, "step": 2111 }, { "epoch": 2.005459292665559, "grad_norm": 0.06524945050477982, "learning_rate": 0.00013056100842884612, "loss": 0.2796, "step": 2112 }, { "epoch": 2.006408734868265, "grad_norm": 0.05927155539393425, "learning_rate": 0.00013050418839244355, "loss": 0.2755, "step": 2113 }, { "epoch": 2.0073581770709708, "grad_norm": 0.06408464163541794, "learning_rate": 0.000130447357495225, "loss": 0.2748, "step": 2114 }, { "epoch": 2.0083076192736766, "grad_norm": 0.05964144691824913, "learning_rate": 0.0001303905157574247, "loss": 0.2772, "step": 2115 }, { "epoch": 2.0092570614763825, "grad_norm": 0.05294380709528923, "learning_rate": 0.00013033366319928079, "loss": 0.2543, "step": 2116 }, { "epoch": 2.0102065036790884, "grad_norm": 0.06316480785608292, "learning_rate": 0.00013027679984103528, "loss": 0.2659, "step": 2117 }, { "epoch": 2.0111559458817942, "grad_norm": 0.0780426636338234, "learning_rate": 0.000130219925702934, "loss": 0.2809, "step": 2118 }, { "epoch": 2.0121053880845006, "grad_norm": 0.05921616032719612, "learning_rate": 0.00013016304080522656, "loss": 0.2651, "step": 2119 }, { "epoch": 2.0130548302872064, "grad_norm": 0.063509002327919, "learning_rate": 0.0001301061451681665, "loss": 0.2766, "step": 2120 }, { "epoch": 2.0140042724899123, "grad_norm": 0.06251564621925354, "learning_rate": 0.0001300492388120111, "loss": 0.2826, "step": 2121 }, { "epoch": 2.014953714692618, "grad_norm": 0.07721933722496033, "learning_rate": 0.0001299923217570215, "loss": 0.2876, "step": 2122 }, { "epoch": 2.015903156895324, "grad_norm": 0.22655093669891357, "learning_rate": 0.0001299353940234627, "loss": 0.3023, "step": 2123 }, { "epoch": 2.01685259909803, "grad_norm": 0.16343270242214203, "learning_rate": 0.0001298784556316034, "loss": 0.2902, "step": 2124 }, { "epoch": 2.017802041300736, "grad_norm": 0.0674663856625557, "learning_rate": 0.00012982150660171613, "loss": 0.2639, "step": 2125 }, { "epoch": 2.0187514835034417, "grad_norm": 0.06698331236839294, "learning_rate": 0.00012976454695407723, "loss": 0.2918, "step": 2126 }, { "epoch": 2.0197009257061476, "grad_norm": 0.05850343778729439, "learning_rate": 0.00012970757670896683, "loss": 0.2691, "step": 2127 }, { "epoch": 2.0206503679088534, "grad_norm": 0.05704069882631302, "learning_rate": 0.0001296505958866688, "loss": 0.2688, "step": 2128 }, { "epoch": 2.0215998101115593, "grad_norm": 0.060891758650541306, "learning_rate": 0.00012959360450747075, "loss": 0.2652, "step": 2129 }, { "epoch": 2.022549252314265, "grad_norm": 0.061691515147686005, "learning_rate": 0.00012953660259166412, "loss": 0.2756, "step": 2130 }, { "epoch": 2.023498694516971, "grad_norm": 0.059189558029174805, "learning_rate": 0.00012947959015954406, "loss": 0.2759, "step": 2131 }, { "epoch": 2.0244481367196774, "grad_norm": 0.06447713077068329, "learning_rate": 0.00012942256723140952, "loss": 0.2773, "step": 2132 }, { "epoch": 2.0253975789223833, "grad_norm": 0.06263953447341919, "learning_rate": 0.0001293655338275631, "loss": 0.2867, "step": 2133 }, { "epoch": 2.026347021125089, "grad_norm": 0.0576293058693409, "learning_rate": 0.00012930848996831114, "loss": 0.2776, "step": 2134 }, { "epoch": 2.027296463327795, "grad_norm": 0.05699608847498894, "learning_rate": 0.00012925143567396374, "loss": 0.2757, "step": 2135 }, { "epoch": 2.028245905530501, "grad_norm": 0.052561014890670776, "learning_rate": 0.00012919437096483476, "loss": 0.2555, "step": 2136 }, { "epoch": 2.0291953477332068, "grad_norm": 0.053198445588350296, "learning_rate": 0.00012913729586124165, "loss": 0.2676, "step": 2137 }, { "epoch": 2.0301447899359126, "grad_norm": 0.09329196810722351, "learning_rate": 0.00012908021038350568, "loss": 0.2796, "step": 2138 }, { "epoch": 2.0310942321386185, "grad_norm": 0.07239534705877304, "learning_rate": 0.00012902311455195172, "loss": 0.2809, "step": 2139 }, { "epoch": 2.0320436743413244, "grad_norm": 0.06299670785665512, "learning_rate": 0.00012896600838690838, "loss": 0.2672, "step": 2140 }, { "epoch": 2.0329931165440303, "grad_norm": 0.05467437952756882, "learning_rate": 0.00012890889190870795, "loss": 0.268, "step": 2141 }, { "epoch": 2.033942558746736, "grad_norm": 0.0641472190618515, "learning_rate": 0.00012885176513768637, "loss": 0.2844, "step": 2142 }, { "epoch": 2.034892000949442, "grad_norm": 0.06481951475143433, "learning_rate": 0.00012879462809418325, "loss": 0.2883, "step": 2143 }, { "epoch": 2.0358414431521483, "grad_norm": 0.05889345332980156, "learning_rate": 0.0001287374807985418, "loss": 0.2688, "step": 2144 }, { "epoch": 2.036790885354854, "grad_norm": 0.05446067079901695, "learning_rate": 0.00012868032327110904, "loss": 0.2699, "step": 2145 }, { "epoch": 2.03774032755756, "grad_norm": 0.0558142326772213, "learning_rate": 0.00012862315553223547, "loss": 0.2662, "step": 2146 }, { "epoch": 2.038689769760266, "grad_norm": 0.05485325679183006, "learning_rate": 0.0001285659776022753, "loss": 0.2684, "step": 2147 }, { "epoch": 2.039639211962972, "grad_norm": 0.05541551858186722, "learning_rate": 0.0001285087895015864, "loss": 0.2674, "step": 2148 }, { "epoch": 2.0405886541656777, "grad_norm": 0.10139881074428558, "learning_rate": 0.0001284515912505301, "loss": 0.2737, "step": 2149 }, { "epoch": 2.0415380963683836, "grad_norm": 0.05179375782608986, "learning_rate": 0.00012839438286947163, "loss": 0.2647, "step": 2150 }, { "epoch": 2.0424875385710894, "grad_norm": 0.0590873584151268, "learning_rate": 0.0001283371643787795, "loss": 0.2743, "step": 2151 }, { "epoch": 2.0434369807737953, "grad_norm": 0.0546240359544754, "learning_rate": 0.00012827993579882612, "loss": 0.2798, "step": 2152 }, { "epoch": 2.044386422976501, "grad_norm": 0.056896887719631195, "learning_rate": 0.0001282226971499872, "loss": 0.2717, "step": 2153 }, { "epoch": 2.045335865179207, "grad_norm": 0.052284859120845795, "learning_rate": 0.00012816544845264228, "loss": 0.2719, "step": 2154 }, { "epoch": 2.046285307381913, "grad_norm": 0.060961298644542694, "learning_rate": 0.0001281081897271744, "loss": 0.2764, "step": 2155 }, { "epoch": 2.047234749584619, "grad_norm": 0.08830570429563522, "learning_rate": 0.0001280509209939701, "loss": 0.2946, "step": 2156 }, { "epoch": 2.048184191787325, "grad_norm": 0.05548688769340515, "learning_rate": 0.00012799364227341955, "loss": 0.2647, "step": 2157 }, { "epoch": 2.049133633990031, "grad_norm": 0.05134082958102226, "learning_rate": 0.00012793635358591645, "loss": 0.2724, "step": 2158 }, { "epoch": 2.050083076192737, "grad_norm": 0.06974118202924728, "learning_rate": 0.0001278790549518581, "loss": 0.2873, "step": 2159 }, { "epoch": 2.0510325183954428, "grad_norm": 0.06583964079618454, "learning_rate": 0.0001278217463916453, "loss": 0.2823, "step": 2160 }, { "epoch": 2.0519819605981486, "grad_norm": 0.05401783436536789, "learning_rate": 0.00012776442792568232, "loss": 0.2788, "step": 2161 }, { "epoch": 2.0529314028008545, "grad_norm": 0.09343112260103226, "learning_rate": 0.00012770709957437708, "loss": 0.2824, "step": 2162 }, { "epoch": 2.0538808450035604, "grad_norm": 0.11026190966367722, "learning_rate": 0.00012764976135814094, "loss": 0.2861, "step": 2163 }, { "epoch": 2.0548302872062663, "grad_norm": 0.05160842835903168, "learning_rate": 0.00012759241329738887, "loss": 0.2615, "step": 2164 }, { "epoch": 2.055779729408972, "grad_norm": 0.057216208428144455, "learning_rate": 0.00012753505541253916, "loss": 0.2757, "step": 2165 }, { "epoch": 2.056729171611678, "grad_norm": 0.07923352718353271, "learning_rate": 0.00012747768772401378, "loss": 0.2659, "step": 2166 }, { "epoch": 2.057678613814384, "grad_norm": 0.055502623319625854, "learning_rate": 0.0001274203102522381, "loss": 0.2757, "step": 2167 }, { "epoch": 2.0586280560170898, "grad_norm": 0.10472196340560913, "learning_rate": 0.00012736292301764098, "loss": 0.2938, "step": 2168 }, { "epoch": 2.059577498219796, "grad_norm": 0.1105305552482605, "learning_rate": 0.00012730552604065475, "loss": 0.2846, "step": 2169 }, { "epoch": 2.060526940422502, "grad_norm": 0.06079312413930893, "learning_rate": 0.0001272481193417153, "loss": 0.2724, "step": 2170 }, { "epoch": 2.061476382625208, "grad_norm": 0.06276509910821915, "learning_rate": 0.00012719070294126182, "loss": 0.2704, "step": 2171 }, { "epoch": 2.0624258248279137, "grad_norm": 0.08746016025543213, "learning_rate": 0.00012713327685973707, "loss": 0.2834, "step": 2172 }, { "epoch": 2.0633752670306196, "grad_norm": 0.053869761526584625, "learning_rate": 0.0001270758411175873, "loss": 0.2712, "step": 2173 }, { "epoch": 2.0643247092333254, "grad_norm": 0.05118397995829582, "learning_rate": 0.00012701839573526206, "loss": 0.2737, "step": 2174 }, { "epoch": 2.0652741514360313, "grad_norm": 0.05905655771493912, "learning_rate": 0.0001269609407332144, "loss": 0.2663, "step": 2175 }, { "epoch": 2.066223593638737, "grad_norm": 0.049641139805316925, "learning_rate": 0.00012690347613190082, "loss": 0.263, "step": 2176 }, { "epoch": 2.067173035841443, "grad_norm": 0.04823688417673111, "learning_rate": 0.00012684600195178117, "loss": 0.2667, "step": 2177 }, { "epoch": 2.068122478044149, "grad_norm": 0.07979489117860794, "learning_rate": 0.00012678851821331882, "loss": 0.2854, "step": 2178 }, { "epoch": 2.069071920246855, "grad_norm": 0.06123083457350731, "learning_rate": 0.00012673102493698042, "loss": 0.2832, "step": 2179 }, { "epoch": 2.0700213624495607, "grad_norm": 0.07498030364513397, "learning_rate": 0.00012667352214323614, "loss": 0.3061, "step": 2180 }, { "epoch": 2.0709708046522666, "grad_norm": 0.059050336480140686, "learning_rate": 0.0001266160098525594, "loss": 0.2623, "step": 2181 }, { "epoch": 2.071920246854973, "grad_norm": 0.060739047825336456, "learning_rate": 0.00012655848808542709, "loss": 0.282, "step": 2182 }, { "epoch": 2.0728696890576788, "grad_norm": 0.059133414179086685, "learning_rate": 0.00012650095686231953, "loss": 0.2637, "step": 2183 }, { "epoch": 2.0738191312603846, "grad_norm": 0.05270388349890709, "learning_rate": 0.00012644341620372023, "loss": 0.2742, "step": 2184 }, { "epoch": 2.0747685734630905, "grad_norm": 0.049184754490852356, "learning_rate": 0.00012638586613011624, "loss": 0.2582, "step": 2185 }, { "epoch": 2.0757180156657964, "grad_norm": 0.05757623910903931, "learning_rate": 0.0001263283066619978, "loss": 0.2672, "step": 2186 }, { "epoch": 2.0766674578685023, "grad_norm": 0.051976773887872696, "learning_rate": 0.0001262707378198587, "loss": 0.2769, "step": 2187 }, { "epoch": 2.077616900071208, "grad_norm": 0.04786711558699608, "learning_rate": 0.00012621315962419585, "loss": 0.2661, "step": 2188 }, { "epoch": 2.078566342273914, "grad_norm": 0.0624409057199955, "learning_rate": 0.00012615557209550967, "loss": 0.2867, "step": 2189 }, { "epoch": 2.07951578447662, "grad_norm": 0.05563337355852127, "learning_rate": 0.00012609797525430373, "loss": 0.2778, "step": 2190 }, { "epoch": 2.0804652266793258, "grad_norm": 0.04968985542654991, "learning_rate": 0.00012604036912108505, "loss": 0.2562, "step": 2191 }, { "epoch": 2.0814146688820316, "grad_norm": 0.05211299657821655, "learning_rate": 0.00012598275371636394, "loss": 0.2746, "step": 2192 }, { "epoch": 2.0823641110847375, "grad_norm": 0.0466628223657608, "learning_rate": 0.00012592512906065397, "loss": 0.2654, "step": 2193 }, { "epoch": 2.083313553287444, "grad_norm": 0.056648485362529755, "learning_rate": 0.000125867495174472, "loss": 0.28, "step": 2194 }, { "epoch": 2.0842629954901497, "grad_norm": 0.056760817766189575, "learning_rate": 0.0001258098520783382, "loss": 0.2732, "step": 2195 }, { "epoch": 2.0852124376928556, "grad_norm": 0.05097498744726181, "learning_rate": 0.00012575219979277602, "loss": 0.261, "step": 2196 }, { "epoch": 2.0861618798955615, "grad_norm": 0.05032607540488243, "learning_rate": 0.00012569453833831222, "loss": 0.2769, "step": 2197 }, { "epoch": 2.0871113220982673, "grad_norm": 0.04438967630267143, "learning_rate": 0.00012563686773547675, "loss": 0.2561, "step": 2198 }, { "epoch": 2.088060764300973, "grad_norm": 0.05397673696279526, "learning_rate": 0.00012557918800480282, "loss": 0.2712, "step": 2199 }, { "epoch": 2.089010206503679, "grad_norm": 0.05158831924200058, "learning_rate": 0.00012552149916682695, "loss": 0.2685, "step": 2200 }, { "epoch": 2.089959648706385, "grad_norm": 0.06279024481773376, "learning_rate": 0.00012546380124208887, "loss": 0.2722, "step": 2201 }, { "epoch": 2.090909090909091, "grad_norm": 0.04665720462799072, "learning_rate": 0.00012540609425113156, "loss": 0.2604, "step": 2202 }, { "epoch": 2.0918585331117967, "grad_norm": 0.059546615928411484, "learning_rate": 0.00012534837821450117, "loss": 0.2721, "step": 2203 }, { "epoch": 2.0928079753145026, "grad_norm": 0.0592176578938961, "learning_rate": 0.0001252906531527472, "loss": 0.2716, "step": 2204 }, { "epoch": 2.0937574175172085, "grad_norm": 0.04968995600938797, "learning_rate": 0.00012523291908642217, "loss": 0.2474, "step": 2205 }, { "epoch": 2.0947068597199143, "grad_norm": 0.052708033472299576, "learning_rate": 0.00012517517603608203, "loss": 0.2668, "step": 2206 }, { "epoch": 2.0956563019226206, "grad_norm": 0.06978727877140045, "learning_rate": 0.0001251174240222857, "loss": 0.2729, "step": 2207 }, { "epoch": 2.0966057441253265, "grad_norm": 0.061792004853487015, "learning_rate": 0.0001250596630655955, "loss": 0.2706, "step": 2208 }, { "epoch": 2.0975551863280324, "grad_norm": 0.05177111551165581, "learning_rate": 0.00012500189318657675, "loss": 0.2759, "step": 2209 }, { "epoch": 2.0985046285307383, "grad_norm": 0.05225459113717079, "learning_rate": 0.00012494411440579814, "loss": 0.2662, "step": 2210 }, { "epoch": 2.099454070733444, "grad_norm": 0.046468012034893036, "learning_rate": 0.00012488632674383134, "loss": 0.2712, "step": 2211 }, { "epoch": 2.10040351293615, "grad_norm": 0.044963154941797256, "learning_rate": 0.00012482853022125132, "loss": 0.2685, "step": 2212 }, { "epoch": 2.101352955138856, "grad_norm": 0.04540163278579712, "learning_rate": 0.00012477072485863613, "loss": 0.2625, "step": 2213 }, { "epoch": 2.1023023973415618, "grad_norm": 0.05036984756588936, "learning_rate": 0.00012471291067656697, "loss": 0.271, "step": 2214 }, { "epoch": 2.1032518395442676, "grad_norm": 0.05176820978522301, "learning_rate": 0.00012465508769562823, "loss": 0.2819, "step": 2215 }, { "epoch": 2.1042012817469735, "grad_norm": 0.07643739879131317, "learning_rate": 0.0001245972559364074, "loss": 0.2868, "step": 2216 }, { "epoch": 2.1051507239496794, "grad_norm": 0.04632152244448662, "learning_rate": 0.0001245394154194951, "loss": 0.2686, "step": 2217 }, { "epoch": 2.1061001661523853, "grad_norm": 0.0568397156894207, "learning_rate": 0.00012448156616548506, "loss": 0.2626, "step": 2218 }, { "epoch": 2.1070496083550916, "grad_norm": 0.0479881577193737, "learning_rate": 0.0001244237081949741, "loss": 0.258, "step": 2219 }, { "epoch": 2.1079990505577975, "grad_norm": 0.05392912030220032, "learning_rate": 0.0001243658415285622, "loss": 0.2601, "step": 2220 }, { "epoch": 2.1089484927605033, "grad_norm": 0.055512312799692154, "learning_rate": 0.00012430796618685244, "loss": 0.2748, "step": 2221 }, { "epoch": 2.109897934963209, "grad_norm": 0.05387381836771965, "learning_rate": 0.00012425008219045088, "loss": 0.2764, "step": 2222 }, { "epoch": 2.110847377165915, "grad_norm": 0.05436617136001587, "learning_rate": 0.00012419218955996676, "loss": 0.2746, "step": 2223 }, { "epoch": 2.111796819368621, "grad_norm": 0.06682915985584259, "learning_rate": 0.00012413428831601245, "loss": 0.2932, "step": 2224 }, { "epoch": 2.112746261571327, "grad_norm": 0.048501718789339066, "learning_rate": 0.0001240763784792032, "loss": 0.2506, "step": 2225 }, { "epoch": 2.1136957037740327, "grad_norm": 0.050359319895505905, "learning_rate": 0.0001240184600701575, "loss": 0.2746, "step": 2226 }, { "epoch": 2.1146451459767386, "grad_norm": 0.061569176614284515, "learning_rate": 0.00012396053310949673, "loss": 0.2874, "step": 2227 }, { "epoch": 2.1155945881794445, "grad_norm": 0.053734250366687775, "learning_rate": 0.00012390259761784552, "loss": 0.2691, "step": 2228 }, { "epoch": 2.1165440303821503, "grad_norm": 0.07295026630163193, "learning_rate": 0.00012384465361583134, "loss": 0.2892, "step": 2229 }, { "epoch": 2.117493472584856, "grad_norm": 0.07634708285331726, "learning_rate": 0.0001237867011240848, "loss": 0.2754, "step": 2230 }, { "epoch": 2.118442914787562, "grad_norm": 0.08975531905889511, "learning_rate": 0.00012372874016323951, "loss": 0.2806, "step": 2231 }, { "epoch": 2.1193923569902684, "grad_norm": 0.07255006581544876, "learning_rate": 0.0001236707707539321, "loss": 0.3048, "step": 2232 }, { "epoch": 2.1203417991929743, "grad_norm": 0.049513548612594604, "learning_rate": 0.00012361279291680214, "loss": 0.2512, "step": 2233 }, { "epoch": 2.12129124139568, "grad_norm": 0.05127749219536781, "learning_rate": 0.00012355480667249232, "loss": 0.2614, "step": 2234 }, { "epoch": 2.122240683598386, "grad_norm": 0.05769433453679085, "learning_rate": 0.00012349681204164824, "loss": 0.2564, "step": 2235 }, { "epoch": 2.123190125801092, "grad_norm": 0.0693785548210144, "learning_rate": 0.00012343880904491848, "loss": 0.2714, "step": 2236 }, { "epoch": 2.1241395680037978, "grad_norm": 0.056581392884254456, "learning_rate": 0.00012338079770295466, "loss": 0.2684, "step": 2237 }, { "epoch": 2.1250890102065036, "grad_norm": 0.06454044580459595, "learning_rate": 0.00012332277803641135, "loss": 0.2638, "step": 2238 }, { "epoch": 2.1260384524092095, "grad_norm": 0.05345448851585388, "learning_rate": 0.00012326475006594606, "loss": 0.2638, "step": 2239 }, { "epoch": 2.1269878946119154, "grad_norm": 0.05101858824491501, "learning_rate": 0.0001232067138122192, "loss": 0.2654, "step": 2240 }, { "epoch": 2.1279373368146213, "grad_norm": 0.051356613636016846, "learning_rate": 0.00012314866929589432, "loss": 0.2598, "step": 2241 }, { "epoch": 2.128886779017327, "grad_norm": 0.07914724946022034, "learning_rate": 0.0001230906165376377, "loss": 0.2704, "step": 2242 }, { "epoch": 2.1298362212200335, "grad_norm": 0.05196173116564751, "learning_rate": 0.00012303255555811866, "loss": 0.2676, "step": 2243 }, { "epoch": 2.1307856634227393, "grad_norm": 0.0463208444416523, "learning_rate": 0.00012297448637800943, "loss": 0.2658, "step": 2244 }, { "epoch": 2.131735105625445, "grad_norm": 0.0464724637567997, "learning_rate": 0.0001229164090179852, "loss": 0.2739, "step": 2245 }, { "epoch": 2.132684547828151, "grad_norm": 0.044462431222200394, "learning_rate": 0.000122858323498724, "loss": 0.2724, "step": 2246 }, { "epoch": 2.133633990030857, "grad_norm": 0.059855107218027115, "learning_rate": 0.00012280022984090675, "loss": 0.2885, "step": 2247 }, { "epoch": 2.134583432233563, "grad_norm": 0.05929682031273842, "learning_rate": 0.0001227421280652174, "loss": 0.2774, "step": 2248 }, { "epoch": 2.1355328744362687, "grad_norm": 0.05959324166178703, "learning_rate": 0.0001226840181923427, "loss": 0.285, "step": 2249 }, { "epoch": 2.1364823166389746, "grad_norm": 0.049392689019441605, "learning_rate": 0.00012262590024297225, "loss": 0.2723, "step": 2250 }, { "epoch": 2.1374317588416805, "grad_norm": 0.05052879452705383, "learning_rate": 0.00012256777423779851, "loss": 0.267, "step": 2251 }, { "epoch": 2.1383812010443863, "grad_norm": 0.04806723818182945, "learning_rate": 0.00012250964019751696, "loss": 0.2865, "step": 2252 }, { "epoch": 2.139330643247092, "grad_norm": 0.06047017127275467, "learning_rate": 0.00012245149814282583, "loss": 0.2818, "step": 2253 }, { "epoch": 2.140280085449798, "grad_norm": 0.04983370006084442, "learning_rate": 0.0001223933480944262, "loss": 0.2511, "step": 2254 }, { "epoch": 2.141229527652504, "grad_norm": 0.05417335778474808, "learning_rate": 0.00012233519007302202, "loss": 0.2737, "step": 2255 }, { "epoch": 2.14217896985521, "grad_norm": 0.06751585006713867, "learning_rate": 0.00012227702409932001, "loss": 0.2818, "step": 2256 }, { "epoch": 2.143128412057916, "grad_norm": 0.13051903247833252, "learning_rate": 0.00012221885019402984, "loss": 0.2715, "step": 2257 }, { "epoch": 2.144077854260622, "grad_norm": 0.05568401888012886, "learning_rate": 0.000122160668377864, "loss": 0.2831, "step": 2258 }, { "epoch": 2.145027296463328, "grad_norm": 0.060945216566324234, "learning_rate": 0.00012210247867153765, "loss": 0.2827, "step": 2259 }, { "epoch": 2.145976738666034, "grad_norm": 0.0824795663356781, "learning_rate": 0.00012204428109576887, "loss": 0.28, "step": 2260 }, { "epoch": 2.1469261808687397, "grad_norm": 0.07995422184467316, "learning_rate": 0.00012198607567127854, "loss": 0.2945, "step": 2261 }, { "epoch": 2.1478756230714455, "grad_norm": 0.047262005507946014, "learning_rate": 0.00012192786241879033, "loss": 0.2664, "step": 2262 }, { "epoch": 2.1488250652741514, "grad_norm": 0.04705559089779854, "learning_rate": 0.0001218696413590307, "loss": 0.2495, "step": 2263 }, { "epoch": 2.1497745074768573, "grad_norm": 0.09175438433885574, "learning_rate": 0.00012181141251272885, "loss": 0.2982, "step": 2264 }, { "epoch": 2.150723949679563, "grad_norm": 0.07894378155469894, "learning_rate": 0.00012175317590061674, "loss": 0.2846, "step": 2265 }, { "epoch": 2.151673391882269, "grad_norm": 0.08106362819671631, "learning_rate": 0.00012169493154342922, "loss": 0.2836, "step": 2266 }, { "epoch": 2.152622834084975, "grad_norm": 0.04687848687171936, "learning_rate": 0.00012163667946190376, "loss": 0.2573, "step": 2267 }, { "epoch": 2.153572276287681, "grad_norm": 0.05097149685025215, "learning_rate": 0.00012157841967678063, "loss": 0.2633, "step": 2268 }, { "epoch": 2.154521718490387, "grad_norm": 0.056790512055158615, "learning_rate": 0.00012152015220880284, "loss": 0.2841, "step": 2269 }, { "epoch": 2.155471160693093, "grad_norm": 0.058457743376493454, "learning_rate": 0.00012146187707871617, "loss": 0.2911, "step": 2270 }, { "epoch": 2.156420602895799, "grad_norm": 0.04693342000246048, "learning_rate": 0.00012140359430726906, "loss": 0.2573, "step": 2271 }, { "epoch": 2.1573700450985047, "grad_norm": 0.04494727402925491, "learning_rate": 0.00012134530391521275, "loss": 0.261, "step": 2272 }, { "epoch": 2.1583194873012106, "grad_norm": 0.056293174624443054, "learning_rate": 0.00012128700592330114, "loss": 0.2954, "step": 2273 }, { "epoch": 2.1592689295039165, "grad_norm": 0.04697936400771141, "learning_rate": 0.0001212287003522908, "loss": 0.2672, "step": 2274 }, { "epoch": 2.1602183717066223, "grad_norm": 0.04936970770359039, "learning_rate": 0.0001211703872229411, "loss": 0.2756, "step": 2275 }, { "epoch": 2.161167813909328, "grad_norm": 0.1244848370552063, "learning_rate": 0.000121112066556014, "loss": 0.2897, "step": 2276 }, { "epoch": 2.162117256112034, "grad_norm": 0.04847177118062973, "learning_rate": 0.00012105373837227425, "loss": 0.276, "step": 2277 }, { "epoch": 2.16306669831474, "grad_norm": 0.053024183958768845, "learning_rate": 0.00012099540269248917, "loss": 0.2755, "step": 2278 }, { "epoch": 2.164016140517446, "grad_norm": 0.07293348014354706, "learning_rate": 0.0001209370595374288, "loss": 0.272, "step": 2279 }, { "epoch": 2.1649655827201517, "grad_norm": 0.044251903891563416, "learning_rate": 0.00012087870892786588, "loss": 0.2546, "step": 2280 }, { "epoch": 2.1659150249228576, "grad_norm": 0.0693570151925087, "learning_rate": 0.00012082035088457573, "loss": 0.2901, "step": 2281 }, { "epoch": 2.166864467125564, "grad_norm": 0.05188895761966705, "learning_rate": 0.00012076198542833632, "loss": 0.2698, "step": 2282 }, { "epoch": 2.16781390932827, "grad_norm": 0.07463851571083069, "learning_rate": 0.00012070361257992832, "loss": 0.2682, "step": 2283 }, { "epoch": 2.1687633515309757, "grad_norm": 0.05541319400072098, "learning_rate": 0.00012064523236013498, "loss": 0.2913, "step": 2284 }, { "epoch": 2.1697127937336815, "grad_norm": 0.059766124933958054, "learning_rate": 0.00012058684478974224, "loss": 0.2764, "step": 2285 }, { "epoch": 2.1706622359363874, "grad_norm": 0.06126366928219795, "learning_rate": 0.0001205284498895386, "loss": 0.2891, "step": 2286 }, { "epoch": 2.1716116781390933, "grad_norm": 0.04642792418599129, "learning_rate": 0.00012047004768031513, "loss": 0.2599, "step": 2287 }, { "epoch": 2.172561120341799, "grad_norm": 0.07450695335865021, "learning_rate": 0.00012041163818286559, "loss": 0.2758, "step": 2288 }, { "epoch": 2.173510562544505, "grad_norm": 0.05021700635552406, "learning_rate": 0.00012035322141798629, "loss": 0.2677, "step": 2289 }, { "epoch": 2.174460004747211, "grad_norm": 0.03968047723174095, "learning_rate": 0.00012029479740647613, "loss": 0.2594, "step": 2290 }, { "epoch": 2.175409446949917, "grad_norm": 0.048740074038505554, "learning_rate": 0.00012023636616913663, "loss": 0.2715, "step": 2291 }, { "epoch": 2.1763588891526227, "grad_norm": 0.042600784450769424, "learning_rate": 0.00012017792772677177, "loss": 0.2652, "step": 2292 }, { "epoch": 2.177308331355329, "grad_norm": 0.05429399758577347, "learning_rate": 0.00012011948210018827, "loss": 0.2842, "step": 2293 }, { "epoch": 2.178257773558035, "grad_norm": 0.04825136438012123, "learning_rate": 0.00012006102931019522, "loss": 0.2652, "step": 2294 }, { "epoch": 2.1792072157607407, "grad_norm": 0.06344690918922424, "learning_rate": 0.00012000256937760445, "loss": 0.2872, "step": 2295 }, { "epoch": 2.1801566579634466, "grad_norm": 0.04262791574001312, "learning_rate": 0.00011994410232323017, "loss": 0.2635, "step": 2296 }, { "epoch": 2.1811061001661525, "grad_norm": 0.046105436980724335, "learning_rate": 0.00011988562816788921, "loss": 0.2673, "step": 2297 }, { "epoch": 2.1820555423688583, "grad_norm": 0.04826546832919121, "learning_rate": 0.00011982714693240089, "loss": 0.2731, "step": 2298 }, { "epoch": 2.1830049845715642, "grad_norm": 0.04453393071889877, "learning_rate": 0.00011976865863758708, "loss": 0.2595, "step": 2299 }, { "epoch": 2.18395442677427, "grad_norm": 0.04711470380425453, "learning_rate": 0.00011971016330427215, "loss": 0.2655, "step": 2300 }, { "epoch": 2.184903868976976, "grad_norm": 0.05461464077234268, "learning_rate": 0.00011965166095328301, "loss": 0.267, "step": 2301 }, { "epoch": 2.185853311179682, "grad_norm": 0.05645698308944702, "learning_rate": 0.00011959315160544901, "loss": 0.2922, "step": 2302 }, { "epoch": 2.1868027533823877, "grad_norm": 0.04374222829937935, "learning_rate": 0.00011953463528160202, "loss": 0.2592, "step": 2303 }, { "epoch": 2.1877521955850936, "grad_norm": 0.0473443977534771, "learning_rate": 0.0001194761120025764, "loss": 0.2613, "step": 2304 }, { "epoch": 2.1887016377877995, "grad_norm": 0.0495041161775589, "learning_rate": 0.00011941758178920898, "loss": 0.2672, "step": 2305 }, { "epoch": 2.1896510799905053, "grad_norm": 0.048812057822942734, "learning_rate": 0.00011935904466233907, "loss": 0.2771, "step": 2306 }, { "epoch": 2.1906005221932117, "grad_norm": 0.06169416755437851, "learning_rate": 0.00011930050064280838, "loss": 0.2887, "step": 2307 }, { "epoch": 2.1915499643959175, "grad_norm": 0.0863117203116417, "learning_rate": 0.00011924194975146117, "loss": 0.2709, "step": 2308 }, { "epoch": 2.1924994065986234, "grad_norm": 0.05131746456027031, "learning_rate": 0.00011918339200914407, "loss": 0.27, "step": 2309 }, { "epoch": 2.1934488488013293, "grad_norm": 0.0597839280962944, "learning_rate": 0.00011912482743670624, "loss": 0.2834, "step": 2310 }, { "epoch": 2.194398291004035, "grad_norm": 0.044621869921684265, "learning_rate": 0.00011906625605499915, "loss": 0.2526, "step": 2311 }, { "epoch": 2.195347733206741, "grad_norm": 0.04936400428414345, "learning_rate": 0.00011900767788487674, "loss": 0.2632, "step": 2312 }, { "epoch": 2.196297175409447, "grad_norm": 0.0530143566429615, "learning_rate": 0.00011894909294719547, "loss": 0.278, "step": 2313 }, { "epoch": 2.197246617612153, "grad_norm": 0.053086057305336, "learning_rate": 0.00011889050126281405, "loss": 0.2652, "step": 2314 }, { "epoch": 2.1981960598148587, "grad_norm": 0.047230660915374756, "learning_rate": 0.00011883190285259369, "loss": 0.2574, "step": 2315 }, { "epoch": 2.1991455020175645, "grad_norm": 0.07178585976362228, "learning_rate": 0.00011877329773739794, "loss": 0.2919, "step": 2316 }, { "epoch": 2.2000949442202704, "grad_norm": 0.04587990790605545, "learning_rate": 0.0001187146859380928, "loss": 0.2559, "step": 2317 }, { "epoch": 2.2010443864229767, "grad_norm": 0.050480857491493225, "learning_rate": 0.00011865606747554663, "loss": 0.259, "step": 2318 }, { "epoch": 2.2019938286256826, "grad_norm": 0.06074557080864906, "learning_rate": 0.00011859744237063011, "loss": 0.2819, "step": 2319 }, { "epoch": 2.2029432708283885, "grad_norm": 0.09227973967790604, "learning_rate": 0.00011853881064421634, "loss": 0.2809, "step": 2320 }, { "epoch": 2.2038927130310944, "grad_norm": 0.05148273706436157, "learning_rate": 0.00011848017231718076, "loss": 0.2523, "step": 2321 }, { "epoch": 2.2048421552338002, "grad_norm": 0.06572956591844559, "learning_rate": 0.00011842152741040116, "loss": 0.286, "step": 2322 }, { "epoch": 2.205791597436506, "grad_norm": 0.05026514083147049, "learning_rate": 0.0001183628759447577, "loss": 0.2789, "step": 2323 }, { "epoch": 2.206741039639212, "grad_norm": 0.04762961342930794, "learning_rate": 0.0001183042179411328, "loss": 0.2495, "step": 2324 }, { "epoch": 2.207690481841918, "grad_norm": 0.05108138173818588, "learning_rate": 0.00011824555342041128, "loss": 0.2777, "step": 2325 }, { "epoch": 2.2086399240446237, "grad_norm": 0.06025318801403046, "learning_rate": 0.00011818688240348024, "loss": 0.2865, "step": 2326 }, { "epoch": 2.2095893662473296, "grad_norm": 0.04540006071329117, "learning_rate": 0.00011812820491122918, "loss": 0.267, "step": 2327 }, { "epoch": 2.2105388084500355, "grad_norm": 0.056342415511608124, "learning_rate": 0.00011806952096454975, "loss": 0.2905, "step": 2328 }, { "epoch": 2.2114882506527413, "grad_norm": 0.054570119827985764, "learning_rate": 0.00011801083058433607, "loss": 0.2711, "step": 2329 }, { "epoch": 2.2124376928554472, "grad_norm": 0.04845500364899635, "learning_rate": 0.00011795213379148436, "loss": 0.2776, "step": 2330 }, { "epoch": 2.213387135058153, "grad_norm": 0.043802276253700256, "learning_rate": 0.00011789343060689329, "loss": 0.2641, "step": 2331 }, { "epoch": 2.2143365772608594, "grad_norm": 0.04750855267047882, "learning_rate": 0.00011783472105146376, "loss": 0.2687, "step": 2332 }, { "epoch": 2.2152860194635653, "grad_norm": 0.04865497350692749, "learning_rate": 0.00011777600514609886, "loss": 0.26, "step": 2333 }, { "epoch": 2.216235461666271, "grad_norm": 0.04385308921337128, "learning_rate": 0.00011771728291170407, "loss": 0.264, "step": 2334 }, { "epoch": 2.217184903868977, "grad_norm": 0.059330157935619354, "learning_rate": 0.00011765855436918701, "loss": 0.2792, "step": 2335 }, { "epoch": 2.218134346071683, "grad_norm": 0.05777224153280258, "learning_rate": 0.0001175998195394576, "loss": 0.2746, "step": 2336 }, { "epoch": 2.219083788274389, "grad_norm": 0.051478311419487, "learning_rate": 0.00011754107844342803, "loss": 0.2696, "step": 2337 }, { "epoch": 2.2200332304770947, "grad_norm": 0.046342022716999054, "learning_rate": 0.00011748233110201265, "loss": 0.2468, "step": 2338 }, { "epoch": 2.2209826726798005, "grad_norm": 0.04625660553574562, "learning_rate": 0.00011742357753612803, "loss": 0.2608, "step": 2339 }, { "epoch": 2.2219321148825064, "grad_norm": 0.06775734573602676, "learning_rate": 0.00011736481776669306, "loss": 0.263, "step": 2340 }, { "epoch": 2.2228815570852123, "grad_norm": 0.04909680038690567, "learning_rate": 0.00011730605181462871, "loss": 0.2515, "step": 2341 }, { "epoch": 2.223830999287918, "grad_norm": 0.06404335796833038, "learning_rate": 0.00011724727970085824, "loss": 0.2906, "step": 2342 }, { "epoch": 2.2247804414906245, "grad_norm": 0.04836789518594742, "learning_rate": 0.00011718850144630709, "loss": 0.2721, "step": 2343 }, { "epoch": 2.2257298836933304, "grad_norm": 0.041848376393318176, "learning_rate": 0.00011712971707190283, "loss": 0.2588, "step": 2344 }, { "epoch": 2.2266793258960362, "grad_norm": 0.05488808453083038, "learning_rate": 0.00011707092659857531, "loss": 0.2845, "step": 2345 }, { "epoch": 2.227628768098742, "grad_norm": 0.04626215994358063, "learning_rate": 0.00011701213004725644, "loss": 0.24, "step": 2346 }, { "epoch": 2.228578210301448, "grad_norm": 0.04804147407412529, "learning_rate": 0.00011695332743888036, "loss": 0.2764, "step": 2347 }, { "epoch": 2.229527652504154, "grad_norm": 0.046024907380342484, "learning_rate": 0.00011689451879438338, "loss": 0.2643, "step": 2348 }, { "epoch": 2.2304770947068597, "grad_norm": 0.055915262550115585, "learning_rate": 0.00011683570413470383, "loss": 0.284, "step": 2349 }, { "epoch": 2.2314265369095656, "grad_norm": 0.04396609589457512, "learning_rate": 0.00011677688348078244, "loss": 0.262, "step": 2350 }, { "epoch": 2.2323759791122715, "grad_norm": 0.07269623130559921, "learning_rate": 0.00011671805685356183, "loss": 0.3068, "step": 2351 }, { "epoch": 2.2333254213149774, "grad_norm": 0.04890700802206993, "learning_rate": 0.00011665922427398683, "loss": 0.2723, "step": 2352 }, { "epoch": 2.2342748635176832, "grad_norm": 0.0517299585044384, "learning_rate": 0.00011660038576300443, "loss": 0.2776, "step": 2353 }, { "epoch": 2.235224305720389, "grad_norm": 0.07574246823787689, "learning_rate": 0.00011654154134156364, "loss": 0.276, "step": 2354 }, { "epoch": 2.236173747923095, "grad_norm": 0.043694209307432175, "learning_rate": 0.00011648269103061566, "loss": 0.2651, "step": 2355 }, { "epoch": 2.2371231901258013, "grad_norm": 0.04978908598423004, "learning_rate": 0.00011642383485111378, "loss": 0.2474, "step": 2356 }, { "epoch": 2.238072632328507, "grad_norm": 0.05337736755609512, "learning_rate": 0.00011636497282401331, "loss": 0.2703, "step": 2357 }, { "epoch": 2.239022074531213, "grad_norm": 0.05080539733171463, "learning_rate": 0.00011630610497027174, "loss": 0.2715, "step": 2358 }, { "epoch": 2.239971516733919, "grad_norm": 0.05404691770672798, "learning_rate": 0.00011624723131084854, "loss": 0.2808, "step": 2359 }, { "epoch": 2.240920958936625, "grad_norm": 0.05867183208465576, "learning_rate": 0.00011618835186670531, "loss": 0.2622, "step": 2360 }, { "epoch": 2.2418704011393307, "grad_norm": 0.0530124269425869, "learning_rate": 0.00011612946665880571, "loss": 0.2747, "step": 2361 }, { "epoch": 2.2428198433420365, "grad_norm": 0.056134361773729324, "learning_rate": 0.0001160705757081154, "loss": 0.276, "step": 2362 }, { "epoch": 2.2437692855447424, "grad_norm": 0.041885942220687866, "learning_rate": 0.00011601167903560208, "loss": 0.2659, "step": 2363 }, { "epoch": 2.2447187277474483, "grad_norm": 0.04484890028834343, "learning_rate": 0.00011595277666223561, "loss": 0.2608, "step": 2364 }, { "epoch": 2.245668169950154, "grad_norm": 0.06328167766332626, "learning_rate": 0.00011589386860898772, "loss": 0.2881, "step": 2365 }, { "epoch": 2.24661761215286, "grad_norm": 0.055244866758584976, "learning_rate": 0.00011583495489683229, "loss": 0.2801, "step": 2366 }, { "epoch": 2.247567054355566, "grad_norm": 0.04593993350863457, "learning_rate": 0.00011577603554674514, "loss": 0.2721, "step": 2367 }, { "epoch": 2.2485164965582722, "grad_norm": 0.0732899010181427, "learning_rate": 0.00011571711057970409, "loss": 0.2729, "step": 2368 }, { "epoch": 2.249465938760978, "grad_norm": 0.05230560898780823, "learning_rate": 0.00011565818001668904, "loss": 0.2744, "step": 2369 }, { "epoch": 2.250415380963684, "grad_norm": 0.0604710727930069, "learning_rate": 0.00011559924387868179, "loss": 0.2783, "step": 2370 }, { "epoch": 2.25136482316639, "grad_norm": 0.04672817140817642, "learning_rate": 0.00011554030218666619, "loss": 0.2673, "step": 2371 }, { "epoch": 2.2523142653690957, "grad_norm": 0.05419154092669487, "learning_rate": 0.00011548135496162799, "loss": 0.2692, "step": 2372 }, { "epoch": 2.2532637075718016, "grad_norm": 0.07617273926734924, "learning_rate": 0.00011542240222455502, "loss": 0.2545, "step": 2373 }, { "epoch": 2.2542131497745075, "grad_norm": 0.04525422304868698, "learning_rate": 0.00011536344399643701, "loss": 0.2746, "step": 2374 }, { "epoch": 2.2551625919772134, "grad_norm": 0.059686966240406036, "learning_rate": 0.00011530448029826566, "loss": 0.2917, "step": 2375 }, { "epoch": 2.2561120341799192, "grad_norm": 0.04423639923334122, "learning_rate": 0.00011524551115103454, "loss": 0.2499, "step": 2376 }, { "epoch": 2.257061476382625, "grad_norm": 0.04687541723251343, "learning_rate": 0.0001151865365757393, "loss": 0.2728, "step": 2377 }, { "epoch": 2.258010918585331, "grad_norm": 0.0430920235812664, "learning_rate": 0.00011512755659337742, "loss": 0.266, "step": 2378 }, { "epoch": 2.258960360788037, "grad_norm": 0.04343993961811066, "learning_rate": 0.00011506857122494831, "loss": 0.2517, "step": 2379 }, { "epoch": 2.2599098029907427, "grad_norm": 0.04559265822172165, "learning_rate": 0.0001150095804914534, "loss": 0.2614, "step": 2380 }, { "epoch": 2.2608592451934486, "grad_norm": 0.0903773307800293, "learning_rate": 0.00011495058441389586, "loss": 0.2708, "step": 2381 }, { "epoch": 2.261808687396155, "grad_norm": 0.06456193327903748, "learning_rate": 0.00011489158301328092, "loss": 0.2749, "step": 2382 }, { "epoch": 2.262758129598861, "grad_norm": 0.049897704273462296, "learning_rate": 0.00011483257631061562, "loss": 0.2732, "step": 2383 }, { "epoch": 2.2637075718015667, "grad_norm": 0.054795295000076294, "learning_rate": 0.00011477356432690891, "loss": 0.2798, "step": 2384 }, { "epoch": 2.2646570140042726, "grad_norm": 0.06840714812278748, "learning_rate": 0.00011471454708317162, "loss": 0.2906, "step": 2385 }, { "epoch": 2.2656064562069784, "grad_norm": 0.05174139142036438, "learning_rate": 0.00011465552460041644, "loss": 0.2689, "step": 2386 }, { "epoch": 2.2665558984096843, "grad_norm": 0.1207653358578682, "learning_rate": 0.00011459649689965797, "loss": 0.2827, "step": 2387 }, { "epoch": 2.26750534061239, "grad_norm": 0.0508701354265213, "learning_rate": 0.00011453746400191261, "loss": 0.2607, "step": 2388 }, { "epoch": 2.268454782815096, "grad_norm": 0.05010244995355606, "learning_rate": 0.00011447842592819866, "loss": 0.2688, "step": 2389 }, { "epoch": 2.269404225017802, "grad_norm": 0.044035494327545166, "learning_rate": 0.00011441938269953618, "loss": 0.274, "step": 2390 }, { "epoch": 2.270353667220508, "grad_norm": 0.04500316083431244, "learning_rate": 0.00011436033433694718, "loss": 0.2707, "step": 2391 }, { "epoch": 2.2713031094232137, "grad_norm": 0.04644524306058884, "learning_rate": 0.00011430128086145542, "loss": 0.2616, "step": 2392 }, { "epoch": 2.27225255162592, "grad_norm": 0.04804198071360588, "learning_rate": 0.00011424222229408652, "loss": 0.2726, "step": 2393 }, { "epoch": 2.273201993828626, "grad_norm": 0.04829863831400871, "learning_rate": 0.00011418315865586788, "loss": 0.2732, "step": 2394 }, { "epoch": 2.2741514360313317, "grad_norm": 0.0522797591984272, "learning_rate": 0.00011412408996782871, "loss": 0.2761, "step": 2395 }, { "epoch": 2.2751008782340376, "grad_norm": 0.04852959141135216, "learning_rate": 0.00011406501625100006, "loss": 0.277, "step": 2396 }, { "epoch": 2.2760503204367435, "grad_norm": 0.04573357105255127, "learning_rate": 0.00011400593752641473, "loss": 0.266, "step": 2397 }, { "epoch": 2.2769997626394494, "grad_norm": 0.06536805629730225, "learning_rate": 0.00011394685381510726, "loss": 0.289, "step": 2398 }, { "epoch": 2.2779492048421552, "grad_norm": 0.04507270082831383, "learning_rate": 0.0001138877651381141, "loss": 0.2629, "step": 2399 }, { "epoch": 2.278898647044861, "grad_norm": 0.046577729284763336, "learning_rate": 0.00011382867151647332, "loss": 0.2701, "step": 2400 }, { "epoch": 2.279848089247567, "grad_norm": 0.04779546707868576, "learning_rate": 0.00011376957297122486, "loss": 0.2628, "step": 2401 }, { "epoch": 2.280797531450273, "grad_norm": 0.04618161544203758, "learning_rate": 0.00011371046952341034, "loss": 0.2676, "step": 2402 }, { "epoch": 2.2817469736529787, "grad_norm": 0.04306333512067795, "learning_rate": 0.00011365136119407319, "loss": 0.2658, "step": 2403 }, { "epoch": 2.2826964158556846, "grad_norm": 0.044828448444604874, "learning_rate": 0.00011359224800425849, "loss": 0.2584, "step": 2404 }, { "epoch": 2.2836458580583905, "grad_norm": 0.045854486525058746, "learning_rate": 0.00011353312997501313, "loss": 0.258, "step": 2405 }, { "epoch": 2.2845953002610964, "grad_norm": 0.05045664310455322, "learning_rate": 0.00011347400712738567, "loss": 0.2658, "step": 2406 }, { "epoch": 2.2855447424638027, "grad_norm": 0.04601123183965683, "learning_rate": 0.00011341487948242648, "loss": 0.2602, "step": 2407 }, { "epoch": 2.2864941846665086, "grad_norm": 0.055264122784137726, "learning_rate": 0.00011335574706118754, "loss": 0.2856, "step": 2408 }, { "epoch": 2.2874436268692144, "grad_norm": 0.054593365639448166, "learning_rate": 0.00011329660988472253, "loss": 0.2663, "step": 2409 }, { "epoch": 2.2883930690719203, "grad_norm": 0.04195168986916542, "learning_rate": 0.00011323746797408688, "loss": 0.2623, "step": 2410 }, { "epoch": 2.289342511274626, "grad_norm": 0.04949670657515526, "learning_rate": 0.00011317832135033766, "loss": 0.2635, "step": 2411 }, { "epoch": 2.290291953477332, "grad_norm": 0.0510125458240509, "learning_rate": 0.00011311917003453365, "loss": 0.2701, "step": 2412 }, { "epoch": 2.291241395680038, "grad_norm": 0.04839101806282997, "learning_rate": 0.0001130600140477353, "loss": 0.2687, "step": 2413 }, { "epoch": 2.292190837882744, "grad_norm": 0.04288674145936966, "learning_rate": 0.00011300085341100466, "loss": 0.268, "step": 2414 }, { "epoch": 2.2931402800854497, "grad_norm": 0.04649089276790619, "learning_rate": 0.00011294168814540553, "loss": 0.2649, "step": 2415 }, { "epoch": 2.2940897222881556, "grad_norm": 0.03943301737308502, "learning_rate": 0.00011288251827200334, "loss": 0.2675, "step": 2416 }, { "epoch": 2.2950391644908614, "grad_norm": 0.06496595591306686, "learning_rate": 0.0001128233438118651, "loss": 0.265, "step": 2417 }, { "epoch": 2.2959886066935677, "grad_norm": 0.05821962654590607, "learning_rate": 0.00011276416478605949, "loss": 0.2705, "step": 2418 }, { "epoch": 2.2969380488962736, "grad_norm": 0.041703104972839355, "learning_rate": 0.00011270498121565678, "loss": 0.2568, "step": 2419 }, { "epoch": 2.2978874910989795, "grad_norm": 0.053453344851732254, "learning_rate": 0.00011264579312172895, "loss": 0.2734, "step": 2420 }, { "epoch": 2.2988369333016854, "grad_norm": 0.04353068396449089, "learning_rate": 0.00011258660052534951, "loss": 0.2633, "step": 2421 }, { "epoch": 2.2997863755043912, "grad_norm": 0.04292818903923035, "learning_rate": 0.00011252740344759356, "loss": 0.2594, "step": 2422 }, { "epoch": 2.300735817707097, "grad_norm": 0.05199714004993439, "learning_rate": 0.00011246820190953791, "loss": 0.2703, "step": 2423 }, { "epoch": 2.301685259909803, "grad_norm": 0.04548816755414009, "learning_rate": 0.0001124089959322608, "loss": 0.2739, "step": 2424 }, { "epoch": 2.302634702112509, "grad_norm": 0.03984839841723442, "learning_rate": 0.00011234978553684219, "loss": 0.2573, "step": 2425 }, { "epoch": 2.3035841443152147, "grad_norm": 0.0392242856323719, "learning_rate": 0.00011229057074436351, "loss": 0.2595, "step": 2426 }, { "epoch": 2.3045335865179206, "grad_norm": 0.03876666724681854, "learning_rate": 0.00011223135157590783, "loss": 0.2577, "step": 2427 }, { "epoch": 2.3054830287206265, "grad_norm": 0.051638517528772354, "learning_rate": 0.00011217212805255968, "loss": 0.279, "step": 2428 }, { "epoch": 2.3064324709233324, "grad_norm": 0.04209835082292557, "learning_rate": 0.0001121129001954053, "loss": 0.2604, "step": 2429 }, { "epoch": 2.3073819131260382, "grad_norm": 0.05616452917456627, "learning_rate": 0.0001120536680255323, "loss": 0.2774, "step": 2430 }, { "epoch": 2.308331355328744, "grad_norm": 0.04177265986800194, "learning_rate": 0.00011199443156402998, "loss": 0.2616, "step": 2431 }, { "epoch": 2.3092807975314504, "grad_norm": 0.04343918338418007, "learning_rate": 0.00011193519083198905, "loss": 0.269, "step": 2432 }, { "epoch": 2.3102302397341563, "grad_norm": 0.06559456884860992, "learning_rate": 0.00011187594585050174, "loss": 0.2846, "step": 2433 }, { "epoch": 2.311179681936862, "grad_norm": 0.045688070356845856, "learning_rate": 0.00011181669664066192, "loss": 0.2586, "step": 2434 }, { "epoch": 2.312129124139568, "grad_norm": 0.09138458222150803, "learning_rate": 0.00011175744322356487, "loss": 0.2793, "step": 2435 }, { "epoch": 2.313078566342274, "grad_norm": 0.06139371171593666, "learning_rate": 0.00011169818562030733, "loss": 0.2962, "step": 2436 }, { "epoch": 2.31402800854498, "grad_norm": 0.043909333646297455, "learning_rate": 0.0001116389238519876, "loss": 0.2517, "step": 2437 }, { "epoch": 2.3149774507476857, "grad_norm": 0.06976691633462906, "learning_rate": 0.00011157965793970544, "loss": 0.2834, "step": 2438 }, { "epoch": 2.3159268929503916, "grad_norm": 0.041745271533727646, "learning_rate": 0.00011152038790456211, "loss": 0.2596, "step": 2439 }, { "epoch": 2.3168763351530974, "grad_norm": 0.03987791761755943, "learning_rate": 0.00011146111376766033, "loss": 0.2538, "step": 2440 }, { "epoch": 2.3178257773558033, "grad_norm": 0.08367100358009338, "learning_rate": 0.00011140183555010424, "loss": 0.2819, "step": 2441 }, { "epoch": 2.318775219558509, "grad_norm": 0.042115338146686554, "learning_rate": 0.00011134255327299943, "loss": 0.2511, "step": 2442 }, { "epoch": 2.3197246617612155, "grad_norm": 0.047765813767910004, "learning_rate": 0.00011128326695745301, "loss": 0.2543, "step": 2443 }, { "epoch": 2.3206741039639214, "grad_norm": 0.0455872118473053, "learning_rate": 0.0001112239766245735, "loss": 0.2798, "step": 2444 }, { "epoch": 2.3216235461666272, "grad_norm": 0.04317116364836693, "learning_rate": 0.0001111646822954708, "loss": 0.2677, "step": 2445 }, { "epoch": 2.322572988369333, "grad_norm": 0.04277396202087402, "learning_rate": 0.00011110538399125625, "loss": 0.255, "step": 2446 }, { "epoch": 2.323522430572039, "grad_norm": 0.046795833855867386, "learning_rate": 0.00011104608173304262, "loss": 0.2725, "step": 2447 }, { "epoch": 2.324471872774745, "grad_norm": 0.04250842332839966, "learning_rate": 0.00011098677554194417, "loss": 0.2645, "step": 2448 }, { "epoch": 2.3254213149774507, "grad_norm": 0.05834075063467026, "learning_rate": 0.00011092746543907642, "loss": 0.2849, "step": 2449 }, { "epoch": 2.3263707571801566, "grad_norm": 0.04197695106267929, "learning_rate": 0.00011086815144555633, "loss": 0.26, "step": 2450 }, { "epoch": 2.3273201993828625, "grad_norm": 0.0852176696062088, "learning_rate": 0.00011080883358250225, "loss": 0.2636, "step": 2451 }, { "epoch": 2.3282696415855684, "grad_norm": 0.04568492993712425, "learning_rate": 0.00011074951187103397, "loss": 0.2607, "step": 2452 }, { "epoch": 2.3292190837882742, "grad_norm": 0.05550101399421692, "learning_rate": 0.00011069018633227257, "loss": 0.2831, "step": 2453 }, { "epoch": 2.33016852599098, "grad_norm": 0.06848349422216415, "learning_rate": 0.00011063085698734052, "loss": 0.2621, "step": 2454 }, { "epoch": 2.331117968193686, "grad_norm": 0.07170405983924866, "learning_rate": 0.0001105715238573616, "loss": 0.2849, "step": 2455 }, { "epoch": 2.332067410396392, "grad_norm": 0.05143957957625389, "learning_rate": 0.00011051218696346104, "loss": 0.2639, "step": 2456 }, { "epoch": 2.333016852599098, "grad_norm": 0.073124960064888, "learning_rate": 0.00011045284632676536, "loss": 0.2829, "step": 2457 }, { "epoch": 2.333966294801804, "grad_norm": 0.08123373240232468, "learning_rate": 0.00011039350196840235, "loss": 0.273, "step": 2458 }, { "epoch": 2.33491573700451, "grad_norm": 0.04808083176612854, "learning_rate": 0.00011033415390950121, "loss": 0.2654, "step": 2459 }, { "epoch": 2.335865179207216, "grad_norm": 0.0888887345790863, "learning_rate": 0.00011027480217119244, "loss": 0.281, "step": 2460 }, { "epoch": 2.3368146214099217, "grad_norm": 0.07269085198640823, "learning_rate": 0.00011021544677460778, "loss": 0.2757, "step": 2461 }, { "epoch": 2.3377640636126276, "grad_norm": 0.06212505325675011, "learning_rate": 0.00011015608774088039, "loss": 0.2792, "step": 2462 }, { "epoch": 2.3387135058153334, "grad_norm": 0.045006170868873596, "learning_rate": 0.00011009672509114461, "loss": 0.263, "step": 2463 }, { "epoch": 2.3396629480180393, "grad_norm": 0.04857483506202698, "learning_rate": 0.0001100373588465362, "loss": 0.2705, "step": 2464 }, { "epoch": 2.340612390220745, "grad_norm": 0.05448603257536888, "learning_rate": 0.00010997798902819208, "loss": 0.2762, "step": 2465 }, { "epoch": 2.341561832423451, "grad_norm": 0.04412839934229851, "learning_rate": 0.00010991861565725044, "loss": 0.2638, "step": 2466 }, { "epoch": 2.3425112746261574, "grad_norm": 0.05423035845160484, "learning_rate": 0.00010985923875485083, "loss": 0.2739, "step": 2467 }, { "epoch": 2.3434607168288633, "grad_norm": 0.0835297629237175, "learning_rate": 0.00010979985834213399, "loss": 0.3022, "step": 2468 }, { "epoch": 2.344410159031569, "grad_norm": 0.061721861362457275, "learning_rate": 0.00010974047444024195, "loss": 0.2896, "step": 2469 }, { "epoch": 2.345359601234275, "grad_norm": 0.04539257660508156, "learning_rate": 0.00010968108707031792, "loss": 0.2548, "step": 2470 }, { "epoch": 2.346309043436981, "grad_norm": 0.07001818716526031, "learning_rate": 0.0001096216962535064, "loss": 0.306, "step": 2471 }, { "epoch": 2.3472584856396868, "grad_norm": 0.04498448595404625, "learning_rate": 0.00010956230201095312, "loss": 0.2589, "step": 2472 }, { "epoch": 2.3482079278423926, "grad_norm": 0.045867983251810074, "learning_rate": 0.00010950290436380499, "loss": 0.2639, "step": 2473 }, { "epoch": 2.3491573700450985, "grad_norm": 0.053373124450445175, "learning_rate": 0.00010944350333321014, "loss": 0.2703, "step": 2474 }, { "epoch": 2.3501068122478044, "grad_norm": 0.04680047556757927, "learning_rate": 0.00010938409894031794, "loss": 0.2705, "step": 2475 }, { "epoch": 2.3510562544505103, "grad_norm": 0.05392623320221901, "learning_rate": 0.00010932469120627893, "loss": 0.2738, "step": 2476 }, { "epoch": 2.352005696653216, "grad_norm": 0.046140771359205246, "learning_rate": 0.00010926528015224485, "loss": 0.2654, "step": 2477 }, { "epoch": 2.352955138855922, "grad_norm": 0.16307492554187775, "learning_rate": 0.00010920586579936858, "loss": 0.2649, "step": 2478 }, { "epoch": 2.353904581058628, "grad_norm": 0.04612316936254501, "learning_rate": 0.00010914644816880423, "loss": 0.2639, "step": 2479 }, { "epoch": 2.3548540232613338, "grad_norm": 0.044796962291002274, "learning_rate": 0.00010908702728170705, "loss": 0.2622, "step": 2480 }, { "epoch": 2.35580346546404, "grad_norm": 0.045174986124038696, "learning_rate": 0.00010902760315923352, "loss": 0.256, "step": 2481 }, { "epoch": 2.356752907666746, "grad_norm": 0.1589425504207611, "learning_rate": 0.00010896817582254113, "loss": 0.2754, "step": 2482 }, { "epoch": 2.357702349869452, "grad_norm": 0.0925155058503151, "learning_rate": 0.00010890874529278865, "loss": 0.2678, "step": 2483 }, { "epoch": 2.3586517920721577, "grad_norm": 0.09574342519044876, "learning_rate": 0.00010884931159113586, "loss": 0.2534, "step": 2484 }, { "epoch": 2.3596012342748636, "grad_norm": 0.050892025232315063, "learning_rate": 0.0001087898747387438, "loss": 0.2677, "step": 2485 }, { "epoch": 2.3605506764775694, "grad_norm": 0.0862436443567276, "learning_rate": 0.00010873043475677455, "loss": 0.2834, "step": 2486 }, { "epoch": 2.3615001186802753, "grad_norm": 0.0461130365729332, "learning_rate": 0.0001086709916663913, "loss": 0.2673, "step": 2487 }, { "epoch": 2.362449560882981, "grad_norm": 0.046872738748788834, "learning_rate": 0.00010861154548875843, "loss": 0.2693, "step": 2488 }, { "epoch": 2.363399003085687, "grad_norm": 0.04682895913720131, "learning_rate": 0.0001085520962450413, "loss": 0.2593, "step": 2489 }, { "epoch": 2.364348445288393, "grad_norm": 0.04921196401119232, "learning_rate": 0.00010849264395640649, "loss": 0.2593, "step": 2490 }, { "epoch": 2.365297887491099, "grad_norm": 0.04984479025006294, "learning_rate": 0.00010843318864402154, "loss": 0.2619, "step": 2491 }, { "epoch": 2.366247329693805, "grad_norm": 0.0450386144220829, "learning_rate": 0.00010837373032905515, "loss": 0.2517, "step": 2492 }, { "epoch": 2.367196771896511, "grad_norm": 0.04759416729211807, "learning_rate": 0.00010831426903267706, "loss": 0.2665, "step": 2493 }, { "epoch": 2.368146214099217, "grad_norm": 0.04298888519406319, "learning_rate": 0.00010825480477605805, "loss": 0.2587, "step": 2494 }, { "epoch": 2.3690956563019228, "grad_norm": 0.047177206724882126, "learning_rate": 0.00010819533758037002, "loss": 0.2651, "step": 2495 }, { "epoch": 2.3700450985046286, "grad_norm": 0.04960494488477707, "learning_rate": 0.00010813586746678583, "loss": 0.2618, "step": 2496 }, { "epoch": 2.3709945407073345, "grad_norm": 0.06197304278612137, "learning_rate": 0.00010807639445647948, "loss": 0.2797, "step": 2497 }, { "epoch": 2.3719439829100404, "grad_norm": 0.053523700684309006, "learning_rate": 0.00010801691857062586, "loss": 0.2833, "step": 2498 }, { "epoch": 2.3728934251127463, "grad_norm": 0.04236749932169914, "learning_rate": 0.00010795743983040107, "loss": 0.2627, "step": 2499 }, { "epoch": 2.373842867315452, "grad_norm": 0.04760702699422836, "learning_rate": 0.00010789795825698205, "loss": 0.272, "step": 2500 }, { "epoch": 2.373842867315452, "eval_loss": 0.2669290602207184, "eval_runtime": 37.912, "eval_samples_per_second": 2.268, "eval_steps_per_second": 2.268, "step": 2500 }, { "epoch": 2.374792309518158, "grad_norm": 0.0647454559803009, "learning_rate": 0.00010783847387154687, "loss": 0.2765, "step": 2501 }, { "epoch": 2.375741751720864, "grad_norm": 0.049812283366918564, "learning_rate": 0.00010777898669527449, "loss": 0.2766, "step": 2502 }, { "epoch": 2.3766911939235698, "grad_norm": 0.04538872838020325, "learning_rate": 0.00010771949674934499, "loss": 0.2674, "step": 2503 }, { "epoch": 2.3776406361262756, "grad_norm": 0.05949138104915619, "learning_rate": 0.00010766000405493936, "loss": 0.28, "step": 2504 }, { "epoch": 2.3785900783289815, "grad_norm": 0.04528718441724777, "learning_rate": 0.00010760050863323961, "loss": 0.2641, "step": 2505 }, { "epoch": 2.379539520531688, "grad_norm": 0.0555870421230793, "learning_rate": 0.00010754101050542865, "loss": 0.275, "step": 2506 }, { "epoch": 2.3804889627343937, "grad_norm": 0.04176267981529236, "learning_rate": 0.00010748150969269038, "loss": 0.2577, "step": 2507 }, { "epoch": 2.3814384049370996, "grad_norm": 0.04654645174741745, "learning_rate": 0.00010742200621620972, "loss": 0.2701, "step": 2508 }, { "epoch": 2.3823878471398054, "grad_norm": 0.043700382113456726, "learning_rate": 0.00010736250009717247, "loss": 0.2638, "step": 2509 }, { "epoch": 2.3833372893425113, "grad_norm": 0.040448714047670364, "learning_rate": 0.00010730299135676545, "loss": 0.2539, "step": 2510 }, { "epoch": 2.384286731545217, "grad_norm": 0.05429979786276817, "learning_rate": 0.00010724348001617625, "loss": 0.2624, "step": 2511 }, { "epoch": 2.385236173747923, "grad_norm": 0.048745136708021164, "learning_rate": 0.00010718396609659356, "loss": 0.2586, "step": 2512 }, { "epoch": 2.386185615950629, "grad_norm": 0.053171828389167786, "learning_rate": 0.00010712444961920691, "loss": 0.263, "step": 2513 }, { "epoch": 2.387135058153335, "grad_norm": 0.05061480775475502, "learning_rate": 0.00010706493060520678, "loss": 0.2672, "step": 2514 }, { "epoch": 2.3880845003560407, "grad_norm": 0.04448014125227928, "learning_rate": 0.00010700540907578447, "loss": 0.2694, "step": 2515 }, { "epoch": 2.3890339425587466, "grad_norm": 0.044203128665685654, "learning_rate": 0.00010694588505213224, "loss": 0.2608, "step": 2516 }, { "epoch": 2.389983384761453, "grad_norm": 0.045293718576431274, "learning_rate": 0.00010688635855544326, "loss": 0.2761, "step": 2517 }, { "epoch": 2.3909328269641588, "grad_norm": 0.04204658046364784, "learning_rate": 0.00010682682960691153, "loss": 0.2577, "step": 2518 }, { "epoch": 2.3918822691668646, "grad_norm": 0.043219927698373795, "learning_rate": 0.00010676729822773193, "loss": 0.2671, "step": 2519 }, { "epoch": 2.3928317113695705, "grad_norm": 0.05860576406121254, "learning_rate": 0.00010670776443910024, "loss": 0.2806, "step": 2520 }, { "epoch": 2.3937811535722764, "grad_norm": 0.05496351793408394, "learning_rate": 0.00010664822826221309, "loss": 0.2816, "step": 2521 }, { "epoch": 2.3947305957749823, "grad_norm": 0.05229606106877327, "learning_rate": 0.00010658868971826785, "loss": 0.2858, "step": 2522 }, { "epoch": 2.395680037977688, "grad_norm": 0.04436548799276352, "learning_rate": 0.00010652914882846295, "loss": 0.2618, "step": 2523 }, { "epoch": 2.396629480180394, "grad_norm": 0.04846099019050598, "learning_rate": 0.00010646960561399745, "loss": 0.2825, "step": 2524 }, { "epoch": 2.3975789223831, "grad_norm": 0.04811537638306618, "learning_rate": 0.00010641006009607137, "loss": 0.2656, "step": 2525 }, { "epoch": 2.3985283645858058, "grad_norm": 0.04264158755540848, "learning_rate": 0.00010635051229588546, "loss": 0.2611, "step": 2526 }, { "epoch": 2.3994778067885116, "grad_norm": 0.04192167893052101, "learning_rate": 0.00010629096223464137, "loss": 0.2544, "step": 2527 }, { "epoch": 2.4004272489912175, "grad_norm": 0.042320821434259415, "learning_rate": 0.00010623140993354144, "loss": 0.2729, "step": 2528 }, { "epoch": 2.4013766911939234, "grad_norm": 0.07291635125875473, "learning_rate": 0.00010617185541378895, "loss": 0.2919, "step": 2529 }, { "epoch": 2.4023261333966293, "grad_norm": 0.04448839649558067, "learning_rate": 0.00010611229869658785, "loss": 0.2625, "step": 2530 }, { "epoch": 2.4032755755993356, "grad_norm": 0.05060156062245369, "learning_rate": 0.00010605273980314292, "loss": 0.2783, "step": 2531 }, { "epoch": 2.4042250178020415, "grad_norm": 0.04214916750788689, "learning_rate": 0.00010599317875465976, "loss": 0.2518, "step": 2532 }, { "epoch": 2.4051744600047473, "grad_norm": 0.05565851926803589, "learning_rate": 0.00010593361557234462, "loss": 0.2541, "step": 2533 }, { "epoch": 2.406123902207453, "grad_norm": 0.07710932940244675, "learning_rate": 0.00010587405027740465, "loss": 0.2522, "step": 2534 }, { "epoch": 2.407073344410159, "grad_norm": 0.040221601724624634, "learning_rate": 0.00010581448289104758, "loss": 0.2545, "step": 2535 }, { "epoch": 2.408022786612865, "grad_norm": 0.04807737097144127, "learning_rate": 0.0001057549134344821, "loss": 0.2642, "step": 2536 }, { "epoch": 2.408972228815571, "grad_norm": 0.0500488318502903, "learning_rate": 0.00010569534192891748, "loss": 0.2764, "step": 2537 }, { "epoch": 2.4099216710182767, "grad_norm": 0.04549793899059296, "learning_rate": 0.00010563576839556374, "loss": 0.2528, "step": 2538 }, { "epoch": 2.4108711132209826, "grad_norm": 0.05493444204330444, "learning_rate": 0.0001055761928556317, "loss": 0.2679, "step": 2539 }, { "epoch": 2.4118205554236885, "grad_norm": 0.04553236812353134, "learning_rate": 0.00010551661533033275, "loss": 0.2621, "step": 2540 }, { "epoch": 2.4127699976263943, "grad_norm": 0.11890033632516861, "learning_rate": 0.00010545703584087918, "loss": 0.2874, "step": 2541 }, { "epoch": 2.4137194398291006, "grad_norm": 0.08107148110866547, "learning_rate": 0.0001053974544084838, "loss": 0.2864, "step": 2542 }, { "epoch": 2.4146688820318065, "grad_norm": 0.047696180641651154, "learning_rate": 0.00010533787105436026, "loss": 0.2657, "step": 2543 }, { "epoch": 2.4156183242345124, "grad_norm": 0.04514947161078453, "learning_rate": 0.00010527828579972272, "loss": 0.2515, "step": 2544 }, { "epoch": 2.4165677664372183, "grad_norm": 0.04135221242904663, "learning_rate": 0.0001052186986657862, "loss": 0.2609, "step": 2545 }, { "epoch": 2.417517208639924, "grad_norm": 0.0875149667263031, "learning_rate": 0.00010515910967376627, "loss": 0.2666, "step": 2546 }, { "epoch": 2.41846665084263, "grad_norm": 0.038753919303417206, "learning_rate": 0.00010509951884487926, "loss": 0.2605, "step": 2547 }, { "epoch": 2.419416093045336, "grad_norm": 0.059343352913856506, "learning_rate": 0.00010503992620034202, "loss": 0.2795, "step": 2548 }, { "epoch": 2.4203655352480418, "grad_norm": 0.057376034557819366, "learning_rate": 0.00010498033176137212, "loss": 0.2805, "step": 2549 }, { "epoch": 2.4213149774507476, "grad_norm": 0.04280192777514458, "learning_rate": 0.00010492073554918782, "loss": 0.2592, "step": 2550 }, { "epoch": 2.4222644196534535, "grad_norm": 0.0482187457382679, "learning_rate": 0.00010486113758500795, "loss": 0.261, "step": 2551 }, { "epoch": 2.4232138618561594, "grad_norm": 0.0874866172671318, "learning_rate": 0.00010480153789005193, "loss": 0.2593, "step": 2552 }, { "epoch": 2.4241633040588653, "grad_norm": 0.04283559322357178, "learning_rate": 0.00010474193648553989, "loss": 0.2628, "step": 2553 }, { "epoch": 2.425112746261571, "grad_norm": 0.046166982501745224, "learning_rate": 0.0001046823333926925, "loss": 0.264, "step": 2554 }, { "epoch": 2.426062188464277, "grad_norm": 0.09907100349664688, "learning_rate": 0.00010462272863273104, "loss": 0.2804, "step": 2555 }, { "epoch": 2.4270116306669833, "grad_norm": 0.04987293481826782, "learning_rate": 0.0001045631222268774, "loss": 0.2623, "step": 2556 }, { "epoch": 2.427961072869689, "grad_norm": 0.05097109079360962, "learning_rate": 0.00010450351419635407, "loss": 0.2684, "step": 2557 }, { "epoch": 2.428910515072395, "grad_norm": 0.04925335571169853, "learning_rate": 0.00010444390456238404, "loss": 0.2693, "step": 2558 }, { "epoch": 2.429859957275101, "grad_norm": 0.041472528129816055, "learning_rate": 0.00010438429334619102, "loss": 0.2552, "step": 2559 }, { "epoch": 2.430809399477807, "grad_norm": 0.04419026896357536, "learning_rate": 0.00010432468056899909, "loss": 0.2625, "step": 2560 }, { "epoch": 2.4317588416805127, "grad_norm": 0.039240069687366486, "learning_rate": 0.00010426506625203307, "loss": 0.2633, "step": 2561 }, { "epoch": 2.4327082838832186, "grad_norm": 0.040763020515441895, "learning_rate": 0.00010420545041651822, "loss": 0.2507, "step": 2562 }, { "epoch": 2.4336577260859245, "grad_norm": 0.051095739006996155, "learning_rate": 0.00010414583308368033, "loss": 0.2643, "step": 2563 }, { "epoch": 2.4346071682886303, "grad_norm": 0.05506645515561104, "learning_rate": 0.00010408621427474581, "loss": 0.2605, "step": 2564 }, { "epoch": 2.435556610491336, "grad_norm": 0.03904623165726662, "learning_rate": 0.00010402659401094152, "loss": 0.2638, "step": 2565 }, { "epoch": 2.436506052694042, "grad_norm": 0.06974940747022629, "learning_rate": 0.00010396697231349488, "loss": 0.261, "step": 2566 }, { "epoch": 2.4374554948967484, "grad_norm": 0.048252761363983154, "learning_rate": 0.00010390734920363379, "loss": 0.2738, "step": 2567 }, { "epoch": 2.4384049370994543, "grad_norm": 0.04159051179885864, "learning_rate": 0.00010384772470258663, "loss": 0.2663, "step": 2568 }, { "epoch": 2.43935437930216, "grad_norm": 0.051444459706544876, "learning_rate": 0.0001037880988315824, "loss": 0.2678, "step": 2569 }, { "epoch": 2.440303821504866, "grad_norm": 0.0407598651945591, "learning_rate": 0.00010372847161185046, "loss": 0.2562, "step": 2570 }, { "epoch": 2.441253263707572, "grad_norm": 0.055472977459430695, "learning_rate": 0.00010366884306462068, "loss": 0.2881, "step": 2571 }, { "epoch": 2.4422027059102778, "grad_norm": 0.04310688376426697, "learning_rate": 0.00010360921321112337, "loss": 0.2578, "step": 2572 }, { "epoch": 2.4431521481129836, "grad_norm": 0.04362298175692558, "learning_rate": 0.00010354958207258945, "loss": 0.268, "step": 2573 }, { "epoch": 2.4441015903156895, "grad_norm": 0.04311308637261391, "learning_rate": 0.00010348994967025012, "loss": 0.272, "step": 2574 }, { "epoch": 2.4450510325183954, "grad_norm": 0.06475937366485596, "learning_rate": 0.00010343031602533713, "loss": 0.2882, "step": 2575 }, { "epoch": 2.4460004747211013, "grad_norm": 0.04065021127462387, "learning_rate": 0.0001033706811590826, "loss": 0.2578, "step": 2576 }, { "epoch": 2.446949916923807, "grad_norm": 0.044748466461896896, "learning_rate": 0.00010331104509271918, "loss": 0.2667, "step": 2577 }, { "epoch": 2.447899359126513, "grad_norm": 0.04572344943881035, "learning_rate": 0.00010325140784747993, "loss": 0.2669, "step": 2578 }, { "epoch": 2.448848801329219, "grad_norm": 0.05856434255838394, "learning_rate": 0.00010319176944459826, "loss": 0.2708, "step": 2579 }, { "epoch": 2.4497982435319248, "grad_norm": 0.03956478834152222, "learning_rate": 0.00010313212990530803, "loss": 0.2524, "step": 2580 }, { "epoch": 2.450747685734631, "grad_norm": 0.04141313582658768, "learning_rate": 0.00010307248925084352, "loss": 0.2576, "step": 2581 }, { "epoch": 2.451697127937337, "grad_norm": 0.04271996021270752, "learning_rate": 0.00010301284750243936, "loss": 0.2643, "step": 2582 }, { "epoch": 2.452646570140043, "grad_norm": 0.06122612580657005, "learning_rate": 0.00010295320468133066, "loss": 0.2799, "step": 2583 }, { "epoch": 2.4535960123427487, "grad_norm": 0.037602152675390244, "learning_rate": 0.00010289356080875277, "loss": 0.2491, "step": 2584 }, { "epoch": 2.4545454545454546, "grad_norm": 0.04972713813185692, "learning_rate": 0.00010283391590594161, "loss": 0.2869, "step": 2585 }, { "epoch": 2.4554948967481605, "grad_norm": 0.045384474098682404, "learning_rate": 0.00010277426999413327, "loss": 0.2762, "step": 2586 }, { "epoch": 2.4564443389508663, "grad_norm": 0.04337489232420921, "learning_rate": 0.0001027146230945643, "loss": 0.255, "step": 2587 }, { "epoch": 2.457393781153572, "grad_norm": 0.09511115401983261, "learning_rate": 0.00010265497522847162, "loss": 0.2713, "step": 2588 }, { "epoch": 2.458343223356278, "grad_norm": 0.08762829005718231, "learning_rate": 0.00010259532641709247, "loss": 0.2665, "step": 2589 }, { "epoch": 2.459292665558984, "grad_norm": 0.040508076548576355, "learning_rate": 0.00010253567668166435, "loss": 0.2485, "step": 2590 }, { "epoch": 2.46024210776169, "grad_norm": 0.04852940887212753, "learning_rate": 0.00010247602604342519, "loss": 0.2613, "step": 2591 }, { "epoch": 2.461191549964396, "grad_norm": 0.05233342573046684, "learning_rate": 0.00010241637452361323, "loss": 0.2781, "step": 2592 }, { "epoch": 2.462140992167102, "grad_norm": 0.08143714815378189, "learning_rate": 0.000102356722143467, "loss": 0.2548, "step": 2593 }, { "epoch": 2.463090434369808, "grad_norm": 0.05217353627085686, "learning_rate": 0.00010229706892422531, "loss": 0.2708, "step": 2594 }, { "epoch": 2.4640398765725138, "grad_norm": 0.09524723887443542, "learning_rate": 0.00010223741488712733, "loss": 0.2688, "step": 2595 }, { "epoch": 2.4649893187752197, "grad_norm": 0.05073004588484764, "learning_rate": 0.00010217776005341241, "loss": 0.2732, "step": 2596 }, { "epoch": 2.4659387609779255, "grad_norm": 0.04612806811928749, "learning_rate": 0.0001021181044443204, "loss": 0.2479, "step": 2597 }, { "epoch": 2.4668882031806314, "grad_norm": 0.045733220875263214, "learning_rate": 0.00010205844808109117, "loss": 0.2621, "step": 2598 }, { "epoch": 2.4678376453833373, "grad_norm": 0.044287122786045074, "learning_rate": 0.00010199879098496504, "loss": 0.2652, "step": 2599 }, { "epoch": 2.468787087586043, "grad_norm": 0.06199018657207489, "learning_rate": 0.00010193913317718244, "loss": 0.278, "step": 2600 }, { "epoch": 2.469736529788749, "grad_norm": 0.041201118379831314, "learning_rate": 0.00010187947467898425, "loss": 0.2632, "step": 2601 }, { "epoch": 2.470685971991455, "grad_norm": 0.04487983509898186, "learning_rate": 0.00010181981551161144, "loss": 0.27, "step": 2602 }, { "epoch": 2.4716354141941608, "grad_norm": 0.04142885282635689, "learning_rate": 0.00010176015569630526, "loss": 0.2537, "step": 2603 }, { "epoch": 2.4725848563968666, "grad_norm": 0.04327712580561638, "learning_rate": 0.0001017004952543072, "loss": 0.2615, "step": 2604 }, { "epoch": 2.4735342985995725, "grad_norm": 0.043669018894433975, "learning_rate": 0.00010164083420685897, "loss": 0.2617, "step": 2605 }, { "epoch": 2.474483740802279, "grad_norm": 0.05305905640125275, "learning_rate": 0.0001015811725752025, "loss": 0.2806, "step": 2606 }, { "epoch": 2.4754331830049847, "grad_norm": 0.042252250015735626, "learning_rate": 0.00010152151038057993, "loss": 0.2672, "step": 2607 }, { "epoch": 2.4763826252076906, "grad_norm": 0.043947260826826096, "learning_rate": 0.00010146184764423357, "loss": 0.2652, "step": 2608 }, { "epoch": 2.4773320674103965, "grad_norm": 0.04211874678730965, "learning_rate": 0.00010140218438740591, "loss": 0.2544, "step": 2609 }, { "epoch": 2.4782815096131023, "grad_norm": 0.04319967329502106, "learning_rate": 0.00010134252063133975, "loss": 0.2689, "step": 2610 }, { "epoch": 2.479230951815808, "grad_norm": 0.041697051376104355, "learning_rate": 0.00010128285639727792, "loss": 0.2566, "step": 2611 }, { "epoch": 2.480180394018514, "grad_norm": 0.04475269839167595, "learning_rate": 0.0001012231917064635, "loss": 0.2641, "step": 2612 }, { "epoch": 2.48112983622122, "grad_norm": 0.03968726098537445, "learning_rate": 0.00010116352658013973, "loss": 0.2552, "step": 2613 }, { "epoch": 2.482079278423926, "grad_norm": 0.0413176491856575, "learning_rate": 0.00010110386103954992, "loss": 0.2516, "step": 2614 }, { "epoch": 2.4830287206266317, "grad_norm": 0.04182640090584755, "learning_rate": 0.00010104419510593764, "loss": 0.2602, "step": 2615 }, { "epoch": 2.4839781628293376, "grad_norm": 0.04140539839863777, "learning_rate": 0.00010098452880054656, "loss": 0.2595, "step": 2616 }, { "epoch": 2.484927605032044, "grad_norm": 0.042879413813352585, "learning_rate": 0.00010092486214462045, "loss": 0.2722, "step": 2617 }, { "epoch": 2.48587704723475, "grad_norm": 0.04176124185323715, "learning_rate": 0.00010086519515940326, "loss": 0.255, "step": 2618 }, { "epoch": 2.4868264894374557, "grad_norm": 0.04227910190820694, "learning_rate": 0.00010080552786613899, "loss": 0.2731, "step": 2619 }, { "epoch": 2.4877759316401615, "grad_norm": 0.050102487206459045, "learning_rate": 0.00010074586028607184, "loss": 0.2863, "step": 2620 }, { "epoch": 2.4887253738428674, "grad_norm": 0.04757722094655037, "learning_rate": 0.00010068619244044604, "loss": 0.2633, "step": 2621 }, { "epoch": 2.4896748160455733, "grad_norm": 0.0399179607629776, "learning_rate": 0.00010062652435050592, "loss": 0.2608, "step": 2622 }, { "epoch": 2.490624258248279, "grad_norm": 0.041110094636678696, "learning_rate": 0.00010056685603749589, "loss": 0.2614, "step": 2623 }, { "epoch": 2.491573700450985, "grad_norm": 0.04052494838833809, "learning_rate": 0.00010050718752266053, "loss": 0.2494, "step": 2624 }, { "epoch": 2.492523142653691, "grad_norm": 0.04536557197570801, "learning_rate": 0.00010044751882724435, "loss": 0.2645, "step": 2625 }, { "epoch": 2.493472584856397, "grad_norm": 0.04890064895153046, "learning_rate": 0.00010038784997249205, "loss": 0.2648, "step": 2626 }, { "epoch": 2.4944220270591027, "grad_norm": 0.04206657037138939, "learning_rate": 0.00010032818097964829, "loss": 0.2629, "step": 2627 }, { "epoch": 2.4953714692618085, "grad_norm": 0.049437765032052994, "learning_rate": 0.00010026851186995785, "loss": 0.2711, "step": 2628 }, { "epoch": 2.4963209114645144, "grad_norm": 0.04828893393278122, "learning_rate": 0.00010020884266466554, "loss": 0.2704, "step": 2629 }, { "epoch": 2.4972703536672203, "grad_norm": 0.041123807430267334, "learning_rate": 0.00010014917338501618, "loss": 0.2713, "step": 2630 }, { "epoch": 2.4982197958699266, "grad_norm": 0.04797301068902016, "learning_rate": 0.00010008950405225462, "loss": 0.2658, "step": 2631 }, { "epoch": 2.4991692380726325, "grad_norm": 0.04327033832669258, "learning_rate": 0.0001000298346876257, "loss": 0.2627, "step": 2632 }, { "epoch": 2.5001186802753383, "grad_norm": 0.04332072660326958, "learning_rate": 9.997016531237432e-05, "loss": 0.2722, "step": 2633 }, { "epoch": 2.501068122478044, "grad_norm": 0.047892216593027115, "learning_rate": 9.991049594774543e-05, "loss": 0.264, "step": 2634 }, { "epoch": 2.50201756468075, "grad_norm": 0.04232776165008545, "learning_rate": 9.985082661498383e-05, "loss": 0.2598, "step": 2635 }, { "epoch": 2.502967006883456, "grad_norm": 0.0745973065495491, "learning_rate": 9.979115733533449e-05, "loss": 0.2631, "step": 2636 }, { "epoch": 2.503916449086162, "grad_norm": 0.04066299647092819, "learning_rate": 9.973148813004216e-05, "loss": 0.2542, "step": 2637 }, { "epoch": 2.5048658912888677, "grad_norm": 0.10445679724216461, "learning_rate": 9.96718190203517e-05, "loss": 0.2801, "step": 2638 }, { "epoch": 2.5058153334915736, "grad_norm": 0.04454142972826958, "learning_rate": 9.961215002750799e-05, "loss": 0.2698, "step": 2639 }, { "epoch": 2.5067647756942795, "grad_norm": 0.03955570235848427, "learning_rate": 9.955248117275566e-05, "loss": 0.2588, "step": 2640 }, { "epoch": 2.507714217896986, "grad_norm": 0.042795125395059586, "learning_rate": 9.949281247733952e-05, "loss": 0.2619, "step": 2641 }, { "epoch": 2.5086636600996917, "grad_norm": 0.045542147010564804, "learning_rate": 9.943314396250413e-05, "loss": 0.2599, "step": 2642 }, { "epoch": 2.5096131023023975, "grad_norm": 0.04280371963977814, "learning_rate": 9.937347564949413e-05, "loss": 0.2557, "step": 2643 }, { "epoch": 2.5105625445051034, "grad_norm": 0.04497581720352173, "learning_rate": 9.931380755955398e-05, "loss": 0.2683, "step": 2644 }, { "epoch": 2.5115119867078093, "grad_norm": 0.05084951967000961, "learning_rate": 9.925413971392815e-05, "loss": 0.2593, "step": 2645 }, { "epoch": 2.512461428910515, "grad_norm": 0.04743589088320732, "learning_rate": 9.919447213386103e-05, "loss": 0.2686, "step": 2646 }, { "epoch": 2.513410871113221, "grad_norm": 0.051552269607782364, "learning_rate": 9.913480484059676e-05, "loss": 0.2576, "step": 2647 }, { "epoch": 2.514360313315927, "grad_norm": 0.05527034029364586, "learning_rate": 9.907513785537957e-05, "loss": 0.2859, "step": 2648 }, { "epoch": 2.515309755518633, "grad_norm": 0.05772867426276207, "learning_rate": 9.901547119945345e-05, "loss": 0.2871, "step": 2649 }, { "epoch": 2.5162591977213387, "grad_norm": 0.07210524380207062, "learning_rate": 9.89558048940624e-05, "loss": 0.2582, "step": 2650 }, { "epoch": 2.5172086399240445, "grad_norm": 0.053755562752485275, "learning_rate": 9.889613896045012e-05, "loss": 0.2677, "step": 2651 }, { "epoch": 2.5181580821267504, "grad_norm": 0.06051605939865112, "learning_rate": 9.883647341986032e-05, "loss": 0.2792, "step": 2652 }, { "epoch": 2.5191075243294563, "grad_norm": 0.04284793883562088, "learning_rate": 9.87768082935365e-05, "loss": 0.2507, "step": 2653 }, { "epoch": 2.520056966532162, "grad_norm": 0.04125799611210823, "learning_rate": 9.871714360272208e-05, "loss": 0.258, "step": 2654 }, { "epoch": 2.521006408734868, "grad_norm": 0.04322294518351555, "learning_rate": 9.865747936866027e-05, "loss": 0.2678, "step": 2655 }, { "epoch": 2.521955850937574, "grad_norm": 0.03939296305179596, "learning_rate": 9.85978156125941e-05, "loss": 0.254, "step": 2656 }, { "epoch": 2.5229052931402802, "grad_norm": 0.0440739206969738, "learning_rate": 9.853815235576648e-05, "loss": 0.2542, "step": 2657 }, { "epoch": 2.523854735342986, "grad_norm": 0.04515873268246651, "learning_rate": 9.847848961942008e-05, "loss": 0.2556, "step": 2658 }, { "epoch": 2.524804177545692, "grad_norm": 0.044637441635131836, "learning_rate": 9.841882742479753e-05, "loss": 0.2648, "step": 2659 }, { "epoch": 2.525753619748398, "grad_norm": 0.04760422930121422, "learning_rate": 9.835916579314105e-05, "loss": 0.2581, "step": 2660 }, { "epoch": 2.5267030619511037, "grad_norm": 0.09039561450481415, "learning_rate": 9.829950474569281e-05, "loss": 0.2744, "step": 2661 }, { "epoch": 2.5276525041538096, "grad_norm": 0.04204951971769333, "learning_rate": 9.823984430369477e-05, "loss": 0.2474, "step": 2662 }, { "epoch": 2.5286019463565155, "grad_norm": 0.04530685022473335, "learning_rate": 9.818018448838855e-05, "loss": 0.2687, "step": 2663 }, { "epoch": 2.5295513885592213, "grad_norm": 0.05104728043079376, "learning_rate": 9.812052532101578e-05, "loss": 0.2701, "step": 2664 }, { "epoch": 2.5305008307619272, "grad_norm": 0.03922882676124573, "learning_rate": 9.806086682281758e-05, "loss": 0.2582, "step": 2665 }, { "epoch": 2.5314502729646335, "grad_norm": 0.043734561651945114, "learning_rate": 9.800120901503503e-05, "loss": 0.2689, "step": 2666 }, { "epoch": 2.5323997151673394, "grad_norm": 0.043197888880968094, "learning_rate": 9.794155191890885e-05, "loss": 0.2589, "step": 2667 }, { "epoch": 2.5333491573700453, "grad_norm": 0.041970640420913696, "learning_rate": 9.788189555567966e-05, "loss": 0.2527, "step": 2668 }, { "epoch": 2.534298599572751, "grad_norm": 0.04040301963686943, "learning_rate": 9.78222399465876e-05, "loss": 0.2578, "step": 2669 }, { "epoch": 2.535248041775457, "grad_norm": 0.041597891598939896, "learning_rate": 9.776258511287271e-05, "loss": 0.2593, "step": 2670 }, { "epoch": 2.536197483978163, "grad_norm": 0.05185459926724434, "learning_rate": 9.770293107577471e-05, "loss": 0.2746, "step": 2671 }, { "epoch": 2.537146926180869, "grad_norm": 0.03791874647140503, "learning_rate": 9.764327785653302e-05, "loss": 0.257, "step": 2672 }, { "epoch": 2.5380963683835747, "grad_norm": 0.04940661042928696, "learning_rate": 9.75836254763868e-05, "loss": 0.2589, "step": 2673 }, { "epoch": 2.5390458105862805, "grad_norm": 0.040791600942611694, "learning_rate": 9.752397395657482e-05, "loss": 0.2616, "step": 2674 }, { "epoch": 2.5399952527889864, "grad_norm": 0.0469420962035656, "learning_rate": 9.746432331833569e-05, "loss": 0.2655, "step": 2675 }, { "epoch": 2.5409446949916923, "grad_norm": 0.046912360936403275, "learning_rate": 9.740467358290755e-05, "loss": 0.2618, "step": 2676 }, { "epoch": 2.541894137194398, "grad_norm": 0.043544989079236984, "learning_rate": 9.734502477152841e-05, "loss": 0.2634, "step": 2677 }, { "epoch": 2.542843579397104, "grad_norm": 0.050445009022951126, "learning_rate": 9.728537690543572e-05, "loss": 0.2671, "step": 2678 }, { "epoch": 2.54379302159981, "grad_norm": 0.07378991693258286, "learning_rate": 9.722573000586676e-05, "loss": 0.2881, "step": 2679 }, { "epoch": 2.544742463802516, "grad_norm": 0.03695583716034889, "learning_rate": 9.716608409405842e-05, "loss": 0.2499, "step": 2680 }, { "epoch": 2.5456919060052217, "grad_norm": 0.05563429370522499, "learning_rate": 9.710643919124723e-05, "loss": 0.2725, "step": 2681 }, { "epoch": 2.546641348207928, "grad_norm": 0.04267679899930954, "learning_rate": 9.704679531866941e-05, "loss": 0.2511, "step": 2682 }, { "epoch": 2.547590790410634, "grad_norm": 0.04398363456130028, "learning_rate": 9.698715249756067e-05, "loss": 0.2644, "step": 2683 }, { "epoch": 2.5485402326133397, "grad_norm": 0.04224398359656334, "learning_rate": 9.692751074915653e-05, "loss": 0.2566, "step": 2684 }, { "epoch": 2.5494896748160456, "grad_norm": 0.039920974522829056, "learning_rate": 9.6867870094692e-05, "loss": 0.2487, "step": 2685 }, { "epoch": 2.5504391170187515, "grad_norm": 0.05325039476156235, "learning_rate": 9.680823055540174e-05, "loss": 0.2793, "step": 2686 }, { "epoch": 2.5513885592214574, "grad_norm": 0.05420837551355362, "learning_rate": 9.67485921525201e-05, "loss": 0.2778, "step": 2687 }, { "epoch": 2.5523380014241632, "grad_norm": 0.043418120592832565, "learning_rate": 9.668895490728082e-05, "loss": 0.27, "step": 2688 }, { "epoch": 2.553287443626869, "grad_norm": 0.04475219547748566, "learning_rate": 9.662931884091741e-05, "loss": 0.2699, "step": 2689 }, { "epoch": 2.554236885829575, "grad_norm": 0.047918371856212616, "learning_rate": 9.656968397466291e-05, "loss": 0.2694, "step": 2690 }, { "epoch": 2.5551863280322813, "grad_norm": 0.0412350669503212, "learning_rate": 9.651005032974994e-05, "loss": 0.2625, "step": 2691 }, { "epoch": 2.556135770234987, "grad_norm": 0.0425032414495945, "learning_rate": 9.645041792741057e-05, "loss": 0.2581, "step": 2692 }, { "epoch": 2.557085212437693, "grad_norm": 0.0597444623708725, "learning_rate": 9.639078678887665e-05, "loss": 0.2771, "step": 2693 }, { "epoch": 2.558034654640399, "grad_norm": 0.05487224459648132, "learning_rate": 9.633115693537935e-05, "loss": 0.2642, "step": 2694 }, { "epoch": 2.558984096843105, "grad_norm": 0.04886789247393608, "learning_rate": 9.627152838814953e-05, "loss": 0.2695, "step": 2695 }, { "epoch": 2.5599335390458107, "grad_norm": 0.042438726872205734, "learning_rate": 9.62119011684176e-05, "loss": 0.2689, "step": 2696 }, { "epoch": 2.5608829812485165, "grad_norm": 0.04784049838781357, "learning_rate": 9.615227529741335e-05, "loss": 0.2725, "step": 2697 }, { "epoch": 2.5618324234512224, "grad_norm": 0.08410549908876419, "learning_rate": 9.609265079636623e-05, "loss": 0.2688, "step": 2698 }, { "epoch": 2.5627818656539283, "grad_norm": 0.03873563930392265, "learning_rate": 9.603302768650513e-05, "loss": 0.2621, "step": 2699 }, { "epoch": 2.563731307856634, "grad_norm": 0.12595133483409882, "learning_rate": 9.597340598905852e-05, "loss": 0.2718, "step": 2700 }, { "epoch": 2.56468075005934, "grad_norm": 0.041138943284749985, "learning_rate": 9.591378572525422e-05, "loss": 0.2611, "step": 2701 }, { "epoch": 2.565630192262046, "grad_norm": 0.05437808111310005, "learning_rate": 9.585416691631968e-05, "loss": 0.2836, "step": 2702 }, { "epoch": 2.566579634464752, "grad_norm": 0.04374735429883003, "learning_rate": 9.57945495834818e-05, "loss": 0.2672, "step": 2703 }, { "epoch": 2.5675290766674577, "grad_norm": 0.04583863914012909, "learning_rate": 9.573493374796693e-05, "loss": 0.2499, "step": 2704 }, { "epoch": 2.5684785188701635, "grad_norm": 0.0510207898914814, "learning_rate": 9.567531943100093e-05, "loss": 0.2767, "step": 2705 }, { "epoch": 2.5694279610728694, "grad_norm": 0.0616254098713398, "learning_rate": 9.561570665380901e-05, "loss": 0.2633, "step": 2706 }, { "epoch": 2.5703774032755757, "grad_norm": 0.044144704937934875, "learning_rate": 9.555609543761597e-05, "loss": 0.2559, "step": 2707 }, { "epoch": 2.5713268454782816, "grad_norm": 0.051671102643013, "learning_rate": 9.549648580364595e-05, "loss": 0.2632, "step": 2708 }, { "epoch": 2.5722762876809875, "grad_norm": 0.05204401910305023, "learning_rate": 9.543687777312263e-05, "loss": 0.2675, "step": 2709 }, { "epoch": 2.5732257298836934, "grad_norm": 0.07156354933977127, "learning_rate": 9.537727136726898e-05, "loss": 0.2718, "step": 2710 }, { "epoch": 2.5741751720863992, "grad_norm": 0.05089721456170082, "learning_rate": 9.531766660730752e-05, "loss": 0.2683, "step": 2711 }, { "epoch": 2.575124614289105, "grad_norm": 0.04808243736624718, "learning_rate": 9.525806351446013e-05, "loss": 0.2496, "step": 2712 }, { "epoch": 2.576074056491811, "grad_norm": 0.06348177045583725, "learning_rate": 9.519846210994806e-05, "loss": 0.2876, "step": 2713 }, { "epoch": 2.577023498694517, "grad_norm": 0.06046038866043091, "learning_rate": 9.513886241499209e-05, "loss": 0.2763, "step": 2714 }, { "epoch": 2.5779729408972227, "grad_norm": 0.04020876809954643, "learning_rate": 9.507926445081219e-05, "loss": 0.2655, "step": 2715 }, { "epoch": 2.578922383099929, "grad_norm": 0.05419212952256203, "learning_rate": 9.50196682386279e-05, "loss": 0.2615, "step": 2716 }, { "epoch": 2.579871825302635, "grad_norm": 0.056060582399368286, "learning_rate": 9.496007379965801e-05, "loss": 0.2801, "step": 2717 }, { "epoch": 2.580821267505341, "grad_norm": 0.05008630454540253, "learning_rate": 9.490048115512074e-05, "loss": 0.2834, "step": 2718 }, { "epoch": 2.5817707097080467, "grad_norm": 0.040566056966781616, "learning_rate": 9.484089032623374e-05, "loss": 0.2591, "step": 2719 }, { "epoch": 2.5827201519107525, "grad_norm": 0.04528141766786575, "learning_rate": 9.47813013342138e-05, "loss": 0.2619, "step": 2720 }, { "epoch": 2.5836695941134584, "grad_norm": 0.041971296072006226, "learning_rate": 9.47217142002773e-05, "loss": 0.2645, "step": 2721 }, { "epoch": 2.5846190363161643, "grad_norm": 0.047894254326820374, "learning_rate": 9.466212894563977e-05, "loss": 0.2742, "step": 2722 }, { "epoch": 2.58556847851887, "grad_norm": 0.0397222638130188, "learning_rate": 9.460254559151622e-05, "loss": 0.2553, "step": 2723 }, { "epoch": 2.586517920721576, "grad_norm": 0.04544184356927872, "learning_rate": 9.454296415912085e-05, "loss": 0.2614, "step": 2724 }, { "epoch": 2.587467362924282, "grad_norm": 0.04659304767847061, "learning_rate": 9.448338466966726e-05, "loss": 0.2764, "step": 2725 }, { "epoch": 2.588416805126988, "grad_norm": 0.03721113130450249, "learning_rate": 9.442380714436834e-05, "loss": 0.2489, "step": 2726 }, { "epoch": 2.5893662473296937, "grad_norm": 0.08494356274604797, "learning_rate": 9.436423160443625e-05, "loss": 0.2695, "step": 2727 }, { "epoch": 2.5903156895323995, "grad_norm": 0.04514550790190697, "learning_rate": 9.430465807108255e-05, "loss": 0.271, "step": 2728 }, { "epoch": 2.5912651317351054, "grad_norm": 0.044937096536159515, "learning_rate": 9.42450865655179e-05, "loss": 0.265, "step": 2729 }, { "epoch": 2.5922145739378113, "grad_norm": 0.04702681675553322, "learning_rate": 9.418551710895243e-05, "loss": 0.2615, "step": 2730 }, { "epoch": 2.593164016140517, "grad_norm": 0.0370311513543129, "learning_rate": 9.412594972259539e-05, "loss": 0.2461, "step": 2731 }, { "epoch": 2.5941134583432235, "grad_norm": 0.03818555921316147, "learning_rate": 9.406638442765542e-05, "loss": 0.2596, "step": 2732 }, { "epoch": 2.5950629005459294, "grad_norm": 0.04003351554274559, "learning_rate": 9.400682124534027e-05, "loss": 0.2651, "step": 2733 }, { "epoch": 2.5960123427486352, "grad_norm": 0.05568593740463257, "learning_rate": 9.394726019685706e-05, "loss": 0.2783, "step": 2734 }, { "epoch": 2.596961784951341, "grad_norm": 0.1012004017829895, "learning_rate": 9.388770130341217e-05, "loss": 0.3044, "step": 2735 }, { "epoch": 2.597911227154047, "grad_norm": 0.03857073932886124, "learning_rate": 9.382814458621106e-05, "loss": 0.2567, "step": 2736 }, { "epoch": 2.598860669356753, "grad_norm": 0.041371818631887436, "learning_rate": 9.376859006645859e-05, "loss": 0.2647, "step": 2737 }, { "epoch": 2.5998101115594587, "grad_norm": 0.043267469853162766, "learning_rate": 9.370903776535865e-05, "loss": 0.2643, "step": 2738 }, { "epoch": 2.6007595537621646, "grad_norm": 0.04230300337076187, "learning_rate": 9.364948770411456e-05, "loss": 0.2575, "step": 2739 }, { "epoch": 2.6017089959648705, "grad_norm": 0.06302177906036377, "learning_rate": 9.358993990392864e-05, "loss": 0.2686, "step": 2740 }, { "epoch": 2.602658438167577, "grad_norm": 0.04174618795514107, "learning_rate": 9.353039438600257e-05, "loss": 0.2623, "step": 2741 }, { "epoch": 2.6036078803702827, "grad_norm": 0.07566291093826294, "learning_rate": 9.347085117153707e-05, "loss": 0.2529, "step": 2742 }, { "epoch": 2.6045573225729886, "grad_norm": 0.036149147897958755, "learning_rate": 9.341131028173214e-05, "loss": 0.2588, "step": 2743 }, { "epoch": 2.6055067647756944, "grad_norm": 0.03942830488085747, "learning_rate": 9.335177173778695e-05, "loss": 0.2507, "step": 2744 }, { "epoch": 2.6064562069784003, "grad_norm": 0.04223598912358284, "learning_rate": 9.329223556089975e-05, "loss": 0.2633, "step": 2745 }, { "epoch": 2.607405649181106, "grad_norm": 0.04097060486674309, "learning_rate": 9.32327017722681e-05, "loss": 0.2551, "step": 2746 }, { "epoch": 2.608355091383812, "grad_norm": 0.045439936220645905, "learning_rate": 9.317317039308848e-05, "loss": 0.266, "step": 2747 }, { "epoch": 2.609304533586518, "grad_norm": 0.047136351466178894, "learning_rate": 9.311364144455679e-05, "loss": 0.2593, "step": 2748 }, { "epoch": 2.610253975789224, "grad_norm": 0.04558572918176651, "learning_rate": 9.305411494786779e-05, "loss": 0.2715, "step": 2749 }, { "epoch": 2.6112034179919297, "grad_norm": 0.06827898323535919, "learning_rate": 9.299459092421558e-05, "loss": 0.2836, "step": 2750 }, { "epoch": 2.6121528601946356, "grad_norm": 0.0461437962949276, "learning_rate": 9.293506939479325e-05, "loss": 0.2716, "step": 2751 }, { "epoch": 2.6131023023973414, "grad_norm": 0.06343681365251541, "learning_rate": 9.287555038079309e-05, "loss": 0.2752, "step": 2752 }, { "epoch": 2.6140517446000473, "grad_norm": 0.038738228380680084, "learning_rate": 9.281603390340648e-05, "loss": 0.2575, "step": 2753 }, { "epoch": 2.615001186802753, "grad_norm": 0.04323815181851387, "learning_rate": 9.275651998382377e-05, "loss": 0.2642, "step": 2754 }, { "epoch": 2.615950629005459, "grad_norm": 0.04197486490011215, "learning_rate": 9.26970086432346e-05, "loss": 0.261, "step": 2755 }, { "epoch": 2.6169000712081654, "grad_norm": 0.04667133465409279, "learning_rate": 9.263749990282754e-05, "loss": 0.2699, "step": 2756 }, { "epoch": 2.6178495134108712, "grad_norm": 0.06833603978157043, "learning_rate": 9.257799378379032e-05, "loss": 0.2754, "step": 2757 }, { "epoch": 2.618798955613577, "grad_norm": 0.04214934632182121, "learning_rate": 9.251849030730964e-05, "loss": 0.2579, "step": 2758 }, { "epoch": 2.619748397816283, "grad_norm": 0.1015283465385437, "learning_rate": 9.245898949457139e-05, "loss": 0.2642, "step": 2759 }, { "epoch": 2.620697840018989, "grad_norm": 0.08113419264554977, "learning_rate": 9.239949136676041e-05, "loss": 0.2616, "step": 2760 }, { "epoch": 2.6216472822216947, "grad_norm": 0.04192844033241272, "learning_rate": 9.233999594506063e-05, "loss": 0.2628, "step": 2761 }, { "epoch": 2.6225967244244006, "grad_norm": 0.045566219836473465, "learning_rate": 9.228050325065503e-05, "loss": 0.2687, "step": 2762 }, { "epoch": 2.6235461666271065, "grad_norm": 0.08133133500814438, "learning_rate": 9.222101330472552e-05, "loss": 0.2685, "step": 2763 }, { "epoch": 2.6244956088298124, "grad_norm": 0.043140675872564316, "learning_rate": 9.216152612845318e-05, "loss": 0.2577, "step": 2764 }, { "epoch": 2.6254450510325182, "grad_norm": 0.047479353845119476, "learning_rate": 9.210204174301796e-05, "loss": 0.2658, "step": 2765 }, { "epoch": 2.6263944932352246, "grad_norm": 0.041498858481645584, "learning_rate": 9.204256016959898e-05, "loss": 0.2435, "step": 2766 }, { "epoch": 2.6273439354379304, "grad_norm": 0.06182354688644409, "learning_rate": 9.198308142937415e-05, "loss": 0.2795, "step": 2767 }, { "epoch": 2.6282933776406363, "grad_norm": 0.13252820074558258, "learning_rate": 9.192360554352055e-05, "loss": 0.2846, "step": 2768 }, { "epoch": 2.629242819843342, "grad_norm": 0.04402116686105728, "learning_rate": 9.186413253321418e-05, "loss": 0.2558, "step": 2769 }, { "epoch": 2.630192262046048, "grad_norm": 0.05568404123187065, "learning_rate": 9.180466241962999e-05, "loss": 0.2732, "step": 2770 }, { "epoch": 2.631141704248754, "grad_norm": 0.04584593325853348, "learning_rate": 9.174519522394198e-05, "loss": 0.2588, "step": 2771 }, { "epoch": 2.63209114645146, "grad_norm": 0.06789962947368622, "learning_rate": 9.168573096732297e-05, "loss": 0.2826, "step": 2772 }, { "epoch": 2.6330405886541657, "grad_norm": 0.05142885819077492, "learning_rate": 9.162626967094487e-05, "loss": 0.2692, "step": 2773 }, { "epoch": 2.6339900308568716, "grad_norm": 0.04692273959517479, "learning_rate": 9.156681135597847e-05, "loss": 0.2583, "step": 2774 }, { "epoch": 2.6349394730595774, "grad_norm": 0.05026277154684067, "learning_rate": 9.150735604359351e-05, "loss": 0.2655, "step": 2775 }, { "epoch": 2.6358889152622833, "grad_norm": 0.062294721603393555, "learning_rate": 9.144790375495871e-05, "loss": 0.2696, "step": 2776 }, { "epoch": 2.636838357464989, "grad_norm": 0.038956400007009506, "learning_rate": 9.138845451124158e-05, "loss": 0.2477, "step": 2777 }, { "epoch": 2.637787799667695, "grad_norm": 0.1065063551068306, "learning_rate": 9.132900833360871e-05, "loss": 0.251, "step": 2778 }, { "epoch": 2.638737241870401, "grad_norm": 0.05274252966046333, "learning_rate": 9.126956524322547e-05, "loss": 0.2849, "step": 2779 }, { "epoch": 2.639686684073107, "grad_norm": 0.04132537916302681, "learning_rate": 9.121012526125626e-05, "loss": 0.2652, "step": 2780 }, { "epoch": 2.640636126275813, "grad_norm": 0.06332124769687653, "learning_rate": 9.115068840886417e-05, "loss": 0.2764, "step": 2781 }, { "epoch": 2.641585568478519, "grad_norm": 0.04629790782928467, "learning_rate": 9.10912547072114e-05, "loss": 0.2632, "step": 2782 }, { "epoch": 2.642535010681225, "grad_norm": 0.05298149958252907, "learning_rate": 9.103182417745888e-05, "loss": 0.2641, "step": 2783 }, { "epoch": 2.6434844528839307, "grad_norm": 0.039357319474220276, "learning_rate": 9.097239684076649e-05, "loss": 0.2529, "step": 2784 }, { "epoch": 2.6444338950866366, "grad_norm": 0.05280270054936409, "learning_rate": 9.091297271829296e-05, "loss": 0.2716, "step": 2785 }, { "epoch": 2.6453833372893425, "grad_norm": 0.04004097357392311, "learning_rate": 9.085355183119579e-05, "loss": 0.2507, "step": 2786 }, { "epoch": 2.6463327794920484, "grad_norm": 0.039364296942949295, "learning_rate": 9.079413420063147e-05, "loss": 0.2649, "step": 2787 }, { "epoch": 2.6472822216947542, "grad_norm": 0.040332429111003876, "learning_rate": 9.073471984775519e-05, "loss": 0.266, "step": 2788 }, { "epoch": 2.64823166389746, "grad_norm": 0.0906134694814682, "learning_rate": 9.067530879372111e-05, "loss": 0.2763, "step": 2789 }, { "epoch": 2.649181106100166, "grad_norm": 0.03673803433775902, "learning_rate": 9.061590105968208e-05, "loss": 0.2585, "step": 2790 }, { "epoch": 2.6501305483028723, "grad_norm": 0.03839144855737686, "learning_rate": 9.055649666678987e-05, "loss": 0.2607, "step": 2791 }, { "epoch": 2.651079990505578, "grad_norm": 0.04190480336546898, "learning_rate": 9.049709563619503e-05, "loss": 0.2703, "step": 2792 }, { "epoch": 2.652029432708284, "grad_norm": 0.041215989738702774, "learning_rate": 9.043769798904689e-05, "loss": 0.2639, "step": 2793 }, { "epoch": 2.65297887491099, "grad_norm": 0.04381866008043289, "learning_rate": 9.037830374649364e-05, "loss": 0.2626, "step": 2794 }, { "epoch": 2.653928317113696, "grad_norm": 0.03813505917787552, "learning_rate": 9.03189129296821e-05, "loss": 0.2556, "step": 2795 }, { "epoch": 2.6548777593164017, "grad_norm": 0.03909599408507347, "learning_rate": 9.025952555975808e-05, "loss": 0.2465, "step": 2796 }, { "epoch": 2.6558272015191076, "grad_norm": 0.04648647457361221, "learning_rate": 9.020014165786602e-05, "loss": 0.2682, "step": 2797 }, { "epoch": 2.6567766437218134, "grad_norm": 0.041546739637851715, "learning_rate": 9.014076124514922e-05, "loss": 0.2696, "step": 2798 }, { "epoch": 2.6577260859245193, "grad_norm": 0.03989914432168007, "learning_rate": 9.00813843427496e-05, "loss": 0.2628, "step": 2799 }, { "epoch": 2.658675528127225, "grad_norm": 0.04056751728057861, "learning_rate": 9.002201097180796e-05, "loss": 0.2635, "step": 2800 }, { "epoch": 2.659624970329931, "grad_norm": 0.058337803930044174, "learning_rate": 8.996264115346382e-05, "loss": 0.2656, "step": 2801 }, { "epoch": 2.660574412532637, "grad_norm": 0.038185685873031616, "learning_rate": 8.990327490885537e-05, "loss": 0.2566, "step": 2802 }, { "epoch": 2.661523854735343, "grad_norm": 0.045001666992902756, "learning_rate": 8.984391225911966e-05, "loss": 0.2669, "step": 2803 }, { "epoch": 2.6624732969380487, "grad_norm": 0.04217304661870003, "learning_rate": 8.978455322539225e-05, "loss": 0.2605, "step": 2804 }, { "epoch": 2.6634227391407546, "grad_norm": 0.03670491650700569, "learning_rate": 8.97251978288076e-05, "loss": 0.2588, "step": 2805 }, { "epoch": 2.664372181343461, "grad_norm": 0.03951079025864601, "learning_rate": 8.96658460904988e-05, "loss": 0.2562, "step": 2806 }, { "epoch": 2.6653216235461668, "grad_norm": 0.062239449471235275, "learning_rate": 8.960649803159765e-05, "loss": 0.2726, "step": 2807 }, { "epoch": 2.6662710657488726, "grad_norm": 0.04627174511551857, "learning_rate": 8.954715367323468e-05, "loss": 0.267, "step": 2808 }, { "epoch": 2.6672205079515785, "grad_norm": 0.038264404982328415, "learning_rate": 8.948781303653896e-05, "loss": 0.2558, "step": 2809 }, { "epoch": 2.6681699501542844, "grad_norm": 0.05796501412987709, "learning_rate": 8.94284761426384e-05, "loss": 0.2663, "step": 2810 }, { "epoch": 2.6691193923569903, "grad_norm": 0.07112397253513336, "learning_rate": 8.93691430126595e-05, "loss": 0.2607, "step": 2811 }, { "epoch": 2.670068834559696, "grad_norm": 0.042541395872831345, "learning_rate": 8.930981366772746e-05, "loss": 0.251, "step": 2812 }, { "epoch": 2.671018276762402, "grad_norm": 0.04384302347898483, "learning_rate": 8.925048812896605e-05, "loss": 0.2618, "step": 2813 }, { "epoch": 2.671967718965108, "grad_norm": 0.04728015884757042, "learning_rate": 8.919116641749776e-05, "loss": 0.2611, "step": 2814 }, { "epoch": 2.6729171611678137, "grad_norm": 0.04660949110984802, "learning_rate": 8.91318485544437e-05, "loss": 0.2575, "step": 2815 }, { "epoch": 2.67386660337052, "grad_norm": 0.04812900722026825, "learning_rate": 8.907253456092359e-05, "loss": 0.2563, "step": 2816 }, { "epoch": 2.674816045573226, "grad_norm": 0.05315341800451279, "learning_rate": 8.901322445805586e-05, "loss": 0.2801, "step": 2817 }, { "epoch": 2.675765487775932, "grad_norm": 0.07543495297431946, "learning_rate": 8.895391826695737e-05, "loss": 0.2603, "step": 2818 }, { "epoch": 2.6767149299786377, "grad_norm": 0.047977060079574585, "learning_rate": 8.889461600874378e-05, "loss": 0.2663, "step": 2819 }, { "epoch": 2.6776643721813436, "grad_norm": 0.04339151456952095, "learning_rate": 8.883531770452923e-05, "loss": 0.258, "step": 2820 }, { "epoch": 2.6786138143840494, "grad_norm": 0.04932010546326637, "learning_rate": 8.877602337542655e-05, "loss": 0.2543, "step": 2821 }, { "epoch": 2.6795632565867553, "grad_norm": 0.04265378788113594, "learning_rate": 8.8716733042547e-05, "loss": 0.2642, "step": 2822 }, { "epoch": 2.680512698789461, "grad_norm": 0.06549558788537979, "learning_rate": 8.86574467270006e-05, "loss": 0.275, "step": 2823 }, { "epoch": 2.681462140992167, "grad_norm": 0.042048145085573196, "learning_rate": 8.85981644498958e-05, "loss": 0.264, "step": 2824 }, { "epoch": 2.682411583194873, "grad_norm": 0.03816340118646622, "learning_rate": 8.853888623233967e-05, "loss": 0.2575, "step": 2825 }, { "epoch": 2.683361025397579, "grad_norm": 0.04002130404114723, "learning_rate": 8.84796120954379e-05, "loss": 0.2635, "step": 2826 }, { "epoch": 2.6843104676002847, "grad_norm": 0.04486146941781044, "learning_rate": 8.842034206029456e-05, "loss": 0.256, "step": 2827 }, { "epoch": 2.6852599098029906, "grad_norm": 0.037432510405778885, "learning_rate": 8.836107614801243e-05, "loss": 0.2508, "step": 2828 }, { "epoch": 2.6862093520056964, "grad_norm": 0.06960198283195496, "learning_rate": 8.830181437969269e-05, "loss": 0.2759, "step": 2829 }, { "epoch": 2.6871587942084023, "grad_norm": 0.04306569695472717, "learning_rate": 8.824255677643518e-05, "loss": 0.2545, "step": 2830 }, { "epoch": 2.6881082364111086, "grad_norm": 0.0954747125506401, "learning_rate": 8.818330335933809e-05, "loss": 0.2554, "step": 2831 }, { "epoch": 2.6890576786138145, "grad_norm": 0.047692351043224335, "learning_rate": 8.812405414949825e-05, "loss": 0.2569, "step": 2832 }, { "epoch": 2.6900071208165204, "grad_norm": 0.04142098128795624, "learning_rate": 8.806480916801099e-05, "loss": 0.2584, "step": 2833 }, { "epoch": 2.6909565630192263, "grad_norm": 0.039575859904289246, "learning_rate": 8.800556843597002e-05, "loss": 0.2518, "step": 2834 }, { "epoch": 2.691906005221932, "grad_norm": 0.07023467123508453, "learning_rate": 8.79463319744677e-05, "loss": 0.263, "step": 2835 }, { "epoch": 2.692855447424638, "grad_norm": 0.07248935103416443, "learning_rate": 8.788709980459472e-05, "loss": 0.2771, "step": 2836 }, { "epoch": 2.693804889627344, "grad_norm": 0.07990462332963943, "learning_rate": 8.782787194744033e-05, "loss": 0.2648, "step": 2837 }, { "epoch": 2.6947543318300498, "grad_norm": 0.05330291762948036, "learning_rate": 8.77686484240922e-05, "loss": 0.2738, "step": 2838 }, { "epoch": 2.6957037740327556, "grad_norm": 0.07456424087285995, "learning_rate": 8.770942925563654e-05, "loss": 0.2845, "step": 2839 }, { "epoch": 2.6966532162354615, "grad_norm": 0.04465353116393089, "learning_rate": 8.765021446315785e-05, "loss": 0.2571, "step": 2840 }, { "epoch": 2.697602658438168, "grad_norm": 0.03848935291171074, "learning_rate": 8.75910040677392e-05, "loss": 0.2521, "step": 2841 }, { "epoch": 2.6985521006408737, "grad_norm": 0.05211983248591423, "learning_rate": 8.753179809046211e-05, "loss": 0.2553, "step": 2842 }, { "epoch": 2.6995015428435796, "grad_norm": 0.04083755984902382, "learning_rate": 8.747259655240642e-05, "loss": 0.26, "step": 2843 }, { "epoch": 2.7004509850462854, "grad_norm": 0.04455624893307686, "learning_rate": 8.741339947465054e-05, "loss": 0.2516, "step": 2844 }, { "epoch": 2.7014004272489913, "grad_norm": 0.04386875405907631, "learning_rate": 8.735420687827107e-05, "loss": 0.2632, "step": 2845 }, { "epoch": 2.702349869451697, "grad_norm": 0.044795092195272446, "learning_rate": 8.729501878434325e-05, "loss": 0.2694, "step": 2846 }, { "epoch": 2.703299311654403, "grad_norm": 0.045176975429058075, "learning_rate": 8.723583521394054e-05, "loss": 0.2647, "step": 2847 }, { "epoch": 2.704248753857109, "grad_norm": 0.044440098106861115, "learning_rate": 8.717665618813491e-05, "loss": 0.257, "step": 2848 }, { "epoch": 2.705198196059815, "grad_norm": 0.04165451228618622, "learning_rate": 8.711748172799667e-05, "loss": 0.252, "step": 2849 }, { "epoch": 2.7061476382625207, "grad_norm": 0.04421938210725784, "learning_rate": 8.705831185459445e-05, "loss": 0.2784, "step": 2850 }, { "epoch": 2.7070970804652266, "grad_norm": 0.05041582137346268, "learning_rate": 8.699914658899535e-05, "loss": 0.267, "step": 2851 }, { "epoch": 2.7080465226679324, "grad_norm": 0.061946917325258255, "learning_rate": 8.693998595226473e-05, "loss": 0.2769, "step": 2852 }, { "epoch": 2.7089959648706383, "grad_norm": 0.06854099780321121, "learning_rate": 8.68808299654664e-05, "loss": 0.2858, "step": 2853 }, { "epoch": 2.709945407073344, "grad_norm": 0.04000728577375412, "learning_rate": 8.682167864966236e-05, "loss": 0.2626, "step": 2854 }, { "epoch": 2.71089484927605, "grad_norm": 0.047487881034612656, "learning_rate": 8.676253202591317e-05, "loss": 0.2628, "step": 2855 }, { "epoch": 2.7118442914787564, "grad_norm": 0.038573890924453735, "learning_rate": 8.670339011527748e-05, "loss": 0.2601, "step": 2856 }, { "epoch": 2.7127937336814623, "grad_norm": 0.04861799627542496, "learning_rate": 8.664425293881247e-05, "loss": 0.2686, "step": 2857 }, { "epoch": 2.713743175884168, "grad_norm": 0.042451322078704834, "learning_rate": 8.658512051757353e-05, "loss": 0.2652, "step": 2858 }, { "epoch": 2.714692618086874, "grad_norm": 0.044802404940128326, "learning_rate": 8.652599287261431e-05, "loss": 0.2687, "step": 2859 }, { "epoch": 2.71564206028958, "grad_norm": 0.0430615097284317, "learning_rate": 8.646687002498692e-05, "loss": 0.2524, "step": 2860 }, { "epoch": 2.7165915024922858, "grad_norm": 0.04607314616441727, "learning_rate": 8.640775199574154e-05, "loss": 0.2753, "step": 2861 }, { "epoch": 2.7175409446949916, "grad_norm": 0.04442737251520157, "learning_rate": 8.634863880592686e-05, "loss": 0.2571, "step": 2862 }, { "epoch": 2.7184903868976975, "grad_norm": 0.0405968502163887, "learning_rate": 8.628953047658967e-05, "loss": 0.2634, "step": 2863 }, { "epoch": 2.7194398291004034, "grad_norm": 0.04561749845743179, "learning_rate": 8.623042702877515e-05, "loss": 0.2418, "step": 2864 }, { "epoch": 2.7203892713031093, "grad_norm": 0.039593134075403214, "learning_rate": 8.617132848352671e-05, "loss": 0.2687, "step": 2865 }, { "epoch": 2.7213387135058156, "grad_norm": 0.04114865884184837, "learning_rate": 8.611223486188591e-05, "loss": 0.2582, "step": 2866 }, { "epoch": 2.7222881557085215, "grad_norm": 0.073084756731987, "learning_rate": 8.605314618489275e-05, "loss": 0.2665, "step": 2867 }, { "epoch": 2.7232375979112273, "grad_norm": 0.03542012348771095, "learning_rate": 8.59940624735853e-05, "loss": 0.2596, "step": 2868 }, { "epoch": 2.724187040113933, "grad_norm": 0.07219967246055603, "learning_rate": 8.593498374899998e-05, "loss": 0.2651, "step": 2869 }, { "epoch": 2.725136482316639, "grad_norm": 0.05063614621758461, "learning_rate": 8.58759100321713e-05, "loss": 0.284, "step": 2870 }, { "epoch": 2.726085924519345, "grad_norm": 0.03707686811685562, "learning_rate": 8.581684134413216e-05, "loss": 0.2555, "step": 2871 }, { "epoch": 2.727035366722051, "grad_norm": 0.0445532388985157, "learning_rate": 8.57577777059135e-05, "loss": 0.2623, "step": 2872 }, { "epoch": 2.7279848089247567, "grad_norm": 0.040541306138038635, "learning_rate": 8.569871913854458e-05, "loss": 0.2613, "step": 2873 }, { "epoch": 2.7289342511274626, "grad_norm": 0.03940880671143532, "learning_rate": 8.563966566305286e-05, "loss": 0.2498, "step": 2874 }, { "epoch": 2.7298836933301684, "grad_norm": 0.04151361435651779, "learning_rate": 8.558061730046384e-05, "loss": 0.2578, "step": 2875 }, { "epoch": 2.7308331355328743, "grad_norm": 0.04065687954425812, "learning_rate": 8.552157407180139e-05, "loss": 0.2538, "step": 2876 }, { "epoch": 2.73178257773558, "grad_norm": 0.04001820832490921, "learning_rate": 8.54625359980874e-05, "loss": 0.2525, "step": 2877 }, { "epoch": 2.732732019938286, "grad_norm": 0.05147264897823334, "learning_rate": 8.540350310034206e-05, "loss": 0.2731, "step": 2878 }, { "epoch": 2.733681462140992, "grad_norm": 0.04745444655418396, "learning_rate": 8.534447539958358e-05, "loss": 0.2603, "step": 2879 }, { "epoch": 2.734630904343698, "grad_norm": 0.07255495339632034, "learning_rate": 8.528545291682838e-05, "loss": 0.2694, "step": 2880 }, { "epoch": 2.735580346546404, "grad_norm": 0.04382932558655739, "learning_rate": 8.522643567309112e-05, "loss": 0.2683, "step": 2881 }, { "epoch": 2.73652978874911, "grad_norm": 0.051482584327459335, "learning_rate": 8.516742368938439e-05, "loss": 0.251, "step": 2882 }, { "epoch": 2.737479230951816, "grad_norm": 0.05244822800159454, "learning_rate": 8.510841698671912e-05, "loss": 0.2603, "step": 2883 }, { "epoch": 2.7384286731545218, "grad_norm": 0.035695601254701614, "learning_rate": 8.504941558610416e-05, "loss": 0.2546, "step": 2884 }, { "epoch": 2.7393781153572276, "grad_norm": 0.04054943472146988, "learning_rate": 8.499041950854665e-05, "loss": 0.2577, "step": 2885 }, { "epoch": 2.7403275575599335, "grad_norm": 0.04638770595192909, "learning_rate": 8.49314287750517e-05, "loss": 0.2614, "step": 2886 }, { "epoch": 2.7412769997626394, "grad_norm": 0.0429297499358654, "learning_rate": 8.487244340662264e-05, "loss": 0.2651, "step": 2887 }, { "epoch": 2.7422264419653453, "grad_norm": 0.03873150050640106, "learning_rate": 8.481346342426073e-05, "loss": 0.2543, "step": 2888 }, { "epoch": 2.743175884168051, "grad_norm": 0.0463930107653141, "learning_rate": 8.475448884896547e-05, "loss": 0.2638, "step": 2889 }, { "epoch": 2.7441253263707575, "grad_norm": 0.04211205989122391, "learning_rate": 8.469551970173437e-05, "loss": 0.2513, "step": 2890 }, { "epoch": 2.7450747685734633, "grad_norm": 0.05451524630188942, "learning_rate": 8.463655600356297e-05, "loss": 0.2805, "step": 2891 }, { "epoch": 2.746024210776169, "grad_norm": 0.0439947172999382, "learning_rate": 8.457759777544499e-05, "loss": 0.2668, "step": 2892 }, { "epoch": 2.746973652978875, "grad_norm": 0.039369143545627594, "learning_rate": 8.451864503837202e-05, "loss": 0.2489, "step": 2893 }, { "epoch": 2.747923095181581, "grad_norm": 0.03729270026087761, "learning_rate": 8.445969781333385e-05, "loss": 0.2619, "step": 2894 }, { "epoch": 2.748872537384287, "grad_norm": 0.049005176872015, "learning_rate": 8.440075612131823e-05, "loss": 0.2662, "step": 2895 }, { "epoch": 2.7498219795869927, "grad_norm": 0.03956499695777893, "learning_rate": 8.434181998331101e-05, "loss": 0.2656, "step": 2896 }, { "epoch": 2.7507714217896986, "grad_norm": 0.039099693298339844, "learning_rate": 8.428288942029593e-05, "loss": 0.2699, "step": 2897 }, { "epoch": 2.7517208639924045, "grad_norm": 0.04464574530720711, "learning_rate": 8.422396445325487e-05, "loss": 0.2733, "step": 2898 }, { "epoch": 2.7526703061951103, "grad_norm": 0.06105086952447891, "learning_rate": 8.416504510316773e-05, "loss": 0.2753, "step": 2899 }, { "epoch": 2.753619748397816, "grad_norm": 0.06804367899894714, "learning_rate": 8.410613139101227e-05, "loss": 0.2527, "step": 2900 }, { "epoch": 2.754569190600522, "grad_norm": 0.04179168865084648, "learning_rate": 8.404722333776444e-05, "loss": 0.2512, "step": 2901 }, { "epoch": 2.755518632803228, "grad_norm": 0.056286826729774475, "learning_rate": 8.398832096439795e-05, "loss": 0.2412, "step": 2902 }, { "epoch": 2.756468075005934, "grad_norm": 0.04326535761356354, "learning_rate": 8.392942429188466e-05, "loss": 0.2639, "step": 2903 }, { "epoch": 2.7574175172086397, "grad_norm": 0.04712899401783943, "learning_rate": 8.387053334119432e-05, "loss": 0.2626, "step": 2904 }, { "epoch": 2.7583669594113456, "grad_norm": 0.04596768319606781, "learning_rate": 8.381164813329469e-05, "loss": 0.254, "step": 2905 }, { "epoch": 2.759316401614052, "grad_norm": 0.04214185103774071, "learning_rate": 8.375276868915148e-05, "loss": 0.2611, "step": 2906 }, { "epoch": 2.7602658438167578, "grad_norm": 0.05700606480240822, "learning_rate": 8.369389502972828e-05, "loss": 0.2549, "step": 2907 }, { "epoch": 2.7612152860194636, "grad_norm": 0.04782046377658844, "learning_rate": 8.36350271759867e-05, "loss": 0.2588, "step": 2908 }, { "epoch": 2.7621647282221695, "grad_norm": 0.05099222809076309, "learning_rate": 8.357616514888624e-05, "loss": 0.2648, "step": 2909 }, { "epoch": 2.7631141704248754, "grad_norm": 0.06003917381167412, "learning_rate": 8.351730896938437e-05, "loss": 0.2615, "step": 2910 }, { "epoch": 2.7640636126275813, "grad_norm": 0.039956480264663696, "learning_rate": 8.34584586584364e-05, "loss": 0.2519, "step": 2911 }, { "epoch": 2.765013054830287, "grad_norm": 0.040129296481609344, "learning_rate": 8.339961423699562e-05, "loss": 0.2535, "step": 2912 }, { "epoch": 2.765962497032993, "grad_norm": 0.05875218287110329, "learning_rate": 8.334077572601318e-05, "loss": 0.2833, "step": 2913 }, { "epoch": 2.766911939235699, "grad_norm": 0.05164382606744766, "learning_rate": 8.328194314643816e-05, "loss": 0.2645, "step": 2914 }, { "epoch": 2.767861381438405, "grad_norm": 0.05058974772691727, "learning_rate": 8.322311651921759e-05, "loss": 0.2667, "step": 2915 }, { "epoch": 2.768810823641111, "grad_norm": 0.03610742464661598, "learning_rate": 8.316429586529615e-05, "loss": 0.2556, "step": 2916 }, { "epoch": 2.769760265843817, "grad_norm": 0.04182818531990051, "learning_rate": 8.310548120561667e-05, "loss": 0.2643, "step": 2917 }, { "epoch": 2.770709708046523, "grad_norm": 0.041662104427814484, "learning_rate": 8.304667256111965e-05, "loss": 0.2564, "step": 2918 }, { "epoch": 2.7716591502492287, "grad_norm": 0.04125606641173363, "learning_rate": 8.29878699527436e-05, "loss": 0.2596, "step": 2919 }, { "epoch": 2.7726085924519346, "grad_norm": 0.036673858761787415, "learning_rate": 8.292907340142471e-05, "loss": 0.2568, "step": 2920 }, { "epoch": 2.7735580346546405, "grad_norm": 0.04233416169881821, "learning_rate": 8.287028292809717e-05, "loss": 0.2677, "step": 2921 }, { "epoch": 2.7745074768573463, "grad_norm": 0.042057108134031296, "learning_rate": 8.281149855369293e-05, "loss": 0.2509, "step": 2922 }, { "epoch": 2.775456919060052, "grad_norm": 0.05955182760953903, "learning_rate": 8.275272029914177e-05, "loss": 0.2785, "step": 2923 }, { "epoch": 2.776406361262758, "grad_norm": 0.036538127809762955, "learning_rate": 8.269394818537133e-05, "loss": 0.2476, "step": 2924 }, { "epoch": 2.777355803465464, "grad_norm": 0.052744511514902115, "learning_rate": 8.263518223330697e-05, "loss": 0.2859, "step": 2925 }, { "epoch": 2.77830524566817, "grad_norm": 0.06520146876573563, "learning_rate": 8.2576422463872e-05, "loss": 0.2907, "step": 2926 }, { "epoch": 2.7792546878708757, "grad_norm": 0.04221632331609726, "learning_rate": 8.251766889798738e-05, "loss": 0.2617, "step": 2927 }, { "epoch": 2.7802041300735816, "grad_norm": 0.042480263859033585, "learning_rate": 8.245892155657201e-05, "loss": 0.2684, "step": 2928 }, { "epoch": 2.7811535722762875, "grad_norm": 0.04169554263353348, "learning_rate": 8.240018046054241e-05, "loss": 0.2633, "step": 2929 }, { "epoch": 2.7821030144789933, "grad_norm": 0.04474500194191933, "learning_rate": 8.2341445630813e-05, "loss": 0.2685, "step": 2930 }, { "epoch": 2.7830524566816996, "grad_norm": 0.043009109795093536, "learning_rate": 8.228271708829595e-05, "loss": 0.2658, "step": 2931 }, { "epoch": 2.7840018988844055, "grad_norm": 0.04732086881995201, "learning_rate": 8.222399485390114e-05, "loss": 0.2713, "step": 2932 }, { "epoch": 2.7849513410871114, "grad_norm": 0.051858678460121155, "learning_rate": 8.216527894853629e-05, "loss": 0.2784, "step": 2933 }, { "epoch": 2.7859007832898173, "grad_norm": 0.08715417236089706, "learning_rate": 8.210656939310672e-05, "loss": 0.2532, "step": 2934 }, { "epoch": 2.786850225492523, "grad_norm": 0.04026304930448532, "learning_rate": 8.204786620851568e-05, "loss": 0.2589, "step": 2935 }, { "epoch": 2.787799667695229, "grad_norm": 0.0430745892226696, "learning_rate": 8.198916941566397e-05, "loss": 0.2661, "step": 2936 }, { "epoch": 2.788749109897935, "grad_norm": 0.04209771007299423, "learning_rate": 8.193047903545023e-05, "loss": 0.2562, "step": 2937 }, { "epoch": 2.7896985521006408, "grad_norm": 0.03844306617975235, "learning_rate": 8.187179508877085e-05, "loss": 0.2528, "step": 2938 }, { "epoch": 2.7906479943033466, "grad_norm": 0.045125510543584824, "learning_rate": 8.181311759651975e-05, "loss": 0.2618, "step": 2939 }, { "epoch": 2.791597436506053, "grad_norm": 0.054230380803346634, "learning_rate": 8.175444657958876e-05, "loss": 0.2593, "step": 2940 }, { "epoch": 2.792546878708759, "grad_norm": 0.04647655412554741, "learning_rate": 8.16957820588672e-05, "loss": 0.2635, "step": 2941 }, { "epoch": 2.7934963209114647, "grad_norm": 0.046050019562244415, "learning_rate": 8.163712405524235e-05, "loss": 0.2712, "step": 2942 }, { "epoch": 2.7944457631141706, "grad_norm": 0.04571513459086418, "learning_rate": 8.157847258959885e-05, "loss": 0.2592, "step": 2943 }, { "epoch": 2.7953952053168765, "grad_norm": 0.0384996272623539, "learning_rate": 8.151982768281927e-05, "loss": 0.2524, "step": 2944 }, { "epoch": 2.7963446475195823, "grad_norm": 0.05060908943414688, "learning_rate": 8.146118935578367e-05, "loss": 0.2643, "step": 2945 }, { "epoch": 2.797294089722288, "grad_norm": 0.04255904257297516, "learning_rate": 8.140255762936989e-05, "loss": 0.2671, "step": 2946 }, { "epoch": 2.798243531924994, "grad_norm": 0.045090414583683014, "learning_rate": 8.13439325244534e-05, "loss": 0.2673, "step": 2947 }, { "epoch": 2.7991929741277, "grad_norm": 0.11718107759952545, "learning_rate": 8.128531406190721e-05, "loss": 0.2731, "step": 2948 }, { "epoch": 2.800142416330406, "grad_norm": 0.03717552870512009, "learning_rate": 8.122670226260207e-05, "loss": 0.2577, "step": 2949 }, { "epoch": 2.8010918585331117, "grad_norm": 0.03922560065984726, "learning_rate": 8.116809714740634e-05, "loss": 0.266, "step": 2950 }, { "epoch": 2.8020413007358176, "grad_norm": 0.04070358723402023, "learning_rate": 8.1109498737186e-05, "loss": 0.2696, "step": 2951 }, { "epoch": 2.8029907429385235, "grad_norm": 0.08717595040798187, "learning_rate": 8.105090705280456e-05, "loss": 0.2698, "step": 2952 }, { "epoch": 2.8039401851412293, "grad_norm": 0.04092169925570488, "learning_rate": 8.099232211512326e-05, "loss": 0.267, "step": 2953 }, { "epoch": 2.804889627343935, "grad_norm": 0.03971577063202858, "learning_rate": 8.093374394500088e-05, "loss": 0.2692, "step": 2954 }, { "epoch": 2.805839069546641, "grad_norm": 0.04230726882815361, "learning_rate": 8.087517256329376e-05, "loss": 0.2655, "step": 2955 }, { "epoch": 2.8067885117493474, "grad_norm": 0.04096872732043266, "learning_rate": 8.081660799085594e-05, "loss": 0.2658, "step": 2956 }, { "epoch": 2.8077379539520533, "grad_norm": 0.04708344489336014, "learning_rate": 8.075805024853884e-05, "loss": 0.2707, "step": 2957 }, { "epoch": 2.808687396154759, "grad_norm": 0.054247885942459106, "learning_rate": 8.069949935719165e-05, "loss": 0.2598, "step": 2958 }, { "epoch": 2.809636838357465, "grad_norm": 0.05410867556929588, "learning_rate": 8.064095533766095e-05, "loss": 0.2899, "step": 2959 }, { "epoch": 2.810586280560171, "grad_norm": 0.10300780832767487, "learning_rate": 8.058241821079105e-05, "loss": 0.2654, "step": 2960 }, { "epoch": 2.8115357227628768, "grad_norm": 0.04165608435869217, "learning_rate": 8.052388799742361e-05, "loss": 0.2596, "step": 2961 }, { "epoch": 2.8124851649655827, "grad_norm": 0.04763954505324364, "learning_rate": 8.046536471839798e-05, "loss": 0.2515, "step": 2962 }, { "epoch": 2.8134346071682885, "grad_norm": 0.04077402129769325, "learning_rate": 8.0406848394551e-05, "loss": 0.2553, "step": 2963 }, { "epoch": 2.8143840493709944, "grad_norm": 0.04058780148625374, "learning_rate": 8.034833904671698e-05, "loss": 0.2518, "step": 2964 }, { "epoch": 2.8153334915737007, "grad_norm": 0.052179377526044846, "learning_rate": 8.028983669572786e-05, "loss": 0.2647, "step": 2965 }, { "epoch": 2.8162829337764066, "grad_norm": 0.04284640774130821, "learning_rate": 8.023134136241293e-05, "loss": 0.2568, "step": 2966 }, { "epoch": 2.8172323759791125, "grad_norm": 0.04037092253565788, "learning_rate": 8.017285306759914e-05, "loss": 0.2431, "step": 2967 }, { "epoch": 2.8181818181818183, "grad_norm": 0.05579729750752449, "learning_rate": 8.011437183211081e-05, "loss": 0.2807, "step": 2968 }, { "epoch": 2.819131260384524, "grad_norm": 0.05057593807578087, "learning_rate": 8.005589767676986e-05, "loss": 0.2726, "step": 2969 }, { "epoch": 2.82008070258723, "grad_norm": 0.04789821803569794, "learning_rate": 7.999743062239557e-05, "loss": 0.2698, "step": 2970 }, { "epoch": 2.821030144789936, "grad_norm": 0.056588314473629, "learning_rate": 7.993897068980477e-05, "loss": 0.2746, "step": 2971 }, { "epoch": 2.821979586992642, "grad_norm": 0.04293562099337578, "learning_rate": 7.988051789981176e-05, "loss": 0.2607, "step": 2972 }, { "epoch": 2.8229290291953477, "grad_norm": 0.03815620765089989, "learning_rate": 7.982207227322824e-05, "loss": 0.247, "step": 2973 }, { "epoch": 2.8238784713980536, "grad_norm": 0.06469085812568665, "learning_rate": 7.976363383086342e-05, "loss": 0.264, "step": 2974 }, { "epoch": 2.8248279136007595, "grad_norm": 0.03865661844611168, "learning_rate": 7.97052025935239e-05, "loss": 0.2555, "step": 2975 }, { "epoch": 2.8257773558034653, "grad_norm": 0.04179610684514046, "learning_rate": 7.964677858201376e-05, "loss": 0.2663, "step": 2976 }, { "epoch": 2.826726798006171, "grad_norm": 0.043444134294986725, "learning_rate": 7.958836181713445e-05, "loss": 0.2639, "step": 2977 }, { "epoch": 2.827676240208877, "grad_norm": 0.0387243777513504, "learning_rate": 7.952995231968488e-05, "loss": 0.2669, "step": 2978 }, { "epoch": 2.828625682411583, "grad_norm": 0.03931890428066254, "learning_rate": 7.947155011046144e-05, "loss": 0.2537, "step": 2979 }, { "epoch": 2.829575124614289, "grad_norm": 0.047014620155096054, "learning_rate": 7.941315521025775e-05, "loss": 0.2759, "step": 2980 }, { "epoch": 2.830524566816995, "grad_norm": 0.03905611112713814, "learning_rate": 7.935476763986503e-05, "loss": 0.2638, "step": 2981 }, { "epoch": 2.831474009019701, "grad_norm": 0.053726695477962494, "learning_rate": 7.92963874200717e-05, "loss": 0.2709, "step": 2982 }, { "epoch": 2.832423451222407, "grad_norm": 0.03801732510328293, "learning_rate": 7.923801457166372e-05, "loss": 0.2511, "step": 2983 }, { "epoch": 2.833372893425113, "grad_norm": 0.04040442034602165, "learning_rate": 7.91796491154243e-05, "loss": 0.2509, "step": 2984 }, { "epoch": 2.8343223356278187, "grad_norm": 0.046596333384513855, "learning_rate": 7.912129107213416e-05, "loss": 0.2878, "step": 2985 }, { "epoch": 2.8352717778305245, "grad_norm": 0.04395151510834694, "learning_rate": 7.90629404625712e-05, "loss": 0.2664, "step": 2986 }, { "epoch": 2.8362212200332304, "grad_norm": 0.04623299464583397, "learning_rate": 7.900459730751084e-05, "loss": 0.2589, "step": 2987 }, { "epoch": 2.8371706622359363, "grad_norm": 0.04465977102518082, "learning_rate": 7.894626162772578e-05, "loss": 0.2686, "step": 2988 }, { "epoch": 2.838120104438642, "grad_norm": 0.04362845793366432, "learning_rate": 7.888793344398601e-05, "loss": 0.2608, "step": 2989 }, { "epoch": 2.8390695466413485, "grad_norm": 0.040313150733709335, "learning_rate": 7.882961277705895e-05, "loss": 0.2555, "step": 2990 }, { "epoch": 2.8400189888440543, "grad_norm": 0.03675536438822746, "learning_rate": 7.877129964770924e-05, "loss": 0.2434, "step": 2991 }, { "epoch": 2.8409684310467602, "grad_norm": 0.059962980449199677, "learning_rate": 7.871299407669892e-05, "loss": 0.2819, "step": 2992 }, { "epoch": 2.841917873249466, "grad_norm": 0.040946152061223984, "learning_rate": 7.865469608478726e-05, "loss": 0.2615, "step": 2993 }, { "epoch": 2.842867315452172, "grad_norm": 0.04320983588695526, "learning_rate": 7.859640569273093e-05, "loss": 0.2619, "step": 2994 }, { "epoch": 2.843816757654878, "grad_norm": 0.04121886566281319, "learning_rate": 7.853812292128387e-05, "loss": 0.2612, "step": 2995 }, { "epoch": 2.8447661998575837, "grad_norm": 0.043592579662799835, "learning_rate": 7.847984779119717e-05, "loss": 0.2572, "step": 2996 }, { "epoch": 2.8457156420602896, "grad_norm": 0.04470603168010712, "learning_rate": 7.84215803232194e-05, "loss": 0.247, "step": 2997 }, { "epoch": 2.8466650842629955, "grad_norm": 0.0900123119354248, "learning_rate": 7.836332053809625e-05, "loss": 0.2607, "step": 2998 }, { "epoch": 2.8476145264657013, "grad_norm": 0.08290302008390427, "learning_rate": 7.830506845657082e-05, "loss": 0.2535, "step": 2999 }, { "epoch": 2.848563968668407, "grad_norm": 0.04399091377854347, "learning_rate": 7.824682409938328e-05, "loss": 0.2578, "step": 3000 }, { "epoch": 2.848563968668407, "eval_loss": 0.26121968030929565, "eval_runtime": 37.7523, "eval_samples_per_second": 2.278, "eval_steps_per_second": 2.278, "step": 3000 }, { "epoch": 2.849513410871113, "grad_norm": 0.03738872706890106, "learning_rate": 7.81885874872712e-05, "loss": 0.2466, "step": 3001 }, { "epoch": 2.850462853073819, "grad_norm": 0.048882003873586655, "learning_rate": 7.813035864096932e-05, "loss": 0.2604, "step": 3002 }, { "epoch": 2.851412295276525, "grad_norm": 0.037704020738601685, "learning_rate": 7.807213758120966e-05, "loss": 0.2504, "step": 3003 }, { "epoch": 2.8523617374792307, "grad_norm": 0.07369041442871094, "learning_rate": 7.801392432872149e-05, "loss": 0.2461, "step": 3004 }, { "epoch": 2.8533111796819366, "grad_norm": 0.05542570352554321, "learning_rate": 7.795571890423116e-05, "loss": 0.2907, "step": 3005 }, { "epoch": 2.854260621884643, "grad_norm": 0.05089758709073067, "learning_rate": 7.789752132846239e-05, "loss": 0.2747, "step": 3006 }, { "epoch": 2.855210064087349, "grad_norm": 0.051443975418806076, "learning_rate": 7.783933162213604e-05, "loss": 0.2673, "step": 3007 }, { "epoch": 2.8561595062900547, "grad_norm": 0.04792041704058647, "learning_rate": 7.778114980597018e-05, "loss": 0.2619, "step": 3008 }, { "epoch": 2.8571089484927605, "grad_norm": 0.0704968124628067, "learning_rate": 7.772297590068002e-05, "loss": 0.2745, "step": 3009 }, { "epoch": 2.8580583906954664, "grad_norm": 0.05193328484892845, "learning_rate": 7.766480992697802e-05, "loss": 0.2614, "step": 3010 }, { "epoch": 2.8590078328981723, "grad_norm": 0.05074877664446831, "learning_rate": 7.760665190557382e-05, "loss": 0.2547, "step": 3011 }, { "epoch": 2.859957275100878, "grad_norm": 0.05875850468873978, "learning_rate": 7.754850185717415e-05, "loss": 0.2617, "step": 3012 }, { "epoch": 2.860906717303584, "grad_norm": 0.040531840175390244, "learning_rate": 7.749035980248305e-05, "loss": 0.254, "step": 3013 }, { "epoch": 2.86185615950629, "grad_norm": 0.04752432927489281, "learning_rate": 7.74322257622015e-05, "loss": 0.2634, "step": 3014 }, { "epoch": 2.8628056017089962, "grad_norm": 0.04267633333802223, "learning_rate": 7.73740997570278e-05, "loss": 0.254, "step": 3015 }, { "epoch": 2.863755043911702, "grad_norm": 0.054555121809244156, "learning_rate": 7.731598180765732e-05, "loss": 0.2834, "step": 3016 }, { "epoch": 2.864704486114408, "grad_norm": 0.044725362211465836, "learning_rate": 7.725787193478262e-05, "loss": 0.2594, "step": 3017 }, { "epoch": 2.865653928317114, "grad_norm": 0.04918695613741875, "learning_rate": 7.719977015909326e-05, "loss": 0.2537, "step": 3018 }, { "epoch": 2.8666033705198197, "grad_norm": 0.040399499237537384, "learning_rate": 7.714167650127603e-05, "loss": 0.2537, "step": 3019 }, { "epoch": 2.8675528127225256, "grad_norm": 0.04605403542518616, "learning_rate": 7.708359098201482e-05, "loss": 0.2625, "step": 3020 }, { "epoch": 2.8685022549252315, "grad_norm": 0.039211615920066833, "learning_rate": 7.702551362199056e-05, "loss": 0.2619, "step": 3021 }, { "epoch": 2.8694516971279374, "grad_norm": 0.040134087204933167, "learning_rate": 7.696744444188138e-05, "loss": 0.2637, "step": 3022 }, { "epoch": 2.8704011393306432, "grad_norm": 0.04593004286289215, "learning_rate": 7.690938346236233e-05, "loss": 0.2761, "step": 3023 }, { "epoch": 2.871350581533349, "grad_norm": 0.04389437288045883, "learning_rate": 7.685133070410571e-05, "loss": 0.2499, "step": 3024 }, { "epoch": 2.872300023736055, "grad_norm": 0.04520121216773987, "learning_rate": 7.67932861877808e-05, "loss": 0.2615, "step": 3025 }, { "epoch": 2.873249465938761, "grad_norm": 0.03421633318066597, "learning_rate": 7.673524993405397e-05, "loss": 0.2619, "step": 3026 }, { "epoch": 2.8741989081414667, "grad_norm": 0.04385941103100777, "learning_rate": 7.667722196358869e-05, "loss": 0.2595, "step": 3027 }, { "epoch": 2.8751483503441726, "grad_norm": 0.0449550487101078, "learning_rate": 7.661920229704535e-05, "loss": 0.2523, "step": 3028 }, { "epoch": 2.8760977925468785, "grad_norm": 0.037781983613967896, "learning_rate": 7.656119095508154e-05, "loss": 0.2474, "step": 3029 }, { "epoch": 2.8770472347495843, "grad_norm": 0.043374691158533096, "learning_rate": 7.650318795835179e-05, "loss": 0.2535, "step": 3030 }, { "epoch": 2.8779966769522907, "grad_norm": 0.03891807794570923, "learning_rate": 7.644519332750772e-05, "loss": 0.2531, "step": 3031 }, { "epoch": 2.8789461191549965, "grad_norm": 0.03850429132580757, "learning_rate": 7.638720708319789e-05, "loss": 0.2594, "step": 3032 }, { "epoch": 2.8798955613577024, "grad_norm": 0.03768150508403778, "learning_rate": 7.632922924606795e-05, "loss": 0.2549, "step": 3033 }, { "epoch": 2.8808450035604083, "grad_norm": 0.0817415788769722, "learning_rate": 7.62712598367605e-05, "loss": 0.2625, "step": 3034 }, { "epoch": 2.881794445763114, "grad_norm": 0.04083314538002014, "learning_rate": 7.62132988759152e-05, "loss": 0.2568, "step": 3035 }, { "epoch": 2.88274388796582, "grad_norm": 0.039418481290340424, "learning_rate": 7.61553463841687e-05, "loss": 0.2605, "step": 3036 }, { "epoch": 2.883693330168526, "grad_norm": 0.04179481044411659, "learning_rate": 7.60974023821545e-05, "loss": 0.2582, "step": 3037 }, { "epoch": 2.884642772371232, "grad_norm": 0.040783416479825974, "learning_rate": 7.603946689050329e-05, "loss": 0.2587, "step": 3038 }, { "epoch": 2.8855922145739377, "grad_norm": 0.037703026086091995, "learning_rate": 7.598153992984254e-05, "loss": 0.2611, "step": 3039 }, { "epoch": 2.886541656776644, "grad_norm": 0.03856633976101875, "learning_rate": 7.592362152079684e-05, "loss": 0.2556, "step": 3040 }, { "epoch": 2.88749109897935, "grad_norm": 0.03830847144126892, "learning_rate": 7.586571168398759e-05, "loss": 0.2596, "step": 3041 }, { "epoch": 2.8884405411820557, "grad_norm": 0.07717377692461014, "learning_rate": 7.580781044003324e-05, "loss": 0.2694, "step": 3042 }, { "epoch": 2.8893899833847616, "grad_norm": 0.0506473146378994, "learning_rate": 7.574991780954914e-05, "loss": 0.2671, "step": 3043 }, { "epoch": 2.8903394255874675, "grad_norm": 0.03902186080813408, "learning_rate": 7.569203381314757e-05, "loss": 0.2629, "step": 3044 }, { "epoch": 2.8912888677901734, "grad_norm": 0.03913324326276779, "learning_rate": 7.563415847143782e-05, "loss": 0.2595, "step": 3045 }, { "epoch": 2.8922383099928792, "grad_norm": 0.043382804840803146, "learning_rate": 7.557629180502593e-05, "loss": 0.2656, "step": 3046 }, { "epoch": 2.893187752195585, "grad_norm": 0.04160072281956673, "learning_rate": 7.551843383451497e-05, "loss": 0.2575, "step": 3047 }, { "epoch": 2.894137194398291, "grad_norm": 0.04108607769012451, "learning_rate": 7.54605845805049e-05, "loss": 0.2565, "step": 3048 }, { "epoch": 2.895086636600997, "grad_norm": 0.07181921601295471, "learning_rate": 7.540274406359263e-05, "loss": 0.2608, "step": 3049 }, { "epoch": 2.8960360788037027, "grad_norm": 0.040363240987062454, "learning_rate": 7.534491230437178e-05, "loss": 0.2639, "step": 3050 }, { "epoch": 2.8969855210064086, "grad_norm": 0.08342358469963074, "learning_rate": 7.528708932343304e-05, "loss": 0.2793, "step": 3051 }, { "epoch": 2.8979349632091145, "grad_norm": 0.06666367501020432, "learning_rate": 7.52292751413639e-05, "loss": 0.2445, "step": 3052 }, { "epoch": 2.8988844054118204, "grad_norm": 0.07102706283330917, "learning_rate": 7.517146977874869e-05, "loss": 0.2902, "step": 3053 }, { "epoch": 2.8998338476145262, "grad_norm": 0.04264743998646736, "learning_rate": 7.511367325616868e-05, "loss": 0.2565, "step": 3054 }, { "epoch": 2.900783289817232, "grad_norm": 0.04553769901394844, "learning_rate": 7.505588559420189e-05, "loss": 0.2549, "step": 3055 }, { "epoch": 2.9017327320199384, "grad_norm": 0.04745159670710564, "learning_rate": 7.499810681342325e-05, "loss": 0.2516, "step": 3056 }, { "epoch": 2.9026821742226443, "grad_norm": 0.07339516282081604, "learning_rate": 7.494033693440451e-05, "loss": 0.2724, "step": 3057 }, { "epoch": 2.90363161642535, "grad_norm": 0.04640224948525429, "learning_rate": 7.488257597771433e-05, "loss": 0.2628, "step": 3058 }, { "epoch": 2.904581058628056, "grad_norm": 0.0962815135717392, "learning_rate": 7.482482396391801e-05, "loss": 0.2716, "step": 3059 }, { "epoch": 2.905530500830762, "grad_norm": 0.04240866377949715, "learning_rate": 7.476708091357782e-05, "loss": 0.258, "step": 3060 }, { "epoch": 2.906479943033468, "grad_norm": 0.045219387859106064, "learning_rate": 7.470934684725283e-05, "loss": 0.2629, "step": 3061 }, { "epoch": 2.9074293852361737, "grad_norm": 0.053542688488960266, "learning_rate": 7.465162178549881e-05, "loss": 0.2738, "step": 3062 }, { "epoch": 2.9083788274388795, "grad_norm": 0.10473237186670303, "learning_rate": 7.459390574886847e-05, "loss": 0.2615, "step": 3063 }, { "epoch": 2.9093282696415854, "grad_norm": 0.05222583934664726, "learning_rate": 7.453619875791114e-05, "loss": 0.2616, "step": 3064 }, { "epoch": 2.9102777118442917, "grad_norm": 0.10798244178295135, "learning_rate": 7.447850083317307e-05, "loss": 0.277, "step": 3065 }, { "epoch": 2.9112271540469976, "grad_norm": 0.06557677686214447, "learning_rate": 7.442081199519719e-05, "loss": 0.2755, "step": 3066 }, { "epoch": 2.9121765962497035, "grad_norm": 0.05239195376634598, "learning_rate": 7.436313226452325e-05, "loss": 0.2772, "step": 3067 }, { "epoch": 2.9131260384524094, "grad_norm": 0.044036369770765305, "learning_rate": 7.43054616616878e-05, "loss": 0.2583, "step": 3068 }, { "epoch": 2.9140754806551152, "grad_norm": 0.07759764790534973, "learning_rate": 7.424780020722397e-05, "loss": 0.2516, "step": 3069 }, { "epoch": 2.915024922857821, "grad_norm": 0.04186937212944031, "learning_rate": 7.419014792166181e-05, "loss": 0.2619, "step": 3070 }, { "epoch": 2.915974365060527, "grad_norm": 0.04191587120294571, "learning_rate": 7.413250482552802e-05, "loss": 0.2616, "step": 3071 }, { "epoch": 2.916923807263233, "grad_norm": 0.03930152952671051, "learning_rate": 7.407487093934608e-05, "loss": 0.2592, "step": 3072 }, { "epoch": 2.9178732494659387, "grad_norm": 0.04785576090216637, "learning_rate": 7.401724628363608e-05, "loss": 0.2668, "step": 3073 }, { "epoch": 2.9188226916686446, "grad_norm": 0.0375237911939621, "learning_rate": 7.395963087891497e-05, "loss": 0.2491, "step": 3074 }, { "epoch": 2.9197721338713505, "grad_norm": 0.03894813358783722, "learning_rate": 7.39020247456963e-05, "loss": 0.2646, "step": 3075 }, { "epoch": 2.9207215760740564, "grad_norm": 0.04161771759390831, "learning_rate": 7.384442790449036e-05, "loss": 0.2592, "step": 3076 }, { "epoch": 2.9216710182767622, "grad_norm": 0.0407453328371048, "learning_rate": 7.378684037580417e-05, "loss": 0.2589, "step": 3077 }, { "epoch": 2.922620460479468, "grad_norm": 0.03704614192247391, "learning_rate": 7.372926218014131e-05, "loss": 0.246, "step": 3078 }, { "epoch": 2.923569902682174, "grad_norm": 0.04671390354633331, "learning_rate": 7.367169333800218e-05, "loss": 0.271, "step": 3079 }, { "epoch": 2.92451934488488, "grad_norm": 0.041560541838407516, "learning_rate": 7.361413386988378e-05, "loss": 0.2707, "step": 3080 }, { "epoch": 2.925468787087586, "grad_norm": 0.043347813189029694, "learning_rate": 7.35565837962798e-05, "loss": 0.2667, "step": 3081 }, { "epoch": 2.926418229290292, "grad_norm": 0.047262370586395264, "learning_rate": 7.34990431376805e-05, "loss": 0.2484, "step": 3082 }, { "epoch": 2.927367671492998, "grad_norm": 0.04352419823408127, "learning_rate": 7.34415119145729e-05, "loss": 0.2684, "step": 3083 }, { "epoch": 2.928317113695704, "grad_norm": 0.040909770876169205, "learning_rate": 7.338399014744061e-05, "loss": 0.2645, "step": 3084 }, { "epoch": 2.9292665558984097, "grad_norm": 0.03679214417934418, "learning_rate": 7.332647785676388e-05, "loss": 0.2608, "step": 3085 }, { "epoch": 2.9302159981011155, "grad_norm": 0.050959546118974686, "learning_rate": 7.32689750630196e-05, "loss": 0.2862, "step": 3086 }, { "epoch": 2.9311654403038214, "grad_norm": 0.038495469838380814, "learning_rate": 7.32114817866812e-05, "loss": 0.2562, "step": 3087 }, { "epoch": 2.9321148825065273, "grad_norm": 0.046665240079164505, "learning_rate": 7.315399804821888e-05, "loss": 0.2599, "step": 3088 }, { "epoch": 2.933064324709233, "grad_norm": 0.04234423115849495, "learning_rate": 7.309652386809922e-05, "loss": 0.2546, "step": 3089 }, { "epoch": 2.9340137669119395, "grad_norm": 0.04000794515013695, "learning_rate": 7.303905926678564e-05, "loss": 0.2579, "step": 3090 }, { "epoch": 2.9349632091146454, "grad_norm": 0.054213687777519226, "learning_rate": 7.298160426473796e-05, "loss": 0.2852, "step": 3091 }, { "epoch": 2.9359126513173512, "grad_norm": 0.03904402256011963, "learning_rate": 7.29241588824127e-05, "loss": 0.2607, "step": 3092 }, { "epoch": 2.936862093520057, "grad_norm": 0.03642754629254341, "learning_rate": 7.286672314026294e-05, "loss": 0.2553, "step": 3093 }, { "epoch": 2.937811535722763, "grad_norm": 0.042792122811079025, "learning_rate": 7.280929705873818e-05, "loss": 0.2649, "step": 3094 }, { "epoch": 2.938760977925469, "grad_norm": 0.04224555939435959, "learning_rate": 7.275188065828475e-05, "loss": 0.263, "step": 3095 }, { "epoch": 2.9397104201281747, "grad_norm": 0.1053476333618164, "learning_rate": 7.269447395934526e-05, "loss": 0.2437, "step": 3096 }, { "epoch": 2.9406598623308806, "grad_norm": 0.03846612200140953, "learning_rate": 7.263707698235909e-05, "loss": 0.2595, "step": 3097 }, { "epoch": 2.9416093045335865, "grad_norm": 0.03868841007351875, "learning_rate": 7.257968974776194e-05, "loss": 0.2478, "step": 3098 }, { "epoch": 2.9425587467362924, "grad_norm": 0.05365443229675293, "learning_rate": 7.252231227598623e-05, "loss": 0.2806, "step": 3099 }, { "epoch": 2.9435081889389982, "grad_norm": 0.03868798166513443, "learning_rate": 7.246494458746085e-05, "loss": 0.2598, "step": 3100 }, { "epoch": 2.944457631141704, "grad_norm": 0.03806505724787712, "learning_rate": 7.240758670261114e-05, "loss": 0.2528, "step": 3101 }, { "epoch": 2.94540707334441, "grad_norm": 0.04681408777832985, "learning_rate": 7.235023864185906e-05, "loss": 0.2594, "step": 3102 }, { "epoch": 2.946356515547116, "grad_norm": 0.09290296584367752, "learning_rate": 7.229290042562293e-05, "loss": 0.2503, "step": 3103 }, { "epoch": 2.9473059577498217, "grad_norm": 0.045926641672849655, "learning_rate": 7.223557207431772e-05, "loss": 0.261, "step": 3104 }, { "epoch": 2.948255399952528, "grad_norm": 0.07976659387350082, "learning_rate": 7.217825360835473e-05, "loss": 0.2597, "step": 3105 }, { "epoch": 2.949204842155234, "grad_norm": 0.060166455805301666, "learning_rate": 7.212094504814195e-05, "loss": 0.2563, "step": 3106 }, { "epoch": 2.95015428435794, "grad_norm": 0.078881174325943, "learning_rate": 7.206364641408357e-05, "loss": 0.2521, "step": 3107 }, { "epoch": 2.9511037265606457, "grad_norm": 0.04990749806165695, "learning_rate": 7.200635772658047e-05, "loss": 0.2659, "step": 3108 }, { "epoch": 2.9520531687633516, "grad_norm": 0.05827078968286514, "learning_rate": 7.194907900602993e-05, "loss": 0.2743, "step": 3109 }, { "epoch": 2.9530026109660574, "grad_norm": 0.04445386677980423, "learning_rate": 7.189181027282561e-05, "loss": 0.2537, "step": 3110 }, { "epoch": 2.9539520531687633, "grad_norm": 0.041746556758880615, "learning_rate": 7.183455154735774e-05, "loss": 0.2472, "step": 3111 }, { "epoch": 2.954901495371469, "grad_norm": 0.04664076119661331, "learning_rate": 7.177730285001282e-05, "loss": 0.2564, "step": 3112 }, { "epoch": 2.955850937574175, "grad_norm": 0.04676587134599686, "learning_rate": 7.172006420117394e-05, "loss": 0.2697, "step": 3113 }, { "epoch": 2.956800379776881, "grad_norm": 0.09390587359666824, "learning_rate": 7.166283562122049e-05, "loss": 0.3155, "step": 3114 }, { "epoch": 2.9577498219795872, "grad_norm": 0.08722305297851562, "learning_rate": 7.160561713052842e-05, "loss": 0.2422, "step": 3115 }, { "epoch": 2.958699264182293, "grad_norm": 0.067685566842556, "learning_rate": 7.15484087494699e-05, "loss": 0.2616, "step": 3116 }, { "epoch": 2.959648706384999, "grad_norm": 0.06081811711192131, "learning_rate": 7.149121049841363e-05, "loss": 0.2777, "step": 3117 }, { "epoch": 2.960598148587705, "grad_norm": 0.04524696618318558, "learning_rate": 7.143402239772471e-05, "loss": 0.264, "step": 3118 }, { "epoch": 2.9615475907904107, "grad_norm": 0.042296383529901505, "learning_rate": 7.137684446776453e-05, "loss": 0.2689, "step": 3119 }, { "epoch": 2.9624970329931166, "grad_norm": 0.051620304584503174, "learning_rate": 7.131967672889101e-05, "loss": 0.266, "step": 3120 }, { "epoch": 2.9634464751958225, "grad_norm": 0.05887624993920326, "learning_rate": 7.126251920145822e-05, "loss": 0.2769, "step": 3121 }, { "epoch": 2.9643959173985284, "grad_norm": 0.04584120586514473, "learning_rate": 7.12053719058168e-05, "loss": 0.2631, "step": 3122 }, { "epoch": 2.9653453596012342, "grad_norm": 0.07010935246944427, "learning_rate": 7.114823486231366e-05, "loss": 0.2587, "step": 3123 }, { "epoch": 2.96629480180394, "grad_norm": 0.03999679163098335, "learning_rate": 7.109110809129205e-05, "loss": 0.2525, "step": 3124 }, { "epoch": 2.967244244006646, "grad_norm": 0.049806319177150726, "learning_rate": 7.103399161309164e-05, "loss": 0.2667, "step": 3125 }, { "epoch": 2.968193686209352, "grad_norm": 0.0547214075922966, "learning_rate": 7.09768854480483e-05, "loss": 0.2807, "step": 3126 }, { "epoch": 2.9691431284120577, "grad_norm": 0.06518308818340302, "learning_rate": 7.091978961649434e-05, "loss": 0.2803, "step": 3127 }, { "epoch": 2.9700925706147636, "grad_norm": 0.05517837405204773, "learning_rate": 7.086270413875835e-05, "loss": 0.2764, "step": 3128 }, { "epoch": 2.9710420128174695, "grad_norm": 0.042132288217544556, "learning_rate": 7.080562903516529e-05, "loss": 0.2551, "step": 3129 }, { "epoch": 2.971991455020176, "grad_norm": 0.04256582632660866, "learning_rate": 7.074856432603628e-05, "loss": 0.2533, "step": 3130 }, { "epoch": 2.9729408972228817, "grad_norm": 0.05388769134879112, "learning_rate": 7.06915100316889e-05, "loss": 0.2715, "step": 3131 }, { "epoch": 2.9738903394255876, "grad_norm": 0.04420170560479164, "learning_rate": 7.063446617243694e-05, "loss": 0.2763, "step": 3132 }, { "epoch": 2.9748397816282934, "grad_norm": 0.10847865045070648, "learning_rate": 7.057743276859048e-05, "loss": 0.2713, "step": 3133 }, { "epoch": 2.9757892238309993, "grad_norm": 0.07862085849046707, "learning_rate": 7.052040984045595e-05, "loss": 0.2489, "step": 3134 }, { "epoch": 2.976738666033705, "grad_norm": 0.039099689573049545, "learning_rate": 7.04633974083359e-05, "loss": 0.2534, "step": 3135 }, { "epoch": 2.977688108236411, "grad_norm": 0.06509828567504883, "learning_rate": 7.040639549252927e-05, "loss": 0.292, "step": 3136 }, { "epoch": 2.978637550439117, "grad_norm": 0.06332911550998688, "learning_rate": 7.034940411333125e-05, "loss": 0.2769, "step": 3137 }, { "epoch": 2.979586992641823, "grad_norm": 0.042316656559705734, "learning_rate": 7.029242329103323e-05, "loss": 0.2639, "step": 3138 }, { "epoch": 2.9805364348445287, "grad_norm": 0.04153651371598244, "learning_rate": 7.023545304592279e-05, "loss": 0.2698, "step": 3139 }, { "epoch": 2.981485877047235, "grad_norm": 0.04541517421603203, "learning_rate": 7.017849339828389e-05, "loss": 0.2629, "step": 3140 }, { "epoch": 2.982435319249941, "grad_norm": 0.04102837294340134, "learning_rate": 7.012154436839663e-05, "loss": 0.2621, "step": 3141 }, { "epoch": 2.9833847614526467, "grad_norm": 0.03542923927307129, "learning_rate": 7.00646059765373e-05, "loss": 0.2488, "step": 3142 }, { "epoch": 2.9843342036553526, "grad_norm": 0.04143969714641571, "learning_rate": 7.000767824297851e-05, "loss": 0.2565, "step": 3143 }, { "epoch": 2.9852836458580585, "grad_norm": 0.055488113313913345, "learning_rate": 6.995076118798893e-05, "loss": 0.2864, "step": 3144 }, { "epoch": 2.9862330880607644, "grad_norm": 0.037771549075841904, "learning_rate": 6.989385483183355e-05, "loss": 0.2584, "step": 3145 }, { "epoch": 2.9871825302634702, "grad_norm": 0.046980682760477066, "learning_rate": 6.983695919477345e-05, "loss": 0.2665, "step": 3146 }, { "epoch": 2.988131972466176, "grad_norm": 0.041630521416664124, "learning_rate": 6.978007429706606e-05, "loss": 0.2605, "step": 3147 }, { "epoch": 2.989081414668882, "grad_norm": 0.04207857325673103, "learning_rate": 6.972320015896473e-05, "loss": 0.259, "step": 3148 }, { "epoch": 2.990030856871588, "grad_norm": 0.04457660764455795, "learning_rate": 6.966633680071921e-05, "loss": 0.2596, "step": 3149 }, { "epoch": 2.9909802990742937, "grad_norm": 0.04395845904946327, "learning_rate": 6.960948424257532e-05, "loss": 0.2655, "step": 3150 }, { "epoch": 2.9919297412769996, "grad_norm": 0.0456823855638504, "learning_rate": 6.955264250477502e-05, "loss": 0.263, "step": 3151 }, { "epoch": 2.9928791834797055, "grad_norm": 0.04490295425057411, "learning_rate": 6.94958116075565e-05, "loss": 0.2653, "step": 3152 }, { "epoch": 2.9938286256824114, "grad_norm": 0.08633121848106384, "learning_rate": 6.94389915711539e-05, "loss": 0.2692, "step": 3153 }, { "epoch": 2.9947780678851172, "grad_norm": 0.04307481646537781, "learning_rate": 6.938218241579775e-05, "loss": 0.2605, "step": 3154 }, { "epoch": 2.9957275100878236, "grad_norm": 0.042450159788131714, "learning_rate": 6.932538416171447e-05, "loss": 0.2628, "step": 3155 }, { "epoch": 2.9966769522905294, "grad_norm": 0.0424211211502552, "learning_rate": 6.926859682912678e-05, "loss": 0.2671, "step": 3156 }, { "epoch": 2.9976263944932353, "grad_norm": 0.07237890362739563, "learning_rate": 6.921182043825347e-05, "loss": 0.2528, "step": 3157 }, { "epoch": 2.998575836695941, "grad_norm": 0.04258769005537033, "learning_rate": 6.915505500930928e-05, "loss": 0.2465, "step": 3158 }, { "epoch": 2.999525278898647, "grad_norm": 0.04987820237874985, "learning_rate": 6.909830056250527e-05, "loss": 0.259, "step": 3159 }, { "epoch": 3.000474721101353, "grad_norm": 0.08159984648227692, "learning_rate": 6.904155711804842e-05, "loss": 0.2765, "step": 3160 }, { "epoch": 3.001424163304059, "grad_norm": 0.08647124469280243, "learning_rate": 6.898482469614195e-05, "loss": 0.2787, "step": 3161 }, { "epoch": 3.0023736055067647, "grad_norm": 0.04441903904080391, "learning_rate": 6.892810331698496e-05, "loss": 0.2592, "step": 3162 }, { "epoch": 3.0033230477094706, "grad_norm": 0.044516198337078094, "learning_rate": 6.88713930007728e-05, "loss": 0.2526, "step": 3163 }, { "epoch": 3.0042724899121764, "grad_norm": 0.05410735309123993, "learning_rate": 6.881469376769676e-05, "loss": 0.2765, "step": 3164 }, { "epoch": 3.0052219321148823, "grad_norm": 0.049433011561632156, "learning_rate": 6.875800563794425e-05, "loss": 0.2671, "step": 3165 }, { "epoch": 3.0061713743175886, "grad_norm": 0.04667758569121361, "learning_rate": 6.870132863169874e-05, "loss": 0.2565, "step": 3166 }, { "epoch": 3.0071208165202945, "grad_norm": 0.04814934730529785, "learning_rate": 6.864466276913963e-05, "loss": 0.2684, "step": 3167 }, { "epoch": 3.0080702587230004, "grad_norm": 0.05075441300868988, "learning_rate": 6.85880080704425e-05, "loss": 0.2595, "step": 3168 }, { "epoch": 3.0090197009257063, "grad_norm": 0.04398110508918762, "learning_rate": 6.853136455577885e-05, "loss": 0.2534, "step": 3169 }, { "epoch": 3.009969143128412, "grad_norm": 0.04458888620138168, "learning_rate": 6.847473224531624e-05, "loss": 0.2647, "step": 3170 }, { "epoch": 3.010918585331118, "grad_norm": 0.04190952330827713, "learning_rate": 6.841811115921822e-05, "loss": 0.2486, "step": 3171 }, { "epoch": 3.011868027533824, "grad_norm": 0.045418839901685715, "learning_rate": 6.836150131764434e-05, "loss": 0.243, "step": 3172 }, { "epoch": 3.0128174697365298, "grad_norm": 0.05116521194577217, "learning_rate": 6.830490274075022e-05, "loss": 0.2598, "step": 3173 }, { "epoch": 3.0137669119392356, "grad_norm": 0.07649002224206924, "learning_rate": 6.824831544868735e-05, "loss": 0.2412, "step": 3174 }, { "epoch": 3.0147163541419415, "grad_norm": 0.06899211555719376, "learning_rate": 6.819173946160336e-05, "loss": 0.2619, "step": 3175 }, { "epoch": 3.0156657963446474, "grad_norm": 0.049410488456487656, "learning_rate": 6.813517479964162e-05, "loss": 0.2545, "step": 3176 }, { "epoch": 3.0166152385473533, "grad_norm": 0.04489083215594292, "learning_rate": 6.807862148294171e-05, "loss": 0.2615, "step": 3177 }, { "epoch": 3.017564680750059, "grad_norm": 0.05581679940223694, "learning_rate": 6.8022079531639e-05, "loss": 0.2678, "step": 3178 }, { "epoch": 3.0185141229527654, "grad_norm": 0.043040670454502106, "learning_rate": 6.796554896586498e-05, "loss": 0.2482, "step": 3179 }, { "epoch": 3.0194635651554713, "grad_norm": 0.06353656202554703, "learning_rate": 6.790902980574685e-05, "loss": 0.2609, "step": 3180 }, { "epoch": 3.020413007358177, "grad_norm": 0.044340141117572784, "learning_rate": 6.785252207140797e-05, "loss": 0.2489, "step": 3181 }, { "epoch": 3.021362449560883, "grad_norm": 0.048731133341789246, "learning_rate": 6.779602578296757e-05, "loss": 0.2663, "step": 3182 }, { "epoch": 3.022311891763589, "grad_norm": 0.07609883695840836, "learning_rate": 6.773954096054071e-05, "loss": 0.2559, "step": 3183 }, { "epoch": 3.023261333966295, "grad_norm": 0.0534072183072567, "learning_rate": 6.768306762423853e-05, "loss": 0.2579, "step": 3184 }, { "epoch": 3.0242107761690007, "grad_norm": 0.1267816722393036, "learning_rate": 6.762660579416791e-05, "loss": 0.2497, "step": 3185 }, { "epoch": 3.0251602183717066, "grad_norm": 0.17665016651153564, "learning_rate": 6.757015549043175e-05, "loss": 0.2561, "step": 3186 }, { "epoch": 3.0261096605744124, "grad_norm": 0.05306272208690643, "learning_rate": 6.751371673312877e-05, "loss": 0.2438, "step": 3187 }, { "epoch": 3.0270591027771183, "grad_norm": 0.048476967960596085, "learning_rate": 6.74572895423537e-05, "loss": 0.258, "step": 3188 }, { "epoch": 3.028008544979824, "grad_norm": 0.04300226643681526, "learning_rate": 6.740087393819698e-05, "loss": 0.2582, "step": 3189 }, { "epoch": 3.02895798718253, "grad_norm": 0.04573444277048111, "learning_rate": 6.734446994074507e-05, "loss": 0.2432, "step": 3190 }, { "epoch": 3.0299074293852364, "grad_norm": 0.04853740334510803, "learning_rate": 6.728807757008024e-05, "loss": 0.2401, "step": 3191 }, { "epoch": 3.0308568715879423, "grad_norm": 0.04887021705508232, "learning_rate": 6.72316968462806e-05, "loss": 0.2637, "step": 3192 }, { "epoch": 3.031806313790648, "grad_norm": 0.044489774852991104, "learning_rate": 6.717532778942019e-05, "loss": 0.2537, "step": 3193 }, { "epoch": 3.032755755993354, "grad_norm": 0.04894377663731575, "learning_rate": 6.711897041956876e-05, "loss": 0.2531, "step": 3194 }, { "epoch": 3.03370519819606, "grad_norm": 0.04671604186296463, "learning_rate": 6.706262475679205e-05, "loss": 0.2622, "step": 3195 }, { "epoch": 3.0346546403987658, "grad_norm": 0.046363890171051025, "learning_rate": 6.70062908211515e-05, "loss": 0.2613, "step": 3196 }, { "epoch": 3.0356040826014716, "grad_norm": 0.04283369332551956, "learning_rate": 6.694996863270451e-05, "loss": 0.2594, "step": 3197 }, { "epoch": 3.0365535248041775, "grad_norm": 0.07976411283016205, "learning_rate": 6.68936582115042e-05, "loss": 0.2537, "step": 3198 }, { "epoch": 3.0375029670068834, "grad_norm": 0.04474799707531929, "learning_rate": 6.683735957759949e-05, "loss": 0.2599, "step": 3199 }, { "epoch": 3.0384524092095893, "grad_norm": 0.05153253301978111, "learning_rate": 6.678107275103519e-05, "loss": 0.2564, "step": 3200 }, { "epoch": 3.039401851412295, "grad_norm": 0.04684925451874733, "learning_rate": 6.672479775185181e-05, "loss": 0.2513, "step": 3201 }, { "epoch": 3.040351293615001, "grad_norm": 0.04749739170074463, "learning_rate": 6.666853460008575e-05, "loss": 0.2652, "step": 3202 }, { "epoch": 3.041300735817707, "grad_norm": 0.04831293970346451, "learning_rate": 6.661228331576906e-05, "loss": 0.2641, "step": 3203 }, { "epoch": 3.042250178020413, "grad_norm": 0.055306605994701385, "learning_rate": 6.655604391892972e-05, "loss": 0.2542, "step": 3204 }, { "epoch": 3.043199620223119, "grad_norm": 0.051187027245759964, "learning_rate": 6.649981642959132e-05, "loss": 0.2677, "step": 3205 }, { "epoch": 3.044149062425825, "grad_norm": 0.051244210451841354, "learning_rate": 6.644360086777332e-05, "loss": 0.2488, "step": 3206 }, { "epoch": 3.045098504628531, "grad_norm": 0.0746002122759819, "learning_rate": 6.638739725349094e-05, "loss": 0.2516, "step": 3207 }, { "epoch": 3.0460479468312367, "grad_norm": 0.05683232471346855, "learning_rate": 6.633120560675508e-05, "loss": 0.2528, "step": 3208 }, { "epoch": 3.0469973890339426, "grad_norm": 0.05021344870328903, "learning_rate": 6.627502594757242e-05, "loss": 0.2561, "step": 3209 }, { "epoch": 3.0479468312366484, "grad_norm": 0.051989272236824036, "learning_rate": 6.62188582959453e-05, "loss": 0.2427, "step": 3210 }, { "epoch": 3.0488962734393543, "grad_norm": 0.0517214760184288, "learning_rate": 6.61627026718719e-05, "loss": 0.2592, "step": 3211 }, { "epoch": 3.04984571564206, "grad_norm": 0.044613976031541824, "learning_rate": 6.610655909534605e-05, "loss": 0.2491, "step": 3212 }, { "epoch": 3.050795157844766, "grad_norm": 0.05095863714814186, "learning_rate": 6.605042758635729e-05, "loss": 0.2512, "step": 3213 }, { "epoch": 3.051744600047472, "grad_norm": 0.05005130171775818, "learning_rate": 6.599430816489092e-05, "loss": 0.2517, "step": 3214 }, { "epoch": 3.052694042250178, "grad_norm": 0.04979756847023964, "learning_rate": 6.593820085092781e-05, "loss": 0.2566, "step": 3215 }, { "epoch": 3.053643484452884, "grad_norm": 0.04886776953935623, "learning_rate": 6.588210566444469e-05, "loss": 0.2584, "step": 3216 }, { "epoch": 3.05459292665559, "grad_norm": 0.04781502112746239, "learning_rate": 6.582602262541379e-05, "loss": 0.2558, "step": 3217 }, { "epoch": 3.055542368858296, "grad_norm": 0.044281426817178726, "learning_rate": 6.576995175380322e-05, "loss": 0.257, "step": 3218 }, { "epoch": 3.0564918110610018, "grad_norm": 0.0911925807595253, "learning_rate": 6.571389306957654e-05, "loss": 0.2588, "step": 3219 }, { "epoch": 3.0574412532637076, "grad_norm": 0.07043775916099548, "learning_rate": 6.565784659269314e-05, "loss": 0.2422, "step": 3220 }, { "epoch": 3.0583906954664135, "grad_norm": 0.04519929364323616, "learning_rate": 6.560181234310795e-05, "loss": 0.2546, "step": 3221 }, { "epoch": 3.0593401376691194, "grad_norm": 0.05477475747466087, "learning_rate": 6.554579034077164e-05, "loss": 0.2589, "step": 3222 }, { "epoch": 3.0602895798718253, "grad_norm": 0.04687114432454109, "learning_rate": 6.548978060563049e-05, "loss": 0.2527, "step": 3223 }, { "epoch": 3.061239022074531, "grad_norm": 0.06336509436368942, "learning_rate": 6.543378315762634e-05, "loss": 0.2553, "step": 3224 }, { "epoch": 3.062188464277237, "grad_norm": 0.08432972431182861, "learning_rate": 6.537779801669677e-05, "loss": 0.2476, "step": 3225 }, { "epoch": 3.063137906479943, "grad_norm": 0.04580822214484215, "learning_rate": 6.532182520277485e-05, "loss": 0.2486, "step": 3226 }, { "epoch": 3.0640873486826488, "grad_norm": 0.0473608635365963, "learning_rate": 6.526586473578945e-05, "loss": 0.2573, "step": 3227 }, { "epoch": 3.0650367908853546, "grad_norm": 0.04817958176136017, "learning_rate": 6.52099166356648e-05, "loss": 0.2636, "step": 3228 }, { "epoch": 3.065986233088061, "grad_norm": 0.04982390254735947, "learning_rate": 6.515398092232093e-05, "loss": 0.2583, "step": 3229 }, { "epoch": 3.066935675290767, "grad_norm": 0.05001278966665268, "learning_rate": 6.509805761567336e-05, "loss": 0.2555, "step": 3230 }, { "epoch": 3.0678851174934727, "grad_norm": 0.04802548140287399, "learning_rate": 6.50421467356332e-05, "loss": 0.2502, "step": 3231 }, { "epoch": 3.0688345596961786, "grad_norm": 0.05099169537425041, "learning_rate": 6.498624830210722e-05, "loss": 0.2579, "step": 3232 }, { "epoch": 3.0697840018988845, "grad_norm": 0.05927233770489693, "learning_rate": 6.493036233499761e-05, "loss": 0.2666, "step": 3233 }, { "epoch": 3.0707334441015903, "grad_norm": 0.0797928124666214, "learning_rate": 6.487448885420224e-05, "loss": 0.2357, "step": 3234 }, { "epoch": 3.071682886304296, "grad_norm": 0.05498325452208519, "learning_rate": 6.481862787961447e-05, "loss": 0.2586, "step": 3235 }, { "epoch": 3.072632328507002, "grad_norm": 0.05666450038552284, "learning_rate": 6.476277943112331e-05, "loss": 0.2516, "step": 3236 }, { "epoch": 3.073581770709708, "grad_norm": 0.058523859828710556, "learning_rate": 6.470694352861312e-05, "loss": 0.2663, "step": 3237 }, { "epoch": 3.074531212912414, "grad_norm": 0.0475163534283638, "learning_rate": 6.465112019196398e-05, "loss": 0.255, "step": 3238 }, { "epoch": 3.0754806551151197, "grad_norm": 0.07355392724275589, "learning_rate": 6.459530944105141e-05, "loss": 0.2676, "step": 3239 }, { "epoch": 3.0764300973178256, "grad_norm": 0.047693684697151184, "learning_rate": 6.453951129574644e-05, "loss": 0.2603, "step": 3240 }, { "epoch": 3.077379539520532, "grad_norm": 0.052322860807180405, "learning_rate": 6.448372577591568e-05, "loss": 0.2518, "step": 3241 }, { "epoch": 3.0783289817232378, "grad_norm": 0.04750973731279373, "learning_rate": 6.442795290142114e-05, "loss": 0.2515, "step": 3242 }, { "epoch": 3.0792784239259436, "grad_norm": 0.046384576708078384, "learning_rate": 6.437219269212042e-05, "loss": 0.2494, "step": 3243 }, { "epoch": 3.0802278661286495, "grad_norm": 0.04986315593123436, "learning_rate": 6.431644516786657e-05, "loss": 0.2534, "step": 3244 }, { "epoch": 3.0811773083313554, "grad_norm": 0.04750996455550194, "learning_rate": 6.426071034850811e-05, "loss": 0.2534, "step": 3245 }, { "epoch": 3.0821267505340613, "grad_norm": 0.0551830492913723, "learning_rate": 6.420498825388915e-05, "loss": 0.246, "step": 3246 }, { "epoch": 3.083076192736767, "grad_norm": 0.05087340250611305, "learning_rate": 6.414927890384903e-05, "loss": 0.2578, "step": 3247 }, { "epoch": 3.084025634939473, "grad_norm": 0.04738471284508705, "learning_rate": 6.40935823182228e-05, "loss": 0.251, "step": 3248 }, { "epoch": 3.084975077142179, "grad_norm": 0.07119819521903992, "learning_rate": 6.403789851684082e-05, "loss": 0.271, "step": 3249 }, { "epoch": 3.0859245193448848, "grad_norm": 0.04995843023061752, "learning_rate": 6.398222751952899e-05, "loss": 0.2585, "step": 3250 }, { "epoch": 3.0868739615475906, "grad_norm": 0.05113042891025543, "learning_rate": 6.392656934610852e-05, "loss": 0.2691, "step": 3251 }, { "epoch": 3.0878234037502965, "grad_norm": 0.052136246114969254, "learning_rate": 6.387092401639623e-05, "loss": 0.265, "step": 3252 }, { "epoch": 3.0887728459530024, "grad_norm": 0.048999518156051636, "learning_rate": 6.381529155020418e-05, "loss": 0.2535, "step": 3253 }, { "epoch": 3.0897222881557087, "grad_norm": 0.1198212206363678, "learning_rate": 6.375967196734003e-05, "loss": 0.2574, "step": 3254 }, { "epoch": 3.0906717303584146, "grad_norm": 0.0474587082862854, "learning_rate": 6.370406528760675e-05, "loss": 0.2623, "step": 3255 }, { "epoch": 3.0916211725611205, "grad_norm": 0.04819402098655701, "learning_rate": 6.364847153080268e-05, "loss": 0.2502, "step": 3256 }, { "epoch": 3.0925706147638263, "grad_norm": 0.04643898829817772, "learning_rate": 6.359289071672168e-05, "loss": 0.2528, "step": 3257 }, { "epoch": 3.093520056966532, "grad_norm": 0.05174558609724045, "learning_rate": 6.353732286515286e-05, "loss": 0.2657, "step": 3258 }, { "epoch": 3.094469499169238, "grad_norm": 0.04330934211611748, "learning_rate": 6.348176799588088e-05, "loss": 0.2559, "step": 3259 }, { "epoch": 3.095418941371944, "grad_norm": 0.06805543601512909, "learning_rate": 6.34262261286856e-05, "loss": 0.2317, "step": 3260 }, { "epoch": 3.09636838357465, "grad_norm": 0.042569007724523544, "learning_rate": 6.337069728334239e-05, "loss": 0.2549, "step": 3261 }, { "epoch": 3.0973178257773557, "grad_norm": 0.04311903938651085, "learning_rate": 6.33151814796219e-05, "loss": 0.2557, "step": 3262 }, { "epoch": 3.0982672679800616, "grad_norm": 0.05211617797613144, "learning_rate": 6.325967873729018e-05, "loss": 0.2471, "step": 3263 }, { "epoch": 3.0992167101827675, "grad_norm": 0.04343589022755623, "learning_rate": 6.320418907610865e-05, "loss": 0.2462, "step": 3264 }, { "epoch": 3.1001661523854733, "grad_norm": 0.04785288870334625, "learning_rate": 6.314871251583398e-05, "loss": 0.2478, "step": 3265 }, { "epoch": 3.1011155945881796, "grad_norm": 0.04962451383471489, "learning_rate": 6.309324907621827e-05, "loss": 0.2448, "step": 3266 }, { "epoch": 3.1020650367908855, "grad_norm": 0.05707186087965965, "learning_rate": 6.30377987770089e-05, "loss": 0.2557, "step": 3267 }, { "epoch": 3.1030144789935914, "grad_norm": 0.0540940947830677, "learning_rate": 6.298236163794863e-05, "loss": 0.2599, "step": 3268 }, { "epoch": 3.1039639211962973, "grad_norm": 0.07168012112379074, "learning_rate": 6.292693767877542e-05, "loss": 0.2355, "step": 3269 }, { "epoch": 3.104913363399003, "grad_norm": 0.050079986453056335, "learning_rate": 6.287152691922264e-05, "loss": 0.2575, "step": 3270 }, { "epoch": 3.105862805601709, "grad_norm": 0.05402916669845581, "learning_rate": 6.281612937901894e-05, "loss": 0.2676, "step": 3271 }, { "epoch": 3.106812247804415, "grad_norm": 0.05587102100253105, "learning_rate": 6.276074507788821e-05, "loss": 0.2721, "step": 3272 }, { "epoch": 3.1077616900071208, "grad_norm": 0.052861545234918594, "learning_rate": 6.270537403554973e-05, "loss": 0.2597, "step": 3273 }, { "epoch": 3.1087111322098266, "grad_norm": 0.05834222957491875, "learning_rate": 6.265001627171793e-05, "loss": 0.2667, "step": 3274 }, { "epoch": 3.1096605744125325, "grad_norm": 0.04725935310125351, "learning_rate": 6.259467180610261e-05, "loss": 0.2649, "step": 3275 }, { "epoch": 3.1106100166152384, "grad_norm": 0.09624456614255905, "learning_rate": 6.25393406584088e-05, "loss": 0.2602, "step": 3276 }, { "epoch": 3.1115594588179443, "grad_norm": 0.048281747847795486, "learning_rate": 6.248402284833682e-05, "loss": 0.2482, "step": 3277 }, { "epoch": 3.11250890102065, "grad_norm": 0.07574120908975601, "learning_rate": 6.242871839558215e-05, "loss": 0.2559, "step": 3278 }, { "epoch": 3.1134583432233565, "grad_norm": 0.04702775552868843, "learning_rate": 6.237342731983562e-05, "loss": 0.2484, "step": 3279 }, { "epoch": 3.1144077854260623, "grad_norm": 0.05545090511441231, "learning_rate": 6.231814964078327e-05, "loss": 0.2659, "step": 3280 }, { "epoch": 3.115357227628768, "grad_norm": 0.054547566920518875, "learning_rate": 6.22628853781063e-05, "loss": 0.2639, "step": 3281 }, { "epoch": 3.116306669831474, "grad_norm": 0.08548653870820999, "learning_rate": 6.220763455148126e-05, "loss": 0.2669, "step": 3282 }, { "epoch": 3.11725611203418, "grad_norm": 0.04788108915090561, "learning_rate": 6.215239718057976e-05, "loss": 0.2496, "step": 3283 }, { "epoch": 3.118205554236886, "grad_norm": 0.061255473643541336, "learning_rate": 6.209717328506877e-05, "loss": 0.2386, "step": 3284 }, { "epoch": 3.1191549964395917, "grad_norm": 0.06910520046949387, "learning_rate": 6.204196288461037e-05, "loss": 0.2486, "step": 3285 }, { "epoch": 3.1201044386422976, "grad_norm": 0.06149483472108841, "learning_rate": 6.198676599886185e-05, "loss": 0.2606, "step": 3286 }, { "epoch": 3.1210538808450035, "grad_norm": 0.09108784794807434, "learning_rate": 6.193158264747576e-05, "loss": 0.2633, "step": 3287 }, { "epoch": 3.1220033230477093, "grad_norm": 0.08666419982910156, "learning_rate": 6.187641285009966e-05, "loss": 0.2737, "step": 3288 }, { "epoch": 3.122952765250415, "grad_norm": 0.05654684826731682, "learning_rate": 6.18212566263765e-05, "loss": 0.2491, "step": 3289 }, { "epoch": 3.123902207453121, "grad_norm": 0.05671761557459831, "learning_rate": 6.176611399594421e-05, "loss": 0.2565, "step": 3290 }, { "epoch": 3.1248516496558274, "grad_norm": 0.04744125157594681, "learning_rate": 6.171098497843606e-05, "loss": 0.2653, "step": 3291 }, { "epoch": 3.1258010918585333, "grad_norm": 0.07659009844064713, "learning_rate": 6.165586959348026e-05, "loss": 0.2655, "step": 3292 }, { "epoch": 3.126750534061239, "grad_norm": 0.04795358330011368, "learning_rate": 6.160076786070036e-05, "loss": 0.2586, "step": 3293 }, { "epoch": 3.127699976263945, "grad_norm": 0.05069601535797119, "learning_rate": 6.154567979971493e-05, "loss": 0.2564, "step": 3294 }, { "epoch": 3.128649418466651, "grad_norm": 0.09271470457315445, "learning_rate": 6.149060543013771e-05, "loss": 0.2662, "step": 3295 }, { "epoch": 3.1295988606693568, "grad_norm": 0.06749092042446136, "learning_rate": 6.143554477157763e-05, "loss": 0.2669, "step": 3296 }, { "epoch": 3.1305483028720626, "grad_norm": 0.04530181735754013, "learning_rate": 6.13804978436386e-05, "loss": 0.2531, "step": 3297 }, { "epoch": 3.1314977450747685, "grad_norm": 0.04635681211948395, "learning_rate": 6.132546466591977e-05, "loss": 0.2463, "step": 3298 }, { "epoch": 3.1324471872774744, "grad_norm": 0.0409170500934124, "learning_rate": 6.127044525801529e-05, "loss": 0.2545, "step": 3299 }, { "epoch": 3.1333966294801803, "grad_norm": 0.057700783014297485, "learning_rate": 6.121543963951452e-05, "loss": 0.2579, "step": 3300 }, { "epoch": 3.134346071682886, "grad_norm": 0.044831424951553345, "learning_rate": 6.11604478300018e-05, "loss": 0.2527, "step": 3301 }, { "epoch": 3.135295513885592, "grad_norm": 0.044355545192956924, "learning_rate": 6.110546984905661e-05, "loss": 0.257, "step": 3302 }, { "epoch": 3.136244956088298, "grad_norm": 0.06783049553632736, "learning_rate": 6.105050571625353e-05, "loss": 0.27, "step": 3303 }, { "epoch": 3.137194398291004, "grad_norm": 0.0519019216299057, "learning_rate": 6.0995555451162145e-05, "loss": 0.2561, "step": 3304 }, { "epoch": 3.13814384049371, "grad_norm": 0.057388197630643845, "learning_rate": 6.094061907334718e-05, "loss": 0.2557, "step": 3305 }, { "epoch": 3.139093282696416, "grad_norm": 0.11174029111862183, "learning_rate": 6.0885696602368315e-05, "loss": 0.2425, "step": 3306 }, { "epoch": 3.140042724899122, "grad_norm": 0.059950169175863266, "learning_rate": 6.0830788057780374e-05, "loss": 0.2546, "step": 3307 }, { "epoch": 3.1409921671018277, "grad_norm": 0.04765070974826813, "learning_rate": 6.077589345913315e-05, "loss": 0.2418, "step": 3308 }, { "epoch": 3.1419416093045336, "grad_norm": 0.049464669078588486, "learning_rate": 6.072101282597156e-05, "loss": 0.2546, "step": 3309 }, { "epoch": 3.1428910515072395, "grad_norm": 0.06911448389291763, "learning_rate": 6.0666146177835425e-05, "loss": 0.2686, "step": 3310 }, { "epoch": 3.1438404937099453, "grad_norm": 0.06544983386993408, "learning_rate": 6.06112935342597e-05, "loss": 0.2493, "step": 3311 }, { "epoch": 3.144789935912651, "grad_norm": 0.08926332741975784, "learning_rate": 6.0556454914774295e-05, "loss": 0.2685, "step": 3312 }, { "epoch": 3.145739378115357, "grad_norm": 0.04235906898975372, "learning_rate": 6.0501630338904136e-05, "loss": 0.2577, "step": 3313 }, { "epoch": 3.146688820318063, "grad_norm": 0.04946906492114067, "learning_rate": 6.04468198261692e-05, "loss": 0.2554, "step": 3314 }, { "epoch": 3.1476382625207693, "grad_norm": 0.05564684048295021, "learning_rate": 6.039202339608432e-05, "loss": 0.2677, "step": 3315 }, { "epoch": 3.148587704723475, "grad_norm": 0.0531269796192646, "learning_rate": 6.03372410681595e-05, "loss": 0.2674, "step": 3316 }, { "epoch": 3.149537146926181, "grad_norm": 0.047610778361558914, "learning_rate": 6.028247286189953e-05, "loss": 0.2578, "step": 3317 }, { "epoch": 3.150486589128887, "grad_norm": 0.06487441807985306, "learning_rate": 6.0227718796804377e-05, "loss": 0.2677, "step": 3318 }, { "epoch": 3.151436031331593, "grad_norm": 0.04265904799103737, "learning_rate": 6.017297889236878e-05, "loss": 0.2526, "step": 3319 }, { "epoch": 3.1523854735342987, "grad_norm": 0.04901301488280296, "learning_rate": 6.011825316808255e-05, "loss": 0.2522, "step": 3320 }, { "epoch": 3.1533349157370045, "grad_norm": 0.04461880400776863, "learning_rate": 6.006354164343046e-05, "loss": 0.2567, "step": 3321 }, { "epoch": 3.1542843579397104, "grad_norm": 0.05425048992037773, "learning_rate": 6.000884433789211e-05, "loss": 0.2708, "step": 3322 }, { "epoch": 3.1552338001424163, "grad_norm": 0.047145530581474304, "learning_rate": 5.995416127094222e-05, "loss": 0.2598, "step": 3323 }, { "epoch": 3.156183242345122, "grad_norm": 0.04286041483283043, "learning_rate": 5.989949246205024e-05, "loss": 0.2544, "step": 3324 }, { "epoch": 3.157132684547828, "grad_norm": 0.0453423373401165, "learning_rate": 5.984483793068072e-05, "loss": 0.2547, "step": 3325 }, { "epoch": 3.158082126750534, "grad_norm": 0.07012559473514557, "learning_rate": 5.979019769629297e-05, "loss": 0.2868, "step": 3326 }, { "epoch": 3.1590315689532398, "grad_norm": 0.04472891986370087, "learning_rate": 5.9735571778341325e-05, "loss": 0.2578, "step": 3327 }, { "epoch": 3.1599810111559457, "grad_norm": 0.04697902500629425, "learning_rate": 5.9680960196274994e-05, "loss": 0.2549, "step": 3328 }, { "epoch": 3.160930453358652, "grad_norm": 0.04871654137969017, "learning_rate": 5.9626362969538053e-05, "loss": 0.2492, "step": 3329 }, { "epoch": 3.161879895561358, "grad_norm": 0.042063839733600616, "learning_rate": 5.957178011756952e-05, "loss": 0.2528, "step": 3330 }, { "epoch": 3.1628293377640637, "grad_norm": 0.04910881072282791, "learning_rate": 5.9517211659803216e-05, "loss": 0.2445, "step": 3331 }, { "epoch": 3.1637787799667696, "grad_norm": 0.0521257258951664, "learning_rate": 5.94626576156679e-05, "loss": 0.2623, "step": 3332 }, { "epoch": 3.1647282221694755, "grad_norm": 0.05412798747420311, "learning_rate": 5.9408118004587185e-05, "loss": 0.2565, "step": 3333 }, { "epoch": 3.1656776643721813, "grad_norm": 0.04634969308972359, "learning_rate": 5.935359284597957e-05, "loss": 0.2501, "step": 3334 }, { "epoch": 3.166627106574887, "grad_norm": 0.04625312611460686, "learning_rate": 5.92990821592583e-05, "loss": 0.2461, "step": 3335 }, { "epoch": 3.167576548777593, "grad_norm": 0.061656538397073746, "learning_rate": 5.924458596383161e-05, "loss": 0.2604, "step": 3336 }, { "epoch": 3.168525990980299, "grad_norm": 0.0656399130821228, "learning_rate": 5.919010427910252e-05, "loss": 0.2729, "step": 3337 }, { "epoch": 3.169475433183005, "grad_norm": 0.04736149311065674, "learning_rate": 5.913563712446883e-05, "loss": 0.2554, "step": 3338 }, { "epoch": 3.1704248753857107, "grad_norm": 0.07337143272161484, "learning_rate": 5.9081184519323275e-05, "loss": 0.2574, "step": 3339 }, { "epoch": 3.171374317588417, "grad_norm": 0.06286807358264923, "learning_rate": 5.902674648305329e-05, "loss": 0.2737, "step": 3340 }, { "epoch": 3.172323759791123, "grad_norm": 0.046282291412353516, "learning_rate": 5.89723230350412e-05, "loss": 0.2544, "step": 3341 }, { "epoch": 3.173273201993829, "grad_norm": 0.045843806117773056, "learning_rate": 5.89179141946641e-05, "loss": 0.2516, "step": 3342 }, { "epoch": 3.1742226441965347, "grad_norm": 0.0496109202504158, "learning_rate": 5.8863519981293926e-05, "loss": 0.261, "step": 3343 }, { "epoch": 3.1751720863992405, "grad_norm": 0.04872041568160057, "learning_rate": 5.8809140414297416e-05, "loss": 0.2449, "step": 3344 }, { "epoch": 3.1761215286019464, "grad_norm": 0.054158084094524384, "learning_rate": 5.8754775513035964e-05, "loss": 0.2507, "step": 3345 }, { "epoch": 3.1770709708046523, "grad_norm": 0.051501765847206116, "learning_rate": 5.8700425296865905e-05, "loss": 0.2472, "step": 3346 }, { "epoch": 3.178020413007358, "grad_norm": 0.048579879105091095, "learning_rate": 5.8646089785138235e-05, "loss": 0.2535, "step": 3347 }, { "epoch": 3.178969855210064, "grad_norm": 0.07795701920986176, "learning_rate": 5.859176899719883e-05, "loss": 0.2796, "step": 3348 }, { "epoch": 3.17991929741277, "grad_norm": 0.053580522537231445, "learning_rate": 5.8537462952388155e-05, "loss": 0.2599, "step": 3349 }, { "epoch": 3.180868739615476, "grad_norm": 0.04634246602654457, "learning_rate": 5.848317167004158e-05, "loss": 0.2451, "step": 3350 }, { "epoch": 3.1818181818181817, "grad_norm": 0.044239211827516556, "learning_rate": 5.842889516948913e-05, "loss": 0.245, "step": 3351 }, { "epoch": 3.1827676240208875, "grad_norm": 0.04600201174616814, "learning_rate": 5.837463347005561e-05, "loss": 0.2621, "step": 3352 }, { "epoch": 3.1837170662235934, "grad_norm": 0.0480601042509079, "learning_rate": 5.832038659106056e-05, "loss": 0.267, "step": 3353 }, { "epoch": 3.1846665084262997, "grad_norm": 0.05365194007754326, "learning_rate": 5.8266154551818216e-05, "loss": 0.226, "step": 3354 }, { "epoch": 3.1856159506290056, "grad_norm": 0.1545141488313675, "learning_rate": 5.821193737163753e-05, "loss": 0.2352, "step": 3355 }, { "epoch": 3.1865653928317115, "grad_norm": 0.04504143446683884, "learning_rate": 5.8157735069822176e-05, "loss": 0.2509, "step": 3356 }, { "epoch": 3.1875148350344173, "grad_norm": 0.08130753040313721, "learning_rate": 5.810354766567052e-05, "loss": 0.2566, "step": 3357 }, { "epoch": 3.1884642772371232, "grad_norm": 0.06167406588792801, "learning_rate": 5.8049375178475594e-05, "loss": 0.243, "step": 3358 }, { "epoch": 3.189413719439829, "grad_norm": 0.0594964362680912, "learning_rate": 5.799521762752524e-05, "loss": 0.2594, "step": 3359 }, { "epoch": 3.190363161642535, "grad_norm": 0.049432095140218735, "learning_rate": 5.794107503210186e-05, "loss": 0.2589, "step": 3360 }, { "epoch": 3.191312603845241, "grad_norm": 0.0841529369354248, "learning_rate": 5.788694741148257e-05, "loss": 0.278, "step": 3361 }, { "epoch": 3.1922620460479467, "grad_norm": 0.07379740476608276, "learning_rate": 5.7832834784939163e-05, "loss": 0.2686, "step": 3362 }, { "epoch": 3.1932114882506526, "grad_norm": 0.0550118163228035, "learning_rate": 5.777873717173803e-05, "loss": 0.2572, "step": 3363 }, { "epoch": 3.1941609304533585, "grad_norm": 0.050232190638780594, "learning_rate": 5.7724654591140385e-05, "loss": 0.2428, "step": 3364 }, { "epoch": 3.195110372656065, "grad_norm": 0.07592414319515228, "learning_rate": 5.7670587062401826e-05, "loss": 0.2639, "step": 3365 }, { "epoch": 3.1960598148587707, "grad_norm": 0.06244008243083954, "learning_rate": 5.761653460477286e-05, "loss": 0.2716, "step": 3366 }, { "epoch": 3.1970092570614765, "grad_norm": 0.05664997175335884, "learning_rate": 5.756249723749847e-05, "loss": 0.2569, "step": 3367 }, { "epoch": 3.1979586992641824, "grad_norm": 0.053819846361875534, "learning_rate": 5.750847497981827e-05, "loss": 0.264, "step": 3368 }, { "epoch": 3.1989081414668883, "grad_norm": 0.09246042370796204, "learning_rate": 5.745446785096664e-05, "loss": 0.256, "step": 3369 }, { "epoch": 3.199857583669594, "grad_norm": 0.0775340348482132, "learning_rate": 5.740047587017232e-05, "loss": 0.2708, "step": 3370 }, { "epoch": 3.2008070258723, "grad_norm": 0.06020486727356911, "learning_rate": 5.734649905665891e-05, "loss": 0.2579, "step": 3371 }, { "epoch": 3.201756468075006, "grad_norm": 0.051304880529642105, "learning_rate": 5.7292537429644454e-05, "loss": 0.2571, "step": 3372 }, { "epoch": 3.202705910277712, "grad_norm": 0.05811922997236252, "learning_rate": 5.723859100834165e-05, "loss": 0.2568, "step": 3373 }, { "epoch": 3.2036553524804177, "grad_norm": 0.05013841763138771, "learning_rate": 5.718465981195775e-05, "loss": 0.2479, "step": 3374 }, { "epoch": 3.2046047946831235, "grad_norm": 0.048892173916101456, "learning_rate": 5.713074385969457e-05, "loss": 0.2522, "step": 3375 }, { "epoch": 3.2055542368858294, "grad_norm": 0.09769143909215927, "learning_rate": 5.7076843170748615e-05, "loss": 0.2512, "step": 3376 }, { "epoch": 3.2065036790885353, "grad_norm": 0.09074780344963074, "learning_rate": 5.702295776431084e-05, "loss": 0.2631, "step": 3377 }, { "epoch": 3.207453121291241, "grad_norm": 0.05048530921339989, "learning_rate": 5.6969087659566756e-05, "loss": 0.2496, "step": 3378 }, { "epoch": 3.2084025634939475, "grad_norm": 0.0813853070139885, "learning_rate": 5.691523287569649e-05, "loss": 0.2616, "step": 3379 }, { "epoch": 3.2093520056966534, "grad_norm": 0.04971461743116379, "learning_rate": 5.6861393431874675e-05, "loss": 0.2684, "step": 3380 }, { "epoch": 3.2103014478993592, "grad_norm": 0.07930952310562134, "learning_rate": 5.680756934727046e-05, "loss": 0.2686, "step": 3381 }, { "epoch": 3.211250890102065, "grad_norm": 0.04881668835878372, "learning_rate": 5.675376064104767e-05, "loss": 0.2527, "step": 3382 }, { "epoch": 3.212200332304771, "grad_norm": 0.09243467450141907, "learning_rate": 5.669996733236438e-05, "loss": 0.2709, "step": 3383 }, { "epoch": 3.213149774507477, "grad_norm": 0.05580917000770569, "learning_rate": 5.6646189440373456e-05, "loss": 0.2482, "step": 3384 }, { "epoch": 3.2140992167101827, "grad_norm": 0.0797090157866478, "learning_rate": 5.659242698422214e-05, "loss": 0.2635, "step": 3385 }, { "epoch": 3.2150486589128886, "grad_norm": 0.050484515726566315, "learning_rate": 5.653867998305216e-05, "loss": 0.246, "step": 3386 }, { "epoch": 3.2159981011155945, "grad_norm": 0.07907379418611526, "learning_rate": 5.64849484559999e-05, "loss": 0.2384, "step": 3387 }, { "epoch": 3.2169475433183004, "grad_norm": 0.057728394865989685, "learning_rate": 5.6431232422195946e-05, "loss": 0.242, "step": 3388 }, { "epoch": 3.2178969855210062, "grad_norm": 0.053205668926239014, "learning_rate": 5.6377531900765666e-05, "loss": 0.2532, "step": 3389 }, { "epoch": 3.2188464277237125, "grad_norm": 0.05353543162345886, "learning_rate": 5.6323846910828735e-05, "loss": 0.2574, "step": 3390 }, { "epoch": 3.2197958699264184, "grad_norm": 0.04991352930665016, "learning_rate": 5.6270177471499365e-05, "loss": 0.2574, "step": 3391 }, { "epoch": 3.2207453121291243, "grad_norm": 0.05403256043791771, "learning_rate": 5.621652360188617e-05, "loss": 0.2556, "step": 3392 }, { "epoch": 3.22169475433183, "grad_norm": 0.05058816447854042, "learning_rate": 5.616288532109225e-05, "loss": 0.2543, "step": 3393 }, { "epoch": 3.222644196534536, "grad_norm": 0.04839969053864479, "learning_rate": 5.610926264821523e-05, "loss": 0.2565, "step": 3394 }, { "epoch": 3.223593638737242, "grad_norm": 0.11676731705665588, "learning_rate": 5.6055655602347067e-05, "loss": 0.2506, "step": 3395 }, { "epoch": 3.224543080939948, "grad_norm": 0.04872575402259827, "learning_rate": 5.600206420257419e-05, "loss": 0.2591, "step": 3396 }, { "epoch": 3.2254925231426537, "grad_norm": 0.046489961445331573, "learning_rate": 5.5948488467977486e-05, "loss": 0.2462, "step": 3397 }, { "epoch": 3.2264419653453595, "grad_norm": 0.04424404352903366, "learning_rate": 5.589492841763224e-05, "loss": 0.2567, "step": 3398 }, { "epoch": 3.2273914075480654, "grad_norm": 0.04828077182173729, "learning_rate": 5.5841384070608104e-05, "loss": 0.2568, "step": 3399 }, { "epoch": 3.2283408497507713, "grad_norm": 0.06325559318065643, "learning_rate": 5.5787855445969276e-05, "loss": 0.2802, "step": 3400 }, { "epoch": 3.229290291953477, "grad_norm": 0.10159096866846085, "learning_rate": 5.5734342562774234e-05, "loss": 0.2502, "step": 3401 }, { "epoch": 3.230239734156183, "grad_norm": 0.04155226796865463, "learning_rate": 5.568084544007588e-05, "loss": 0.2463, "step": 3402 }, { "epoch": 3.2311891763588894, "grad_norm": 0.04739375039935112, "learning_rate": 5.562736409692153e-05, "loss": 0.262, "step": 3403 }, { "epoch": 3.2321386185615952, "grad_norm": 0.04406020790338516, "learning_rate": 5.55738985523528e-05, "loss": 0.2507, "step": 3404 }, { "epoch": 3.233088060764301, "grad_norm": 0.09737671911716461, "learning_rate": 5.55204488254059e-05, "loss": 0.2427, "step": 3405 }, { "epoch": 3.234037502967007, "grad_norm": 0.05227050185203552, "learning_rate": 5.546701493511106e-05, "loss": 0.2669, "step": 3406 }, { "epoch": 3.234986945169713, "grad_norm": 0.044119905680418015, "learning_rate": 5.541359690049321e-05, "loss": 0.2486, "step": 3407 }, { "epoch": 3.2359363873724187, "grad_norm": 0.04765981808304787, "learning_rate": 5.5360194740571445e-05, "loss": 0.261, "step": 3408 }, { "epoch": 3.2368858295751246, "grad_norm": 0.050302762538194656, "learning_rate": 5.5306808474359205e-05, "loss": 0.2604, "step": 3409 }, { "epoch": 3.2378352717778305, "grad_norm": 0.10235820710659027, "learning_rate": 5.525343812086445e-05, "loss": 0.2609, "step": 3410 }, { "epoch": 3.2387847139805364, "grad_norm": 0.04724949970841408, "learning_rate": 5.520008369908918e-05, "loss": 0.2617, "step": 3411 }, { "epoch": 3.2397341561832422, "grad_norm": 0.05436694622039795, "learning_rate": 5.5146745228030006e-05, "loss": 0.2839, "step": 3412 }, { "epoch": 3.240683598385948, "grad_norm": 0.04633798077702522, "learning_rate": 5.50934227266777e-05, "loss": 0.252, "step": 3413 }, { "epoch": 3.241633040588654, "grad_norm": 0.04847753047943115, "learning_rate": 5.504011621401738e-05, "loss": 0.26, "step": 3414 }, { "epoch": 3.2425824827913603, "grad_norm": 0.07506626099348068, "learning_rate": 5.498682570902849e-05, "loss": 0.2523, "step": 3415 }, { "epoch": 3.243531924994066, "grad_norm": 0.050959791988134384, "learning_rate": 5.493355123068473e-05, "loss": 0.2615, "step": 3416 }, { "epoch": 3.244481367196772, "grad_norm": 0.06273401528596878, "learning_rate": 5.488029279795419e-05, "loss": 0.2692, "step": 3417 }, { "epoch": 3.245430809399478, "grad_norm": 0.04362702742218971, "learning_rate": 5.4827050429799167e-05, "loss": 0.2534, "step": 3418 }, { "epoch": 3.246380251602184, "grad_norm": 0.08757391571998596, "learning_rate": 5.477382414517624e-05, "loss": 0.2646, "step": 3419 }, { "epoch": 3.2473296938048897, "grad_norm": 0.04454657435417175, "learning_rate": 5.472061396303629e-05, "loss": 0.2584, "step": 3420 }, { "epoch": 3.2482791360075955, "grad_norm": 0.053924281150102615, "learning_rate": 5.466741990232445e-05, "loss": 0.2429, "step": 3421 }, { "epoch": 3.2492285782103014, "grad_norm": 0.05537960305809975, "learning_rate": 5.461424198198006e-05, "loss": 0.2595, "step": 3422 }, { "epoch": 3.2501780204130073, "grad_norm": 0.06580296903848648, "learning_rate": 5.456108022093691e-05, "loss": 0.2739, "step": 3423 }, { "epoch": 3.251127462615713, "grad_norm": 0.0508543886244297, "learning_rate": 5.4507934638122727e-05, "loss": 0.2566, "step": 3424 }, { "epoch": 3.252076904818419, "grad_norm": 0.05126870423555374, "learning_rate": 5.445480525245976e-05, "loss": 0.2546, "step": 3425 }, { "epoch": 3.253026347021125, "grad_norm": 0.0474667064845562, "learning_rate": 5.440169208286436e-05, "loss": 0.2609, "step": 3426 }, { "epoch": 3.253975789223831, "grad_norm": 0.04915899783372879, "learning_rate": 5.434859514824706e-05, "loss": 0.2558, "step": 3427 }, { "epoch": 3.2549252314265367, "grad_norm": 0.061247166246175766, "learning_rate": 5.429551446751282e-05, "loss": 0.2369, "step": 3428 }, { "epoch": 3.255874673629243, "grad_norm": 0.08276187628507614, "learning_rate": 5.424245005956048e-05, "loss": 0.2525, "step": 3429 }, { "epoch": 3.256824115831949, "grad_norm": 0.06013704091310501, "learning_rate": 5.418940194328344e-05, "loss": 0.2656, "step": 3430 }, { "epoch": 3.2577735580346547, "grad_norm": 0.05011112242937088, "learning_rate": 5.413637013756898e-05, "loss": 0.2445, "step": 3431 }, { "epoch": 3.2587230002373606, "grad_norm": 0.0639798641204834, "learning_rate": 5.4083354661298814e-05, "loss": 0.279, "step": 3432 }, { "epoch": 3.2596724424400665, "grad_norm": 0.046627677977085114, "learning_rate": 5.403035553334881e-05, "loss": 0.252, "step": 3433 }, { "epoch": 3.2606218846427724, "grad_norm": 0.049054812639951706, "learning_rate": 5.397737277258883e-05, "loss": 0.2484, "step": 3434 }, { "epoch": 3.2615713268454782, "grad_norm": 0.05973832309246063, "learning_rate": 5.3924406397883174e-05, "loss": 0.2428, "step": 3435 }, { "epoch": 3.262520769048184, "grad_norm": 0.05139094963669777, "learning_rate": 5.3871456428090025e-05, "loss": 0.2456, "step": 3436 }, { "epoch": 3.26347021125089, "grad_norm": 0.07416705787181854, "learning_rate": 5.3818522882061995e-05, "loss": 0.2297, "step": 3437 }, { "epoch": 3.264419653453596, "grad_norm": 0.07129445672035217, "learning_rate": 5.376560577864567e-05, "loss": 0.2438, "step": 3438 }, { "epoch": 3.2653690956563017, "grad_norm": 0.050706807523965836, "learning_rate": 5.371270513668185e-05, "loss": 0.26, "step": 3439 }, { "epoch": 3.266318537859008, "grad_norm": 0.04617472365498543, "learning_rate": 5.365982097500545e-05, "loss": 0.2546, "step": 3440 }, { "epoch": 3.267267980061714, "grad_norm": 0.06968291103839874, "learning_rate": 5.36069533124455e-05, "loss": 0.2617, "step": 3441 }, { "epoch": 3.26821742226442, "grad_norm": 0.044583242386579514, "learning_rate": 5.355410216782526e-05, "loss": 0.2538, "step": 3442 }, { "epoch": 3.2691668644671257, "grad_norm": 0.058784905821084976, "learning_rate": 5.350126755996199e-05, "loss": 0.2661, "step": 3443 }, { "epoch": 3.2701163066698316, "grad_norm": 0.05246102437376976, "learning_rate": 5.344844950766712e-05, "loss": 0.2618, "step": 3444 }, { "epoch": 3.2710657488725374, "grad_norm": 0.0502961203455925, "learning_rate": 5.339564802974615e-05, "loss": 0.2557, "step": 3445 }, { "epoch": 3.2720151910752433, "grad_norm": 0.06429962813854218, "learning_rate": 5.33428631449987e-05, "loss": 0.2554, "step": 3446 }, { "epoch": 3.272964633277949, "grad_norm": 0.07397405058145523, "learning_rate": 5.329009487221845e-05, "loss": 0.2734, "step": 3447 }, { "epoch": 3.273914075480655, "grad_norm": 0.04893027991056442, "learning_rate": 5.3237343230193296e-05, "loss": 0.2609, "step": 3448 }, { "epoch": 3.274863517683361, "grad_norm": 0.05211041122674942, "learning_rate": 5.318460823770504e-05, "loss": 0.2543, "step": 3449 }, { "epoch": 3.275812959886067, "grad_norm": 0.04557442665100098, "learning_rate": 5.313188991352964e-05, "loss": 0.2482, "step": 3450 }, { "epoch": 3.2767624020887727, "grad_norm": 0.048406410962343216, "learning_rate": 5.307918827643712e-05, "loss": 0.259, "step": 3451 }, { "epoch": 3.2777118442914785, "grad_norm": 0.05563647300004959, "learning_rate": 5.302650334519151e-05, "loss": 0.2263, "step": 3452 }, { "epoch": 3.2786612864941844, "grad_norm": 0.04374406114220619, "learning_rate": 5.2973835138551056e-05, "loss": 0.2494, "step": 3453 }, { "epoch": 3.2796107286968907, "grad_norm": 0.053693000227212906, "learning_rate": 5.292118367526775e-05, "loss": 0.2699, "step": 3454 }, { "epoch": 3.2805601708995966, "grad_norm": 0.04658116027712822, "learning_rate": 5.2868548974087925e-05, "loss": 0.2578, "step": 3455 }, { "epoch": 3.2815096131023025, "grad_norm": 0.0444650836288929, "learning_rate": 5.28159310537518e-05, "loss": 0.25, "step": 3456 }, { "epoch": 3.2824590553050084, "grad_norm": 0.04959743097424507, "learning_rate": 5.2763329932993574e-05, "loss": 0.265, "step": 3457 }, { "epoch": 3.2834084975077142, "grad_norm": 0.04746592417359352, "learning_rate": 5.2710745630541666e-05, "loss": 0.2571, "step": 3458 }, { "epoch": 3.28435793971042, "grad_norm": 0.09326629340648651, "learning_rate": 5.265817816511822e-05, "loss": 0.2475, "step": 3459 }, { "epoch": 3.285307381913126, "grad_norm": 0.050191480666399, "learning_rate": 5.260562755543963e-05, "loss": 0.265, "step": 3460 }, { "epoch": 3.286256824115832, "grad_norm": 0.04949505627155304, "learning_rate": 5.255309382021618e-05, "loss": 0.2694, "step": 3461 }, { "epoch": 3.2872062663185377, "grad_norm": 0.05030905082821846, "learning_rate": 5.250057697815215e-05, "loss": 0.2615, "step": 3462 }, { "epoch": 3.2881557085212436, "grad_norm": 0.04959176853299141, "learning_rate": 5.244807704794582e-05, "loss": 0.2615, "step": 3463 }, { "epoch": 3.2891051507239495, "grad_norm": 0.04158158227801323, "learning_rate": 5.2395594048289444e-05, "loss": 0.2469, "step": 3464 }, { "epoch": 3.290054592926656, "grad_norm": 0.0466022789478302, "learning_rate": 5.234312799786921e-05, "loss": 0.2499, "step": 3465 }, { "epoch": 3.2910040351293617, "grad_norm": 0.03744713217020035, "learning_rate": 5.229067891536539e-05, "loss": 0.2512, "step": 3466 }, { "epoch": 3.2919534773320676, "grad_norm": 0.04421250522136688, "learning_rate": 5.223824681945211e-05, "loss": 0.2553, "step": 3467 }, { "epoch": 3.2929029195347734, "grad_norm": 0.049400459975004196, "learning_rate": 5.2185831728797443e-05, "loss": 0.2647, "step": 3468 }, { "epoch": 3.2938523617374793, "grad_norm": 0.10226401686668396, "learning_rate": 5.213343366206347e-05, "loss": 0.2723, "step": 3469 }, { "epoch": 3.294801803940185, "grad_norm": 0.04881599545478821, "learning_rate": 5.2081052637906104e-05, "loss": 0.2612, "step": 3470 }, { "epoch": 3.295751246142891, "grad_norm": 0.04717850685119629, "learning_rate": 5.2028688674975415e-05, "loss": 0.2496, "step": 3471 }, { "epoch": 3.296700688345597, "grad_norm": 0.048411279916763306, "learning_rate": 5.197634179191508e-05, "loss": 0.2527, "step": 3472 }, { "epoch": 3.297650130548303, "grad_norm": 0.05463425815105438, "learning_rate": 5.192401200736298e-05, "loss": 0.2492, "step": 3473 }, { "epoch": 3.2985995727510087, "grad_norm": 0.07052161544561386, "learning_rate": 5.1871699339950755e-05, "loss": 0.271, "step": 3474 }, { "epoch": 3.2995490149537146, "grad_norm": 0.05703425779938698, "learning_rate": 5.1819403808303926e-05, "loss": 0.2719, "step": 3475 }, { "epoch": 3.3004984571564204, "grad_norm": 0.07146856188774109, "learning_rate": 5.176712543104212e-05, "loss": 0.2593, "step": 3476 }, { "epoch": 3.3014478993591263, "grad_norm": 0.054076872766017914, "learning_rate": 5.171486422677855e-05, "loss": 0.2548, "step": 3477 }, { "epoch": 3.302397341561832, "grad_norm": 0.0902649313211441, "learning_rate": 5.166262021412058e-05, "loss": 0.2347, "step": 3478 }, { "epoch": 3.3033467837645385, "grad_norm": 0.047539256513118744, "learning_rate": 5.161039341166931e-05, "loss": 0.2549, "step": 3479 }, { "epoch": 3.3042962259672444, "grad_norm": 0.04861597344279289, "learning_rate": 5.1558183838019755e-05, "loss": 0.2614, "step": 3480 }, { "epoch": 3.3052456681699502, "grad_norm": 0.04589053988456726, "learning_rate": 5.15059915117608e-05, "loss": 0.2631, "step": 3481 }, { "epoch": 3.306195110372656, "grad_norm": 0.048206113278865814, "learning_rate": 5.145381645147511e-05, "loss": 0.2564, "step": 3482 }, { "epoch": 3.307144552575362, "grad_norm": 0.05081455409526825, "learning_rate": 5.14016586757394e-05, "loss": 0.246, "step": 3483 }, { "epoch": 3.308093994778068, "grad_norm": 0.08231256902217865, "learning_rate": 5.134951820312401e-05, "loss": 0.2348, "step": 3484 }, { "epoch": 3.3090434369807737, "grad_norm": 0.04811001196503639, "learning_rate": 5.129739505219324e-05, "loss": 0.2501, "step": 3485 }, { "epoch": 3.3099928791834796, "grad_norm": 0.08391708880662918, "learning_rate": 5.124528924150521e-05, "loss": 0.2364, "step": 3486 }, { "epoch": 3.3109423213861855, "grad_norm": 0.0699569433927536, "learning_rate": 5.119320078961183e-05, "loss": 0.2391, "step": 3487 }, { "epoch": 3.3118917635888914, "grad_norm": 0.0865458995103836, "learning_rate": 5.114112971505882e-05, "loss": 0.2409, "step": 3488 }, { "epoch": 3.3128412057915972, "grad_norm": 0.06671997159719467, "learning_rate": 5.108907603638582e-05, "loss": 0.2678, "step": 3489 }, { "epoch": 3.3137906479943036, "grad_norm": 0.05352495610713959, "learning_rate": 5.103703977212615e-05, "loss": 0.255, "step": 3490 }, { "epoch": 3.3147400901970094, "grad_norm": 0.053172145038843155, "learning_rate": 5.0985020940807005e-05, "loss": 0.2457, "step": 3491 }, { "epoch": 3.3156895323997153, "grad_norm": 0.046016935259103775, "learning_rate": 5.093301956094934e-05, "loss": 0.2526, "step": 3492 }, { "epoch": 3.316638974602421, "grad_norm": 0.046845342963933945, "learning_rate": 5.0881035651067855e-05, "loss": 0.2591, "step": 3493 }, { "epoch": 3.317588416805127, "grad_norm": 0.04753097519278526, "learning_rate": 5.08290692296712e-05, "loss": 0.2601, "step": 3494 }, { "epoch": 3.318537859007833, "grad_norm": 0.044487521052360535, "learning_rate": 5.077712031526153e-05, "loss": 0.2641, "step": 3495 }, { "epoch": 3.319487301210539, "grad_norm": 0.08220727741718292, "learning_rate": 5.072518892633502e-05, "loss": 0.2822, "step": 3496 }, { "epoch": 3.3204367434132447, "grad_norm": 0.04794852435588837, "learning_rate": 5.0673275081381475e-05, "loss": 0.2519, "step": 3497 }, { "epoch": 3.3213861856159506, "grad_norm": 0.0479121133685112, "learning_rate": 5.0621378798884446e-05, "loss": 0.255, "step": 3498 }, { "epoch": 3.3223356278186564, "grad_norm": 0.048131756484508514, "learning_rate": 5.056950009732135e-05, "loss": 0.2409, "step": 3499 }, { "epoch": 3.3232850700213623, "grad_norm": 0.0748886838555336, "learning_rate": 5.051763899516313e-05, "loss": 0.272, "step": 3500 }, { "epoch": 3.3232850700213623, "eval_loss": 0.259135365486145, "eval_runtime": 37.7828, "eval_samples_per_second": 2.276, "eval_steps_per_second": 2.276, "step": 3500 }, { "epoch": 3.324234512224068, "grad_norm": 0.050895802676677704, "learning_rate": 5.046579551087469e-05, "loss": 0.2582, "step": 3501 }, { "epoch": 3.325183954426774, "grad_norm": 0.04380827769637108, "learning_rate": 5.041396966291453e-05, "loss": 0.2492, "step": 3502 }, { "epoch": 3.32613339662948, "grad_norm": 0.05072317644953728, "learning_rate": 5.036216146973491e-05, "loss": 0.2591, "step": 3503 }, { "epoch": 3.3270828388321863, "grad_norm": 0.04307514801621437, "learning_rate": 5.0310370949781794e-05, "loss": 0.2523, "step": 3504 }, { "epoch": 3.328032281034892, "grad_norm": 0.060126278549432755, "learning_rate": 5.02585981214948e-05, "loss": 0.2368, "step": 3505 }, { "epoch": 3.328981723237598, "grad_norm": 0.04991989582777023, "learning_rate": 5.0206843003307406e-05, "loss": 0.2634, "step": 3506 }, { "epoch": 3.329931165440304, "grad_norm": 0.0549112968146801, "learning_rate": 5.0155105613646636e-05, "loss": 0.2487, "step": 3507 }, { "epoch": 3.3308806076430097, "grad_norm": 0.045029643923044205, "learning_rate": 5.0103385970933245e-05, "loss": 0.2444, "step": 3508 }, { "epoch": 3.3318300498457156, "grad_norm": 0.0672144964337349, "learning_rate": 5.005168409358166e-05, "loss": 0.2712, "step": 3509 }, { "epoch": 3.3327794920484215, "grad_norm": 0.049809981137514114, "learning_rate": 5.000000000000002e-05, "loss": 0.2506, "step": 3510 }, { "epoch": 3.3337289342511274, "grad_norm": 0.042252179235219955, "learning_rate": 4.9948333708590055e-05, "loss": 0.2488, "step": 3511 }, { "epoch": 3.3346783764538332, "grad_norm": 0.05093264579772949, "learning_rate": 4.989668523774732e-05, "loss": 0.2574, "step": 3512 }, { "epoch": 3.335627818656539, "grad_norm": 0.05729779228568077, "learning_rate": 4.9845054605860775e-05, "loss": 0.2311, "step": 3513 }, { "epoch": 3.3365772608592454, "grad_norm": 0.047221146523952484, "learning_rate": 4.979344183131326e-05, "loss": 0.2506, "step": 3514 }, { "epoch": 3.3375267030619513, "grad_norm": 0.04780590906739235, "learning_rate": 4.974184693248115e-05, "loss": 0.2605, "step": 3515 }, { "epoch": 3.338476145264657, "grad_norm": 0.05309277027845383, "learning_rate": 4.9690269927734414e-05, "loss": 0.2641, "step": 3516 }, { "epoch": 3.339425587467363, "grad_norm": 0.07272838056087494, "learning_rate": 4.963871083543683e-05, "loss": 0.269, "step": 3517 }, { "epoch": 3.340375029670069, "grad_norm": 0.06501755863428116, "learning_rate": 4.958716967394552e-05, "loss": 0.2788, "step": 3518 }, { "epoch": 3.341324471872775, "grad_norm": 0.04865370690822601, "learning_rate": 4.953564646161148e-05, "loss": 0.2491, "step": 3519 }, { "epoch": 3.3422739140754807, "grad_norm": 0.050759755074977875, "learning_rate": 4.94841412167792e-05, "loss": 0.2303, "step": 3520 }, { "epoch": 3.3432233562781866, "grad_norm": 0.07433301210403442, "learning_rate": 4.943265395778672e-05, "loss": 0.2493, "step": 3521 }, { "epoch": 3.3441727984808924, "grad_norm": 0.052284806966781616, "learning_rate": 4.938118470296587e-05, "loss": 0.2638, "step": 3522 }, { "epoch": 3.3451222406835983, "grad_norm": 0.0797976404428482, "learning_rate": 4.932973347064177e-05, "loss": 0.254, "step": 3523 }, { "epoch": 3.346071682886304, "grad_norm": 0.04564180225133896, "learning_rate": 4.9278300279133425e-05, "loss": 0.251, "step": 3524 }, { "epoch": 3.34702112508901, "grad_norm": 0.059488341212272644, "learning_rate": 4.922688514675324e-05, "loss": 0.2596, "step": 3525 }, { "epoch": 3.347970567291716, "grad_norm": 0.050069741904735565, "learning_rate": 4.917548809180724e-05, "loss": 0.2574, "step": 3526 }, { "epoch": 3.348920009494422, "grad_norm": 0.07369361072778702, "learning_rate": 4.912410913259501e-05, "loss": 0.2708, "step": 3527 }, { "epoch": 3.349869451697128, "grad_norm": 0.06439623236656189, "learning_rate": 4.9072748287409677e-05, "loss": 0.2558, "step": 3528 }, { "epoch": 3.350818893899834, "grad_norm": 0.047009095549583435, "learning_rate": 4.902140557453791e-05, "loss": 0.2509, "step": 3529 }, { "epoch": 3.35176833610254, "grad_norm": 0.06549584865570068, "learning_rate": 4.897008101226002e-05, "loss": 0.2816, "step": 3530 }, { "epoch": 3.3527177783052458, "grad_norm": 0.05347298085689545, "learning_rate": 4.891877461884973e-05, "loss": 0.2425, "step": 3531 }, { "epoch": 3.3536672205079516, "grad_norm": 0.08783289790153503, "learning_rate": 4.886748641257435e-05, "loss": 0.2358, "step": 3532 }, { "epoch": 3.3546166627106575, "grad_norm": 0.04892612621188164, "learning_rate": 4.881621641169472e-05, "loss": 0.2522, "step": 3533 }, { "epoch": 3.3555661049133634, "grad_norm": 0.05587669089436531, "learning_rate": 4.8764964634465136e-05, "loss": 0.2636, "step": 3534 }, { "epoch": 3.3565155471160693, "grad_norm": 0.05909194424748421, "learning_rate": 4.871373109913358e-05, "loss": 0.2501, "step": 3535 }, { "epoch": 3.357464989318775, "grad_norm": 0.07228993624448776, "learning_rate": 4.8662515823941255e-05, "loss": 0.2849, "step": 3536 }, { "epoch": 3.358414431521481, "grad_norm": 0.05400128290057182, "learning_rate": 4.861131882712314e-05, "loss": 0.2576, "step": 3537 }, { "epoch": 3.359363873724187, "grad_norm": 0.0629948079586029, "learning_rate": 4.8560140126907564e-05, "loss": 0.2369, "step": 3538 }, { "epoch": 3.360313315926893, "grad_norm": 0.045974526554346085, "learning_rate": 4.85089797415163e-05, "loss": 0.253, "step": 3539 }, { "epoch": 3.361262758129599, "grad_norm": 0.06558282673358917, "learning_rate": 4.845783768916482e-05, "loss": 0.2458, "step": 3540 }, { "epoch": 3.362212200332305, "grad_norm": 0.04700404405593872, "learning_rate": 4.840671398806174e-05, "loss": 0.2452, "step": 3541 }, { "epoch": 3.363161642535011, "grad_norm": 0.07221265882253647, "learning_rate": 4.8355608656409426e-05, "loss": 0.2776, "step": 3542 }, { "epoch": 3.3641110847377167, "grad_norm": 0.05162516236305237, "learning_rate": 4.8304521712403575e-05, "loss": 0.2571, "step": 3543 }, { "epoch": 3.3650605269404226, "grad_norm": 0.0484129823744297, "learning_rate": 4.825345317423334e-05, "loss": 0.265, "step": 3544 }, { "epoch": 3.3660099691431284, "grad_norm": 0.05916272848844528, "learning_rate": 4.820240306008136e-05, "loss": 0.2429, "step": 3545 }, { "epoch": 3.3669594113458343, "grad_norm": 0.09139509499073029, "learning_rate": 4.8151371388123644e-05, "loss": 0.2717, "step": 3546 }, { "epoch": 3.36790885354854, "grad_norm": 0.07215554267168045, "learning_rate": 4.8100358176529794e-05, "loss": 0.273, "step": 3547 }, { "epoch": 3.368858295751246, "grad_norm": 0.08182163536548615, "learning_rate": 4.804936344346258e-05, "loss": 0.2581, "step": 3548 }, { "epoch": 3.369807737953952, "grad_norm": 0.0751405879855156, "learning_rate": 4.799838720707846e-05, "loss": 0.2648, "step": 3549 }, { "epoch": 3.370757180156658, "grad_norm": 0.06326061487197876, "learning_rate": 4.794742948552715e-05, "loss": 0.2466, "step": 3550 }, { "epoch": 3.3717066223593637, "grad_norm": 0.05183592066168785, "learning_rate": 4.78964902969518e-05, "loss": 0.2387, "step": 3551 }, { "epoch": 3.3726560645620696, "grad_norm": 0.04825581982731819, "learning_rate": 4.7845569659489e-05, "loss": 0.2598, "step": 3552 }, { "epoch": 3.373605506764776, "grad_norm": 0.048268452286720276, "learning_rate": 4.779466759126868e-05, "loss": 0.2543, "step": 3553 }, { "epoch": 3.3745549489674818, "grad_norm": 0.06446631997823715, "learning_rate": 4.774378411041416e-05, "loss": 0.2519, "step": 3554 }, { "epoch": 3.3755043911701876, "grad_norm": 0.05456710606813431, "learning_rate": 4.7692919235042255e-05, "loss": 0.258, "step": 3555 }, { "epoch": 3.3764538333728935, "grad_norm": 0.04780459776520729, "learning_rate": 4.764207298326301e-05, "loss": 0.2443, "step": 3556 }, { "epoch": 3.3774032755755994, "grad_norm": 0.06650704890489578, "learning_rate": 4.7591245373179924e-05, "loss": 0.2538, "step": 3557 }, { "epoch": 3.3783527177783053, "grad_norm": 0.04714061692357063, "learning_rate": 4.754043642288981e-05, "loss": 0.2513, "step": 3558 }, { "epoch": 3.379302159981011, "grad_norm": 0.07686729729175568, "learning_rate": 4.748964615048285e-05, "loss": 0.2756, "step": 3559 }, { "epoch": 3.380251602183717, "grad_norm": 0.09275008738040924, "learning_rate": 4.743887457404268e-05, "loss": 0.2703, "step": 3560 }, { "epoch": 3.381201044386423, "grad_norm": 0.07193811982870102, "learning_rate": 4.738812171164604e-05, "loss": 0.275, "step": 3561 }, { "epoch": 3.3821504865891288, "grad_norm": 0.05852194502949715, "learning_rate": 4.733738758136327e-05, "loss": 0.2501, "step": 3562 }, { "epoch": 3.3830999287918346, "grad_norm": 0.043062902987003326, "learning_rate": 4.7286672201257873e-05, "loss": 0.2575, "step": 3563 }, { "epoch": 3.384049370994541, "grad_norm": 0.06103948876261711, "learning_rate": 4.723597558938672e-05, "loss": 0.2695, "step": 3564 }, { "epoch": 3.384998813197247, "grad_norm": 0.05683431774377823, "learning_rate": 4.7185297763800084e-05, "loss": 0.2587, "step": 3565 }, { "epoch": 3.3859482553999527, "grad_norm": 0.05255016312003136, "learning_rate": 4.713463874254135e-05, "loss": 0.2517, "step": 3566 }, { "epoch": 3.3868976976026586, "grad_norm": 0.04798266291618347, "learning_rate": 4.708399854364742e-05, "loss": 0.2637, "step": 3567 }, { "epoch": 3.3878471398053644, "grad_norm": 0.09509357064962387, "learning_rate": 4.7033377185148385e-05, "loss": 0.2498, "step": 3568 }, { "epoch": 3.3887965820080703, "grad_norm": 0.05289481580257416, "learning_rate": 4.698277468506763e-05, "loss": 0.2664, "step": 3569 }, { "epoch": 3.389746024210776, "grad_norm": 0.049032047390937805, "learning_rate": 4.693219106142186e-05, "loss": 0.2487, "step": 3570 }, { "epoch": 3.390695466413482, "grad_norm": 0.06823623180389404, "learning_rate": 4.6881626332221e-05, "loss": 0.2503, "step": 3571 }, { "epoch": 3.391644908616188, "grad_norm": 0.0999704971909523, "learning_rate": 4.683108051546836e-05, "loss": 0.2484, "step": 3572 }, { "epoch": 3.392594350818894, "grad_norm": 0.0468447245657444, "learning_rate": 4.678055362916041e-05, "loss": 0.256, "step": 3573 }, { "epoch": 3.3935437930215997, "grad_norm": 0.056768015027046204, "learning_rate": 4.673004569128692e-05, "loss": 0.2709, "step": 3574 }, { "epoch": 3.3944932352243056, "grad_norm": 0.05072092264890671, "learning_rate": 4.66795567198309e-05, "loss": 0.258, "step": 3575 }, { "epoch": 3.3954426774270114, "grad_norm": 0.04872744157910347, "learning_rate": 4.662908673276862e-05, "loss": 0.2504, "step": 3576 }, { "epoch": 3.3963921196297173, "grad_norm": 0.07183968275785446, "learning_rate": 4.6578635748069566e-05, "loss": 0.2338, "step": 3577 }, { "epoch": 3.3973415618324236, "grad_norm": 0.051606882363557816, "learning_rate": 4.6528203783696534e-05, "loss": 0.2472, "step": 3578 }, { "epoch": 3.3982910040351295, "grad_norm": 0.06574734300374985, "learning_rate": 4.647779085760546e-05, "loss": 0.2364, "step": 3579 }, { "epoch": 3.3992404462378354, "grad_norm": 0.04813767224550247, "learning_rate": 4.6427396987745555e-05, "loss": 0.252, "step": 3580 }, { "epoch": 3.4001898884405413, "grad_norm": 0.045176900923252106, "learning_rate": 4.637702219205919e-05, "loss": 0.2516, "step": 3581 }, { "epoch": 3.401139330643247, "grad_norm": 0.05002497881650925, "learning_rate": 4.6326666488481975e-05, "loss": 0.2492, "step": 3582 }, { "epoch": 3.402088772845953, "grad_norm": 0.0624178983271122, "learning_rate": 4.627632989494283e-05, "loss": 0.2495, "step": 3583 }, { "epoch": 3.403038215048659, "grad_norm": 0.05267757549881935, "learning_rate": 4.622601242936361e-05, "loss": 0.2616, "step": 3584 }, { "epoch": 3.4039876572513648, "grad_norm": 0.052334707230329514, "learning_rate": 4.617571410965964e-05, "loss": 0.2609, "step": 3585 }, { "epoch": 3.4049370994540706, "grad_norm": 0.061013367027044296, "learning_rate": 4.6125434953739275e-05, "loss": 0.266, "step": 3586 }, { "epoch": 3.4058865416567765, "grad_norm": 0.05254976451396942, "learning_rate": 4.607517497950402e-05, "loss": 0.2616, "step": 3587 }, { "epoch": 3.4068359838594824, "grad_norm": 0.053374432027339935, "learning_rate": 4.6024934204848745e-05, "loss": 0.2558, "step": 3588 }, { "epoch": 3.4077854260621887, "grad_norm": 0.05137740448117256, "learning_rate": 4.5974712647661176e-05, "loss": 0.2589, "step": 3589 }, { "epoch": 3.4087348682648946, "grad_norm": 0.04847841337323189, "learning_rate": 4.5924510325822503e-05, "loss": 0.2563, "step": 3590 }, { "epoch": 3.4096843104676005, "grad_norm": 0.04520807042717934, "learning_rate": 4.587432725720687e-05, "loss": 0.2516, "step": 3591 }, { "epoch": 3.4106337526703063, "grad_norm": 0.045983344316482544, "learning_rate": 4.5824163459681656e-05, "loss": 0.2417, "step": 3592 }, { "epoch": 3.411583194873012, "grad_norm": 0.04577578604221344, "learning_rate": 4.577401895110733e-05, "loss": 0.2401, "step": 3593 }, { "epoch": 3.412532637075718, "grad_norm": 0.0507587231695652, "learning_rate": 4.57238937493375e-05, "loss": 0.2547, "step": 3594 }, { "epoch": 3.413482079278424, "grad_norm": 0.05529985949397087, "learning_rate": 4.5673787872218965e-05, "loss": 0.2687, "step": 3595 }, { "epoch": 3.41443152148113, "grad_norm": 0.05202402547001839, "learning_rate": 4.5623701337591565e-05, "loss": 0.2576, "step": 3596 }, { "epoch": 3.4153809636838357, "grad_norm": 0.04816010966897011, "learning_rate": 4.5573634163288294e-05, "loss": 0.2457, "step": 3597 }, { "epoch": 3.4163304058865416, "grad_norm": 0.06441019475460052, "learning_rate": 4.552358636713523e-05, "loss": 0.2617, "step": 3598 }, { "epoch": 3.4172798480892475, "grad_norm": 0.061169568449258804, "learning_rate": 4.547355796695156e-05, "loss": 0.2748, "step": 3599 }, { "epoch": 3.4182292902919533, "grad_norm": 0.04750002920627594, "learning_rate": 4.542354898054953e-05, "loss": 0.2625, "step": 3600 }, { "epoch": 3.419178732494659, "grad_norm": 0.05058363825082779, "learning_rate": 4.537355942573463e-05, "loss": 0.2629, "step": 3601 }, { "epoch": 3.420128174697365, "grad_norm": 0.052922967821359634, "learning_rate": 4.532358932030517e-05, "loss": 0.2601, "step": 3602 }, { "epoch": 3.4210776169000714, "grad_norm": 0.048897773027420044, "learning_rate": 4.527363868205278e-05, "loss": 0.2518, "step": 3603 }, { "epoch": 3.4220270591027773, "grad_norm": 0.07733273506164551, "learning_rate": 4.5223707528762e-05, "loss": 0.2596, "step": 3604 }, { "epoch": 3.422976501305483, "grad_norm": 0.053071219474077225, "learning_rate": 4.517379587821049e-05, "loss": 0.2563, "step": 3605 }, { "epoch": 3.423925943508189, "grad_norm": 0.06092002987861633, "learning_rate": 4.512390374816905e-05, "loss": 0.2588, "step": 3606 }, { "epoch": 3.424875385710895, "grad_norm": 0.04813700541853905, "learning_rate": 4.507403115640131e-05, "loss": 0.252, "step": 3607 }, { "epoch": 3.4258248279136008, "grad_norm": 0.0530259907245636, "learning_rate": 4.502417812066418e-05, "loss": 0.2591, "step": 3608 }, { "epoch": 3.4267742701163066, "grad_norm": 0.05496685951948166, "learning_rate": 4.497434465870749e-05, "loss": 0.2537, "step": 3609 }, { "epoch": 3.4277237123190125, "grad_norm": 0.09745223820209503, "learning_rate": 4.492453078827409e-05, "loss": 0.261, "step": 3610 }, { "epoch": 3.4286731545217184, "grad_norm": 0.12049929052591324, "learning_rate": 4.487473652709989e-05, "loss": 0.2217, "step": 3611 }, { "epoch": 3.4296225967244243, "grad_norm": 0.06632792949676514, "learning_rate": 4.4824961892913786e-05, "loss": 0.2442, "step": 3612 }, { "epoch": 3.43057203892713, "grad_norm": 0.07434866577386856, "learning_rate": 4.477520690343776e-05, "loss": 0.2357, "step": 3613 }, { "epoch": 3.4315214811298365, "grad_norm": 0.09781359136104584, "learning_rate": 4.4725471576386735e-05, "loss": 0.2687, "step": 3614 }, { "epoch": 3.4324709233325423, "grad_norm": 0.049130357801914215, "learning_rate": 4.467575592946864e-05, "loss": 0.2536, "step": 3615 }, { "epoch": 3.433420365535248, "grad_norm": 0.061379898339509964, "learning_rate": 4.4626059980384404e-05, "loss": 0.2606, "step": 3616 }, { "epoch": 3.434369807737954, "grad_norm": 0.04795004054903984, "learning_rate": 4.457638374682794e-05, "loss": 0.2451, "step": 3617 }, { "epoch": 3.43531924994066, "grad_norm": 0.05227689817547798, "learning_rate": 4.452672724648611e-05, "loss": 0.248, "step": 3618 }, { "epoch": 3.436268692143366, "grad_norm": 0.0511259026825428, "learning_rate": 4.447709049703885e-05, "loss": 0.2566, "step": 3619 }, { "epoch": 3.4372181343460717, "grad_norm": 0.05518270656466484, "learning_rate": 4.442747351615899e-05, "loss": 0.2534, "step": 3620 }, { "epoch": 3.4381675765487776, "grad_norm": 0.06017496809363365, "learning_rate": 4.43778763215123e-05, "loss": 0.2491, "step": 3621 }, { "epoch": 3.4391170187514835, "grad_norm": 0.044689204543828964, "learning_rate": 4.432829893075755e-05, "loss": 0.2521, "step": 3622 }, { "epoch": 3.4400664609541893, "grad_norm": 0.1103987917304039, "learning_rate": 4.4278741361546404e-05, "loss": 0.2351, "step": 3623 }, { "epoch": 3.441015903156895, "grad_norm": 0.07348011434078217, "learning_rate": 4.4229203631523616e-05, "loss": 0.2433, "step": 3624 }, { "epoch": 3.441965345359601, "grad_norm": 0.058661412447690964, "learning_rate": 4.417968575832664e-05, "loss": 0.2381, "step": 3625 }, { "epoch": 3.442914787562307, "grad_norm": 0.05927921459078789, "learning_rate": 4.413018775958607e-05, "loss": 0.2522, "step": 3626 }, { "epoch": 3.443864229765013, "grad_norm": 0.05343930050730705, "learning_rate": 4.4080709652925336e-05, "loss": 0.255, "step": 3627 }, { "epoch": 3.444813671967719, "grad_norm": 0.05600766837596893, "learning_rate": 4.4031251455960735e-05, "loss": 0.2643, "step": 3628 }, { "epoch": 3.445763114170425, "grad_norm": 0.08665018528699875, "learning_rate": 4.3981813186301646e-05, "loss": 0.2348, "step": 3629 }, { "epoch": 3.446712556373131, "grad_norm": 0.05690651014447212, "learning_rate": 4.3932394861550106e-05, "loss": 0.2613, "step": 3630 }, { "epoch": 3.4476619985758368, "grad_norm": 0.050910115242004395, "learning_rate": 4.38829964993013e-05, "loss": 0.2489, "step": 3631 }, { "epoch": 3.4486114407785426, "grad_norm": 0.0760849341750145, "learning_rate": 4.383361811714313e-05, "loss": 0.2771, "step": 3632 }, { "epoch": 3.4495608829812485, "grad_norm": 0.07664194703102112, "learning_rate": 4.3784259732656464e-05, "loss": 0.2696, "step": 3633 }, { "epoch": 3.4505103251839544, "grad_norm": 0.05651098117232323, "learning_rate": 4.373492136341502e-05, "loss": 0.2629, "step": 3634 }, { "epoch": 3.4514597673866603, "grad_norm": 0.058015789836645126, "learning_rate": 4.3685603026985354e-05, "loss": 0.2509, "step": 3635 }, { "epoch": 3.452409209589366, "grad_norm": 0.04693415388464928, "learning_rate": 4.3636304740927046e-05, "loss": 0.2518, "step": 3636 }, { "epoch": 3.453358651792072, "grad_norm": 0.07002034038305283, "learning_rate": 4.358702652279235e-05, "loss": 0.2443, "step": 3637 }, { "epoch": 3.454308093994778, "grad_norm": 0.051126398146152496, "learning_rate": 4.3537768390126476e-05, "loss": 0.2479, "step": 3638 }, { "epoch": 3.455257536197484, "grad_norm": 0.077112577855587, "learning_rate": 4.348853036046746e-05, "loss": 0.2893, "step": 3639 }, { "epoch": 3.45620697840019, "grad_norm": 0.07877352833747864, "learning_rate": 4.343931245134616e-05, "loss": 0.2471, "step": 3640 }, { "epoch": 3.457156420602896, "grad_norm": 0.05718987062573433, "learning_rate": 4.3390114680286266e-05, "loss": 0.2607, "step": 3641 }, { "epoch": 3.458105862805602, "grad_norm": 0.0493890680372715, "learning_rate": 4.334093706480443e-05, "loss": 0.2641, "step": 3642 }, { "epoch": 3.4590553050083077, "grad_norm": 0.08550732582807541, "learning_rate": 4.329177962240988e-05, "loss": 0.269, "step": 3643 }, { "epoch": 3.4600047472110136, "grad_norm": 0.04813637584447861, "learning_rate": 4.3242642370604893e-05, "loss": 0.2592, "step": 3644 }, { "epoch": 3.4609541894137195, "grad_norm": 0.04899512976408005, "learning_rate": 4.3193525326884435e-05, "loss": 0.246, "step": 3645 }, { "epoch": 3.4619036316164253, "grad_norm": 0.049979884177446365, "learning_rate": 4.314442850873628e-05, "loss": 0.2422, "step": 3646 }, { "epoch": 3.462853073819131, "grad_norm": 0.05317814648151398, "learning_rate": 4.3095351933641124e-05, "loss": 0.2615, "step": 3647 }, { "epoch": 3.463802516021837, "grad_norm": 0.10087147355079651, "learning_rate": 4.304629561907222e-05, "loss": 0.26, "step": 3648 }, { "epoch": 3.464751958224543, "grad_norm": 0.08153299987316132, "learning_rate": 4.299725958249586e-05, "loss": 0.289, "step": 3649 }, { "epoch": 3.465701400427249, "grad_norm": 0.047881294041872025, "learning_rate": 4.294824384137096e-05, "loss": 0.257, "step": 3650 }, { "epoch": 3.4666508426299547, "grad_norm": 0.05004667118191719, "learning_rate": 4.289924841314922e-05, "loss": 0.2535, "step": 3651 }, { "epoch": 3.4676002848326606, "grad_norm": 0.04900941625237465, "learning_rate": 4.285027331527525e-05, "loss": 0.2601, "step": 3652 }, { "epoch": 3.468549727035367, "grad_norm": 0.045236848294734955, "learning_rate": 4.2801318565186165e-05, "loss": 0.2551, "step": 3653 }, { "epoch": 3.4694991692380728, "grad_norm": 0.048930395394563675, "learning_rate": 4.275238418031209e-05, "loss": 0.2458, "step": 3654 }, { "epoch": 3.4704486114407787, "grad_norm": 0.04744086042046547, "learning_rate": 4.270347017807574e-05, "loss": 0.2446, "step": 3655 }, { "epoch": 3.4713980536434845, "grad_norm": 0.060483697801828384, "learning_rate": 4.265457657589267e-05, "loss": 0.254, "step": 3656 }, { "epoch": 3.4723474958461904, "grad_norm": 0.06201721355319023, "learning_rate": 4.260570339117107e-05, "loss": 0.2527, "step": 3657 }, { "epoch": 3.4732969380488963, "grad_norm": 0.0473172627389431, "learning_rate": 4.2556850641311964e-05, "loss": 0.2472, "step": 3658 }, { "epoch": 3.474246380251602, "grad_norm": 0.058292679488658905, "learning_rate": 4.250801834370899e-05, "loss": 0.2579, "step": 3659 }, { "epoch": 3.475195822454308, "grad_norm": 0.04996812716126442, "learning_rate": 4.245920651574864e-05, "loss": 0.2364, "step": 3660 }, { "epoch": 3.476145264657014, "grad_norm": 0.0430903434753418, "learning_rate": 4.241041517481001e-05, "loss": 0.2537, "step": 3661 }, { "epoch": 3.4770947068597198, "grad_norm": 0.044585153460502625, "learning_rate": 4.236164433826495e-05, "loss": 0.2513, "step": 3662 }, { "epoch": 3.4780441490624256, "grad_norm": 0.061624232679605484, "learning_rate": 4.231289402347798e-05, "loss": 0.2685, "step": 3663 }, { "epoch": 3.478993591265132, "grad_norm": 0.050394099205732346, "learning_rate": 4.22641642478063e-05, "loss": 0.2681, "step": 3664 }, { "epoch": 3.479943033467838, "grad_norm": 0.044247131794691086, "learning_rate": 4.221545502859994e-05, "loss": 0.2509, "step": 3665 }, { "epoch": 3.4808924756705437, "grad_norm": 0.06681392341852188, "learning_rate": 4.216676638320135e-05, "loss": 0.2365, "step": 3666 }, { "epoch": 3.4818419178732496, "grad_norm": 0.04180409014225006, "learning_rate": 4.2118098328945896e-05, "loss": 0.2547, "step": 3667 }, { "epoch": 3.4827913600759555, "grad_norm": 0.051863424479961395, "learning_rate": 4.206945088316151e-05, "loss": 0.2546, "step": 3668 }, { "epoch": 3.4837408022786613, "grad_norm": 0.07532540708780289, "learning_rate": 4.202082406316877e-05, "loss": 0.2369, "step": 3669 }, { "epoch": 3.484690244481367, "grad_norm": 0.04621781036257744, "learning_rate": 4.197221788628096e-05, "loss": 0.2426, "step": 3670 }, { "epoch": 3.485639686684073, "grad_norm": 0.07874076068401337, "learning_rate": 4.1923632369803946e-05, "loss": 0.2673, "step": 3671 }, { "epoch": 3.486589128886779, "grad_norm": 0.044167328625917435, "learning_rate": 4.1875067531036374e-05, "loss": 0.2445, "step": 3672 }, { "epoch": 3.487538571089485, "grad_norm": 0.05342372506856918, "learning_rate": 4.18265233872693e-05, "loss": 0.2511, "step": 3673 }, { "epoch": 3.4884880132921907, "grad_norm": 0.050559043884277344, "learning_rate": 4.1777999955786675e-05, "loss": 0.2562, "step": 3674 }, { "epoch": 3.4894374554948966, "grad_norm": 0.05720347538590431, "learning_rate": 4.172949725386488e-05, "loss": 0.2429, "step": 3675 }, { "epoch": 3.4903868976976025, "grad_norm": 0.054926902055740356, "learning_rate": 4.168101529877297e-05, "loss": 0.2567, "step": 3676 }, { "epoch": 3.4913363399003083, "grad_norm": 0.05101928859949112, "learning_rate": 4.163255410777274e-05, "loss": 0.2448, "step": 3677 }, { "epoch": 3.4922857821030147, "grad_norm": 0.05976559594273567, "learning_rate": 4.158411369811831e-05, "loss": 0.2592, "step": 3678 }, { "epoch": 3.4932352243057205, "grad_norm": 0.050272174179553986, "learning_rate": 4.15356940870567e-05, "loss": 0.2539, "step": 3679 }, { "epoch": 3.4941846665084264, "grad_norm": 0.05651102960109711, "learning_rate": 4.148729529182736e-05, "loss": 0.2626, "step": 3680 }, { "epoch": 3.4951341087111323, "grad_norm": 0.04792777821421623, "learning_rate": 4.143891732966233e-05, "loss": 0.2435, "step": 3681 }, { "epoch": 3.496083550913838, "grad_norm": 0.05119699984788895, "learning_rate": 4.13905602177863e-05, "loss": 0.2581, "step": 3682 }, { "epoch": 3.497032993116544, "grad_norm": 0.05814248323440552, "learning_rate": 4.134222397341649e-05, "loss": 0.2246, "step": 3683 }, { "epoch": 3.49798243531925, "grad_norm": 0.04919525235891342, "learning_rate": 4.129390861376268e-05, "loss": 0.2573, "step": 3684 }, { "epoch": 3.498931877521956, "grad_norm": 0.049026183784008026, "learning_rate": 4.124561415602729e-05, "loss": 0.2469, "step": 3685 }, { "epoch": 3.4998813197246617, "grad_norm": 0.05068015307188034, "learning_rate": 4.119734061740521e-05, "loss": 0.2501, "step": 3686 }, { "epoch": 3.5008307619273675, "grad_norm": 0.059492919594049454, "learning_rate": 4.1149088015083925e-05, "loss": 0.24, "step": 3687 }, { "epoch": 3.501780204130074, "grad_norm": 0.05316033214330673, "learning_rate": 4.110085636624346e-05, "loss": 0.2647, "step": 3688 }, { "epoch": 3.5027296463327797, "grad_norm": 0.05338413268327713, "learning_rate": 4.105264568805633e-05, "loss": 0.2561, "step": 3689 }, { "epoch": 3.5036790885354856, "grad_norm": 0.05011837184429169, "learning_rate": 4.100445599768774e-05, "loss": 0.2461, "step": 3690 }, { "epoch": 3.5046285307381915, "grad_norm": 0.05094316601753235, "learning_rate": 4.0956287312295183e-05, "loss": 0.2623, "step": 3691 }, { "epoch": 3.5055779729408973, "grad_norm": 0.05120784044265747, "learning_rate": 4.090813964902889e-05, "loss": 0.2494, "step": 3692 }, { "epoch": 3.506527415143603, "grad_norm": 0.07456446439027786, "learning_rate": 4.08600130250315e-05, "loss": 0.2714, "step": 3693 }, { "epoch": 3.507476857346309, "grad_norm": 0.05052252858877182, "learning_rate": 4.081190745743814e-05, "loss": 0.2547, "step": 3694 }, { "epoch": 3.508426299549015, "grad_norm": 0.09999831020832062, "learning_rate": 4.0763822963376586e-05, "loss": 0.269, "step": 3695 }, { "epoch": 3.509375741751721, "grad_norm": 0.0458545945584774, "learning_rate": 4.071575955996687e-05, "loss": 0.2481, "step": 3696 }, { "epoch": 3.5103251839544267, "grad_norm": 0.07028304040431976, "learning_rate": 4.066771726432176e-05, "loss": 0.2618, "step": 3697 }, { "epoch": 3.5112746261571326, "grad_norm": 0.05372486636042595, "learning_rate": 4.061969609354634e-05, "loss": 0.2526, "step": 3698 }, { "epoch": 3.5122240683598385, "grad_norm": 0.06410619616508484, "learning_rate": 4.057169606473827e-05, "loss": 0.2442, "step": 3699 }, { "epoch": 3.5131735105625443, "grad_norm": 0.04757387191057205, "learning_rate": 4.0523717194987634e-05, "loss": 0.2496, "step": 3700 }, { "epoch": 3.51412295276525, "grad_norm": 0.05048002675175667, "learning_rate": 4.047575950137693e-05, "loss": 0.2443, "step": 3701 }, { "epoch": 3.515072394967956, "grad_norm": 0.055802926421165466, "learning_rate": 4.0427823000981293e-05, "loss": 0.2524, "step": 3702 }, { "epoch": 3.516021837170662, "grad_norm": 0.07027749717235565, "learning_rate": 4.037990771086813e-05, "loss": 0.2691, "step": 3703 }, { "epoch": 3.5169712793733683, "grad_norm": 0.07510142773389816, "learning_rate": 4.0332013648097375e-05, "loss": 0.2485, "step": 3704 }, { "epoch": 3.517920721576074, "grad_norm": 0.05176989361643791, "learning_rate": 4.028414082972141e-05, "loss": 0.2573, "step": 3705 }, { "epoch": 3.51887016377878, "grad_norm": 0.05572926253080368, "learning_rate": 4.023628927278501e-05, "loss": 0.2565, "step": 3706 }, { "epoch": 3.519819605981486, "grad_norm": 0.13677886128425598, "learning_rate": 4.018845899432539e-05, "loss": 0.2491, "step": 3707 }, { "epoch": 3.520769048184192, "grad_norm": 0.06043456867337227, "learning_rate": 4.0140650011372295e-05, "loss": 0.2591, "step": 3708 }, { "epoch": 3.5217184903868977, "grad_norm": 0.08667951822280884, "learning_rate": 4.009286234094772e-05, "loss": 0.2825, "step": 3709 }, { "epoch": 3.5226679325896035, "grad_norm": 0.060174696147441864, "learning_rate": 4.004509600006619e-05, "loss": 0.2709, "step": 3710 }, { "epoch": 3.5236173747923094, "grad_norm": 0.05058741942048073, "learning_rate": 3.999735100573457e-05, "loss": 0.2507, "step": 3711 }, { "epoch": 3.5245668169950153, "grad_norm": 0.05240025743842125, "learning_rate": 3.9949627374952146e-05, "loss": 0.2576, "step": 3712 }, { "epoch": 3.5255162591977216, "grad_norm": 0.06356460601091385, "learning_rate": 3.990192512471068e-05, "loss": 0.2688, "step": 3713 }, { "epoch": 3.5264657014004275, "grad_norm": 0.05215013772249222, "learning_rate": 3.985424427199413e-05, "loss": 0.2395, "step": 3714 }, { "epoch": 3.5274151436031334, "grad_norm": 0.05398750677704811, "learning_rate": 3.9806584833779025e-05, "loss": 0.2528, "step": 3715 }, { "epoch": 3.5283645858058392, "grad_norm": 0.09040288627147675, "learning_rate": 3.975894682703418e-05, "loss": 0.2327, "step": 3716 }, { "epoch": 3.529314028008545, "grad_norm": 0.062312051653862, "learning_rate": 3.971133026872077e-05, "loss": 0.2532, "step": 3717 }, { "epoch": 3.530263470211251, "grad_norm": 0.07472645491361618, "learning_rate": 3.966373517579244e-05, "loss": 0.2651, "step": 3718 }, { "epoch": 3.531212912413957, "grad_norm": 0.05372164770960808, "learning_rate": 3.961616156519499e-05, "loss": 0.2546, "step": 3719 }, { "epoch": 3.5321623546166627, "grad_norm": 0.10469577461481094, "learning_rate": 3.9568609453866766e-05, "loss": 0.2797, "step": 3720 }, { "epoch": 3.5331117968193686, "grad_norm": 0.05509538576006889, "learning_rate": 3.952107885873839e-05, "loss": 0.2541, "step": 3721 }, { "epoch": 3.5340612390220745, "grad_norm": 0.10789843648672104, "learning_rate": 3.947356979673279e-05, "loss": 0.2432, "step": 3722 }, { "epoch": 3.5350106812247803, "grad_norm": 0.0722312331199646, "learning_rate": 3.942608228476526e-05, "loss": 0.2764, "step": 3723 }, { "epoch": 3.5359601234274862, "grad_norm": 0.05012940987944603, "learning_rate": 3.9378616339743404e-05, "loss": 0.2418, "step": 3724 }, { "epoch": 3.536909565630192, "grad_norm": 0.10994049906730652, "learning_rate": 3.9331171978567204e-05, "loss": 0.2588, "step": 3725 }, { "epoch": 3.537859007832898, "grad_norm": 0.05162129923701286, "learning_rate": 3.9283749218128885e-05, "loss": 0.2492, "step": 3726 }, { "epoch": 3.538808450035604, "grad_norm": 0.05795177444815636, "learning_rate": 3.923634807531301e-05, "loss": 0.2527, "step": 3727 }, { "epoch": 3.5397578922383097, "grad_norm": 0.0650908499956131, "learning_rate": 3.9188968566996455e-05, "loss": 0.2529, "step": 3728 }, { "epoch": 3.540707334441016, "grad_norm": 0.051869384944438934, "learning_rate": 3.914161071004836e-05, "loss": 0.2549, "step": 3729 }, { "epoch": 3.541656776643722, "grad_norm": 0.07333017140626907, "learning_rate": 3.909427452133016e-05, "loss": 0.2546, "step": 3730 }, { "epoch": 3.542606218846428, "grad_norm": 0.053457971662282944, "learning_rate": 3.904696001769571e-05, "loss": 0.2526, "step": 3731 }, { "epoch": 3.5435556610491337, "grad_norm": 0.0637202113866806, "learning_rate": 3.899966721599086e-05, "loss": 0.2537, "step": 3732 }, { "epoch": 3.5445051032518395, "grad_norm": 0.05182478576898575, "learning_rate": 3.8952396133054035e-05, "loss": 0.2649, "step": 3733 }, { "epoch": 3.5454545454545454, "grad_norm": 0.05615059286355972, "learning_rate": 3.890514678571575e-05, "loss": 0.2608, "step": 3734 }, { "epoch": 3.5464039876572513, "grad_norm": 0.05612653121352196, "learning_rate": 3.885791919079878e-05, "loss": 0.2641, "step": 3735 }, { "epoch": 3.547353429859957, "grad_norm": 0.05406120419502258, "learning_rate": 3.88107133651183e-05, "loss": 0.2455, "step": 3736 }, { "epoch": 3.548302872062663, "grad_norm": 0.08043722808361053, "learning_rate": 3.876352932548152e-05, "loss": 0.2448, "step": 3737 }, { "epoch": 3.5492523142653694, "grad_norm": 0.0515315979719162, "learning_rate": 3.871636708868809e-05, "loss": 0.2487, "step": 3738 }, { "epoch": 3.5502017564680752, "grad_norm": 0.058112140744924545, "learning_rate": 3.866922667152979e-05, "loss": 0.2543, "step": 3739 }, { "epoch": 3.551151198670781, "grad_norm": 0.05072011426091194, "learning_rate": 3.862210809079061e-05, "loss": 0.2518, "step": 3740 }, { "epoch": 3.552100640873487, "grad_norm": 0.07569391280412674, "learning_rate": 3.857501136324694e-05, "loss": 0.2729, "step": 3741 }, { "epoch": 3.553050083076193, "grad_norm": 0.047762371599674225, "learning_rate": 3.8527936505667095e-05, "loss": 0.2598, "step": 3742 }, { "epoch": 3.5539995252788987, "grad_norm": 0.050283271819353104, "learning_rate": 3.8480883534811886e-05, "loss": 0.2604, "step": 3743 }, { "epoch": 3.5549489674816046, "grad_norm": 0.0424044132232666, "learning_rate": 3.843385246743417e-05, "loss": 0.2461, "step": 3744 }, { "epoch": 3.5558984096843105, "grad_norm": 0.04651379585266113, "learning_rate": 3.8386843320279076e-05, "loss": 0.2483, "step": 3745 }, { "epoch": 3.5568478518870164, "grad_norm": 0.05307883024215698, "learning_rate": 3.833985611008387e-05, "loss": 0.2548, "step": 3746 }, { "epoch": 3.5577972940897222, "grad_norm": 0.05513716861605644, "learning_rate": 3.829289085357806e-05, "loss": 0.263, "step": 3747 }, { "epoch": 3.558746736292428, "grad_norm": 0.05174417421221733, "learning_rate": 3.824594756748326e-05, "loss": 0.2432, "step": 3748 }, { "epoch": 3.559696178495134, "grad_norm": 0.0701284259557724, "learning_rate": 3.8199026268513424e-05, "loss": 0.2642, "step": 3749 }, { "epoch": 3.56064562069784, "grad_norm": 0.07058609277009964, "learning_rate": 3.815212697337451e-05, "loss": 0.2311, "step": 3750 }, { "epoch": 3.5615950629005457, "grad_norm": 0.051359616219997406, "learning_rate": 3.810524969876471e-05, "loss": 0.2645, "step": 3751 }, { "epoch": 3.5625445051032516, "grad_norm": 0.05393604561686516, "learning_rate": 3.805839446137438e-05, "loss": 0.2533, "step": 3752 }, { "epoch": 3.5634939473059575, "grad_norm": 0.052022725343704224, "learning_rate": 3.8011561277885964e-05, "loss": 0.2539, "step": 3753 }, { "epoch": 3.564443389508664, "grad_norm": 0.06624822318553925, "learning_rate": 3.796475016497424e-05, "loss": 0.2495, "step": 3754 }, { "epoch": 3.5653928317113697, "grad_norm": 0.05817562714219093, "learning_rate": 3.7917961139305836e-05, "loss": 0.2562, "step": 3755 }, { "epoch": 3.5663422739140755, "grad_norm": 0.12431687861680984, "learning_rate": 3.787119421753979e-05, "loss": 0.2377, "step": 3756 }, { "epoch": 3.5672917161167814, "grad_norm": 0.04925607889890671, "learning_rate": 3.7824449416327126e-05, "loss": 0.2433, "step": 3757 }, { "epoch": 3.5682411583194873, "grad_norm": 0.04910467565059662, "learning_rate": 3.777772675231098e-05, "loss": 0.2582, "step": 3758 }, { "epoch": 3.569190600522193, "grad_norm": 0.04863162711262703, "learning_rate": 3.7731026242126766e-05, "loss": 0.2571, "step": 3759 }, { "epoch": 3.570140042724899, "grad_norm": 0.04957108944654465, "learning_rate": 3.768434790240175e-05, "loss": 0.2559, "step": 3760 }, { "epoch": 3.571089484927605, "grad_norm": 0.051660917699337006, "learning_rate": 3.7637691749755546e-05, "loss": 0.2636, "step": 3761 }, { "epoch": 3.572038927130311, "grad_norm": 0.05814244598150253, "learning_rate": 3.759105780079974e-05, "loss": 0.2622, "step": 3762 }, { "epoch": 3.572988369333017, "grad_norm": 0.07492271810770035, "learning_rate": 3.7544446072138054e-05, "loss": 0.2386, "step": 3763 }, { "epoch": 3.573937811535723, "grad_norm": 0.04846430569887161, "learning_rate": 3.749785658036627e-05, "loss": 0.2626, "step": 3764 }, { "epoch": 3.574887253738429, "grad_norm": 0.055590301752090454, "learning_rate": 3.745128934207225e-05, "loss": 0.253, "step": 3765 }, { "epoch": 3.5758366959411347, "grad_norm": 0.052691273391246796, "learning_rate": 3.740474437383602e-05, "loss": 0.2511, "step": 3766 }, { "epoch": 3.5767861381438406, "grad_norm": 0.07563423365354538, "learning_rate": 3.735822169222957e-05, "loss": 0.2738, "step": 3767 }, { "epoch": 3.5777355803465465, "grad_norm": 0.053566690534353256, "learning_rate": 3.7311721313816994e-05, "loss": 0.2641, "step": 3768 }, { "epoch": 3.5786850225492524, "grad_norm": 0.0528990775346756, "learning_rate": 3.726524325515446e-05, "loss": 0.2561, "step": 3769 }, { "epoch": 3.5796344647519582, "grad_norm": 0.05242493748664856, "learning_rate": 3.721878753279017e-05, "loss": 0.2663, "step": 3770 }, { "epoch": 3.580583906954664, "grad_norm": 0.05699344351887703, "learning_rate": 3.7172354163264324e-05, "loss": 0.2723, "step": 3771 }, { "epoch": 3.58153334915737, "grad_norm": 0.05177152156829834, "learning_rate": 3.7125943163109354e-05, "loss": 0.2464, "step": 3772 }, { "epoch": 3.582482791360076, "grad_norm": 0.05266297236084938, "learning_rate": 3.707955454884943e-05, "loss": 0.2468, "step": 3773 }, { "epoch": 3.5834322335627817, "grad_norm": 0.04910292476415634, "learning_rate": 3.703318833700103e-05, "loss": 0.254, "step": 3774 }, { "epoch": 3.5843816757654876, "grad_norm": 0.07194791734218597, "learning_rate": 3.6986844544072494e-05, "loss": 0.2385, "step": 3775 }, { "epoch": 3.5853311179681935, "grad_norm": 0.04586068168282509, "learning_rate": 3.694052318656421e-05, "loss": 0.2478, "step": 3776 }, { "epoch": 3.5862805601708994, "grad_norm": 0.09382741153240204, "learning_rate": 3.689422428096868e-05, "loss": 0.2705, "step": 3777 }, { "epoch": 3.5872300023736052, "grad_norm": 0.06713338196277618, "learning_rate": 3.684794784377018e-05, "loss": 0.2813, "step": 3778 }, { "epoch": 3.5881794445763115, "grad_norm": 0.12264712899923325, "learning_rate": 3.68016938914453e-05, "loss": 0.2384, "step": 3779 }, { "epoch": 3.5891288867790174, "grad_norm": 0.05466051772236824, "learning_rate": 3.675546244046228e-05, "loss": 0.2558, "step": 3780 }, { "epoch": 3.5900783289817233, "grad_norm": 0.05842528119683266, "learning_rate": 3.6709253507281624e-05, "loss": 0.2696, "step": 3781 }, { "epoch": 3.591027771184429, "grad_norm": 0.05473971739411354, "learning_rate": 3.6663067108355776e-05, "loss": 0.2518, "step": 3782 }, { "epoch": 3.591977213387135, "grad_norm": 0.05151469632983208, "learning_rate": 3.661690326012897e-05, "loss": 0.2444, "step": 3783 }, { "epoch": 3.592926655589841, "grad_norm": 0.048597030341625214, "learning_rate": 3.657076197903766e-05, "loss": 0.2478, "step": 3784 }, { "epoch": 3.593876097792547, "grad_norm": 0.0704786479473114, "learning_rate": 3.652464328151002e-05, "loss": 0.2697, "step": 3785 }, { "epoch": 3.5948255399952527, "grad_norm": 0.061194755136966705, "learning_rate": 3.647854718396642e-05, "loss": 0.2345, "step": 3786 }, { "epoch": 3.5957749821979585, "grad_norm": 0.0968698039650917, "learning_rate": 3.643247370281903e-05, "loss": 0.2414, "step": 3787 }, { "epoch": 3.596724424400665, "grad_norm": 0.054514043033123016, "learning_rate": 3.638642285447201e-05, "loss": 0.2568, "step": 3788 }, { "epoch": 3.5976738666033707, "grad_norm": 0.04937252402305603, "learning_rate": 3.6340394655321465e-05, "loss": 0.2523, "step": 3789 }, { "epoch": 3.5986233088060766, "grad_norm": 0.06724977493286133, "learning_rate": 3.62943891217554e-05, "loss": 0.2568, "step": 3790 }, { "epoch": 3.5995727510087825, "grad_norm": 0.049823611974716187, "learning_rate": 3.624840627015385e-05, "loss": 0.2551, "step": 3791 }, { "epoch": 3.6005221932114884, "grad_norm": 0.0614524707198143, "learning_rate": 3.6202446116888666e-05, "loss": 0.2546, "step": 3792 }, { "epoch": 3.6014716354141942, "grad_norm": 0.0789070874452591, "learning_rate": 3.6156508678323676e-05, "loss": 0.2715, "step": 3793 }, { "epoch": 3.6024210776169, "grad_norm": 0.05372389405965805, "learning_rate": 3.611059397081459e-05, "loss": 0.2418, "step": 3794 }, { "epoch": 3.603370519819606, "grad_norm": 0.057337675243616104, "learning_rate": 3.606470201070904e-05, "loss": 0.2496, "step": 3795 }, { "epoch": 3.604319962022312, "grad_norm": 0.07104726880788803, "learning_rate": 3.601883281434652e-05, "loss": 0.2539, "step": 3796 }, { "epoch": 3.6052694042250177, "grad_norm": 0.05221518501639366, "learning_rate": 3.597298639805853e-05, "loss": 0.2468, "step": 3797 }, { "epoch": 3.6062188464277236, "grad_norm": 0.06567849218845367, "learning_rate": 3.5927162778168355e-05, "loss": 0.2375, "step": 3798 }, { "epoch": 3.6071682886304295, "grad_norm": 0.05055369809269905, "learning_rate": 3.588136197099119e-05, "loss": 0.2625, "step": 3799 }, { "epoch": 3.6081177308331354, "grad_norm": 0.046621520072221756, "learning_rate": 3.58355839928341e-05, "loss": 0.2403, "step": 3800 }, { "epoch": 3.6090671730358412, "grad_norm": 0.06843309849500656, "learning_rate": 3.5789828859996025e-05, "loss": 0.273, "step": 3801 }, { "epoch": 3.610016615238547, "grad_norm": 0.05004161596298218, "learning_rate": 3.574409658876785e-05, "loss": 0.2514, "step": 3802 }, { "epoch": 3.6109660574412534, "grad_norm": 0.04522772133350372, "learning_rate": 3.5698387195432146e-05, "loss": 0.2468, "step": 3803 }, { "epoch": 3.6119154996439593, "grad_norm": 0.05035033077001572, "learning_rate": 3.565270069626352e-05, "loss": 0.2537, "step": 3804 }, { "epoch": 3.612864941846665, "grad_norm": 0.04506196454167366, "learning_rate": 3.5607037107528326e-05, "loss": 0.2504, "step": 3805 }, { "epoch": 3.613814384049371, "grad_norm": 0.09473405033349991, "learning_rate": 3.5561396445484765e-05, "loss": 0.2283, "step": 3806 }, { "epoch": 3.614763826252077, "grad_norm": 0.05030398070812225, "learning_rate": 3.5515778726382966e-05, "loss": 0.248, "step": 3807 }, { "epoch": 3.615713268454783, "grad_norm": 0.05178692564368248, "learning_rate": 3.54701839664647e-05, "loss": 0.253, "step": 3808 }, { "epoch": 3.6166627106574887, "grad_norm": 0.050760503858327866, "learning_rate": 3.542461218196379e-05, "loss": 0.2512, "step": 3809 }, { "epoch": 3.6176121528601946, "grad_norm": 0.045815981924533844, "learning_rate": 3.5379063389105727e-05, "loss": 0.2473, "step": 3810 }, { "epoch": 3.6185615950629004, "grad_norm": 0.07337402552366257, "learning_rate": 3.533353760410786e-05, "loss": 0.2436, "step": 3811 }, { "epoch": 3.6195110372656063, "grad_norm": 0.05059612914919853, "learning_rate": 3.528803484317934e-05, "loss": 0.25, "step": 3812 }, { "epoch": 3.6204604794683126, "grad_norm": 0.04782482236623764, "learning_rate": 3.524255512252112e-05, "loss": 0.256, "step": 3813 }, { "epoch": 3.6214099216710185, "grad_norm": 0.04968879744410515, "learning_rate": 3.519709845832598e-05, "loss": 0.2486, "step": 3814 }, { "epoch": 3.6223593638737244, "grad_norm": 0.05315404012799263, "learning_rate": 3.515166486677848e-05, "loss": 0.2491, "step": 3815 }, { "epoch": 3.6233088060764302, "grad_norm": 0.053226448595523834, "learning_rate": 3.510625436405491e-05, "loss": 0.2607, "step": 3816 }, { "epoch": 3.624258248279136, "grad_norm": 0.052453454583883286, "learning_rate": 3.5060866966323405e-05, "loss": 0.2579, "step": 3817 }, { "epoch": 3.625207690481842, "grad_norm": 0.10191935300827026, "learning_rate": 3.501550268974385e-05, "loss": 0.2735, "step": 3818 }, { "epoch": 3.626157132684548, "grad_norm": 0.05817404016852379, "learning_rate": 3.497016155046786e-05, "loss": 0.2608, "step": 3819 }, { "epoch": 3.6271065748872537, "grad_norm": 0.05418933928012848, "learning_rate": 3.4924843564638945e-05, "loss": 0.2555, "step": 3820 }, { "epoch": 3.6280560170899596, "grad_norm": 0.053123462945222855, "learning_rate": 3.487954874839214e-05, "loss": 0.2495, "step": 3821 }, { "epoch": 3.6290054592926655, "grad_norm": 0.04980839043855667, "learning_rate": 3.483427711785449e-05, "loss": 0.2564, "step": 3822 }, { "epoch": 3.6299549014953714, "grad_norm": 0.06832653284072876, "learning_rate": 3.478902868914461e-05, "loss": 0.2639, "step": 3823 }, { "epoch": 3.6309043436980772, "grad_norm": 0.05591224133968353, "learning_rate": 3.4743803478372874e-05, "loss": 0.255, "step": 3824 }, { "epoch": 3.631853785900783, "grad_norm": 0.05391015112400055, "learning_rate": 3.469860150164152e-05, "loss": 0.2565, "step": 3825 }, { "epoch": 3.632803228103489, "grad_norm": 0.05338095501065254, "learning_rate": 3.465342277504428e-05, "loss": 0.2568, "step": 3826 }, { "epoch": 3.633752670306195, "grad_norm": 0.04986494407057762, "learning_rate": 3.460826731466685e-05, "loss": 0.2532, "step": 3827 }, { "epoch": 3.634702112508901, "grad_norm": 0.053041331470012665, "learning_rate": 3.45631351365865e-05, "loss": 0.2473, "step": 3828 }, { "epoch": 3.635651554711607, "grad_norm": 0.08892609924077988, "learning_rate": 3.451802625687225e-05, "loss": 0.2584, "step": 3829 }, { "epoch": 3.636600996914313, "grad_norm": 0.0916905626654625, "learning_rate": 3.447294069158481e-05, "loss": 0.2617, "step": 3830 }, { "epoch": 3.637550439117019, "grad_norm": 0.05388447269797325, "learning_rate": 3.4427878456776573e-05, "loss": 0.2533, "step": 3831 }, { "epoch": 3.6384998813197247, "grad_norm": 0.09179041534662247, "learning_rate": 3.438283956849172e-05, "loss": 0.2356, "step": 3832 }, { "epoch": 3.6394493235224306, "grad_norm": 0.07118596136569977, "learning_rate": 3.433782404276601e-05, "loss": 0.2463, "step": 3833 }, { "epoch": 3.6403987657251364, "grad_norm": 0.059209588915109634, "learning_rate": 3.429283189562694e-05, "loss": 0.2575, "step": 3834 }, { "epoch": 3.6413482079278423, "grad_norm": 0.06903867423534393, "learning_rate": 3.424786314309365e-05, "loss": 0.2342, "step": 3835 }, { "epoch": 3.642297650130548, "grad_norm": 0.04557694122195244, "learning_rate": 3.420291780117698e-05, "loss": 0.2468, "step": 3836 }, { "epoch": 3.643247092333254, "grad_norm": 0.060068871825933456, "learning_rate": 3.415799588587939e-05, "loss": 0.2571, "step": 3837 }, { "epoch": 3.6441965345359604, "grad_norm": 0.053642213344573975, "learning_rate": 3.411309741319511e-05, "loss": 0.2604, "step": 3838 }, { "epoch": 3.6451459767386662, "grad_norm": 0.05564951151609421, "learning_rate": 3.4068222399109884e-05, "loss": 0.2595, "step": 3839 }, { "epoch": 3.646095418941372, "grad_norm": 0.11451321840286255, "learning_rate": 3.402337085960119e-05, "loss": 0.2544, "step": 3840 }, { "epoch": 3.647044861144078, "grad_norm": 0.05393780395388603, "learning_rate": 3.3978542810638125e-05, "loss": 0.2515, "step": 3841 }, { "epoch": 3.647994303346784, "grad_norm": 0.05257837474346161, "learning_rate": 3.393373826818137e-05, "loss": 0.2544, "step": 3842 }, { "epoch": 3.6489437455494897, "grad_norm": 0.07172742486000061, "learning_rate": 3.388895724818341e-05, "loss": 0.2375, "step": 3843 }, { "epoch": 3.6498931877521956, "grad_norm": 0.06196949630975723, "learning_rate": 3.384419976658808e-05, "loss": 0.2552, "step": 3844 }, { "epoch": 3.6508426299549015, "grad_norm": 0.04753759130835533, "learning_rate": 3.37994658393311e-05, "loss": 0.2506, "step": 3845 }, { "epoch": 3.6517920721576074, "grad_norm": 0.06825859099626541, "learning_rate": 3.3754755482339653e-05, "loss": 0.2814, "step": 3846 }, { "epoch": 3.6527415143603132, "grad_norm": 0.13267584145069122, "learning_rate": 3.371006871153254e-05, "loss": 0.2458, "step": 3847 }, { "epoch": 3.653690956563019, "grad_norm": 0.06627479940652847, "learning_rate": 3.366540554282028e-05, "loss": 0.2419, "step": 3848 }, { "epoch": 3.654640398765725, "grad_norm": 0.04707813635468483, "learning_rate": 3.362076599210479e-05, "loss": 0.2456, "step": 3849 }, { "epoch": 3.655589840968431, "grad_norm": 0.09856180846691132, "learning_rate": 3.357615007527976e-05, "loss": 0.2373, "step": 3850 }, { "epoch": 3.6565392831711367, "grad_norm": 0.05382237583398819, "learning_rate": 3.3531557808230387e-05, "loss": 0.2615, "step": 3851 }, { "epoch": 3.6574887253738426, "grad_norm": 0.057159341871738434, "learning_rate": 3.348698920683343e-05, "loss": 0.2486, "step": 3852 }, { "epoch": 3.658438167576549, "grad_norm": 0.05542777106165886, "learning_rate": 3.344244428695728e-05, "loss": 0.2537, "step": 3853 }, { "epoch": 3.659387609779255, "grad_norm": 0.06226300820708275, "learning_rate": 3.3397923064461786e-05, "loss": 0.2522, "step": 3854 }, { "epoch": 3.6603370519819607, "grad_norm": 0.05447829142212868, "learning_rate": 3.3353425555198547e-05, "loss": 0.2626, "step": 3855 }, { "epoch": 3.6612864941846666, "grad_norm": 0.08190428465604782, "learning_rate": 3.330895177501056e-05, "loss": 0.2403, "step": 3856 }, { "epoch": 3.6622359363873724, "grad_norm": 0.09613342583179474, "learning_rate": 3.32645017397324e-05, "loss": 0.243, "step": 3857 }, { "epoch": 3.6631853785900783, "grad_norm": 0.07900498062372208, "learning_rate": 3.3220075465190246e-05, "loss": 0.2551, "step": 3858 }, { "epoch": 3.664134820792784, "grad_norm": 0.0935310423374176, "learning_rate": 3.317567296720177e-05, "loss": 0.2445, "step": 3859 }, { "epoch": 3.66508426299549, "grad_norm": 0.12767313420772552, "learning_rate": 3.313129426157613e-05, "loss": 0.2532, "step": 3860 }, { "epoch": 3.666033705198196, "grad_norm": 0.051936663687229156, "learning_rate": 3.308693936411421e-05, "loss": 0.251, "step": 3861 }, { "epoch": 3.666983147400902, "grad_norm": 0.05065144971013069, "learning_rate": 3.3042608290608124e-05, "loss": 0.2514, "step": 3862 }, { "epoch": 3.667932589603608, "grad_norm": 0.05864845588803291, "learning_rate": 3.2998301056841774e-05, "loss": 0.2554, "step": 3863 }, { "epoch": 3.668882031806314, "grad_norm": 0.05251367390155792, "learning_rate": 3.2954017678590406e-05, "loss": 0.2494, "step": 3864 }, { "epoch": 3.66983147400902, "grad_norm": 0.047137439250946045, "learning_rate": 3.290975817162082e-05, "loss": 0.2433, "step": 3865 }, { "epoch": 3.6707809162117258, "grad_norm": 0.04499272257089615, "learning_rate": 3.2865522551691396e-05, "loss": 0.2548, "step": 3866 }, { "epoch": 3.6717303584144316, "grad_norm": 0.05629131570458412, "learning_rate": 3.282131083455183e-05, "loss": 0.2491, "step": 3867 }, { "epoch": 3.6726798006171375, "grad_norm": 0.05436830222606659, "learning_rate": 3.277712303594349e-05, "loss": 0.2521, "step": 3868 }, { "epoch": 3.6736292428198434, "grad_norm": 0.052901942282915115, "learning_rate": 3.273295917159912e-05, "loss": 0.2491, "step": 3869 }, { "epoch": 3.6745786850225493, "grad_norm": 0.05838017538189888, "learning_rate": 3.268881925724297e-05, "loss": 0.2704, "step": 3870 }, { "epoch": 3.675528127225255, "grad_norm": 0.05413403362035751, "learning_rate": 3.264470330859082e-05, "loss": 0.2579, "step": 3871 }, { "epoch": 3.676477569427961, "grad_norm": 0.05225560441613197, "learning_rate": 3.260061134134976e-05, "loss": 0.2567, "step": 3872 }, { "epoch": 3.677427011630667, "grad_norm": 0.059620507061481476, "learning_rate": 3.255654337121855e-05, "loss": 0.2611, "step": 3873 }, { "epoch": 3.6783764538333728, "grad_norm": 0.05178217962384224, "learning_rate": 3.2512499413887255e-05, "loss": 0.2583, "step": 3874 }, { "epoch": 3.6793258960360786, "grad_norm": 0.04592955484986305, "learning_rate": 3.246847948503744e-05, "loss": 0.2579, "step": 3875 }, { "epoch": 3.6802753382387845, "grad_norm": 0.051579222083091736, "learning_rate": 3.2424483600342104e-05, "loss": 0.2579, "step": 3876 }, { "epoch": 3.6812247804414904, "grad_norm": 0.04873146116733551, "learning_rate": 3.238051177546571e-05, "loss": 0.2493, "step": 3877 }, { "epoch": 3.6821742226441967, "grad_norm": 0.048910629004240036, "learning_rate": 3.2336564026064084e-05, "loss": 0.2634, "step": 3878 }, { "epoch": 3.6831236648469026, "grad_norm": 0.049188822507858276, "learning_rate": 3.229264036778462e-05, "loss": 0.2525, "step": 3879 }, { "epoch": 3.6840731070496084, "grad_norm": 0.04678282514214516, "learning_rate": 3.224874081626601e-05, "loss": 0.2537, "step": 3880 }, { "epoch": 3.6850225492523143, "grad_norm": 0.06820831447839737, "learning_rate": 3.220486538713839e-05, "loss": 0.2694, "step": 3881 }, { "epoch": 3.68597199145502, "grad_norm": 0.04611432924866676, "learning_rate": 3.216101409602333e-05, "loss": 0.2534, "step": 3882 }, { "epoch": 3.686921433657726, "grad_norm": 0.06939106434583664, "learning_rate": 3.211718695853375e-05, "loss": 0.2687, "step": 3883 }, { "epoch": 3.687870875860432, "grad_norm": 0.04914408549666405, "learning_rate": 3.207338399027413e-05, "loss": 0.2538, "step": 3884 }, { "epoch": 3.688820318063138, "grad_norm": 0.049557916820049286, "learning_rate": 3.202960520684009e-05, "loss": 0.2307, "step": 3885 }, { "epoch": 3.6897697602658437, "grad_norm": 0.05168003961443901, "learning_rate": 3.198585062381886e-05, "loss": 0.26, "step": 3886 }, { "epoch": 3.6907192024685496, "grad_norm": 0.09558333456516266, "learning_rate": 3.194212025678896e-05, "loss": 0.2583, "step": 3887 }, { "epoch": 3.691668644671256, "grad_norm": 0.07344388216733932, "learning_rate": 3.1898414121320276e-05, "loss": 0.2757, "step": 3888 }, { "epoch": 3.6926180868739618, "grad_norm": 0.05108393356204033, "learning_rate": 3.185473223297416e-05, "loss": 0.2616, "step": 3889 }, { "epoch": 3.6935675290766676, "grad_norm": 0.06204976141452789, "learning_rate": 3.1811074607303135e-05, "loss": 0.2602, "step": 3890 }, { "epoch": 3.6945169712793735, "grad_norm": 0.0552101694047451, "learning_rate": 3.1767441259851374e-05, "loss": 0.2535, "step": 3891 }, { "epoch": 3.6954664134820794, "grad_norm": 0.05207996442914009, "learning_rate": 3.172383220615408e-05, "loss": 0.2513, "step": 3892 }, { "epoch": 3.6964158556847853, "grad_norm": 0.06118466332554817, "learning_rate": 3.168024746173808e-05, "loss": 0.2458, "step": 3893 }, { "epoch": 3.697365297887491, "grad_norm": 0.05093217268586159, "learning_rate": 3.16366870421214e-05, "loss": 0.2507, "step": 3894 }, { "epoch": 3.698314740090197, "grad_norm": 0.06563693284988403, "learning_rate": 3.1593150962813424e-05, "loss": 0.274, "step": 3895 }, { "epoch": 3.699264182292903, "grad_norm": 0.07928726822137833, "learning_rate": 3.154963923931496e-05, "loss": 0.2644, "step": 3896 }, { "epoch": 3.7002136244956088, "grad_norm": 0.08785230666399002, "learning_rate": 3.1506151887117974e-05, "loss": 0.2404, "step": 3897 }, { "epoch": 3.7011630666983146, "grad_norm": 0.05994411185383797, "learning_rate": 3.146268892170592e-05, "loss": 0.2451, "step": 3898 }, { "epoch": 3.7021125089010205, "grad_norm": 0.057615093886852264, "learning_rate": 3.1419250358553474e-05, "loss": 0.2688, "step": 3899 }, { "epoch": 3.7030619511037264, "grad_norm": 0.06122539937496185, "learning_rate": 3.137583621312665e-05, "loss": 0.2647, "step": 3900 }, { "epoch": 3.7040113933064323, "grad_norm": 0.04660304635763168, "learning_rate": 3.1332446500882794e-05, "loss": 0.2519, "step": 3901 }, { "epoch": 3.704960835509138, "grad_norm": 0.05333678424358368, "learning_rate": 3.12890812372705e-05, "loss": 0.2445, "step": 3902 }, { "epoch": 3.7059102777118444, "grad_norm": 0.053378038108348846, "learning_rate": 3.124574043772967e-05, "loss": 0.2421, "step": 3903 }, { "epoch": 3.7068597199145503, "grad_norm": 0.06653116643428802, "learning_rate": 3.1202424117691566e-05, "loss": 0.2571, "step": 3904 }, { "epoch": 3.707809162117256, "grad_norm": 0.04834749549627304, "learning_rate": 3.115913229257864e-05, "loss": 0.2509, "step": 3905 }, { "epoch": 3.708758604319962, "grad_norm": 0.08246218413114548, "learning_rate": 3.1115864977804676e-05, "loss": 0.2326, "step": 3906 }, { "epoch": 3.709708046522668, "grad_norm": 0.05619177594780922, "learning_rate": 3.107262218877473e-05, "loss": 0.2535, "step": 3907 }, { "epoch": 3.710657488725374, "grad_norm": 0.08788593858480453, "learning_rate": 3.102940394088504e-05, "loss": 0.2563, "step": 3908 }, { "epoch": 3.7116069309280797, "grad_norm": 0.05328623577952385, "learning_rate": 3.0986210249523315e-05, "loss": 0.256, "step": 3909 }, { "epoch": 3.7125563731307856, "grad_norm": 0.06127059459686279, "learning_rate": 3.094304113006824e-05, "loss": 0.258, "step": 3910 }, { "epoch": 3.7135058153334914, "grad_norm": 0.04658037796616554, "learning_rate": 3.089989659788999e-05, "loss": 0.2488, "step": 3911 }, { "epoch": 3.7144552575361973, "grad_norm": 0.05059393495321274, "learning_rate": 3.085677666834986e-05, "loss": 0.2474, "step": 3912 }, { "epoch": 3.7154046997389036, "grad_norm": 0.054072484374046326, "learning_rate": 3.0813681356800405e-05, "loss": 0.2475, "step": 3913 }, { "epoch": 3.7163541419416095, "grad_norm": 0.06147841364145279, "learning_rate": 3.07706106785855e-05, "loss": 0.2452, "step": 3914 }, { "epoch": 3.7173035841443154, "grad_norm": 0.06334567070007324, "learning_rate": 3.072756464904006e-05, "loss": 0.2608, "step": 3915 }, { "epoch": 3.7182530263470213, "grad_norm": 0.1141507476568222, "learning_rate": 3.068454328349044e-05, "loss": 0.2692, "step": 3916 }, { "epoch": 3.719202468549727, "grad_norm": 0.05562411993741989, "learning_rate": 3.064154659725408e-05, "loss": 0.2566, "step": 3917 }, { "epoch": 3.720151910752433, "grad_norm": 0.05495092645287514, "learning_rate": 3.059857460563966e-05, "loss": 0.2597, "step": 3918 }, { "epoch": 3.721101352955139, "grad_norm": 0.0544046126306057, "learning_rate": 3.0555627323947076e-05, "loss": 0.2711, "step": 3919 }, { "epoch": 3.7220507951578448, "grad_norm": 0.061430271714925766, "learning_rate": 3.0512704767467413e-05, "loss": 0.2688, "step": 3920 }, { "epoch": 3.7230002373605506, "grad_norm": 0.08111298084259033, "learning_rate": 3.0469806951483017e-05, "loss": 0.2747, "step": 3921 }, { "epoch": 3.7239496795632565, "grad_norm": 0.05766540765762329, "learning_rate": 3.0426933891267327e-05, "loss": 0.2575, "step": 3922 }, { "epoch": 3.7248991217659624, "grad_norm": 0.05006188526749611, "learning_rate": 3.0384085602085044e-05, "loss": 0.2515, "step": 3923 }, { "epoch": 3.7258485639686683, "grad_norm": 0.056151438504457474, "learning_rate": 3.0341262099191993e-05, "loss": 0.2665, "step": 3924 }, { "epoch": 3.726798006171374, "grad_norm": 0.060625769197940826, "learning_rate": 3.029846339783522e-05, "loss": 0.2579, "step": 3925 }, { "epoch": 3.72774744837408, "grad_norm": 0.05002756416797638, "learning_rate": 3.025568951325287e-05, "loss": 0.2626, "step": 3926 }, { "epoch": 3.728696890576786, "grad_norm": 0.051841624081134796, "learning_rate": 3.021294046067439e-05, "loss": 0.2545, "step": 3927 }, { "epoch": 3.729646332779492, "grad_norm": 0.04806916415691376, "learning_rate": 3.0170216255320262e-05, "loss": 0.2278, "step": 3928 }, { "epoch": 3.730595774982198, "grad_norm": 0.04851704090833664, "learning_rate": 3.0127516912402142e-05, "loss": 0.2524, "step": 3929 }, { "epoch": 3.731545217184904, "grad_norm": 0.05547931790351868, "learning_rate": 3.0084842447122864e-05, "loss": 0.2621, "step": 3930 }, { "epoch": 3.73249465938761, "grad_norm": 0.05749111995100975, "learning_rate": 3.0042192874676365e-05, "loss": 0.2245, "step": 3931 }, { "epoch": 3.7334441015903157, "grad_norm": 0.045757438987493515, "learning_rate": 2.999956821024783e-05, "loss": 0.2525, "step": 3932 }, { "epoch": 3.7343935437930216, "grad_norm": 0.05155427008867264, "learning_rate": 2.9956968469013368e-05, "loss": 0.2581, "step": 3933 }, { "epoch": 3.7353429859957274, "grad_norm": 0.0530339851975441, "learning_rate": 2.991439366614043e-05, "loss": 0.2489, "step": 3934 }, { "epoch": 3.7362924281984333, "grad_norm": 0.04974301531910896, "learning_rate": 2.987184381678747e-05, "loss": 0.2487, "step": 3935 }, { "epoch": 3.737241870401139, "grad_norm": 0.06348806619644165, "learning_rate": 2.9829318936104044e-05, "loss": 0.2383, "step": 3936 }, { "epoch": 3.7381913126038455, "grad_norm": 0.06022779271006584, "learning_rate": 2.978681903923095e-05, "loss": 0.2293, "step": 3937 }, { "epoch": 3.7391407548065514, "grad_norm": 0.10987991839647293, "learning_rate": 2.9744344141299884e-05, "loss": 0.2495, "step": 3938 }, { "epoch": 3.7400901970092573, "grad_norm": 0.04573450982570648, "learning_rate": 2.9701894257433826e-05, "loss": 0.2496, "step": 3939 }, { "epoch": 3.741039639211963, "grad_norm": 0.0443129725754261, "learning_rate": 2.9659469402746777e-05, "loss": 0.2487, "step": 3940 }, { "epoch": 3.741989081414669, "grad_norm": 0.05262639373540878, "learning_rate": 2.9617069592343804e-05, "loss": 0.2552, "step": 3941 }, { "epoch": 3.742938523617375, "grad_norm": 0.04952634498476982, "learning_rate": 2.9574694841321082e-05, "loss": 0.2432, "step": 3942 }, { "epoch": 3.7438879658200808, "grad_norm": 0.05165358632802963, "learning_rate": 2.953234516476584e-05, "loss": 0.2466, "step": 3943 }, { "epoch": 3.7448374080227866, "grad_norm": 0.05307038128376007, "learning_rate": 2.9490020577756473e-05, "loss": 0.2457, "step": 3944 }, { "epoch": 3.7457868502254925, "grad_norm": 0.0756666511297226, "learning_rate": 2.9447721095362324e-05, "loss": 0.2401, "step": 3945 }, { "epoch": 3.7467362924281984, "grad_norm": 0.11313078552484512, "learning_rate": 2.940544673264385e-05, "loss": 0.2345, "step": 3946 }, { "epoch": 3.7476857346309043, "grad_norm": 0.0538734532892704, "learning_rate": 2.9363197504652573e-05, "loss": 0.2535, "step": 3947 }, { "epoch": 3.74863517683361, "grad_norm": 0.06038649380207062, "learning_rate": 2.932097342643103e-05, "loss": 0.2531, "step": 3948 }, { "epoch": 3.749584619036316, "grad_norm": 0.05886239930987358, "learning_rate": 2.927877451301282e-05, "loss": 0.2583, "step": 3949 }, { "epoch": 3.750534061239022, "grad_norm": 0.05016456916928291, "learning_rate": 2.9236600779422673e-05, "loss": 0.2526, "step": 3950 }, { "epoch": 3.7514835034417278, "grad_norm": 0.04979074373841286, "learning_rate": 2.919445224067614e-05, "loss": 0.2468, "step": 3951 }, { "epoch": 3.7524329456444336, "grad_norm": 0.05229020491242409, "learning_rate": 2.9152328911780026e-05, "loss": 0.2526, "step": 3952 }, { "epoch": 3.75338238784714, "grad_norm": 0.055056676268577576, "learning_rate": 2.911023080773204e-05, "loss": 0.2489, "step": 3953 }, { "epoch": 3.754331830049846, "grad_norm": 0.053240686655044556, "learning_rate": 2.9068157943520903e-05, "loss": 0.2546, "step": 3954 }, { "epoch": 3.7552812722525517, "grad_norm": 0.05663219094276428, "learning_rate": 2.902611033412648e-05, "loss": 0.2471, "step": 3955 }, { "epoch": 3.7562307144552576, "grad_norm": 0.05138954147696495, "learning_rate": 2.8984087994519405e-05, "loss": 0.2511, "step": 3956 }, { "epoch": 3.7571801566579635, "grad_norm": 0.05986800789833069, "learning_rate": 2.894209093966157e-05, "loss": 0.2565, "step": 3957 }, { "epoch": 3.7581295988606693, "grad_norm": 0.04984092339873314, "learning_rate": 2.8900119184505704e-05, "loss": 0.2565, "step": 3958 }, { "epoch": 3.759079041063375, "grad_norm": 0.06115711107850075, "learning_rate": 2.8858172743995547e-05, "loss": 0.2378, "step": 3959 }, { "epoch": 3.760028483266081, "grad_norm": 0.10267126560211182, "learning_rate": 2.881625163306596e-05, "loss": 0.2467, "step": 3960 }, { "epoch": 3.760977925468787, "grad_norm": 0.05122007802128792, "learning_rate": 2.8774355866642543e-05, "loss": 0.2565, "step": 3961 }, { "epoch": 3.7619273676714933, "grad_norm": 0.0978183001279831, "learning_rate": 2.87324854596421e-05, "loss": 0.247, "step": 3962 }, { "epoch": 3.762876809874199, "grad_norm": 0.04869680851697922, "learning_rate": 2.8690640426972292e-05, "loss": 0.2497, "step": 3963 }, { "epoch": 3.763826252076905, "grad_norm": 0.06409458816051483, "learning_rate": 2.864882078353176e-05, "loss": 0.2604, "step": 3964 }, { "epoch": 3.764775694279611, "grad_norm": 0.055247921496629715, "learning_rate": 2.8607026544210114e-05, "loss": 0.2536, "step": 3965 }, { "epoch": 3.7657251364823168, "grad_norm": 0.04963747039437294, "learning_rate": 2.8565257723887918e-05, "loss": 0.246, "step": 3966 }, { "epoch": 3.7666745786850226, "grad_norm": 0.04997308924794197, "learning_rate": 2.8523514337436663e-05, "loss": 0.2582, "step": 3967 }, { "epoch": 3.7676240208877285, "grad_norm": 0.057859741151332855, "learning_rate": 2.8481796399718874e-05, "loss": 0.2591, "step": 3968 }, { "epoch": 3.7685734630904344, "grad_norm": 0.05374854803085327, "learning_rate": 2.84401039255879e-05, "loss": 0.2586, "step": 3969 }, { "epoch": 3.7695229052931403, "grad_norm": 0.05135156214237213, "learning_rate": 2.8398436929888085e-05, "loss": 0.2621, "step": 3970 }, { "epoch": 3.770472347495846, "grad_norm": 0.10533904284238815, "learning_rate": 2.8356795427454674e-05, "loss": 0.2445, "step": 3971 }, { "epoch": 3.771421789698552, "grad_norm": 0.04445955157279968, "learning_rate": 2.8315179433113847e-05, "loss": 0.2421, "step": 3972 }, { "epoch": 3.772371231901258, "grad_norm": 0.05651763081550598, "learning_rate": 2.8273588961682774e-05, "loss": 0.2545, "step": 3973 }, { "epoch": 3.7733206741039638, "grad_norm": 0.05163528770208359, "learning_rate": 2.8232024027969362e-05, "loss": 0.2576, "step": 3974 }, { "epoch": 3.7742701163066696, "grad_norm": 0.09719181060791016, "learning_rate": 2.8190484646772607e-05, "loss": 0.2358, "step": 3975 }, { "epoch": 3.7752195585093755, "grad_norm": 0.05339033901691437, "learning_rate": 2.8148970832882326e-05, "loss": 0.2527, "step": 3976 }, { "epoch": 3.7761690007120814, "grad_norm": 0.055794693529605865, "learning_rate": 2.8107482601079183e-05, "loss": 0.2472, "step": 3977 }, { "epoch": 3.7771184429147877, "grad_norm": 0.0768372118473053, "learning_rate": 2.8066019966134904e-05, "loss": 0.2547, "step": 3978 }, { "epoch": 3.7780678851174936, "grad_norm": 0.06772004067897797, "learning_rate": 2.8024582942811862e-05, "loss": 0.2601, "step": 3979 }, { "epoch": 3.7790173273201995, "grad_norm": 0.07262419164180756, "learning_rate": 2.798317154586352e-05, "loss": 0.2752, "step": 3980 }, { "epoch": 3.7799667695229053, "grad_norm": 0.05989028513431549, "learning_rate": 2.7941785790034104e-05, "loss": 0.2617, "step": 3981 }, { "epoch": 3.780916211725611, "grad_norm": 0.058179810643196106, "learning_rate": 2.790042569005874e-05, "loss": 0.2415, "step": 3982 }, { "epoch": 3.781865653928317, "grad_norm": 0.053919147700071335, "learning_rate": 2.7859091260663427e-05, "loss": 0.2474, "step": 3983 }, { "epoch": 3.782815096131023, "grad_norm": 0.04912007227540016, "learning_rate": 2.781778251656498e-05, "loss": 0.2537, "step": 3984 }, { "epoch": 3.783764538333729, "grad_norm": 0.051920775324106216, "learning_rate": 2.7776499472471185e-05, "loss": 0.2285, "step": 3985 }, { "epoch": 3.7847139805364347, "grad_norm": 0.049069683998823166, "learning_rate": 2.773524214308054e-05, "loss": 0.2546, "step": 3986 }, { "epoch": 3.785663422739141, "grad_norm": 0.05164619907736778, "learning_rate": 2.7694010543082472e-05, "loss": 0.2411, "step": 3987 }, { "epoch": 3.786612864941847, "grad_norm": 0.06377508491277695, "learning_rate": 2.7652804687157208e-05, "loss": 0.2487, "step": 3988 }, { "epoch": 3.7875623071445528, "grad_norm": 0.06035701185464859, "learning_rate": 2.7611624589975816e-05, "loss": 0.2505, "step": 3989 }, { "epoch": 3.7885117493472587, "grad_norm": 0.05832657963037491, "learning_rate": 2.7570470266200176e-05, "loss": 0.24, "step": 3990 }, { "epoch": 3.7894611915499645, "grad_norm": 0.04982447996735573, "learning_rate": 2.7529341730483117e-05, "loss": 0.2523, "step": 3991 }, { "epoch": 3.7904106337526704, "grad_norm": 0.050874706357717514, "learning_rate": 2.748823899746805e-05, "loss": 0.2556, "step": 3992 }, { "epoch": 3.7913600759553763, "grad_norm": 0.053797561675310135, "learning_rate": 2.7447162081789423e-05, "loss": 0.2425, "step": 3993 }, { "epoch": 3.792309518158082, "grad_norm": 0.05993964150547981, "learning_rate": 2.7406110998072375e-05, "loss": 0.2359, "step": 3994 }, { "epoch": 3.793258960360788, "grad_norm": 0.06711548566818237, "learning_rate": 2.736508576093285e-05, "loss": 0.2406, "step": 3995 }, { "epoch": 3.794208402563494, "grad_norm": 0.0883718952536583, "learning_rate": 2.7324086384977698e-05, "loss": 0.2678, "step": 3996 }, { "epoch": 3.7951578447661998, "grad_norm": 0.05363830551505089, "learning_rate": 2.728311288480436e-05, "loss": 0.2518, "step": 3997 }, { "epoch": 3.7961072869689056, "grad_norm": 0.044073816388845444, "learning_rate": 2.7242165275001273e-05, "loss": 0.252, "step": 3998 }, { "epoch": 3.7970567291716115, "grad_norm": 0.055176567286252975, "learning_rate": 2.720124357014754e-05, "loss": 0.2492, "step": 3999 }, { "epoch": 3.7980061713743174, "grad_norm": 0.055266376584768295, "learning_rate": 2.716034778481301e-05, "loss": 0.2626, "step": 4000 }, { "epoch": 3.7980061713743174, "eval_loss": 0.25707772374153137, "eval_runtime": 37.9163, "eval_samples_per_second": 2.268, "eval_steps_per_second": 2.268, "step": 4000 }, { "epoch": 3.7989556135770233, "grad_norm": 0.05252851918339729, "learning_rate": 2.7119477933558478e-05, "loss": 0.2514, "step": 4001 }, { "epoch": 3.799905055779729, "grad_norm": 0.052902501076459885, "learning_rate": 2.7078634030935258e-05, "loss": 0.2384, "step": 4002 }, { "epoch": 3.8008544979824355, "grad_norm": 0.07614622265100479, "learning_rate": 2.7037816091485668e-05, "loss": 0.2608, "step": 4003 }, { "epoch": 3.8018039401851413, "grad_norm": 0.14443829655647278, "learning_rate": 2.6997024129742542e-05, "loss": 0.2511, "step": 4004 }, { "epoch": 3.802753382387847, "grad_norm": 0.047908131033182144, "learning_rate": 2.6956258160229695e-05, "loss": 0.2458, "step": 4005 }, { "epoch": 3.803702824590553, "grad_norm": 0.04549311101436615, "learning_rate": 2.6915518197461553e-05, "loss": 0.2503, "step": 4006 }, { "epoch": 3.804652266793259, "grad_norm": 0.06348168849945068, "learning_rate": 2.6874804255943297e-05, "loss": 0.2641, "step": 4007 }, { "epoch": 3.805601708995965, "grad_norm": 0.060389503836631775, "learning_rate": 2.683411635017087e-05, "loss": 0.2549, "step": 4008 }, { "epoch": 3.8065511511986707, "grad_norm": 0.16582825779914856, "learning_rate": 2.6793454494630888e-05, "loss": 0.2791, "step": 4009 }, { "epoch": 3.8075005934013766, "grad_norm": 0.05757433548569679, "learning_rate": 2.675281870380082e-05, "loss": 0.2486, "step": 4010 }, { "epoch": 3.8084500356040825, "grad_norm": 0.06793633848428726, "learning_rate": 2.6712208992148736e-05, "loss": 0.2618, "step": 4011 }, { "epoch": 3.809399477806789, "grad_norm": 0.06024308502674103, "learning_rate": 2.6671625374133445e-05, "loss": 0.2603, "step": 4012 }, { "epoch": 3.8103489200094947, "grad_norm": 0.05375480651855469, "learning_rate": 2.6631067864204497e-05, "loss": 0.2551, "step": 4013 }, { "epoch": 3.8112983622122005, "grad_norm": 0.07164514809846878, "learning_rate": 2.6590536476802118e-05, "loss": 0.2615, "step": 4014 }, { "epoch": 3.8122478044149064, "grad_norm": 0.06168850138783455, "learning_rate": 2.65500312263572e-05, "loss": 0.2535, "step": 4015 }, { "epoch": 3.8131972466176123, "grad_norm": 0.05497964471578598, "learning_rate": 2.6509552127291447e-05, "loss": 0.2468, "step": 4016 }, { "epoch": 3.814146688820318, "grad_norm": 0.06022312492132187, "learning_rate": 2.6469099194017143e-05, "loss": 0.2685, "step": 4017 }, { "epoch": 3.815096131023024, "grad_norm": 0.05725991725921631, "learning_rate": 2.6428672440937285e-05, "loss": 0.2555, "step": 4018 }, { "epoch": 3.81604557322573, "grad_norm": 0.05475730448961258, "learning_rate": 2.638827188244556e-05, "loss": 0.2643, "step": 4019 }, { "epoch": 3.8169950154284358, "grad_norm": 0.05317939445376396, "learning_rate": 2.634789753292629e-05, "loss": 0.2503, "step": 4020 }, { "epoch": 3.8179444576311417, "grad_norm": 0.0483684316277504, "learning_rate": 2.6307549406754585e-05, "loss": 0.2553, "step": 4021 }, { "epoch": 3.8188938998338475, "grad_norm": 0.045412421226501465, "learning_rate": 2.626722751829601e-05, "loss": 0.243, "step": 4022 }, { "epoch": 3.8198433420365534, "grad_norm": 0.05453164875507355, "learning_rate": 2.622693188190699e-05, "loss": 0.2579, "step": 4023 }, { "epoch": 3.8207927842392593, "grad_norm": 0.05538022145628929, "learning_rate": 2.6186662511934513e-05, "loss": 0.2515, "step": 4024 }, { "epoch": 3.821742226441965, "grad_norm": 0.06117261201143265, "learning_rate": 2.6146419422716173e-05, "loss": 0.2556, "step": 4025 }, { "epoch": 3.822691668644671, "grad_norm": 0.06717471033334732, "learning_rate": 2.6106202628580355e-05, "loss": 0.2536, "step": 4026 }, { "epoch": 3.823641110847377, "grad_norm": 0.05201391875743866, "learning_rate": 2.6066012143845876e-05, "loss": 0.2442, "step": 4027 }, { "epoch": 3.824590553050083, "grad_norm": 0.04769446328282356, "learning_rate": 2.602584798282237e-05, "loss": 0.2501, "step": 4028 }, { "epoch": 3.825539995252789, "grad_norm": 0.05002529174089432, "learning_rate": 2.5985710159809996e-05, "loss": 0.2557, "step": 4029 }, { "epoch": 3.826489437455495, "grad_norm": 0.1064804345369339, "learning_rate": 2.594559868909956e-05, "loss": 0.2545, "step": 4030 }, { "epoch": 3.827438879658201, "grad_norm": 0.04813575744628906, "learning_rate": 2.5905513584972487e-05, "loss": 0.2421, "step": 4031 }, { "epoch": 3.8283883218609067, "grad_norm": 0.04889265075325966, "learning_rate": 2.58654548617008e-05, "loss": 0.2497, "step": 4032 }, { "epoch": 3.8293377640636126, "grad_norm": 0.06219366192817688, "learning_rate": 2.5825422533547184e-05, "loss": 0.2386, "step": 4033 }, { "epoch": 3.8302872062663185, "grad_norm": 0.06098899245262146, "learning_rate": 2.5785416614764867e-05, "loss": 0.2557, "step": 4034 }, { "epoch": 3.8312366484690243, "grad_norm": 0.04500975087285042, "learning_rate": 2.5745437119597705e-05, "loss": 0.2516, "step": 4035 }, { "epoch": 3.83218609067173, "grad_norm": 0.048414573073387146, "learning_rate": 2.5705484062280106e-05, "loss": 0.2564, "step": 4036 }, { "epoch": 3.8331355328744365, "grad_norm": 0.057486824691295624, "learning_rate": 2.5665557457037128e-05, "loss": 0.254, "step": 4037 }, { "epoch": 3.8340849750771424, "grad_norm": 0.06463531404733658, "learning_rate": 2.5625657318084318e-05, "loss": 0.276, "step": 4038 }, { "epoch": 3.8350344172798483, "grad_norm": 0.048133254051208496, "learning_rate": 2.558578365962796e-05, "loss": 0.2585, "step": 4039 }, { "epoch": 3.835983859482554, "grad_norm": 0.06365126371383667, "learning_rate": 2.5545936495864686e-05, "loss": 0.249, "step": 4040 }, { "epoch": 3.83693330168526, "grad_norm": 0.055381596088409424, "learning_rate": 2.5506115840981904e-05, "loss": 0.2498, "step": 4041 }, { "epoch": 3.837882743887966, "grad_norm": 0.07869044691324234, "learning_rate": 2.5466321709157482e-05, "loss": 0.2244, "step": 4042 }, { "epoch": 3.838832186090672, "grad_norm": 0.05803418159484863, "learning_rate": 2.542655411455982e-05, "loss": 0.2711, "step": 4043 }, { "epoch": 3.8397816282933777, "grad_norm": 0.05418579280376434, "learning_rate": 2.5386813071347992e-05, "loss": 0.2653, "step": 4044 }, { "epoch": 3.8407310704960835, "grad_norm": 0.06050990894436836, "learning_rate": 2.5347098593671414e-05, "loss": 0.2497, "step": 4045 }, { "epoch": 3.8416805126987894, "grad_norm": 0.05278535932302475, "learning_rate": 2.5307410695670275e-05, "loss": 0.2542, "step": 4046 }, { "epoch": 3.8426299549014953, "grad_norm": 0.05332508683204651, "learning_rate": 2.5267749391475148e-05, "loss": 0.259, "step": 4047 }, { "epoch": 3.843579397104201, "grad_norm": 0.051350705325603485, "learning_rate": 2.5228114695207172e-05, "loss": 0.2501, "step": 4048 }, { "epoch": 3.844528839306907, "grad_norm": 0.05481059476733208, "learning_rate": 2.5188506620978025e-05, "loss": 0.2639, "step": 4049 }, { "epoch": 3.845478281509613, "grad_norm": 0.06032145023345947, "learning_rate": 2.514892518288988e-05, "loss": 0.2499, "step": 4050 }, { "epoch": 3.846427723712319, "grad_norm": 0.051769085228443146, "learning_rate": 2.5109370395035514e-05, "loss": 0.2352, "step": 4051 }, { "epoch": 3.8473771659150247, "grad_norm": 0.05359676852822304, "learning_rate": 2.5069842271498102e-05, "loss": 0.2507, "step": 4052 }, { "epoch": 3.848326608117731, "grad_norm": 0.07494449615478516, "learning_rate": 2.5030340826351373e-05, "loss": 0.2712, "step": 4053 }, { "epoch": 3.849276050320437, "grad_norm": 0.05114074423909187, "learning_rate": 2.499086607365957e-05, "loss": 0.249, "step": 4054 }, { "epoch": 3.8502254925231427, "grad_norm": 0.054538123309612274, "learning_rate": 2.4951418027477402e-05, "loss": 0.2466, "step": 4055 }, { "epoch": 3.8511749347258486, "grad_norm": 0.04820817708969116, "learning_rate": 2.491199670185008e-05, "loss": 0.2512, "step": 4056 }, { "epoch": 3.8521243769285545, "grad_norm": 0.049865830689668655, "learning_rate": 2.4872602110813348e-05, "loss": 0.253, "step": 4057 }, { "epoch": 3.8530738191312603, "grad_norm": 0.0494709387421608, "learning_rate": 2.4833234268393378e-05, "loss": 0.2594, "step": 4058 }, { "epoch": 3.854023261333966, "grad_norm": 0.052232518792152405, "learning_rate": 2.479389318860682e-05, "loss": 0.2581, "step": 4059 }, { "epoch": 3.854972703536672, "grad_norm": 0.05848172679543495, "learning_rate": 2.475457888546081e-05, "loss": 0.2761, "step": 4060 }, { "epoch": 3.855922145739378, "grad_norm": 0.05332505330443382, "learning_rate": 2.471529137295292e-05, "loss": 0.2544, "step": 4061 }, { "epoch": 3.8568715879420843, "grad_norm": 0.05203618109226227, "learning_rate": 2.467603066507129e-05, "loss": 0.2569, "step": 4062 }, { "epoch": 3.85782103014479, "grad_norm": 0.06011023744940758, "learning_rate": 2.4636796775794336e-05, "loss": 0.2516, "step": 4063 }, { "epoch": 3.858770472347496, "grad_norm": 0.05454760044813156, "learning_rate": 2.4597589719091107e-05, "loss": 0.2633, "step": 4064 }, { "epoch": 3.859719914550202, "grad_norm": 0.04533764719963074, "learning_rate": 2.4558409508920986e-05, "loss": 0.2542, "step": 4065 }, { "epoch": 3.860669356752908, "grad_norm": 0.05031617358326912, "learning_rate": 2.4519256159233795e-05, "loss": 0.252, "step": 4066 }, { "epoch": 3.8616187989556137, "grad_norm": 0.05697787180542946, "learning_rate": 2.4480129683969932e-05, "loss": 0.2593, "step": 4067 }, { "epoch": 3.8625682411583195, "grad_norm": 0.0841057151556015, "learning_rate": 2.444103009705999e-05, "loss": 0.233, "step": 4068 }, { "epoch": 3.8635176833610254, "grad_norm": 0.08296506106853485, "learning_rate": 2.4401957412425214e-05, "loss": 0.2376, "step": 4069 }, { "epoch": 3.8644671255637313, "grad_norm": 0.055589281022548676, "learning_rate": 2.4362911643977147e-05, "loss": 0.2404, "step": 4070 }, { "epoch": 3.865416567766437, "grad_norm": 0.058519408106803894, "learning_rate": 2.4323892805617777e-05, "loss": 0.2535, "step": 4071 }, { "epoch": 3.866366009969143, "grad_norm": 0.04924516752362251, "learning_rate": 2.4284900911239517e-05, "loss": 0.2528, "step": 4072 }, { "epoch": 3.867315452171849, "grad_norm": 0.05597160384058952, "learning_rate": 2.424593597472512e-05, "loss": 0.2488, "step": 4073 }, { "epoch": 3.868264894374555, "grad_norm": 0.058191198855638504, "learning_rate": 2.420699800994787e-05, "loss": 0.2483, "step": 4074 }, { "epoch": 3.8692143365772607, "grad_norm": 0.07110270857810974, "learning_rate": 2.4168087030771346e-05, "loss": 0.2675, "step": 4075 }, { "epoch": 3.8701637787799665, "grad_norm": 0.06401795893907547, "learning_rate": 2.4129203051049555e-05, "loss": 0.2556, "step": 4076 }, { "epoch": 3.8711132209826724, "grad_norm": 0.047086797654628754, "learning_rate": 2.409034608462686e-05, "loss": 0.25, "step": 4077 }, { "epoch": 3.8720626631853787, "grad_norm": 0.05323222279548645, "learning_rate": 2.405151614533804e-05, "loss": 0.2477, "step": 4078 }, { "epoch": 3.8730121053880846, "grad_norm": 0.051141295582056046, "learning_rate": 2.401271324700821e-05, "loss": 0.2563, "step": 4079 }, { "epoch": 3.8739615475907905, "grad_norm": 0.05030396580696106, "learning_rate": 2.3973937403452983e-05, "loss": 0.2639, "step": 4080 }, { "epoch": 3.8749109897934964, "grad_norm": 0.058408744633197784, "learning_rate": 2.3935188628478123e-05, "loss": 0.2501, "step": 4081 }, { "epoch": 3.8758604319962022, "grad_norm": 0.055497270077466965, "learning_rate": 2.389646693587996e-05, "loss": 0.2524, "step": 4082 }, { "epoch": 3.876809874198908, "grad_norm": 0.05612848326563835, "learning_rate": 2.3857772339445063e-05, "loss": 0.2589, "step": 4083 }, { "epoch": 3.877759316401614, "grad_norm": 0.0453292652964592, "learning_rate": 2.3819104852950368e-05, "loss": 0.2514, "step": 4084 }, { "epoch": 3.87870875860432, "grad_norm": 0.059018030762672424, "learning_rate": 2.3780464490163267e-05, "loss": 0.2481, "step": 4085 }, { "epoch": 3.8796582008070257, "grad_norm": 0.05215967819094658, "learning_rate": 2.3741851264841297e-05, "loss": 0.2569, "step": 4086 }, { "epoch": 3.880607643009732, "grad_norm": 0.06927253305912018, "learning_rate": 2.3703265190732526e-05, "loss": 0.2708, "step": 4087 }, { "epoch": 3.881557085212438, "grad_norm": 0.05080414563417435, "learning_rate": 2.3664706281575233e-05, "loss": 0.2596, "step": 4088 }, { "epoch": 3.882506527415144, "grad_norm": 0.04832320287823677, "learning_rate": 2.3626174551098046e-05, "loss": 0.2498, "step": 4089 }, { "epoch": 3.8834559696178497, "grad_norm": 0.059689346700906754, "learning_rate": 2.3587670013020024e-05, "loss": 0.235, "step": 4090 }, { "epoch": 3.8844054118205555, "grad_norm": 0.049732983112335205, "learning_rate": 2.3549192681050336e-05, "loss": 0.24, "step": 4091 }, { "epoch": 3.8853548540232614, "grad_norm": 0.05987070873379707, "learning_rate": 2.3510742568888656e-05, "loss": 0.2525, "step": 4092 }, { "epoch": 3.8863042962259673, "grad_norm": 0.05327129364013672, "learning_rate": 2.3472319690224886e-05, "loss": 0.2534, "step": 4093 }, { "epoch": 3.887253738428673, "grad_norm": 0.059572651982307434, "learning_rate": 2.3433924058739233e-05, "loss": 0.2512, "step": 4094 }, { "epoch": 3.888203180631379, "grad_norm": 0.06076107546687126, "learning_rate": 2.339555568810221e-05, "loss": 0.2539, "step": 4095 }, { "epoch": 3.889152622834085, "grad_norm": 0.04622993990778923, "learning_rate": 2.335721459197462e-05, "loss": 0.2533, "step": 4096 }, { "epoch": 3.890102065036791, "grad_norm": 0.05739247426390648, "learning_rate": 2.3318900784007524e-05, "loss": 0.2599, "step": 4097 }, { "epoch": 3.8910515072394967, "grad_norm": 0.0760936364531517, "learning_rate": 2.3280614277842382e-05, "loss": 0.2499, "step": 4098 }, { "epoch": 3.8920009494422025, "grad_norm": 0.053201522678136826, "learning_rate": 2.32423550871108e-05, "loss": 0.2489, "step": 4099 }, { "epoch": 3.8929503916449084, "grad_norm": 0.06175504997372627, "learning_rate": 2.3204123225434715e-05, "loss": 0.2738, "step": 4100 }, { "epoch": 3.8938998338476143, "grad_norm": 0.05046062543988228, "learning_rate": 2.316591870642635e-05, "loss": 0.2543, "step": 4101 }, { "epoch": 3.89484927605032, "grad_norm": 0.14622515439987183, "learning_rate": 2.312774154368812e-05, "loss": 0.2286, "step": 4102 }, { "epoch": 3.8957987182530265, "grad_norm": 0.04975195601582527, "learning_rate": 2.3089591750812846e-05, "loss": 0.2512, "step": 4103 }, { "epoch": 3.8967481604557324, "grad_norm": 0.04820247367024422, "learning_rate": 2.3051469341383402e-05, "loss": 0.2512, "step": 4104 }, { "epoch": 3.8976976026584382, "grad_norm": 0.07370350509881973, "learning_rate": 2.3013374328973114e-05, "loss": 0.246, "step": 4105 }, { "epoch": 3.898647044861144, "grad_norm": 0.054529186338186264, "learning_rate": 2.2975306727145418e-05, "loss": 0.2547, "step": 4106 }, { "epoch": 3.89959648706385, "grad_norm": 0.05046022683382034, "learning_rate": 2.293726654945402e-05, "loss": 0.2458, "step": 4107 }, { "epoch": 3.900545929266556, "grad_norm": 0.057758230715990067, "learning_rate": 2.2899253809442944e-05, "loss": 0.2657, "step": 4108 }, { "epoch": 3.9014953714692617, "grad_norm": 0.09853757172822952, "learning_rate": 2.2861268520646274e-05, "loss": 0.235, "step": 4109 }, { "epoch": 3.9024448136719676, "grad_norm": 0.05186276137828827, "learning_rate": 2.2823310696588494e-05, "loss": 0.2539, "step": 4110 }, { "epoch": 3.9033942558746735, "grad_norm": 0.053207337856292725, "learning_rate": 2.2785380350784237e-05, "loss": 0.2451, "step": 4111 }, { "epoch": 3.90434369807738, "grad_norm": 0.07637642323970795, "learning_rate": 2.2747477496738334e-05, "loss": 0.2699, "step": 4112 }, { "epoch": 3.9052931402800857, "grad_norm": 0.10490339994430542, "learning_rate": 2.270960214794584e-05, "loss": 0.2484, "step": 4113 }, { "epoch": 3.9062425824827915, "grad_norm": 0.07953619956970215, "learning_rate": 2.2671754317892013e-05, "loss": 0.2624, "step": 4114 }, { "epoch": 3.9071920246854974, "grad_norm": 0.05667957291007042, "learning_rate": 2.2633934020052383e-05, "loss": 0.2618, "step": 4115 }, { "epoch": 3.9081414668882033, "grad_norm": 0.054316237568855286, "learning_rate": 2.2596141267892568e-05, "loss": 0.2555, "step": 4116 }, { "epoch": 3.909090909090909, "grad_norm": 0.04946750029921532, "learning_rate": 2.2558376074868448e-05, "loss": 0.2478, "step": 4117 }, { "epoch": 3.910040351293615, "grad_norm": 0.06284037977457047, "learning_rate": 2.2520638454426068e-05, "loss": 0.2634, "step": 4118 }, { "epoch": 3.910989793496321, "grad_norm": 0.07433723658323288, "learning_rate": 2.2482928420001657e-05, "loss": 0.2628, "step": 4119 }, { "epoch": 3.911939235699027, "grad_norm": 0.07572853565216064, "learning_rate": 2.2445245985021614e-05, "loss": 0.237, "step": 4120 }, { "epoch": 3.9128886779017327, "grad_norm": 0.0872272476553917, "learning_rate": 2.2407591162902573e-05, "loss": 0.2529, "step": 4121 }, { "epoch": 3.9138381201044385, "grad_norm": 0.05421094223856926, "learning_rate": 2.23699639670512e-05, "loss": 0.2445, "step": 4122 }, { "epoch": 3.9147875623071444, "grad_norm": 0.05467075854539871, "learning_rate": 2.2332364410864493e-05, "loss": 0.2589, "step": 4123 }, { "epoch": 3.9157370045098503, "grad_norm": 0.10132791101932526, "learning_rate": 2.229479250772949e-05, "loss": 0.2544, "step": 4124 }, { "epoch": 3.916686446712556, "grad_norm": 0.06448742747306824, "learning_rate": 2.2257248271023423e-05, "loss": 0.254, "step": 4125 }, { "epoch": 3.917635888915262, "grad_norm": 0.06296240538358688, "learning_rate": 2.221973171411367e-05, "loss": 0.2405, "step": 4126 }, { "epoch": 3.918585331117968, "grad_norm": 0.062131330370903015, "learning_rate": 2.218224285035774e-05, "loss": 0.2573, "step": 4127 }, { "epoch": 3.9195347733206742, "grad_norm": 0.07281263172626495, "learning_rate": 2.2144781693103357e-05, "loss": 0.2509, "step": 4128 }, { "epoch": 3.92048421552338, "grad_norm": 0.05253308266401291, "learning_rate": 2.210734825568822e-05, "loss": 0.2506, "step": 4129 }, { "epoch": 3.921433657726086, "grad_norm": 0.05333153158426285, "learning_rate": 2.2069942551440358e-05, "loss": 0.2614, "step": 4130 }, { "epoch": 3.922383099928792, "grad_norm": 0.055194880813360214, "learning_rate": 2.2032564593677774e-05, "loss": 0.2503, "step": 4131 }, { "epoch": 3.9233325421314977, "grad_norm": 0.05564767122268677, "learning_rate": 2.199521439570863e-05, "loss": 0.2464, "step": 4132 }, { "epoch": 3.9242819843342036, "grad_norm": 0.05214362591505051, "learning_rate": 2.1957891970831302e-05, "loss": 0.2367, "step": 4133 }, { "epoch": 3.9252314265369095, "grad_norm": 0.06188051030039787, "learning_rate": 2.192059733233408e-05, "loss": 0.247, "step": 4134 }, { "epoch": 3.9261808687396154, "grad_norm": 0.0542658269405365, "learning_rate": 2.1883330493495557e-05, "loss": 0.2525, "step": 4135 }, { "epoch": 3.9271303109423212, "grad_norm": 0.05622916668653488, "learning_rate": 2.1846091467584318e-05, "loss": 0.2442, "step": 4136 }, { "epoch": 3.9280797531450276, "grad_norm": 0.05223899707198143, "learning_rate": 2.1808880267859078e-05, "loss": 0.2414, "step": 4137 }, { "epoch": 3.9290291953477334, "grad_norm": 0.051392000168561935, "learning_rate": 2.177169690756864e-05, "loss": 0.2443, "step": 4138 }, { "epoch": 3.9299786375504393, "grad_norm": 0.053416091948747635, "learning_rate": 2.1734541399951857e-05, "loss": 0.2491, "step": 4139 }, { "epoch": 3.930928079753145, "grad_norm": 0.055155426263809204, "learning_rate": 2.1697413758237784e-05, "loss": 0.2528, "step": 4140 }, { "epoch": 3.931877521955851, "grad_norm": 0.0750957801938057, "learning_rate": 2.166031399564542e-05, "loss": 0.2482, "step": 4141 }, { "epoch": 3.932826964158557, "grad_norm": 0.049310583621263504, "learning_rate": 2.1623242125383903e-05, "loss": 0.249, "step": 4142 }, { "epoch": 3.933776406361263, "grad_norm": 0.054114725440740585, "learning_rate": 2.1586198160652427e-05, "loss": 0.2475, "step": 4143 }, { "epoch": 3.9347258485639687, "grad_norm": 0.05737404525279999, "learning_rate": 2.1549182114640252e-05, "loss": 0.2535, "step": 4144 }, { "epoch": 3.9356752907666746, "grad_norm": 0.0634528249502182, "learning_rate": 2.1512194000526676e-05, "loss": 0.2156, "step": 4145 }, { "epoch": 3.9366247329693804, "grad_norm": 0.07257208228111267, "learning_rate": 2.1475233831481122e-05, "loss": 0.2578, "step": 4146 }, { "epoch": 3.9375741751720863, "grad_norm": 0.056972626596689224, "learning_rate": 2.1438301620662993e-05, "loss": 0.2551, "step": 4147 }, { "epoch": 3.938523617374792, "grad_norm": 0.05729486420750618, "learning_rate": 2.1401397381221767e-05, "loss": 0.2744, "step": 4148 }, { "epoch": 3.939473059577498, "grad_norm": 0.05070596560835838, "learning_rate": 2.136452112629693e-05, "loss": 0.2602, "step": 4149 }, { "epoch": 3.940422501780204, "grad_norm": 0.046729620546102524, "learning_rate": 2.1327672869018032e-05, "loss": 0.2518, "step": 4150 }, { "epoch": 3.94137194398291, "grad_norm": 0.048859477043151855, "learning_rate": 2.1290852622504732e-05, "loss": 0.245, "step": 4151 }, { "epoch": 3.942321386185616, "grad_norm": 0.05489668622612953, "learning_rate": 2.1254060399866505e-05, "loss": 0.2588, "step": 4152 }, { "epoch": 3.943270828388322, "grad_norm": 0.04978334531188011, "learning_rate": 2.1217296214203086e-05, "loss": 0.2504, "step": 4153 }, { "epoch": 3.944220270591028, "grad_norm": 0.056370414793491364, "learning_rate": 2.1180560078604074e-05, "loss": 0.2503, "step": 4154 }, { "epoch": 3.9451697127937337, "grad_norm": 0.0500507652759552, "learning_rate": 2.1143852006149122e-05, "loss": 0.2426, "step": 4155 }, { "epoch": 3.9461191549964396, "grad_norm": 0.05014021322131157, "learning_rate": 2.110717200990797e-05, "loss": 0.2587, "step": 4156 }, { "epoch": 3.9470685971991455, "grad_norm": 0.051097285002470016, "learning_rate": 2.1070520102940184e-05, "loss": 0.2516, "step": 4157 }, { "epoch": 3.9480180394018514, "grad_norm": 0.08694472163915634, "learning_rate": 2.1033896298295508e-05, "loss": 0.2643, "step": 4158 }, { "epoch": 3.9489674816045572, "grad_norm": 0.08210060745477676, "learning_rate": 2.0997300609013592e-05, "loss": 0.2264, "step": 4159 }, { "epoch": 3.949916923807263, "grad_norm": 0.05755892023444176, "learning_rate": 2.0960733048124083e-05, "loss": 0.2692, "step": 4160 }, { "epoch": 3.950866366009969, "grad_norm": 0.050832442939281464, "learning_rate": 2.0924193628646626e-05, "loss": 0.2536, "step": 4161 }, { "epoch": 3.9518158082126753, "grad_norm": 0.060199491679668427, "learning_rate": 2.088768236359081e-05, "loss": 0.2571, "step": 4162 }, { "epoch": 3.952765250415381, "grad_norm": 0.04771970584988594, "learning_rate": 2.085119926595628e-05, "loss": 0.2531, "step": 4163 }, { "epoch": 3.953714692618087, "grad_norm": 0.05692679435014725, "learning_rate": 2.0814744348732595e-05, "loss": 0.2545, "step": 4164 }, { "epoch": 3.954664134820793, "grad_norm": 0.05149425193667412, "learning_rate": 2.077831762489927e-05, "loss": 0.2571, "step": 4165 }, { "epoch": 3.955613577023499, "grad_norm": 0.05007016658782959, "learning_rate": 2.074191910742581e-05, "loss": 0.2602, "step": 4166 }, { "epoch": 3.9565630192262047, "grad_norm": 0.04917161166667938, "learning_rate": 2.0705548809271658e-05, "loss": 0.2603, "step": 4167 }, { "epoch": 3.9575124614289106, "grad_norm": 0.07422235608100891, "learning_rate": 2.0669206743386216e-05, "loss": 0.2713, "step": 4168 }, { "epoch": 3.9584619036316164, "grad_norm": 0.05925751104950905, "learning_rate": 2.0632892922708892e-05, "loss": 0.2671, "step": 4169 }, { "epoch": 3.9594113458343223, "grad_norm": 0.053712397813797, "learning_rate": 2.0596607360168897e-05, "loss": 0.2666, "step": 4170 }, { "epoch": 3.960360788037028, "grad_norm": 0.05303372070193291, "learning_rate": 2.056035006868553e-05, "loss": 0.2433, "step": 4171 }, { "epoch": 3.961310230239734, "grad_norm": 0.079205721616745, "learning_rate": 2.0524121061167945e-05, "loss": 0.26, "step": 4172 }, { "epoch": 3.96225967244244, "grad_norm": 0.05416445806622505, "learning_rate": 2.0487920350515212e-05, "loss": 0.2476, "step": 4173 }, { "epoch": 3.963209114645146, "grad_norm": 0.06397967040538788, "learning_rate": 2.045174794961644e-05, "loss": 0.239, "step": 4174 }, { "epoch": 3.9641585568478517, "grad_norm": 0.09109178930521011, "learning_rate": 2.0415603871350473e-05, "loss": 0.2594, "step": 4175 }, { "epoch": 3.9651079990505576, "grad_norm": 0.07438705861568451, "learning_rate": 2.0379488128586243e-05, "loss": 0.2526, "step": 4176 }, { "epoch": 3.966057441253264, "grad_norm": 0.051279328763484955, "learning_rate": 2.0343400734182493e-05, "loss": 0.249, "step": 4177 }, { "epoch": 3.9670068834559697, "grad_norm": 0.08495932817459106, "learning_rate": 2.0307341700987892e-05, "loss": 0.2366, "step": 4178 }, { "epoch": 3.9679563256586756, "grad_norm": 0.051565755158662796, "learning_rate": 2.027131104184108e-05, "loss": 0.2477, "step": 4179 }, { "epoch": 3.9689057678613815, "grad_norm": 0.058595266193151474, "learning_rate": 2.023530876957045e-05, "loss": 0.2402, "step": 4180 }, { "epoch": 3.9698552100640874, "grad_norm": 0.06185249239206314, "learning_rate": 2.0199334896994448e-05, "loss": 0.2556, "step": 4181 }, { "epoch": 3.9708046522667932, "grad_norm": 0.053781840950250626, "learning_rate": 2.016338943692131e-05, "loss": 0.2568, "step": 4182 }, { "epoch": 3.971754094469499, "grad_norm": 0.05680249631404877, "learning_rate": 2.0127472402149173e-05, "loss": 0.2551, "step": 4183 }, { "epoch": 3.972703536672205, "grad_norm": 0.053001519292593, "learning_rate": 2.0091583805466075e-05, "loss": 0.2521, "step": 4184 }, { "epoch": 3.973652978874911, "grad_norm": 0.061895888298749924, "learning_rate": 2.0055723659649904e-05, "loss": 0.2529, "step": 4185 }, { "epoch": 3.9746024210776167, "grad_norm": 0.07784521579742432, "learning_rate": 2.0019891977468408e-05, "loss": 0.263, "step": 4186 }, { "epoch": 3.975551863280323, "grad_norm": 0.060105256736278534, "learning_rate": 1.9984088771679264e-05, "loss": 0.2542, "step": 4187 }, { "epoch": 3.976501305483029, "grad_norm": 0.07342025637626648, "learning_rate": 1.994831405502996e-05, "loss": 0.232, "step": 4188 }, { "epoch": 3.977450747685735, "grad_norm": 0.058840759098529816, "learning_rate": 1.9912567840257845e-05, "loss": 0.2581, "step": 4189 }, { "epoch": 3.9784001898884407, "grad_norm": 0.15697553753852844, "learning_rate": 1.9876850140090108e-05, "loss": 0.254, "step": 4190 }, { "epoch": 3.9793496320911466, "grad_norm": 0.05611603334546089, "learning_rate": 1.98411609672438e-05, "loss": 0.2435, "step": 4191 }, { "epoch": 3.9802990742938524, "grad_norm": 0.05032116547226906, "learning_rate": 1.9805500334425876e-05, "loss": 0.2561, "step": 4192 }, { "epoch": 3.9812485164965583, "grad_norm": 0.08890901505947113, "learning_rate": 1.976986825433297e-05, "loss": 0.2368, "step": 4193 }, { "epoch": 3.982197958699264, "grad_norm": 0.08615860342979431, "learning_rate": 1.973426473965172e-05, "loss": 0.2725, "step": 4194 }, { "epoch": 3.98314740090197, "grad_norm": 0.06632962077856064, "learning_rate": 1.9698689803058522e-05, "loss": 0.2554, "step": 4195 }, { "epoch": 3.984096843104676, "grad_norm": 0.09921465814113617, "learning_rate": 1.9663143457219558e-05, "loss": 0.2575, "step": 4196 }, { "epoch": 3.985046285307382, "grad_norm": 0.048173487186431885, "learning_rate": 1.962762571479094e-05, "loss": 0.2495, "step": 4197 }, { "epoch": 3.9859957275100877, "grad_norm": 0.06488333642482758, "learning_rate": 1.959213658841844e-05, "loss": 0.239, "step": 4198 }, { "epoch": 3.9869451697127936, "grad_norm": 0.05705002322793007, "learning_rate": 1.95566760907378e-05, "loss": 0.2478, "step": 4199 }, { "epoch": 3.9878946119154994, "grad_norm": 0.05279826000332832, "learning_rate": 1.952124423437447e-05, "loss": 0.2512, "step": 4200 }, { "epoch": 3.9888440541182053, "grad_norm": 0.05641533434391022, "learning_rate": 1.948584103194373e-05, "loss": 0.26, "step": 4201 }, { "epoch": 3.9897934963209116, "grad_norm": 0.07074812054634094, "learning_rate": 1.9450466496050656e-05, "loss": 0.244, "step": 4202 }, { "epoch": 3.9907429385236175, "grad_norm": 0.054319318383932114, "learning_rate": 1.9415120639290085e-05, "loss": 0.2544, "step": 4203 }, { "epoch": 3.9916923807263234, "grad_norm": 0.04937317222356796, "learning_rate": 1.937980347424675e-05, "loss": 0.2481, "step": 4204 }, { "epoch": 3.9926418229290292, "grad_norm": 0.07532598078250885, "learning_rate": 1.934451501349507e-05, "loss": 0.2734, "step": 4205 }, { "epoch": 3.993591265131735, "grad_norm": 0.06492338329553604, "learning_rate": 1.9309255269599235e-05, "loss": 0.2531, "step": 4206 }, { "epoch": 3.994540707334441, "grad_norm": 0.05689654126763344, "learning_rate": 1.9274024255113287e-05, "loss": 0.2523, "step": 4207 }, { "epoch": 3.995490149537147, "grad_norm": 0.05884096398949623, "learning_rate": 1.9238821982580967e-05, "loss": 0.2512, "step": 4208 }, { "epoch": 3.9964395917398527, "grad_norm": 0.0537344291806221, "learning_rate": 1.9203648464535818e-05, "loss": 0.2423, "step": 4209 }, { "epoch": 3.9973890339425586, "grad_norm": 0.0568530336022377, "learning_rate": 1.9168503713501184e-05, "loss": 0.2637, "step": 4210 }, { "epoch": 3.9983384761452645, "grad_norm": 0.06064499542117119, "learning_rate": 1.913338774199004e-05, "loss": 0.2562, "step": 4211 }, { "epoch": 3.999287918347971, "grad_norm": 0.05399147793650627, "learning_rate": 1.9098300562505266e-05, "loss": 0.2424, "step": 4212 }, { "epoch": 4.000237360550677, "grad_norm": 0.05513492599129677, "learning_rate": 1.90632421875394e-05, "loss": 0.2563, "step": 4213 }, { "epoch": 4.001186802753383, "grad_norm": 0.08468678593635559, "learning_rate": 1.9028212629574726e-05, "loss": 0.2547, "step": 4214 }, { "epoch": 4.002136244956088, "grad_norm": 0.054223205894231796, "learning_rate": 1.8993211901083353e-05, "loss": 0.2209, "step": 4215 }, { "epoch": 4.003085687158794, "grad_norm": 0.060239773243665695, "learning_rate": 1.895824001452696e-05, "loss": 0.2468, "step": 4216 }, { "epoch": 4.0040351293615, "grad_norm": 0.12123169749975204, "learning_rate": 1.892329698235715e-05, "loss": 0.2172, "step": 4217 }, { "epoch": 4.004984571564206, "grad_norm": 0.05851219221949577, "learning_rate": 1.8888382817015117e-05, "loss": 0.2243, "step": 4218 }, { "epoch": 4.005934013766912, "grad_norm": 0.058043915778398514, "learning_rate": 1.8853497530931795e-05, "loss": 0.2403, "step": 4219 }, { "epoch": 4.006883455969618, "grad_norm": 0.06909014284610748, "learning_rate": 1.881864113652796e-05, "loss": 0.2411, "step": 4220 }, { "epoch": 4.007832898172324, "grad_norm": 0.057406459003686905, "learning_rate": 1.8783813646213867e-05, "loss": 0.2532, "step": 4221 }, { "epoch": 4.00878234037503, "grad_norm": 0.06199433654546738, "learning_rate": 1.874901507238972e-05, "loss": 0.2478, "step": 4222 }, { "epoch": 4.009731782577735, "grad_norm": 0.13553054630756378, "learning_rate": 1.8714245427445278e-05, "loss": 0.2485, "step": 4223 }, { "epoch": 4.010681224780441, "grad_norm": 0.07412765920162201, "learning_rate": 1.8679504723760055e-05, "loss": 0.2234, "step": 4224 }, { "epoch": 4.011630666983147, "grad_norm": 0.05949070304632187, "learning_rate": 1.864479297370325e-05, "loss": 0.2442, "step": 4225 }, { "epoch": 4.012580109185853, "grad_norm": 0.07053567469120026, "learning_rate": 1.8610110189633757e-05, "loss": 0.2348, "step": 4226 }, { "epoch": 4.013529551388559, "grad_norm": 0.07391881942749023, "learning_rate": 1.8575456383900114e-05, "loss": 0.2482, "step": 4227 }, { "epoch": 4.014478993591265, "grad_norm": 0.13988161087036133, "learning_rate": 1.8540831568840644e-05, "loss": 0.235, "step": 4228 }, { "epoch": 4.015428435793971, "grad_norm": 0.073545902967453, "learning_rate": 1.8506235756783262e-05, "loss": 0.2307, "step": 4229 }, { "epoch": 4.016377877996677, "grad_norm": 0.06786803901195526, "learning_rate": 1.8471668960045574e-05, "loss": 0.2421, "step": 4230 }, { "epoch": 4.017327320199382, "grad_norm": 0.07447106391191483, "learning_rate": 1.843713119093485e-05, "loss": 0.2507, "step": 4231 }, { "epoch": 4.018276762402089, "grad_norm": 0.06786752492189407, "learning_rate": 1.840262246174803e-05, "loss": 0.2211, "step": 4232 }, { "epoch": 4.019226204604795, "grad_norm": 0.07335960865020752, "learning_rate": 1.836814278477179e-05, "loss": 0.2297, "step": 4233 }, { "epoch": 4.020175646807501, "grad_norm": 0.08651788532733917, "learning_rate": 1.8333692172282292e-05, "loss": 0.2484, "step": 4234 }, { "epoch": 4.021125089010207, "grad_norm": 0.0822267234325409, "learning_rate": 1.8299270636545518e-05, "loss": 0.2353, "step": 4235 }, { "epoch": 4.022074531212913, "grad_norm": 0.06862235814332962, "learning_rate": 1.8264878189817002e-05, "loss": 0.2466, "step": 4236 }, { "epoch": 4.023023973415619, "grad_norm": 0.09110434353351593, "learning_rate": 1.823051484434195e-05, "loss": 0.2476, "step": 4237 }, { "epoch": 4.0239734156183244, "grad_norm": 0.06781786680221558, "learning_rate": 1.819618061235525e-05, "loss": 0.2438, "step": 4238 }, { "epoch": 4.02492285782103, "grad_norm": 0.06978391855955124, "learning_rate": 1.8161875506081293e-05, "loss": 0.2524, "step": 4239 }, { "epoch": 4.025872300023736, "grad_norm": 0.07549986243247986, "learning_rate": 1.8127599537734296e-05, "loss": 0.2371, "step": 4240 }, { "epoch": 4.026821742226442, "grad_norm": 0.10006662458181381, "learning_rate": 1.8093352719517874e-05, "loss": 0.2396, "step": 4241 }, { "epoch": 4.027771184429148, "grad_norm": 0.0820450708270073, "learning_rate": 1.8059135063625477e-05, "loss": 0.2416, "step": 4242 }, { "epoch": 4.028720626631854, "grad_norm": 0.08785741031169891, "learning_rate": 1.8024946582240033e-05, "loss": 0.2359, "step": 4243 }, { "epoch": 4.02967006883456, "grad_norm": 0.09228499233722687, "learning_rate": 1.7990787287534104e-05, "loss": 0.2466, "step": 4244 }, { "epoch": 4.030619511037266, "grad_norm": 0.0822257474064827, "learning_rate": 1.795665719166997e-05, "loss": 0.2452, "step": 4245 }, { "epoch": 4.031568953239971, "grad_norm": 0.06719242036342621, "learning_rate": 1.792255630679931e-05, "loss": 0.2441, "step": 4246 }, { "epoch": 4.032518395442677, "grad_norm": 0.1547231525182724, "learning_rate": 1.788848464506362e-05, "loss": 0.2547, "step": 4247 }, { "epoch": 4.033467837645383, "grad_norm": 0.08464305847883224, "learning_rate": 1.7854442218593838e-05, "loss": 0.2151, "step": 4248 }, { "epoch": 4.034417279848089, "grad_norm": 0.06678024679422379, "learning_rate": 1.7820429039510566e-05, "loss": 0.2468, "step": 4249 }, { "epoch": 4.035366722050795, "grad_norm": 0.06837774068117142, "learning_rate": 1.7786445119923968e-05, "loss": 0.2386, "step": 4250 }, { "epoch": 4.036316164253501, "grad_norm": 0.067479707300663, "learning_rate": 1.775249047193377e-05, "loss": 0.2334, "step": 4251 }, { "epoch": 4.037265606456207, "grad_norm": 0.07563888281583786, "learning_rate": 1.7718565107629347e-05, "loss": 0.2449, "step": 4252 }, { "epoch": 4.038215048658913, "grad_norm": 0.06924168020486832, "learning_rate": 1.7684669039089587e-05, "loss": 0.2456, "step": 4253 }, { "epoch": 4.039164490861618, "grad_norm": 0.0730455070734024, "learning_rate": 1.765080227838295e-05, "loss": 0.2498, "step": 4254 }, { "epoch": 4.040113933064324, "grad_norm": 0.11271868646144867, "learning_rate": 1.7616964837567495e-05, "loss": 0.2054, "step": 4255 }, { "epoch": 4.04106337526703, "grad_norm": 0.0824706181883812, "learning_rate": 1.7583156728690787e-05, "loss": 0.2515, "step": 4256 }, { "epoch": 4.042012817469737, "grad_norm": 0.09537190198898315, "learning_rate": 1.7549377963789994e-05, "loss": 0.242, "step": 4257 }, { "epoch": 4.042962259672443, "grad_norm": 0.07382145524024963, "learning_rate": 1.7515628554891862e-05, "loss": 0.2486, "step": 4258 }, { "epoch": 4.043911701875149, "grad_norm": 0.08604732155799866, "learning_rate": 1.748190851401258e-05, "loss": 0.2348, "step": 4259 }, { "epoch": 4.044861144077855, "grad_norm": 0.0709012970328331, "learning_rate": 1.7448217853157998e-05, "loss": 0.2501, "step": 4260 }, { "epoch": 4.0458105862805605, "grad_norm": 0.07132518291473389, "learning_rate": 1.741455658432344e-05, "loss": 0.2423, "step": 4261 }, { "epoch": 4.046760028483266, "grad_norm": 0.07555680721998215, "learning_rate": 1.738092471949375e-05, "loss": 0.2445, "step": 4262 }, { "epoch": 4.047709470685972, "grad_norm": 0.07572092860937119, "learning_rate": 1.7347322270643418e-05, "loss": 0.2493, "step": 4263 }, { "epoch": 4.048658912888678, "grad_norm": 0.06770916283130646, "learning_rate": 1.7313749249736267e-05, "loss": 0.2466, "step": 4264 }, { "epoch": 4.049608355091384, "grad_norm": 0.07170511037111282, "learning_rate": 1.728020566872581e-05, "loss": 0.2072, "step": 4265 }, { "epoch": 4.05055779729409, "grad_norm": 0.07005757093429565, "learning_rate": 1.7246691539555028e-05, "loss": 0.2419, "step": 4266 } ], "logging_steps": 1, "max_steps": 5265, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 158, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4755390979171287e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }