diff --git "a/checkpoint-2212/trainer_state.json" "b/checkpoint-2212/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2212/trainer_state.json" @@ -0,0 +1,15525 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.10040351293615, + "eval_steps": 500, + "global_step": 2212, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009494422027059103, + "grad_norm": 5.364119052886963, + "learning_rate": 6.329113924050633e-07, + "loss": 2.9616, + "step": 1 + }, + { + "epoch": 0.0018988844054118206, + "grad_norm": 5.426063060760498, + "learning_rate": 1.2658227848101265e-06, + "loss": 2.9664, + "step": 2 + }, + { + "epoch": 0.0028483266081177306, + "grad_norm": 5.507386684417725, + "learning_rate": 1.8987341772151901e-06, + "loss": 2.96, + "step": 3 + }, + { + "epoch": 0.0037977688108236413, + "grad_norm": 5.467552185058594, + "learning_rate": 2.531645569620253e-06, + "loss": 2.975, + "step": 4 + }, + { + "epoch": 0.004747211013529551, + "grad_norm": 5.386384963989258, + "learning_rate": 3.1645569620253167e-06, + "loss": 2.9473, + "step": 5 + }, + { + "epoch": 0.005696653216235461, + "grad_norm": 4.9166951179504395, + "learning_rate": 3.7974683544303802e-06, + "loss": 2.8597, + "step": 6 + }, + { + "epoch": 0.006646095418941372, + "grad_norm": 5.469020843505859, + "learning_rate": 4.430379746835443e-06, + "loss": 2.9633, + "step": 7 + }, + { + "epoch": 0.0075955376216472826, + "grad_norm": 5.380453586578369, + "learning_rate": 5.063291139240506e-06, + "loss": 2.8931, + "step": 8 + }, + { + "epoch": 0.008544979824353193, + "grad_norm": 4.922253131866455, + "learning_rate": 5.69620253164557e-06, + "loss": 2.7452, + "step": 9 + }, + { + "epoch": 0.009494422027059102, + "grad_norm": 5.517508029937744, + "learning_rate": 6.329113924050633e-06, + "loss": 2.8326, + "step": 10 + }, + { + "epoch": 0.010443864229765013, + "grad_norm": 5.810976982116699, + "learning_rate": 6.9620253164556965e-06, + "loss": 2.7854, + "step": 11 + }, + { + "epoch": 0.011393306432470923, + "grad_norm": 5.690661430358887, + "learning_rate": 7.5949367088607605e-06, + "loss": 2.7069, + "step": 12 + }, + { + "epoch": 0.012342748635176834, + "grad_norm": 5.994122505187988, + "learning_rate": 8.227848101265822e-06, + "loss": 2.5705, + "step": 13 + }, + { + "epoch": 0.013292190837882745, + "grad_norm": 5.86803674697876, + "learning_rate": 8.860759493670886e-06, + "loss": 2.4461, + "step": 14 + }, + { + "epoch": 0.014241633040588654, + "grad_norm": 5.448781490325928, + "learning_rate": 9.49367088607595e-06, + "loss": 2.2408, + "step": 15 + }, + { + "epoch": 0.015191075243294565, + "grad_norm": 6.4004902839660645, + "learning_rate": 1.0126582278481012e-05, + "loss": 2.1205, + "step": 16 + }, + { + "epoch": 0.016140517446000476, + "grad_norm": 6.970590591430664, + "learning_rate": 1.0759493670886076e-05, + "loss": 1.9474, + "step": 17 + }, + { + "epoch": 0.017089959648706386, + "grad_norm": 7.423785209655762, + "learning_rate": 1.139240506329114e-05, + "loss": 1.7348, + "step": 18 + }, + { + "epoch": 0.018039401851412295, + "grad_norm": 7.429481029510498, + "learning_rate": 1.2025316455696203e-05, + "loss": 1.4835, + "step": 19 + }, + { + "epoch": 0.018988844054118204, + "grad_norm": 6.7193284034729, + "learning_rate": 1.2658227848101267e-05, + "loss": 1.2373, + "step": 20 + }, + { + "epoch": 0.019938286256824117, + "grad_norm": 4.46099853515625, + "learning_rate": 1.3291139240506329e-05, + "loss": 1.1095, + "step": 21 + }, + { + "epoch": 0.020887728459530026, + "grad_norm": 3.001573085784912, + "learning_rate": 1.3924050632911393e-05, + "loss": 0.8642, + "step": 22 + }, + { + "epoch": 0.021837170662235936, + "grad_norm": 2.197000026702881, + "learning_rate": 1.4556962025316457e-05, + "loss": 0.7734, + "step": 23 + }, + { + "epoch": 0.022786612864941845, + "grad_norm": 1.8113943338394165, + "learning_rate": 1.5189873417721521e-05, + "loss": 0.7341, + "step": 24 + }, + { + "epoch": 0.023736055067647758, + "grad_norm": 1.7461305856704712, + "learning_rate": 1.5822784810126583e-05, + "loss": 0.6743, + "step": 25 + }, + { + "epoch": 0.024685497270353667, + "grad_norm": 1.3315849304199219, + "learning_rate": 1.6455696202531644e-05, + "loss": 0.5975, + "step": 26 + }, + { + "epoch": 0.025634939473059577, + "grad_norm": 0.726314127445221, + "learning_rate": 1.7088607594936708e-05, + "loss": 0.5659, + "step": 27 + }, + { + "epoch": 0.02658438167576549, + "grad_norm": 0.6269010901451111, + "learning_rate": 1.7721518987341772e-05, + "loss": 0.5226, + "step": 28 + }, + { + "epoch": 0.0275338238784714, + "grad_norm": 0.5819966197013855, + "learning_rate": 1.8354430379746836e-05, + "loss": 0.6132, + "step": 29 + }, + { + "epoch": 0.028483266081177308, + "grad_norm": 0.6247850060462952, + "learning_rate": 1.89873417721519e-05, + "loss": 0.5, + "step": 30 + }, + { + "epoch": 0.029432708283883217, + "grad_norm": 0.702621579170227, + "learning_rate": 1.962025316455696e-05, + "loss": 0.4958, + "step": 31 + }, + { + "epoch": 0.03038215048658913, + "grad_norm": 0.6045309901237488, + "learning_rate": 2.0253164556962025e-05, + "loss": 0.4405, + "step": 32 + }, + { + "epoch": 0.031331592689295036, + "grad_norm": 0.5436626076698303, + "learning_rate": 2.088607594936709e-05, + "loss": 0.5607, + "step": 33 + }, + { + "epoch": 0.03228103489200095, + "grad_norm": 0.43146297335624695, + "learning_rate": 2.1518987341772153e-05, + "loss": 0.3987, + "step": 34 + }, + { + "epoch": 0.03323047709470686, + "grad_norm": 0.5124548673629761, + "learning_rate": 2.2151898734177217e-05, + "loss": 0.5084, + "step": 35 + }, + { + "epoch": 0.03417991929741277, + "grad_norm": 0.4466649293899536, + "learning_rate": 2.278481012658228e-05, + "loss": 0.3761, + "step": 36 + }, + { + "epoch": 0.03512936150011868, + "grad_norm": 0.41221529245376587, + "learning_rate": 2.341772151898734e-05, + "loss": 0.3859, + "step": 37 + }, + { + "epoch": 0.03607880370282459, + "grad_norm": 0.3802257180213928, + "learning_rate": 2.4050632911392405e-05, + "loss": 0.3447, + "step": 38 + }, + { + "epoch": 0.0370282459055305, + "grad_norm": 0.47727710008621216, + "learning_rate": 2.468354430379747e-05, + "loss": 0.3914, + "step": 39 + }, + { + "epoch": 0.03797768810823641, + "grad_norm": 0.41048529744148254, + "learning_rate": 2.5316455696202533e-05, + "loss": 0.2988, + "step": 40 + }, + { + "epoch": 0.038927130310942325, + "grad_norm": 0.5019667744636536, + "learning_rate": 2.5949367088607597e-05, + "loss": 0.2938, + "step": 41 + }, + { + "epoch": 0.039876572513648234, + "grad_norm": 0.42121732234954834, + "learning_rate": 2.6582278481012658e-05, + "loss": 0.2579, + "step": 42 + }, + { + "epoch": 0.04082601471635414, + "grad_norm": 0.4193897247314453, + "learning_rate": 2.7215189873417722e-05, + "loss": 0.3262, + "step": 43 + }, + { + "epoch": 0.04177545691906005, + "grad_norm": 0.2978931665420532, + "learning_rate": 2.7848101265822786e-05, + "loss": 0.2365, + "step": 44 + }, + { + "epoch": 0.04272489912176596, + "grad_norm": 0.34771448373794556, + "learning_rate": 2.848101265822785e-05, + "loss": 0.2364, + "step": 45 + }, + { + "epoch": 0.04367434132447187, + "grad_norm": 0.3881576955318451, + "learning_rate": 2.9113924050632914e-05, + "loss": 0.286, + "step": 46 + }, + { + "epoch": 0.04462378352717778, + "grad_norm": 0.33863797783851624, + "learning_rate": 2.9746835443037974e-05, + "loss": 0.2739, + "step": 47 + }, + { + "epoch": 0.04557322572988369, + "grad_norm": 0.2894616723060608, + "learning_rate": 3.0379746835443042e-05, + "loss": 0.2587, + "step": 48 + }, + { + "epoch": 0.046522667932589606, + "grad_norm": 0.22292694449424744, + "learning_rate": 3.10126582278481e-05, + "loss": 0.1861, + "step": 49 + }, + { + "epoch": 0.047472110135295516, + "grad_norm": 0.21907460689544678, + "learning_rate": 3.1645569620253167e-05, + "loss": 0.1755, + "step": 50 + }, + { + "epoch": 0.048421552338001425, + "grad_norm": 0.29593944549560547, + "learning_rate": 3.227848101265823e-05, + "loss": 0.1856, + "step": 51 + }, + { + "epoch": 0.049370994540707334, + "grad_norm": 0.23055657744407654, + "learning_rate": 3.291139240506329e-05, + "loss": 0.2102, + "step": 52 + }, + { + "epoch": 0.050320436743413244, + "grad_norm": 0.18929323554039001, + "learning_rate": 3.354430379746836e-05, + "loss": 0.1909, + "step": 53 + }, + { + "epoch": 0.05126987894611915, + "grad_norm": 0.15004883706569672, + "learning_rate": 3.4177215189873416e-05, + "loss": 0.1619, + "step": 54 + }, + { + "epoch": 0.05221932114882506, + "grad_norm": 0.15621644258499146, + "learning_rate": 3.4810126582278487e-05, + "loss": 0.1759, + "step": 55 + }, + { + "epoch": 0.05316876335153098, + "grad_norm": 0.16266578435897827, + "learning_rate": 3.5443037974683544e-05, + "loss": 0.1657, + "step": 56 + }, + { + "epoch": 0.05411820555423689, + "grad_norm": 0.14417718350887299, + "learning_rate": 3.607594936708861e-05, + "loss": 0.1698, + "step": 57 + }, + { + "epoch": 0.0550676477569428, + "grad_norm": 0.21402889490127563, + "learning_rate": 3.670886075949367e-05, + "loss": 0.2185, + "step": 58 + }, + { + "epoch": 0.05601708995964871, + "grad_norm": 0.1997889280319214, + "learning_rate": 3.7341772151898736e-05, + "loss": 0.2143, + "step": 59 + }, + { + "epoch": 0.056966532162354616, + "grad_norm": 0.13755086064338684, + "learning_rate": 3.79746835443038e-05, + "loss": 0.1677, + "step": 60 + }, + { + "epoch": 0.057915974365060525, + "grad_norm": 0.19304363429546356, + "learning_rate": 3.8607594936708864e-05, + "loss": 0.2113, + "step": 61 + }, + { + "epoch": 0.058865416567766435, + "grad_norm": 0.14066031575202942, + "learning_rate": 3.924050632911392e-05, + "loss": 0.1612, + "step": 62 + }, + { + "epoch": 0.059814858770472344, + "grad_norm": 0.13375213742256165, + "learning_rate": 3.987341772151899e-05, + "loss": 0.164, + "step": 63 + }, + { + "epoch": 0.06076430097317826, + "grad_norm": 0.15216922760009766, + "learning_rate": 4.050632911392405e-05, + "loss": 0.16, + "step": 64 + }, + { + "epoch": 0.06171374317588417, + "grad_norm": 0.16130389273166656, + "learning_rate": 4.113924050632912e-05, + "loss": 0.1957, + "step": 65 + }, + { + "epoch": 0.06266318537859007, + "grad_norm": 0.1791229248046875, + "learning_rate": 4.177215189873418e-05, + "loss": 0.1993, + "step": 66 + }, + { + "epoch": 0.06361262758129599, + "grad_norm": 0.11038907617330551, + "learning_rate": 4.240506329113924e-05, + "loss": 0.1517, + "step": 67 + }, + { + "epoch": 0.0645620697840019, + "grad_norm": 0.13327902555465698, + "learning_rate": 4.3037974683544305e-05, + "loss": 0.1501, + "step": 68 + }, + { + "epoch": 0.06551151198670781, + "grad_norm": 0.13731731474399567, + "learning_rate": 4.367088607594937e-05, + "loss": 0.1596, + "step": 69 + }, + { + "epoch": 0.06646095418941372, + "grad_norm": 0.13924308121204376, + "learning_rate": 4.430379746835443e-05, + "loss": 0.152, + "step": 70 + }, + { + "epoch": 0.06741039639211963, + "grad_norm": 0.1482289433479309, + "learning_rate": 4.49367088607595e-05, + "loss": 0.1536, + "step": 71 + }, + { + "epoch": 0.06835983859482554, + "grad_norm": 0.10759364813566208, + "learning_rate": 4.556962025316456e-05, + "loss": 0.1543, + "step": 72 + }, + { + "epoch": 0.06930928079753144, + "grad_norm": 0.12899678945541382, + "learning_rate": 4.6202531645569625e-05, + "loss": 0.165, + "step": 73 + }, + { + "epoch": 0.07025872300023736, + "grad_norm": 0.11689919233322144, + "learning_rate": 4.683544303797468e-05, + "loss": 0.1564, + "step": 74 + }, + { + "epoch": 0.07120816520294328, + "grad_norm": 0.12697139382362366, + "learning_rate": 4.7468354430379746e-05, + "loss": 0.162, + "step": 75 + }, + { + "epoch": 0.07215760740564918, + "grad_norm": 0.12069376558065414, + "learning_rate": 4.810126582278481e-05, + "loss": 0.1467, + "step": 76 + }, + { + "epoch": 0.0731070496083551, + "grad_norm": 0.10199815034866333, + "learning_rate": 4.8734177215189874e-05, + "loss": 0.1528, + "step": 77 + }, + { + "epoch": 0.074056491811061, + "grad_norm": 0.1142750009894371, + "learning_rate": 4.936708860759494e-05, + "loss": 0.1574, + "step": 78 + }, + { + "epoch": 0.07500593401376691, + "grad_norm": 0.11019093543291092, + "learning_rate": 5e-05, + "loss": 0.1512, + "step": 79 + }, + { + "epoch": 0.07595537621647282, + "grad_norm": 0.09426973015069962, + "learning_rate": 5.0632911392405066e-05, + "loss": 0.1481, + "step": 80 + }, + { + "epoch": 0.07690481841917873, + "grad_norm": 0.09757663309574127, + "learning_rate": 5.1265822784810124e-05, + "loss": 0.1484, + "step": 81 + }, + { + "epoch": 0.07785426062188465, + "grad_norm": 0.10646392405033112, + "learning_rate": 5.1898734177215194e-05, + "loss": 0.1549, + "step": 82 + }, + { + "epoch": 0.07880370282459055, + "grad_norm": 0.12109784036874771, + "learning_rate": 5.253164556962026e-05, + "loss": 0.1448, + "step": 83 + }, + { + "epoch": 0.07975314502729647, + "grad_norm": 0.12039211392402649, + "learning_rate": 5.3164556962025316e-05, + "loss": 0.1538, + "step": 84 + }, + { + "epoch": 0.08070258723000237, + "grad_norm": 0.16873961687088013, + "learning_rate": 5.379746835443038e-05, + "loss": 0.1971, + "step": 85 + }, + { + "epoch": 0.08165202943270829, + "grad_norm": 0.12140022218227386, + "learning_rate": 5.4430379746835444e-05, + "loss": 0.1497, + "step": 86 + }, + { + "epoch": 0.08260147163541419, + "grad_norm": 0.14637599885463715, + "learning_rate": 5.5063291139240514e-05, + "loss": 0.1958, + "step": 87 + }, + { + "epoch": 0.0835509138381201, + "grad_norm": 0.1141396313905716, + "learning_rate": 5.569620253164557e-05, + "loss": 0.1457, + "step": 88 + }, + { + "epoch": 0.08450035604082601, + "grad_norm": 0.2128390371799469, + "learning_rate": 5.6329113924050636e-05, + "loss": 0.2339, + "step": 89 + }, + { + "epoch": 0.08544979824353192, + "grad_norm": 0.18838858604431152, + "learning_rate": 5.69620253164557e-05, + "loss": 0.2029, + "step": 90 + }, + { + "epoch": 0.08639924044623784, + "grad_norm": 0.19592566788196564, + "learning_rate": 5.759493670886076e-05, + "loss": 0.2276, + "step": 91 + }, + { + "epoch": 0.08734868264894374, + "grad_norm": 0.14753012359142303, + "learning_rate": 5.822784810126583e-05, + "loss": 0.1916, + "step": 92 + }, + { + "epoch": 0.08829812485164966, + "grad_norm": 0.1494351178407669, + "learning_rate": 5.886075949367089e-05, + "loss": 0.1913, + "step": 93 + }, + { + "epoch": 0.08924756705435556, + "grad_norm": 0.1173478439450264, + "learning_rate": 5.949367088607595e-05, + "loss": 0.1438, + "step": 94 + }, + { + "epoch": 0.09019700925706148, + "grad_norm": 0.12023188918828964, + "learning_rate": 6.012658227848101e-05, + "loss": 0.1516, + "step": 95 + }, + { + "epoch": 0.09114645145976738, + "grad_norm": 0.1275833547115326, + "learning_rate": 6.0759493670886084e-05, + "loss": 0.1492, + "step": 96 + }, + { + "epoch": 0.0920958936624733, + "grad_norm": 0.1360282599925995, + "learning_rate": 6.139240506329115e-05, + "loss": 0.1507, + "step": 97 + }, + { + "epoch": 0.09304533586517921, + "grad_norm": 0.1586841195821762, + "learning_rate": 6.20253164556962e-05, + "loss": 0.1956, + "step": 98 + }, + { + "epoch": 0.09399477806788512, + "grad_norm": 0.14281995594501495, + "learning_rate": 6.265822784810128e-05, + "loss": 0.1774, + "step": 99 + }, + { + "epoch": 0.09494422027059103, + "grad_norm": 0.12553077936172485, + "learning_rate": 6.329113924050633e-05, + "loss": 0.148, + "step": 100 + }, + { + "epoch": 0.09589366247329693, + "grad_norm": 0.1117570698261261, + "learning_rate": 6.392405063291139e-05, + "loss": 0.16, + "step": 101 + }, + { + "epoch": 0.09684310467600285, + "grad_norm": 0.13955281674861908, + "learning_rate": 6.455696202531646e-05, + "loss": 0.1464, + "step": 102 + }, + { + "epoch": 0.09779254687870875, + "grad_norm": 0.10990285873413086, + "learning_rate": 6.518987341772153e-05, + "loss": 0.147, + "step": 103 + }, + { + "epoch": 0.09874198908141467, + "grad_norm": 0.10545991361141205, + "learning_rate": 6.582278481012658e-05, + "loss": 0.1436, + "step": 104 + }, + { + "epoch": 0.09969143128412059, + "grad_norm": 0.1717437207698822, + "learning_rate": 6.645569620253165e-05, + "loss": 0.2278, + "step": 105 + }, + { + "epoch": 0.10064087348682649, + "grad_norm": 0.10950994491577148, + "learning_rate": 6.708860759493672e-05, + "loss": 0.1493, + "step": 106 + }, + { + "epoch": 0.1015903156895324, + "grad_norm": 0.11200258880853653, + "learning_rate": 6.772151898734177e-05, + "loss": 0.1536, + "step": 107 + }, + { + "epoch": 0.1025397578922383, + "grad_norm": 0.10955105721950531, + "learning_rate": 6.835443037974683e-05, + "loss": 0.1483, + "step": 108 + }, + { + "epoch": 0.10348920009494422, + "grad_norm": 0.11920775473117828, + "learning_rate": 6.89873417721519e-05, + "loss": 0.1492, + "step": 109 + }, + { + "epoch": 0.10443864229765012, + "grad_norm": 0.1390092819929123, + "learning_rate": 6.962025316455697e-05, + "loss": 0.1849, + "step": 110 + }, + { + "epoch": 0.10538808450035604, + "grad_norm": 0.1363140493631363, + "learning_rate": 7.025316455696203e-05, + "loss": 0.1849, + "step": 111 + }, + { + "epoch": 0.10633752670306196, + "grad_norm": 0.09190025180578232, + "learning_rate": 7.088607594936709e-05, + "loss": 0.1587, + "step": 112 + }, + { + "epoch": 0.10728696890576786, + "grad_norm": 0.09020426124334335, + "learning_rate": 7.151898734177216e-05, + "loss": 0.1377, + "step": 113 + }, + { + "epoch": 0.10823641110847378, + "grad_norm": 0.10544883459806442, + "learning_rate": 7.215189873417722e-05, + "loss": 0.1516, + "step": 114 + }, + { + "epoch": 0.10918585331117968, + "grad_norm": 0.12401281297206879, + "learning_rate": 7.278481012658229e-05, + "loss": 0.154, + "step": 115 + }, + { + "epoch": 0.1101352955138856, + "grad_norm": 0.1008707657456398, + "learning_rate": 7.341772151898734e-05, + "loss": 0.1448, + "step": 116 + }, + { + "epoch": 0.1110847377165915, + "grad_norm": 0.10302747040987015, + "learning_rate": 7.40506329113924e-05, + "loss": 0.1451, + "step": 117 + }, + { + "epoch": 0.11203417991929741, + "grad_norm": 0.12748293578624725, + "learning_rate": 7.468354430379747e-05, + "loss": 0.1829, + "step": 118 + }, + { + "epoch": 0.11298362212200333, + "grad_norm": 0.10413361340761185, + "learning_rate": 7.531645569620254e-05, + "loss": 0.1371, + "step": 119 + }, + { + "epoch": 0.11393306432470923, + "grad_norm": 0.1243433803319931, + "learning_rate": 7.59493670886076e-05, + "loss": 0.1409, + "step": 120 + }, + { + "epoch": 0.11488250652741515, + "grad_norm": 0.11630933731794357, + "learning_rate": 7.658227848101266e-05, + "loss": 0.1372, + "step": 121 + }, + { + "epoch": 0.11583194873012105, + "grad_norm": 0.17981529235839844, + "learning_rate": 7.721518987341773e-05, + "loss": 0.2257, + "step": 122 + }, + { + "epoch": 0.11678139093282697, + "grad_norm": 0.14063452184200287, + "learning_rate": 7.78481012658228e-05, + "loss": 0.1841, + "step": 123 + }, + { + "epoch": 0.11773083313553287, + "grad_norm": 0.1264188438653946, + "learning_rate": 7.848101265822784e-05, + "loss": 0.1471, + "step": 124 + }, + { + "epoch": 0.11868027533823879, + "grad_norm": 0.12827955186367035, + "learning_rate": 7.911392405063291e-05, + "loss": 0.1493, + "step": 125 + }, + { + "epoch": 0.11962971754094469, + "grad_norm": 0.09800329059362411, + "learning_rate": 7.974683544303798e-05, + "loss": 0.1414, + "step": 126 + }, + { + "epoch": 0.1205791597436506, + "grad_norm": 0.09902197122573853, + "learning_rate": 8.037974683544304e-05, + "loss": 0.1462, + "step": 127 + }, + { + "epoch": 0.12152860194635652, + "grad_norm": 0.09450504928827286, + "learning_rate": 8.10126582278481e-05, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 0.12247804414906242, + "grad_norm": 0.11012883484363556, + "learning_rate": 8.164556962025317e-05, + "loss": 0.1437, + "step": 129 + }, + { + "epoch": 0.12342748635176834, + "grad_norm": 0.11717642843723297, + "learning_rate": 8.227848101265824e-05, + "loss": 0.1478, + "step": 130 + }, + { + "epoch": 0.12437692855447424, + "grad_norm": 0.08754123747348785, + "learning_rate": 8.29113924050633e-05, + "loss": 0.1408, + "step": 131 + }, + { + "epoch": 0.12532637075718014, + "grad_norm": 0.10017862170934677, + "learning_rate": 8.354430379746835e-05, + "loss": 0.1476, + "step": 132 + }, + { + "epoch": 0.12627581295988607, + "grad_norm": 0.08994068205356598, + "learning_rate": 8.417721518987342e-05, + "loss": 0.1478, + "step": 133 + }, + { + "epoch": 0.12722525516259198, + "grad_norm": 0.09894968569278717, + "learning_rate": 8.481012658227848e-05, + "loss": 0.1309, + "step": 134 + }, + { + "epoch": 0.12817469736529788, + "grad_norm": 0.10028701275587082, + "learning_rate": 8.544303797468355e-05, + "loss": 0.1433, + "step": 135 + }, + { + "epoch": 0.1291241395680038, + "grad_norm": 0.0897536426782608, + "learning_rate": 8.607594936708861e-05, + "loss": 0.1459, + "step": 136 + }, + { + "epoch": 0.1300735817707097, + "grad_norm": 0.10435349494218826, + "learning_rate": 8.670886075949367e-05, + "loss": 0.1434, + "step": 137 + }, + { + "epoch": 0.13102302397341561, + "grad_norm": 0.11718117445707321, + "learning_rate": 8.734177215189874e-05, + "loss": 0.1509, + "step": 138 + }, + { + "epoch": 0.13197246617612152, + "grad_norm": 0.14426474273204803, + "learning_rate": 8.797468354430381e-05, + "loss": 0.1373, + "step": 139 + }, + { + "epoch": 0.13292190837882745, + "grad_norm": 0.13101965188980103, + "learning_rate": 8.860759493670887e-05, + "loss": 0.1358, + "step": 140 + }, + { + "epoch": 0.13387135058153335, + "grad_norm": 0.11235956102609634, + "learning_rate": 8.924050632911392e-05, + "loss": 0.1394, + "step": 141 + }, + { + "epoch": 0.13482079278423925, + "grad_norm": 0.11327100545167923, + "learning_rate": 8.9873417721519e-05, + "loss": 0.1443, + "step": 142 + }, + { + "epoch": 0.13577023498694518, + "grad_norm": 0.10912016034126282, + "learning_rate": 9.050632911392407e-05, + "loss": 0.1698, + "step": 143 + }, + { + "epoch": 0.13671967718965108, + "grad_norm": 0.16535617411136627, + "learning_rate": 9.113924050632912e-05, + "loss": 0.2255, + "step": 144 + }, + { + "epoch": 0.137669119392357, + "grad_norm": 0.10184327512979507, + "learning_rate": 9.177215189873418e-05, + "loss": 0.1371, + "step": 145 + }, + { + "epoch": 0.1386185615950629, + "grad_norm": 0.10998040437698364, + "learning_rate": 9.240506329113925e-05, + "loss": 0.1794, + "step": 146 + }, + { + "epoch": 0.13956800379776882, + "grad_norm": 0.08974044770002365, + "learning_rate": 9.303797468354431e-05, + "loss": 0.144, + "step": 147 + }, + { + "epoch": 0.14051744600047472, + "grad_norm": 0.12724193930625916, + "learning_rate": 9.367088607594936e-05, + "loss": 0.1794, + "step": 148 + }, + { + "epoch": 0.14146688820318062, + "grad_norm": 0.1079091802239418, + "learning_rate": 9.430379746835444e-05, + "loss": 0.1399, + "step": 149 + }, + { + "epoch": 0.14241633040588655, + "grad_norm": 0.09480807185173035, + "learning_rate": 9.493670886075949e-05, + "loss": 0.1395, + "step": 150 + }, + { + "epoch": 0.14336577260859246, + "grad_norm": 0.08620745688676834, + "learning_rate": 9.556962025316456e-05, + "loss": 0.1415, + "step": 151 + }, + { + "epoch": 0.14431521481129836, + "grad_norm": 0.10517002642154694, + "learning_rate": 9.620253164556962e-05, + "loss": 0.1723, + "step": 152 + }, + { + "epoch": 0.14526465701400426, + "grad_norm": 0.0956311896443367, + "learning_rate": 9.683544303797469e-05, + "loss": 0.1515, + "step": 153 + }, + { + "epoch": 0.1462140992167102, + "grad_norm": 0.08050324022769928, + "learning_rate": 9.746835443037975e-05, + "loss": 0.1322, + "step": 154 + }, + { + "epoch": 0.1471635414194161, + "grad_norm": 0.0853201299905777, + "learning_rate": 9.810126582278482e-05, + "loss": 0.142, + "step": 155 + }, + { + "epoch": 0.148112983622122, + "grad_norm": 0.09991180151700974, + "learning_rate": 9.873417721518988e-05, + "loss": 0.1348, + "step": 156 + }, + { + "epoch": 0.14906242582482793, + "grad_norm": 0.08640603721141815, + "learning_rate": 9.936708860759493e-05, + "loss": 0.1397, + "step": 157 + }, + { + "epoch": 0.15001186802753383, + "grad_norm": 0.09057717025279999, + "learning_rate": 0.0001, + "loss": 0.1381, + "step": 158 + }, + { + "epoch": 0.15096131023023973, + "grad_norm": 0.09916041046380997, + "learning_rate": 0.00010063291139240508, + "loss": 0.1509, + "step": 159 + }, + { + "epoch": 0.15191075243294563, + "grad_norm": 0.09434045851230621, + "learning_rate": 0.00010126582278481013, + "loss": 0.1388, + "step": 160 + }, + { + "epoch": 0.15286019463565156, + "grad_norm": 0.1273377537727356, + "learning_rate": 0.0001018987341772152, + "loss": 0.1401, + "step": 161 + }, + { + "epoch": 0.15380963683835747, + "grad_norm": 0.1297912299633026, + "learning_rate": 0.00010253164556962025, + "loss": 0.1852, + "step": 162 + }, + { + "epoch": 0.15475907904106337, + "grad_norm": 0.1151595488190651, + "learning_rate": 0.00010316455696202532, + "loss": 0.1848, + "step": 163 + }, + { + "epoch": 0.1557085212437693, + "grad_norm": 0.13381290435791016, + "learning_rate": 0.00010379746835443039, + "loss": 0.1438, + "step": 164 + }, + { + "epoch": 0.1566579634464752, + "grad_norm": 0.07880119979381561, + "learning_rate": 0.00010443037974683545, + "loss": 0.1327, + "step": 165 + }, + { + "epoch": 0.1576074056491811, + "grad_norm": 0.0843740776181221, + "learning_rate": 0.00010506329113924052, + "loss": 0.1398, + "step": 166 + }, + { + "epoch": 0.158556847851887, + "grad_norm": 0.0981813594698906, + "learning_rate": 0.00010569620253164559, + "loss": 0.1409, + "step": 167 + }, + { + "epoch": 0.15950629005459294, + "grad_norm": 0.10005304962396622, + "learning_rate": 0.00010632911392405063, + "loss": 0.1783, + "step": 168 + }, + { + "epoch": 0.16045573225729884, + "grad_norm": 0.08365727961063385, + "learning_rate": 0.00010696202531645569, + "loss": 0.1275, + "step": 169 + }, + { + "epoch": 0.16140517446000474, + "grad_norm": 0.1017635315656662, + "learning_rate": 0.00010759493670886076, + "loss": 0.1792, + "step": 170 + }, + { + "epoch": 0.16235461666271067, + "grad_norm": 0.07007888704538345, + "learning_rate": 0.00010822784810126583, + "loss": 0.1473, + "step": 171 + }, + { + "epoch": 0.16330405886541657, + "grad_norm": 0.07718679308891296, + "learning_rate": 0.00010886075949367089, + "loss": 0.1396, + "step": 172 + }, + { + "epoch": 0.16425350106812248, + "grad_norm": 0.07228100299835205, + "learning_rate": 0.00010949367088607596, + "loss": 0.1398, + "step": 173 + }, + { + "epoch": 0.16520294327082838, + "grad_norm": 0.07955378293991089, + "learning_rate": 0.00011012658227848103, + "loss": 0.1402, + "step": 174 + }, + { + "epoch": 0.1661523854735343, + "grad_norm": 0.0816427692770958, + "learning_rate": 0.00011075949367088607, + "loss": 0.1345, + "step": 175 + }, + { + "epoch": 0.1671018276762402, + "grad_norm": 0.07641757279634476, + "learning_rate": 0.00011139240506329114, + "loss": 0.1373, + "step": 176 + }, + { + "epoch": 0.1680512698789461, + "grad_norm": 0.07354450225830078, + "learning_rate": 0.0001120253164556962, + "loss": 0.1394, + "step": 177 + }, + { + "epoch": 0.16900071208165202, + "grad_norm": 0.08322398364543915, + "learning_rate": 0.00011265822784810127, + "loss": 0.138, + "step": 178 + }, + { + "epoch": 0.16995015428435795, + "grad_norm": 0.13528607785701752, + "learning_rate": 0.00011329113924050634, + "loss": 0.2188, + "step": 179 + }, + { + "epoch": 0.17089959648706385, + "grad_norm": 0.10803692042827606, + "learning_rate": 0.0001139240506329114, + "loss": 0.1782, + "step": 180 + }, + { + "epoch": 0.17184903868976975, + "grad_norm": 0.08404573053121567, + "learning_rate": 0.00011455696202531647, + "loss": 0.1394, + "step": 181 + }, + { + "epoch": 0.17279848089247568, + "grad_norm": 0.12790893018245697, + "learning_rate": 0.00011518987341772151, + "loss": 0.2157, + "step": 182 + }, + { + "epoch": 0.17374792309518158, + "grad_norm": 0.09879907220602036, + "learning_rate": 0.00011582278481012658, + "loss": 0.1693, + "step": 183 + }, + { + "epoch": 0.17469736529788749, + "grad_norm": 0.08092228323221207, + "learning_rate": 0.00011645569620253166, + "loss": 0.136, + "step": 184 + }, + { + "epoch": 0.1756468075005934, + "grad_norm": 0.07660632580518723, + "learning_rate": 0.00011708860759493671, + "loss": 0.1332, + "step": 185 + }, + { + "epoch": 0.17659624970329932, + "grad_norm": 0.07474201172590256, + "learning_rate": 0.00011772151898734178, + "loss": 0.1301, + "step": 186 + }, + { + "epoch": 0.17754569190600522, + "grad_norm": 0.09162931889295578, + "learning_rate": 0.00011835443037974685, + "loss": 0.1407, + "step": 187 + }, + { + "epoch": 0.17849513410871112, + "grad_norm": 0.08646775782108307, + "learning_rate": 0.0001189873417721519, + "loss": 0.139, + "step": 188 + }, + { + "epoch": 0.17944457631141705, + "grad_norm": 0.0759253203868866, + "learning_rate": 0.00011962025316455696, + "loss": 0.1342, + "step": 189 + }, + { + "epoch": 0.18039401851412296, + "grad_norm": 0.08292865008115768, + "learning_rate": 0.00012025316455696203, + "loss": 0.1389, + "step": 190 + }, + { + "epoch": 0.18134346071682886, + "grad_norm": 0.12379574030637741, + "learning_rate": 0.0001208860759493671, + "loss": 0.1795, + "step": 191 + }, + { + "epoch": 0.18229290291953476, + "grad_norm": 0.10240278393030167, + "learning_rate": 0.00012151898734177217, + "loss": 0.1721, + "step": 192 + }, + { + "epoch": 0.1832423451222407, + "grad_norm": 0.09666036069393158, + "learning_rate": 0.00012215189873417722, + "loss": 0.1783, + "step": 193 + }, + { + "epoch": 0.1841917873249466, + "grad_norm": 0.08314768224954605, + "learning_rate": 0.0001227848101265823, + "loss": 0.1429, + "step": 194 + }, + { + "epoch": 0.1851412295276525, + "grad_norm": 0.07590368390083313, + "learning_rate": 0.00012341772151898734, + "loss": 0.1393, + "step": 195 + }, + { + "epoch": 0.18609067173035843, + "grad_norm": 0.10585250705480576, + "learning_rate": 0.0001240506329113924, + "loss": 0.2155, + "step": 196 + }, + { + "epoch": 0.18704011393306433, + "grad_norm": 0.06995555013418198, + "learning_rate": 0.00012468354430379748, + "loss": 0.1374, + "step": 197 + }, + { + "epoch": 0.18798955613577023, + "grad_norm": 0.07370735704898834, + "learning_rate": 0.00012531645569620255, + "loss": 0.1367, + "step": 198 + }, + { + "epoch": 0.18893899833847613, + "grad_norm": 0.07194443792104721, + "learning_rate": 0.0001259493670886076, + "loss": 0.1437, + "step": 199 + }, + { + "epoch": 0.18988844054118206, + "grad_norm": 0.06982647627592087, + "learning_rate": 0.00012658227848101267, + "loss": 0.1358, + "step": 200 + }, + { + "epoch": 0.19083788274388797, + "grad_norm": 0.06538347154855728, + "learning_rate": 0.0001272151898734177, + "loss": 0.1354, + "step": 201 + }, + { + "epoch": 0.19178732494659387, + "grad_norm": 0.07789324969053268, + "learning_rate": 0.00012784810126582278, + "loss": 0.178, + "step": 202 + }, + { + "epoch": 0.1927367671492998, + "grad_norm": 0.07376820594072342, + "learning_rate": 0.00012848101265822785, + "loss": 0.1621, + "step": 203 + }, + { + "epoch": 0.1936862093520057, + "grad_norm": 0.0720745250582695, + "learning_rate": 0.00012911392405063292, + "loss": 0.132, + "step": 204 + }, + { + "epoch": 0.1946356515547116, + "grad_norm": 0.06211116537451744, + "learning_rate": 0.000129746835443038, + "loss": 0.1387, + "step": 205 + }, + { + "epoch": 0.1955850937574175, + "grad_norm": 0.06701771914958954, + "learning_rate": 0.00013037974683544306, + "loss": 0.14, + "step": 206 + }, + { + "epoch": 0.19653453596012344, + "grad_norm": 0.07692532986402512, + "learning_rate": 0.0001310126582278481, + "loss": 0.1322, + "step": 207 + }, + { + "epoch": 0.19748397816282934, + "grad_norm": 0.07763269543647766, + "learning_rate": 0.00013164556962025315, + "loss": 0.1393, + "step": 208 + }, + { + "epoch": 0.19843342036553524, + "grad_norm": 0.08769022673368454, + "learning_rate": 0.00013227848101265822, + "loss": 0.1489, + "step": 209 + }, + { + "epoch": 0.19938286256824117, + "grad_norm": 0.08881859481334686, + "learning_rate": 0.0001329113924050633, + "loss": 0.1765, + "step": 210 + }, + { + "epoch": 0.20033230477094707, + "grad_norm": 0.06811822950839996, + "learning_rate": 0.00013354430379746836, + "loss": 0.1332, + "step": 211 + }, + { + "epoch": 0.20128174697365298, + "grad_norm": 0.06390922516584396, + "learning_rate": 0.00013417721518987343, + "loss": 0.1343, + "step": 212 + }, + { + "epoch": 0.20223118917635888, + "grad_norm": 0.06630406528711319, + "learning_rate": 0.0001348101265822785, + "loss": 0.1329, + "step": 213 + }, + { + "epoch": 0.2031806313790648, + "grad_norm": 0.0730772465467453, + "learning_rate": 0.00013544303797468355, + "loss": 0.1354, + "step": 214 + }, + { + "epoch": 0.2041300735817707, + "grad_norm": 0.06487323343753815, + "learning_rate": 0.00013607594936708862, + "loss": 0.1297, + "step": 215 + }, + { + "epoch": 0.2050795157844766, + "grad_norm": 0.06967955082654953, + "learning_rate": 0.00013670886075949366, + "loss": 0.1398, + "step": 216 + }, + { + "epoch": 0.20602895798718254, + "grad_norm": 0.08531820774078369, + "learning_rate": 0.00013734177215189873, + "loss": 0.1336, + "step": 217 + }, + { + "epoch": 0.20697840018988845, + "grad_norm": 0.0757659375667572, + "learning_rate": 0.0001379746835443038, + "loss": 0.1606, + "step": 218 + }, + { + "epoch": 0.20792784239259435, + "grad_norm": 0.060206469148397446, + "learning_rate": 0.00013860759493670888, + "loss": 0.1337, + "step": 219 + }, + { + "epoch": 0.20887728459530025, + "grad_norm": 0.07996556162834167, + "learning_rate": 0.00013924050632911395, + "loss": 0.1308, + "step": 220 + }, + { + "epoch": 0.20982672679800618, + "grad_norm": 0.06206861138343811, + "learning_rate": 0.000139873417721519, + "loss": 0.1347, + "step": 221 + }, + { + "epoch": 0.21077616900071208, + "grad_norm": 0.08736416697502136, + "learning_rate": 0.00014050632911392406, + "loss": 0.1768, + "step": 222 + }, + { + "epoch": 0.21172561120341798, + "grad_norm": 0.06427916139364243, + "learning_rate": 0.00014113924050632913, + "loss": 0.1374, + "step": 223 + }, + { + "epoch": 0.21267505340612392, + "grad_norm": 0.10996536910533905, + "learning_rate": 0.00014177215189873418, + "loss": 0.222, + "step": 224 + }, + { + "epoch": 0.21362449560882982, + "grad_norm": 0.08439125120639801, + "learning_rate": 0.00014240506329113925, + "loss": 0.1854, + "step": 225 + }, + { + "epoch": 0.21457393781153572, + "grad_norm": 0.06892693787813187, + "learning_rate": 0.00014303797468354432, + "loss": 0.139, + "step": 226 + }, + { + "epoch": 0.21552338001424162, + "grad_norm": 0.08241122961044312, + "learning_rate": 0.0001436708860759494, + "loss": 0.173, + "step": 227 + }, + { + "epoch": 0.21647282221694755, + "grad_norm": 0.07911046594381332, + "learning_rate": 0.00014430379746835443, + "loss": 0.1418, + "step": 228 + }, + { + "epoch": 0.21742226441965345, + "grad_norm": 0.06346064805984497, + "learning_rate": 0.0001449367088607595, + "loss": 0.1406, + "step": 229 + }, + { + "epoch": 0.21837170662235936, + "grad_norm": 0.060393668711185455, + "learning_rate": 0.00014556962025316457, + "loss": 0.1417, + "step": 230 + }, + { + "epoch": 0.2193211488250653, + "grad_norm": 0.05912507325410843, + "learning_rate": 0.00014620253164556962, + "loss": 0.1298, + "step": 231 + }, + { + "epoch": 0.2202705910277712, + "grad_norm": 0.07730337232351303, + "learning_rate": 0.0001468354430379747, + "loss": 0.1769, + "step": 232 + }, + { + "epoch": 0.2212200332304771, + "grad_norm": 0.07612381875514984, + "learning_rate": 0.00014746835443037976, + "loss": 0.1338, + "step": 233 + }, + { + "epoch": 0.222169475433183, + "grad_norm": 0.055311791598796844, + "learning_rate": 0.0001481012658227848, + "loss": 0.1313, + "step": 234 + }, + { + "epoch": 0.22311891763588892, + "grad_norm": 0.08492033183574677, + "learning_rate": 0.00014873417721518987, + "loss": 0.1367, + "step": 235 + }, + { + "epoch": 0.22406835983859483, + "grad_norm": 0.07133237272500992, + "learning_rate": 0.00014936708860759494, + "loss": 0.1308, + "step": 236 + }, + { + "epoch": 0.22501780204130073, + "grad_norm": 0.07148605585098267, + "learning_rate": 0.00015000000000000001, + "loss": 0.133, + "step": 237 + }, + { + "epoch": 0.22596724424400666, + "grad_norm": 0.06900472939014435, + "learning_rate": 0.00015063291139240508, + "loss": 0.138, + "step": 238 + }, + { + "epoch": 0.22691668644671256, + "grad_norm": 0.062325432896614075, + "learning_rate": 0.00015126582278481013, + "loss": 0.1338, + "step": 239 + }, + { + "epoch": 0.22786612864941846, + "grad_norm": 0.06719667464494705, + "learning_rate": 0.0001518987341772152, + "loss": 0.1316, + "step": 240 + }, + { + "epoch": 0.22881557085212437, + "grad_norm": 0.07456009089946747, + "learning_rate": 0.00015253164556962024, + "loss": 0.1412, + "step": 241 + }, + { + "epoch": 0.2297650130548303, + "grad_norm": 0.05619575083255768, + "learning_rate": 0.00015316455696202531, + "loss": 0.1342, + "step": 242 + }, + { + "epoch": 0.2307144552575362, + "grad_norm": 0.06157098710536957, + "learning_rate": 0.00015379746835443038, + "loss": 0.1329, + "step": 243 + }, + { + "epoch": 0.2316638974602421, + "grad_norm": 0.06759827584028244, + "learning_rate": 0.00015443037974683546, + "loss": 0.1411, + "step": 244 + }, + { + "epoch": 0.232613339662948, + "grad_norm": 0.06892479956150055, + "learning_rate": 0.00015506329113924053, + "loss": 0.1484, + "step": 245 + }, + { + "epoch": 0.23356278186565393, + "grad_norm": 0.08536699414253235, + "learning_rate": 0.0001556962025316456, + "loss": 0.1855, + "step": 246 + }, + { + "epoch": 0.23451222406835984, + "grad_norm": 0.06800314784049988, + "learning_rate": 0.00015632911392405064, + "loss": 0.1379, + "step": 247 + }, + { + "epoch": 0.23546166627106574, + "grad_norm": 0.0625622496008873, + "learning_rate": 0.00015696202531645568, + "loss": 0.1344, + "step": 248 + }, + { + "epoch": 0.23641110847377167, + "grad_norm": 0.06030593812465668, + "learning_rate": 0.00015759493670886075, + "loss": 0.1254, + "step": 249 + }, + { + "epoch": 0.23736055067647757, + "grad_norm": 0.06694353371858597, + "learning_rate": 0.00015822784810126583, + "loss": 0.1413, + "step": 250 + }, + { + "epoch": 0.23830999287918347, + "grad_norm": 0.06594134122133255, + "learning_rate": 0.0001588607594936709, + "loss": 0.1394, + "step": 251 + }, + { + "epoch": 0.23925943508188938, + "grad_norm": 0.09062930941581726, + "learning_rate": 0.00015949367088607597, + "loss": 0.1883, + "step": 252 + }, + { + "epoch": 0.2402088772845953, + "grad_norm": 0.06029089167714119, + "learning_rate": 0.00016012658227848104, + "loss": 0.1271, + "step": 253 + }, + { + "epoch": 0.2411583194873012, + "grad_norm": 0.08471622318029404, + "learning_rate": 0.00016075949367088608, + "loss": 0.172, + "step": 254 + }, + { + "epoch": 0.2421077616900071, + "grad_norm": 0.061710160225629807, + "learning_rate": 0.00016139240506329115, + "loss": 0.1348, + "step": 255 + }, + { + "epoch": 0.24305720389271304, + "grad_norm": 0.0812671035528183, + "learning_rate": 0.0001620253164556962, + "loss": 0.1312, + "step": 256 + }, + { + "epoch": 0.24400664609541894, + "grad_norm": 0.06917005032300949, + "learning_rate": 0.00016265822784810127, + "loss": 0.1464, + "step": 257 + }, + { + "epoch": 0.24495608829812485, + "grad_norm": 0.0905887708067894, + "learning_rate": 0.00016329113924050634, + "loss": 0.1759, + "step": 258 + }, + { + "epoch": 0.24590553050083075, + "grad_norm": 0.05976787209510803, + "learning_rate": 0.0001639240506329114, + "loss": 0.1404, + "step": 259 + }, + { + "epoch": 0.24685497270353668, + "grad_norm": 0.07545675337314606, + "learning_rate": 0.00016455696202531648, + "loss": 0.1322, + "step": 260 + }, + { + "epoch": 0.24780441490624258, + "grad_norm": 0.07035024464130402, + "learning_rate": 0.00016518987341772152, + "loss": 0.1378, + "step": 261 + }, + { + "epoch": 0.24875385710894848, + "grad_norm": 0.07665737718343735, + "learning_rate": 0.0001658227848101266, + "loss": 0.1827, + "step": 262 + }, + { + "epoch": 0.24970329931165441, + "grad_norm": 0.06619013845920563, + "learning_rate": 0.00016645569620253166, + "loss": 0.1284, + "step": 263 + }, + { + "epoch": 0.2506527415143603, + "grad_norm": 0.0647001713514328, + "learning_rate": 0.0001670886075949367, + "loss": 0.133, + "step": 264 + }, + { + "epoch": 0.2516021837170662, + "grad_norm": 0.060702718794345856, + "learning_rate": 0.00016772151898734178, + "loss": 0.1335, + "step": 265 + }, + { + "epoch": 0.25255162591977215, + "grad_norm": 0.0508468896150589, + "learning_rate": 0.00016835443037974685, + "loss": 0.1333, + "step": 266 + }, + { + "epoch": 0.253501068122478, + "grad_norm": 0.09877864271402359, + "learning_rate": 0.0001689873417721519, + "loss": 0.2031, + "step": 267 + }, + { + "epoch": 0.25445051032518395, + "grad_norm": 0.06673337519168854, + "learning_rate": 0.00016962025316455696, + "loss": 0.1356, + "step": 268 + }, + { + "epoch": 0.2553999525278899, + "grad_norm": 0.10604165494441986, + "learning_rate": 0.00017025316455696204, + "loss": 0.2517, + "step": 269 + }, + { + "epoch": 0.25634939473059576, + "grad_norm": 0.07689858227968216, + "learning_rate": 0.0001708860759493671, + "loss": 0.1761, + "step": 270 + }, + { + "epoch": 0.2572988369333017, + "grad_norm": 0.05482449755072594, + "learning_rate": 0.00017151898734177218, + "loss": 0.131, + "step": 271 + }, + { + "epoch": 0.2582482791360076, + "grad_norm": 0.08622145652770996, + "learning_rate": 0.00017215189873417722, + "loss": 0.1335, + "step": 272 + }, + { + "epoch": 0.2591977213387135, + "grad_norm": 0.0748213455080986, + "learning_rate": 0.0001727848101265823, + "loss": 0.176, + "step": 273 + }, + { + "epoch": 0.2601471635414194, + "grad_norm": 0.06163305416703224, + "learning_rate": 0.00017341772151898733, + "loss": 0.1381, + "step": 274 + }, + { + "epoch": 0.26109660574412535, + "grad_norm": 0.06141841039061546, + "learning_rate": 0.0001740506329113924, + "loss": 0.1353, + "step": 275 + }, + { + "epoch": 0.26204604794683123, + "grad_norm": 0.07326913625001907, + "learning_rate": 0.00017468354430379748, + "loss": 0.1441, + "step": 276 + }, + { + "epoch": 0.26299549014953716, + "grad_norm": 0.05951124057173729, + "learning_rate": 0.00017531645569620255, + "loss": 0.1321, + "step": 277 + }, + { + "epoch": 0.26394493235224303, + "grad_norm": 0.08364073932170868, + "learning_rate": 0.00017594936708860762, + "loss": 0.187, + "step": 278 + }, + { + "epoch": 0.26489437455494896, + "grad_norm": 0.05849132314324379, + "learning_rate": 0.00017658227848101266, + "loss": 0.1393, + "step": 279 + }, + { + "epoch": 0.2658438167576549, + "grad_norm": 0.05452360957860947, + "learning_rate": 0.00017721518987341773, + "loss": 0.1342, + "step": 280 + }, + { + "epoch": 0.26679325896036077, + "grad_norm": 0.04878188297152519, + "learning_rate": 0.00017784810126582278, + "loss": 0.1445, + "step": 281 + }, + { + "epoch": 0.2677427011630667, + "grad_norm": 0.06066753342747688, + "learning_rate": 0.00017848101265822785, + "loss": 0.1423, + "step": 282 + }, + { + "epoch": 0.26869214336577263, + "grad_norm": 0.04918207973241806, + "learning_rate": 0.00017911392405063292, + "loss": 0.1316, + "step": 283 + }, + { + "epoch": 0.2696415855684785, + "grad_norm": 0.05103525519371033, + "learning_rate": 0.000179746835443038, + "loss": 0.1313, + "step": 284 + }, + { + "epoch": 0.27059102777118443, + "grad_norm": 0.05667628347873688, + "learning_rate": 0.00018037974683544306, + "loss": 0.1434, + "step": 285 + }, + { + "epoch": 0.27154046997389036, + "grad_norm": 0.06226016581058502, + "learning_rate": 0.00018101265822784813, + "loss": 0.1357, + "step": 286 + }, + { + "epoch": 0.27248991217659624, + "grad_norm": 0.04695293679833412, + "learning_rate": 0.00018164556962025317, + "loss": 0.1314, + "step": 287 + }, + { + "epoch": 0.27343935437930217, + "grad_norm": 0.05762844532728195, + "learning_rate": 0.00018227848101265824, + "loss": 0.1349, + "step": 288 + }, + { + "epoch": 0.27438879658200804, + "grad_norm": 0.05454534292221069, + "learning_rate": 0.0001829113924050633, + "loss": 0.1432, + "step": 289 + }, + { + "epoch": 0.275338238784714, + "grad_norm": 0.050270579755306244, + "learning_rate": 0.00018354430379746836, + "loss": 0.1272, + "step": 290 + }, + { + "epoch": 0.2762876809874199, + "grad_norm": 0.0688452497124672, + "learning_rate": 0.00018417721518987343, + "loss": 0.1708, + "step": 291 + }, + { + "epoch": 0.2772371231901258, + "grad_norm": 0.06213200092315674, + "learning_rate": 0.0001848101265822785, + "loss": 0.1674, + "step": 292 + }, + { + "epoch": 0.2781865653928317, + "grad_norm": 0.059717319905757904, + "learning_rate": 0.00018544303797468354, + "loss": 0.169, + "step": 293 + }, + { + "epoch": 0.27913600759553764, + "grad_norm": 0.06223325803875923, + "learning_rate": 0.00018607594936708861, + "loss": 0.1369, + "step": 294 + }, + { + "epoch": 0.2800854497982435, + "grad_norm": 0.053163208067417145, + "learning_rate": 0.00018670886075949369, + "loss": 0.133, + "step": 295 + }, + { + "epoch": 0.28103489200094944, + "grad_norm": 0.06647945195436478, + "learning_rate": 0.00018734177215189873, + "loss": 0.1438, + "step": 296 + }, + { + "epoch": 0.2819843342036554, + "grad_norm": 0.0588272288441658, + "learning_rate": 0.0001879746835443038, + "loss": 0.1338, + "step": 297 + }, + { + "epoch": 0.28293377640636125, + "grad_norm": 0.05841274932026863, + "learning_rate": 0.00018860759493670887, + "loss": 0.1329, + "step": 298 + }, + { + "epoch": 0.2838832186090672, + "grad_norm": 0.09033369272947311, + "learning_rate": 0.00018924050632911394, + "loss": 0.1747, + "step": 299 + }, + { + "epoch": 0.2848326608117731, + "grad_norm": 0.052215326577425, + "learning_rate": 0.00018987341772151899, + "loss": 0.1296, + "step": 300 + }, + { + "epoch": 0.285782103014479, + "grad_norm": 0.05880101025104523, + "learning_rate": 0.00019050632911392406, + "loss": 0.1287, + "step": 301 + }, + { + "epoch": 0.2867315452171849, + "grad_norm": 0.0691700354218483, + "learning_rate": 0.00019113924050632913, + "loss": 0.1676, + "step": 302 + }, + { + "epoch": 0.2876809874198908, + "grad_norm": 0.057025909423828125, + "learning_rate": 0.0001917721518987342, + "loss": 0.1346, + "step": 303 + }, + { + "epoch": 0.2886304296225967, + "grad_norm": 0.04936329275369644, + "learning_rate": 0.00019240506329113924, + "loss": 0.1354, + "step": 304 + }, + { + "epoch": 0.28957987182530265, + "grad_norm": 0.0680055245757103, + "learning_rate": 0.0001930379746835443, + "loss": 0.1344, + "step": 305 + }, + { + "epoch": 0.2905293140280085, + "grad_norm": 0.07374466210603714, + "learning_rate": 0.00019367088607594938, + "loss": 0.1428, + "step": 306 + }, + { + "epoch": 0.29147875623071445, + "grad_norm": 0.061204761266708374, + "learning_rate": 0.00019430379746835443, + "loss": 0.1246, + "step": 307 + }, + { + "epoch": 0.2924281984334204, + "grad_norm": 0.053467705845832825, + "learning_rate": 0.0001949367088607595, + "loss": 0.1342, + "step": 308 + }, + { + "epoch": 0.29337764063612626, + "grad_norm": 0.057525087147951126, + "learning_rate": 0.00019556962025316457, + "loss": 0.1377, + "step": 309 + }, + { + "epoch": 0.2943270828388322, + "grad_norm": 0.07857844978570938, + "learning_rate": 0.00019620253164556964, + "loss": 0.2076, + "step": 310 + }, + { + "epoch": 0.2952765250415381, + "grad_norm": 0.05250545218586922, + "learning_rate": 0.0001968354430379747, + "loss": 0.1432, + "step": 311 + }, + { + "epoch": 0.296225967244244, + "grad_norm": 0.07495012134313583, + "learning_rate": 0.00019746835443037975, + "loss": 0.1766, + "step": 312 + }, + { + "epoch": 0.2971754094469499, + "grad_norm": 0.04692578688263893, + "learning_rate": 0.0001981012658227848, + "loss": 0.1408, + "step": 313 + }, + { + "epoch": 0.29812485164965585, + "grad_norm": 0.055666085332632065, + "learning_rate": 0.00019873417721518987, + "loss": 0.1391, + "step": 314 + }, + { + "epoch": 0.29907429385236173, + "grad_norm": 0.050465911626815796, + "learning_rate": 0.00019936708860759494, + "loss": 0.1415, + "step": 315 + }, + { + "epoch": 0.30002373605506766, + "grad_norm": 0.051260240375995636, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 316 + }, + { + "epoch": 0.30097317825777353, + "grad_norm": 0.0503215529024601, + "learning_rate": 0.000199999938945738, + "loss": 0.1348, + "step": 317 + }, + { + "epoch": 0.30192262046047946, + "grad_norm": 0.04917483776807785, + "learning_rate": 0.0001999997557830265, + "loss": 0.1342, + "step": 318 + }, + { + "epoch": 0.3028720626631854, + "grad_norm": 0.06354209035634995, + "learning_rate": 0.00019999945051208916, + "loss": 0.1365, + "step": 319 + }, + { + "epoch": 0.30382150486589127, + "grad_norm": 0.04878314957022667, + "learning_rate": 0.0001999990231332988, + "loss": 0.13, + "step": 320 + }, + { + "epoch": 0.3047709470685972, + "grad_norm": 0.07046223431825638, + "learning_rate": 0.0001999984736471772, + "loss": 0.1394, + "step": 321 + }, + { + "epoch": 0.30572038927130313, + "grad_norm": 0.04456232488155365, + "learning_rate": 0.00019999780205439538, + "loss": 0.1278, + "step": 322 + }, + { + "epoch": 0.306669831474009, + "grad_norm": 0.06280628591775894, + "learning_rate": 0.00019999700835577342, + "loss": 0.1715, + "step": 323 + }, + { + "epoch": 0.30761927367671493, + "grad_norm": 0.07462131977081299, + "learning_rate": 0.00019999609255228046, + "loss": 0.1772, + "step": 324 + }, + { + "epoch": 0.30856871587942086, + "grad_norm": 0.059642352163791656, + "learning_rate": 0.00019999505464503482, + "loss": 0.1294, + "step": 325 + }, + { + "epoch": 0.30951815808212674, + "grad_norm": 0.06458820402622223, + "learning_rate": 0.00019999389463530383, + "loss": 0.173, + "step": 326 + }, + { + "epoch": 0.31046760028483267, + "grad_norm": 0.05901939421892166, + "learning_rate": 0.00019999261252450396, + "loss": 0.1419, + "step": 327 + }, + { + "epoch": 0.3114170424875386, + "grad_norm": 0.055540215224027634, + "learning_rate": 0.00019999120831420083, + "loss": 0.1314, + "step": 328 + }, + { + "epoch": 0.3123664846902445, + "grad_norm": 0.0546739287674427, + "learning_rate": 0.00019998968200610903, + "loss": 0.1354, + "step": 329 + }, + { + "epoch": 0.3133159268929504, + "grad_norm": 0.0689477026462555, + "learning_rate": 0.00019998803360209234, + "loss": 0.132, + "step": 330 + }, + { + "epoch": 0.3142653690956563, + "grad_norm": 0.05279696360230446, + "learning_rate": 0.00019998626310416365, + "loss": 0.1424, + "step": 331 + }, + { + "epoch": 0.3152148112983622, + "grad_norm": 0.055384278297424316, + "learning_rate": 0.00019998437051448482, + "loss": 0.141, + "step": 332 + }, + { + "epoch": 0.31616425350106814, + "grad_norm": 0.04636182263493538, + "learning_rate": 0.0001999823558353669, + "loss": 0.1414, + "step": 333 + }, + { + "epoch": 0.317113695703774, + "grad_norm": 0.04795726016163826, + "learning_rate": 0.00019998021906926993, + "loss": 0.1255, + "step": 334 + }, + { + "epoch": 0.31806313790647994, + "grad_norm": 0.05326540395617485, + "learning_rate": 0.00019997796021880318, + "loss": 0.1309, + "step": 335 + }, + { + "epoch": 0.3190125801091859, + "grad_norm": 0.0684736892580986, + "learning_rate": 0.00019997557928672484, + "loss": 0.1825, + "step": 336 + }, + { + "epoch": 0.31996202231189175, + "grad_norm": 0.042282164096832275, + "learning_rate": 0.0001999730762759422, + "loss": 0.12, + "step": 337 + }, + { + "epoch": 0.3209114645145977, + "grad_norm": 0.05297423154115677, + "learning_rate": 0.00019997045118951175, + "loss": 0.1309, + "step": 338 + }, + { + "epoch": 0.3218609067173036, + "grad_norm": 0.080621138215065, + "learning_rate": 0.00019996770403063883, + "loss": 0.2134, + "step": 339 + }, + { + "epoch": 0.3228103489200095, + "grad_norm": 0.05552308261394501, + "learning_rate": 0.00019996483480267803, + "loss": 0.1361, + "step": 340 + }, + { + "epoch": 0.3237597911227154, + "grad_norm": 0.05070111155509949, + "learning_rate": 0.00019996184350913287, + "loss": 0.1314, + "step": 341 + }, + { + "epoch": 0.32470923332542134, + "grad_norm": 0.04412266984581947, + "learning_rate": 0.00019995873015365601, + "loss": 0.1299, + "step": 342 + }, + { + "epoch": 0.3256586755281272, + "grad_norm": 0.0445338599383831, + "learning_rate": 0.00019995549474004917, + "loss": 0.1313, + "step": 343 + }, + { + "epoch": 0.32660811773083315, + "grad_norm": 0.08224980533123016, + "learning_rate": 0.000199952137272263, + "loss": 0.1844, + "step": 344 + }, + { + "epoch": 0.327557559933539, + "grad_norm": 0.04331446811556816, + "learning_rate": 0.0001999486577543972, + "loss": 0.133, + "step": 345 + }, + { + "epoch": 0.32850700213624495, + "grad_norm": 0.049314577132463455, + "learning_rate": 0.00019994505619070068, + "loss": 0.1351, + "step": 346 + }, + { + "epoch": 0.3294564443389509, + "grad_norm": 0.0697011798620224, + "learning_rate": 0.00019994133258557117, + "loss": 0.1709, + "step": 347 + }, + { + "epoch": 0.33040588654165676, + "grad_norm": 0.0510990135371685, + "learning_rate": 0.00019993748694355557, + "loss": 0.1365, + "step": 348 + }, + { + "epoch": 0.3313553287443627, + "grad_norm": 0.05100785568356514, + "learning_rate": 0.00019993351926934967, + "loss": 0.1302, + "step": 349 + }, + { + "epoch": 0.3323047709470686, + "grad_norm": 0.08001980185508728, + "learning_rate": 0.00019992942956779838, + "loss": 0.1736, + "step": 350 + }, + { + "epoch": 0.3332542131497745, + "grad_norm": 0.05298507958650589, + "learning_rate": 0.00019992521784389559, + "loss": 0.159, + "step": 351 + }, + { + "epoch": 0.3342036553524804, + "grad_norm": 0.04655485600233078, + "learning_rate": 0.00019992088410278414, + "loss": 0.1401, + "step": 352 + }, + { + "epoch": 0.33515309755518635, + "grad_norm": 0.047509439289569855, + "learning_rate": 0.00019991642834975594, + "loss": 0.1369, + "step": 353 + }, + { + "epoch": 0.3361025397578922, + "grad_norm": 0.046006906777620316, + "learning_rate": 0.0001999118505902518, + "loss": 0.1384, + "step": 354 + }, + { + "epoch": 0.33705198196059816, + "grad_norm": 0.07522892951965332, + "learning_rate": 0.00019990715082986155, + "loss": 0.2254, + "step": 355 + }, + { + "epoch": 0.33800142416330403, + "grad_norm": 0.048646144568920135, + "learning_rate": 0.00019990232907432404, + "loss": 0.1355, + "step": 356 + }, + { + "epoch": 0.33895086636600996, + "grad_norm": 0.03941798582673073, + "learning_rate": 0.000199897385329527, + "loss": 0.1242, + "step": 357 + }, + { + "epoch": 0.3399003085687159, + "grad_norm": 0.04582727700471878, + "learning_rate": 0.0001998923196015072, + "loss": 0.1347, + "step": 358 + }, + { + "epoch": 0.34084975077142177, + "grad_norm": 0.05890033766627312, + "learning_rate": 0.00019988713189645027, + "loss": 0.1356, + "step": 359 + }, + { + "epoch": 0.3417991929741277, + "grad_norm": 0.050398606806993484, + "learning_rate": 0.00019988182222069093, + "loss": 0.1379, + "step": 360 + }, + { + "epoch": 0.3427486351768336, + "grad_norm": 0.053657352924346924, + "learning_rate": 0.00019987639058071267, + "loss": 0.1417, + "step": 361 + }, + { + "epoch": 0.3436980773795395, + "grad_norm": 0.04928993433713913, + "learning_rate": 0.00019987083698314804, + "loss": 0.1269, + "step": 362 + }, + { + "epoch": 0.34464751958224543, + "grad_norm": 0.04932550713419914, + "learning_rate": 0.0001998651614347784, + "loss": 0.1429, + "step": 363 + }, + { + "epoch": 0.34559696178495136, + "grad_norm": 0.0531768873333931, + "learning_rate": 0.00019985936394253413, + "loss": 0.1367, + "step": 364 + }, + { + "epoch": 0.34654640398765724, + "grad_norm": 0.05342009291052818, + "learning_rate": 0.00019985344451349443, + "loss": 0.1365, + "step": 365 + }, + { + "epoch": 0.34749584619036317, + "grad_norm": 0.04960772022604942, + "learning_rate": 0.00019984740315488742, + "loss": 0.133, + "step": 366 + }, + { + "epoch": 0.3484452883930691, + "grad_norm": 0.04490765556693077, + "learning_rate": 0.00019984123987409013, + "loss": 0.1347, + "step": 367 + }, + { + "epoch": 0.34939473059577497, + "grad_norm": 0.05546121671795845, + "learning_rate": 0.0001998349546786285, + "loss": 0.169, + "step": 368 + }, + { + "epoch": 0.3503441727984809, + "grad_norm": 0.04962169751524925, + "learning_rate": 0.0001998285475761772, + "loss": 0.1325, + "step": 369 + }, + { + "epoch": 0.3512936150011868, + "grad_norm": 0.0451858825981617, + "learning_rate": 0.00019982201857455988, + "loss": 0.1291, + "step": 370 + }, + { + "epoch": 0.3522430572038927, + "grad_norm": 0.07738906145095825, + "learning_rate": 0.00019981536768174903, + "loss": 0.1841, + "step": 371 + }, + { + "epoch": 0.35319249940659864, + "grad_norm": 0.05104148015379906, + "learning_rate": 0.000199808594905866, + "loss": 0.1375, + "step": 372 + }, + { + "epoch": 0.3541419416093045, + "grad_norm": 0.04850155860185623, + "learning_rate": 0.00019980170025518082, + "loss": 0.1335, + "step": 373 + }, + { + "epoch": 0.35509138381201044, + "grad_norm": 0.050271324813365936, + "learning_rate": 0.00019979468373811248, + "loss": 0.1394, + "step": 374 + }, + { + "epoch": 0.35604082601471637, + "grad_norm": 0.050799645483493805, + "learning_rate": 0.0001997875453632288, + "loss": 0.135, + "step": 375 + }, + { + "epoch": 0.35699026821742225, + "grad_norm": 0.05703526735305786, + "learning_rate": 0.00019978028513924627, + "loss": 0.1371, + "step": 376 + }, + { + "epoch": 0.3579397104201282, + "grad_norm": 0.06665853410959244, + "learning_rate": 0.00019977290307503028, + "loss": 0.1837, + "step": 377 + }, + { + "epoch": 0.3588891526228341, + "grad_norm": 0.04639972746372223, + "learning_rate": 0.000199765399179595, + "loss": 0.1315, + "step": 378 + }, + { + "epoch": 0.35983859482554, + "grad_norm": 0.07625308632850647, + "learning_rate": 0.00019975777346210326, + "loss": 0.2064, + "step": 379 + }, + { + "epoch": 0.3607880370282459, + "grad_norm": 0.048770248889923096, + "learning_rate": 0.00019975002593186674, + "loss": 0.1363, + "step": 380 + }, + { + "epoch": 0.36173747923095184, + "grad_norm": 0.04932136833667755, + "learning_rate": 0.00019974215659834582, + "loss": 0.1374, + "step": 381 + }, + { + "epoch": 0.3626869214336577, + "grad_norm": 0.03848756104707718, + "learning_rate": 0.00019973416547114964, + "loss": 0.1333, + "step": 382 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.04468891769647598, + "learning_rate": 0.00019972605256003605, + "loss": 0.129, + "step": 383 + }, + { + "epoch": 0.3645858058390695, + "grad_norm": 0.048413511365652084, + "learning_rate": 0.0001997178178749116, + "loss": 0.1314, + "step": 384 + }, + { + "epoch": 0.36553524804177545, + "grad_norm": 0.045054856687784195, + "learning_rate": 0.00019970946142583155, + "loss": 0.1323, + "step": 385 + }, + { + "epoch": 0.3664846902444814, + "grad_norm": 0.05541200935840607, + "learning_rate": 0.00019970098322299982, + "loss": 0.1342, + "step": 386 + }, + { + "epoch": 0.36743413244718726, + "grad_norm": 0.06861472874879837, + "learning_rate": 0.00019969238327676906, + "loss": 0.1347, + "step": 387 + }, + { + "epoch": 0.3683835746498932, + "grad_norm": 0.043996453285217285, + "learning_rate": 0.00019968366159764047, + "loss": 0.132, + "step": 388 + }, + { + "epoch": 0.3693330168525991, + "grad_norm": 0.06562239676713943, + "learning_rate": 0.000199674818196264, + "loss": 0.1759, + "step": 389 + }, + { + "epoch": 0.370282459055305, + "grad_norm": 0.04714899882674217, + "learning_rate": 0.00019966585308343822, + "loss": 0.1274, + "step": 390 + }, + { + "epoch": 0.3712319012580109, + "grad_norm": 0.04736959934234619, + "learning_rate": 0.00019965676627011026, + "loss": 0.1265, + "step": 391 + }, + { + "epoch": 0.37218134346071685, + "grad_norm": 0.056829433888196945, + "learning_rate": 0.0001996475577673759, + "loss": 0.1402, + "step": 392 + }, + { + "epoch": 0.3731307856634227, + "grad_norm": 0.0426231250166893, + "learning_rate": 0.00019963822758647953, + "loss": 0.1364, + "step": 393 + }, + { + "epoch": 0.37408022786612866, + "grad_norm": 0.07376877963542938, + "learning_rate": 0.00019962877573881404, + "loss": 0.2042, + "step": 394 + }, + { + "epoch": 0.3750296700688346, + "grad_norm": 0.043273668736219406, + "learning_rate": 0.00019961920223592104, + "loss": 0.132, + "step": 395 + }, + { + "epoch": 0.37597911227154046, + "grad_norm": 0.044406965374946594, + "learning_rate": 0.00019960950708949052, + "loss": 0.1344, + "step": 396 + }, + { + "epoch": 0.3769285544742464, + "grad_norm": 0.040342606604099274, + "learning_rate": 0.00019959969031136106, + "loss": 0.1214, + "step": 397 + }, + { + "epoch": 0.37787799667695227, + "grad_norm": 0.05118388682603836, + "learning_rate": 0.00019958975191351983, + "loss": 0.14, + "step": 398 + }, + { + "epoch": 0.3788274388796582, + "grad_norm": 0.045876793563365936, + "learning_rate": 0.00019957969190810245, + "loss": 0.1335, + "step": 399 + }, + { + "epoch": 0.3797768810823641, + "grad_norm": 0.0645332932472229, + "learning_rate": 0.00019956951030739308, + "loss": 0.1702, + "step": 400 + }, + { + "epoch": 0.38072632328507, + "grad_norm": 0.05039132386445999, + "learning_rate": 0.00019955920712382423, + "loss": 0.136, + "step": 401 + }, + { + "epoch": 0.38167576548777593, + "grad_norm": 0.052004653960466385, + "learning_rate": 0.00019954878236997704, + "loss": 0.1386, + "step": 402 + }, + { + "epoch": 0.38262520769048186, + "grad_norm": 0.05021458491683006, + "learning_rate": 0.00019953823605858105, + "loss": 0.1378, + "step": 403 + }, + { + "epoch": 0.38357464989318774, + "grad_norm": 0.058653559535741806, + "learning_rate": 0.0001995275682025141, + "loss": 0.1437, + "step": 404 + }, + { + "epoch": 0.38452409209589367, + "grad_norm": 0.04466673359274864, + "learning_rate": 0.00019951677881480264, + "loss": 0.1334, + "step": 405 + }, + { + "epoch": 0.3854735342985996, + "grad_norm": 0.06119415909051895, + "learning_rate": 0.00019950586790862138, + "loss": 0.1296, + "step": 406 + }, + { + "epoch": 0.38642297650130547, + "grad_norm": 0.04749077931046486, + "learning_rate": 0.0001994948354972935, + "loss": 0.1341, + "step": 407 + }, + { + "epoch": 0.3873724187040114, + "grad_norm": 0.037752799689769745, + "learning_rate": 0.00019948368159429053, + "loss": 0.134, + "step": 408 + }, + { + "epoch": 0.38832186090671733, + "grad_norm": 0.08903038501739502, + "learning_rate": 0.00019947240621323226, + "loss": 0.2155, + "step": 409 + }, + { + "epoch": 0.3892713031094232, + "grad_norm": 0.03878140076994896, + "learning_rate": 0.00019946100936788698, + "loss": 0.1176, + "step": 410 + }, + { + "epoch": 0.39022074531212914, + "grad_norm": 0.04927309602499008, + "learning_rate": 0.00019944949107217113, + "loss": 0.1344, + "step": 411 + }, + { + "epoch": 0.391170187514835, + "grad_norm": 0.04933890327811241, + "learning_rate": 0.00019943785134014962, + "loss": 0.1315, + "step": 412 + }, + { + "epoch": 0.39211962971754094, + "grad_norm": 0.06702516227960587, + "learning_rate": 0.0001994260901860355, + "loss": 0.1826, + "step": 413 + }, + { + "epoch": 0.39306907192024687, + "grad_norm": 0.048132237046957016, + "learning_rate": 0.00019941420762419014, + "loss": 0.1436, + "step": 414 + }, + { + "epoch": 0.39401851412295275, + "grad_norm": 0.07756894826889038, + "learning_rate": 0.00019940220366912318, + "loss": 0.2162, + "step": 415 + }, + { + "epoch": 0.3949679563256587, + "grad_norm": 0.04789011925458908, + "learning_rate": 0.00019939007833549242, + "loss": 0.1295, + "step": 416 + }, + { + "epoch": 0.3959173985283646, + "grad_norm": 0.04369444027543068, + "learning_rate": 0.000199377831638104, + "loss": 0.1322, + "step": 417 + }, + { + "epoch": 0.3968668407310705, + "grad_norm": 0.05376122146844864, + "learning_rate": 0.00019936546359191216, + "loss": 0.1743, + "step": 418 + }, + { + "epoch": 0.3978162829337764, + "grad_norm": 0.045930229127407074, + "learning_rate": 0.0001993529742120193, + "loss": 0.1336, + "step": 419 + }, + { + "epoch": 0.39876572513648234, + "grad_norm": 0.039980966597795486, + "learning_rate": 0.00019934036351367606, + "loss": 0.1349, + "step": 420 + }, + { + "epoch": 0.3997151673391882, + "grad_norm": 0.03797341510653496, + "learning_rate": 0.00019932763151228115, + "loss": 0.1256, + "step": 421 + }, + { + "epoch": 0.40066460954189415, + "grad_norm": 0.04779914394021034, + "learning_rate": 0.00019931477822338146, + "loss": 0.1411, + "step": 422 + }, + { + "epoch": 0.4016140517446, + "grad_norm": 0.040458668023347855, + "learning_rate": 0.00019930180366267193, + "loss": 0.126, + "step": 423 + }, + { + "epoch": 0.40256349394730595, + "grad_norm": 0.04114462807774544, + "learning_rate": 0.0001992887078459956, + "loss": 0.127, + "step": 424 + }, + { + "epoch": 0.4035129361500119, + "grad_norm": 0.048119012266397476, + "learning_rate": 0.00019927549078934358, + "loss": 0.1346, + "step": 425 + }, + { + "epoch": 0.40446237835271776, + "grad_norm": 0.0545562319457531, + "learning_rate": 0.00019926215250885504, + "loss": 0.1387, + "step": 426 + }, + { + "epoch": 0.4054118205554237, + "grad_norm": 0.052092909812927246, + "learning_rate": 0.00019924869302081715, + "loss": 0.1389, + "step": 427 + }, + { + "epoch": 0.4063612627581296, + "grad_norm": 0.03847799077630043, + "learning_rate": 0.0001992351123416651, + "loss": 0.1234, + "step": 428 + }, + { + "epoch": 0.4073107049608355, + "grad_norm": 0.0436912477016449, + "learning_rate": 0.000199221410487982, + "loss": 0.1362, + "step": 429 + }, + { + "epoch": 0.4082601471635414, + "grad_norm": 0.04420888423919678, + "learning_rate": 0.00019920758747649908, + "loss": 0.1243, + "step": 430 + }, + { + "epoch": 0.40920958936624735, + "grad_norm": 0.037297070026397705, + "learning_rate": 0.00019919364332409535, + "loss": 0.1331, + "step": 431 + }, + { + "epoch": 0.4101590315689532, + "grad_norm": 0.03854360058903694, + "learning_rate": 0.00019917957804779782, + "loss": 0.1266, + "step": 432 + }, + { + "epoch": 0.41110847377165916, + "grad_norm": 0.04071418195962906, + "learning_rate": 0.00019916539166478137, + "loss": 0.1292, + "step": 433 + }, + { + "epoch": 0.4120579159743651, + "grad_norm": 0.04560808837413788, + "learning_rate": 0.00019915108419236882, + "loss": 0.1381, + "step": 434 + }, + { + "epoch": 0.41300735817707096, + "grad_norm": 0.06313233822584152, + "learning_rate": 0.00019913665564803078, + "loss": 0.2031, + "step": 435 + }, + { + "epoch": 0.4139568003797769, + "grad_norm": 0.04507524147629738, + "learning_rate": 0.00019912210604938578, + "loss": 0.1277, + "step": 436 + }, + { + "epoch": 0.41490624258248276, + "grad_norm": 0.05048058554530144, + "learning_rate": 0.00019910743541420007, + "loss": 0.1315, + "step": 437 + }, + { + "epoch": 0.4158556847851887, + "grad_norm": 0.04872648045420647, + "learning_rate": 0.0001990926437603878, + "loss": 0.1292, + "step": 438 + }, + { + "epoch": 0.4168051269878946, + "grad_norm": 0.04400710016489029, + "learning_rate": 0.00019907773110601075, + "loss": 0.1236, + "step": 439 + }, + { + "epoch": 0.4177545691906005, + "grad_norm": 0.051591627299785614, + "learning_rate": 0.00019906269746927863, + "loss": 0.1358, + "step": 440 + }, + { + "epoch": 0.41870401139330643, + "grad_norm": 0.04288725182414055, + "learning_rate": 0.00019904754286854877, + "loss": 0.126, + "step": 441 + }, + { + "epoch": 0.41965345359601236, + "grad_norm": 0.04984726384282112, + "learning_rate": 0.00019903226732232622, + "loss": 0.1326, + "step": 442 + }, + { + "epoch": 0.42060289579871823, + "grad_norm": 0.041585132479667664, + "learning_rate": 0.00019901687084926373, + "loss": 0.136, + "step": 443 + }, + { + "epoch": 0.42155233800142417, + "grad_norm": 0.05849035084247589, + "learning_rate": 0.0001990013534681617, + "loss": 0.1727, + "step": 444 + }, + { + "epoch": 0.4225017802041301, + "grad_norm": 0.043387994170188904, + "learning_rate": 0.00019898571519796817, + "loss": 0.1393, + "step": 445 + }, + { + "epoch": 0.42345122240683597, + "grad_norm": 0.05867496132850647, + "learning_rate": 0.0001989699560577788, + "loss": 0.1664, + "step": 446 + }, + { + "epoch": 0.4244006646095419, + "grad_norm": 0.07019232958555222, + "learning_rate": 0.00019895407606683685, + "loss": 0.1653, + "step": 447 + }, + { + "epoch": 0.42535010681224783, + "grad_norm": 0.04676515609025955, + "learning_rate": 0.00019893807524453314, + "loss": 0.1368, + "step": 448 + }, + { + "epoch": 0.4262995490149537, + "grad_norm": 0.06640240550041199, + "learning_rate": 0.00019892195361040607, + "loss": 0.2089, + "step": 449 + }, + { + "epoch": 0.42724899121765963, + "grad_norm": 0.044658735394477844, + "learning_rate": 0.00019890571118414148, + "loss": 0.1298, + "step": 450 + }, + { + "epoch": 0.4281984334203655, + "grad_norm": 0.04810122773051262, + "learning_rate": 0.00019888934798557278, + "loss": 0.1288, + "step": 451 + }, + { + "epoch": 0.42914787562307144, + "grad_norm": 0.0425436794757843, + "learning_rate": 0.0001988728640346808, + "loss": 0.1354, + "step": 452 + }, + { + "epoch": 0.43009731782577737, + "grad_norm": 0.04513363912701607, + "learning_rate": 0.0001988562593515939, + "loss": 0.1346, + "step": 453 + }, + { + "epoch": 0.43104676002848324, + "grad_norm": 0.052022870630025864, + "learning_rate": 0.0001988395339565878, + "loss": 0.1302, + "step": 454 + }, + { + "epoch": 0.4319962022311892, + "grad_norm": 0.04852641373872757, + "learning_rate": 0.0001988226878700856, + "loss": 0.1388, + "step": 455 + }, + { + "epoch": 0.4329456444338951, + "grad_norm": 0.04990584775805473, + "learning_rate": 0.00019880572111265785, + "loss": 0.1552, + "step": 456 + }, + { + "epoch": 0.433895086636601, + "grad_norm": 0.052271679043769836, + "learning_rate": 0.00019878863370502238, + "loss": 0.1404, + "step": 457 + }, + { + "epoch": 0.4348445288393069, + "grad_norm": 0.04795520752668381, + "learning_rate": 0.00019877142566804436, + "loss": 0.1341, + "step": 458 + }, + { + "epoch": 0.43579397104201284, + "grad_norm": 0.048165664076805115, + "learning_rate": 0.00019875409702273632, + "loss": 0.1343, + "step": 459 + }, + { + "epoch": 0.4367434132447187, + "grad_norm": 0.04213611036539078, + "learning_rate": 0.000198736647790258, + "loss": 0.1369, + "step": 460 + }, + { + "epoch": 0.43769285544742464, + "grad_norm": 0.05819966271519661, + "learning_rate": 0.00019871907799191632, + "loss": 0.1615, + "step": 461 + }, + { + "epoch": 0.4386422976501306, + "grad_norm": 0.057378821074962616, + "learning_rate": 0.00019870138764916558, + "loss": 0.175, + "step": 462 + }, + { + "epoch": 0.43959173985283645, + "grad_norm": 0.0432853177189827, + "learning_rate": 0.00019868357678360724, + "loss": 0.1371, + "step": 463 + }, + { + "epoch": 0.4405411820555424, + "grad_norm": 0.03890872746706009, + "learning_rate": 0.0001986656454169898, + "loss": 0.1332, + "step": 464 + }, + { + "epoch": 0.44149062425824825, + "grad_norm": 0.04006613418459892, + "learning_rate": 0.00019864759357120896, + "loss": 0.1342, + "step": 465 + }, + { + "epoch": 0.4424400664609542, + "grad_norm": 0.049053166061639786, + "learning_rate": 0.00019862942126830767, + "loss": 0.1756, + "step": 466 + }, + { + "epoch": 0.4433895086636601, + "grad_norm": 0.03966079652309418, + "learning_rate": 0.00019861112853047577, + "loss": 0.1303, + "step": 467 + }, + { + "epoch": 0.444338950866366, + "grad_norm": 0.04506433755159378, + "learning_rate": 0.0001985927153800503, + "loss": 0.136, + "step": 468 + }, + { + "epoch": 0.4452883930690719, + "grad_norm": 0.04392915591597557, + "learning_rate": 0.00019857418183951526, + "loss": 0.1397, + "step": 469 + }, + { + "epoch": 0.44623783527177785, + "grad_norm": 0.038007620722055435, + "learning_rate": 0.0001985555279315017, + "loss": 0.1246, + "step": 470 + }, + { + "epoch": 0.4471872774744837, + "grad_norm": 0.048948097974061966, + "learning_rate": 0.00019853675367878764, + "loss": 0.1329, + "step": 471 + }, + { + "epoch": 0.44813671967718965, + "grad_norm": 0.04174380376935005, + "learning_rate": 0.00019851785910429806, + "loss": 0.13, + "step": 472 + }, + { + "epoch": 0.4490861618798956, + "grad_norm": 0.048575468361377716, + "learning_rate": 0.00019849884423110478, + "loss": 0.1385, + "step": 473 + }, + { + "epoch": 0.45003560408260146, + "grad_norm": 0.05167670175433159, + "learning_rate": 0.00019847970908242664, + "loss": 0.1684, + "step": 474 + }, + { + "epoch": 0.4509850462853074, + "grad_norm": 0.06849198788404465, + "learning_rate": 0.00019846045368162923, + "loss": 0.1795, + "step": 475 + }, + { + "epoch": 0.4519344884880133, + "grad_norm": 0.044273603707551956, + "learning_rate": 0.0001984410780522251, + "loss": 0.1246, + "step": 476 + }, + { + "epoch": 0.4528839306907192, + "grad_norm": 0.048194363713264465, + "learning_rate": 0.00019842158221787353, + "loss": 0.1366, + "step": 477 + }, + { + "epoch": 0.4538333728934251, + "grad_norm": 0.033906418830156326, + "learning_rate": 0.00019840196620238057, + "loss": 0.1235, + "step": 478 + }, + { + "epoch": 0.454782815096131, + "grad_norm": 0.043933141976594925, + "learning_rate": 0.00019838223002969905, + "loss": 0.1195, + "step": 479 + }, + { + "epoch": 0.45573225729883693, + "grad_norm": 0.056823644787073135, + "learning_rate": 0.00019836237372392854, + "loss": 0.1757, + "step": 480 + }, + { + "epoch": 0.45668169950154286, + "grad_norm": 0.07587820291519165, + "learning_rate": 0.00019834239730931526, + "loss": 0.1784, + "step": 481 + }, + { + "epoch": 0.45763114170424873, + "grad_norm": 0.04008018970489502, + "learning_rate": 0.0001983223008102521, + "loss": 0.1306, + "step": 482 + }, + { + "epoch": 0.45858058390695466, + "grad_norm": 0.05180038511753082, + "learning_rate": 0.00019830208425127867, + "loss": 0.1485, + "step": 483 + }, + { + "epoch": 0.4595300261096606, + "grad_norm": 0.0691617876291275, + "learning_rate": 0.00019828174765708104, + "loss": 0.1249, + "step": 484 + }, + { + "epoch": 0.46047946831236647, + "grad_norm": 0.0565367266535759, + "learning_rate": 0.00019826129105249195, + "loss": 0.1744, + "step": 485 + }, + { + "epoch": 0.4614289105150724, + "grad_norm": 0.044927019625902176, + "learning_rate": 0.00019824071446249072, + "loss": 0.1341, + "step": 486 + }, + { + "epoch": 0.46237835271777833, + "grad_norm": 0.04481721669435501, + "learning_rate": 0.00019822001791220298, + "loss": 0.1354, + "step": 487 + }, + { + "epoch": 0.4633277949204842, + "grad_norm": 0.05233500525355339, + "learning_rate": 0.0001981992014269011, + "loss": 0.1501, + "step": 488 + }, + { + "epoch": 0.46427723712319013, + "grad_norm": 0.044350553303956985, + "learning_rate": 0.00019817826503200372, + "loss": 0.1335, + "step": 489 + }, + { + "epoch": 0.465226679325896, + "grad_norm": 0.03551819548010826, + "learning_rate": 0.000198157208753076, + "loss": 0.1322, + "step": 490 + }, + { + "epoch": 0.46617612152860194, + "grad_norm": 0.04409592226147652, + "learning_rate": 0.00019813603261582943, + "loss": 0.1561, + "step": 491 + }, + { + "epoch": 0.46712556373130787, + "grad_norm": 0.04842127487063408, + "learning_rate": 0.0001981147366461219, + "loss": 0.1296, + "step": 492 + }, + { + "epoch": 0.46807500593401374, + "grad_norm": 0.04349881038069725, + "learning_rate": 0.00019809332086995757, + "loss": 0.1319, + "step": 493 + }, + { + "epoch": 0.4690244481367197, + "grad_norm": 0.04413028433918953, + "learning_rate": 0.00019807178531348698, + "loss": 0.1321, + "step": 494 + }, + { + "epoch": 0.4699738903394256, + "grad_norm": 0.03972313553094864, + "learning_rate": 0.00019805013000300683, + "loss": 0.1358, + "step": 495 + }, + { + "epoch": 0.4709233325421315, + "grad_norm": 0.052269116044044495, + "learning_rate": 0.00019802835496496012, + "loss": 0.1389, + "step": 496 + }, + { + "epoch": 0.4718727747448374, + "grad_norm": 0.0379653237760067, + "learning_rate": 0.00019800646022593603, + "loss": 0.1283, + "step": 497 + }, + { + "epoch": 0.47282221694754334, + "grad_norm": 0.04370688647031784, + "learning_rate": 0.0001979844458126699, + "loss": 0.1278, + "step": 498 + }, + { + "epoch": 0.4737716591502492, + "grad_norm": 0.03912369906902313, + "learning_rate": 0.0001979623117520432, + "loss": 0.1257, + "step": 499 + }, + { + "epoch": 0.47472110135295514, + "grad_norm": 0.039594005793333054, + "learning_rate": 0.00019794005807108352, + "loss": 0.1375, + "step": 500 + }, + { + "epoch": 0.4756705435556611, + "grad_norm": 0.03889892250299454, + "learning_rate": 0.00019791768479696448, + "loss": 0.13, + "step": 501 + }, + { + "epoch": 0.47661998575836695, + "grad_norm": 0.03966660797595978, + "learning_rate": 0.00019789519195700578, + "loss": 0.1268, + "step": 502 + }, + { + "epoch": 0.4775694279610729, + "grad_norm": 0.04501716047525406, + "learning_rate": 0.00019787257957867306, + "loss": 0.1423, + "step": 503 + }, + { + "epoch": 0.47851887016377875, + "grad_norm": 0.06255436688661575, + "learning_rate": 0.000197849847689578, + "loss": 0.1799, + "step": 504 + }, + { + "epoch": 0.4794683123664847, + "grad_norm": 0.050308458507061005, + "learning_rate": 0.00019782699631747813, + "loss": 0.1733, + "step": 505 + }, + { + "epoch": 0.4804177545691906, + "grad_norm": 0.0357963964343071, + "learning_rate": 0.00019780402549027698, + "loss": 0.1268, + "step": 506 + }, + { + "epoch": 0.4813671967718965, + "grad_norm": 0.03651968017220497, + "learning_rate": 0.00019778093523602384, + "loss": 0.1267, + "step": 507 + }, + { + "epoch": 0.4823166389746024, + "grad_norm": 0.043042074888944626, + "learning_rate": 0.0001977577255829139, + "loss": 0.1256, + "step": 508 + }, + { + "epoch": 0.48326608117730835, + "grad_norm": 0.07031014561653137, + "learning_rate": 0.00019773439655928815, + "loss": 0.1796, + "step": 509 + }, + { + "epoch": 0.4842155233800142, + "grad_norm": 0.04429268836975098, + "learning_rate": 0.00019771094819363326, + "loss": 0.1298, + "step": 510 + }, + { + "epoch": 0.48516496558272015, + "grad_norm": 0.0373898483812809, + "learning_rate": 0.00019768738051458172, + "loss": 0.1232, + "step": 511 + }, + { + "epoch": 0.4861144077854261, + "grad_norm": 0.05853155627846718, + "learning_rate": 0.00019766369355091166, + "loss": 0.1694, + "step": 512 + }, + { + "epoch": 0.48706384998813196, + "grad_norm": 0.05050895735621452, + "learning_rate": 0.00019763988733154686, + "loss": 0.1665, + "step": 513 + }, + { + "epoch": 0.4880132921908379, + "grad_norm": 0.04074448347091675, + "learning_rate": 0.0001976159618855568, + "loss": 0.1336, + "step": 514 + }, + { + "epoch": 0.4889627343935438, + "grad_norm": 0.03826110064983368, + "learning_rate": 0.00019759191724215644, + "loss": 0.132, + "step": 515 + }, + { + "epoch": 0.4899121765962497, + "grad_norm": 0.04392875358462334, + "learning_rate": 0.0001975677534307064, + "loss": 0.1204, + "step": 516 + }, + { + "epoch": 0.4908616187989556, + "grad_norm": 0.04615531116724014, + "learning_rate": 0.0001975434704807127, + "loss": 0.1358, + "step": 517 + }, + { + "epoch": 0.4918110610016615, + "grad_norm": 0.053060565143823624, + "learning_rate": 0.00019751906842182688, + "loss": 0.1299, + "step": 518 + }, + { + "epoch": 0.49276050320436743, + "grad_norm": 0.04905511438846588, + "learning_rate": 0.00019749454728384594, + "loss": 0.1284, + "step": 519 + }, + { + "epoch": 0.49370994540707336, + "grad_norm": 0.04257996007800102, + "learning_rate": 0.00019746990709671234, + "loss": 0.1353, + "step": 520 + }, + { + "epoch": 0.49465938760977923, + "grad_norm": 0.05581909418106079, + "learning_rate": 0.0001974451478905138, + "loss": 0.1594, + "step": 521 + }, + { + "epoch": 0.49560882981248516, + "grad_norm": 0.04603990167379379, + "learning_rate": 0.00019742026969548338, + "loss": 0.1383, + "step": 522 + }, + { + "epoch": 0.4965582720151911, + "grad_norm": 0.058511972427368164, + "learning_rate": 0.00019739527254199958, + "loss": 0.1725, + "step": 523 + }, + { + "epoch": 0.49750771421789697, + "grad_norm": 0.03875808045268059, + "learning_rate": 0.000197370156460586, + "loss": 0.1405, + "step": 524 + }, + { + "epoch": 0.4984571564206029, + "grad_norm": 0.040860000997781754, + "learning_rate": 0.00019734492148191151, + "loss": 0.139, + "step": 525 + }, + { + "epoch": 0.49940659862330883, + "grad_norm": 0.06110459193587303, + "learning_rate": 0.00019731956763679014, + "loss": 0.223, + "step": 526 + }, + { + "epoch": 0.5003560408260147, + "grad_norm": 0.05238598585128784, + "learning_rate": 0.00019729409495618117, + "loss": 0.1681, + "step": 527 + }, + { + "epoch": 0.5013054830287206, + "grad_norm": 0.05180145800113678, + "learning_rate": 0.00019726850347118885, + "loss": 0.1743, + "step": 528 + }, + { + "epoch": 0.5022549252314266, + "grad_norm": 0.05066410079598427, + "learning_rate": 0.00019724279321306262, + "loss": 0.1634, + "step": 529 + }, + { + "epoch": 0.5032043674341324, + "grad_norm": 0.06856084614992142, + "learning_rate": 0.00019721696421319684, + "loss": 0.1685, + "step": 530 + }, + { + "epoch": 0.5041538096368383, + "grad_norm": 0.045972324907779694, + "learning_rate": 0.00019719101650313096, + "loss": 0.1245, + "step": 531 + }, + { + "epoch": 0.5051032518395443, + "grad_norm": 0.04522623121738434, + "learning_rate": 0.00019716495011454934, + "loss": 0.1367, + "step": 532 + }, + { + "epoch": 0.5060526940422502, + "grad_norm": 0.0780516117811203, + "learning_rate": 0.00019713876507928126, + "loss": 0.1351, + "step": 533 + }, + { + "epoch": 0.507002136244956, + "grad_norm": 0.04264210909605026, + "learning_rate": 0.00019711246142930088, + "loss": 0.1312, + "step": 534 + }, + { + "epoch": 0.507951578447662, + "grad_norm": 0.059501100331544876, + "learning_rate": 0.00019708603919672718, + "loss": 0.1698, + "step": 535 + }, + { + "epoch": 0.5089010206503679, + "grad_norm": 0.060105033218860626, + "learning_rate": 0.00019705949841382396, + "loss": 0.1303, + "step": 536 + }, + { + "epoch": 0.5098504628530738, + "grad_norm": 0.04733967408537865, + "learning_rate": 0.00019703283911299982, + "loss": 0.1245, + "step": 537 + }, + { + "epoch": 0.5107999050557798, + "grad_norm": 0.04254663735628128, + "learning_rate": 0.00019700606132680798, + "loss": 0.1343, + "step": 538 + }, + { + "epoch": 0.5117493472584856, + "grad_norm": 0.06302463263273239, + "learning_rate": 0.00019697916508794645, + "loss": 0.1831, + "step": 539 + }, + { + "epoch": 0.5126987894611915, + "grad_norm": 0.05301344394683838, + "learning_rate": 0.0001969521504292578, + "loss": 0.1316, + "step": 540 + }, + { + "epoch": 0.5136482316638975, + "grad_norm": 0.04151083528995514, + "learning_rate": 0.00019692501738372922, + "loss": 0.1335, + "step": 541 + }, + { + "epoch": 0.5145976738666034, + "grad_norm": 0.05647062510251999, + "learning_rate": 0.00019689776598449257, + "loss": 0.1688, + "step": 542 + }, + { + "epoch": 0.5155471160693093, + "grad_norm": 0.037060294300317764, + "learning_rate": 0.000196870396264824, + "loss": 0.1339, + "step": 543 + }, + { + "epoch": 0.5164965582720152, + "grad_norm": 0.04036247730255127, + "learning_rate": 0.0001968429082581443, + "loss": 0.1361, + "step": 544 + }, + { + "epoch": 0.5174460004747211, + "grad_norm": 0.040889665484428406, + "learning_rate": 0.00019681530199801875, + "loss": 0.1356, + "step": 545 + }, + { + "epoch": 0.518395442677427, + "grad_norm": 0.0538480207324028, + "learning_rate": 0.00019678757751815686, + "loss": 0.1689, + "step": 546 + }, + { + "epoch": 0.519344884880133, + "grad_norm": 0.04074794426560402, + "learning_rate": 0.0001967597348524126, + "loss": 0.1329, + "step": 547 + }, + { + "epoch": 0.5202943270828388, + "grad_norm": 0.03896891698241234, + "learning_rate": 0.00019673177403478428, + "loss": 0.1356, + "step": 548 + }, + { + "epoch": 0.5212437692855447, + "grad_norm": 0.04619259387254715, + "learning_rate": 0.00019670369509941442, + "loss": 0.163, + "step": 549 + }, + { + "epoch": 0.5221932114882507, + "grad_norm": 0.035968657582998276, + "learning_rate": 0.00019667549808058976, + "loss": 0.1242, + "step": 550 + }, + { + "epoch": 0.5231426536909566, + "grad_norm": 0.04564007744193077, + "learning_rate": 0.0001966471830127413, + "loss": 0.1364, + "step": 551 + }, + { + "epoch": 0.5240920958936625, + "grad_norm": 0.03991610184311867, + "learning_rate": 0.00019661874993044415, + "loss": 0.1312, + "step": 552 + }, + { + "epoch": 0.5250415380963683, + "grad_norm": 0.037240512669086456, + "learning_rate": 0.00019659019886841752, + "loss": 0.1279, + "step": 553 + }, + { + "epoch": 0.5259909802990743, + "grad_norm": 0.06598762422800064, + "learning_rate": 0.00019656152986152468, + "loss": 0.2165, + "step": 554 + }, + { + "epoch": 0.5269404225017802, + "grad_norm": 0.03867746889591217, + "learning_rate": 0.00019653274294477292, + "loss": 0.1233, + "step": 555 + }, + { + "epoch": 0.5278898647044861, + "grad_norm": 0.051915477961301804, + "learning_rate": 0.00019650383815331357, + "loss": 0.168, + "step": 556 + }, + { + "epoch": 0.528839306907192, + "grad_norm": 0.054896485060453415, + "learning_rate": 0.00019647481552244182, + "loss": 0.1678, + "step": 557 + }, + { + "epoch": 0.5297887491098979, + "grad_norm": 0.05439051240682602, + "learning_rate": 0.00019644567508759675, + "loss": 0.1607, + "step": 558 + }, + { + "epoch": 0.5307381913126038, + "grad_norm": 0.03601578250527382, + "learning_rate": 0.00019641641688436135, + "loss": 0.1271, + "step": 559 + }, + { + "epoch": 0.5316876335153098, + "grad_norm": 0.06025104597210884, + "learning_rate": 0.00019638704094846236, + "loss": 0.176, + "step": 560 + }, + { + "epoch": 0.5326370757180157, + "grad_norm": 0.04126368835568428, + "learning_rate": 0.00019635754731577032, + "loss": 0.1319, + "step": 561 + }, + { + "epoch": 0.5335865179207215, + "grad_norm": 0.05305393040180206, + "learning_rate": 0.00019632793602229943, + "loss": 0.1699, + "step": 562 + }, + { + "epoch": 0.5345359601234275, + "grad_norm": 0.03538331016898155, + "learning_rate": 0.00019629820710420764, + "loss": 0.124, + "step": 563 + }, + { + "epoch": 0.5354854023261334, + "grad_norm": 0.05861300975084305, + "learning_rate": 0.0001962683605977965, + "loss": 0.1688, + "step": 564 + }, + { + "epoch": 0.5364348445288393, + "grad_norm": 0.040226079523563385, + "learning_rate": 0.0001962383965395111, + "loss": 0.1334, + "step": 565 + }, + { + "epoch": 0.5373842867315453, + "grad_norm": 0.035788875073194504, + "learning_rate": 0.00019620831496594017, + "loss": 0.1281, + "step": 566 + }, + { + "epoch": 0.5383337289342511, + "grad_norm": 0.0334162712097168, + "learning_rate": 0.0001961781159138158, + "loss": 0.1317, + "step": 567 + }, + { + "epoch": 0.539283171136957, + "grad_norm": 0.03352081775665283, + "learning_rate": 0.00019614779942001364, + "loss": 0.1334, + "step": 568 + }, + { + "epoch": 0.540232613339663, + "grad_norm": 0.03684060648083687, + "learning_rate": 0.00019611736552155274, + "loss": 0.1349, + "step": 569 + }, + { + "epoch": 0.5411820555423689, + "grad_norm": 0.03640671446919441, + "learning_rate": 0.00019608681425559542, + "loss": 0.1278, + "step": 570 + }, + { + "epoch": 0.5421314977450747, + "grad_norm": 0.04167250171303749, + "learning_rate": 0.00019605614565944748, + "loss": 0.1384, + "step": 571 + }, + { + "epoch": 0.5430809399477807, + "grad_norm": 0.0416824147105217, + "learning_rate": 0.00019602535977055778, + "loss": 0.1319, + "step": 572 + }, + { + "epoch": 0.5440303821504866, + "grad_norm": 0.03897137567400932, + "learning_rate": 0.00019599445662651861, + "loss": 0.1389, + "step": 573 + }, + { + "epoch": 0.5449798243531925, + "grad_norm": 0.03894896060228348, + "learning_rate": 0.00019596343626506526, + "loss": 0.1341, + "step": 574 + }, + { + "epoch": 0.5459292665558985, + "grad_norm": 0.04211690276861191, + "learning_rate": 0.00019593229872407627, + "loss": 0.1377, + "step": 575 + }, + { + "epoch": 0.5468787087586043, + "grad_norm": 0.04308454692363739, + "learning_rate": 0.00019590104404157327, + "loss": 0.1268, + "step": 576 + }, + { + "epoch": 0.5478281509613102, + "grad_norm": 0.0525001622736454, + "learning_rate": 0.00019586967225572086, + "loss": 0.1775, + "step": 577 + }, + { + "epoch": 0.5487775931640161, + "grad_norm": 0.056315965950489044, + "learning_rate": 0.00019583818340482664, + "loss": 0.1688, + "step": 578 + }, + { + "epoch": 0.5497270353667221, + "grad_norm": 0.03801283985376358, + "learning_rate": 0.0001958065775273412, + "loss": 0.1309, + "step": 579 + }, + { + "epoch": 0.550676477569428, + "grad_norm": 0.03738854080438614, + "learning_rate": 0.00019577485466185804, + "loss": 0.137, + "step": 580 + }, + { + "epoch": 0.5516259197721338, + "grad_norm": 0.03772661089897156, + "learning_rate": 0.0001957430148471134, + "loss": 0.1276, + "step": 581 + }, + { + "epoch": 0.5525753619748398, + "grad_norm": 0.039842378348112106, + "learning_rate": 0.00019571105812198652, + "loss": 0.1329, + "step": 582 + }, + { + "epoch": 0.5535248041775457, + "grad_norm": 0.033689334988594055, + "learning_rate": 0.0001956789845254992, + "loss": 0.1265, + "step": 583 + }, + { + "epoch": 0.5544742463802516, + "grad_norm": 0.046588387340307236, + "learning_rate": 0.00019564679409681608, + "loss": 0.1645, + "step": 584 + }, + { + "epoch": 0.5554236885829575, + "grad_norm": 0.03861064463853836, + "learning_rate": 0.0001956144868752444, + "loss": 0.1267, + "step": 585 + }, + { + "epoch": 0.5563731307856634, + "grad_norm": 0.03467525169253349, + "learning_rate": 0.000195582062900234, + "loss": 0.1299, + "step": 586 + }, + { + "epoch": 0.5573225729883693, + "grad_norm": 0.03659389913082123, + "learning_rate": 0.0001955495222113774, + "loss": 0.1286, + "step": 587 + }, + { + "epoch": 0.5582720151910753, + "grad_norm": 0.03826770931482315, + "learning_rate": 0.0001955168648484095, + "loss": 0.1313, + "step": 588 + }, + { + "epoch": 0.5592214573937812, + "grad_norm": 0.038110729306936264, + "learning_rate": 0.00019548409085120772, + "loss": 0.137, + "step": 589 + }, + { + "epoch": 0.560170899596487, + "grad_norm": 0.03989555314183235, + "learning_rate": 0.0001954512002597919, + "loss": 0.132, + "step": 590 + }, + { + "epoch": 0.561120341799193, + "grad_norm": 0.05395180359482765, + "learning_rate": 0.00019541819311432427, + "loss": 0.1401, + "step": 591 + }, + { + "epoch": 0.5620697840018989, + "grad_norm": 0.05007918179035187, + "learning_rate": 0.00019538506945510938, + "loss": 0.1584, + "step": 592 + }, + { + "epoch": 0.5630192262046048, + "grad_norm": 0.047849785536527634, + "learning_rate": 0.00019535182932259404, + "loss": 0.1265, + "step": 593 + }, + { + "epoch": 0.5639686684073107, + "grad_norm": 0.04303041473031044, + "learning_rate": 0.00019531847275736726, + "loss": 0.1245, + "step": 594 + }, + { + "epoch": 0.5649181106100166, + "grad_norm": 0.04128289222717285, + "learning_rate": 0.00019528499980016025, + "loss": 0.1317, + "step": 595 + }, + { + "epoch": 0.5658675528127225, + "grad_norm": 0.04311414808034897, + "learning_rate": 0.00019525141049184637, + "loss": 0.1364, + "step": 596 + }, + { + "epoch": 0.5668169950154285, + "grad_norm": 0.03765838220715523, + "learning_rate": 0.00019521770487344103, + "loss": 0.1268, + "step": 597 + }, + { + "epoch": 0.5677664372181344, + "grad_norm": 0.03674585744738579, + "learning_rate": 0.00019518388298610164, + "loss": 0.1297, + "step": 598 + }, + { + "epoch": 0.5687158794208402, + "grad_norm": 0.036937762051820755, + "learning_rate": 0.0001951499448711276, + "loss": 0.1303, + "step": 599 + }, + { + "epoch": 0.5696653216235462, + "grad_norm": 0.03748161345720291, + "learning_rate": 0.0001951158905699603, + "loss": 0.1328, + "step": 600 + }, + { + "epoch": 0.5706147638262521, + "grad_norm": 0.04011257737874985, + "learning_rate": 0.00019508172012418283, + "loss": 0.1346, + "step": 601 + }, + { + "epoch": 0.571564206028958, + "grad_norm": 0.03853931650519371, + "learning_rate": 0.00019504743357552035, + "loss": 0.1279, + "step": 602 + }, + { + "epoch": 0.572513648231664, + "grad_norm": 0.03750459849834442, + "learning_rate": 0.0001950130309658396, + "loss": 0.1227, + "step": 603 + }, + { + "epoch": 0.5734630904343698, + "grad_norm": 0.05542079731822014, + "learning_rate": 0.00019497851233714908, + "loss": 0.1647, + "step": 604 + }, + { + "epoch": 0.5744125326370757, + "grad_norm": 0.04472218081355095, + "learning_rate": 0.00019494387773159898, + "loss": 0.1416, + "step": 605 + }, + { + "epoch": 0.5753619748397816, + "grad_norm": 0.052323974668979645, + "learning_rate": 0.00019490912719148114, + "loss": 0.1367, + "step": 606 + }, + { + "epoch": 0.5763114170424876, + "grad_norm": 0.037580832839012146, + "learning_rate": 0.00019487426075922893, + "loss": 0.131, + "step": 607 + }, + { + "epoch": 0.5772608592451934, + "grad_norm": 0.03929577395319939, + "learning_rate": 0.0001948392784774172, + "loss": 0.128, + "step": 608 + }, + { + "epoch": 0.5782103014478993, + "grad_norm": 0.03706606104969978, + "learning_rate": 0.0001948041803887623, + "loss": 0.1316, + "step": 609 + }, + { + "epoch": 0.5791597436506053, + "grad_norm": 0.038938358426094055, + "learning_rate": 0.00019476896653612203, + "loss": 0.1275, + "step": 610 + }, + { + "epoch": 0.5801091858533112, + "grad_norm": 0.04818068817257881, + "learning_rate": 0.00019473363696249546, + "loss": 0.1662, + "step": 611 + }, + { + "epoch": 0.581058628056017, + "grad_norm": 0.03735940158367157, + "learning_rate": 0.00019469819171102304, + "loss": 0.1361, + "step": 612 + }, + { + "epoch": 0.582008070258723, + "grad_norm": 0.03568827733397484, + "learning_rate": 0.00019466263082498645, + "loss": 0.1216, + "step": 613 + }, + { + "epoch": 0.5829575124614289, + "grad_norm": 0.03913251310586929, + "learning_rate": 0.0001946269543478085, + "loss": 0.1321, + "step": 614 + }, + { + "epoch": 0.5839069546641348, + "grad_norm": 0.062009479850530624, + "learning_rate": 0.0001945911623230533, + "loss": 0.1778, + "step": 615 + }, + { + "epoch": 0.5848563968668408, + "grad_norm": 0.039088111370801926, + "learning_rate": 0.0001945552547944259, + "loss": 0.1352, + "step": 616 + }, + { + "epoch": 0.5858058390695466, + "grad_norm": 0.041976600885391235, + "learning_rate": 0.0001945192318057725, + "loss": 0.1394, + "step": 617 + }, + { + "epoch": 0.5867552812722525, + "grad_norm": 0.03723563253879547, + "learning_rate": 0.00019448309340108018, + "loss": 0.1246, + "step": 618 + }, + { + "epoch": 0.5877047234749585, + "grad_norm": 0.0382399819791317, + "learning_rate": 0.00019444683962447707, + "loss": 0.1232, + "step": 619 + }, + { + "epoch": 0.5886541656776644, + "grad_norm": 0.03758077695965767, + "learning_rate": 0.0001944104705202321, + "loss": 0.1417, + "step": 620 + }, + { + "epoch": 0.5896036078803703, + "grad_norm": 0.034823786467313766, + "learning_rate": 0.000194373986132755, + "loss": 0.1304, + "step": 621 + }, + { + "epoch": 0.5905530500830762, + "grad_norm": 0.03755120187997818, + "learning_rate": 0.00019433738650659641, + "loss": 0.133, + "step": 622 + }, + { + "epoch": 0.5915024922857821, + "grad_norm": 0.03759913146495819, + "learning_rate": 0.00019430067168644754, + "loss": 0.1222, + "step": 623 + }, + { + "epoch": 0.592451934488488, + "grad_norm": 0.06232694163918495, + "learning_rate": 0.0001942638417171403, + "loss": 0.1778, + "step": 624 + }, + { + "epoch": 0.593401376691194, + "grad_norm": 0.05642306059598923, + "learning_rate": 0.00019422689664364725, + "loss": 0.1706, + "step": 625 + }, + { + "epoch": 0.5943508188938998, + "grad_norm": 0.0827709287405014, + "learning_rate": 0.00019418983651108148, + "loss": 0.2371, + "step": 626 + }, + { + "epoch": 0.5953002610966057, + "grad_norm": 0.03614366054534912, + "learning_rate": 0.00019415266136469652, + "loss": 0.1225, + "step": 627 + }, + { + "epoch": 0.5962497032993117, + "grad_norm": 0.042416494339704514, + "learning_rate": 0.00019411537124988643, + "loss": 0.1239, + "step": 628 + }, + { + "epoch": 0.5971991455020176, + "grad_norm": 0.037246908992528915, + "learning_rate": 0.00019407796621218566, + "loss": 0.1292, + "step": 629 + }, + { + "epoch": 0.5981485877047235, + "grad_norm": 0.05374092981219292, + "learning_rate": 0.00019404044629726887, + "loss": 0.1782, + "step": 630 + }, + { + "epoch": 0.5990980299074293, + "grad_norm": 0.052854426205158234, + "learning_rate": 0.00019400281155095112, + "loss": 0.1711, + "step": 631 + }, + { + "epoch": 0.6000474721101353, + "grad_norm": 0.038800131529569626, + "learning_rate": 0.00019396506201918765, + "loss": 0.1285, + "step": 632 + }, + { + "epoch": 0.6009969143128412, + "grad_norm": 0.040118250995874405, + "learning_rate": 0.0001939271977480738, + "loss": 0.1335, + "step": 633 + }, + { + "epoch": 0.6019463565155471, + "grad_norm": 0.07007341086864471, + "learning_rate": 0.00019388921878384517, + "loss": 0.2115, + "step": 634 + }, + { + "epoch": 0.602895798718253, + "grad_norm": 0.03245210647583008, + "learning_rate": 0.0001938511251728772, + "loss": 0.1304, + "step": 635 + }, + { + "epoch": 0.6038452409209589, + "grad_norm": 0.03384733200073242, + "learning_rate": 0.00019381291696168553, + "loss": 0.1297, + "step": 636 + }, + { + "epoch": 0.6047946831236648, + "grad_norm": 0.04325825348496437, + "learning_rate": 0.0001937745941969256, + "loss": 0.1337, + "step": 637 + }, + { + "epoch": 0.6057441253263708, + "grad_norm": 0.046986173838377, + "learning_rate": 0.00019373615692539275, + "loss": 0.1385, + "step": 638 + }, + { + "epoch": 0.6066935675290767, + "grad_norm": 0.03726234659552574, + "learning_rate": 0.0001936976051940222, + "loss": 0.1393, + "step": 639 + }, + { + "epoch": 0.6076430097317825, + "grad_norm": 0.05574486404657364, + "learning_rate": 0.0001936589390498889, + "loss": 0.1698, + "step": 640 + }, + { + "epoch": 0.6085924519344885, + "grad_norm": 0.052818477153778076, + "learning_rate": 0.0001936201585402075, + "loss": 0.1722, + "step": 641 + }, + { + "epoch": 0.6095418941371944, + "grad_norm": 0.03535636141896248, + "learning_rate": 0.00019358126371233231, + "loss": 0.129, + "step": 642 + }, + { + "epoch": 0.6104913363399003, + "grad_norm": 0.03453061729669571, + "learning_rate": 0.00019354225461375724, + "loss": 0.1313, + "step": 643 + }, + { + "epoch": 0.6114407785426063, + "grad_norm": 0.030467770993709564, + "learning_rate": 0.0001935031312921157, + "loss": 0.1241, + "step": 644 + }, + { + "epoch": 0.6123902207453121, + "grad_norm": 0.03996508568525314, + "learning_rate": 0.0001934638937951806, + "loss": 0.1265, + "step": 645 + }, + { + "epoch": 0.613339662948018, + "grad_norm": 0.034416794776916504, + "learning_rate": 0.00019342454217086429, + "loss": 0.1341, + "step": 646 + }, + { + "epoch": 0.614289105150724, + "grad_norm": 0.03674698621034622, + "learning_rate": 0.00019338507646721845, + "loss": 0.1399, + "step": 647 + }, + { + "epoch": 0.6152385473534299, + "grad_norm": 0.037850040942430496, + "learning_rate": 0.0001933454967324341, + "loss": 0.1295, + "step": 648 + }, + { + "epoch": 0.6161879895561357, + "grad_norm": 0.037829235196113586, + "learning_rate": 0.0001933058030148414, + "loss": 0.1302, + "step": 649 + }, + { + "epoch": 0.6171374317588417, + "grad_norm": 0.03579702973365784, + "learning_rate": 0.00019326599536290983, + "loss": 0.1352, + "step": 650 + }, + { + "epoch": 0.6180868739615476, + "grad_norm": 0.052539851516485214, + "learning_rate": 0.00019322607382524785, + "loss": 0.1744, + "step": 651 + }, + { + "epoch": 0.6190363161642535, + "grad_norm": 0.03814668953418732, + "learning_rate": 0.0001931860384506031, + "loss": 0.1303, + "step": 652 + }, + { + "epoch": 0.6199857583669595, + "grad_norm": 0.03730069473385811, + "learning_rate": 0.00019314588928786224, + "loss": 0.1236, + "step": 653 + }, + { + "epoch": 0.6209352005696653, + "grad_norm": 0.04081875458359718, + "learning_rate": 0.00019310562638605078, + "loss": 0.1328, + "step": 654 + }, + { + "epoch": 0.6218846427723712, + "grad_norm": 0.03532617911696434, + "learning_rate": 0.00019306524979433308, + "loss": 0.1238, + "step": 655 + }, + { + "epoch": 0.6228340849750772, + "grad_norm": 0.035857025533914566, + "learning_rate": 0.00019302475956201254, + "loss": 0.1244, + "step": 656 + }, + { + "epoch": 0.6237835271777831, + "grad_norm": 0.036031339317560196, + "learning_rate": 0.0001929841557385311, + "loss": 0.1251, + "step": 657 + }, + { + "epoch": 0.624732969380489, + "grad_norm": 0.037832874804735184, + "learning_rate": 0.00019294343837346944, + "loss": 0.1262, + "step": 658 + }, + { + "epoch": 0.6256824115831948, + "grad_norm": 0.03651989623904228, + "learning_rate": 0.00019290260751654706, + "loss": 0.1239, + "step": 659 + }, + { + "epoch": 0.6266318537859008, + "grad_norm": 0.03595907241106033, + "learning_rate": 0.00019286166321762184, + "loss": 0.1342, + "step": 660 + }, + { + "epoch": 0.6275812959886067, + "grad_norm": 0.04714696854352951, + "learning_rate": 0.00019282060552669025, + "loss": 0.1712, + "step": 661 + }, + { + "epoch": 0.6285307381913126, + "grad_norm": 0.0448799654841423, + "learning_rate": 0.00019277943449388726, + "loss": 0.1601, + "step": 662 + }, + { + "epoch": 0.6294801803940185, + "grad_norm": 0.03133920207619667, + "learning_rate": 0.0001927381501694862, + "loss": 0.127, + "step": 663 + }, + { + "epoch": 0.6304296225967244, + "grad_norm": 0.051593225449323654, + "learning_rate": 0.00019269675260389876, + "loss": 0.1659, + "step": 664 + }, + { + "epoch": 0.6313790647994303, + "grad_norm": 0.03713349625468254, + "learning_rate": 0.0001926552418476749, + "loss": 0.1294, + "step": 665 + }, + { + "epoch": 0.6323285070021363, + "grad_norm": 0.03420734405517578, + "learning_rate": 0.00019261361795150275, + "loss": 0.1376, + "step": 666 + }, + { + "epoch": 0.6332779492048421, + "grad_norm": 0.04476429522037506, + "learning_rate": 0.00019257188096620867, + "loss": 0.1595, + "step": 667 + }, + { + "epoch": 0.634227391407548, + "grad_norm": 0.05289504677057266, + "learning_rate": 0.00019253003094275707, + "loss": 0.1589, + "step": 668 + }, + { + "epoch": 0.635176833610254, + "grad_norm": 0.042022526264190674, + "learning_rate": 0.0001924880679322504, + "loss": 0.1316, + "step": 669 + }, + { + "epoch": 0.6361262758129599, + "grad_norm": 0.0408223457634449, + "learning_rate": 0.00019244599198592907, + "loss": 0.1386, + "step": 670 + }, + { + "epoch": 0.6370757180156658, + "grad_norm": 0.03941584751009941, + "learning_rate": 0.00019240380315517142, + "loss": 0.1325, + "step": 671 + }, + { + "epoch": 0.6380251602183717, + "grad_norm": 0.03860325738787651, + "learning_rate": 0.00019236150149149357, + "loss": 0.1215, + "step": 672 + }, + { + "epoch": 0.6389746024210776, + "grad_norm": 0.0342581607401371, + "learning_rate": 0.00019231908704654948, + "loss": 0.1247, + "step": 673 + }, + { + "epoch": 0.6399240446237835, + "grad_norm": 0.04099750518798828, + "learning_rate": 0.00019227655987213077, + "loss": 0.1335, + "step": 674 + }, + { + "epoch": 0.6408734868264895, + "grad_norm": 0.031005796045064926, + "learning_rate": 0.00019223392002016678, + "loss": 0.1297, + "step": 675 + }, + { + "epoch": 0.6418229290291954, + "grad_norm": 0.05248212069272995, + "learning_rate": 0.0001921911675427244, + "loss": 0.1737, + "step": 676 + }, + { + "epoch": 0.6427723712319012, + "grad_norm": 0.04168983921408653, + "learning_rate": 0.00019214830249200806, + "loss": 0.1373, + "step": 677 + }, + { + "epoch": 0.6437218134346072, + "grad_norm": 0.03659060224890709, + "learning_rate": 0.0001921053249203596, + "loss": 0.1263, + "step": 678 + }, + { + "epoch": 0.6446712556373131, + "grad_norm": 0.042256928980350494, + "learning_rate": 0.00019206223488025834, + "loss": 0.1646, + "step": 679 + }, + { + "epoch": 0.645620697840019, + "grad_norm": 0.04420709237456322, + "learning_rate": 0.00019201903242432086, + "loss": 0.1577, + "step": 680 + }, + { + "epoch": 0.646570140042725, + "grad_norm": 0.03781798109412193, + "learning_rate": 0.00019197571760530107, + "loss": 0.1253, + "step": 681 + }, + { + "epoch": 0.6475195822454308, + "grad_norm": 0.03728644549846649, + "learning_rate": 0.00019193229047609003, + "loss": 0.1423, + "step": 682 + }, + { + "epoch": 0.6484690244481367, + "grad_norm": 0.05171523615717888, + "learning_rate": 0.00019188875108971598, + "loss": 0.177, + "step": 683 + }, + { + "epoch": 0.6494184666508427, + "grad_norm": 0.05022161453962326, + "learning_rate": 0.0001918450994993442, + "loss": 0.1616, + "step": 684 + }, + { + "epoch": 0.6503679088535486, + "grad_norm": 0.037774864584207535, + "learning_rate": 0.00019180133575827707, + "loss": 0.1257, + "step": 685 + }, + { + "epoch": 0.6513173510562544, + "grad_norm": 0.056198425590991974, + "learning_rate": 0.00019175745991995377, + "loss": 0.1751, + "step": 686 + }, + { + "epoch": 0.6522667932589603, + "grad_norm": 0.05259314179420471, + "learning_rate": 0.0001917134720379505, + "loss": 0.1655, + "step": 687 + }, + { + "epoch": 0.6532162354616663, + "grad_norm": 0.04018954187631607, + "learning_rate": 0.00019166937216598013, + "loss": 0.1178, + "step": 688 + }, + { + "epoch": 0.6541656776643722, + "grad_norm": 0.057170454412698746, + "learning_rate": 0.00019162516035789247, + "loss": 0.1744, + "step": 689 + }, + { + "epoch": 0.655115119867078, + "grad_norm": 0.04647281393408775, + "learning_rate": 0.00019158083666767381, + "loss": 0.1343, + "step": 690 + }, + { + "epoch": 0.656064562069784, + "grad_norm": 0.056390274316072464, + "learning_rate": 0.00019153640114944723, + "loss": 0.2029, + "step": 691 + }, + { + "epoch": 0.6570140042724899, + "grad_norm": 0.03656432405114174, + "learning_rate": 0.00019149185385747224, + "loss": 0.1249, + "step": 692 + }, + { + "epoch": 0.6579634464751958, + "grad_norm": 0.031422629952430725, + "learning_rate": 0.0001914471948461449, + "loss": 0.1232, + "step": 693 + }, + { + "epoch": 0.6589128886779018, + "grad_norm": 0.0463186614215374, + "learning_rate": 0.00019140242416999765, + "loss": 0.1675, + "step": 694 + }, + { + "epoch": 0.6598623308806076, + "grad_norm": 0.03907819464802742, + "learning_rate": 0.0001913575418836993, + "loss": 0.1307, + "step": 695 + }, + { + "epoch": 0.6608117730833135, + "grad_norm": 0.04354274645447731, + "learning_rate": 0.00019131254804205498, + "loss": 0.1381, + "step": 696 + }, + { + "epoch": 0.6617612152860195, + "grad_norm": 0.0355788990855217, + "learning_rate": 0.00019126744270000598, + "loss": 0.1273, + "step": 697 + }, + { + "epoch": 0.6627106574887254, + "grad_norm": 0.0382835678756237, + "learning_rate": 0.0001912222259126298, + "loss": 0.1184, + "step": 698 + }, + { + "epoch": 0.6636600996914312, + "grad_norm": 0.05007009580731392, + "learning_rate": 0.00019117689773513993, + "loss": 0.1751, + "step": 699 + }, + { + "epoch": 0.6646095418941372, + "grad_norm": 0.05426732823252678, + "learning_rate": 0.000191131458222886, + "loss": 0.175, + "step": 700 + }, + { + "epoch": 0.6655589840968431, + "grad_norm": 0.033966466784477234, + "learning_rate": 0.00019108590743135352, + "loss": 0.123, + "step": 701 + }, + { + "epoch": 0.666508426299549, + "grad_norm": 0.04007060080766678, + "learning_rate": 0.00019104024541616386, + "loss": 0.1386, + "step": 702 + }, + { + "epoch": 0.667457868502255, + "grad_norm": 0.05075724050402641, + "learning_rate": 0.00019099447223307423, + "loss": 0.1698, + "step": 703 + }, + { + "epoch": 0.6684073107049608, + "grad_norm": 0.04677930474281311, + "learning_rate": 0.00019094858793797757, + "loss": 0.1633, + "step": 704 + }, + { + "epoch": 0.6693567529076667, + "grad_norm": 0.04063379392027855, + "learning_rate": 0.00019090259258690263, + "loss": 0.1414, + "step": 705 + }, + { + "epoch": 0.6703061951103727, + "grad_norm": 0.039291396737098694, + "learning_rate": 0.00019085648623601352, + "loss": 0.1273, + "step": 706 + }, + { + "epoch": 0.6712556373130786, + "grad_norm": 0.04960642755031586, + "learning_rate": 0.00019081026894161008, + "loss": 0.1512, + "step": 707 + }, + { + "epoch": 0.6722050795157845, + "grad_norm": 0.04266348108649254, + "learning_rate": 0.00019076394076012756, + "loss": 0.1352, + "step": 708 + }, + { + "epoch": 0.6731545217184904, + "grad_norm": 0.03943296894431114, + "learning_rate": 0.00019071750174813663, + "loss": 0.1332, + "step": 709 + }, + { + "epoch": 0.6741039639211963, + "grad_norm": 0.04927997291088104, + "learning_rate": 0.0001906709519623433, + "loss": 0.1645, + "step": 710 + }, + { + "epoch": 0.6750534061239022, + "grad_norm": 0.0418451763689518, + "learning_rate": 0.00019062429145958877, + "loss": 0.1279, + "step": 711 + }, + { + "epoch": 0.6760028483266081, + "grad_norm": 0.04283139482140541, + "learning_rate": 0.0001905775202968495, + "loss": 0.1388, + "step": 712 + }, + { + "epoch": 0.676952290529314, + "grad_norm": 0.05674710497260094, + "learning_rate": 0.00019053063853123714, + "loss": 0.171, + "step": 713 + }, + { + "epoch": 0.6779017327320199, + "grad_norm": 0.03568726405501366, + "learning_rate": 0.00019048364621999825, + "loss": 0.1329, + "step": 714 + }, + { + "epoch": 0.6788511749347258, + "grad_norm": 0.03796301409602165, + "learning_rate": 0.00019043654342051447, + "loss": 0.1352, + "step": 715 + }, + { + "epoch": 0.6798006171374318, + "grad_norm": 0.03538963943719864, + "learning_rate": 0.00019038933019030233, + "loss": 0.1328, + "step": 716 + }, + { + "epoch": 0.6807500593401377, + "grad_norm": 0.05234035104513168, + "learning_rate": 0.00019034200658701322, + "loss": 0.1649, + "step": 717 + }, + { + "epoch": 0.6816995015428435, + "grad_norm": 0.03719701990485191, + "learning_rate": 0.00019029457266843327, + "loss": 0.1295, + "step": 718 + }, + { + "epoch": 0.6826489437455495, + "grad_norm": 0.03594352304935455, + "learning_rate": 0.00019024702849248335, + "loss": 0.128, + "step": 719 + }, + { + "epoch": 0.6835983859482554, + "grad_norm": 0.04097168892621994, + "learning_rate": 0.00019019937411721895, + "loss": 0.1331, + "step": 720 + }, + { + "epoch": 0.6845478281509613, + "grad_norm": 0.03943239524960518, + "learning_rate": 0.00019015160960083013, + "loss": 0.1337, + "step": 721 + }, + { + "epoch": 0.6854972703536673, + "grad_norm": 0.0411958172917366, + "learning_rate": 0.00019010373500164145, + "loss": 0.1603, + "step": 722 + }, + { + "epoch": 0.6864467125563731, + "grad_norm": 0.05295250564813614, + "learning_rate": 0.00019005575037811184, + "loss": 0.1644, + "step": 723 + }, + { + "epoch": 0.687396154759079, + "grad_norm": 0.03916552662849426, + "learning_rate": 0.00019000765578883465, + "loss": 0.135, + "step": 724 + }, + { + "epoch": 0.688345596961785, + "grad_norm": 0.03871094062924385, + "learning_rate": 0.00018995945129253745, + "loss": 0.1276, + "step": 725 + }, + { + "epoch": 0.6892950391644909, + "grad_norm": 0.03405594825744629, + "learning_rate": 0.00018991113694808204, + "loss": 0.1327, + "step": 726 + }, + { + "epoch": 0.6902444813671967, + "grad_norm": 0.03824371099472046, + "learning_rate": 0.00018986271281446436, + "loss": 0.1357, + "step": 727 + }, + { + "epoch": 0.6911939235699027, + "grad_norm": 0.03813684731721878, + "learning_rate": 0.0001898141789508144, + "loss": 0.1341, + "step": 728 + }, + { + "epoch": 0.6921433657726086, + "grad_norm": 0.03283112868666649, + "learning_rate": 0.0001897655354163962, + "loss": 0.1299, + "step": 729 + }, + { + "epoch": 0.6930928079753145, + "grad_norm": 0.03226768597960472, + "learning_rate": 0.00018971678227060757, + "loss": 0.1272, + "step": 730 + }, + { + "epoch": 0.6940422501780205, + "grad_norm": 0.037317484617233276, + "learning_rate": 0.0001896679195729803, + "loss": 0.1339, + "step": 731 + }, + { + "epoch": 0.6949916923807263, + "grad_norm": 0.05428892746567726, + "learning_rate": 0.0001896189473831799, + "loss": 0.1667, + "step": 732 + }, + { + "epoch": 0.6959411345834322, + "grad_norm": 0.04177982360124588, + "learning_rate": 0.0001895698657610056, + "loss": 0.1337, + "step": 733 + }, + { + "epoch": 0.6968905767861382, + "grad_norm": 0.041572730988264084, + "learning_rate": 0.00018952067476639024, + "loss": 0.1332, + "step": 734 + }, + { + "epoch": 0.6978400189888441, + "grad_norm": 0.03430505469441414, + "learning_rate": 0.00018947137445940023, + "loss": 0.1265, + "step": 735 + }, + { + "epoch": 0.6987894611915499, + "grad_norm": 0.03863980621099472, + "learning_rate": 0.00018942196490023542, + "loss": 0.1337, + "step": 736 + }, + { + "epoch": 0.6997389033942559, + "grad_norm": 0.06445252895355225, + "learning_rate": 0.00018937244614922912, + "loss": 0.2032, + "step": 737 + }, + { + "epoch": 0.7006883455969618, + "grad_norm": 0.03358490392565727, + "learning_rate": 0.00018932281826684793, + "loss": 0.127, + "step": 738 + }, + { + "epoch": 0.7016377877996677, + "grad_norm": 0.034341324120759964, + "learning_rate": 0.00018927308131369173, + "loss": 0.1303, + "step": 739 + }, + { + "epoch": 0.7025872300023736, + "grad_norm": 0.035848621279001236, + "learning_rate": 0.00018922323535049354, + "loss": 0.1272, + "step": 740 + }, + { + "epoch": 0.7035366722050795, + "grad_norm": 0.03865866735577583, + "learning_rate": 0.0001891732804381196, + "loss": 0.136, + "step": 741 + }, + { + "epoch": 0.7044861144077854, + "grad_norm": 0.045944251120090485, + "learning_rate": 0.0001891232166375691, + "loss": 0.1741, + "step": 742 + }, + { + "epoch": 0.7054355566104913, + "grad_norm": 0.04418769106268883, + "learning_rate": 0.00018907304400997418, + "loss": 0.1504, + "step": 743 + }, + { + "epoch": 0.7063849988131973, + "grad_norm": 0.062257930636405945, + "learning_rate": 0.0001890227626165999, + "loss": 0.1786, + "step": 744 + }, + { + "epoch": 0.7073344410159031, + "grad_norm": 0.037457846105098724, + "learning_rate": 0.00018897237251884415, + "loss": 0.1389, + "step": 745 + }, + { + "epoch": 0.708283883218609, + "grad_norm": 0.039091672748327255, + "learning_rate": 0.0001889218737782375, + "loss": 0.1264, + "step": 746 + }, + { + "epoch": 0.709233325421315, + "grad_norm": 0.035011596977710724, + "learning_rate": 0.00018887126645644324, + "loss": 0.1363, + "step": 747 + }, + { + "epoch": 0.7101827676240209, + "grad_norm": 0.104104183614254, + "learning_rate": 0.00018882055061525722, + "loss": 0.1588, + "step": 748 + }, + { + "epoch": 0.7111322098267268, + "grad_norm": 0.03222833201289177, + "learning_rate": 0.0001887697263166078, + "loss": 0.1259, + "step": 749 + }, + { + "epoch": 0.7120816520294327, + "grad_norm": 0.049904145300388336, + "learning_rate": 0.0001887187936225558, + "loss": 0.1676, + "step": 750 + }, + { + "epoch": 0.7130310942321386, + "grad_norm": 0.15150390565395355, + "learning_rate": 0.00018866775259529435, + "loss": 0.1369, + "step": 751 + }, + { + "epoch": 0.7139805364348445, + "grad_norm": 0.03994397446513176, + "learning_rate": 0.0001886166032971489, + "loss": 0.1294, + "step": 752 + }, + { + "epoch": 0.7149299786375505, + "grad_norm": 0.06274881213903427, + "learning_rate": 0.00018856534579057713, + "loss": 0.1659, + "step": 753 + }, + { + "epoch": 0.7158794208402564, + "grad_norm": 0.04001612216234207, + "learning_rate": 0.00018851398013816883, + "loss": 0.1305, + "step": 754 + }, + { + "epoch": 0.7168288630429622, + "grad_norm": 0.03961142525076866, + "learning_rate": 0.0001884625064026458, + "loss": 0.1265, + "step": 755 + }, + { + "epoch": 0.7177783052456682, + "grad_norm": 0.033916253596544266, + "learning_rate": 0.00018841092464686186, + "loss": 0.1336, + "step": 756 + }, + { + "epoch": 0.7187277474483741, + "grad_norm": 0.040992431342601776, + "learning_rate": 0.00018835923493380278, + "loss": 0.1403, + "step": 757 + }, + { + "epoch": 0.71967718965108, + "grad_norm": 0.03410341590642929, + "learning_rate": 0.00018830743732658608, + "loss": 0.1233, + "step": 758 + }, + { + "epoch": 0.720626631853786, + "grad_norm": 0.05984083190560341, + "learning_rate": 0.000188255531888461, + "loss": 0.1417, + "step": 759 + }, + { + "epoch": 0.7215760740564918, + "grad_norm": 0.03874243050813675, + "learning_rate": 0.00018820351868280858, + "loss": 0.1366, + "step": 760 + }, + { + "epoch": 0.7225255162591977, + "grad_norm": 0.05256400629878044, + "learning_rate": 0.00018815139777314136, + "loss": 0.172, + "step": 761 + }, + { + "epoch": 0.7234749584619037, + "grad_norm": 0.039005253463983536, + "learning_rate": 0.0001880991692231034, + "loss": 0.1312, + "step": 762 + }, + { + "epoch": 0.7244244006646096, + "grad_norm": 0.04029637575149536, + "learning_rate": 0.0001880468330964702, + "loss": 0.1327, + "step": 763 + }, + { + "epoch": 0.7253738428673154, + "grad_norm": 0.04493672773241997, + "learning_rate": 0.00018799438945714866, + "loss": 0.1555, + "step": 764 + }, + { + "epoch": 0.7263232850700213, + "grad_norm": 0.03862634301185608, + "learning_rate": 0.0001879418383691769, + "loss": 0.133, + "step": 765 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.03904002159833908, + "learning_rate": 0.00018788917989672434, + "loss": 0.1259, + "step": 766 + }, + { + "epoch": 0.7282221694754332, + "grad_norm": 0.037936531007289886, + "learning_rate": 0.0001878364141040914, + "loss": 0.1263, + "step": 767 + }, + { + "epoch": 0.729171611678139, + "grad_norm": 0.03802201896905899, + "learning_rate": 0.0001877835410557096, + "loss": 0.134, + "step": 768 + }, + { + "epoch": 0.730121053880845, + "grad_norm": 0.03759211301803589, + "learning_rate": 0.00018773056081614154, + "loss": 0.1383, + "step": 769 + }, + { + "epoch": 0.7310704960835509, + "grad_norm": 0.0498163104057312, + "learning_rate": 0.0001876774734500805, + "loss": 0.1573, + "step": 770 + }, + { + "epoch": 0.7320199382862568, + "grad_norm": 0.036126043647527695, + "learning_rate": 0.00018762427902235072, + "loss": 0.1274, + "step": 771 + }, + { + "epoch": 0.7329693804889628, + "grad_norm": 0.044809550046920776, + "learning_rate": 0.0001875709775979071, + "loss": 0.1703, + "step": 772 + }, + { + "epoch": 0.7339188226916686, + "grad_norm": 0.050454337149858475, + "learning_rate": 0.0001875175692418353, + "loss": 0.1699, + "step": 773 + }, + { + "epoch": 0.7348682648943745, + "grad_norm": 0.06160600110888481, + "learning_rate": 0.00018746405401935142, + "loss": 0.1806, + "step": 774 + }, + { + "epoch": 0.7358177070970805, + "grad_norm": 0.05408332124352455, + "learning_rate": 0.0001874104319958021, + "loss": 0.1681, + "step": 775 + }, + { + "epoch": 0.7367671492997864, + "grad_norm": 0.03859655559062958, + "learning_rate": 0.00018735670323666442, + "loss": 0.1297, + "step": 776 + }, + { + "epoch": 0.7377165915024922, + "grad_norm": 0.05268474668264389, + "learning_rate": 0.00018730286780754577, + "loss": 0.1658, + "step": 777 + }, + { + "epoch": 0.7386660337051982, + "grad_norm": 0.06406822055578232, + "learning_rate": 0.00018724892577418381, + "loss": 0.199, + "step": 778 + }, + { + "epoch": 0.7396154759079041, + "grad_norm": 0.05488892272114754, + "learning_rate": 0.00018719487720244638, + "loss": 0.1669, + "step": 779 + }, + { + "epoch": 0.74056491811061, + "grad_norm": 0.03732241317629814, + "learning_rate": 0.00018714072215833132, + "loss": 0.1337, + "step": 780 + }, + { + "epoch": 0.741514360313316, + "grad_norm": 0.05548230558633804, + "learning_rate": 0.00018708646070796664, + "loss": 0.1652, + "step": 781 + }, + { + "epoch": 0.7424638025160218, + "grad_norm": 0.06930623203516006, + "learning_rate": 0.0001870320929176101, + "loss": 0.1647, + "step": 782 + }, + { + "epoch": 0.7434132447187277, + "grad_norm": 0.05485931411385536, + "learning_rate": 0.0001869776188536495, + "loss": 0.2149, + "step": 783 + }, + { + "epoch": 0.7443626869214337, + "grad_norm": 0.03739183023571968, + "learning_rate": 0.00018692303858260228, + "loss": 0.1257, + "step": 784 + }, + { + "epoch": 0.7453121291241396, + "grad_norm": 0.03913332521915436, + "learning_rate": 0.00018686835217111557, + "loss": 0.1293, + "step": 785 + }, + { + "epoch": 0.7462615713268455, + "grad_norm": 0.03580600768327713, + "learning_rate": 0.0001868135596859662, + "loss": 0.1278, + "step": 786 + }, + { + "epoch": 0.7472110135295514, + "grad_norm": 0.03586685657501221, + "learning_rate": 0.00018675866119406042, + "loss": 0.128, + "step": 787 + }, + { + "epoch": 0.7481604557322573, + "grad_norm": 0.04061829298734665, + "learning_rate": 0.00018670365676243397, + "loss": 0.1256, + "step": 788 + }, + { + "epoch": 0.7491098979349632, + "grad_norm": 0.03580275923013687, + "learning_rate": 0.000186648546458252, + "loss": 0.1265, + "step": 789 + }, + { + "epoch": 0.7500593401376692, + "grad_norm": 0.04277309030294418, + "learning_rate": 0.00018659333034880884, + "loss": 0.1678, + "step": 790 + }, + { + "epoch": 0.751008782340375, + "grad_norm": 0.03997024893760681, + "learning_rate": 0.00018653800850152808, + "loss": 0.1251, + "step": 791 + }, + { + "epoch": 0.7519582245430809, + "grad_norm": 0.03809446841478348, + "learning_rate": 0.0001864825809839624, + "loss": 0.1354, + "step": 792 + }, + { + "epoch": 0.7529076667457868, + "grad_norm": 0.05002079904079437, + "learning_rate": 0.00018642704786379354, + "loss": 0.1492, + "step": 793 + }, + { + "epoch": 0.7538571089484928, + "grad_norm": 0.03734049201011658, + "learning_rate": 0.00018637140920883217, + "loss": 0.1328, + "step": 794 + }, + { + "epoch": 0.7548065511511987, + "grad_norm": 0.034287337213754654, + "learning_rate": 0.00018631566508701784, + "loss": 0.1261, + "step": 795 + }, + { + "epoch": 0.7557559933539045, + "grad_norm": 0.0322953499853611, + "learning_rate": 0.00018625981556641882, + "loss": 0.1251, + "step": 796 + }, + { + "epoch": 0.7567054355566105, + "grad_norm": 0.03397887200117111, + "learning_rate": 0.00018620386071523218, + "loss": 0.1226, + "step": 797 + }, + { + "epoch": 0.7576548777593164, + "grad_norm": 0.048685140907764435, + "learning_rate": 0.0001861478006017836, + "loss": 0.1677, + "step": 798 + }, + { + "epoch": 0.7586043199620223, + "grad_norm": 0.06330600380897522, + "learning_rate": 0.00018609163529452723, + "loss": 0.2012, + "step": 799 + }, + { + "epoch": 0.7595537621647283, + "grad_norm": 0.04262509569525719, + "learning_rate": 0.00018603536486204564, + "loss": 0.1271, + "step": 800 + }, + { + "epoch": 0.7605032043674341, + "grad_norm": 0.04021213576197624, + "learning_rate": 0.00018597898937304988, + "loss": 0.1426, + "step": 801 + }, + { + "epoch": 0.76145264657014, + "grad_norm": 0.05070256441831589, + "learning_rate": 0.0001859225088963792, + "loss": 0.209, + "step": 802 + }, + { + "epoch": 0.762402088772846, + "grad_norm": 0.05344654247164726, + "learning_rate": 0.00018586592350100113, + "loss": 0.2093, + "step": 803 + }, + { + "epoch": 0.7633515309755519, + "grad_norm": 0.03695262596011162, + "learning_rate": 0.0001858092332560112, + "loss": 0.1264, + "step": 804 + }, + { + "epoch": 0.7643009731782577, + "grad_norm": 0.041282836347818375, + "learning_rate": 0.00018575243823063306, + "loss": 0.1275, + "step": 805 + }, + { + "epoch": 0.7652504153809637, + "grad_norm": 0.038663093000650406, + "learning_rate": 0.00018569553849421828, + "loss": 0.1285, + "step": 806 + }, + { + "epoch": 0.7661998575836696, + "grad_norm": 0.05324345454573631, + "learning_rate": 0.00018563853411624628, + "loss": 0.1691, + "step": 807 + }, + { + "epoch": 0.7671492997863755, + "grad_norm": 0.0382021889090538, + "learning_rate": 0.00018558142516632425, + "loss": 0.1299, + "step": 808 + }, + { + "epoch": 0.7680987419890815, + "grad_norm": 0.05059641972184181, + "learning_rate": 0.00018552421171418712, + "loss": 0.1685, + "step": 809 + }, + { + "epoch": 0.7690481841917873, + "grad_norm": 0.041547179222106934, + "learning_rate": 0.00018546689382969737, + "loss": 0.1322, + "step": 810 + }, + { + "epoch": 0.7699976263944932, + "grad_norm": 0.047367729246616364, + "learning_rate": 0.00018540947158284503, + "loss": 0.1662, + "step": 811 + }, + { + "epoch": 0.7709470685971992, + "grad_norm": 0.07076044380664825, + "learning_rate": 0.00018535194504374754, + "loss": 0.1749, + "step": 812 + }, + { + "epoch": 0.7718965107999051, + "grad_norm": 0.05194571986794472, + "learning_rate": 0.00018529431428264973, + "loss": 0.1595, + "step": 813 + }, + { + "epoch": 0.7728459530026109, + "grad_norm": 0.034832440316677094, + "learning_rate": 0.00018523657936992367, + "loss": 0.1279, + "step": 814 + }, + { + "epoch": 0.7737953952053169, + "grad_norm": 0.03709466755390167, + "learning_rate": 0.00018517874037606862, + "loss": 0.1161, + "step": 815 + }, + { + "epoch": 0.7747448374080228, + "grad_norm": 0.03341936320066452, + "learning_rate": 0.00018512079737171086, + "loss": 0.1277, + "step": 816 + }, + { + "epoch": 0.7756942796107287, + "grad_norm": 0.0411679781973362, + "learning_rate": 0.00018506275042760382, + "loss": 0.1284, + "step": 817 + }, + { + "epoch": 0.7766437218134347, + "grad_norm": 0.04416754096746445, + "learning_rate": 0.00018500459961462773, + "loss": 0.1647, + "step": 818 + }, + { + "epoch": 0.7775931640161405, + "grad_norm": 0.03680622950196266, + "learning_rate": 0.00018494634500378966, + "loss": 0.1371, + "step": 819 + }, + { + "epoch": 0.7785426062188464, + "grad_norm": 0.037342917174100876, + "learning_rate": 0.0001848879866662235, + "loss": 0.1308, + "step": 820 + }, + { + "epoch": 0.7794920484215523, + "grad_norm": 0.04237838089466095, + "learning_rate": 0.00018482952467318976, + "loss": 0.1623, + "step": 821 + }, + { + "epoch": 0.7804414906242583, + "grad_norm": 0.04467133805155754, + "learning_rate": 0.00018477095909607546, + "loss": 0.1651, + "step": 822 + }, + { + "epoch": 0.7813909328269641, + "grad_norm": 0.04672664403915405, + "learning_rate": 0.00018471229000639424, + "loss": 0.1735, + "step": 823 + }, + { + "epoch": 0.78234037502967, + "grad_norm": 0.03545104339718819, + "learning_rate": 0.00018465351747578597, + "loss": 0.1342, + "step": 824 + }, + { + "epoch": 0.783289817232376, + "grad_norm": 0.04771837964653969, + "learning_rate": 0.000184594641576017, + "loss": 0.175, + "step": 825 + }, + { + "epoch": 0.7842392594350819, + "grad_norm": 0.03531822934746742, + "learning_rate": 0.00018453566237897976, + "loss": 0.1321, + "step": 826 + }, + { + "epoch": 0.7851887016377878, + "grad_norm": 0.04098953306674957, + "learning_rate": 0.00018447657995669295, + "loss": 0.1372, + "step": 827 + }, + { + "epoch": 0.7861381438404937, + "grad_norm": 0.053972020745277405, + "learning_rate": 0.00018441739438130114, + "loss": 0.1673, + "step": 828 + }, + { + "epoch": 0.7870875860431996, + "grad_norm": 0.03818265721201897, + "learning_rate": 0.00018435810572507507, + "loss": 0.1322, + "step": 829 + }, + { + "epoch": 0.7880370282459055, + "grad_norm": 0.033827316015958786, + "learning_rate": 0.0001842987140604112, + "loss": 0.1445, + "step": 830 + }, + { + "epoch": 0.7889864704486115, + "grad_norm": 0.041385356336832047, + "learning_rate": 0.00018423921945983179, + "loss": 0.1332, + "step": 831 + }, + { + "epoch": 0.7899359126513174, + "grad_norm": 0.03948013484477997, + "learning_rate": 0.00018417962199598483, + "loss": 0.1412, + "step": 832 + }, + { + "epoch": 0.7908853548540232, + "grad_norm": 0.044912584125995636, + "learning_rate": 0.00018411992174164393, + "loss": 0.1684, + "step": 833 + }, + { + "epoch": 0.7918347970567292, + "grad_norm": 0.03675195202231407, + "learning_rate": 0.0001840601187697082, + "loss": 0.1334, + "step": 834 + }, + { + "epoch": 0.7927842392594351, + "grad_norm": 0.0349728949368, + "learning_rate": 0.0001840002131532021, + "loss": 0.1323, + "step": 835 + }, + { + "epoch": 0.793733681462141, + "grad_norm": 0.03763123229146004, + "learning_rate": 0.0001839402049652755, + "loss": 0.1283, + "step": 836 + }, + { + "epoch": 0.794683123664847, + "grad_norm": 0.036798711866140366, + "learning_rate": 0.00018388009427920362, + "loss": 0.1272, + "step": 837 + }, + { + "epoch": 0.7956325658675528, + "grad_norm": 0.036771487444639206, + "learning_rate": 0.00018381988116838663, + "loss": 0.126, + "step": 838 + }, + { + "epoch": 0.7965820080702587, + "grad_norm": 0.060571007430553436, + "learning_rate": 0.00018375956570634987, + "loss": 0.1736, + "step": 839 + }, + { + "epoch": 0.7975314502729647, + "grad_norm": 0.0332857109606266, + "learning_rate": 0.00018369914796674373, + "loss": 0.1301, + "step": 840 + }, + { + "epoch": 0.7984808924756706, + "grad_norm": 0.045279379934072495, + "learning_rate": 0.00018363862802334334, + "loss": 0.1602, + "step": 841 + }, + { + "epoch": 0.7994303346783764, + "grad_norm": 0.03676297515630722, + "learning_rate": 0.00018357800595004877, + "loss": 0.1299, + "step": 842 + }, + { + "epoch": 0.8003797768810824, + "grad_norm": 0.05098710209131241, + "learning_rate": 0.0001835172818208847, + "loss": 0.1289, + "step": 843 + }, + { + "epoch": 0.8013292190837883, + "grad_norm": 0.047296855598688126, + "learning_rate": 0.00018345645571000052, + "loss": 0.1716, + "step": 844 + }, + { + "epoch": 0.8022786612864942, + "grad_norm": 0.03570317476987839, + "learning_rate": 0.00018339552769167003, + "loss": 0.1337, + "step": 845 + }, + { + "epoch": 0.8032281034892, + "grad_norm": 0.03380590304732323, + "learning_rate": 0.00018333449784029156, + "loss": 0.1218, + "step": 846 + }, + { + "epoch": 0.804177545691906, + "grad_norm": 0.0340820774435997, + "learning_rate": 0.00018327336623038778, + "loss": 0.1324, + "step": 847 + }, + { + "epoch": 0.8051269878946119, + "grad_norm": 0.03311248868703842, + "learning_rate": 0.00018321213293660558, + "loss": 0.1308, + "step": 848 + }, + { + "epoch": 0.8060764300973178, + "grad_norm": 0.035102471709251404, + "learning_rate": 0.00018315079803371605, + "loss": 0.1345, + "step": 849 + }, + { + "epoch": 0.8070258723000238, + "grad_norm": 0.03358345478773117, + "learning_rate": 0.0001830893615966143, + "loss": 0.1341, + "step": 850 + }, + { + "epoch": 0.8079753145027296, + "grad_norm": 0.06460444629192352, + "learning_rate": 0.00018302782370031948, + "loss": 0.2051, + "step": 851 + }, + { + "epoch": 0.8089247567054355, + "grad_norm": 0.033203575760126114, + "learning_rate": 0.0001829661844199746, + "loss": 0.1357, + "step": 852 + }, + { + "epoch": 0.8098741989081415, + "grad_norm": 0.03588509559631348, + "learning_rate": 0.0001829044438308465, + "loss": 0.1335, + "step": 853 + }, + { + "epoch": 0.8108236411108474, + "grad_norm": 0.04263895004987717, + "learning_rate": 0.00018284260200832563, + "loss": 0.1739, + "step": 854 + }, + { + "epoch": 0.8117730833135532, + "grad_norm": 0.04004021733999252, + "learning_rate": 0.00018278065902792618, + "loss": 0.131, + "step": 855 + }, + { + "epoch": 0.8127225255162592, + "grad_norm": 0.035174135118722916, + "learning_rate": 0.00018271861496528584, + "loss": 0.1248, + "step": 856 + }, + { + "epoch": 0.8136719677189651, + "grad_norm": 0.03610173240303993, + "learning_rate": 0.00018265646989616566, + "loss": 0.1287, + "step": 857 + }, + { + "epoch": 0.814621409921671, + "grad_norm": 0.035818714648485184, + "learning_rate": 0.00018259422389645008, + "loss": 0.1335, + "step": 858 + }, + { + "epoch": 0.815570852124377, + "grad_norm": 0.03248162940144539, + "learning_rate": 0.00018253187704214672, + "loss": 0.1308, + "step": 859 + }, + { + "epoch": 0.8165202943270828, + "grad_norm": 0.031658854335546494, + "learning_rate": 0.00018246942940938646, + "loss": 0.1339, + "step": 860 + }, + { + "epoch": 0.8174697365297887, + "grad_norm": 0.035879503935575485, + "learning_rate": 0.0001824068810744232, + "loss": 0.1222, + "step": 861 + }, + { + "epoch": 0.8184191787324947, + "grad_norm": 0.05258049815893173, + "learning_rate": 0.0001823442321136337, + "loss": 0.1937, + "step": 862 + }, + { + "epoch": 0.8193686209352006, + "grad_norm": 0.03625549003481865, + "learning_rate": 0.0001822814826035178, + "loss": 0.1268, + "step": 863 + }, + { + "epoch": 0.8203180631379065, + "grad_norm": 0.04990942031145096, + "learning_rate": 0.00018221863262069793, + "loss": 0.1661, + "step": 864 + }, + { + "epoch": 0.8212675053406124, + "grad_norm": 0.0631263256072998, + "learning_rate": 0.00018215568224191927, + "loss": 0.2126, + "step": 865 + }, + { + "epoch": 0.8222169475433183, + "grad_norm": 0.03726550564169884, + "learning_rate": 0.00018209263154404958, + "loss": 0.1334, + "step": 866 + }, + { + "epoch": 0.8231663897460242, + "grad_norm": 0.040383536368608475, + "learning_rate": 0.0001820294806040792, + "loss": 0.1619, + "step": 867 + }, + { + "epoch": 0.8241158319487302, + "grad_norm": 0.03525468334555626, + "learning_rate": 0.00018196622949912078, + "loss": 0.1263, + "step": 868 + }, + { + "epoch": 0.825065274151436, + "grad_norm": 0.03585941344499588, + "learning_rate": 0.00018190287830640933, + "loss": 0.1245, + "step": 869 + }, + { + "epoch": 0.8260147163541419, + "grad_norm": 0.03207286074757576, + "learning_rate": 0.00018183942710330202, + "loss": 0.1262, + "step": 870 + }, + { + "epoch": 0.8269641585568479, + "grad_norm": 0.04638965427875519, + "learning_rate": 0.00018177587596727822, + "loss": 0.1653, + "step": 871 + }, + { + "epoch": 0.8279136007595538, + "grad_norm": 0.030705489218235016, + "learning_rate": 0.00018171222497593922, + "loss": 0.1276, + "step": 872 + }, + { + "epoch": 0.8288630429622597, + "grad_norm": 0.03139735013246536, + "learning_rate": 0.00018164847420700837, + "loss": 0.1344, + "step": 873 + }, + { + "epoch": 0.8298124851649655, + "grad_norm": 0.039802953600883484, + "learning_rate": 0.00018158462373833078, + "loss": 0.1373, + "step": 874 + }, + { + "epoch": 0.8307619273676715, + "grad_norm": 0.03284341096878052, + "learning_rate": 0.00018152067364787325, + "loss": 0.1236, + "step": 875 + }, + { + "epoch": 0.8317113695703774, + "grad_norm": 0.056572429835796356, + "learning_rate": 0.0001814566240137244, + "loss": 0.1665, + "step": 876 + }, + { + "epoch": 0.8326608117730833, + "grad_norm": 0.03471997380256653, + "learning_rate": 0.00018139247491409424, + "loss": 0.13, + "step": 877 + }, + { + "epoch": 0.8336102539757893, + "grad_norm": 0.03601829707622528, + "learning_rate": 0.00018132822642731426, + "loss": 0.127, + "step": 878 + }, + { + "epoch": 0.8345596961784951, + "grad_norm": 0.032708846032619476, + "learning_rate": 0.00018126387863183737, + "loss": 0.1264, + "step": 879 + }, + { + "epoch": 0.835509138381201, + "grad_norm": 0.035340629518032074, + "learning_rate": 0.00018119943160623773, + "loss": 0.1334, + "step": 880 + }, + { + "epoch": 0.836458580583907, + "grad_norm": 0.030397990718483925, + "learning_rate": 0.00018113488542921061, + "loss": 0.1254, + "step": 881 + }, + { + "epoch": 0.8374080227866129, + "grad_norm": 0.03871999308466911, + "learning_rate": 0.00018107024017957244, + "loss": 0.132, + "step": 882 + }, + { + "epoch": 0.8383574649893187, + "grad_norm": 0.04331507533788681, + "learning_rate": 0.00018100549593626052, + "loss": 0.1354, + "step": 883 + }, + { + "epoch": 0.8393069071920247, + "grad_norm": 0.03445984423160553, + "learning_rate": 0.00018094065277833314, + "loss": 0.129, + "step": 884 + }, + { + "epoch": 0.8402563493947306, + "grad_norm": 0.03362146392464638, + "learning_rate": 0.0001808757107849693, + "loss": 0.125, + "step": 885 + }, + { + "epoch": 0.8412057915974365, + "grad_norm": 0.041491370648145676, + "learning_rate": 0.00018081067003546876, + "loss": 0.1314, + "step": 886 + }, + { + "epoch": 0.8421552338001425, + "grad_norm": 0.034560974687337875, + "learning_rate": 0.00018074553060925175, + "loss": 0.126, + "step": 887 + }, + { + "epoch": 0.8431046760028483, + "grad_norm": 0.049931105226278305, + "learning_rate": 0.0001806802925858591, + "loss": 0.1709, + "step": 888 + }, + { + "epoch": 0.8440541182055542, + "grad_norm": 0.035841234028339386, + "learning_rate": 0.00018061495604495195, + "loss": 0.1396, + "step": 889 + }, + { + "epoch": 0.8450035604082602, + "grad_norm": 0.03359563648700714, + "learning_rate": 0.00018054952106631188, + "loss": 0.1323, + "step": 890 + }, + { + "epoch": 0.8459530026109661, + "grad_norm": 0.03390706703066826, + "learning_rate": 0.00018048398772984046, + "loss": 0.1287, + "step": 891 + }, + { + "epoch": 0.8469024448136719, + "grad_norm": 0.0474267303943634, + "learning_rate": 0.00018041835611555957, + "loss": 0.1693, + "step": 892 + }, + { + "epoch": 0.8478518870163779, + "grad_norm": 0.0334562286734581, + "learning_rate": 0.00018035262630361097, + "loss": 0.1295, + "step": 893 + }, + { + "epoch": 0.8488013292190838, + "grad_norm": 0.03383705019950867, + "learning_rate": 0.00018028679837425634, + "loss": 0.1259, + "step": 894 + }, + { + "epoch": 0.8497507714217897, + "grad_norm": 0.03384934738278389, + "learning_rate": 0.00018022087240787728, + "loss": 0.1218, + "step": 895 + }, + { + "epoch": 0.8507002136244957, + "grad_norm": 0.04088185727596283, + "learning_rate": 0.0001801548484849749, + "loss": 0.1343, + "step": 896 + }, + { + "epoch": 0.8516496558272015, + "grad_norm": 0.05273745581507683, + "learning_rate": 0.00018008872668617013, + "loss": 0.1688, + "step": 897 + }, + { + "epoch": 0.8525990980299074, + "grad_norm": 0.03253067284822464, + "learning_rate": 0.00018002250709220325, + "loss": 0.1333, + "step": 898 + }, + { + "epoch": 0.8535485402326133, + "grad_norm": 0.03033488616347313, + "learning_rate": 0.0001799561897839341, + "loss": 0.1292, + "step": 899 + }, + { + "epoch": 0.8544979824353193, + "grad_norm": 0.033945854753255844, + "learning_rate": 0.00017988977484234174, + "loss": 0.1415, + "step": 900 + }, + { + "epoch": 0.8554474246380251, + "grad_norm": 0.04456301033496857, + "learning_rate": 0.0001798232623485244, + "loss": 0.1762, + "step": 901 + }, + { + "epoch": 0.856396866840731, + "grad_norm": 0.03912430256605148, + "learning_rate": 0.00017975665238369962, + "loss": 0.142, + "step": 902 + }, + { + "epoch": 0.857346309043437, + "grad_norm": 0.032741378992795944, + "learning_rate": 0.0001796899450292038, + "loss": 0.1212, + "step": 903 + }, + { + "epoch": 0.8582957512461429, + "grad_norm": 0.047262486070394516, + "learning_rate": 0.0001796231403664923, + "loss": 0.1762, + "step": 904 + }, + { + "epoch": 0.8592451934488488, + "grad_norm": 0.03242664784193039, + "learning_rate": 0.00017955623847713928, + "loss": 0.1323, + "step": 905 + }, + { + "epoch": 0.8601946356515547, + "grad_norm": 0.030855266377329826, + "learning_rate": 0.0001794892394428377, + "loss": 0.1258, + "step": 906 + }, + { + "epoch": 0.8611440778542606, + "grad_norm": 0.03360726311802864, + "learning_rate": 0.00017942214334539907, + "loss": 0.1325, + "step": 907 + }, + { + "epoch": 0.8620935200569665, + "grad_norm": 0.032459285110235214, + "learning_rate": 0.00017935495026675345, + "loss": 0.1267, + "step": 908 + }, + { + "epoch": 0.8630429622596725, + "grad_norm": 0.04160567373037338, + "learning_rate": 0.00017928766028894928, + "loss": 0.1255, + "step": 909 + }, + { + "epoch": 0.8639924044623783, + "grad_norm": 0.03851740434765816, + "learning_rate": 0.0001792202734941534, + "loss": 0.1212, + "step": 910 + }, + { + "epoch": 0.8649418466650842, + "grad_norm": 0.03414515405893326, + "learning_rate": 0.00017915278996465084, + "loss": 0.1239, + "step": 911 + }, + { + "epoch": 0.8658912888677902, + "grad_norm": 0.17817381024360657, + "learning_rate": 0.0001790852097828447, + "loss": 0.1336, + "step": 912 + }, + { + "epoch": 0.8668407310704961, + "grad_norm": 0.03545542433857918, + "learning_rate": 0.0001790175330312562, + "loss": 0.1353, + "step": 913 + }, + { + "epoch": 0.867790173273202, + "grad_norm": 0.03207210451364517, + "learning_rate": 0.00017894975979252436, + "loss": 0.1243, + "step": 914 + }, + { + "epoch": 0.868739615475908, + "grad_norm": 0.046145763248205185, + "learning_rate": 0.0001788818901494061, + "loss": 0.1668, + "step": 915 + }, + { + "epoch": 0.8696890576786138, + "grad_norm": 0.03051767125725746, + "learning_rate": 0.00017881392418477607, + "loss": 0.1311, + "step": 916 + }, + { + "epoch": 0.8706384998813197, + "grad_norm": 0.03918071463704109, + "learning_rate": 0.00017874586198162647, + "loss": 0.1692, + "step": 917 + }, + { + "epoch": 0.8715879420840257, + "grad_norm": 0.03229302540421486, + "learning_rate": 0.0001786777036230671, + "loss": 0.1276, + "step": 918 + }, + { + "epoch": 0.8725373842867316, + "grad_norm": 0.032113853842020035, + "learning_rate": 0.00017860944919232503, + "loss": 0.1256, + "step": 919 + }, + { + "epoch": 0.8734868264894374, + "grad_norm": 0.03725959360599518, + "learning_rate": 0.00017854109877274484, + "loss": 0.1363, + "step": 920 + }, + { + "epoch": 0.8744362686921434, + "grad_norm": 0.02805374562740326, + "learning_rate": 0.00017847265244778817, + "loss": 0.1259, + "step": 921 + }, + { + "epoch": 0.8753857108948493, + "grad_norm": 0.03541216999292374, + "learning_rate": 0.00017840411030103383, + "loss": 0.1288, + "step": 922 + }, + { + "epoch": 0.8763351530975552, + "grad_norm": 0.04267534613609314, + "learning_rate": 0.0001783354724161776, + "loss": 0.1601, + "step": 923 + }, + { + "epoch": 0.8772845953002611, + "grad_norm": 0.04881501942873001, + "learning_rate": 0.00017826673887703223, + "loss": 0.1686, + "step": 924 + }, + { + "epoch": 0.878234037502967, + "grad_norm": 0.0337185375392437, + "learning_rate": 0.00017819790976752718, + "loss": 0.131, + "step": 925 + }, + { + "epoch": 0.8791834797056729, + "grad_norm": 0.033597834408283234, + "learning_rate": 0.00017812898517170872, + "loss": 0.1365, + "step": 926 + }, + { + "epoch": 0.8801329219083788, + "grad_norm": 0.047949645668268204, + "learning_rate": 0.00017805996517373962, + "loss": 0.178, + "step": 927 + }, + { + "epoch": 0.8810823641110848, + "grad_norm": 0.03533579409122467, + "learning_rate": 0.00017799084985789916, + "loss": 0.1281, + "step": 928 + }, + { + "epoch": 0.8820318063137906, + "grad_norm": 0.03638564050197601, + "learning_rate": 0.0001779216393085831, + "loss": 0.136, + "step": 929 + }, + { + "epoch": 0.8829812485164965, + "grad_norm": 0.034585777670145035, + "learning_rate": 0.00017785233361030333, + "loss": 0.1221, + "step": 930 + }, + { + "epoch": 0.8839306907192025, + "grad_norm": 0.03344082459807396, + "learning_rate": 0.00017778293284768807, + "loss": 0.1335, + "step": 931 + }, + { + "epoch": 0.8848801329219084, + "grad_norm": 0.029832901433110237, + "learning_rate": 0.00017771343710548155, + "loss": 0.131, + "step": 932 + }, + { + "epoch": 0.8858295751246142, + "grad_norm": 0.030377686023712158, + "learning_rate": 0.00017764384646854405, + "loss": 0.1216, + "step": 933 + }, + { + "epoch": 0.8867790173273202, + "grad_norm": 0.036345310509204865, + "learning_rate": 0.0001775741610218516, + "loss": 0.1289, + "step": 934 + }, + { + "epoch": 0.8877284595300261, + "grad_norm": 0.04609441012144089, + "learning_rate": 0.00017750438085049606, + "loss": 0.1598, + "step": 935 + }, + { + "epoch": 0.888677901732732, + "grad_norm": 0.03439109027385712, + "learning_rate": 0.00017743450603968506, + "loss": 0.1316, + "step": 936 + }, + { + "epoch": 0.889627343935438, + "grad_norm": 0.07119124382734299, + "learning_rate": 0.0001773645366747416, + "loss": 0.1664, + "step": 937 + }, + { + "epoch": 0.8905767861381438, + "grad_norm": 0.03385334461927414, + "learning_rate": 0.0001772944728411043, + "loss": 0.1294, + "step": 938 + }, + { + "epoch": 0.8915262283408497, + "grad_norm": 0.033481206744909286, + "learning_rate": 0.00017722431462432705, + "loss": 0.1218, + "step": 939 + }, + { + "epoch": 0.8924756705435557, + "grad_norm": 0.03365306556224823, + "learning_rate": 0.00017715406211007902, + "loss": 0.1295, + "step": 940 + }, + { + "epoch": 0.8934251127462616, + "grad_norm": 0.03675035014748573, + "learning_rate": 0.0001770837153841445, + "loss": 0.1237, + "step": 941 + }, + { + "epoch": 0.8943745549489674, + "grad_norm": 0.03245026618242264, + "learning_rate": 0.00017701327453242284, + "loss": 0.1304, + "step": 942 + }, + { + "epoch": 0.8953239971516734, + "grad_norm": 0.03346354141831398, + "learning_rate": 0.00017694273964092837, + "loss": 0.1274, + "step": 943 + }, + { + "epoch": 0.8962734393543793, + "grad_norm": 0.048563096672296524, + "learning_rate": 0.00017687211079579017, + "loss": 0.1719, + "step": 944 + }, + { + "epoch": 0.8972228815570852, + "grad_norm": 0.04709222912788391, + "learning_rate": 0.0001768013880832521, + "loss": 0.1281, + "step": 945 + }, + { + "epoch": 0.8981723237597912, + "grad_norm": 0.030402177944779396, + "learning_rate": 0.00017673057158967254, + "loss": 0.1229, + "step": 946 + }, + { + "epoch": 0.899121765962497, + "grad_norm": 0.03577994927763939, + "learning_rate": 0.00017665966140152458, + "loss": 0.1255, + "step": 947 + }, + { + "epoch": 0.9000712081652029, + "grad_norm": 0.04566454887390137, + "learning_rate": 0.00017658865760539552, + "loss": 0.1617, + "step": 948 + }, + { + "epoch": 0.9010206503679089, + "grad_norm": 0.04077988117933273, + "learning_rate": 0.00017651756028798713, + "loss": 0.1619, + "step": 949 + }, + { + "epoch": 0.9019700925706148, + "grad_norm": 0.045764826238155365, + "learning_rate": 0.00017644636953611522, + "loss": 0.1608, + "step": 950 + }, + { + "epoch": 0.9029195347733207, + "grad_norm": 0.035656195133924484, + "learning_rate": 0.0001763750854367098, + "loss": 0.1288, + "step": 951 + }, + { + "epoch": 0.9038689769760266, + "grad_norm": 0.04220154508948326, + "learning_rate": 0.0001763037080768148, + "loss": 0.1688, + "step": 952 + }, + { + "epoch": 0.9048184191787325, + "grad_norm": 0.03406943380832672, + "learning_rate": 0.0001762322375435881, + "loss": 0.1314, + "step": 953 + }, + { + "epoch": 0.9057678613814384, + "grad_norm": 0.037942539900541306, + "learning_rate": 0.00017616067392430126, + "loss": 0.1342, + "step": 954 + }, + { + "epoch": 0.9067173035841443, + "grad_norm": 0.06412187963724136, + "learning_rate": 0.00017608901730633964, + "loss": 0.2207, + "step": 955 + }, + { + "epoch": 0.9076667457868502, + "grad_norm": 0.0313476026058197, + "learning_rate": 0.00017601726777720202, + "loss": 0.1249, + "step": 956 + }, + { + "epoch": 0.9086161879895561, + "grad_norm": 0.0276046060025692, + "learning_rate": 0.00017594542542450072, + "loss": 0.1212, + "step": 957 + }, + { + "epoch": 0.909565630192262, + "grad_norm": 0.032439909875392914, + "learning_rate": 0.00017587349033596134, + "loss": 0.1277, + "step": 958 + }, + { + "epoch": 0.910515072394968, + "grad_norm": 0.039732351899147034, + "learning_rate": 0.00017580146259942278, + "loss": 0.1222, + "step": 959 + }, + { + "epoch": 0.9114645145976739, + "grad_norm": 0.033820103853940964, + "learning_rate": 0.00017572934230283707, + "loss": 0.1246, + "step": 960 + }, + { + "epoch": 0.9124139568003797, + "grad_norm": 0.03361973166465759, + "learning_rate": 0.00017565712953426918, + "loss": 0.1328, + "step": 961 + }, + { + "epoch": 0.9133633990030857, + "grad_norm": 0.0338444709777832, + "learning_rate": 0.00017558482438189712, + "loss": 0.1306, + "step": 962 + }, + { + "epoch": 0.9143128412057916, + "grad_norm": 0.04851710423827171, + "learning_rate": 0.0001755124269340116, + "loss": 0.1765, + "step": 963 + }, + { + "epoch": 0.9152622834084975, + "grad_norm": 0.03290700539946556, + "learning_rate": 0.0001754399372790161, + "loss": 0.1386, + "step": 964 + }, + { + "epoch": 0.9162117256112035, + "grad_norm": 0.034565720707178116, + "learning_rate": 0.00017536735550542661, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 0.9171611678139093, + "grad_norm": 0.04606771841645241, + "learning_rate": 0.00017529468170187176, + "loss": 0.1567, + "step": 966 + }, + { + "epoch": 0.9181106100166152, + "grad_norm": 0.03279464691877365, + "learning_rate": 0.00017522191595709238, + "loss": 0.1214, + "step": 967 + }, + { + "epoch": 0.9190600522193212, + "grad_norm": 0.036700885742902756, + "learning_rate": 0.00017514905835994168, + "loss": 0.1314, + "step": 968 + }, + { + "epoch": 0.9200094944220271, + "grad_norm": 0.04098424315452576, + "learning_rate": 0.00017507610899938501, + "loss": 0.164, + "step": 969 + }, + { + "epoch": 0.9209589366247329, + "grad_norm": 0.033782679587602615, + "learning_rate": 0.0001750030679644997, + "loss": 0.1376, + "step": 970 + }, + { + "epoch": 0.9219083788274389, + "grad_norm": 0.03304159641265869, + "learning_rate": 0.00017492993534447515, + "loss": 0.1244, + "step": 971 + }, + { + "epoch": 0.9228578210301448, + "grad_norm": 0.03158386051654816, + "learning_rate": 0.0001748567112286125, + "loss": 0.1345, + "step": 972 + }, + { + "epoch": 0.9238072632328507, + "grad_norm": 0.03615015745162964, + "learning_rate": 0.00017478339570632458, + "loss": 0.1434, + "step": 973 + }, + { + "epoch": 0.9247567054355567, + "grad_norm": 0.033553823828697205, + "learning_rate": 0.00017470998886713596, + "loss": 0.1292, + "step": 974 + }, + { + "epoch": 0.9257061476382625, + "grad_norm": 0.03953874111175537, + "learning_rate": 0.00017463649080068266, + "loss": 0.1621, + "step": 975 + }, + { + "epoch": 0.9266555898409684, + "grad_norm": 0.03288433700799942, + "learning_rate": 0.00017456290159671202, + "loss": 0.1357, + "step": 976 + }, + { + "epoch": 0.9276050320436744, + "grad_norm": 0.03154657408595085, + "learning_rate": 0.00017448922134508275, + "loss": 0.1322, + "step": 977 + }, + { + "epoch": 0.9285544742463803, + "grad_norm": 0.05669796094298363, + "learning_rate": 0.00017441545013576477, + "loss": 0.1761, + "step": 978 + }, + { + "epoch": 0.9295039164490861, + "grad_norm": 0.026679178699851036, + "learning_rate": 0.00017434158805883896, + "loss": 0.1295, + "step": 979 + }, + { + "epoch": 0.930453358651792, + "grad_norm": 0.03597673401236534, + "learning_rate": 0.00017426763520449721, + "loss": 0.1265, + "step": 980 + }, + { + "epoch": 0.931402800854498, + "grad_norm": 0.03097674809396267, + "learning_rate": 0.0001741935916630423, + "loss": 0.1339, + "step": 981 + }, + { + "epoch": 0.9323522430572039, + "grad_norm": 0.030252935364842415, + "learning_rate": 0.00017411945752488766, + "loss": 0.1247, + "step": 982 + }, + { + "epoch": 0.9333016852599098, + "grad_norm": 0.03460918739438057, + "learning_rate": 0.00017404523288055743, + "loss": 0.1323, + "step": 983 + }, + { + "epoch": 0.9342511274626157, + "grad_norm": 0.035575591027736664, + "learning_rate": 0.00017397091782068622, + "loss": 0.1258, + "step": 984 + }, + { + "epoch": 0.9352005696653216, + "grad_norm": 0.05128021538257599, + "learning_rate": 0.00017389651243601904, + "loss": 0.1364, + "step": 985 + }, + { + "epoch": 0.9361500118680275, + "grad_norm": 0.04355672374367714, + "learning_rate": 0.00017382201681741122, + "loss": 0.1656, + "step": 986 + }, + { + "epoch": 0.9370994540707335, + "grad_norm": 0.03357682749629021, + "learning_rate": 0.0001737474310558282, + "loss": 0.1285, + "step": 987 + }, + { + "epoch": 0.9380488962734393, + "grad_norm": 0.10623644292354584, + "learning_rate": 0.00017367275524234565, + "loss": 0.1726, + "step": 988 + }, + { + "epoch": 0.9389983384761452, + "grad_norm": 0.03605256229639053, + "learning_rate": 0.00017359798946814907, + "loss": 0.1358, + "step": 989 + }, + { + "epoch": 0.9399477806788512, + "grad_norm": 0.039663393050432205, + "learning_rate": 0.00017352313382453378, + "loss": 0.1299, + "step": 990 + }, + { + "epoch": 0.9408972228815571, + "grad_norm": 0.12416961044073105, + "learning_rate": 0.000173448188402905, + "loss": 0.1666, + "step": 991 + }, + { + "epoch": 0.941846665084263, + "grad_norm": 0.045010216534137726, + "learning_rate": 0.00017337315329477742, + "loss": 0.1733, + "step": 992 + }, + { + "epoch": 0.9427961072869689, + "grad_norm": 0.03456486761569977, + "learning_rate": 0.0001732980285917753, + "loss": 0.1312, + "step": 993 + }, + { + "epoch": 0.9437455494896748, + "grad_norm": 0.039561979472637177, + "learning_rate": 0.00017322281438563234, + "loss": 0.1354, + "step": 994 + }, + { + "epoch": 0.9446949916923807, + "grad_norm": 0.043275121599435806, + "learning_rate": 0.00017314751076819146, + "loss": 0.1651, + "step": 995 + }, + { + "epoch": 0.9456444338950867, + "grad_norm": 0.0392397940158844, + "learning_rate": 0.00017307211783140482, + "loss": 0.1647, + "step": 996 + }, + { + "epoch": 0.9465938760977926, + "grad_norm": 0.03428703919053078, + "learning_rate": 0.0001729966356673336, + "loss": 0.128, + "step": 997 + }, + { + "epoch": 0.9475433183004984, + "grad_norm": 0.03511650487780571, + "learning_rate": 0.000172921064368148, + "loss": 0.1297, + "step": 998 + }, + { + "epoch": 0.9484927605032044, + "grad_norm": 0.030319994315505028, + "learning_rate": 0.00017284540402612696, + "loss": 0.1269, + "step": 999 + }, + { + "epoch": 0.9494422027059103, + "grad_norm": 0.03071141429245472, + "learning_rate": 0.00017276965473365827, + "loss": 0.1224, + "step": 1000 + }, + { + "epoch": 0.9503916449086162, + "grad_norm": 0.04097789525985718, + "learning_rate": 0.00017269381658323822, + "loss": 0.1597, + "step": 1001 + }, + { + "epoch": 0.9513410871113221, + "grad_norm": 0.03407077491283417, + "learning_rate": 0.00017261788966747168, + "loss": 0.1268, + "step": 1002 + }, + { + "epoch": 0.952290529314028, + "grad_norm": 0.035802800208330154, + "learning_rate": 0.00017254187407907189, + "loss": 0.1338, + "step": 1003 + }, + { + "epoch": 0.9532399715167339, + "grad_norm": 0.030097633600234985, + "learning_rate": 0.00017246576991086034, + "loss": 0.1222, + "step": 1004 + }, + { + "epoch": 0.9541894137194399, + "grad_norm": 0.047994308173656464, + "learning_rate": 0.0001723895772557667, + "loss": 0.1632, + "step": 1005 + }, + { + "epoch": 0.9551388559221458, + "grad_norm": 0.03451845049858093, + "learning_rate": 0.00017231329620682876, + "loss": 0.1278, + "step": 1006 + }, + { + "epoch": 0.9560882981248516, + "grad_norm": 0.036820750683546066, + "learning_rate": 0.00017223692685719213, + "loss": 0.1355, + "step": 1007 + }, + { + "epoch": 0.9570377403275575, + "grad_norm": 0.03521284461021423, + "learning_rate": 0.0001721604693001103, + "loss": 0.1383, + "step": 1008 + }, + { + "epoch": 0.9579871825302635, + "grad_norm": 0.036953702569007874, + "learning_rate": 0.00017208392362894447, + "loss": 0.1352, + "step": 1009 + }, + { + "epoch": 0.9589366247329694, + "grad_norm": 0.031185979023575783, + "learning_rate": 0.00017200728993716345, + "loss": 0.1262, + "step": 1010 + }, + { + "epoch": 0.9598860669356752, + "grad_norm": 0.030822455883026123, + "learning_rate": 0.00017193056831834346, + "loss": 0.1211, + "step": 1011 + }, + { + "epoch": 0.9608355091383812, + "grad_norm": 0.031467005610466, + "learning_rate": 0.0001718537588661682, + "loss": 0.1271, + "step": 1012 + }, + { + "epoch": 0.9617849513410871, + "grad_norm": 0.03788928687572479, + "learning_rate": 0.0001717768616744285, + "loss": 0.1413, + "step": 1013 + }, + { + "epoch": 0.962734393543793, + "grad_norm": 0.03359632566571236, + "learning_rate": 0.00017169987683702243, + "loss": 0.1276, + "step": 1014 + }, + { + "epoch": 0.963683835746499, + "grad_norm": 0.03274601325392723, + "learning_rate": 0.000171622804447955, + "loss": 0.1308, + "step": 1015 + }, + { + "epoch": 0.9646332779492048, + "grad_norm": 0.03634633496403694, + "learning_rate": 0.0001715456446013382, + "loss": 0.1384, + "step": 1016 + }, + { + "epoch": 0.9655827201519107, + "grad_norm": 0.02978476695716381, + "learning_rate": 0.00017146839739139077, + "loss": 0.1301, + "step": 1017 + }, + { + "epoch": 0.9665321623546167, + "grad_norm": 0.03389682248234749, + "learning_rate": 0.0001713910629124381, + "loss": 0.1264, + "step": 1018 + }, + { + "epoch": 0.9674816045573226, + "grad_norm": 0.03452256694436073, + "learning_rate": 0.00017131364125891224, + "loss": 0.1317, + "step": 1019 + }, + { + "epoch": 0.9684310467600284, + "grad_norm": 0.03967840224504471, + "learning_rate": 0.00017123613252535163, + "loss": 0.1308, + "step": 1020 + }, + { + "epoch": 0.9693804889627344, + "grad_norm": 0.04021480306982994, + "learning_rate": 0.00017115853680640098, + "loss": 0.1637, + "step": 1021 + }, + { + "epoch": 0.9703299311654403, + "grad_norm": 0.02766057476401329, + "learning_rate": 0.00017108085419681132, + "loss": 0.1239, + "step": 1022 + }, + { + "epoch": 0.9712793733681462, + "grad_norm": 0.029945319518446922, + "learning_rate": 0.00017100308479143974, + "loss": 0.1236, + "step": 1023 + }, + { + "epoch": 0.9722288155708522, + "grad_norm": 0.03135136887431145, + "learning_rate": 0.00017092522868524928, + "loss": 0.1203, + "step": 1024 + }, + { + "epoch": 0.973178257773558, + "grad_norm": 0.04876153543591499, + "learning_rate": 0.00017084728597330893, + "loss": 0.1802, + "step": 1025 + }, + { + "epoch": 0.9741276999762639, + "grad_norm": 0.042958084493875504, + "learning_rate": 0.00017076925675079335, + "loss": 0.1656, + "step": 1026 + }, + { + "epoch": 0.9750771421789699, + "grad_norm": 0.04739035665988922, + "learning_rate": 0.00017069114111298287, + "loss": 0.167, + "step": 1027 + }, + { + "epoch": 0.9760265843816758, + "grad_norm": 0.042968571186065674, + "learning_rate": 0.00017061293915526335, + "loss": 0.173, + "step": 1028 + }, + { + "epoch": 0.9769760265843817, + "grad_norm": 0.07628528028726578, + "learning_rate": 0.00017053465097312606, + "loss": 0.1351, + "step": 1029 + }, + { + "epoch": 0.9779254687870876, + "grad_norm": 0.032479528337717056, + "learning_rate": 0.00017045627666216755, + "loss": 0.1294, + "step": 1030 + }, + { + "epoch": 0.9788749109897935, + "grad_norm": 0.029842333868145943, + "learning_rate": 0.0001703778163180895, + "loss": 0.1264, + "step": 1031 + }, + { + "epoch": 0.9798243531924994, + "grad_norm": 0.03622937202453613, + "learning_rate": 0.00017029927003669868, + "loss": 0.1287, + "step": 1032 + }, + { + "epoch": 0.9807737953952054, + "grad_norm": 0.05245399475097656, + "learning_rate": 0.00017022063791390684, + "loss": 0.1923, + "step": 1033 + }, + { + "epoch": 0.9817232375979112, + "grad_norm": 0.03335704281926155, + "learning_rate": 0.00017014192004573047, + "loss": 0.1241, + "step": 1034 + }, + { + "epoch": 0.9826726798006171, + "grad_norm": 0.03645642474293709, + "learning_rate": 0.0001700631165282908, + "loss": 0.1321, + "step": 1035 + }, + { + "epoch": 0.983622122003323, + "grad_norm": 0.05331774801015854, + "learning_rate": 0.00016998422745781363, + "loss": 0.169, + "step": 1036 + }, + { + "epoch": 0.984571564206029, + "grad_norm": 0.04615236446261406, + "learning_rate": 0.00016990525293062927, + "loss": 0.1623, + "step": 1037 + }, + { + "epoch": 0.9855210064087349, + "grad_norm": 0.047434594482183456, + "learning_rate": 0.00016982619304317233, + "loss": 0.1303, + "step": 1038 + }, + { + "epoch": 0.9864704486114407, + "grad_norm": 0.03144746273756027, + "learning_rate": 0.00016974704789198168, + "loss": 0.1203, + "step": 1039 + }, + { + "epoch": 0.9874198908141467, + "grad_norm": 0.04035501554608345, + "learning_rate": 0.00016966781757370028, + "loss": 0.1246, + "step": 1040 + }, + { + "epoch": 0.9883693330168526, + "grad_norm": 0.03864790499210358, + "learning_rate": 0.0001695885021850751, + "loss": 0.1305, + "step": 1041 + }, + { + "epoch": 0.9893187752195585, + "grad_norm": 0.03547806292772293, + "learning_rate": 0.00016950910182295705, + "loss": 0.1319, + "step": 1042 + }, + { + "epoch": 0.9902682174222645, + "grad_norm": 0.03442002460360527, + "learning_rate": 0.0001694296165843007, + "loss": 0.1344, + "step": 1043 + }, + { + "epoch": 0.9912176596249703, + "grad_norm": 0.0333750881254673, + "learning_rate": 0.00016935004656616425, + "loss": 0.1278, + "step": 1044 + }, + { + "epoch": 0.9921671018276762, + "grad_norm": 0.03143637254834175, + "learning_rate": 0.00016927039186570954, + "loss": 0.1237, + "step": 1045 + }, + { + "epoch": 0.9931165440303822, + "grad_norm": 0.03841651603579521, + "learning_rate": 0.0001691906525802017, + "loss": 0.1395, + "step": 1046 + }, + { + "epoch": 0.9940659862330881, + "grad_norm": 0.03443494066596031, + "learning_rate": 0.00016911082880700926, + "loss": 0.1422, + "step": 1047 + }, + { + "epoch": 0.9950154284357939, + "grad_norm": 0.027661804109811783, + "learning_rate": 0.0001690309206436038, + "loss": 0.1251, + "step": 1048 + }, + { + "epoch": 0.9959648706384999, + "grad_norm": 0.036862559616565704, + "learning_rate": 0.00016895092818756006, + "loss": 0.1337, + "step": 1049 + }, + { + "epoch": 0.9969143128412058, + "grad_norm": 0.035234466195106506, + "learning_rate": 0.00016887085153655554, + "loss": 0.1267, + "step": 1050 + }, + { + "epoch": 0.9978637550439117, + "grad_norm": 0.032372791320085526, + "learning_rate": 0.00016879069078837075, + "loss": 0.1254, + "step": 1051 + }, + { + "epoch": 0.9988131972466177, + "grad_norm": 0.037299785763025284, + "learning_rate": 0.00016871044604088877, + "loss": 0.1324, + "step": 1052 + }, + { + "epoch": 0.9997626394493235, + "grad_norm": 0.03843718767166138, + "learning_rate": 0.00016863011739209527, + "loss": 0.1328, + "step": 1053 + }, + { + "epoch": 1.0007120816520294, + "grad_norm": 0.03160862624645233, + "learning_rate": 0.00016854970494007836, + "loss": 0.1284, + "step": 1054 + }, + { + "epoch": 1.0016615238547353, + "grad_norm": 0.05188068002462387, + "learning_rate": 0.00016846920878302852, + "loss": 0.1775, + "step": 1055 + }, + { + "epoch": 1.0026109660574412, + "grad_norm": 0.04362662881612778, + "learning_rate": 0.00016838862901923842, + "loss": 0.1577, + "step": 1056 + }, + { + "epoch": 1.0035604082601473, + "grad_norm": 0.033426132053136826, + "learning_rate": 0.00016830796574710284, + "loss": 0.1252, + "step": 1057 + }, + { + "epoch": 1.0045098504628531, + "grad_norm": 0.06085265800356865, + "learning_rate": 0.00016822721906511844, + "loss": 0.1769, + "step": 1058 + }, + { + "epoch": 1.005459292665559, + "grad_norm": 0.03222273662686348, + "learning_rate": 0.00016814638907188388, + "loss": 0.1239, + "step": 1059 + }, + { + "epoch": 1.0064087348682649, + "grad_norm": 0.032014038413763046, + "learning_rate": 0.00016806547586609947, + "loss": 0.1191, + "step": 1060 + }, + { + "epoch": 1.0073581770709708, + "grad_norm": 0.03323471546173096, + "learning_rate": 0.00016798447954656707, + "loss": 0.1334, + "step": 1061 + }, + { + "epoch": 1.0083076192736766, + "grad_norm": 0.04325219243764877, + "learning_rate": 0.0001679034002121901, + "loss": 0.1623, + "step": 1062 + }, + { + "epoch": 1.0092570614763827, + "grad_norm": 0.029746338725090027, + "learning_rate": 0.0001678222379619734, + "loss": 0.1292, + "step": 1063 + }, + { + "epoch": 1.0102065036790886, + "grad_norm": 0.03265037387609482, + "learning_rate": 0.00016774099289502297, + "loss": 0.1271, + "step": 1064 + }, + { + "epoch": 1.0111559458817945, + "grad_norm": 0.04023383557796478, + "learning_rate": 0.0001676596651105459, + "loss": 0.1537, + "step": 1065 + }, + { + "epoch": 1.0121053880845003, + "grad_norm": 0.036106862127780914, + "learning_rate": 0.00016757825470785042, + "loss": 0.1237, + "step": 1066 + }, + { + "epoch": 1.0130548302872062, + "grad_norm": 0.04061293974518776, + "learning_rate": 0.00016749676178634556, + "loss": 0.1324, + "step": 1067 + }, + { + "epoch": 1.014004272489912, + "grad_norm": 0.050820399075746536, + "learning_rate": 0.0001674151864455411, + "loss": 0.1705, + "step": 1068 + }, + { + "epoch": 1.0149537146926182, + "grad_norm": 0.037347592413425446, + "learning_rate": 0.00016733352878504752, + "loss": 0.1248, + "step": 1069 + }, + { + "epoch": 1.015903156895324, + "grad_norm": 0.04108656942844391, + "learning_rate": 0.00016725178890457571, + "loss": 0.1201, + "step": 1070 + }, + { + "epoch": 1.01685259909803, + "grad_norm": 0.051215577870607376, + "learning_rate": 0.00016716996690393715, + "loss": 0.1705, + "step": 1071 + }, + { + "epoch": 1.0178020413007358, + "grad_norm": 0.05008477717638016, + "learning_rate": 0.00016708806288304336, + "loss": 0.1606, + "step": 1072 + }, + { + "epoch": 1.0187514835034417, + "grad_norm": 0.05916628614068031, + "learning_rate": 0.00016700607694190617, + "loss": 0.1824, + "step": 1073 + }, + { + "epoch": 1.0197009257061476, + "grad_norm": 0.03331366181373596, + "learning_rate": 0.00016692400918063744, + "loss": 0.1256, + "step": 1074 + }, + { + "epoch": 1.0206503679088534, + "grad_norm": 0.03364944830536842, + "learning_rate": 0.00016684185969944885, + "loss": 0.1273, + "step": 1075 + }, + { + "epoch": 1.0215998101115595, + "grad_norm": 0.02990981563925743, + "learning_rate": 0.000166759628598652, + "loss": 0.1284, + "step": 1076 + }, + { + "epoch": 1.0225492523142654, + "grad_norm": 0.03323819115757942, + "learning_rate": 0.00016667731597865796, + "loss": 0.1258, + "step": 1077 + }, + { + "epoch": 1.0234986945169713, + "grad_norm": 0.03008713200688362, + "learning_rate": 0.0001665949219399775, + "loss": 0.1244, + "step": 1078 + }, + { + "epoch": 1.0244481367196772, + "grad_norm": 0.04623178020119667, + "learning_rate": 0.00016651244658322085, + "loss": 0.1537, + "step": 1079 + }, + { + "epoch": 1.025397578922383, + "grad_norm": 0.034869614988565445, + "learning_rate": 0.00016642989000909732, + "loss": 0.1272, + "step": 1080 + }, + { + "epoch": 1.026347021125089, + "grad_norm": 0.03236447647213936, + "learning_rate": 0.0001663472523184156, + "loss": 0.1299, + "step": 1081 + }, + { + "epoch": 1.027296463327795, + "grad_norm": 0.02806561440229416, + "learning_rate": 0.00016626453361208335, + "loss": 0.1198, + "step": 1082 + }, + { + "epoch": 1.0282459055305009, + "grad_norm": 0.04762514680624008, + "learning_rate": 0.0001661817339911071, + "loss": 0.1695, + "step": 1083 + }, + { + "epoch": 1.0291953477332068, + "grad_norm": 0.039319079369306564, + "learning_rate": 0.00016609885355659234, + "loss": 0.1612, + "step": 1084 + }, + { + "epoch": 1.0301447899359126, + "grad_norm": 0.030540715903043747, + "learning_rate": 0.0001660158924097431, + "loss": 0.1251, + "step": 1085 + }, + { + "epoch": 1.0310942321386185, + "grad_norm": 0.029828663915395737, + "learning_rate": 0.000165932850651862, + "loss": 0.1287, + "step": 1086 + }, + { + "epoch": 1.0320436743413244, + "grad_norm": 0.030012918636202812, + "learning_rate": 0.0001658497283843501, + "loss": 0.132, + "step": 1087 + }, + { + "epoch": 1.0329931165440305, + "grad_norm": 0.03255194425582886, + "learning_rate": 0.0001657665257087068, + "loss": 0.1257, + "step": 1088 + }, + { + "epoch": 1.0339425587467364, + "grad_norm": 0.040951523929834366, + "learning_rate": 0.00016568324272652965, + "loss": 0.1507, + "step": 1089 + }, + { + "epoch": 1.0348920009494422, + "grad_norm": 0.027678990736603737, + "learning_rate": 0.00016559987953951427, + "loss": 0.1217, + "step": 1090 + }, + { + "epoch": 1.035841443152148, + "grad_norm": 0.03241724148392677, + "learning_rate": 0.0001655164362494542, + "loss": 0.1298, + "step": 1091 + }, + { + "epoch": 1.036790885354854, + "grad_norm": 0.038156237453222275, + "learning_rate": 0.00016543291295824085, + "loss": 0.1252, + "step": 1092 + }, + { + "epoch": 1.0377403275575598, + "grad_norm": 0.029806343838572502, + "learning_rate": 0.00016534930976786323, + "loss": 0.1265, + "step": 1093 + }, + { + "epoch": 1.038689769760266, + "grad_norm": 0.035036977380514145, + "learning_rate": 0.00016526562678040804, + "loss": 0.1247, + "step": 1094 + }, + { + "epoch": 1.0396392119629718, + "grad_norm": 0.032223109155893326, + "learning_rate": 0.00016518186409805922, + "loss": 0.1326, + "step": 1095 + }, + { + "epoch": 1.0405886541656777, + "grad_norm": 0.03192323073744774, + "learning_rate": 0.0001650980218230982, + "loss": 0.1186, + "step": 1096 + }, + { + "epoch": 1.0415380963683836, + "grad_norm": 0.031004801392555237, + "learning_rate": 0.00016501410005790362, + "loss": 0.1292, + "step": 1097 + }, + { + "epoch": 1.0424875385710894, + "grad_norm": 0.03421878442168236, + "learning_rate": 0.00016493009890495102, + "loss": 0.1362, + "step": 1098 + }, + { + "epoch": 1.0434369807737953, + "grad_norm": 0.03153158724308014, + "learning_rate": 0.00016484601846681297, + "loss": 0.1182, + "step": 1099 + }, + { + "epoch": 1.0443864229765012, + "grad_norm": 0.03977439925074577, + "learning_rate": 0.0001647618588461589, + "loss": 0.1327, + "step": 1100 + }, + { + "epoch": 1.0453358651792073, + "grad_norm": 0.03982316702604294, + "learning_rate": 0.00016467762014575485, + "loss": 0.1582, + "step": 1101 + }, + { + "epoch": 1.0462853073819132, + "grad_norm": 0.034796085208654404, + "learning_rate": 0.00016459330246846348, + "loss": 0.1258, + "step": 1102 + }, + { + "epoch": 1.047234749584619, + "grad_norm": 0.039261046797037125, + "learning_rate": 0.0001645089059172438, + "loss": 0.1321, + "step": 1103 + }, + { + "epoch": 1.048184191787325, + "grad_norm": 0.04305882379412651, + "learning_rate": 0.00016442443059515126, + "loss": 0.1406, + "step": 1104 + }, + { + "epoch": 1.0491336339900308, + "grad_norm": 0.03491320461034775, + "learning_rate": 0.00016433987660533742, + "loss": 0.1312, + "step": 1105 + }, + { + "epoch": 1.0500830761927367, + "grad_norm": 0.04404641315340996, + "learning_rate": 0.00016425524405104986, + "loss": 0.1267, + "step": 1106 + }, + { + "epoch": 1.0510325183954428, + "grad_norm": 0.034407854080200195, + "learning_rate": 0.0001641705330356322, + "loss": 0.1268, + "step": 1107 + }, + { + "epoch": 1.0519819605981486, + "grad_norm": 0.04843935742974281, + "learning_rate": 0.00016408574366252374, + "loss": 0.1601, + "step": 1108 + }, + { + "epoch": 1.0529314028008545, + "grad_norm": 0.03394000977277756, + "learning_rate": 0.0001640008760352596, + "loss": 0.13, + "step": 1109 + }, + { + "epoch": 1.0538808450035604, + "grad_norm": 0.027672087773680687, + "learning_rate": 0.00016391593025747038, + "loss": 0.1202, + "step": 1110 + }, + { + "epoch": 1.0548302872062663, + "grad_norm": 0.03761329874396324, + "learning_rate": 0.0001638309064328821, + "loss": 0.1562, + "step": 1111 + }, + { + "epoch": 1.0557797294089721, + "grad_norm": 0.048850156366825104, + "learning_rate": 0.0001637458046653161, + "loss": 0.1707, + "step": 1112 + }, + { + "epoch": 1.0567291716116782, + "grad_norm": 0.027066387236118317, + "learning_rate": 0.00016366062505868888, + "loss": 0.1204, + "step": 1113 + }, + { + "epoch": 1.057678613814384, + "grad_norm": 0.034062668681144714, + "learning_rate": 0.00016357536771701198, + "loss": 0.1378, + "step": 1114 + }, + { + "epoch": 1.05862805601709, + "grad_norm": 0.0422850139439106, + "learning_rate": 0.00016349003274439194, + "loss": 0.1583, + "step": 1115 + }, + { + "epoch": 1.0595774982197959, + "grad_norm": 0.0334283784031868, + "learning_rate": 0.00016340462024503, + "loss": 0.1276, + "step": 1116 + }, + { + "epoch": 1.0605269404225017, + "grad_norm": 0.03338415175676346, + "learning_rate": 0.00016331913032322212, + "loss": 0.1229, + "step": 1117 + }, + { + "epoch": 1.0614763826252076, + "grad_norm": 0.03128555044531822, + "learning_rate": 0.00016323356308335876, + "loss": 0.1167, + "step": 1118 + }, + { + "epoch": 1.0624258248279137, + "grad_norm": 0.033790841698646545, + "learning_rate": 0.00016314791862992486, + "loss": 0.1236, + "step": 1119 + }, + { + "epoch": 1.0633752670306196, + "grad_norm": 0.03544427454471588, + "learning_rate": 0.00016306219706749953, + "loss": 0.1319, + "step": 1120 + }, + { + "epoch": 1.0643247092333254, + "grad_norm": 0.03969413787126541, + "learning_rate": 0.0001629763985007561, + "loss": 0.1612, + "step": 1121 + }, + { + "epoch": 1.0652741514360313, + "grad_norm": 0.042924992740154266, + "learning_rate": 0.00016289052303446202, + "loss": 0.1659, + "step": 1122 + }, + { + "epoch": 1.0662235936387372, + "grad_norm": 0.04624541476368904, + "learning_rate": 0.00016280457077347848, + "loss": 0.1617, + "step": 1123 + }, + { + "epoch": 1.067173035841443, + "grad_norm": 0.034341566264629364, + "learning_rate": 0.00016271854182276058, + "loss": 0.1188, + "step": 1124 + }, + { + "epoch": 1.068122478044149, + "grad_norm": 0.03228682279586792, + "learning_rate": 0.00016263243628735695, + "loss": 0.129, + "step": 1125 + }, + { + "epoch": 1.069071920246855, + "grad_norm": 0.036037541925907135, + "learning_rate": 0.00016254625427240978, + "loss": 0.1309, + "step": 1126 + }, + { + "epoch": 1.070021362449561, + "grad_norm": 0.027421532198786736, + "learning_rate": 0.0001624599958831547, + "loss": 0.1176, + "step": 1127 + }, + { + "epoch": 1.0709708046522668, + "grad_norm": 0.030262261629104614, + "learning_rate": 0.00016237366122492052, + "loss": 0.1195, + "step": 1128 + }, + { + "epoch": 1.0719202468549727, + "grad_norm": 0.041230421513319016, + "learning_rate": 0.00016228725040312925, + "loss": 0.1562, + "step": 1129 + }, + { + "epoch": 1.0728696890576785, + "grad_norm": 0.03141395375132561, + "learning_rate": 0.00016220076352329582, + "loss": 0.13, + "step": 1130 + }, + { + "epoch": 1.0738191312603846, + "grad_norm": 0.0343187153339386, + "learning_rate": 0.00016211420069102815, + "loss": 0.134, + "step": 1131 + }, + { + "epoch": 1.0747685734630905, + "grad_norm": 0.04862483590841293, + "learning_rate": 0.0001620275620120268, + "loss": 0.1574, + "step": 1132 + }, + { + "epoch": 1.0757180156657964, + "grad_norm": 0.04204042628407478, + "learning_rate": 0.00016194084759208494, + "loss": 0.162, + "step": 1133 + }, + { + "epoch": 1.0766674578685023, + "grad_norm": 0.03309663385152817, + "learning_rate": 0.00016185405753708833, + "loss": 0.1251, + "step": 1134 + }, + { + "epoch": 1.0776169000712081, + "grad_norm": 0.03191671893000603, + "learning_rate": 0.00016176719195301503, + "loss": 0.125, + "step": 1135 + }, + { + "epoch": 1.078566342273914, + "grad_norm": 0.036822427064180374, + "learning_rate": 0.0001616802509459353, + "loss": 0.1484, + "step": 1136 + }, + { + "epoch": 1.0795157844766199, + "grad_norm": 0.029125772416591644, + "learning_rate": 0.00016159323462201149, + "loss": 0.1192, + "step": 1137 + }, + { + "epoch": 1.080465226679326, + "grad_norm": 0.034059688448905945, + "learning_rate": 0.000161506143087498, + "loss": 0.1309, + "step": 1138 + }, + { + "epoch": 1.0814146688820319, + "grad_norm": 0.03434915095567703, + "learning_rate": 0.00016141897644874096, + "loss": 0.1336, + "step": 1139 + }, + { + "epoch": 1.0823641110847377, + "grad_norm": 0.0348944216966629, + "learning_rate": 0.00016133173481217833, + "loss": 0.1317, + "step": 1140 + }, + { + "epoch": 1.0833135532874436, + "grad_norm": 0.033639729022979736, + "learning_rate": 0.00016124441828433957, + "loss": 0.1187, + "step": 1141 + }, + { + "epoch": 1.0842629954901495, + "grad_norm": 0.03063533827662468, + "learning_rate": 0.00016115702697184556, + "loss": 0.1332, + "step": 1142 + }, + { + "epoch": 1.0852124376928554, + "grad_norm": 0.03273540362715721, + "learning_rate": 0.00016106956098140858, + "loss": 0.1284, + "step": 1143 + }, + { + "epoch": 1.0861618798955615, + "grad_norm": 0.05293993651866913, + "learning_rate": 0.00016098202041983206, + "loss": 0.1687, + "step": 1144 + }, + { + "epoch": 1.0871113220982673, + "grad_norm": 0.03251373767852783, + "learning_rate": 0.00016089440539401046, + "loss": 0.1252, + "step": 1145 + }, + { + "epoch": 1.0880607643009732, + "grad_norm": 0.0367170013487339, + "learning_rate": 0.00016080671601092922, + "loss": 0.1419, + "step": 1146 + }, + { + "epoch": 1.089010206503679, + "grad_norm": 0.030752060934901237, + "learning_rate": 0.00016071895237766457, + "loss": 0.1257, + "step": 1147 + }, + { + "epoch": 1.089959648706385, + "grad_norm": 0.035168685019016266, + "learning_rate": 0.00016063111460138334, + "loss": 0.1385, + "step": 1148 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.03252134099602699, + "learning_rate": 0.00016054320278934296, + "loss": 0.1232, + "step": 1149 + }, + { + "epoch": 1.0918585331117967, + "grad_norm": 0.028666459023952484, + "learning_rate": 0.00016045521704889128, + "loss": 0.1242, + "step": 1150 + }, + { + "epoch": 1.0928079753145028, + "grad_norm": 0.047707218676805496, + "learning_rate": 0.00016036715748746634, + "loss": 0.1643, + "step": 1151 + }, + { + "epoch": 1.0937574175172087, + "grad_norm": 0.035980336368083954, + "learning_rate": 0.00016027902421259638, + "loss": 0.1329, + "step": 1152 + }, + { + "epoch": 1.0947068597199145, + "grad_norm": 0.04506576433777809, + "learning_rate": 0.00016019081733189967, + "loss": 0.1631, + "step": 1153 + }, + { + "epoch": 1.0956563019226204, + "grad_norm": 0.030268298462033272, + "learning_rate": 0.0001601025369530843, + "loss": 0.1319, + "step": 1154 + }, + { + "epoch": 1.0966057441253263, + "grad_norm": 0.056095585227012634, + "learning_rate": 0.00016001418318394817, + "loss": 0.1529, + "step": 1155 + }, + { + "epoch": 1.0975551863280324, + "grad_norm": 0.029666630551218987, + "learning_rate": 0.0001599257561323787, + "loss": 0.126, + "step": 1156 + }, + { + "epoch": 1.0985046285307383, + "grad_norm": 0.03648681938648224, + "learning_rate": 0.00015983725590635293, + "loss": 0.1378, + "step": 1157 + }, + { + "epoch": 1.0994540707334441, + "grad_norm": 0.03170529007911682, + "learning_rate": 0.00015974868261393714, + "loss": 0.1238, + "step": 1158 + }, + { + "epoch": 1.10040351293615, + "grad_norm": 0.032316990196704865, + "learning_rate": 0.0001596600363632869, + "loss": 0.1305, + "step": 1159 + }, + { + "epoch": 1.101352955138856, + "grad_norm": 0.03148328512907028, + "learning_rate": 0.00015957131726264677, + "loss": 0.1303, + "step": 1160 + }, + { + "epoch": 1.1023023973415618, + "grad_norm": 0.03739064186811447, + "learning_rate": 0.00015948252542035042, + "loss": 0.16, + "step": 1161 + }, + { + "epoch": 1.1032518395442676, + "grad_norm": 0.034856993705034256, + "learning_rate": 0.00015939366094482025, + "loss": 0.1273, + "step": 1162 + }, + { + "epoch": 1.1042012817469737, + "grad_norm": 0.03102080523967743, + "learning_rate": 0.0001593047239445673, + "loss": 0.1331, + "step": 1163 + }, + { + "epoch": 1.1051507239496796, + "grad_norm": 0.026448125019669533, + "learning_rate": 0.00015921571452819127, + "loss": 0.1241, + "step": 1164 + }, + { + "epoch": 1.1061001661523855, + "grad_norm": 0.034301795065402985, + "learning_rate": 0.0001591266328043802, + "loss": 0.1283, + "step": 1165 + }, + { + "epoch": 1.1070496083550914, + "grad_norm": 0.03346949443221092, + "learning_rate": 0.00015903747888191053, + "loss": 0.1355, + "step": 1166 + }, + { + "epoch": 1.1079990505577972, + "grad_norm": 0.0324571467936039, + "learning_rate": 0.00015894825286964675, + "loss": 0.1354, + "step": 1167 + }, + { + "epoch": 1.1089484927605031, + "grad_norm": 0.05366596579551697, + "learning_rate": 0.00015885895487654147, + "loss": 0.2099, + "step": 1168 + }, + { + "epoch": 1.1098979349632092, + "grad_norm": 0.04298697039484978, + "learning_rate": 0.00015876958501163512, + "loss": 0.1709, + "step": 1169 + }, + { + "epoch": 1.110847377165915, + "grad_norm": 0.02922794409096241, + "learning_rate": 0.00015868014338405592, + "loss": 0.1201, + "step": 1170 + }, + { + "epoch": 1.111796819368621, + "grad_norm": 0.02963380515575409, + "learning_rate": 0.00015859063010301974, + "loss": 0.119, + "step": 1171 + }, + { + "epoch": 1.1127462615713268, + "grad_norm": 0.05272309482097626, + "learning_rate": 0.0001585010452778299, + "loss": 0.1785, + "step": 1172 + }, + { + "epoch": 1.1136957037740327, + "grad_norm": 0.031892240047454834, + "learning_rate": 0.00015841138901787714, + "loss": 0.1292, + "step": 1173 + }, + { + "epoch": 1.1146451459767386, + "grad_norm": 0.02971399575471878, + "learning_rate": 0.0001583216614326394, + "loss": 0.124, + "step": 1174 + }, + { + "epoch": 1.1155945881794447, + "grad_norm": 0.03037869744002819, + "learning_rate": 0.00015823186263168169, + "loss": 0.1303, + "step": 1175 + }, + { + "epoch": 1.1165440303821506, + "grad_norm": 0.02748207375407219, + "learning_rate": 0.000158141992724656, + "loss": 0.1243, + "step": 1176 + }, + { + "epoch": 1.1174934725848564, + "grad_norm": 0.033940836787223816, + "learning_rate": 0.00015805205182130113, + "loss": 0.1254, + "step": 1177 + }, + { + "epoch": 1.1184429147875623, + "grad_norm": 0.03295721858739853, + "learning_rate": 0.00015796204003144264, + "loss": 0.1235, + "step": 1178 + }, + { + "epoch": 1.1193923569902682, + "grad_norm": 0.031388405710458755, + "learning_rate": 0.00015787195746499254, + "loss": 0.1221, + "step": 1179 + }, + { + "epoch": 1.120341799192974, + "grad_norm": 0.033072203397750854, + "learning_rate": 0.00015778180423194936, + "loss": 0.1284, + "step": 1180 + }, + { + "epoch": 1.1212912413956801, + "grad_norm": 0.03310628607869148, + "learning_rate": 0.00015769158044239787, + "loss": 0.136, + "step": 1181 + }, + { + "epoch": 1.122240683598386, + "grad_norm": 0.030244866386055946, + "learning_rate": 0.000157601286206509, + "loss": 0.1255, + "step": 1182 + }, + { + "epoch": 1.123190125801092, + "grad_norm": 0.04400714859366417, + "learning_rate": 0.0001575109216345397, + "loss": 0.1706, + "step": 1183 + }, + { + "epoch": 1.1241395680037978, + "grad_norm": 0.03094104304909706, + "learning_rate": 0.00015742048683683288, + "loss": 0.1261, + "step": 1184 + }, + { + "epoch": 1.1250890102065036, + "grad_norm": 0.03327153995633125, + "learning_rate": 0.00015732998192381707, + "loss": 0.1334, + "step": 1185 + }, + { + "epoch": 1.1260384524092095, + "grad_norm": 0.03229563683271408, + "learning_rate": 0.0001572394070060065, + "loss": 0.1168, + "step": 1186 + }, + { + "epoch": 1.1269878946119154, + "grad_norm": 0.03267960995435715, + "learning_rate": 0.0001571487621940009, + "loss": 0.1331, + "step": 1187 + }, + { + "epoch": 1.1279373368146215, + "grad_norm": 0.02902175299823284, + "learning_rate": 0.00015705804759848523, + "loss": 0.1302, + "step": 1188 + }, + { + "epoch": 1.1288867790173274, + "grad_norm": 0.030303161591291428, + "learning_rate": 0.00015696726333022984, + "loss": 0.1267, + "step": 1189 + }, + { + "epoch": 1.1298362212200332, + "grad_norm": 0.031242702156305313, + "learning_rate": 0.00015687640950009, + "loss": 0.125, + "step": 1190 + }, + { + "epoch": 1.1307856634227391, + "grad_norm": 0.02697862684726715, + "learning_rate": 0.00015678548621900597, + "loss": 0.1207, + "step": 1191 + }, + { + "epoch": 1.131735105625445, + "grad_norm": 0.03510352969169617, + "learning_rate": 0.0001566944935980029, + "loss": 0.132, + "step": 1192 + }, + { + "epoch": 1.1326845478281509, + "grad_norm": 0.03274201229214668, + "learning_rate": 0.00015660343174819045, + "loss": 0.1262, + "step": 1193 + }, + { + "epoch": 1.133633990030857, + "grad_norm": 0.03378736600279808, + "learning_rate": 0.00015651230078076296, + "loss": 0.1318, + "step": 1194 + }, + { + "epoch": 1.1345834322335628, + "grad_norm": 0.032314665615558624, + "learning_rate": 0.00015642110080699907, + "loss": 0.117, + "step": 1195 + }, + { + "epoch": 1.1355328744362687, + "grad_norm": 0.04177004471421242, + "learning_rate": 0.00015632983193826174, + "loss": 0.1527, + "step": 1196 + }, + { + "epoch": 1.1364823166389746, + "grad_norm": 0.05647768825292587, + "learning_rate": 0.00015623849428599804, + "loss": 0.2019, + "step": 1197 + }, + { + "epoch": 1.1374317588416805, + "grad_norm": 0.03124004229903221, + "learning_rate": 0.00015614708796173906, + "loss": 0.1228, + "step": 1198 + }, + { + "epoch": 1.1383812010443863, + "grad_norm": 0.029155779629945755, + "learning_rate": 0.00015605561307709964, + "loss": 0.126, + "step": 1199 + }, + { + "epoch": 1.1393306432470922, + "grad_norm": 0.030367044731974602, + "learning_rate": 0.0001559640697437785, + "loss": 0.1296, + "step": 1200 + }, + { + "epoch": 1.1402800854497983, + "grad_norm": 0.06225927174091339, + "learning_rate": 0.00015587245807355778, + "loss": 0.2258, + "step": 1201 + }, + { + "epoch": 1.1412295276525042, + "grad_norm": 0.03878935053944588, + "learning_rate": 0.00015578077817830313, + "loss": 0.1322, + "step": 1202 + }, + { + "epoch": 1.14217896985521, + "grad_norm": 0.030811108648777008, + "learning_rate": 0.0001556890301699636, + "loss": 0.127, + "step": 1203 + }, + { + "epoch": 1.143128412057916, + "grad_norm": 0.0561293326318264, + "learning_rate": 0.00015559721416057127, + "loss": 0.1689, + "step": 1204 + }, + { + "epoch": 1.1440778542606218, + "grad_norm": 0.05256973206996918, + "learning_rate": 0.0001555053302622413, + "loss": 0.1735, + "step": 1205 + }, + { + "epoch": 1.145027296463328, + "grad_norm": 0.037241023033857346, + "learning_rate": 0.0001554133785871718, + "loss": 0.1304, + "step": 1206 + }, + { + "epoch": 1.1459767386660338, + "grad_norm": 0.04502008110284805, + "learning_rate": 0.00015532135924764358, + "loss": 0.1594, + "step": 1207 + }, + { + "epoch": 1.1469261808687397, + "grad_norm": 0.053607337176799774, + "learning_rate": 0.00015522927235602014, + "loss": 0.1683, + "step": 1208 + }, + { + "epoch": 1.1478756230714455, + "grad_norm": 0.029219908639788628, + "learning_rate": 0.00015513711802474735, + "loss": 0.1267, + "step": 1209 + }, + { + "epoch": 1.1488250652741514, + "grad_norm": 0.03088328242301941, + "learning_rate": 0.0001550448963663536, + "loss": 0.1318, + "step": 1210 + }, + { + "epoch": 1.1497745074768573, + "grad_norm": 0.03560802713036537, + "learning_rate": 0.00015495260749344932, + "loss": 0.1433, + "step": 1211 + }, + { + "epoch": 1.1507239496795632, + "grad_norm": 0.033212240785360336, + "learning_rate": 0.00015486025151872706, + "loss": 0.1222, + "step": 1212 + }, + { + "epoch": 1.1516733918822692, + "grad_norm": 0.032883357256650925, + "learning_rate": 0.00015476782855496145, + "loss": 0.1317, + "step": 1213 + }, + { + "epoch": 1.1526228340849751, + "grad_norm": 0.028118513524532318, + "learning_rate": 0.0001546753387150087, + "loss": 0.1306, + "step": 1214 + }, + { + "epoch": 1.153572276287681, + "grad_norm": 0.040216926485300064, + "learning_rate": 0.00015458278211180688, + "loss": 0.1482, + "step": 1215 + }, + { + "epoch": 1.1545217184903869, + "grad_norm": 0.029117384925484657, + "learning_rate": 0.00015449015885837542, + "loss": 0.1287, + "step": 1216 + }, + { + "epoch": 1.1554711606930927, + "grad_norm": 0.08133453875780106, + "learning_rate": 0.00015439746906781524, + "loss": 0.2109, + "step": 1217 + }, + { + "epoch": 1.1564206028957986, + "grad_norm": 0.03554106503725052, + "learning_rate": 0.00015430471285330846, + "loss": 0.1297, + "step": 1218 + }, + { + "epoch": 1.1573700450985047, + "grad_norm": 0.037721745669841766, + "learning_rate": 0.00015421189032811835, + "loss": 0.1364, + "step": 1219 + }, + { + "epoch": 1.1583194873012106, + "grad_norm": 0.025993864983320236, + "learning_rate": 0.00015411900160558912, + "loss": 0.1198, + "step": 1220 + }, + { + "epoch": 1.1592689295039165, + "grad_norm": 0.03330320492386818, + "learning_rate": 0.00015402604679914575, + "loss": 0.1253, + "step": 1221 + }, + { + "epoch": 1.1602183717066223, + "grad_norm": 0.03463476151227951, + "learning_rate": 0.00015393302602229408, + "loss": 0.1235, + "step": 1222 + }, + { + "epoch": 1.1611678139093282, + "grad_norm": 0.03458210453391075, + "learning_rate": 0.00015383993938862037, + "loss": 0.1238, + "step": 1223 + }, + { + "epoch": 1.162117256112034, + "grad_norm": 0.03344335779547691, + "learning_rate": 0.00015374678701179134, + "loss": 0.1267, + "step": 1224 + }, + { + "epoch": 1.16306669831474, + "grad_norm": 0.027585169300436974, + "learning_rate": 0.00015365356900555395, + "loss": 0.1192, + "step": 1225 + }, + { + "epoch": 1.164016140517446, + "grad_norm": 0.03154926374554634, + "learning_rate": 0.00015356028548373538, + "loss": 0.1288, + "step": 1226 + }, + { + "epoch": 1.164965582720152, + "grad_norm": 0.030677665024995804, + "learning_rate": 0.00015346693656024271, + "loss": 0.1292, + "step": 1227 + }, + { + "epoch": 1.1659150249228578, + "grad_norm": 0.030145341530442238, + "learning_rate": 0.00015337352234906298, + "loss": 0.1331, + "step": 1228 + }, + { + "epoch": 1.1668644671255637, + "grad_norm": 0.03683342784643173, + "learning_rate": 0.00015328004296426287, + "loss": 0.125, + "step": 1229 + }, + { + "epoch": 1.1678139093282696, + "grad_norm": 0.029177436605095863, + "learning_rate": 0.0001531864985199887, + "loss": 0.1313, + "step": 1230 + }, + { + "epoch": 1.1687633515309757, + "grad_norm": 0.0415952131152153, + "learning_rate": 0.0001530928891304662, + "loss": 0.1642, + "step": 1231 + }, + { + "epoch": 1.1697127937336815, + "grad_norm": 0.027380244806408882, + "learning_rate": 0.00015299921491000043, + "loss": 0.1254, + "step": 1232 + }, + { + "epoch": 1.1706622359363874, + "grad_norm": 0.03130066767334938, + "learning_rate": 0.00015290547597297555, + "loss": 0.1291, + "step": 1233 + }, + { + "epoch": 1.1716116781390933, + "grad_norm": 0.04524728283286095, + "learning_rate": 0.00015281167243385484, + "loss": 0.1627, + "step": 1234 + }, + { + "epoch": 1.1725611203417992, + "grad_norm": 0.04698526859283447, + "learning_rate": 0.0001527178044071804, + "loss": 0.1892, + "step": 1235 + }, + { + "epoch": 1.173510562544505, + "grad_norm": 0.041941914707422256, + "learning_rate": 0.00015262387200757314, + "loss": 0.1603, + "step": 1236 + }, + { + "epoch": 1.174460004747211, + "grad_norm": 0.03519544377923012, + "learning_rate": 0.0001525298753497324, + "loss": 0.1306, + "step": 1237 + }, + { + "epoch": 1.175409446949917, + "grad_norm": 0.036025673151016235, + "learning_rate": 0.00015243581454843624, + "loss": 0.1315, + "step": 1238 + }, + { + "epoch": 1.1763588891526229, + "grad_norm": 0.05133717134594917, + "learning_rate": 0.0001523416897185409, + "loss": 0.1661, + "step": 1239 + }, + { + "epoch": 1.1773083313553288, + "grad_norm": 0.03367958217859268, + "learning_rate": 0.00015224750097498073, + "loss": 0.1311, + "step": 1240 + }, + { + "epoch": 1.1782577735580346, + "grad_norm": 0.035372741520404816, + "learning_rate": 0.0001521532484327683, + "loss": 0.1324, + "step": 1241 + }, + { + "epoch": 1.1792072157607405, + "grad_norm": 0.048746585845947266, + "learning_rate": 0.000152058932206994, + "loss": 0.1756, + "step": 1242 + }, + { + "epoch": 1.1801566579634466, + "grad_norm": 0.03578799590468407, + "learning_rate": 0.00015196455241282592, + "loss": 0.1344, + "step": 1243 + }, + { + "epoch": 1.1811061001661525, + "grad_norm": 0.030654437839984894, + "learning_rate": 0.00015187010916550988, + "loss": 0.1268, + "step": 1244 + }, + { + "epoch": 1.1820555423688583, + "grad_norm": 0.02881826087832451, + "learning_rate": 0.0001517756025803691, + "loss": 0.1149, + "step": 1245 + }, + { + "epoch": 1.1830049845715642, + "grad_norm": 0.031242484226822853, + "learning_rate": 0.00015168103277280422, + "loss": 0.1338, + "step": 1246 + }, + { + "epoch": 1.18395442677427, + "grad_norm": 0.028528152033686638, + "learning_rate": 0.000151586399858293, + "loss": 0.1203, + "step": 1247 + }, + { + "epoch": 1.184903868976976, + "grad_norm": 0.028410421684384346, + "learning_rate": 0.00015149170395239035, + "loss": 0.1296, + "step": 1248 + }, + { + "epoch": 1.1858533111796818, + "grad_norm": 0.029383866116404533, + "learning_rate": 0.00015139694517072796, + "loss": 0.1284, + "step": 1249 + }, + { + "epoch": 1.186802753382388, + "grad_norm": 0.031084850430488586, + "learning_rate": 0.00015130212362901447, + "loss": 0.1272, + "step": 1250 + }, + { + "epoch": 1.1877521955850938, + "grad_norm": 0.035449109971523285, + "learning_rate": 0.00015120723944303497, + "loss": 0.1293, + "step": 1251 + }, + { + "epoch": 1.1887016377877997, + "grad_norm": 0.030890950933098793, + "learning_rate": 0.0001511122927286512, + "loss": 0.1221, + "step": 1252 + }, + { + "epoch": 1.1896510799905056, + "grad_norm": 0.06967780739068985, + "learning_rate": 0.0001510172836018012, + "loss": 0.1277, + "step": 1253 + }, + { + "epoch": 1.1906005221932114, + "grad_norm": 0.03075585328042507, + "learning_rate": 0.00015092221217849917, + "loss": 0.1278, + "step": 1254 + }, + { + "epoch": 1.1915499643959173, + "grad_norm": 0.040119290351867676, + "learning_rate": 0.00015082707857483544, + "loss": 0.1546, + "step": 1255 + }, + { + "epoch": 1.1924994065986234, + "grad_norm": 0.03376394882798195, + "learning_rate": 0.0001507318829069763, + "loss": 0.1325, + "step": 1256 + }, + { + "epoch": 1.1934488488013293, + "grad_norm": 0.034935321658849716, + "learning_rate": 0.00015063662529116368, + "loss": 0.1361, + "step": 1257 + }, + { + "epoch": 1.1943982910040352, + "grad_norm": 0.043972812592983246, + "learning_rate": 0.00015054130584371528, + "loss": 0.1292, + "step": 1258 + }, + { + "epoch": 1.195347733206741, + "grad_norm": 0.06039128825068474, + "learning_rate": 0.0001504459246810243, + "loss": 0.1958, + "step": 1259 + }, + { + "epoch": 1.196297175409447, + "grad_norm": 0.030998772010207176, + "learning_rate": 0.00015035048191955927, + "loss": 0.1166, + "step": 1260 + }, + { + "epoch": 1.1972466176121528, + "grad_norm": 0.0286384467035532, + "learning_rate": 0.00015025497767586393, + "loss": 0.1225, + "step": 1261 + }, + { + "epoch": 1.1981960598148587, + "grad_norm": 0.03200898319482803, + "learning_rate": 0.0001501594120665571, + "loss": 0.1244, + "step": 1262 + }, + { + "epoch": 1.1991455020175648, + "grad_norm": 0.032870370894670486, + "learning_rate": 0.00015006378520833252, + "loss": 0.126, + "step": 1263 + }, + { + "epoch": 1.2000949442202706, + "grad_norm": 0.034849826246500015, + "learning_rate": 0.00014996809721795872, + "loss": 0.1263, + "step": 1264 + }, + { + "epoch": 1.2010443864229765, + "grad_norm": 0.045324552804231644, + "learning_rate": 0.00014987234821227898, + "loss": 0.1668, + "step": 1265 + }, + { + "epoch": 1.2019938286256824, + "grad_norm": 0.036612797528505325, + "learning_rate": 0.0001497765383082109, + "loss": 0.1595, + "step": 1266 + }, + { + "epoch": 1.2029432708283883, + "grad_norm": 0.03746375814080238, + "learning_rate": 0.00014968066762274657, + "loss": 0.1644, + "step": 1267 + }, + { + "epoch": 1.2038927130310944, + "grad_norm": 0.03137432038784027, + "learning_rate": 0.0001495847362729523, + "loss": 0.1239, + "step": 1268 + }, + { + "epoch": 1.2048421552338002, + "grad_norm": 0.0314825214445591, + "learning_rate": 0.0001494887443759684, + "loss": 0.1258, + "step": 1269 + }, + { + "epoch": 1.205791597436506, + "grad_norm": 0.032157186418771744, + "learning_rate": 0.00014939269204900917, + "loss": 0.1233, + "step": 1270 + }, + { + "epoch": 1.206741039639212, + "grad_norm": 0.0410330593585968, + "learning_rate": 0.0001492965794093627, + "loss": 0.153, + "step": 1271 + }, + { + "epoch": 1.2076904818419179, + "grad_norm": 0.0325077585875988, + "learning_rate": 0.0001492004065743907, + "loss": 0.1241, + "step": 1272 + }, + { + "epoch": 1.2086399240446237, + "grad_norm": 0.033166393637657166, + "learning_rate": 0.00014910417366152844, + "loss": 0.1292, + "step": 1273 + }, + { + "epoch": 1.2095893662473296, + "grad_norm": 0.02926860749721527, + "learning_rate": 0.0001490078807882845, + "loss": 0.1242, + "step": 1274 + }, + { + "epoch": 1.2105388084500357, + "grad_norm": 0.04637501388788223, + "learning_rate": 0.00014891152807224066, + "loss": 0.1404, + "step": 1275 + }, + { + "epoch": 1.2114882506527416, + "grad_norm": 0.035617321729660034, + "learning_rate": 0.0001488151156310518, + "loss": 0.1292, + "step": 1276 + }, + { + "epoch": 1.2124376928554474, + "grad_norm": 0.036330446600914, + "learning_rate": 0.00014871864358244574, + "loss": 0.1326, + "step": 1277 + }, + { + "epoch": 1.2133871350581533, + "grad_norm": 0.034302353858947754, + "learning_rate": 0.00014862211204422305, + "loss": 0.1296, + "step": 1278 + }, + { + "epoch": 1.2143365772608592, + "grad_norm": 0.027070587500929832, + "learning_rate": 0.00014852552113425702, + "loss": 0.1227, + "step": 1279 + }, + { + "epoch": 1.215286019463565, + "grad_norm": 0.029872050508856773, + "learning_rate": 0.00014842887097049333, + "loss": 0.1333, + "step": 1280 + }, + { + "epoch": 1.2162354616662712, + "grad_norm": 0.0336853489279747, + "learning_rate": 0.0001483321616709501, + "loss": 0.1264, + "step": 1281 + }, + { + "epoch": 1.217184903868977, + "grad_norm": 0.03892628103494644, + "learning_rate": 0.00014823539335371763, + "loss": 0.1516, + "step": 1282 + }, + { + "epoch": 1.218134346071683, + "grad_norm": 0.03041486255824566, + "learning_rate": 0.00014813856613695825, + "loss": 0.1303, + "step": 1283 + }, + { + "epoch": 1.2190837882743888, + "grad_norm": 0.042988792061805725, + "learning_rate": 0.00014804168013890628, + "loss": 0.1697, + "step": 1284 + }, + { + "epoch": 1.2200332304770947, + "grad_norm": 0.03176325559616089, + "learning_rate": 0.00014794473547786777, + "loss": 0.1309, + "step": 1285 + }, + { + "epoch": 1.2209826726798005, + "grad_norm": 0.032539233565330505, + "learning_rate": 0.00014784773227222042, + "loss": 0.1336, + "step": 1286 + }, + { + "epoch": 1.2219321148825064, + "grad_norm": 0.03153330832719803, + "learning_rate": 0.00014775067064041341, + "loss": 0.1244, + "step": 1287 + }, + { + "epoch": 1.2228815570852125, + "grad_norm": 0.03229093924164772, + "learning_rate": 0.00014765355070096728, + "loss": 0.1331, + "step": 1288 + }, + { + "epoch": 1.2238309992879184, + "grad_norm": 0.03116600587964058, + "learning_rate": 0.0001475563725724737, + "loss": 0.1263, + "step": 1289 + }, + { + "epoch": 1.2247804414906243, + "grad_norm": 0.027543194591999054, + "learning_rate": 0.0001474591363735955, + "loss": 0.1319, + "step": 1290 + }, + { + "epoch": 1.2257298836933301, + "grad_norm": 0.031299810856580734, + "learning_rate": 0.00014736184222306637, + "loss": 0.1235, + "step": 1291 + }, + { + "epoch": 1.226679325896036, + "grad_norm": 0.030574094504117966, + "learning_rate": 0.00014726449023969073, + "loss": 0.1337, + "step": 1292 + }, + { + "epoch": 1.227628768098742, + "grad_norm": 0.04210914671421051, + "learning_rate": 0.0001471670805423437, + "loss": 0.1439, + "step": 1293 + }, + { + "epoch": 1.228578210301448, + "grad_norm": 0.03234979510307312, + "learning_rate": 0.00014706961324997077, + "loss": 0.1339, + "step": 1294 + }, + { + "epoch": 1.2295276525041539, + "grad_norm": 0.028707873076200485, + "learning_rate": 0.00014697208848158782, + "loss": 0.1271, + "step": 1295 + }, + { + "epoch": 1.2304770947068597, + "grad_norm": 0.03162172809243202, + "learning_rate": 0.0001468745063562809, + "loss": 0.1291, + "step": 1296 + }, + { + "epoch": 1.2314265369095656, + "grad_norm": 0.032575272023677826, + "learning_rate": 0.00014677686699320614, + "loss": 0.1345, + "step": 1297 + }, + { + "epoch": 1.2323759791122715, + "grad_norm": 0.02751932106912136, + "learning_rate": 0.0001466791705115895, + "loss": 0.12, + "step": 1298 + }, + { + "epoch": 1.2333254213149774, + "grad_norm": 0.026365652680397034, + "learning_rate": 0.00014658141703072675, + "loss": 0.1147, + "step": 1299 + }, + { + "epoch": 1.2342748635176835, + "grad_norm": 0.03227808326482773, + "learning_rate": 0.00014648360666998314, + "loss": 0.1332, + "step": 1300 + }, + { + "epoch": 1.2352243057203893, + "grad_norm": 0.057433344423770905, + "learning_rate": 0.00014638573954879356, + "loss": 0.2349, + "step": 1301 + }, + { + "epoch": 1.2361737479230952, + "grad_norm": 0.0376310870051384, + "learning_rate": 0.000146287815786662, + "loss": 0.1588, + "step": 1302 + }, + { + "epoch": 1.237123190125801, + "grad_norm": 0.04079489782452583, + "learning_rate": 0.00014618983550316182, + "loss": 0.1625, + "step": 1303 + }, + { + "epoch": 1.238072632328507, + "grad_norm": 0.03367112949490547, + "learning_rate": 0.00014609179881793524, + "loss": 0.1266, + "step": 1304 + }, + { + "epoch": 1.2390220745312128, + "grad_norm": 0.030257422477006912, + "learning_rate": 0.0001459937058506934, + "loss": 0.132, + "step": 1305 + }, + { + "epoch": 1.239971516733919, + "grad_norm": 0.03830450400710106, + "learning_rate": 0.00014589555672121622, + "loss": 0.1349, + "step": 1306 + }, + { + "epoch": 1.2409209589366248, + "grad_norm": 0.030318403616547585, + "learning_rate": 0.00014579735154935213, + "loss": 0.1346, + "step": 1307 + }, + { + "epoch": 1.2418704011393307, + "grad_norm": 0.033780913800001144, + "learning_rate": 0.000145699090455018, + "loss": 0.1373, + "step": 1308 + }, + { + "epoch": 1.2428198433420365, + "grad_norm": 0.03642027825117111, + "learning_rate": 0.00014560077355819904, + "loss": 0.1279, + "step": 1309 + }, + { + "epoch": 1.2437692855447424, + "grad_norm": 0.027665693312883377, + "learning_rate": 0.00014550240097894852, + "loss": 0.1177, + "step": 1310 + }, + { + "epoch": 1.2447187277474483, + "grad_norm": 0.030852187424898148, + "learning_rate": 0.00014540397283738777, + "loss": 0.1373, + "step": 1311 + }, + { + "epoch": 1.2456681699501542, + "grad_norm": 0.03137564659118652, + "learning_rate": 0.00014530548925370594, + "loss": 0.129, + "step": 1312 + }, + { + "epoch": 1.2466176121528603, + "grad_norm": 0.032402969896793365, + "learning_rate": 0.0001452069503481599, + "loss": 0.1394, + "step": 1313 + }, + { + "epoch": 1.2475670543555661, + "grad_norm": 0.049039825797080994, + "learning_rate": 0.00014510835624107396, + "loss": 0.1593, + "step": 1314 + }, + { + "epoch": 1.248516496558272, + "grad_norm": 0.0339655838906765, + "learning_rate": 0.00014500970705284006, + "loss": 0.1367, + "step": 1315 + }, + { + "epoch": 1.249465938760978, + "grad_norm": 0.040319304913282394, + "learning_rate": 0.00014491100290391716, + "loss": 0.1679, + "step": 1316 + }, + { + "epoch": 1.2504153809636838, + "grad_norm": 0.024075858294963837, + "learning_rate": 0.00014481224391483152, + "loss": 0.1273, + "step": 1317 + }, + { + "epoch": 1.2513648231663899, + "grad_norm": 0.02570619434118271, + "learning_rate": 0.00014471343020617625, + "loss": 0.1277, + "step": 1318 + }, + { + "epoch": 1.2523142653690957, + "grad_norm": 0.028086170554161072, + "learning_rate": 0.00014461456189861132, + "loss": 0.1246, + "step": 1319 + }, + { + "epoch": 1.2532637075718016, + "grad_norm": 0.02902062050998211, + "learning_rate": 0.0001445156391128633, + "loss": 0.1305, + "step": 1320 + }, + { + "epoch": 1.2542131497745075, + "grad_norm": 0.030352482572197914, + "learning_rate": 0.00014441666196972542, + "loss": 0.1329, + "step": 1321 + }, + { + "epoch": 1.2551625919772134, + "grad_norm": 0.029998816549777985, + "learning_rate": 0.00014431763059005718, + "loss": 0.131, + "step": 1322 + }, + { + "epoch": 1.2561120341799192, + "grad_norm": 0.028111204504966736, + "learning_rate": 0.00014421854509478435, + "loss": 0.124, + "step": 1323 + }, + { + "epoch": 1.257061476382625, + "grad_norm": 0.02926759235560894, + "learning_rate": 0.00014411940560489877, + "loss": 0.1215, + "step": 1324 + }, + { + "epoch": 1.258010918585331, + "grad_norm": 0.03321680426597595, + "learning_rate": 0.00014402021224145815, + "loss": 0.1216, + "step": 1325 + }, + { + "epoch": 1.258960360788037, + "grad_norm": 0.03386010602116585, + "learning_rate": 0.00014392096512558613, + "loss": 0.1335, + "step": 1326 + }, + { + "epoch": 1.259909802990743, + "grad_norm": 0.03989921137690544, + "learning_rate": 0.0001438216643784718, + "loss": 0.1481, + "step": 1327 + }, + { + "epoch": 1.2608592451934488, + "grad_norm": 0.030915161594748497, + "learning_rate": 0.00014372231012136995, + "loss": 0.1254, + "step": 1328 + }, + { + "epoch": 1.2618086873961547, + "grad_norm": 0.04395739734172821, + "learning_rate": 0.00014362290247560053, + "loss": 0.1537, + "step": 1329 + }, + { + "epoch": 1.2627581295988608, + "grad_norm": 0.02942941151559353, + "learning_rate": 0.00014352344156254873, + "loss": 0.1248, + "step": 1330 + }, + { + "epoch": 1.2637075718015667, + "grad_norm": 0.02858722023665905, + "learning_rate": 0.00014342392750366485, + "loss": 0.1236, + "step": 1331 + }, + { + "epoch": 1.2646570140042726, + "grad_norm": 0.029218707233667374, + "learning_rate": 0.000143324360420464, + "loss": 0.1227, + "step": 1332 + }, + { + "epoch": 1.2656064562069784, + "grad_norm": 0.03079938143491745, + "learning_rate": 0.0001432247404345261, + "loss": 0.1251, + "step": 1333 + }, + { + "epoch": 1.2665558984096843, + "grad_norm": 0.03626713901758194, + "learning_rate": 0.00014312506766749563, + "loss": 0.1407, + "step": 1334 + }, + { + "epoch": 1.2675053406123902, + "grad_norm": 0.026556458324193954, + "learning_rate": 0.00014302534224108152, + "loss": 0.1235, + "step": 1335 + }, + { + "epoch": 1.268454782815096, + "grad_norm": 0.033421531319618225, + "learning_rate": 0.00014292556427705706, + "loss": 0.1324, + "step": 1336 + }, + { + "epoch": 1.269404225017802, + "grad_norm": 0.0425841398537159, + "learning_rate": 0.00014282573389725966, + "loss": 0.1674, + "step": 1337 + }, + { + "epoch": 1.270353667220508, + "grad_norm": 0.03258546441793442, + "learning_rate": 0.00014272585122359068, + "loss": 0.131, + "step": 1338 + }, + { + "epoch": 1.271303109423214, + "grad_norm": 0.03566194325685501, + "learning_rate": 0.00014262591637801536, + "loss": 0.128, + "step": 1339 + }, + { + "epoch": 1.2722525516259198, + "grad_norm": 0.03155380114912987, + "learning_rate": 0.0001425259294825627, + "loss": 0.1277, + "step": 1340 + }, + { + "epoch": 1.2732019938286256, + "grad_norm": 0.04435742273926735, + "learning_rate": 0.00014242589065932524, + "loss": 0.1594, + "step": 1341 + }, + { + "epoch": 1.2741514360313315, + "grad_norm": 0.03839895501732826, + "learning_rate": 0.0001423258000304589, + "loss": 0.1598, + "step": 1342 + }, + { + "epoch": 1.2751008782340376, + "grad_norm": 0.037625472992658615, + "learning_rate": 0.00014222565771818282, + "loss": 0.1276, + "step": 1343 + }, + { + "epoch": 1.2760503204367435, + "grad_norm": 0.029852135106921196, + "learning_rate": 0.00014212546384477934, + "loss": 0.1272, + "step": 1344 + }, + { + "epoch": 1.2769997626394494, + "grad_norm": 0.04018719121813774, + "learning_rate": 0.00014202521853259368, + "loss": 0.153, + "step": 1345 + }, + { + "epoch": 1.2779492048421552, + "grad_norm": 0.028893720358610153, + "learning_rate": 0.00014192492190403402, + "loss": 0.1245, + "step": 1346 + }, + { + "epoch": 1.2788986470448611, + "grad_norm": 0.029052307829260826, + "learning_rate": 0.000141824574081571, + "loss": 0.1272, + "step": 1347 + }, + { + "epoch": 1.279848089247567, + "grad_norm": 0.03023959882557392, + "learning_rate": 0.00014172417518773788, + "loss": 0.1259, + "step": 1348 + }, + { + "epoch": 1.2807975314502729, + "grad_norm": 0.06486006826162338, + "learning_rate": 0.00014162372534513027, + "loss": 0.2279, + "step": 1349 + }, + { + "epoch": 1.2817469736529787, + "grad_norm": 0.034129489213228226, + "learning_rate": 0.00014152322467640599, + "loss": 0.138, + "step": 1350 + }, + { + "epoch": 1.2826964158556848, + "grad_norm": 0.034559451043605804, + "learning_rate": 0.0001414226733042849, + "loss": 0.1321, + "step": 1351 + }, + { + "epoch": 1.2836458580583907, + "grad_norm": 0.03269064798951149, + "learning_rate": 0.0001413220713515489, + "loss": 0.1297, + "step": 1352 + }, + { + "epoch": 1.2845953002610966, + "grad_norm": 0.030022764578461647, + "learning_rate": 0.0001412214189410414, + "loss": 0.1278, + "step": 1353 + }, + { + "epoch": 1.2855447424638025, + "grad_norm": 0.05017710104584694, + "learning_rate": 0.00014112071619566766, + "loss": 0.1572, + "step": 1354 + }, + { + "epoch": 1.2864941846665086, + "grad_norm": 0.035493746399879456, + "learning_rate": 0.00014101996323839433, + "loss": 0.1277, + "step": 1355 + }, + { + "epoch": 1.2874436268692144, + "grad_norm": 0.03152285888791084, + "learning_rate": 0.0001409191601922493, + "loss": 0.1321, + "step": 1356 + }, + { + "epoch": 1.2883930690719203, + "grad_norm": 0.029683001339435577, + "learning_rate": 0.00014081830718032175, + "loss": 0.1224, + "step": 1357 + }, + { + "epoch": 1.2893425112746262, + "grad_norm": 0.03202647715806961, + "learning_rate": 0.0001407174043257617, + "loss": 0.13, + "step": 1358 + }, + { + "epoch": 1.290291953477332, + "grad_norm": 0.026956327259540558, + "learning_rate": 0.00014061645175178025, + "loss": 0.1225, + "step": 1359 + }, + { + "epoch": 1.291241395680038, + "grad_norm": 0.03426060825586319, + "learning_rate": 0.00014051544958164903, + "loss": 0.1345, + "step": 1360 + }, + { + "epoch": 1.2921908378827438, + "grad_norm": 0.031120451167225838, + "learning_rate": 0.00014041439793870036, + "loss": 0.1246, + "step": 1361 + }, + { + "epoch": 1.2931402800854497, + "grad_norm": 0.02656574547290802, + "learning_rate": 0.00014031329694632683, + "loss": 0.1297, + "step": 1362 + }, + { + "epoch": 1.2940897222881558, + "grad_norm": 0.02752675488591194, + "learning_rate": 0.00014021214672798143, + "loss": 0.1294, + "step": 1363 + }, + { + "epoch": 1.2950391644908616, + "grad_norm": 0.02884535677731037, + "learning_rate": 0.00014011094740717714, + "loss": 0.126, + "step": 1364 + }, + { + "epoch": 1.2959886066935675, + "grad_norm": 0.03029620461165905, + "learning_rate": 0.00014000969910748704, + "loss": 0.1338, + "step": 1365 + }, + { + "epoch": 1.2969380488962734, + "grad_norm": 0.04302069917321205, + "learning_rate": 0.00013990840195254384, + "loss": 0.1653, + "step": 1366 + }, + { + "epoch": 1.2978874910989793, + "grad_norm": 0.048259928822517395, + "learning_rate": 0.00013980705606604011, + "loss": 0.1269, + "step": 1367 + }, + { + "epoch": 1.2988369333016854, + "grad_norm": 0.029876641929149628, + "learning_rate": 0.00013970566157172774, + "loss": 0.1282, + "step": 1368 + }, + { + "epoch": 1.2997863755043912, + "grad_norm": 0.35130763053894043, + "learning_rate": 0.00013960421859341804, + "loss": 0.1434, + "step": 1369 + }, + { + "epoch": 1.3007358177070971, + "grad_norm": 0.23888662457466125, + "learning_rate": 0.00013950272725498156, + "loss": 0.186, + "step": 1370 + }, + { + "epoch": 1.301685259909803, + "grad_norm": 0.24867364764213562, + "learning_rate": 0.00013940118768034792, + "loss": 0.1585, + "step": 1371 + }, + { + "epoch": 1.3026347021125089, + "grad_norm": 0.10958488285541534, + "learning_rate": 0.0001392995999935055, + "loss": 0.1448, + "step": 1372 + }, + { + "epoch": 1.3035841443152147, + "grad_norm": 0.05493846908211708, + "learning_rate": 0.0001391979643185016, + "loss": 0.1292, + "step": 1373 + }, + { + "epoch": 1.3045335865179206, + "grad_norm": 0.0429663360118866, + "learning_rate": 0.000139096280779442, + "loss": 0.1243, + "step": 1374 + }, + { + "epoch": 1.3054830287206267, + "grad_norm": 0.02995472215116024, + "learning_rate": 0.000138994549500491, + "loss": 0.1216, + "step": 1375 + }, + { + "epoch": 1.3064324709233326, + "grad_norm": 0.04113904386758804, + "learning_rate": 0.00013889277060587119, + "loss": 0.1586, + "step": 1376 + }, + { + "epoch": 1.3073819131260385, + "grad_norm": 0.030523164197802544, + "learning_rate": 0.0001387909442198632, + "loss": 0.1212, + "step": 1377 + }, + { + "epoch": 1.3083313553287443, + "grad_norm": 0.04093822091817856, + "learning_rate": 0.00013868907046680576, + "loss": 0.1254, + "step": 1378 + }, + { + "epoch": 1.3092807975314502, + "grad_norm": 0.04895343258976936, + "learning_rate": 0.0001385871494710954, + "loss": 0.1636, + "step": 1379 + }, + { + "epoch": 1.3102302397341563, + "grad_norm": 0.09062381833791733, + "learning_rate": 0.0001384851813571864, + "loss": 0.167, + "step": 1380 + }, + { + "epoch": 1.3111796819368622, + "grad_norm": 0.034514930099248886, + "learning_rate": 0.00013838316624959044, + "loss": 0.1186, + "step": 1381 + }, + { + "epoch": 1.312129124139568, + "grad_norm": 0.052746132016181946, + "learning_rate": 0.0001382811042728767, + "loss": 0.1289, + "step": 1382 + }, + { + "epoch": 1.313078566342274, + "grad_norm": 0.06090299040079117, + "learning_rate": 0.00013817899555167154, + "loss": 0.1599, + "step": 1383 + }, + { + "epoch": 1.3140280085449798, + "grad_norm": 0.036167677491903305, + "learning_rate": 0.00013807684021065842, + "loss": 0.1294, + "step": 1384 + }, + { + "epoch": 1.3149774507476857, + "grad_norm": 0.05916972458362579, + "learning_rate": 0.00013797463837457775, + "loss": 0.1263, + "step": 1385 + }, + { + "epoch": 1.3159268929503916, + "grad_norm": 0.03488500416278839, + "learning_rate": 0.00013787239016822662, + "loss": 0.1347, + "step": 1386 + }, + { + "epoch": 1.3168763351530974, + "grad_norm": 0.038088779896497726, + "learning_rate": 0.00013777009571645885, + "loss": 0.1302, + "step": 1387 + }, + { + "epoch": 1.3178257773558035, + "grad_norm": 0.05069038271903992, + "learning_rate": 0.00013766775514418469, + "loss": 0.1553, + "step": 1388 + }, + { + "epoch": 1.3187752195585094, + "grad_norm": 0.06475794315338135, + "learning_rate": 0.00013756536857637065, + "loss": 0.2018, + "step": 1389 + }, + { + "epoch": 1.3197246617612153, + "grad_norm": 0.03393110632896423, + "learning_rate": 0.00013746293613803952, + "loss": 0.1238, + "step": 1390 + }, + { + "epoch": 1.3206741039639212, + "grad_norm": 0.04623769596219063, + "learning_rate": 0.00013736045795427002, + "loss": 0.1603, + "step": 1391 + }, + { + "epoch": 1.321623546166627, + "grad_norm": 0.03696979209780693, + "learning_rate": 0.0001372579341501967, + "loss": 0.1291, + "step": 1392 + }, + { + "epoch": 1.3225729883693331, + "grad_norm": 0.041281502693891525, + "learning_rate": 0.00013715536485100994, + "loss": 0.1344, + "step": 1393 + }, + { + "epoch": 1.323522430572039, + "grad_norm": 0.03636416420340538, + "learning_rate": 0.00013705275018195557, + "loss": 0.1334, + "step": 1394 + }, + { + "epoch": 1.3244718727747449, + "grad_norm": 0.037941355258226395, + "learning_rate": 0.0001369500902683348, + "loss": 0.1256, + "step": 1395 + }, + { + "epoch": 1.3254213149774507, + "grad_norm": 0.04853476956486702, + "learning_rate": 0.0001368473852355042, + "loss": 0.1604, + "step": 1396 + }, + { + "epoch": 1.3263707571801566, + "grad_norm": 0.034059878438711166, + "learning_rate": 0.00013674463520887533, + "loss": 0.1308, + "step": 1397 + }, + { + "epoch": 1.3273201993828625, + "grad_norm": 0.03482759743928909, + "learning_rate": 0.00013664184031391473, + "loss": 0.128, + "step": 1398 + }, + { + "epoch": 1.3282696415855684, + "grad_norm": 0.032961416989564896, + "learning_rate": 0.00013653900067614377, + "loss": 0.1276, + "step": 1399 + }, + { + "epoch": 1.3292190837882745, + "grad_norm": 0.03538922220468521, + "learning_rate": 0.00013643611642113842, + "loss": 0.1215, + "step": 1400 + }, + { + "epoch": 1.3301685259909803, + "grad_norm": 0.032544538378715515, + "learning_rate": 0.00013633318767452903, + "loss": 0.1235, + "step": 1401 + }, + { + "epoch": 1.3311179681936862, + "grad_norm": 0.04236935079097748, + "learning_rate": 0.00013623021456200048, + "loss": 0.1663, + "step": 1402 + }, + { + "epoch": 1.332067410396392, + "grad_norm": 0.04283679649233818, + "learning_rate": 0.00013612719720929164, + "loss": 0.1622, + "step": 1403 + }, + { + "epoch": 1.333016852599098, + "grad_norm": 0.03691123425960541, + "learning_rate": 0.00013602413574219553, + "loss": 0.1247, + "step": 1404 + }, + { + "epoch": 1.333966294801804, + "grad_norm": 0.028608130291104317, + "learning_rate": 0.000135921030286559, + "loss": 0.1153, + "step": 1405 + }, + { + "epoch": 1.33491573700451, + "grad_norm": 0.03310587257146835, + "learning_rate": 0.00013581788096828253, + "loss": 0.1305, + "step": 1406 + }, + { + "epoch": 1.3358651792072158, + "grad_norm": 0.03368659317493439, + "learning_rate": 0.00013571468791332024, + "loss": 0.128, + "step": 1407 + }, + { + "epoch": 1.3368146214099217, + "grad_norm": 0.04785076901316643, + "learning_rate": 0.00013561145124767968, + "loss": 0.1715, + "step": 1408 + }, + { + "epoch": 1.3377640636126276, + "grad_norm": 0.03625485301017761, + "learning_rate": 0.0001355081710974217, + "loss": 0.1305, + "step": 1409 + }, + { + "epoch": 1.3387135058153334, + "grad_norm": 0.03318242356181145, + "learning_rate": 0.00013540484758866, + "loss": 0.1244, + "step": 1410 + }, + { + "epoch": 1.3396629480180393, + "grad_norm": 0.03148429095745087, + "learning_rate": 0.0001353014808475615, + "loss": 0.1311, + "step": 1411 + }, + { + "epoch": 1.3406123902207452, + "grad_norm": 0.03518190607428551, + "learning_rate": 0.00013519807100034577, + "loss": 0.1276, + "step": 1412 + }, + { + "epoch": 1.3415618324234513, + "grad_norm": 0.031286850571632385, + "learning_rate": 0.00013509461817328507, + "loss": 0.1252, + "step": 1413 + }, + { + "epoch": 1.3425112746261572, + "grad_norm": 0.04668812453746796, + "learning_rate": 0.00013499112249270407, + "loss": 0.1639, + "step": 1414 + }, + { + "epoch": 1.343460716828863, + "grad_norm": 0.03220203518867493, + "learning_rate": 0.00013488758408497988, + "loss": 0.1254, + "step": 1415 + }, + { + "epoch": 1.344410159031569, + "grad_norm": 0.03599967062473297, + "learning_rate": 0.0001347840030765417, + "loss": 0.1307, + "step": 1416 + }, + { + "epoch": 1.3453596012342748, + "grad_norm": 0.03225992992520332, + "learning_rate": 0.00013468037959387075, + "loss": 0.12, + "step": 1417 + }, + { + "epoch": 1.3463090434369809, + "grad_norm": 0.0338221937417984, + "learning_rate": 0.00013457671376350012, + "loss": 0.1199, + "step": 1418 + }, + { + "epoch": 1.3472584856396868, + "grad_norm": 0.046623844653367996, + "learning_rate": 0.00013447300571201468, + "loss": 0.1695, + "step": 1419 + }, + { + "epoch": 1.3482079278423926, + "grad_norm": 0.029645482078194618, + "learning_rate": 0.00013436925556605078, + "loss": 0.127, + "step": 1420 + }, + { + "epoch": 1.3491573700450985, + "grad_norm": 0.03167693316936493, + "learning_rate": 0.00013426546345229618, + "loss": 0.1268, + "step": 1421 + }, + { + "epoch": 1.3501068122478044, + "grad_norm": 0.04903126880526543, + "learning_rate": 0.0001341616294974899, + "loss": 0.1616, + "step": 1422 + }, + { + "epoch": 1.3510562544505103, + "grad_norm": 0.03323996067047119, + "learning_rate": 0.00013405775382842206, + "loss": 0.1345, + "step": 1423 + }, + { + "epoch": 1.3520056966532161, + "grad_norm": 0.03047449141740799, + "learning_rate": 0.0001339538365719337, + "loss": 0.1286, + "step": 1424 + }, + { + "epoch": 1.3529551388559222, + "grad_norm": 0.03170877322554588, + "learning_rate": 0.00013384987785491665, + "loss": 0.1264, + "step": 1425 + }, + { + "epoch": 1.353904581058628, + "grad_norm": 0.03209366276860237, + "learning_rate": 0.00013374587780431337, + "loss": 0.1256, + "step": 1426 + }, + { + "epoch": 1.354854023261334, + "grad_norm": 0.04508209601044655, + "learning_rate": 0.00013364183654711678, + "loss": 0.1675, + "step": 1427 + }, + { + "epoch": 1.3558034654640398, + "grad_norm": 0.0423690564930439, + "learning_rate": 0.00013353775421037008, + "loss": 0.1578, + "step": 1428 + }, + { + "epoch": 1.3567529076667457, + "grad_norm": 0.03641896694898605, + "learning_rate": 0.0001334336309211668, + "loss": 0.1321, + "step": 1429 + }, + { + "epoch": 1.3577023498694518, + "grad_norm": 0.03118027374148369, + "learning_rate": 0.00013332946680665023, + "loss": 0.1284, + "step": 1430 + }, + { + "epoch": 1.3586517920721577, + "grad_norm": 0.029232513159513474, + "learning_rate": 0.00013322526199401367, + "loss": 0.1251, + "step": 1431 + }, + { + "epoch": 1.3596012342748636, + "grad_norm": 0.028796685859560966, + "learning_rate": 0.00013312101661050007, + "loss": 0.1266, + "step": 1432 + }, + { + "epoch": 1.3605506764775694, + "grad_norm": 0.04318710416555405, + "learning_rate": 0.00013301673078340196, + "loss": 0.162, + "step": 1433 + }, + { + "epoch": 1.3615001186802753, + "grad_norm": 0.0319884791970253, + "learning_rate": 0.00013291240464006118, + "loss": 0.1248, + "step": 1434 + }, + { + "epoch": 1.3624495608829812, + "grad_norm": 0.034977950155735016, + "learning_rate": 0.0001328080383078689, + "loss": 0.1325, + "step": 1435 + }, + { + "epoch": 1.363399003085687, + "grad_norm": 0.026079673320055008, + "learning_rate": 0.00013270363191426524, + "loss": 0.1266, + "step": 1436 + }, + { + "epoch": 1.364348445288393, + "grad_norm": 0.03158127889037132, + "learning_rate": 0.0001325991855867394, + "loss": 0.1326, + "step": 1437 + }, + { + "epoch": 1.365297887491099, + "grad_norm": 0.030608203262090683, + "learning_rate": 0.00013249469945282916, + "loss": 0.1358, + "step": 1438 + }, + { + "epoch": 1.366247329693805, + "grad_norm": 0.028404321521520615, + "learning_rate": 0.00013239017364012105, + "loss": 0.1273, + "step": 1439 + }, + { + "epoch": 1.3671967718965108, + "grad_norm": 0.03177287429571152, + "learning_rate": 0.00013228560827624995, + "loss": 0.1241, + "step": 1440 + }, + { + "epoch": 1.3681462140992167, + "grad_norm": 0.030803440138697624, + "learning_rate": 0.00013218100348889912, + "loss": 0.1271, + "step": 1441 + }, + { + "epoch": 1.3690956563019228, + "grad_norm": 0.038757532835006714, + "learning_rate": 0.0001320763594057999, + "loss": 0.1302, + "step": 1442 + }, + { + "epoch": 1.3700450985046286, + "grad_norm": 0.030192391946911812, + "learning_rate": 0.00013197167615473164, + "loss": 0.1246, + "step": 1443 + }, + { + "epoch": 1.3709945407073345, + "grad_norm": 0.056465089321136475, + "learning_rate": 0.00013186695386352158, + "loss": 0.186, + "step": 1444 + }, + { + "epoch": 1.3719439829100404, + "grad_norm": 0.029975950717926025, + "learning_rate": 0.00013176219266004442, + "loss": 0.1238, + "step": 1445 + }, + { + "epoch": 1.3728934251127463, + "grad_norm": 0.03526061028242111, + "learning_rate": 0.00013165739267222262, + "loss": 0.1198, + "step": 1446 + }, + { + "epoch": 1.3738428673154521, + "grad_norm": 0.031407009810209274, + "learning_rate": 0.0001315525540280259, + "loss": 0.1283, + "step": 1447 + }, + { + "epoch": 1.374792309518158, + "grad_norm": 0.05062803998589516, + "learning_rate": 0.0001314476768554712, + "loss": 0.1598, + "step": 1448 + }, + { + "epoch": 1.3757417517208639, + "grad_norm": 0.05837153270840645, + "learning_rate": 0.0001313427612826224, + "loss": 0.1706, + "step": 1449 + }, + { + "epoch": 1.37669119392357, + "grad_norm": 0.032810915261507034, + "learning_rate": 0.0001312378074375904, + "loss": 0.1228, + "step": 1450 + }, + { + "epoch": 1.3776406361262759, + "grad_norm": 0.052471403032541275, + "learning_rate": 0.0001311328154485328, + "loss": 0.2028, + "step": 1451 + }, + { + "epoch": 1.3785900783289817, + "grad_norm": 0.062009330838918686, + "learning_rate": 0.00013102778544365378, + "loss": 0.1659, + "step": 1452 + }, + { + "epoch": 1.3795395205316876, + "grad_norm": 0.03157106414437294, + "learning_rate": 0.00013092271755120392, + "loss": 0.1103, + "step": 1453 + }, + { + "epoch": 1.3804889627343935, + "grad_norm": 0.03269607201218605, + "learning_rate": 0.00013081761189948006, + "loss": 0.1274, + "step": 1454 + }, + { + "epoch": 1.3814384049370996, + "grad_norm": 0.032491281628608704, + "learning_rate": 0.00013071246861682515, + "loss": 0.1299, + "step": 1455 + }, + { + "epoch": 1.3823878471398054, + "grad_norm": 0.04476369544863701, + "learning_rate": 0.00013060728783162814, + "loss": 0.1543, + "step": 1456 + }, + { + "epoch": 1.3833372893425113, + "grad_norm": 0.03646747022867203, + "learning_rate": 0.0001305020696723237, + "loss": 0.1401, + "step": 1457 + }, + { + "epoch": 1.3842867315452172, + "grad_norm": 0.03056688979268074, + "learning_rate": 0.0001303968142673922, + "loss": 0.1318, + "step": 1458 + }, + { + "epoch": 1.385236173747923, + "grad_norm": 0.0505138523876667, + "learning_rate": 0.00013029152174535942, + "loss": 0.1702, + "step": 1459 + }, + { + "epoch": 1.386185615950629, + "grad_norm": 0.03478003665804863, + "learning_rate": 0.00013018619223479654, + "loss": 0.136, + "step": 1460 + }, + { + "epoch": 1.3871350581533348, + "grad_norm": 0.03196396306157112, + "learning_rate": 0.00013008082586431983, + "loss": 0.1235, + "step": 1461 + }, + { + "epoch": 1.3880845003560407, + "grad_norm": 0.030580265447497368, + "learning_rate": 0.0001299754227625907, + "loss": 0.1274, + "step": 1462 + }, + { + "epoch": 1.3890339425587468, + "grad_norm": 0.043844275176525116, + "learning_rate": 0.00012986998305831524, + "loss": 0.172, + "step": 1463 + }, + { + "epoch": 1.3899833847614527, + "grad_norm": 0.031638097018003464, + "learning_rate": 0.00012976450688024433, + "loss": 0.1221, + "step": 1464 + }, + { + "epoch": 1.3909328269641585, + "grad_norm": 0.030004551634192467, + "learning_rate": 0.00012965899435717337, + "loss": 0.1321, + "step": 1465 + }, + { + "epoch": 1.3918822691668644, + "grad_norm": 0.031170252710580826, + "learning_rate": 0.00012955344561794218, + "loss": 0.1327, + "step": 1466 + }, + { + "epoch": 1.3928317113695705, + "grad_norm": 0.02902391366660595, + "learning_rate": 0.00012944786079143472, + "loss": 0.1266, + "step": 1467 + }, + { + "epoch": 1.3937811535722764, + "grad_norm": 0.031269557774066925, + "learning_rate": 0.00012934224000657913, + "loss": 0.1237, + "step": 1468 + }, + { + "epoch": 1.3947305957749823, + "grad_norm": 0.04232865571975708, + "learning_rate": 0.0001292365833923473, + "loss": 0.1529, + "step": 1469 + }, + { + "epoch": 1.3956800379776881, + "grad_norm": 0.03645455837249756, + "learning_rate": 0.00012913089107775502, + "loss": 0.1594, + "step": 1470 + }, + { + "epoch": 1.396629480180394, + "grad_norm": 0.029367268085479736, + "learning_rate": 0.00012902516319186161, + "loss": 0.121, + "step": 1471 + }, + { + "epoch": 1.3975789223830999, + "grad_norm": 0.03407928720116615, + "learning_rate": 0.00012891939986376985, + "loss": 0.1289, + "step": 1472 + }, + { + "epoch": 1.3985283645858058, + "grad_norm": 0.034839022904634476, + "learning_rate": 0.00012881360122262575, + "loss": 0.1321, + "step": 1473 + }, + { + "epoch": 1.3994778067885116, + "grad_norm": 0.04005248472094536, + "learning_rate": 0.00012870776739761847, + "loss": 0.17, + "step": 1474 + }, + { + "epoch": 1.4004272489912177, + "grad_norm": 0.045347243547439575, + "learning_rate": 0.00012860189851798012, + "loss": 0.16, + "step": 1475 + }, + { + "epoch": 1.4013766911939236, + "grad_norm": 0.031093263998627663, + "learning_rate": 0.00012849599471298565, + "loss": 0.1268, + "step": 1476 + }, + { + "epoch": 1.4023261333966295, + "grad_norm": 0.025807669386267662, + "learning_rate": 0.00012839005611195269, + "loss": 0.1155, + "step": 1477 + }, + { + "epoch": 1.4032755755993354, + "grad_norm": 0.04496198520064354, + "learning_rate": 0.00012828408284424117, + "loss": 0.1696, + "step": 1478 + }, + { + "epoch": 1.4042250178020412, + "grad_norm": 0.028986521065235138, + "learning_rate": 0.00012817807503925357, + "loss": 0.1244, + "step": 1479 + }, + { + "epoch": 1.4051744600047473, + "grad_norm": 0.043852079659700394, + "learning_rate": 0.00012807203282643443, + "loss": 0.1562, + "step": 1480 + }, + { + "epoch": 1.4061239022074532, + "grad_norm": 0.028449110686779022, + "learning_rate": 0.00012796595633527032, + "loss": 0.1276, + "step": 1481 + }, + { + "epoch": 1.407073344410159, + "grad_norm": 0.03772464022040367, + "learning_rate": 0.00012785984569528975, + "loss": 0.163, + "step": 1482 + }, + { + "epoch": 1.408022786612865, + "grad_norm": 0.03227540850639343, + "learning_rate": 0.00012775370103606276, + "loss": 0.1272, + "step": 1483 + }, + { + "epoch": 1.4089722288155708, + "grad_norm": 0.03001963160932064, + "learning_rate": 0.0001276475224872011, + "loss": 0.1256, + "step": 1484 + }, + { + "epoch": 1.4099216710182767, + "grad_norm": 0.0357728935778141, + "learning_rate": 0.00012754131017835777, + "loss": 0.1355, + "step": 1485 + }, + { + "epoch": 1.4108711132209826, + "grad_norm": 0.03219794109463692, + "learning_rate": 0.0001274350642392271, + "loss": 0.1304, + "step": 1486 + }, + { + "epoch": 1.4118205554236885, + "grad_norm": 0.04606242850422859, + "learning_rate": 0.00012732878479954445, + "loss": 0.1629, + "step": 1487 + }, + { + "epoch": 1.4127699976263945, + "grad_norm": 0.04288827255368233, + "learning_rate": 0.000127222471989086, + "loss": 0.1667, + "step": 1488 + }, + { + "epoch": 1.4137194398291004, + "grad_norm": 0.031533095985651016, + "learning_rate": 0.0001271161259376688, + "loss": 0.1266, + "step": 1489 + }, + { + "epoch": 1.4146688820318063, + "grad_norm": 0.03418329730629921, + "learning_rate": 0.00012700974677515046, + "loss": 0.1441, + "step": 1490 + }, + { + "epoch": 1.4156183242345122, + "grad_norm": 0.028918685391545296, + "learning_rate": 0.00012690333463142897, + "loss": 0.117, + "step": 1491 + }, + { + "epoch": 1.4165677664372183, + "grad_norm": 0.04630662500858307, + "learning_rate": 0.00012679688963644265, + "loss": 0.1694, + "step": 1492 + }, + { + "epoch": 1.4175172086399241, + "grad_norm": 0.028670761734247208, + "learning_rate": 0.00012669041192016993, + "loss": 0.1218, + "step": 1493 + }, + { + "epoch": 1.41846665084263, + "grad_norm": 0.03250902146100998, + "learning_rate": 0.0001265839016126291, + "loss": 0.1353, + "step": 1494 + }, + { + "epoch": 1.419416093045336, + "grad_norm": 0.03904202580451965, + "learning_rate": 0.00012647735884387842, + "loss": 0.1566, + "step": 1495 + }, + { + "epoch": 1.4203655352480418, + "grad_norm": 0.030610278248786926, + "learning_rate": 0.00012637078374401568, + "loss": 0.1248, + "step": 1496 + }, + { + "epoch": 1.4213149774507476, + "grad_norm": 0.0320439413189888, + "learning_rate": 0.00012626417644317808, + "loss": 0.1341, + "step": 1497 + }, + { + "epoch": 1.4222644196534535, + "grad_norm": 0.03740748390555382, + "learning_rate": 0.0001261575370715423, + "loss": 0.1374, + "step": 1498 + }, + { + "epoch": 1.4232138618561594, + "grad_norm": 0.041164278984069824, + "learning_rate": 0.00012605086575932407, + "loss": 0.1242, + "step": 1499 + }, + { + "epoch": 1.4241633040588655, + "grad_norm": 0.02908271551132202, + "learning_rate": 0.00012594416263677816, + "loss": 0.1224, + "step": 1500 + }, + { + "epoch": 1.4251127462615714, + "grad_norm": 0.030539128929376602, + "learning_rate": 0.0001258374278341982, + "loss": 0.1236, + "step": 1501 + }, + { + "epoch": 1.4260621884642772, + "grad_norm": 0.027197500690817833, + "learning_rate": 0.00012573066148191647, + "loss": 0.1254, + "step": 1502 + }, + { + "epoch": 1.427011630666983, + "grad_norm": 0.031813718378543854, + "learning_rate": 0.00012562386371030377, + "loss": 0.1294, + "step": 1503 + }, + { + "epoch": 1.427961072869689, + "grad_norm": 0.04184641316533089, + "learning_rate": 0.00012551703464976928, + "loss": 0.1615, + "step": 1504 + }, + { + "epoch": 1.428910515072395, + "grad_norm": 0.03790717199444771, + "learning_rate": 0.00012541017443076042, + "loss": 0.1638, + "step": 1505 + }, + { + "epoch": 1.429859957275101, + "grad_norm": 0.03084125556051731, + "learning_rate": 0.00012530328318376258, + "loss": 0.1292, + "step": 1506 + }, + { + "epoch": 1.4308093994778068, + "grad_norm": 0.042278289794921875, + "learning_rate": 0.00012519636103929912, + "loss": 0.1691, + "step": 1507 + }, + { + "epoch": 1.4317588416805127, + "grad_norm": 0.02734595723450184, + "learning_rate": 0.0001250894081279311, + "loss": 0.1248, + "step": 1508 + }, + { + "epoch": 1.4327082838832186, + "grad_norm": 0.02997264452278614, + "learning_rate": 0.00012498242458025712, + "loss": 0.124, + "step": 1509 + }, + { + "epoch": 1.4336577260859245, + "grad_norm": 0.031008126214146614, + "learning_rate": 0.00012487541052691323, + "loss": 0.1335, + "step": 1510 + }, + { + "epoch": 1.4346071682886303, + "grad_norm": 0.042471520602703094, + "learning_rate": 0.0001247683660985727, + "loss": 0.1589, + "step": 1511 + }, + { + "epoch": 1.4355566104913362, + "grad_norm": 0.027912134304642677, + "learning_rate": 0.00012466129142594588, + "loss": 0.1208, + "step": 1512 + }, + { + "epoch": 1.4365060526940423, + "grad_norm": 0.03753120079636574, + "learning_rate": 0.0001245541866397801, + "loss": 0.1626, + "step": 1513 + }, + { + "epoch": 1.4374554948967482, + "grad_norm": 0.02756452187895775, + "learning_rate": 0.0001244470518708594, + "loss": 0.1215, + "step": 1514 + }, + { + "epoch": 1.438404937099454, + "grad_norm": 0.03357706964015961, + "learning_rate": 0.0001243398872500045, + "loss": 0.1376, + "step": 1515 + }, + { + "epoch": 1.43935437930216, + "grad_norm": 0.032955266535282135, + "learning_rate": 0.00012423269290807258, + "loss": 0.1364, + "step": 1516 + }, + { + "epoch": 1.440303821504866, + "grad_norm": 0.03405732661485672, + "learning_rate": 0.000124125468975957, + "loss": 0.126, + "step": 1517 + }, + { + "epoch": 1.441253263707572, + "grad_norm": 0.03659766912460327, + "learning_rate": 0.00012401821558458728, + "loss": 0.1561, + "step": 1518 + }, + { + "epoch": 1.4422027059102778, + "grad_norm": 0.030413653701543808, + "learning_rate": 0.00012391093286492905, + "loss": 0.1253, + "step": 1519 + }, + { + "epoch": 1.4431521481129836, + "grad_norm": 0.027322586625814438, + "learning_rate": 0.00012380362094798362, + "loss": 0.1217, + "step": 1520 + }, + { + "epoch": 1.4441015903156895, + "grad_norm": 0.037558842450380325, + "learning_rate": 0.00012369627996478797, + "loss": 0.1348, + "step": 1521 + }, + { + "epoch": 1.4450510325183954, + "grad_norm": 0.028927726671099663, + "learning_rate": 0.0001235889100464146, + "loss": 0.1184, + "step": 1522 + }, + { + "epoch": 1.4460004747211013, + "grad_norm": 0.028258686885237694, + "learning_rate": 0.00012348151132397133, + "loss": 0.1276, + "step": 1523 + }, + { + "epoch": 1.4469499169238071, + "grad_norm": 0.027749182656407356, + "learning_rate": 0.00012337408392860117, + "loss": 0.1235, + "step": 1524 + }, + { + "epoch": 1.4478993591265132, + "grad_norm": 0.03684193268418312, + "learning_rate": 0.0001232666279914821, + "loss": 0.1421, + "step": 1525 + }, + { + "epoch": 1.4488488013292191, + "grad_norm": 0.033882539719343185, + "learning_rate": 0.00012315914364382705, + "loss": 0.1236, + "step": 1526 + }, + { + "epoch": 1.449798243531925, + "grad_norm": 0.03675851225852966, + "learning_rate": 0.00012305163101688352, + "loss": 0.1618, + "step": 1527 + }, + { + "epoch": 1.4507476857346309, + "grad_norm": 0.030739063397049904, + "learning_rate": 0.00012294409024193355, + "loss": 0.127, + "step": 1528 + }, + { + "epoch": 1.4516971279373367, + "grad_norm": 0.026678606867790222, + "learning_rate": 0.00012283652145029362, + "loss": 0.1191, + "step": 1529 + }, + { + "epoch": 1.4526465701400428, + "grad_norm": 0.028862981125712395, + "learning_rate": 0.0001227289247733144, + "loss": 0.1255, + "step": 1530 + }, + { + "epoch": 1.4535960123427487, + "grad_norm": 0.02993926964700222, + "learning_rate": 0.0001226213003423807, + "loss": 0.1235, + "step": 1531 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.02765621617436409, + "learning_rate": 0.00012251364828891093, + "loss": 0.127, + "step": 1532 + }, + { + "epoch": 1.4554948967481605, + "grad_norm": 0.03139231353998184, + "learning_rate": 0.00012240596874435756, + "loss": 0.1225, + "step": 1533 + }, + { + "epoch": 1.4564443389508663, + "grad_norm": 0.030977755784988403, + "learning_rate": 0.00012229826184020649, + "loss": 0.1249, + "step": 1534 + }, + { + "epoch": 1.4573937811535722, + "grad_norm": 0.03158799931406975, + "learning_rate": 0.000122190527707977, + "loss": 0.1254, + "step": 1535 + }, + { + "epoch": 1.458343223356278, + "grad_norm": 0.03206062689423561, + "learning_rate": 0.00012208276647922162, + "loss": 0.1332, + "step": 1536 + }, + { + "epoch": 1.459292665558984, + "grad_norm": 0.028509238734841347, + "learning_rate": 0.00012197497828552601, + "loss": 0.1196, + "step": 1537 + }, + { + "epoch": 1.46024210776169, + "grad_norm": 0.03207945078611374, + "learning_rate": 0.0001218671632585088, + "loss": 0.1241, + "step": 1538 + }, + { + "epoch": 1.461191549964396, + "grad_norm": 0.026839956641197205, + "learning_rate": 0.00012175932152982125, + "loss": 0.1246, + "step": 1539 + }, + { + "epoch": 1.4621409921671018, + "grad_norm": 0.03129103407263756, + "learning_rate": 0.0001216514532311474, + "loss": 0.1192, + "step": 1540 + }, + { + "epoch": 1.4630904343698077, + "grad_norm": 0.027821926400065422, + "learning_rate": 0.00012154355849420353, + "loss": 0.1221, + "step": 1541 + }, + { + "epoch": 1.4640398765725138, + "grad_norm": 0.028374498710036278, + "learning_rate": 0.00012143563745073836, + "loss": 0.1253, + "step": 1542 + }, + { + "epoch": 1.4649893187752197, + "grad_norm": 0.05225376784801483, + "learning_rate": 0.0001213276902325327, + "loss": 0.1194, + "step": 1543 + }, + { + "epoch": 1.4659387609779255, + "grad_norm": 0.04315062612295151, + "learning_rate": 0.00012121971697139926, + "loss": 0.1613, + "step": 1544 + }, + { + "epoch": 1.4668882031806314, + "grad_norm": 0.027962563559412956, + "learning_rate": 0.00012111171779918264, + "loss": 0.1284, + "step": 1545 + }, + { + "epoch": 1.4678376453833373, + "grad_norm": 0.0343998521566391, + "learning_rate": 0.000121003692847759, + "loss": 0.1208, + "step": 1546 + }, + { + "epoch": 1.4687870875860431, + "grad_norm": 0.031139155849814415, + "learning_rate": 0.00012089564224903607, + "loss": 0.1323, + "step": 1547 + }, + { + "epoch": 1.469736529788749, + "grad_norm": 0.028015103191137314, + "learning_rate": 0.00012078756613495277, + "loss": 0.1266, + "step": 1548 + }, + { + "epoch": 1.470685971991455, + "grad_norm": 0.04571033641695976, + "learning_rate": 0.00012067946463747928, + "loss": 0.1561, + "step": 1549 + }, + { + "epoch": 1.471635414194161, + "grad_norm": 0.027564501389861107, + "learning_rate": 0.00012057133788861677, + "loss": 0.1214, + "step": 1550 + }, + { + "epoch": 1.4725848563968669, + "grad_norm": 0.027498599141836166, + "learning_rate": 0.00012046318602039717, + "loss": 0.1233, + "step": 1551 + }, + { + "epoch": 1.4735342985995727, + "grad_norm": 0.040304671972990036, + "learning_rate": 0.00012035500916488316, + "loss": 0.1636, + "step": 1552 + }, + { + "epoch": 1.4744837408022786, + "grad_norm": 0.045689020305871964, + "learning_rate": 0.00012024680745416787, + "loss": 0.1696, + "step": 1553 + }, + { + "epoch": 1.4754331830049845, + "grad_norm": 0.02754673734307289, + "learning_rate": 0.00012013858102037485, + "loss": 0.1265, + "step": 1554 + }, + { + "epoch": 1.4763826252076906, + "grad_norm": 0.04088296741247177, + "learning_rate": 0.0001200303299956578, + "loss": 0.1675, + "step": 1555 + }, + { + "epoch": 1.4773320674103965, + "grad_norm": 0.03440406173467636, + "learning_rate": 0.0001199220545122004, + "loss": 0.1245, + "step": 1556 + }, + { + "epoch": 1.4782815096131023, + "grad_norm": 0.04637245833873749, + "learning_rate": 0.00011981375470221628, + "loss": 0.1668, + "step": 1557 + }, + { + "epoch": 1.4792309518158082, + "grad_norm": 0.03825761005282402, + "learning_rate": 0.00011970543069794875, + "loss": 0.1599, + "step": 1558 + }, + { + "epoch": 1.480180394018514, + "grad_norm": 0.031823255121707916, + "learning_rate": 0.00011959708263167067, + "loss": 0.1232, + "step": 1559 + }, + { + "epoch": 1.48112983622122, + "grad_norm": 0.032104745507240295, + "learning_rate": 0.00011948871063568419, + "loss": 0.1237, + "step": 1560 + }, + { + "epoch": 1.4820792784239258, + "grad_norm": 0.05664534121751785, + "learning_rate": 0.00011938031484232079, + "loss": 0.1617, + "step": 1561 + }, + { + "epoch": 1.483028720626632, + "grad_norm": 0.02819441631436348, + "learning_rate": 0.00011927189538394101, + "loss": 0.1198, + "step": 1562 + }, + { + "epoch": 1.4839781628293378, + "grad_norm": 0.030618941411376, + "learning_rate": 0.00011916345239293423, + "loss": 0.1291, + "step": 1563 + }, + { + "epoch": 1.4849276050320437, + "grad_norm": 0.027746165171265602, + "learning_rate": 0.00011905498600171859, + "loss": 0.1287, + "step": 1564 + }, + { + "epoch": 1.4858770472347496, + "grad_norm": 0.05675269663333893, + "learning_rate": 0.00011894649634274075, + "loss": 0.1988, + "step": 1565 + }, + { + "epoch": 1.4868264894374554, + "grad_norm": 0.044594231992959976, + "learning_rate": 0.00011883798354847589, + "loss": 0.1611, + "step": 1566 + }, + { + "epoch": 1.4877759316401615, + "grad_norm": 0.050554268062114716, + "learning_rate": 0.00011872944775142736, + "loss": 0.1762, + "step": 1567 + }, + { + "epoch": 1.4887253738428674, + "grad_norm": 0.04185627028346062, + "learning_rate": 0.0001186208890841266, + "loss": 0.1593, + "step": 1568 + }, + { + "epoch": 1.4896748160455733, + "grad_norm": 0.030089320614933968, + "learning_rate": 0.00011851230767913303, + "loss": 0.1217, + "step": 1569 + }, + { + "epoch": 1.4906242582482792, + "grad_norm": 0.033343639224767685, + "learning_rate": 0.00011840370366903382, + "loss": 0.1284, + "step": 1570 + }, + { + "epoch": 1.491573700450985, + "grad_norm": 0.030400337651371956, + "learning_rate": 0.00011829507718644366, + "loss": 0.1315, + "step": 1571 + }, + { + "epoch": 1.492523142653691, + "grad_norm": 0.026966162025928497, + "learning_rate": 0.00011818642836400477, + "loss": 0.1206, + "step": 1572 + }, + { + "epoch": 1.4934725848563968, + "grad_norm": 0.030626913532614708, + "learning_rate": 0.00011807775733438664, + "loss": 0.1228, + "step": 1573 + }, + { + "epoch": 1.4944220270591027, + "grad_norm": 0.03993997722864151, + "learning_rate": 0.00011796906423028588, + "loss": 0.1488, + "step": 1574 + }, + { + "epoch": 1.4953714692618088, + "grad_norm": 0.030942171812057495, + "learning_rate": 0.00011786034918442596, + "loss": 0.1284, + "step": 1575 + }, + { + "epoch": 1.4963209114645146, + "grad_norm": 0.027237005531787872, + "learning_rate": 0.00011775161232955729, + "loss": 0.1278, + "step": 1576 + }, + { + "epoch": 1.4972703536672205, + "grad_norm": 0.030200913548469543, + "learning_rate": 0.0001176428537984568, + "loss": 0.1264, + "step": 1577 + }, + { + "epoch": 1.4982197958699264, + "grad_norm": 0.03261629864573479, + "learning_rate": 0.00011753407372392795, + "loss": 0.128, + "step": 1578 + }, + { + "epoch": 1.4991692380726322, + "grad_norm": 0.033451877534389496, + "learning_rate": 0.0001174252722388005, + "loss": 0.1345, + "step": 1579 + }, + { + "epoch": 1.5001186802753383, + "grad_norm": 0.045393262058496475, + "learning_rate": 0.00011731644947593026, + "loss": 0.1627, + "step": 1580 + }, + { + "epoch": 1.5010681224780442, + "grad_norm": 0.05379907041788101, + "learning_rate": 0.00011720760556819916, + "loss": 0.1867, + "step": 1581 + }, + { + "epoch": 1.50201756468075, + "grad_norm": 0.025756366550922394, + "learning_rate": 0.00011709874064851487, + "loss": 0.1229, + "step": 1582 + }, + { + "epoch": 1.502967006883456, + "grad_norm": 0.03149167448282242, + "learning_rate": 0.00011698985484981077, + "loss": 0.124, + "step": 1583 + }, + { + "epoch": 1.5039164490861618, + "grad_norm": 0.044556062668561935, + "learning_rate": 0.00011688094830504566, + "loss": 0.1522, + "step": 1584 + }, + { + "epoch": 1.5048658912888677, + "grad_norm": 0.029473567381501198, + "learning_rate": 0.00011677202114720374, + "loss": 0.1231, + "step": 1585 + }, + { + "epoch": 1.5058153334915736, + "grad_norm": 0.02735304646193981, + "learning_rate": 0.00011666307350929435, + "loss": 0.1242, + "step": 1586 + }, + { + "epoch": 1.5067647756942795, + "grad_norm": 0.04760657250881195, + "learning_rate": 0.00011655410552435184, + "loss": 0.1574, + "step": 1587 + }, + { + "epoch": 1.5077142178969856, + "grad_norm": 0.029798056930303574, + "learning_rate": 0.00011644511732543542, + "loss": 0.1289, + "step": 1588 + }, + { + "epoch": 1.5086636600996914, + "grad_norm": 0.0329173281788826, + "learning_rate": 0.00011633610904562892, + "loss": 0.1251, + "step": 1589 + }, + { + "epoch": 1.5096131023023973, + "grad_norm": 0.030921783298254013, + "learning_rate": 0.00011622708081804081, + "loss": 0.1257, + "step": 1590 + }, + { + "epoch": 1.5105625445051034, + "grad_norm": 0.031381431967020035, + "learning_rate": 0.0001161180327758038, + "loss": 0.1258, + "step": 1591 + }, + { + "epoch": 1.5115119867078093, + "grad_norm": 0.050078343600034714, + "learning_rate": 0.00011600896505207488, + "loss": 0.1764, + "step": 1592 + }, + { + "epoch": 1.5124614289105152, + "grad_norm": 0.04285876452922821, + "learning_rate": 0.00011589987778003501, + "loss": 0.1607, + "step": 1593 + }, + { + "epoch": 1.513410871113221, + "grad_norm": 0.02959441766142845, + "learning_rate": 0.00011579077109288907, + "loss": 0.1266, + "step": 1594 + }, + { + "epoch": 1.514360313315927, + "grad_norm": 0.03024190105497837, + "learning_rate": 0.00011568164512386559, + "loss": 0.1306, + "step": 1595 + }, + { + "epoch": 1.5153097555186328, + "grad_norm": 0.031076421961188316, + "learning_rate": 0.00011557250000621674, + "loss": 0.1326, + "step": 1596 + }, + { + "epoch": 1.5162591977213387, + "grad_norm": 0.10080650448799133, + "learning_rate": 0.00011546333587321795, + "loss": 0.1596, + "step": 1597 + }, + { + "epoch": 1.5172086399240445, + "grad_norm": 0.029658254235982895, + "learning_rate": 0.000115354152858168, + "loss": 0.1306, + "step": 1598 + }, + { + "epoch": 1.5181580821267504, + "grad_norm": 0.025450505316257477, + "learning_rate": 0.00011524495109438857, + "loss": 0.1199, + "step": 1599 + }, + { + "epoch": 1.5191075243294563, + "grad_norm": 0.0450344942510128, + "learning_rate": 0.00011513573071522439, + "loss": 0.1671, + "step": 1600 + }, + { + "epoch": 1.5200569665321624, + "grad_norm": 0.02928159199655056, + "learning_rate": 0.00011502649185404281, + "loss": 0.1247, + "step": 1601 + }, + { + "epoch": 1.5210064087348683, + "grad_norm": 0.04587600752711296, + "learning_rate": 0.00011491723464423385, + "loss": 0.1746, + "step": 1602 + }, + { + "epoch": 1.5219558509375741, + "grad_norm": 0.03615221753716469, + "learning_rate": 0.00011480795921920984, + "loss": 0.153, + "step": 1603 + }, + { + "epoch": 1.5229052931402802, + "grad_norm": 0.026470355689525604, + "learning_rate": 0.00011469866571240535, + "loss": 0.1204, + "step": 1604 + }, + { + "epoch": 1.523854735342986, + "grad_norm": 0.0324716791510582, + "learning_rate": 0.00011458935425727713, + "loss": 0.127, + "step": 1605 + }, + { + "epoch": 1.524804177545692, + "grad_norm": 0.028570136055350304, + "learning_rate": 0.00011448002498730375, + "loss": 0.1244, + "step": 1606 + }, + { + "epoch": 1.5257536197483978, + "grad_norm": 0.03165844827890396, + "learning_rate": 0.00011437067803598558, + "loss": 0.1286, + "step": 1607 + }, + { + "epoch": 1.5267030619511037, + "grad_norm": 0.029820239171385765, + "learning_rate": 0.00011426131353684457, + "loss": 0.1232, + "step": 1608 + }, + { + "epoch": 1.5276525041538096, + "grad_norm": 0.03704296052455902, + "learning_rate": 0.00011415193162342407, + "loss": 0.1262, + "step": 1609 + }, + { + "epoch": 1.5286019463565155, + "grad_norm": 0.029848331585526466, + "learning_rate": 0.00011404253242928877, + "loss": 0.1345, + "step": 1610 + }, + { + "epoch": 1.5295513885592213, + "grad_norm": 0.02494947612285614, + "learning_rate": 0.00011393311608802437, + "loss": 0.1247, + "step": 1611 + }, + { + "epoch": 1.5305008307619272, + "grad_norm": 0.024150602519512177, + "learning_rate": 0.0001138236827332376, + "loss": 0.1195, + "step": 1612 + }, + { + "epoch": 1.5314502729646333, + "grad_norm": 0.028073903173208237, + "learning_rate": 0.00011371423249855584, + "loss": 0.1298, + "step": 1613 + }, + { + "epoch": 1.5323997151673392, + "grad_norm": 0.03696022182703018, + "learning_rate": 0.0001136047655176272, + "loss": 0.1658, + "step": 1614 + }, + { + "epoch": 1.533349157370045, + "grad_norm": 0.026254741474986076, + "learning_rate": 0.00011349528192412018, + "loss": 0.1209, + "step": 1615 + }, + { + "epoch": 1.5342985995727512, + "grad_norm": 0.04131542891263962, + "learning_rate": 0.0001133857818517236, + "loss": 0.1566, + "step": 1616 + }, + { + "epoch": 1.535248041775457, + "grad_norm": 0.028996463865041733, + "learning_rate": 0.00011327626543414636, + "loss": 0.1287, + "step": 1617 + }, + { + "epoch": 1.536197483978163, + "grad_norm": 0.029789695516228676, + "learning_rate": 0.00011316673280511738, + "loss": 0.1303, + "step": 1618 + }, + { + "epoch": 1.5371469261808688, + "grad_norm": 0.026316309347748756, + "learning_rate": 0.00011305718409838528, + "loss": 0.1224, + "step": 1619 + }, + { + "epoch": 1.5380963683835747, + "grad_norm": 0.029919691383838654, + "learning_rate": 0.0001129476194477184, + "loss": 0.1297, + "step": 1620 + }, + { + "epoch": 1.5390458105862805, + "grad_norm": 0.029496189206838608, + "learning_rate": 0.0001128380389869045, + "loss": 0.1259, + "step": 1621 + }, + { + "epoch": 1.5399952527889864, + "grad_norm": 0.0289089847356081, + "learning_rate": 0.00011272844284975072, + "loss": 0.1138, + "step": 1622 + }, + { + "epoch": 1.5409446949916923, + "grad_norm": 0.03120460920035839, + "learning_rate": 0.00011261883117008321, + "loss": 0.1255, + "step": 1623 + }, + { + "epoch": 1.5418941371943982, + "grad_norm": 0.030355585739016533, + "learning_rate": 0.0001125092040817472, + "loss": 0.1234, + "step": 1624 + }, + { + "epoch": 1.542843579397104, + "grad_norm": 0.026462506502866745, + "learning_rate": 0.00011239956171860675, + "loss": 0.1194, + "step": 1625 + }, + { + "epoch": 1.5437930215998101, + "grad_norm": 0.04788212105631828, + "learning_rate": 0.00011228990421454449, + "loss": 0.1721, + "step": 1626 + }, + { + "epoch": 1.544742463802516, + "grad_norm": 0.04839539900422096, + "learning_rate": 0.00011218023170346159, + "loss": 0.1616, + "step": 1627 + }, + { + "epoch": 1.5456919060052219, + "grad_norm": 0.029102666303515434, + "learning_rate": 0.00011207054431927752, + "loss": 0.1244, + "step": 1628 + }, + { + "epoch": 1.546641348207928, + "grad_norm": 0.030667860060930252, + "learning_rate": 0.00011196084219592994, + "loss": 0.1211, + "step": 1629 + }, + { + "epoch": 1.5475907904106339, + "grad_norm": 0.029911190271377563, + "learning_rate": 0.00011185112546737451, + "loss": 0.1209, + "step": 1630 + }, + { + "epoch": 1.5485402326133397, + "grad_norm": 0.026976125314831734, + "learning_rate": 0.00011174139426758466, + "loss": 0.1201, + "step": 1631 + }, + { + "epoch": 1.5494896748160456, + "grad_norm": 0.04256618767976761, + "learning_rate": 0.00011163164873055158, + "loss": 0.1716, + "step": 1632 + }, + { + "epoch": 1.5504391170187515, + "grad_norm": 0.04431037977337837, + "learning_rate": 0.00011152188899028393, + "loss": 0.1535, + "step": 1633 + }, + { + "epoch": 1.5513885592214574, + "grad_norm": 0.040324628353118896, + "learning_rate": 0.00011141211518080768, + "loss": 0.1559, + "step": 1634 + }, + { + "epoch": 1.5523380014241632, + "grad_norm": 0.02631363458931446, + "learning_rate": 0.00011130232743616602, + "loss": 0.1254, + "step": 1635 + }, + { + "epoch": 1.553287443626869, + "grad_norm": 0.031095106154680252, + "learning_rate": 0.00011119252589041917, + "loss": 0.1296, + "step": 1636 + }, + { + "epoch": 1.554236885829575, + "grad_norm": 0.029540112242102623, + "learning_rate": 0.00011108271067764413, + "loss": 0.1237, + "step": 1637 + }, + { + "epoch": 1.555186328032281, + "grad_norm": 0.032384589314460754, + "learning_rate": 0.00011097288193193465, + "loss": 0.122, + "step": 1638 + }, + { + "epoch": 1.556135770234987, + "grad_norm": 0.041656699031591415, + "learning_rate": 0.00011086303978740102, + "loss": 0.1627, + "step": 1639 + }, + { + "epoch": 1.5570852124376928, + "grad_norm": 0.02969949133694172, + "learning_rate": 0.00011075318437816981, + "loss": 0.1278, + "step": 1640 + }, + { + "epoch": 1.558034654640399, + "grad_norm": 0.029392141848802567, + "learning_rate": 0.00011064331583838389, + "loss": 0.1222, + "step": 1641 + }, + { + "epoch": 1.5589840968431048, + "grad_norm": 0.03221196308732033, + "learning_rate": 0.0001105334343022021, + "loss": 0.1301, + "step": 1642 + }, + { + "epoch": 1.5599335390458107, + "grad_norm": 0.04113316535949707, + "learning_rate": 0.00011042353990379917, + "loss": 0.1545, + "step": 1643 + }, + { + "epoch": 1.5608829812485165, + "grad_norm": 0.04606552794575691, + "learning_rate": 0.00011031363277736546, + "loss": 0.1582, + "step": 1644 + }, + { + "epoch": 1.5618324234512224, + "grad_norm": 0.05224507302045822, + "learning_rate": 0.00011020371305710701, + "loss": 0.1702, + "step": 1645 + }, + { + "epoch": 1.5627818656539283, + "grad_norm": 0.03016183339059353, + "learning_rate": 0.00011009378087724518, + "loss": 0.1299, + "step": 1646 + }, + { + "epoch": 1.5637313078566342, + "grad_norm": 0.02981068380177021, + "learning_rate": 0.00010998383637201648, + "loss": 0.1258, + "step": 1647 + }, + { + "epoch": 1.56468075005934, + "grad_norm": 0.027657071128487587, + "learning_rate": 0.00010987387967567252, + "loss": 0.1338, + "step": 1648 + }, + { + "epoch": 1.565630192262046, + "grad_norm": 0.030992772430181503, + "learning_rate": 0.00010976391092247986, + "loss": 0.1249, + "step": 1649 + }, + { + "epoch": 1.566579634464752, + "grad_norm": 0.039394572377204895, + "learning_rate": 0.00010965393024671966, + "loss": 0.1598, + "step": 1650 + }, + { + "epoch": 1.5675290766674579, + "grad_norm": 0.042735736817121506, + "learning_rate": 0.00010954393778268777, + "loss": 0.1566, + "step": 1651 + }, + { + "epoch": 1.5684785188701638, + "grad_norm": 0.03833623602986336, + "learning_rate": 0.00010943393366469427, + "loss": 0.1557, + "step": 1652 + }, + { + "epoch": 1.5694279610728696, + "grad_norm": 0.028264719992876053, + "learning_rate": 0.00010932391802706363, + "loss": 0.1295, + "step": 1653 + }, + { + "epoch": 1.5703774032755757, + "grad_norm": 0.030619991943240166, + "learning_rate": 0.00010921389100413428, + "loss": 0.1294, + "step": 1654 + }, + { + "epoch": 1.5713268454782816, + "grad_norm": 0.0320441797375679, + "learning_rate": 0.00010910385273025865, + "loss": 0.1266, + "step": 1655 + }, + { + "epoch": 1.5722762876809875, + "grad_norm": 0.028138084337115288, + "learning_rate": 0.00010899380333980282, + "loss": 0.1177, + "step": 1656 + }, + { + "epoch": 1.5732257298836934, + "grad_norm": 0.030276020988821983, + "learning_rate": 0.00010888374296714644, + "loss": 0.1258, + "step": 1657 + }, + { + "epoch": 1.5741751720863992, + "grad_norm": 0.04478145390748978, + "learning_rate": 0.00010877367174668269, + "loss": 0.1555, + "step": 1658 + }, + { + "epoch": 1.575124614289105, + "grad_norm": 0.025850724428892136, + "learning_rate": 0.00010866358981281783, + "loss": 0.1186, + "step": 1659 + }, + { + "epoch": 1.576074056491811, + "grad_norm": 0.03069223277270794, + "learning_rate": 0.00010855349729997135, + "loss": 0.1314, + "step": 1660 + }, + { + "epoch": 1.5770234986945169, + "grad_norm": 0.050107646733522415, + "learning_rate": 0.00010844339434257558, + "loss": 0.1632, + "step": 1661 + }, + { + "epoch": 1.5779729408972227, + "grad_norm": 0.04577158764004707, + "learning_rate": 0.00010833328107507556, + "loss": 0.1777, + "step": 1662 + }, + { + "epoch": 1.5789223830999288, + "grad_norm": 0.030337292701005936, + "learning_rate": 0.00010822315763192903, + "loss": 0.1277, + "step": 1663 + }, + { + "epoch": 1.5798718253026347, + "grad_norm": 0.05210689827799797, + "learning_rate": 0.00010811302414760609, + "loss": 0.1583, + "step": 1664 + }, + { + "epoch": 1.5808212675053406, + "grad_norm": 0.056502800434827805, + "learning_rate": 0.00010800288075658911, + "loss": 0.1533, + "step": 1665 + }, + { + "epoch": 1.5817707097080467, + "grad_norm": 0.03072887845337391, + "learning_rate": 0.00010789272759337257, + "loss": 0.1349, + "step": 1666 + }, + { + "epoch": 1.5827201519107525, + "grad_norm": 0.030606022104620934, + "learning_rate": 0.00010778256479246283, + "loss": 0.1288, + "step": 1667 + }, + { + "epoch": 1.5836695941134584, + "grad_norm": 0.027298327535390854, + "learning_rate": 0.00010767239248837811, + "loss": 0.1256, + "step": 1668 + }, + { + "epoch": 1.5846190363161643, + "grad_norm": 0.044606730341911316, + "learning_rate": 0.00010756221081564813, + "loss": 0.1673, + "step": 1669 + }, + { + "epoch": 1.5855684785188702, + "grad_norm": 0.04783207178115845, + "learning_rate": 0.00010745201990881417, + "loss": 0.1539, + "step": 1670 + }, + { + "epoch": 1.586517920721576, + "grad_norm": 0.03072645701467991, + "learning_rate": 0.00010734181990242868, + "loss": 0.1255, + "step": 1671 + }, + { + "epoch": 1.587467362924282, + "grad_norm": 0.028861412778496742, + "learning_rate": 0.00010723161093105527, + "loss": 0.1252, + "step": 1672 + }, + { + "epoch": 1.5884168051269878, + "grad_norm": 0.06253904849290848, + "learning_rate": 0.0001071213931292685, + "loss": 0.208, + "step": 1673 + }, + { + "epoch": 1.5893662473296937, + "grad_norm": 0.02638799510896206, + "learning_rate": 0.00010701116663165368, + "loss": 0.1307, + "step": 1674 + }, + { + "epoch": 1.5903156895323998, + "grad_norm": 0.02984490990638733, + "learning_rate": 0.00010690093157280678, + "loss": 0.1305, + "step": 1675 + }, + { + "epoch": 1.5912651317351056, + "grad_norm": 0.029151547700166702, + "learning_rate": 0.0001067906880873342, + "loss": 0.1278, + "step": 1676 + }, + { + "epoch": 1.5922145739378115, + "grad_norm": 0.03524734824895859, + "learning_rate": 0.00010668043630985259, + "loss": 0.1426, + "step": 1677 + }, + { + "epoch": 1.5931640161405174, + "grad_norm": 0.03428010269999504, + "learning_rate": 0.00010657017637498881, + "loss": 0.125, + "step": 1678 + }, + { + "epoch": 1.5941134583432235, + "grad_norm": 0.05122271180152893, + "learning_rate": 0.00010645990841737965, + "loss": 0.1569, + "step": 1679 + }, + { + "epoch": 1.5950629005459294, + "grad_norm": 0.035184647887945175, + "learning_rate": 0.00010634963257167167, + "loss": 0.1358, + "step": 1680 + }, + { + "epoch": 1.5960123427486352, + "grad_norm": 0.03079393319785595, + "learning_rate": 0.00010623934897252106, + "loss": 0.1268, + "step": 1681 + }, + { + "epoch": 1.5969617849513411, + "grad_norm": 0.03128993511199951, + "learning_rate": 0.00010612905775459349, + "loss": 0.1223, + "step": 1682 + }, + { + "epoch": 1.597911227154047, + "grad_norm": 0.02892274223268032, + "learning_rate": 0.00010601875905256398, + "loss": 0.1293, + "step": 1683 + }, + { + "epoch": 1.5988606693567529, + "grad_norm": 0.030298085883259773, + "learning_rate": 0.00010590845300111663, + "loss": 0.1293, + "step": 1684 + }, + { + "epoch": 1.5998101115594587, + "grad_norm": 0.028357641771435738, + "learning_rate": 0.00010579813973494454, + "loss": 0.1269, + "step": 1685 + }, + { + "epoch": 1.6007595537621646, + "grad_norm": 0.027196258306503296, + "learning_rate": 0.00010568781938874959, + "loss": 0.1296, + "step": 1686 + }, + { + "epoch": 1.6017089959648705, + "grad_norm": 0.05107175186276436, + "learning_rate": 0.00010557749209724233, + "loss": 0.1604, + "step": 1687 + }, + { + "epoch": 1.6026584381675766, + "grad_norm": 0.050482697784900665, + "learning_rate": 0.00010546715799514178, + "loss": 0.1902, + "step": 1688 + }, + { + "epoch": 1.6036078803702825, + "grad_norm": 0.026253553107380867, + "learning_rate": 0.00010535681721717529, + "loss": 0.1226, + "step": 1689 + }, + { + "epoch": 1.6045573225729883, + "grad_norm": 0.03308340907096863, + "learning_rate": 0.0001052464698980784, + "loss": 0.1384, + "step": 1690 + }, + { + "epoch": 1.6055067647756944, + "grad_norm": 0.03511514514684677, + "learning_rate": 0.00010513611617259454, + "loss": 0.1577, + "step": 1691 + }, + { + "epoch": 1.6064562069784003, + "grad_norm": 0.0325862281024456, + "learning_rate": 0.00010502575617547501, + "loss": 0.1343, + "step": 1692 + }, + { + "epoch": 1.6074056491811062, + "grad_norm": 0.029174668714404106, + "learning_rate": 0.00010491539004147879, + "loss": 0.1222, + "step": 1693 + }, + { + "epoch": 1.608355091383812, + "grad_norm": 0.028774891048669815, + "learning_rate": 0.00010480501790537236, + "loss": 0.1237, + "step": 1694 + }, + { + "epoch": 1.609304533586518, + "grad_norm": 0.030504655092954636, + "learning_rate": 0.00010469463990192947, + "loss": 0.1228, + "step": 1695 + }, + { + "epoch": 1.6102539757892238, + "grad_norm": 0.02889893390238285, + "learning_rate": 0.0001045842561659311, + "loss": 0.1208, + "step": 1696 + }, + { + "epoch": 1.6112034179919297, + "grad_norm": 0.028366010636091232, + "learning_rate": 0.00010447386683216518, + "loss": 0.1193, + "step": 1697 + }, + { + "epoch": 1.6121528601946356, + "grad_norm": 0.028841307386755943, + "learning_rate": 0.0001043634720354265, + "loss": 0.1287, + "step": 1698 + }, + { + "epoch": 1.6131023023973414, + "grad_norm": 0.03739466145634651, + "learning_rate": 0.00010425307191051654, + "loss": 0.1369, + "step": 1699 + }, + { + "epoch": 1.6140517446000475, + "grad_norm": 0.031514909118413925, + "learning_rate": 0.00010414266659224323, + "loss": 0.1301, + "step": 1700 + }, + { + "epoch": 1.6150011868027534, + "grad_norm": 0.04335467517375946, + "learning_rate": 0.00010403225621542089, + "loss": 0.1543, + "step": 1701 + }, + { + "epoch": 1.6159506290054593, + "grad_norm": 0.026403702795505524, + "learning_rate": 0.00010392184091487, + "loss": 0.1229, + "step": 1702 + }, + { + "epoch": 1.6169000712081654, + "grad_norm": 0.06492079049348831, + "learning_rate": 0.00010381142082541706, + "loss": 0.1526, + "step": 1703 + }, + { + "epoch": 1.6178495134108712, + "grad_norm": 0.031183555722236633, + "learning_rate": 0.00010370099608189439, + "loss": 0.1255, + "step": 1704 + }, + { + "epoch": 1.6187989556135771, + "grad_norm": 0.02694527618587017, + "learning_rate": 0.00010359056681914006, + "loss": 0.1178, + "step": 1705 + }, + { + "epoch": 1.619748397816283, + "grad_norm": 0.031590498983860016, + "learning_rate": 0.00010348013317199756, + "loss": 0.1311, + "step": 1706 + }, + { + "epoch": 1.6206978400189889, + "grad_norm": 0.034355372190475464, + "learning_rate": 0.00010336969527531577, + "loss": 0.1363, + "step": 1707 + }, + { + "epoch": 1.6216472822216947, + "grad_norm": 0.04307783022522926, + "learning_rate": 0.00010325925326394886, + "loss": 0.1659, + "step": 1708 + }, + { + "epoch": 1.6225967244244006, + "grad_norm": 0.05346130579710007, + "learning_rate": 0.00010314880727275591, + "loss": 0.2022, + "step": 1709 + }, + { + "epoch": 1.6235461666271065, + "grad_norm": 0.028053171932697296, + "learning_rate": 0.00010303835743660086, + "loss": 0.1251, + "step": 1710 + }, + { + "epoch": 1.6244956088298124, + "grad_norm": 0.03092275932431221, + "learning_rate": 0.00010292790389035239, + "loss": 0.1299, + "step": 1711 + }, + { + "epoch": 1.6254450510325182, + "grad_norm": 0.03486338630318642, + "learning_rate": 0.00010281744676888368, + "loss": 0.1374, + "step": 1712 + }, + { + "epoch": 1.6263944932352243, + "grad_norm": 0.030160879716277122, + "learning_rate": 0.00010270698620707231, + "loss": 0.1381, + "step": 1713 + }, + { + "epoch": 1.6273439354379302, + "grad_norm": 0.03339090943336487, + "learning_rate": 0.00010259652233980007, + "loss": 0.1316, + "step": 1714 + }, + { + "epoch": 1.628293377640636, + "grad_norm": 0.027464497834444046, + "learning_rate": 0.00010248605530195268, + "loss": 0.1204, + "step": 1715 + }, + { + "epoch": 1.6292428198433422, + "grad_norm": 0.02733561024069786, + "learning_rate": 0.00010237558522841985, + "loss": 0.1259, + "step": 1716 + }, + { + "epoch": 1.630192262046048, + "grad_norm": 0.029772773385047913, + "learning_rate": 0.00010226511225409499, + "loss": 0.1252, + "step": 1717 + }, + { + "epoch": 1.631141704248754, + "grad_norm": 0.04465902969241142, + "learning_rate": 0.00010215463651387499, + "loss": 0.151, + "step": 1718 + }, + { + "epoch": 1.6320911464514598, + "grad_norm": 0.028140738606452942, + "learning_rate": 0.0001020441581426601, + "loss": 0.1221, + "step": 1719 + }, + { + "epoch": 1.6330405886541657, + "grad_norm": 0.026517389342188835, + "learning_rate": 0.00010193367727535392, + "loss": 0.1253, + "step": 1720 + }, + { + "epoch": 1.6339900308568716, + "grad_norm": 0.06271334737539291, + "learning_rate": 0.00010182319404686293, + "loss": 0.2072, + "step": 1721 + }, + { + "epoch": 1.6349394730595774, + "grad_norm": 0.051109135150909424, + "learning_rate": 0.00010171270859209662, + "loss": 0.1923, + "step": 1722 + }, + { + "epoch": 1.6358889152622833, + "grad_norm": 0.02858104184269905, + "learning_rate": 0.00010160222104596716, + "loss": 0.1317, + "step": 1723 + }, + { + "epoch": 1.6368383574649892, + "grad_norm": 0.031905338168144226, + "learning_rate": 0.00010149173154338917, + "loss": 0.1334, + "step": 1724 + }, + { + "epoch": 1.6377877996676953, + "grad_norm": 0.035984478890895844, + "learning_rate": 0.00010138124021927984, + "loss": 0.1308, + "step": 1725 + }, + { + "epoch": 1.6387372418704012, + "grad_norm": 0.029515955597162247, + "learning_rate": 0.00010127074720855845, + "loss": 0.1232, + "step": 1726 + }, + { + "epoch": 1.639686684073107, + "grad_norm": 0.03353870287537575, + "learning_rate": 0.0001011602526461464, + "loss": 0.1342, + "step": 1727 + }, + { + "epoch": 1.6406361262758131, + "grad_norm": 0.02787208929657936, + "learning_rate": 0.00010104975666696697, + "loss": 0.1216, + "step": 1728 + }, + { + "epoch": 1.641585568478519, + "grad_norm": 0.061213839799165726, + "learning_rate": 0.0001009392594059452, + "loss": 0.2093, + "step": 1729 + }, + { + "epoch": 1.6425350106812249, + "grad_norm": 0.034235142171382904, + "learning_rate": 0.0001008287609980076, + "loss": 0.1329, + "step": 1730 + }, + { + "epoch": 1.6434844528839307, + "grad_norm": 0.026360424235463142, + "learning_rate": 0.00010071826157808217, + "loss": 0.1239, + "step": 1731 + }, + { + "epoch": 1.6444338950866366, + "grad_norm": 0.026264041662216187, + "learning_rate": 0.00010060776128109812, + "loss": 0.12, + "step": 1732 + }, + { + "epoch": 1.6453833372893425, + "grad_norm": 0.02740940824151039, + "learning_rate": 0.00010049726024198578, + "loss": 0.1314, + "step": 1733 + }, + { + "epoch": 1.6463327794920484, + "grad_norm": 0.04096614569425583, + "learning_rate": 0.00010038675859567628, + "loss": 0.1681, + "step": 1734 + }, + { + "epoch": 1.6472822216947542, + "grad_norm": 0.04552573338150978, + "learning_rate": 0.00010027625647710155, + "loss": 0.16, + "step": 1735 + }, + { + "epoch": 1.6482316638974601, + "grad_norm": 0.034032173454761505, + "learning_rate": 0.00010016575402119413, + "loss": 0.1326, + "step": 1736 + }, + { + "epoch": 1.649181106100166, + "grad_norm": 0.03644052520394325, + "learning_rate": 0.00010005525136288692, + "loss": 0.146, + "step": 1737 + }, + { + "epoch": 1.650130548302872, + "grad_norm": 0.04277161881327629, + "learning_rate": 9.994474863711311e-05, + "loss": 0.1719, + "step": 1738 + }, + { + "epoch": 1.651079990505578, + "grad_norm": 0.027901561930775642, + "learning_rate": 9.98342459788059e-05, + "loss": 0.1238, + "step": 1739 + }, + { + "epoch": 1.6520294327082838, + "grad_norm": 0.030957000330090523, + "learning_rate": 9.972374352289848e-05, + "loss": 0.1315, + "step": 1740 + }, + { + "epoch": 1.65297887491099, + "grad_norm": 0.029299341142177582, + "learning_rate": 9.961324140432376e-05, + "loss": 0.1247, + "step": 1741 + }, + { + "epoch": 1.6539283171136958, + "grad_norm": 0.0292718093842268, + "learning_rate": 9.950273975801424e-05, + "loss": 0.1296, + "step": 1742 + }, + { + "epoch": 1.6548777593164017, + "grad_norm": 0.03113977424800396, + "learning_rate": 9.93922387189019e-05, + "loss": 0.1294, + "step": 1743 + }, + { + "epoch": 1.6558272015191076, + "grad_norm": 0.05127384141087532, + "learning_rate": 9.928173842191786e-05, + "loss": 0.1623, + "step": 1744 + }, + { + "epoch": 1.6567766437218134, + "grad_norm": 0.03058856725692749, + "learning_rate": 9.917123900199245e-05, + "loss": 0.1251, + "step": 1745 + }, + { + "epoch": 1.6577260859245193, + "grad_norm": 0.043525367975234985, + "learning_rate": 9.906074059405486e-05, + "loss": 0.1584, + "step": 1746 + }, + { + "epoch": 1.6586755281272252, + "grad_norm": 0.02724611759185791, + "learning_rate": 9.895024333303305e-05, + "loss": 0.1273, + "step": 1747 + }, + { + "epoch": 1.659624970329931, + "grad_norm": 0.026182804256677628, + "learning_rate": 9.883974735385361e-05, + "loss": 0.1165, + "step": 1748 + }, + { + "epoch": 1.660574412532637, + "grad_norm": 0.030495688319206238, + "learning_rate": 9.87292527914416e-05, + "loss": 0.1251, + "step": 1749 + }, + { + "epoch": 1.661523854735343, + "grad_norm": 0.03013971447944641, + "learning_rate": 9.861875978072017e-05, + "loss": 0.1278, + "step": 1750 + }, + { + "epoch": 1.662473296938049, + "grad_norm": 0.03999912738800049, + "learning_rate": 9.850826845661082e-05, + "loss": 0.1519, + "step": 1751 + }, + { + "epoch": 1.6634227391407548, + "grad_norm": 0.029559755697846413, + "learning_rate": 9.839777895403287e-05, + "loss": 0.1293, + "step": 1752 + }, + { + "epoch": 1.6643721813434609, + "grad_norm": 0.04213762283325195, + "learning_rate": 9.828729140790337e-05, + "loss": 0.1696, + "step": 1753 + }, + { + "epoch": 1.6653216235461668, + "grad_norm": 0.029974251985549927, + "learning_rate": 9.817680595313705e-05, + "loss": 0.1182, + "step": 1754 + }, + { + "epoch": 1.6662710657488726, + "grad_norm": 0.03835977986454964, + "learning_rate": 9.806632272464607e-05, + "loss": 0.1467, + "step": 1755 + }, + { + "epoch": 1.6672205079515785, + "grad_norm": 0.031473349779844284, + "learning_rate": 9.795584185733988e-05, + "loss": 0.1305, + "step": 1756 + }, + { + "epoch": 1.6681699501542844, + "grad_norm": 0.02675897814333439, + "learning_rate": 9.784536348612504e-05, + "loss": 0.1196, + "step": 1757 + }, + { + "epoch": 1.6691193923569903, + "grad_norm": 0.0419435016810894, + "learning_rate": 9.773488774590504e-05, + "loss": 0.1558, + "step": 1758 + }, + { + "epoch": 1.6700688345596961, + "grad_norm": 0.027311773970723152, + "learning_rate": 9.762441477158016e-05, + "loss": 0.1236, + "step": 1759 + }, + { + "epoch": 1.671018276762402, + "grad_norm": 0.05605548992753029, + "learning_rate": 9.751394469804734e-05, + "loss": 0.1787, + "step": 1760 + }, + { + "epoch": 1.6719677189651079, + "grad_norm": 0.025175364688038826, + "learning_rate": 9.740347766019997e-05, + "loss": 0.1275, + "step": 1761 + }, + { + "epoch": 1.6729171611678137, + "grad_norm": 0.04951293021440506, + "learning_rate": 9.729301379292773e-05, + "loss": 0.2023, + "step": 1762 + }, + { + "epoch": 1.6738666033705198, + "grad_norm": 0.02842806465923786, + "learning_rate": 9.718255323111635e-05, + "loss": 0.1238, + "step": 1763 + }, + { + "epoch": 1.6748160455732257, + "grad_norm": 0.029241712763905525, + "learning_rate": 9.707209610964765e-05, + "loss": 0.121, + "step": 1764 + }, + { + "epoch": 1.6757654877759316, + "grad_norm": 0.03337705507874489, + "learning_rate": 9.696164256339917e-05, + "loss": 0.1354, + "step": 1765 + }, + { + "epoch": 1.6767149299786377, + "grad_norm": 0.030520187690854073, + "learning_rate": 9.685119272724411e-05, + "loss": 0.1256, + "step": 1766 + }, + { + "epoch": 1.6776643721813436, + "grad_norm": 0.0318867489695549, + "learning_rate": 9.674074673605115e-05, + "loss": 0.1286, + "step": 1767 + }, + { + "epoch": 1.6786138143840494, + "grad_norm": 0.02671106904745102, + "learning_rate": 9.663030472468424e-05, + "loss": 0.1297, + "step": 1768 + }, + { + "epoch": 1.6795632565867553, + "grad_norm": 0.041651055216789246, + "learning_rate": 9.651986682800249e-05, + "loss": 0.1618, + "step": 1769 + }, + { + "epoch": 1.6805126987894612, + "grad_norm": 0.03195889666676521, + "learning_rate": 9.640943318085999e-05, + "loss": 0.1279, + "step": 1770 + }, + { + "epoch": 1.681462140992167, + "grad_norm": 0.04086165875196457, + "learning_rate": 9.629900391810563e-05, + "loss": 0.1678, + "step": 1771 + }, + { + "epoch": 1.682411583194873, + "grad_norm": 0.025256391614675522, + "learning_rate": 9.618857917458298e-05, + "loss": 0.1197, + "step": 1772 + }, + { + "epoch": 1.6833610253975788, + "grad_norm": 0.03352576494216919, + "learning_rate": 9.607815908513005e-05, + "loss": 0.1345, + "step": 1773 + }, + { + "epoch": 1.6843104676002847, + "grad_norm": 0.06082432344555855, + "learning_rate": 9.596774378457916e-05, + "loss": 0.1639, + "step": 1774 + }, + { + "epoch": 1.6852599098029908, + "grad_norm": 0.029191186651587486, + "learning_rate": 9.585733340775677e-05, + "loss": 0.1305, + "step": 1775 + }, + { + "epoch": 1.6862093520056967, + "grad_norm": 0.029343895614147186, + "learning_rate": 9.574692808948348e-05, + "loss": 0.1265, + "step": 1776 + }, + { + "epoch": 1.6871587942084025, + "grad_norm": 0.02953837811946869, + "learning_rate": 9.56365279645735e-05, + "loss": 0.1281, + "step": 1777 + }, + { + "epoch": 1.6881082364111086, + "grad_norm": 0.028798846527934074, + "learning_rate": 9.552613316783483e-05, + "loss": 0.1257, + "step": 1778 + }, + { + "epoch": 1.6890576786138145, + "grad_norm": 0.02905990555882454, + "learning_rate": 9.54157438340689e-05, + "loss": 0.1308, + "step": 1779 + }, + { + "epoch": 1.6900071208165204, + "grad_norm": 0.02965502068400383, + "learning_rate": 9.530536009807053e-05, + "loss": 0.1296, + "step": 1780 + }, + { + "epoch": 1.6909565630192263, + "grad_norm": 0.029197504743933678, + "learning_rate": 9.519498209462766e-05, + "loss": 0.1204, + "step": 1781 + }, + { + "epoch": 1.6919060052219321, + "grad_norm": 0.04930657148361206, + "learning_rate": 9.508460995852122e-05, + "loss": 0.1522, + "step": 1782 + }, + { + "epoch": 1.692855447424638, + "grad_norm": 0.027312377467751503, + "learning_rate": 9.497424382452501e-05, + "loss": 0.1203, + "step": 1783 + }, + { + "epoch": 1.6938048896273439, + "grad_norm": 0.03260885551571846, + "learning_rate": 9.486388382740548e-05, + "loss": 0.1334, + "step": 1784 + }, + { + "epoch": 1.6947543318300498, + "grad_norm": 0.052055153995752335, + "learning_rate": 9.475353010192162e-05, + "loss": 0.2113, + "step": 1785 + }, + { + "epoch": 1.6957037740327556, + "grad_norm": 0.038476429879665375, + "learning_rate": 9.464318278282472e-05, + "loss": 0.1669, + "step": 1786 + }, + { + "epoch": 1.6966532162354615, + "grad_norm": 0.03111964277923107, + "learning_rate": 9.453284200485825e-05, + "loss": 0.1255, + "step": 1787 + }, + { + "epoch": 1.6976026584381676, + "grad_norm": 0.02803085930645466, + "learning_rate": 9.44225079027577e-05, + "loss": 0.1297, + "step": 1788 + }, + { + "epoch": 1.6985521006408735, + "grad_norm": 0.029160544276237488, + "learning_rate": 9.431218061125044e-05, + "loss": 0.1263, + "step": 1789 + }, + { + "epoch": 1.6995015428435793, + "grad_norm": 0.03121958300471306, + "learning_rate": 9.420186026505548e-05, + "loss": 0.1277, + "step": 1790 + }, + { + "epoch": 1.7004509850462854, + "grad_norm": 0.027002684772014618, + "learning_rate": 9.40915469988834e-05, + "loss": 0.1236, + "step": 1791 + }, + { + "epoch": 1.7014004272489913, + "grad_norm": 0.027172109112143517, + "learning_rate": 9.398124094743604e-05, + "loss": 0.1241, + "step": 1792 + }, + { + "epoch": 1.7023498694516972, + "grad_norm": 0.02658463642001152, + "learning_rate": 9.387094224540653e-05, + "loss": 0.1268, + "step": 1793 + }, + { + "epoch": 1.703299311654403, + "grad_norm": 0.028862452134490013, + "learning_rate": 9.376065102747898e-05, + "loss": 0.1317, + "step": 1794 + }, + { + "epoch": 1.704248753857109, + "grad_norm": 0.025097506120800972, + "learning_rate": 9.365036742832838e-05, + "loss": 0.1164, + "step": 1795 + }, + { + "epoch": 1.7051981960598148, + "grad_norm": 0.04557095095515251, + "learning_rate": 9.354009158262038e-05, + "loss": 0.1522, + "step": 1796 + }, + { + "epoch": 1.7061476382625207, + "grad_norm": 0.03487172722816467, + "learning_rate": 9.342982362501123e-05, + "loss": 0.1398, + "step": 1797 + }, + { + "epoch": 1.7070970804652266, + "grad_norm": 0.02509194053709507, + "learning_rate": 9.331956369014746e-05, + "loss": 0.1166, + "step": 1798 + }, + { + "epoch": 1.7080465226679324, + "grad_norm": 0.030363403260707855, + "learning_rate": 9.320931191266587e-05, + "loss": 0.1191, + "step": 1799 + }, + { + "epoch": 1.7089959648706385, + "grad_norm": 0.08008571714162827, + "learning_rate": 9.309906842719323e-05, + "loss": 0.1494, + "step": 1800 + }, + { + "epoch": 1.7099454070733444, + "grad_norm": 0.027857676148414612, + "learning_rate": 9.298883336834633e-05, + "loss": 0.1307, + "step": 1801 + }, + { + "epoch": 1.7108948492760503, + "grad_norm": 0.02744341269135475, + "learning_rate": 9.28786068707315e-05, + "loss": 0.1217, + "step": 1802 + }, + { + "epoch": 1.7118442914787564, + "grad_norm": 0.0324774868786335, + "learning_rate": 9.276838906894472e-05, + "loss": 0.1311, + "step": 1803 + }, + { + "epoch": 1.7127937336814623, + "grad_norm": 0.028761887922883034, + "learning_rate": 9.265818009757132e-05, + "loss": 0.1275, + "step": 1804 + }, + { + "epoch": 1.7137431758841681, + "grad_norm": 0.02950756810605526, + "learning_rate": 9.254798009118584e-05, + "loss": 0.1262, + "step": 1805 + }, + { + "epoch": 1.714692618086874, + "grad_norm": 0.027881214395165443, + "learning_rate": 9.243778918435187e-05, + "loss": 0.1266, + "step": 1806 + }, + { + "epoch": 1.7156420602895799, + "grad_norm": 0.05155957117676735, + "learning_rate": 9.232760751162193e-05, + "loss": 0.1936, + "step": 1807 + }, + { + "epoch": 1.7165915024922858, + "grad_norm": 0.029041916131973267, + "learning_rate": 9.221743520753719e-05, + "loss": 0.1204, + "step": 1808 + }, + { + "epoch": 1.7175409446949916, + "grad_norm": 0.030144108459353447, + "learning_rate": 9.210727240662747e-05, + "loss": 0.1285, + "step": 1809 + }, + { + "epoch": 1.7184903868976975, + "grad_norm": 0.028103960677981377, + "learning_rate": 9.199711924341093e-05, + "loss": 0.125, + "step": 1810 + }, + { + "epoch": 1.7194398291004034, + "grad_norm": 0.03844073414802551, + "learning_rate": 9.188697585239394e-05, + "loss": 0.1525, + "step": 1811 + }, + { + "epoch": 1.7203892713031093, + "grad_norm": 0.04454744979739189, + "learning_rate": 9.177684236807099e-05, + "loss": 0.1616, + "step": 1812 + }, + { + "epoch": 1.7213387135058154, + "grad_norm": 0.027989163994789124, + "learning_rate": 9.166671892492446e-05, + "loss": 0.1265, + "step": 1813 + }, + { + "epoch": 1.7222881557085212, + "grad_norm": 0.04422546178102493, + "learning_rate": 9.155660565742444e-05, + "loss": 0.159, + "step": 1814 + }, + { + "epoch": 1.723237597911227, + "grad_norm": 0.027917252853512764, + "learning_rate": 9.144650270002866e-05, + "loss": 0.1229, + "step": 1815 + }, + { + "epoch": 1.7241870401139332, + "grad_norm": 0.05252804979681969, + "learning_rate": 9.133641018718217e-05, + "loss": 0.1955, + "step": 1816 + }, + { + "epoch": 1.725136482316639, + "grad_norm": 0.029228439554572105, + "learning_rate": 9.122632825331733e-05, + "loss": 0.1197, + "step": 1817 + }, + { + "epoch": 1.726085924519345, + "grad_norm": 0.02810599096119404, + "learning_rate": 9.111625703285356e-05, + "loss": 0.1284, + "step": 1818 + }, + { + "epoch": 1.7270353667220508, + "grad_norm": 0.02618074230849743, + "learning_rate": 9.10061966601972e-05, + "loss": 0.1239, + "step": 1819 + }, + { + "epoch": 1.7279848089247567, + "grad_norm": 0.026649268344044685, + "learning_rate": 9.089614726974137e-05, + "loss": 0.1218, + "step": 1820 + }, + { + "epoch": 1.7289342511274626, + "grad_norm": 0.02857782505452633, + "learning_rate": 9.078610899586575e-05, + "loss": 0.133, + "step": 1821 + }, + { + "epoch": 1.7298836933301684, + "grad_norm": 0.026309454813599586, + "learning_rate": 9.067608197293642e-05, + "loss": 0.1175, + "step": 1822 + }, + { + "epoch": 1.7308331355328743, + "grad_norm": 0.02791914902627468, + "learning_rate": 9.056606633530578e-05, + "loss": 0.12, + "step": 1823 + }, + { + "epoch": 1.7317825777355802, + "grad_norm": 0.030874181538820267, + "learning_rate": 9.045606221731229e-05, + "loss": 0.1307, + "step": 1824 + }, + { + "epoch": 1.7327320199382863, + "grad_norm": 0.030806539580225945, + "learning_rate": 9.034606975328033e-05, + "loss": 0.1188, + "step": 1825 + }, + { + "epoch": 1.7336814621409922, + "grad_norm": 0.028665419667959213, + "learning_rate": 9.023608907752015e-05, + "loss": 0.1311, + "step": 1826 + }, + { + "epoch": 1.734630904343698, + "grad_norm": 0.029301505535840988, + "learning_rate": 9.012612032432747e-05, + "loss": 0.1325, + "step": 1827 + }, + { + "epoch": 1.7355803465464041, + "grad_norm": 0.0394410640001297, + "learning_rate": 9.001616362798353e-05, + "loss": 0.164, + "step": 1828 + }, + { + "epoch": 1.73652978874911, + "grad_norm": 0.02754109725356102, + "learning_rate": 8.990621912275484e-05, + "loss": 0.12, + "step": 1829 + }, + { + "epoch": 1.737479230951816, + "grad_norm": 0.02719545178115368, + "learning_rate": 8.9796286942893e-05, + "loss": 0.1203, + "step": 1830 + }, + { + "epoch": 1.7384286731545218, + "grad_norm": 0.02480783686041832, + "learning_rate": 8.968636722263455e-05, + "loss": 0.1225, + "step": 1831 + }, + { + "epoch": 1.7393781153572276, + "grad_norm": 0.025418803095817566, + "learning_rate": 8.957646009620085e-05, + "loss": 0.125, + "step": 1832 + }, + { + "epoch": 1.7403275575599335, + "grad_norm": 0.024165470153093338, + "learning_rate": 8.94665656977979e-05, + "loss": 0.125, + "step": 1833 + }, + { + "epoch": 1.7412769997626394, + "grad_norm": 0.03011813573539257, + "learning_rate": 8.935668416161612e-05, + "loss": 0.1337, + "step": 1834 + }, + { + "epoch": 1.7422264419653453, + "grad_norm": 0.038413502275943756, + "learning_rate": 8.92468156218302e-05, + "loss": 0.1715, + "step": 1835 + }, + { + "epoch": 1.7431758841680511, + "grad_norm": 0.023849591612815857, + "learning_rate": 8.9136960212599e-05, + "loss": 0.1148, + "step": 1836 + }, + { + "epoch": 1.7441253263707572, + "grad_norm": 0.027159664779901505, + "learning_rate": 8.902711806806536e-05, + "loss": 0.1255, + "step": 1837 + }, + { + "epoch": 1.745074768573463, + "grad_norm": 0.030395383015275, + "learning_rate": 8.89172893223559e-05, + "loss": 0.1267, + "step": 1838 + }, + { + "epoch": 1.746024210776169, + "grad_norm": 0.025772254914045334, + "learning_rate": 8.880747410958085e-05, + "loss": 0.1212, + "step": 1839 + }, + { + "epoch": 1.7469736529788749, + "grad_norm": 0.04073212668299675, + "learning_rate": 8.8697672563834e-05, + "loss": 0.1229, + "step": 1840 + }, + { + "epoch": 1.747923095181581, + "grad_norm": 0.048602957278490067, + "learning_rate": 8.858788481919235e-05, + "loss": 0.1587, + "step": 1841 + }, + { + "epoch": 1.7488725373842868, + "grad_norm": 0.030672159045934677, + "learning_rate": 8.84781110097161e-05, + "loss": 0.1276, + "step": 1842 + }, + { + "epoch": 1.7498219795869927, + "grad_norm": 0.029867272824048996, + "learning_rate": 8.836835126944843e-05, + "loss": 0.1316, + "step": 1843 + }, + { + "epoch": 1.7507714217896986, + "grad_norm": 0.03122364915907383, + "learning_rate": 8.825860573241535e-05, + "loss": 0.1276, + "step": 1844 + }, + { + "epoch": 1.7517208639924045, + "grad_norm": 0.03530842810869217, + "learning_rate": 8.814887453262555e-05, + "loss": 0.1272, + "step": 1845 + }, + { + "epoch": 1.7526703061951103, + "grad_norm": 0.028104711323976517, + "learning_rate": 8.803915780407009e-05, + "loss": 0.1277, + "step": 1846 + }, + { + "epoch": 1.7536197483978162, + "grad_norm": 0.02434263750910759, + "learning_rate": 8.792945568072252e-05, + "loss": 0.1136, + "step": 1847 + }, + { + "epoch": 1.754569190600522, + "grad_norm": 0.027843188494443893, + "learning_rate": 8.781976829653846e-05, + "loss": 0.1199, + "step": 1848 + }, + { + "epoch": 1.755518632803228, + "grad_norm": 0.03688850998878479, + "learning_rate": 8.771009578545553e-05, + "loss": 0.1345, + "step": 1849 + }, + { + "epoch": 1.756468075005934, + "grad_norm": 0.027186516672372818, + "learning_rate": 8.760043828139325e-05, + "loss": 0.1149, + "step": 1850 + }, + { + "epoch": 1.75741751720864, + "grad_norm": 0.04049715772271156, + "learning_rate": 8.749079591825278e-05, + "loss": 0.1585, + "step": 1851 + }, + { + "epoch": 1.7583669594113458, + "grad_norm": 0.02956775203347206, + "learning_rate": 8.738116882991679e-05, + "loss": 0.1303, + "step": 1852 + }, + { + "epoch": 1.759316401614052, + "grad_norm": 0.026160147041082382, + "learning_rate": 8.72715571502493e-05, + "loss": 0.1237, + "step": 1853 + }, + { + "epoch": 1.7602658438167578, + "grad_norm": 0.023719090968370438, + "learning_rate": 8.71619610130955e-05, + "loss": 0.1172, + "step": 1854 + }, + { + "epoch": 1.7612152860194636, + "grad_norm": 0.024884294718503952, + "learning_rate": 8.705238055228161e-05, + "loss": 0.123, + "step": 1855 + }, + { + "epoch": 1.7621647282221695, + "grad_norm": 0.028241394087672234, + "learning_rate": 8.694281590161474e-05, + "loss": 0.129, + "step": 1856 + }, + { + "epoch": 1.7631141704248754, + "grad_norm": 0.028791090473532677, + "learning_rate": 8.683326719488263e-05, + "loss": 0.121, + "step": 1857 + }, + { + "epoch": 1.7640636126275813, + "grad_norm": 0.046369921416044235, + "learning_rate": 8.672373456585365e-05, + "loss": 0.1666, + "step": 1858 + }, + { + "epoch": 1.7650130548302871, + "grad_norm": 0.025271739810705185, + "learning_rate": 8.661421814827641e-05, + "loss": 0.1196, + "step": 1859 + }, + { + "epoch": 1.765962497032993, + "grad_norm": 0.02569795772433281, + "learning_rate": 8.650471807587983e-05, + "loss": 0.1235, + "step": 1860 + }, + { + "epoch": 1.766911939235699, + "grad_norm": 0.03638843819499016, + "learning_rate": 8.639523448237282e-05, + "loss": 0.1523, + "step": 1861 + }, + { + "epoch": 1.767861381438405, + "grad_norm": 0.03260574862360954, + "learning_rate": 8.628576750144419e-05, + "loss": 0.1328, + "step": 1862 + }, + { + "epoch": 1.7688108236411109, + "grad_norm": 0.02770201303064823, + "learning_rate": 8.617631726676243e-05, + "loss": 0.1256, + "step": 1863 + }, + { + "epoch": 1.7697602658438167, + "grad_norm": 0.02869422361254692, + "learning_rate": 8.606688391197564e-05, + "loss": 0.1261, + "step": 1864 + }, + { + "epoch": 1.7707097080465226, + "grad_norm": 0.02792002633213997, + "learning_rate": 8.595746757071125e-05, + "loss": 0.1277, + "step": 1865 + }, + { + "epoch": 1.7716591502492287, + "grad_norm": 0.025662843137979507, + "learning_rate": 8.584806837657594e-05, + "loss": 0.1163, + "step": 1866 + }, + { + "epoch": 1.7726085924519346, + "grad_norm": 0.027771448716521263, + "learning_rate": 8.573868646315546e-05, + "loss": 0.1273, + "step": 1867 + }, + { + "epoch": 1.7735580346546405, + "grad_norm": 0.026355689391493797, + "learning_rate": 8.562932196401444e-05, + "loss": 0.1241, + "step": 1868 + }, + { + "epoch": 1.7745074768573463, + "grad_norm": 0.028244782239198685, + "learning_rate": 8.551997501269629e-05, + "loss": 0.1319, + "step": 1869 + }, + { + "epoch": 1.7754569190600522, + "grad_norm": 0.027661755681037903, + "learning_rate": 8.541064574272292e-05, + "loss": 0.134, + "step": 1870 + }, + { + "epoch": 1.776406361262758, + "grad_norm": 0.026287924498319626, + "learning_rate": 8.530133428759468e-05, + "loss": 0.1215, + "step": 1871 + }, + { + "epoch": 1.777355803465464, + "grad_norm": 0.040449049323797226, + "learning_rate": 8.519204078079021e-05, + "loss": 0.179, + "step": 1872 + }, + { + "epoch": 1.7783052456681698, + "grad_norm": 0.022792836651206017, + "learning_rate": 8.508276535576619e-05, + "loss": 0.1208, + "step": 1873 + }, + { + "epoch": 1.7792546878708757, + "grad_norm": 0.042618922889232635, + "learning_rate": 8.497350814595721e-05, + "loss": 0.1666, + "step": 1874 + }, + { + "epoch": 1.7802041300735818, + "grad_norm": 0.0521213673055172, + "learning_rate": 8.486426928477561e-05, + "loss": 0.1858, + "step": 1875 + }, + { + "epoch": 1.7811535722762877, + "grad_norm": 0.04682205617427826, + "learning_rate": 8.475504890561142e-05, + "loss": 0.2037, + "step": 1876 + }, + { + "epoch": 1.7821030144789936, + "grad_norm": 0.041265182197093964, + "learning_rate": 8.464584714183204e-05, + "loss": 0.1775, + "step": 1877 + }, + { + "epoch": 1.7830524566816996, + "grad_norm": 0.027602121233940125, + "learning_rate": 8.453666412678206e-05, + "loss": 0.1186, + "step": 1878 + }, + { + "epoch": 1.7840018988844055, + "grad_norm": 0.025885846465826035, + "learning_rate": 8.442749999378327e-05, + "loss": 0.1275, + "step": 1879 + }, + { + "epoch": 1.7849513410871114, + "grad_norm": 0.05111463740468025, + "learning_rate": 8.43183548761344e-05, + "loss": 0.1535, + "step": 1880 + }, + { + "epoch": 1.7859007832898173, + "grad_norm": 0.026447484269738197, + "learning_rate": 8.420922890711094e-05, + "loss": 0.1244, + "step": 1881 + }, + { + "epoch": 1.7868502254925231, + "grad_norm": 0.046114444732666016, + "learning_rate": 8.410012221996502e-05, + "loss": 0.1549, + "step": 1882 + }, + { + "epoch": 1.787799667695229, + "grad_norm": 0.027883663773536682, + "learning_rate": 8.399103494792514e-05, + "loss": 0.1186, + "step": 1883 + }, + { + "epoch": 1.788749109897935, + "grad_norm": 0.02667239122092724, + "learning_rate": 8.388196722419621e-05, + "loss": 0.1367, + "step": 1884 + }, + { + "epoch": 1.7896985521006408, + "grad_norm": 0.028317047283053398, + "learning_rate": 8.377291918195922e-05, + "loss": 0.1293, + "step": 1885 + }, + { + "epoch": 1.7906479943033466, + "grad_norm": 0.044224657118320465, + "learning_rate": 8.36638909543711e-05, + "loss": 0.1722, + "step": 1886 + }, + { + "epoch": 1.7915974365060527, + "grad_norm": 0.029220551252365112, + "learning_rate": 8.35548826745646e-05, + "loss": 0.1315, + "step": 1887 + }, + { + "epoch": 1.7925468787087586, + "grad_norm": 0.028357302770018578, + "learning_rate": 8.344589447564818e-05, + "loss": 0.1271, + "step": 1888 + }, + { + "epoch": 1.7934963209114645, + "grad_norm": 0.027882913127541542, + "learning_rate": 8.333692649070568e-05, + "loss": 0.1311, + "step": 1889 + }, + { + "epoch": 1.7944457631141706, + "grad_norm": 0.029344897717237473, + "learning_rate": 8.322797885279627e-05, + "loss": 0.1231, + "step": 1890 + }, + { + "epoch": 1.7953952053168765, + "grad_norm": 0.039409589022397995, + "learning_rate": 8.311905169495435e-05, + "loss": 0.1651, + "step": 1891 + }, + { + "epoch": 1.7963446475195823, + "grad_norm": 0.025551313534379005, + "learning_rate": 8.301014515018925e-05, + "loss": 0.1162, + "step": 1892 + }, + { + "epoch": 1.7972940897222882, + "grad_norm": 0.027775781229138374, + "learning_rate": 8.290125935148516e-05, + "loss": 0.1254, + "step": 1893 + }, + { + "epoch": 1.798243531924994, + "grad_norm": 0.025555282831192017, + "learning_rate": 8.279239443180088e-05, + "loss": 0.1173, + "step": 1894 + }, + { + "epoch": 1.7991929741277, + "grad_norm": 0.027120131999254227, + "learning_rate": 8.268355052406978e-05, + "loss": 0.123, + "step": 1895 + }, + { + "epoch": 1.8001424163304058, + "grad_norm": 0.027624819427728653, + "learning_rate": 8.257472776119957e-05, + "loss": 0.1313, + "step": 1896 + }, + { + "epoch": 1.8010918585331117, + "grad_norm": 0.04341182857751846, + "learning_rate": 8.246592627607208e-05, + "loss": 0.1695, + "step": 1897 + }, + { + "epoch": 1.8020413007358176, + "grad_norm": 0.049969110637903214, + "learning_rate": 8.235714620154323e-05, + "loss": 0.1609, + "step": 1898 + }, + { + "epoch": 1.8029907429385235, + "grad_norm": 0.033960528671741486, + "learning_rate": 8.224838767044275e-05, + "loss": 0.1299, + "step": 1899 + }, + { + "epoch": 1.8039401851412296, + "grad_norm": 0.045656926929950714, + "learning_rate": 8.213965081557402e-05, + "loss": 0.1613, + "step": 1900 + }, + { + "epoch": 1.8048896273439354, + "grad_norm": 0.028559250757098198, + "learning_rate": 8.203093576971414e-05, + "loss": 0.1222, + "step": 1901 + }, + { + "epoch": 1.8058390695466413, + "grad_norm": 0.026110464707016945, + "learning_rate": 8.192224266561336e-05, + "loss": 0.1204, + "step": 1902 + }, + { + "epoch": 1.8067885117493474, + "grad_norm": 0.023995952680706978, + "learning_rate": 8.181357163599522e-05, + "loss": 0.1164, + "step": 1903 + }, + { + "epoch": 1.8077379539520533, + "grad_norm": 0.054921507835388184, + "learning_rate": 8.170492281355635e-05, + "loss": 0.1535, + "step": 1904 + }, + { + "epoch": 1.8086873961547592, + "grad_norm": 0.03065885417163372, + "learning_rate": 8.159629633096619e-05, + "loss": 0.1325, + "step": 1905 + }, + { + "epoch": 1.809636838357465, + "grad_norm": 0.02722746506333351, + "learning_rate": 8.148769232086698e-05, + "loss": 0.1201, + "step": 1906 + }, + { + "epoch": 1.810586280560171, + "grad_norm": 0.026619885116815567, + "learning_rate": 8.13791109158734e-05, + "loss": 0.125, + "step": 1907 + }, + { + "epoch": 1.8115357227628768, + "grad_norm": 0.02867223508656025, + "learning_rate": 8.127055224857266e-05, + "loss": 0.1279, + "step": 1908 + }, + { + "epoch": 1.8124851649655827, + "grad_norm": 0.029073316603899002, + "learning_rate": 8.116201645152412e-05, + "loss": 0.1195, + "step": 1909 + }, + { + "epoch": 1.8134346071682885, + "grad_norm": 0.04955434426665306, + "learning_rate": 8.105350365725926e-05, + "loss": 0.1657, + "step": 1910 + }, + { + "epoch": 1.8143840493709944, + "grad_norm": 0.03038748912513256, + "learning_rate": 8.094501399828143e-05, + "loss": 0.1244, + "step": 1911 + }, + { + "epoch": 1.8153334915737005, + "grad_norm": 0.08984460681676865, + "learning_rate": 8.08365476070658e-05, + "loss": 0.1602, + "step": 1912 + }, + { + "epoch": 1.8162829337764064, + "grad_norm": 0.028402511030435562, + "learning_rate": 8.0728104616059e-05, + "loss": 0.1162, + "step": 1913 + }, + { + "epoch": 1.8172323759791122, + "grad_norm": 0.03234838321805, + "learning_rate": 8.061968515767922e-05, + "loss": 0.1271, + "step": 1914 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.031242968514561653, + "learning_rate": 8.051128936431584e-05, + "loss": 0.1266, + "step": 1915 + }, + { + "epoch": 1.8191312603845242, + "grad_norm": 0.030700111761689186, + "learning_rate": 8.040291736832937e-05, + "loss": 0.1294, + "step": 1916 + }, + { + "epoch": 1.82008070258723, + "grad_norm": 0.031359221786260605, + "learning_rate": 8.029456930205128e-05, + "loss": 0.1316, + "step": 1917 + }, + { + "epoch": 1.821030144789936, + "grad_norm": 0.03048795275390148, + "learning_rate": 8.018624529778375e-05, + "loss": 0.127, + "step": 1918 + }, + { + "epoch": 1.8219795869926418, + "grad_norm": 0.03642559424042702, + "learning_rate": 8.007794548779964e-05, + "loss": 0.1577, + "step": 1919 + }, + { + "epoch": 1.8229290291953477, + "grad_norm": 0.029665078967809677, + "learning_rate": 7.996967000434224e-05, + "loss": 0.1266, + "step": 1920 + }, + { + "epoch": 1.8238784713980536, + "grad_norm": 0.027418775483965874, + "learning_rate": 7.986141897962518e-05, + "loss": 0.1319, + "step": 1921 + }, + { + "epoch": 1.8248279136007595, + "grad_norm": 0.03040868602693081, + "learning_rate": 7.975319254583216e-05, + "loss": 0.1263, + "step": 1922 + }, + { + "epoch": 1.8257773558034653, + "grad_norm": 0.030304348096251488, + "learning_rate": 7.96449908351169e-05, + "loss": 0.1357, + "step": 1923 + }, + { + "epoch": 1.8267267980061712, + "grad_norm": 0.029055261984467506, + "learning_rate": 7.953681397960287e-05, + "loss": 0.1293, + "step": 1924 + }, + { + "epoch": 1.8276762402088773, + "grad_norm": 0.030226033180952072, + "learning_rate": 7.942866211138324e-05, + "loss": 0.1335, + "step": 1925 + }, + { + "epoch": 1.8286256824115832, + "grad_norm": 0.02857894077897072, + "learning_rate": 7.93205353625207e-05, + "loss": 0.1307, + "step": 1926 + }, + { + "epoch": 1.829575124614289, + "grad_norm": 0.031932733952999115, + "learning_rate": 7.921243386504723e-05, + "loss": 0.1345, + "step": 1927 + }, + { + "epoch": 1.8305245668169952, + "grad_norm": 0.030729882419109344, + "learning_rate": 7.910435775096394e-05, + "loss": 0.1256, + "step": 1928 + }, + { + "epoch": 1.831474009019701, + "grad_norm": 0.056101903319358826, + "learning_rate": 7.899630715224098e-05, + "loss": 0.1858, + "step": 1929 + }, + { + "epoch": 1.832423451222407, + "grad_norm": 0.02785342186689377, + "learning_rate": 7.888828220081738e-05, + "loss": 0.1244, + "step": 1930 + }, + { + "epoch": 1.8333728934251128, + "grad_norm": 0.031731851398944855, + "learning_rate": 7.878028302860076e-05, + "loss": 0.1339, + "step": 1931 + }, + { + "epoch": 1.8343223356278187, + "grad_norm": 0.02918057143688202, + "learning_rate": 7.867230976746733e-05, + "loss": 0.1309, + "step": 1932 + }, + { + "epoch": 1.8352717778305245, + "grad_norm": 0.029841450974345207, + "learning_rate": 7.856436254926165e-05, + "loss": 0.1244, + "step": 1933 + }, + { + "epoch": 1.8362212200332304, + "grad_norm": 0.02823001891374588, + "learning_rate": 7.845644150579649e-05, + "loss": 0.1253, + "step": 1934 + }, + { + "epoch": 1.8371706622359363, + "grad_norm": 0.027422424405813217, + "learning_rate": 7.834854676885262e-05, + "loss": 0.1141, + "step": 1935 + }, + { + "epoch": 1.8381201044386422, + "grad_norm": 0.02671034075319767, + "learning_rate": 7.824067847017876e-05, + "loss": 0.1204, + "step": 1936 + }, + { + "epoch": 1.8390695466413483, + "grad_norm": 0.02999771386384964, + "learning_rate": 7.813283674149123e-05, + "loss": 0.1258, + "step": 1937 + }, + { + "epoch": 1.8400189888440541, + "grad_norm": 0.026322634890675545, + "learning_rate": 7.8025021714474e-05, + "loss": 0.117, + "step": 1938 + }, + { + "epoch": 1.84096843104676, + "grad_norm": 0.02665727399289608, + "learning_rate": 7.791723352077842e-05, + "loss": 0.1168, + "step": 1939 + }, + { + "epoch": 1.841917873249466, + "grad_norm": 0.043415650725364685, + "learning_rate": 7.780947229202305e-05, + "loss": 0.1666, + "step": 1940 + }, + { + "epoch": 1.842867315452172, + "grad_norm": 0.03161248564720154, + "learning_rate": 7.770173815979356e-05, + "loss": 0.1307, + "step": 1941 + }, + { + "epoch": 1.8438167576548778, + "grad_norm": 0.028511585667729378, + "learning_rate": 7.759403125564246e-05, + "loss": 0.1329, + "step": 1942 + }, + { + "epoch": 1.8447661998575837, + "grad_norm": 0.03340164199471474, + "learning_rate": 7.74863517110891e-05, + "loss": 0.1233, + "step": 1943 + }, + { + "epoch": 1.8457156420602896, + "grad_norm": 0.027353493496775627, + "learning_rate": 7.737869965761937e-05, + "loss": 0.1227, + "step": 1944 + }, + { + "epoch": 1.8466650842629955, + "grad_norm": 0.04435974359512329, + "learning_rate": 7.72710752266856e-05, + "loss": 0.1586, + "step": 1945 + }, + { + "epoch": 1.8476145264657013, + "grad_norm": 0.03443425893783569, + "learning_rate": 7.716347854970642e-05, + "loss": 0.1285, + "step": 1946 + }, + { + "epoch": 1.8485639686684072, + "grad_norm": 0.02941983938217163, + "learning_rate": 7.705590975806652e-05, + "loss": 0.1227, + "step": 1947 + }, + { + "epoch": 1.849513410871113, + "grad_norm": 0.031541094183921814, + "learning_rate": 7.694836898311654e-05, + "loss": 0.1307, + "step": 1948 + }, + { + "epoch": 1.850462853073819, + "grad_norm": 0.030199352651834488, + "learning_rate": 7.684085635617297e-05, + "loss": 0.126, + "step": 1949 + }, + { + "epoch": 1.851412295276525, + "grad_norm": 0.024474412202835083, + "learning_rate": 7.673337200851787e-05, + "loss": 0.1222, + "step": 1950 + }, + { + "epoch": 1.852361737479231, + "grad_norm": 0.02779853343963623, + "learning_rate": 7.662591607139882e-05, + "loss": 0.1242, + "step": 1951 + }, + { + "epoch": 1.8533111796819368, + "grad_norm": 0.036591142416000366, + "learning_rate": 7.651848867602867e-05, + "loss": 0.1593, + "step": 1952 + }, + { + "epoch": 1.854260621884643, + "grad_norm": 0.029311561957001686, + "learning_rate": 7.641108995358542e-05, + "loss": 0.1237, + "step": 1953 + }, + { + "epoch": 1.8552100640873488, + "grad_norm": 0.029321955516934395, + "learning_rate": 7.630372003521204e-05, + "loss": 0.1289, + "step": 1954 + }, + { + "epoch": 1.8561595062900547, + "grad_norm": 0.034341856837272644, + "learning_rate": 7.619637905201642e-05, + "loss": 0.1323, + "step": 1955 + }, + { + "epoch": 1.8571089484927605, + "grad_norm": 0.02771337330341339, + "learning_rate": 7.608906713507098e-05, + "loss": 0.133, + "step": 1956 + }, + { + "epoch": 1.8580583906954664, + "grad_norm": 0.03146693855524063, + "learning_rate": 7.598178441541274e-05, + "loss": 0.1346, + "step": 1957 + }, + { + "epoch": 1.8590078328981723, + "grad_norm": 0.028372354805469513, + "learning_rate": 7.587453102404306e-05, + "loss": 0.1194, + "step": 1958 + }, + { + "epoch": 1.8599572751008782, + "grad_norm": 0.027766333892941475, + "learning_rate": 7.576730709192744e-05, + "loss": 0.1241, + "step": 1959 + }, + { + "epoch": 1.860906717303584, + "grad_norm": 0.026262789964675903, + "learning_rate": 7.566011274999549e-05, + "loss": 0.1191, + "step": 1960 + }, + { + "epoch": 1.86185615950629, + "grad_norm": 0.028471313416957855, + "learning_rate": 7.555294812914061e-05, + "loss": 0.1208, + "step": 1961 + }, + { + "epoch": 1.862805601708996, + "grad_norm": 0.0403280183672905, + "learning_rate": 7.544581336021994e-05, + "loss": 0.169, + "step": 1962 + }, + { + "epoch": 1.8637550439117019, + "grad_norm": 0.029336489737033844, + "learning_rate": 7.533870857405414e-05, + "loss": 0.1275, + "step": 1963 + }, + { + "epoch": 1.8647044861144078, + "grad_norm": 0.05861514061689377, + "learning_rate": 7.523163390142732e-05, + "loss": 0.1984, + "step": 1964 + }, + { + "epoch": 1.8656539283171139, + "grad_norm": 0.026019204407930374, + "learning_rate": 7.51245894730868e-05, + "loss": 0.1194, + "step": 1965 + }, + { + "epoch": 1.8666033705198197, + "grad_norm": 0.043394673615694046, + "learning_rate": 7.501757541974289e-05, + "loss": 0.1598, + "step": 1966 + }, + { + "epoch": 1.8675528127225256, + "grad_norm": 0.025404971092939377, + "learning_rate": 7.49105918720689e-05, + "loss": 0.1159, + "step": 1967 + }, + { + "epoch": 1.8685022549252315, + "grad_norm": 0.028354499489068985, + "learning_rate": 7.480363896070089e-05, + "loss": 0.1216, + "step": 1968 + }, + { + "epoch": 1.8694516971279374, + "grad_norm": 0.06748262792825699, + "learning_rate": 7.469671681623742e-05, + "loss": 0.1888, + "step": 1969 + }, + { + "epoch": 1.8704011393306432, + "grad_norm": 0.027621906250715256, + "learning_rate": 7.458982556923963e-05, + "loss": 0.1196, + "step": 1970 + }, + { + "epoch": 1.871350581533349, + "grad_norm": 0.031015096232295036, + "learning_rate": 7.448296535023077e-05, + "loss": 0.1266, + "step": 1971 + }, + { + "epoch": 1.872300023736055, + "grad_norm": 0.02895331382751465, + "learning_rate": 7.437613628969627e-05, + "loss": 0.1284, + "step": 1972 + }, + { + "epoch": 1.8732494659387608, + "grad_norm": 0.04110453650355339, + "learning_rate": 7.426933851808355e-05, + "loss": 0.1545, + "step": 1973 + }, + { + "epoch": 1.8741989081414667, + "grad_norm": 0.030046746134757996, + "learning_rate": 7.416257216580181e-05, + "loss": 0.1269, + "step": 1974 + }, + { + "epoch": 1.8751483503441728, + "grad_norm": 0.04101106524467468, + "learning_rate": 7.405583736322182e-05, + "loss": 0.1621, + "step": 1975 + }, + { + "epoch": 1.8760977925468787, + "grad_norm": 0.04438061639666557, + "learning_rate": 7.394913424067591e-05, + "loss": 0.1693, + "step": 1976 + }, + { + "epoch": 1.8770472347495846, + "grad_norm": 0.028554782271385193, + "learning_rate": 7.38424629284577e-05, + "loss": 0.1265, + "step": 1977 + }, + { + "epoch": 1.8779966769522907, + "grad_norm": 0.028575632721185684, + "learning_rate": 7.373582355682191e-05, + "loss": 0.12, + "step": 1978 + }, + { + "epoch": 1.8789461191549965, + "grad_norm": 0.027870802208781242, + "learning_rate": 7.362921625598436e-05, + "loss": 0.1301, + "step": 1979 + }, + { + "epoch": 1.8798955613577024, + "grad_norm": 0.03442731872200966, + "learning_rate": 7.352264115612158e-05, + "loss": 0.1537, + "step": 1980 + }, + { + "epoch": 1.8808450035604083, + "grad_norm": 0.03244437277317047, + "learning_rate": 7.341609838737089e-05, + "loss": 0.1356, + "step": 1981 + }, + { + "epoch": 1.8817944457631142, + "grad_norm": 0.0298260897397995, + "learning_rate": 7.330958807983011e-05, + "loss": 0.1238, + "step": 1982 + }, + { + "epoch": 1.88274388796582, + "grad_norm": 0.02521882764995098, + "learning_rate": 7.320311036355736e-05, + "loss": 0.1185, + "step": 1983 + }, + { + "epoch": 1.883693330168526, + "grad_norm": 0.02781338430941105, + "learning_rate": 7.309666536857106e-05, + "loss": 0.1214, + "step": 1984 + }, + { + "epoch": 1.8846427723712318, + "grad_norm": 0.05946779251098633, + "learning_rate": 7.299025322484958e-05, + "loss": 0.1897, + "step": 1985 + }, + { + "epoch": 1.8855922145739377, + "grad_norm": 0.028507012873888016, + "learning_rate": 7.288387406233122e-05, + "loss": 0.1282, + "step": 1986 + }, + { + "epoch": 1.8865416567766438, + "grad_norm": 0.029610810801386833, + "learning_rate": 7.277752801091404e-05, + "loss": 0.1302, + "step": 1987 + }, + { + "epoch": 1.8874910989793496, + "grad_norm": 0.030304808169603348, + "learning_rate": 7.267121520045558e-05, + "loss": 0.132, + "step": 1988 + }, + { + "epoch": 1.8884405411820555, + "grad_norm": 0.028647607192397118, + "learning_rate": 7.256493576077292e-05, + "loss": 0.1309, + "step": 1989 + }, + { + "epoch": 1.8893899833847616, + "grad_norm": 0.04364948347210884, + "learning_rate": 7.245868982164226e-05, + "loss": 0.1628, + "step": 1990 + }, + { + "epoch": 1.8903394255874675, + "grad_norm": 0.029457390308380127, + "learning_rate": 7.235247751279893e-05, + "loss": 0.1163, + "step": 1991 + }, + { + "epoch": 1.8912888677901734, + "grad_norm": 0.059156183153390884, + "learning_rate": 7.224629896393726e-05, + "loss": 0.2033, + "step": 1992 + }, + { + "epoch": 1.8922383099928792, + "grad_norm": 0.02888781949877739, + "learning_rate": 7.214015430471028e-05, + "loss": 0.1242, + "step": 1993 + }, + { + "epoch": 1.893187752195585, + "grad_norm": 0.03040069155395031, + "learning_rate": 7.20340436647297e-05, + "loss": 0.1211, + "step": 1994 + }, + { + "epoch": 1.894137194398291, + "grad_norm": 0.028204258531332016, + "learning_rate": 7.192796717356562e-05, + "loss": 0.1267, + "step": 1995 + }, + { + "epoch": 1.8950866366009969, + "grad_norm": 0.030367571860551834, + "learning_rate": 7.182192496074648e-05, + "loss": 0.1232, + "step": 1996 + }, + { + "epoch": 1.8960360788037027, + "grad_norm": 0.02587362751364708, + "learning_rate": 7.171591715575888e-05, + "loss": 0.1261, + "step": 1997 + }, + { + "epoch": 1.8969855210064086, + "grad_norm": 0.028903882950544357, + "learning_rate": 7.160994388804736e-05, + "loss": 0.1318, + "step": 1998 + }, + { + "epoch": 1.8979349632091147, + "grad_norm": 0.025526562705636024, + "learning_rate": 7.150400528701436e-05, + "loss": 0.1205, + "step": 1999 + }, + { + "epoch": 1.8988844054118206, + "grad_norm": 0.029438691213726997, + "learning_rate": 7.139810148201987e-05, + "loss": 0.131, + "step": 2000 + }, + { + "epoch": 1.8998338476145264, + "grad_norm": 0.026545461267232895, + "learning_rate": 7.129223260238154e-05, + "loss": 0.1219, + "step": 2001 + }, + { + "epoch": 1.9007832898172323, + "grad_norm": 0.03137153759598732, + "learning_rate": 7.118639877737425e-05, + "loss": 0.1389, + "step": 2002 + }, + { + "epoch": 1.9017327320199384, + "grad_norm": 0.03655494004487991, + "learning_rate": 7.108060013623017e-05, + "loss": 0.1592, + "step": 2003 + }, + { + "epoch": 1.9026821742226443, + "grad_norm": 0.0271841399371624, + "learning_rate": 7.09748368081384e-05, + "loss": 0.1309, + "step": 2004 + }, + { + "epoch": 1.9036316164253502, + "grad_norm": 0.028577405959367752, + "learning_rate": 7.086910892224499e-05, + "loss": 0.1261, + "step": 2005 + }, + { + "epoch": 1.904581058628056, + "grad_norm": 0.028025876730680466, + "learning_rate": 7.076341660765271e-05, + "loss": 0.1323, + "step": 2006 + }, + { + "epoch": 1.905530500830762, + "grad_norm": 0.03332342579960823, + "learning_rate": 7.065775999342091e-05, + "loss": 0.1327, + "step": 2007 + }, + { + "epoch": 1.9064799430334678, + "grad_norm": 0.043180011212825775, + "learning_rate": 7.055213920856529e-05, + "loss": 0.1613, + "step": 2008 + }, + { + "epoch": 1.9074293852361737, + "grad_norm": 0.04228482022881508, + "learning_rate": 7.044655438205785e-05, + "loss": 0.1594, + "step": 2009 + }, + { + "epoch": 1.9083788274388795, + "grad_norm": 0.029172802343964577, + "learning_rate": 7.034100564282664e-05, + "loss": 0.1258, + "step": 2010 + }, + { + "epoch": 1.9093282696415854, + "grad_norm": 0.0426810160279274, + "learning_rate": 7.02354931197557e-05, + "loss": 0.16, + "step": 2011 + }, + { + "epoch": 1.9102777118442915, + "grad_norm": 0.025085503235459328, + "learning_rate": 7.013001694168478e-05, + "loss": 0.1233, + "step": 2012 + }, + { + "epoch": 1.9112271540469974, + "grad_norm": 0.0266293715685606, + "learning_rate": 7.002457723740934e-05, + "loss": 0.1214, + "step": 2013 + }, + { + "epoch": 1.9121765962497033, + "grad_norm": 0.03064984828233719, + "learning_rate": 6.991917413568017e-05, + "loss": 0.1186, + "step": 2014 + }, + { + "epoch": 1.9131260384524094, + "grad_norm": 0.026003271341323853, + "learning_rate": 6.981380776520348e-05, + "loss": 0.1228, + "step": 2015 + }, + { + "epoch": 1.9140754806551152, + "grad_norm": 0.045436155050992966, + "learning_rate": 6.970847825464059e-05, + "loss": 0.174, + "step": 2016 + }, + { + "epoch": 1.915024922857821, + "grad_norm": 0.029938362538814545, + "learning_rate": 6.960318573260783e-05, + "loss": 0.1201, + "step": 2017 + }, + { + "epoch": 1.915974365060527, + "grad_norm": 0.026935014873743057, + "learning_rate": 6.949793032767634e-05, + "loss": 0.1165, + "step": 2018 + }, + { + "epoch": 1.9169238072632329, + "grad_norm": 0.02809876948595047, + "learning_rate": 6.93927121683719e-05, + "loss": 0.1248, + "step": 2019 + }, + { + "epoch": 1.9178732494659387, + "grad_norm": 0.03932083770632744, + "learning_rate": 6.928753138317488e-05, + "loss": 0.1607, + "step": 2020 + }, + { + "epoch": 1.9188226916686446, + "grad_norm": 0.029043098911643028, + "learning_rate": 6.918238810051999e-05, + "loss": 0.1292, + "step": 2021 + }, + { + "epoch": 1.9197721338713505, + "grad_norm": 0.03849990293383598, + "learning_rate": 6.907728244879611e-05, + "loss": 0.1611, + "step": 2022 + }, + { + "epoch": 1.9207215760740564, + "grad_norm": 0.028439447283744812, + "learning_rate": 6.897221455634624e-05, + "loss": 0.1265, + "step": 2023 + }, + { + "epoch": 1.9216710182767625, + "grad_norm": 0.028611112385988235, + "learning_rate": 6.886718455146724e-05, + "loss": 0.1312, + "step": 2024 + }, + { + "epoch": 1.9226204604794683, + "grad_norm": 0.02605103701353073, + "learning_rate": 6.87621925624096e-05, + "loss": 0.1241, + "step": 2025 + }, + { + "epoch": 1.9235699026821742, + "grad_norm": 0.06604333966970444, + "learning_rate": 6.865723871737762e-05, + "loss": 0.2016, + "step": 2026 + }, + { + "epoch": 1.92451934488488, + "grad_norm": 0.044974714517593384, + "learning_rate": 6.855232314452884e-05, + "loss": 0.1778, + "step": 2027 + }, + { + "epoch": 1.9254687870875862, + "grad_norm": 0.03168616443872452, + "learning_rate": 6.844744597197409e-05, + "loss": 0.1327, + "step": 2028 + }, + { + "epoch": 1.926418229290292, + "grad_norm": 0.029546428471803665, + "learning_rate": 6.834260732777736e-05, + "loss": 0.1302, + "step": 2029 + }, + { + "epoch": 1.927367671492998, + "grad_norm": 0.05021713301539421, + "learning_rate": 6.823780733995557e-05, + "loss": 0.1863, + "step": 2030 + }, + { + "epoch": 1.9283171136957038, + "grad_norm": 0.030026502907276154, + "learning_rate": 6.813304613647845e-05, + "loss": 0.1349, + "step": 2031 + }, + { + "epoch": 1.9292665558984097, + "grad_norm": 0.03538592904806137, + "learning_rate": 6.802832384526836e-05, + "loss": 0.1374, + "step": 2032 + }, + { + "epoch": 1.9302159981011155, + "grad_norm": 0.027488164603710175, + "learning_rate": 6.792364059420012e-05, + "loss": 0.1237, + "step": 2033 + }, + { + "epoch": 1.9311654403038214, + "grad_norm": 0.03553836792707443, + "learning_rate": 6.781899651110091e-05, + "loss": 0.1522, + "step": 2034 + }, + { + "epoch": 1.9321148825065273, + "grad_norm": 0.029753949493169785, + "learning_rate": 6.771439172375007e-05, + "loss": 0.1222, + "step": 2035 + }, + { + "epoch": 1.9330643247092332, + "grad_norm": 0.03108718991279602, + "learning_rate": 6.760982635987899e-05, + "loss": 0.1186, + "step": 2036 + }, + { + "epoch": 1.9340137669119393, + "grad_norm": 0.02662482298910618, + "learning_rate": 6.750530054717088e-05, + "loss": 0.1189, + "step": 2037 + }, + { + "epoch": 1.9349632091146451, + "grad_norm": 0.029288165271282196, + "learning_rate": 6.740081441326062e-05, + "loss": 0.1179, + "step": 2038 + }, + { + "epoch": 1.935912651317351, + "grad_norm": 0.03478897735476494, + "learning_rate": 6.729636808573476e-05, + "loss": 0.1249, + "step": 2039 + }, + { + "epoch": 1.9368620935200571, + "grad_norm": 0.03969739004969597, + "learning_rate": 6.719196169213114e-05, + "loss": 0.1579, + "step": 2040 + }, + { + "epoch": 1.937811535722763, + "grad_norm": 0.030195200815796852, + "learning_rate": 6.708759535993884e-05, + "loss": 0.115, + "step": 2041 + }, + { + "epoch": 1.9387609779254689, + "grad_norm": 0.03426138311624527, + "learning_rate": 6.698326921659808e-05, + "loss": 0.1266, + "step": 2042 + }, + { + "epoch": 1.9397104201281747, + "grad_norm": 0.05202037841081619, + "learning_rate": 6.687898338949998e-05, + "loss": 0.193, + "step": 2043 + }, + { + "epoch": 1.9406598623308806, + "grad_norm": 0.027649085968732834, + "learning_rate": 6.67747380059864e-05, + "loss": 0.1222, + "step": 2044 + }, + { + "epoch": 1.9416093045335865, + "grad_norm": 0.026928169652819633, + "learning_rate": 6.667053319334982e-05, + "loss": 0.1204, + "step": 2045 + }, + { + "epoch": 1.9425587467362924, + "grad_norm": 0.056547269225120544, + "learning_rate": 6.656636907883325e-05, + "loss": 0.1602, + "step": 2046 + }, + { + "epoch": 1.9435081889389982, + "grad_norm": 0.026589645072817802, + "learning_rate": 6.646224578962993e-05, + "loss": 0.1214, + "step": 2047 + }, + { + "epoch": 1.9444576311417041, + "grad_norm": 0.02858765795826912, + "learning_rate": 6.635816345288329e-05, + "loss": 0.1242, + "step": 2048 + }, + { + "epoch": 1.9454070733444102, + "grad_norm": 0.04160701856017113, + "learning_rate": 6.625412219568668e-05, + "loss": 0.1606, + "step": 2049 + }, + { + "epoch": 1.946356515547116, + "grad_norm": 0.03329680487513542, + "learning_rate": 6.615012214508336e-05, + "loss": 0.1346, + "step": 2050 + }, + { + "epoch": 1.947305957749822, + "grad_norm": 0.041767850518226624, + "learning_rate": 6.604616342806632e-05, + "loss": 0.1566, + "step": 2051 + }, + { + "epoch": 1.948255399952528, + "grad_norm": 0.027340400964021683, + "learning_rate": 6.594224617157795e-05, + "loss": 0.1253, + "step": 2052 + }, + { + "epoch": 1.949204842155234, + "grad_norm": 0.06383645534515381, + "learning_rate": 6.583837050251012e-05, + "loss": 0.1518, + "step": 2053 + }, + { + "epoch": 1.9501542843579398, + "grad_norm": 0.051231034100055695, + "learning_rate": 6.573453654770383e-05, + "loss": 0.1565, + "step": 2054 + }, + { + "epoch": 1.9501542843579398, + "eval_loss": 0.37301480770111084, + "eval_runtime": 38.0432, + "eval_samples_per_second": 2.261, + "eval_steps_per_second": 2.261, + "step": 2054 + }, + { + "epoch": 1.9513410871113221, + "grad_norm": 0.4340180456638336, + "learning_rate": 0.00013378114170405474, + "loss": 0.3769, + "step": 2055 + }, + { + "epoch": 1.952290529314028, + "grad_norm": 0.2337399125099182, + "learning_rate": 0.00013372497405242763, + "loss": 0.3148, + "step": 2056 + }, + { + "epoch": 1.953239971516734, + "grad_norm": 0.15890651941299438, + "learning_rate": 0.00013366879439324493, + "loss": 0.3167, + "step": 2057 + }, + { + "epoch": 1.95418941371944, + "grad_norm": 0.14962317049503326, + "learning_rate": 0.00013361260274650906, + "loss": 0.3146, + "step": 2058 + }, + { + "epoch": 1.9551388559221459, + "grad_norm": 5.296742916107178, + "learning_rate": 0.00013355639913222668, + "loss": 0.4622, + "step": 2059 + }, + { + "epoch": 1.9560882981248517, + "grad_norm": 7.226221084594727, + "learning_rate": 0.0001335001835704087, + "loss": 1.4115, + "step": 2060 + }, + { + "epoch": 1.9570377403275576, + "grad_norm": 3.240274667739868, + "learning_rate": 0.00013344395608107031, + "loss": 1.2552, + "step": 2061 + }, + { + "epoch": 1.9579871825302635, + "grad_norm": 2.299501657485962, + "learning_rate": 0.00013338771668423095, + "loss": 0.3784, + "step": 2062 + }, + { + "epoch": 1.9589366247329694, + "grad_norm": 2.4971210956573486, + "learning_rate": 0.00013333146539991431, + "loss": 0.6146, + "step": 2063 + }, + { + "epoch": 1.9598860669356752, + "grad_norm": 1.7239331007003784, + "learning_rate": 0.00013327520224814822, + "loss": 0.4257, + "step": 2064 + }, + { + "epoch": 1.9608355091383811, + "grad_norm": 0.29740026593208313, + "learning_rate": 0.00013321892724896484, + "loss": 0.3187, + "step": 2065 + }, + { + "epoch": 1.961784951341087, + "grad_norm": 8.102334022521973, + "learning_rate": 0.0001331626404224005, + "loss": 0.481, + "step": 2066 + }, + { + "epoch": 1.9627343935437929, + "grad_norm": 0.29957181215286255, + "learning_rate": 0.0001331063417884958, + "loss": 0.3117, + "step": 2067 + }, + { + "epoch": 1.963683835746499, + "grad_norm": 2.238389730453491, + "learning_rate": 0.00013305003136729552, + "loss": 0.3736, + "step": 2068 + }, + { + "epoch": 1.9646332779492048, + "grad_norm": 0.37475112080574036, + "learning_rate": 0.0001329937091788485, + "loss": 0.3093, + "step": 2069 + }, + { + "epoch": 1.9655827201519107, + "grad_norm": 1.1860514879226685, + "learning_rate": 0.00013293737524320797, + "loss": 0.3951, + "step": 2070 + }, + { + "epoch": 1.9665321623546168, + "grad_norm": 0.2817871868610382, + "learning_rate": 0.00013288102958043126, + "loss": 0.3127, + "step": 2071 + }, + { + "epoch": 1.9674816045573227, + "grad_norm": 0.3843158483505249, + "learning_rate": 0.00013282467221057984, + "loss": 0.2984, + "step": 2072 + }, + { + "epoch": 1.9684310467600286, + "grad_norm": 0.40091991424560547, + "learning_rate": 0.0001327683031537194, + "loss": 0.2966, + "step": 2073 + }, + { + "epoch": 1.9693804889627344, + "grad_norm": 0.34588465094566345, + "learning_rate": 0.00013271192242991976, + "loss": 0.3163, + "step": 2074 + }, + { + "epoch": 1.9703299311654403, + "grad_norm": 0.19917060434818268, + "learning_rate": 0.00013265553005925492, + "loss": 0.3001, + "step": 2075 + }, + { + "epoch": 1.9712793733681462, + "grad_norm": 0.17843176424503326, + "learning_rate": 0.00013259912606180301, + "loss": 0.3018, + "step": 2076 + }, + { + "epoch": 1.972228815570852, + "grad_norm": 0.10518278181552887, + "learning_rate": 0.00013254271045764636, + "loss": 0.2883, + "step": 2077 + }, + { + "epoch": 1.973178257773558, + "grad_norm": 0.16444529592990875, + "learning_rate": 0.00013248628326687124, + "loss": 0.3041, + "step": 2078 + }, + { + "epoch": 1.9741276999762638, + "grad_norm": 0.1926691085100174, + "learning_rate": 0.00013242984450956828, + "loss": 0.2763, + "step": 2079 + }, + { + "epoch": 1.97507714217897, + "grad_norm": 0.24896161258220673, + "learning_rate": 0.00013237339420583212, + "loss": 0.2895, + "step": 2080 + }, + { + "epoch": 1.9760265843816758, + "grad_norm": 0.23915739357471466, + "learning_rate": 0.00013231693237576148, + "loss": 0.2901, + "step": 2081 + }, + { + "epoch": 1.9769760265843817, + "grad_norm": 0.08436968922615051, + "learning_rate": 0.00013226045903945926, + "loss": 0.278, + "step": 2082 + }, + { + "epoch": 1.9779254687870877, + "grad_norm": 0.9301303625106812, + "learning_rate": 0.00013220397421703247, + "loss": 0.316, + "step": 2083 + }, + { + "epoch": 1.9788749109897936, + "grad_norm": 1.2519832849502563, + "learning_rate": 0.00013214747792859201, + "loss": 0.2931, + "step": 2084 + }, + { + "epoch": 1.9798243531924995, + "grad_norm": 0.4805239140987396, + "learning_rate": 0.00013209097019425316, + "loss": 0.4146, + "step": 2085 + }, + { + "epoch": 1.9807737953952054, + "grad_norm": 0.21979232132434845, + "learning_rate": 0.00013203445103413507, + "loss": 0.3, + "step": 2086 + }, + { + "epoch": 1.9817232375979112, + "grad_norm": 0.1640891432762146, + "learning_rate": 0.000131977920468361, + "loss": 0.2969, + "step": 2087 + }, + { + "epoch": 1.9826726798006171, + "grad_norm": 0.17019522190093994, + "learning_rate": 0.0001319213785170583, + "loss": 0.2914, + "step": 2088 + }, + { + "epoch": 1.983622122003323, + "grad_norm": 0.09475825726985931, + "learning_rate": 0.00013186482520035839, + "loss": 0.297, + "step": 2089 + }, + { + "epoch": 1.9845715642060289, + "grad_norm": 0.09213607013225555, + "learning_rate": 0.00013180826053839668, + "loss": 0.288, + "step": 2090 + }, + { + "epoch": 1.9855210064087347, + "grad_norm": 0.11374935507774353, + "learning_rate": 0.00013175168455131263, + "loss": 0.2796, + "step": 2091 + }, + { + "epoch": 1.9864704486114406, + "grad_norm": 0.10812429338693619, + "learning_rate": 0.0001316950972592498, + "loss": 0.3057, + "step": 2092 + }, + { + "epoch": 1.9874198908141467, + "grad_norm": 0.07910951226949692, + "learning_rate": 0.00013163849868235564, + "loss": 0.2877, + "step": 2093 + }, + { + "epoch": 1.9883693330168526, + "grad_norm": 0.09240693598985672, + "learning_rate": 0.00013158188884078182, + "loss": 0.2906, + "step": 2094 + }, + { + "epoch": 1.9893187752195585, + "grad_norm": 0.097608283162117, + "learning_rate": 0.00013152526775468378, + "loss": 0.2906, + "step": 2095 + }, + { + "epoch": 1.9902682174222646, + "grad_norm": 0.2190292775630951, + "learning_rate": 0.00013146863544422118, + "loss": 0.2835, + "step": 2096 + }, + { + "epoch": 1.9912176596249704, + "grad_norm": 0.07066213339567184, + "learning_rate": 0.00013141199192955751, + "loss": 0.2856, + "step": 2097 + }, + { + "epoch": 1.9921671018276763, + "grad_norm": 0.10716898739337921, + "learning_rate": 0.0001313553372308604, + "loss": 0.3012, + "step": 2098 + }, + { + "epoch": 1.9931165440303822, + "grad_norm": 0.07971798628568649, + "learning_rate": 0.00013129867136830127, + "loss": 0.2678, + "step": 2099 + }, + { + "epoch": 1.994065986233088, + "grad_norm": 0.11225918680429459, + "learning_rate": 0.00013124199436205576, + "loss": 0.2799, + "step": 2100 + }, + { + "epoch": 1.995015428435794, + "grad_norm": 0.08741844445466995, + "learning_rate": 0.00013118530623230327, + "loss": 0.284, + "step": 2101 + }, + { + "epoch": 1.9959648706384998, + "grad_norm": 0.07644308358430862, + "learning_rate": 0.00013112860699922722, + "loss": 0.2988, + "step": 2102 + }, + { + "epoch": 1.9969143128412057, + "grad_norm": 0.07610399276018143, + "learning_rate": 0.00013107189668301508, + "loss": 0.2813, + "step": 2103 + }, + { + "epoch": 1.9978637550439116, + "grad_norm": 0.1364275962114334, + "learning_rate": 0.0001310151753038581, + "loss": 0.3006, + "step": 2104 + }, + { + "epoch": 1.9988131972466177, + "grad_norm": 0.06598393619060516, + "learning_rate": 0.0001309584428819516, + "loss": 0.2822, + "step": 2105 + }, + { + "epoch": 1.9997626394493235, + "grad_norm": 0.08182472735643387, + "learning_rate": 0.00013090169943749476, + "loss": 0.2757, + "step": 2106 + }, + { + "epoch": 2.0007120816520296, + "grad_norm": 0.06756250560283661, + "learning_rate": 0.0001308449449906907, + "loss": 0.2619, + "step": 2107 + }, + { + "epoch": 2.0016615238547355, + "grad_norm": 0.05981763079762459, + "learning_rate": 0.00013078817956174656, + "loss": 0.2856, + "step": 2108 + }, + { + "epoch": 2.0026109660574414, + "grad_norm": 0.0790615975856781, + "learning_rate": 0.0001307314031708732, + "loss": 0.2875, + "step": 2109 + }, + { + "epoch": 2.0035604082601473, + "grad_norm": 0.06421328336000443, + "learning_rate": 0.00013067461583828553, + "loss": 0.2683, + "step": 2110 + }, + { + "epoch": 2.004509850462853, + "grad_norm": 0.06607569754123688, + "learning_rate": 0.0001306178175842023, + "loss": 0.271, + "step": 2111 + }, + { + "epoch": 2.005459292665559, + "grad_norm": 0.06524945050477982, + "learning_rate": 0.00013056100842884612, + "loss": 0.2796, + "step": 2112 + }, + { + "epoch": 2.006408734868265, + "grad_norm": 0.05927155539393425, + "learning_rate": 0.00013050418839244355, + "loss": 0.2755, + "step": 2113 + }, + { + "epoch": 2.0073581770709708, + "grad_norm": 0.06408464163541794, + "learning_rate": 0.000130447357495225, + "loss": 0.2748, + "step": 2114 + }, + { + "epoch": 2.0083076192736766, + "grad_norm": 0.05964144691824913, + "learning_rate": 0.0001303905157574247, + "loss": 0.2772, + "step": 2115 + }, + { + "epoch": 2.0092570614763825, + "grad_norm": 0.05294380709528923, + "learning_rate": 0.00013033366319928079, + "loss": 0.2543, + "step": 2116 + }, + { + "epoch": 2.0102065036790884, + "grad_norm": 0.06316480785608292, + "learning_rate": 0.00013027679984103528, + "loss": 0.2659, + "step": 2117 + }, + { + "epoch": 2.0111559458817942, + "grad_norm": 0.0780426636338234, + "learning_rate": 0.000130219925702934, + "loss": 0.2809, + "step": 2118 + }, + { + "epoch": 2.0121053880845006, + "grad_norm": 0.05921616032719612, + "learning_rate": 0.00013016304080522656, + "loss": 0.2651, + "step": 2119 + }, + { + "epoch": 2.0130548302872064, + "grad_norm": 0.063509002327919, + "learning_rate": 0.0001301061451681665, + "loss": 0.2766, + "step": 2120 + }, + { + "epoch": 2.0140042724899123, + "grad_norm": 0.06251564621925354, + "learning_rate": 0.0001300492388120111, + "loss": 0.2826, + "step": 2121 + }, + { + "epoch": 2.014953714692618, + "grad_norm": 0.07721933722496033, + "learning_rate": 0.0001299923217570215, + "loss": 0.2876, + "step": 2122 + }, + { + "epoch": 2.015903156895324, + "grad_norm": 0.22655093669891357, + "learning_rate": 0.0001299353940234627, + "loss": 0.3023, + "step": 2123 + }, + { + "epoch": 2.01685259909803, + "grad_norm": 0.16343270242214203, + "learning_rate": 0.0001298784556316034, + "loss": 0.2902, + "step": 2124 + }, + { + "epoch": 2.017802041300736, + "grad_norm": 0.0674663856625557, + "learning_rate": 0.00012982150660171613, + "loss": 0.2639, + "step": 2125 + }, + { + "epoch": 2.0187514835034417, + "grad_norm": 0.06698331236839294, + "learning_rate": 0.00012976454695407723, + "loss": 0.2918, + "step": 2126 + }, + { + "epoch": 2.0197009257061476, + "grad_norm": 0.05850343778729439, + "learning_rate": 0.00012970757670896683, + "loss": 0.2691, + "step": 2127 + }, + { + "epoch": 2.0206503679088534, + "grad_norm": 0.05704069882631302, + "learning_rate": 0.0001296505958866688, + "loss": 0.2688, + "step": 2128 + }, + { + "epoch": 2.0215998101115593, + "grad_norm": 0.060891758650541306, + "learning_rate": 0.00012959360450747075, + "loss": 0.2652, + "step": 2129 + }, + { + "epoch": 2.022549252314265, + "grad_norm": 0.061691515147686005, + "learning_rate": 0.00012953660259166412, + "loss": 0.2756, + "step": 2130 + }, + { + "epoch": 2.023498694516971, + "grad_norm": 0.059189558029174805, + "learning_rate": 0.00012947959015954406, + "loss": 0.2759, + "step": 2131 + }, + { + "epoch": 2.0244481367196774, + "grad_norm": 0.06447713077068329, + "learning_rate": 0.00012942256723140952, + "loss": 0.2773, + "step": 2132 + }, + { + "epoch": 2.0253975789223833, + "grad_norm": 0.06263953447341919, + "learning_rate": 0.0001293655338275631, + "loss": 0.2867, + "step": 2133 + }, + { + "epoch": 2.026347021125089, + "grad_norm": 0.0576293058693409, + "learning_rate": 0.00012930848996831114, + "loss": 0.2776, + "step": 2134 + }, + { + "epoch": 2.027296463327795, + "grad_norm": 0.05699608847498894, + "learning_rate": 0.00012925143567396374, + "loss": 0.2757, + "step": 2135 + }, + { + "epoch": 2.028245905530501, + "grad_norm": 0.052561014890670776, + "learning_rate": 0.00012919437096483476, + "loss": 0.2555, + "step": 2136 + }, + { + "epoch": 2.0291953477332068, + "grad_norm": 0.053198445588350296, + "learning_rate": 0.00012913729586124165, + "loss": 0.2676, + "step": 2137 + }, + { + "epoch": 2.0301447899359126, + "grad_norm": 0.09329196810722351, + "learning_rate": 0.00012908021038350568, + "loss": 0.2796, + "step": 2138 + }, + { + "epoch": 2.0310942321386185, + "grad_norm": 0.07239534705877304, + "learning_rate": 0.00012902311455195172, + "loss": 0.2809, + "step": 2139 + }, + { + "epoch": 2.0320436743413244, + "grad_norm": 0.06299670785665512, + "learning_rate": 0.00012896600838690838, + "loss": 0.2672, + "step": 2140 + }, + { + "epoch": 2.0329931165440303, + "grad_norm": 0.05467437952756882, + "learning_rate": 0.00012890889190870795, + "loss": 0.268, + "step": 2141 + }, + { + "epoch": 2.033942558746736, + "grad_norm": 0.0641472190618515, + "learning_rate": 0.00012885176513768637, + "loss": 0.2844, + "step": 2142 + }, + { + "epoch": 2.034892000949442, + "grad_norm": 0.06481951475143433, + "learning_rate": 0.00012879462809418325, + "loss": 0.2883, + "step": 2143 + }, + { + "epoch": 2.0358414431521483, + "grad_norm": 0.05889345332980156, + "learning_rate": 0.0001287374807985418, + "loss": 0.2688, + "step": 2144 + }, + { + "epoch": 2.036790885354854, + "grad_norm": 0.05446067079901695, + "learning_rate": 0.00012868032327110904, + "loss": 0.2699, + "step": 2145 + }, + { + "epoch": 2.03774032755756, + "grad_norm": 0.0558142326772213, + "learning_rate": 0.00012862315553223547, + "loss": 0.2662, + "step": 2146 + }, + { + "epoch": 2.038689769760266, + "grad_norm": 0.05485325679183006, + "learning_rate": 0.0001285659776022753, + "loss": 0.2684, + "step": 2147 + }, + { + "epoch": 2.039639211962972, + "grad_norm": 0.05541551858186722, + "learning_rate": 0.0001285087895015864, + "loss": 0.2674, + "step": 2148 + }, + { + "epoch": 2.0405886541656777, + "grad_norm": 0.10139881074428558, + "learning_rate": 0.0001284515912505301, + "loss": 0.2737, + "step": 2149 + }, + { + "epoch": 2.0415380963683836, + "grad_norm": 0.05179375782608986, + "learning_rate": 0.00012839438286947163, + "loss": 0.2647, + "step": 2150 + }, + { + "epoch": 2.0424875385710894, + "grad_norm": 0.0590873584151268, + "learning_rate": 0.0001283371643787795, + "loss": 0.2743, + "step": 2151 + }, + { + "epoch": 2.0434369807737953, + "grad_norm": 0.0546240359544754, + "learning_rate": 0.00012827993579882612, + "loss": 0.2798, + "step": 2152 + }, + { + "epoch": 2.044386422976501, + "grad_norm": 0.056896887719631195, + "learning_rate": 0.0001282226971499872, + "loss": 0.2717, + "step": 2153 + }, + { + "epoch": 2.045335865179207, + "grad_norm": 0.052284859120845795, + "learning_rate": 0.00012816544845264228, + "loss": 0.2719, + "step": 2154 + }, + { + "epoch": 2.046285307381913, + "grad_norm": 0.060961298644542694, + "learning_rate": 0.0001281081897271744, + "loss": 0.2764, + "step": 2155 + }, + { + "epoch": 2.047234749584619, + "grad_norm": 0.08830570429563522, + "learning_rate": 0.0001280509209939701, + "loss": 0.2946, + "step": 2156 + }, + { + "epoch": 2.048184191787325, + "grad_norm": 0.05548688769340515, + "learning_rate": 0.00012799364227341955, + "loss": 0.2647, + "step": 2157 + }, + { + "epoch": 2.049133633990031, + "grad_norm": 0.05134082958102226, + "learning_rate": 0.00012793635358591645, + "loss": 0.2724, + "step": 2158 + }, + { + "epoch": 2.050083076192737, + "grad_norm": 0.06974118202924728, + "learning_rate": 0.0001278790549518581, + "loss": 0.2873, + "step": 2159 + }, + { + "epoch": 2.0510325183954428, + "grad_norm": 0.06583964079618454, + "learning_rate": 0.0001278217463916453, + "loss": 0.2823, + "step": 2160 + }, + { + "epoch": 2.0519819605981486, + "grad_norm": 0.05401783436536789, + "learning_rate": 0.00012776442792568232, + "loss": 0.2788, + "step": 2161 + }, + { + "epoch": 2.0529314028008545, + "grad_norm": 0.09343112260103226, + "learning_rate": 0.00012770709957437708, + "loss": 0.2824, + "step": 2162 + }, + { + "epoch": 2.0538808450035604, + "grad_norm": 0.11026190966367722, + "learning_rate": 0.00012764976135814094, + "loss": 0.2861, + "step": 2163 + }, + { + "epoch": 2.0548302872062663, + "grad_norm": 0.05160842835903168, + "learning_rate": 0.00012759241329738887, + "loss": 0.2615, + "step": 2164 + }, + { + "epoch": 2.055779729408972, + "grad_norm": 0.057216208428144455, + "learning_rate": 0.00012753505541253916, + "loss": 0.2757, + "step": 2165 + }, + { + "epoch": 2.056729171611678, + "grad_norm": 0.07923352718353271, + "learning_rate": 0.00012747768772401378, + "loss": 0.2659, + "step": 2166 + }, + { + "epoch": 2.057678613814384, + "grad_norm": 0.055502623319625854, + "learning_rate": 0.0001274203102522381, + "loss": 0.2757, + "step": 2167 + }, + { + "epoch": 2.0586280560170898, + "grad_norm": 0.10472196340560913, + "learning_rate": 0.00012736292301764098, + "loss": 0.2938, + "step": 2168 + }, + { + "epoch": 2.059577498219796, + "grad_norm": 0.1105305552482605, + "learning_rate": 0.00012730552604065475, + "loss": 0.2846, + "step": 2169 + }, + { + "epoch": 2.060526940422502, + "grad_norm": 0.06079312413930893, + "learning_rate": 0.0001272481193417153, + "loss": 0.2724, + "step": 2170 + }, + { + "epoch": 2.061476382625208, + "grad_norm": 0.06276509910821915, + "learning_rate": 0.00012719070294126182, + "loss": 0.2704, + "step": 2171 + }, + { + "epoch": 2.0624258248279137, + "grad_norm": 0.08746016025543213, + "learning_rate": 0.00012713327685973707, + "loss": 0.2834, + "step": 2172 + }, + { + "epoch": 2.0633752670306196, + "grad_norm": 0.053869761526584625, + "learning_rate": 0.0001270758411175873, + "loss": 0.2712, + "step": 2173 + }, + { + "epoch": 2.0643247092333254, + "grad_norm": 0.05118397995829582, + "learning_rate": 0.00012701839573526206, + "loss": 0.2737, + "step": 2174 + }, + { + "epoch": 2.0652741514360313, + "grad_norm": 0.05905655771493912, + "learning_rate": 0.0001269609407332144, + "loss": 0.2663, + "step": 2175 + }, + { + "epoch": 2.066223593638737, + "grad_norm": 0.049641139805316925, + "learning_rate": 0.00012690347613190082, + "loss": 0.263, + "step": 2176 + }, + { + "epoch": 2.067173035841443, + "grad_norm": 0.04823688417673111, + "learning_rate": 0.00012684600195178117, + "loss": 0.2667, + "step": 2177 + }, + { + "epoch": 2.068122478044149, + "grad_norm": 0.07979489117860794, + "learning_rate": 0.00012678851821331882, + "loss": 0.2854, + "step": 2178 + }, + { + "epoch": 2.069071920246855, + "grad_norm": 0.06123083457350731, + "learning_rate": 0.00012673102493698042, + "loss": 0.2832, + "step": 2179 + }, + { + "epoch": 2.0700213624495607, + "grad_norm": 0.07498030364513397, + "learning_rate": 0.00012667352214323614, + "loss": 0.3061, + "step": 2180 + }, + { + "epoch": 2.0709708046522666, + "grad_norm": 0.059050336480140686, + "learning_rate": 0.0001266160098525594, + "loss": 0.2623, + "step": 2181 + }, + { + "epoch": 2.071920246854973, + "grad_norm": 0.060739047825336456, + "learning_rate": 0.00012655848808542709, + "loss": 0.282, + "step": 2182 + }, + { + "epoch": 2.0728696890576788, + "grad_norm": 0.059133414179086685, + "learning_rate": 0.00012650095686231953, + "loss": 0.2637, + "step": 2183 + }, + { + "epoch": 2.0738191312603846, + "grad_norm": 0.05270388349890709, + "learning_rate": 0.00012644341620372023, + "loss": 0.2742, + "step": 2184 + }, + { + "epoch": 2.0747685734630905, + "grad_norm": 0.049184754490852356, + "learning_rate": 0.00012638586613011624, + "loss": 0.2582, + "step": 2185 + }, + { + "epoch": 2.0757180156657964, + "grad_norm": 0.05757623910903931, + "learning_rate": 0.0001263283066619978, + "loss": 0.2672, + "step": 2186 + }, + { + "epoch": 2.0766674578685023, + "grad_norm": 0.051976773887872696, + "learning_rate": 0.0001262707378198587, + "loss": 0.2769, + "step": 2187 + }, + { + "epoch": 2.077616900071208, + "grad_norm": 0.04786711558699608, + "learning_rate": 0.00012621315962419585, + "loss": 0.2661, + "step": 2188 + }, + { + "epoch": 2.078566342273914, + "grad_norm": 0.0624409057199955, + "learning_rate": 0.00012615557209550967, + "loss": 0.2867, + "step": 2189 + }, + { + "epoch": 2.07951578447662, + "grad_norm": 0.05563337355852127, + "learning_rate": 0.00012609797525430373, + "loss": 0.2778, + "step": 2190 + }, + { + "epoch": 2.0804652266793258, + "grad_norm": 0.04968985542654991, + "learning_rate": 0.00012604036912108505, + "loss": 0.2562, + "step": 2191 + }, + { + "epoch": 2.0814146688820316, + "grad_norm": 0.05211299657821655, + "learning_rate": 0.00012598275371636394, + "loss": 0.2746, + "step": 2192 + }, + { + "epoch": 2.0823641110847375, + "grad_norm": 0.0466628223657608, + "learning_rate": 0.00012592512906065397, + "loss": 0.2654, + "step": 2193 + }, + { + "epoch": 2.083313553287444, + "grad_norm": 0.056648485362529755, + "learning_rate": 0.000125867495174472, + "loss": 0.28, + "step": 2194 + }, + { + "epoch": 2.0842629954901497, + "grad_norm": 0.056760817766189575, + "learning_rate": 0.0001258098520783382, + "loss": 0.2732, + "step": 2195 + }, + { + "epoch": 2.0852124376928556, + "grad_norm": 0.05097498744726181, + "learning_rate": 0.00012575219979277602, + "loss": 0.261, + "step": 2196 + }, + { + "epoch": 2.0861618798955615, + "grad_norm": 0.05032607540488243, + "learning_rate": 0.00012569453833831222, + "loss": 0.2769, + "step": 2197 + }, + { + "epoch": 2.0871113220982673, + "grad_norm": 0.04438967630267143, + "learning_rate": 0.00012563686773547675, + "loss": 0.2561, + "step": 2198 + }, + { + "epoch": 2.088060764300973, + "grad_norm": 0.05397673696279526, + "learning_rate": 0.00012557918800480282, + "loss": 0.2712, + "step": 2199 + }, + { + "epoch": 2.089010206503679, + "grad_norm": 0.05158831924200058, + "learning_rate": 0.00012552149916682695, + "loss": 0.2685, + "step": 2200 + }, + { + "epoch": 2.089959648706385, + "grad_norm": 0.06279024481773376, + "learning_rate": 0.00012546380124208887, + "loss": 0.2722, + "step": 2201 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.04665720462799072, + "learning_rate": 0.00012540609425113156, + "loss": 0.2604, + "step": 2202 + }, + { + "epoch": 2.0918585331117967, + "grad_norm": 0.059546615928411484, + "learning_rate": 0.00012534837821450117, + "loss": 0.2721, + "step": 2203 + }, + { + "epoch": 2.0928079753145026, + "grad_norm": 0.0592176578938961, + "learning_rate": 0.0001252906531527472, + "loss": 0.2716, + "step": 2204 + }, + { + "epoch": 2.0937574175172085, + "grad_norm": 0.04968995600938797, + "learning_rate": 0.00012523291908642217, + "loss": 0.2474, + "step": 2205 + }, + { + "epoch": 2.0947068597199143, + "grad_norm": 0.052708033472299576, + "learning_rate": 0.00012517517603608203, + "loss": 0.2668, + "step": 2206 + }, + { + "epoch": 2.0956563019226206, + "grad_norm": 0.06978727877140045, + "learning_rate": 0.0001251174240222857, + "loss": 0.2729, + "step": 2207 + }, + { + "epoch": 2.0966057441253265, + "grad_norm": 0.061792004853487015, + "learning_rate": 0.0001250596630655955, + "loss": 0.2706, + "step": 2208 + }, + { + "epoch": 2.0975551863280324, + "grad_norm": 0.05177111551165581, + "learning_rate": 0.00012500189318657675, + "loss": 0.2759, + "step": 2209 + }, + { + "epoch": 2.0985046285307383, + "grad_norm": 0.05225459113717079, + "learning_rate": 0.00012494411440579814, + "loss": 0.2662, + "step": 2210 + }, + { + "epoch": 2.099454070733444, + "grad_norm": 0.046468012034893036, + "learning_rate": 0.00012488632674383134, + "loss": 0.2712, + "step": 2211 + }, + { + "epoch": 2.10040351293615, + "grad_norm": 0.044963154941797256, + "learning_rate": 0.00012482853022125132, + "loss": 0.2685, + "step": 2212 + } + ], + "logging_steps": 1, + "max_steps": 5265, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 158, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2839911292489564e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}