{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007722007722007722, "grad_norm": 56.5, "learning_rate": 3.2467532467532474e-08, "loss": 1.4466, "step": 1 }, { "epoch": 0.0015444015444015444, "grad_norm": 27.25, "learning_rate": 6.493506493506495e-08, "loss": 1.4423, "step": 2 }, { "epoch": 0.0023166023166023165, "grad_norm": 12.6875, "learning_rate": 9.74025974025974e-08, "loss": 1.2234, "step": 3 }, { "epoch": 0.003088803088803089, "grad_norm": 13.75, "learning_rate": 1.298701298701299e-07, "loss": 1.2594, "step": 4 }, { "epoch": 0.003861003861003861, "grad_norm": 17.0, "learning_rate": 1.6233766233766232e-07, "loss": 1.3921, "step": 5 }, { "epoch": 0.004633204633204633, "grad_norm": 35.75, "learning_rate": 1.948051948051948e-07, "loss": 1.1142, "step": 6 }, { "epoch": 0.005405405405405406, "grad_norm": 55.25, "learning_rate": 2.2727272727272729e-07, "loss": 1.3251, "step": 7 }, { "epoch": 0.006177606177606178, "grad_norm": 23.875, "learning_rate": 2.597402597402598e-07, "loss": 1.3841, "step": 8 }, { "epoch": 0.0069498069498069494, "grad_norm": 9.6875, "learning_rate": 2.9220779220779225e-07, "loss": 1.0753, "step": 9 }, { "epoch": 0.007722007722007722, "grad_norm": 19.875, "learning_rate": 3.2467532467532465e-07, "loss": 1.3261, "step": 10 }, { "epoch": 0.008494208494208495, "grad_norm": 45.75, "learning_rate": 3.5714285714285716e-07, "loss": 1.4809, "step": 11 }, { "epoch": 0.009266409266409266, "grad_norm": 60.75, "learning_rate": 3.896103896103896e-07, "loss": 1.2285, "step": 12 }, { "epoch": 0.010038610038610039, "grad_norm": 28.875, "learning_rate": 4.220779220779221e-07, "loss": 1.3495, "step": 13 }, { "epoch": 0.010810810810810811, "grad_norm": 48.25, "learning_rate": 4.5454545454545457e-07, "loss": 1.2391, "step": 14 }, { "epoch": 0.011583011583011582, "grad_norm": 12.9375, "learning_rate": 4.870129870129871e-07, "loss": 1.3978, "step": 15 }, { "epoch": 0.012355212355212355, "grad_norm": 37.25, "learning_rate": 5.194805194805196e-07, "loss": 1.4953, "step": 16 }, { "epoch": 0.013127413127413128, "grad_norm": 16.625, "learning_rate": 5.51948051948052e-07, "loss": 1.3271, "step": 17 }, { "epoch": 0.013899613899613899, "grad_norm": 14.0625, "learning_rate": 5.844155844155845e-07, "loss": 1.5048, "step": 18 }, { "epoch": 0.014671814671814672, "grad_norm": 42.0, "learning_rate": 6.168831168831169e-07, "loss": 1.2461, "step": 19 }, { "epoch": 0.015444015444015444, "grad_norm": 14.375, "learning_rate": 6.493506493506493e-07, "loss": 1.3821, "step": 20 }, { "epoch": 0.016216216216216217, "grad_norm": 17.25, "learning_rate": 6.818181818181818e-07, "loss": 1.4612, "step": 21 }, { "epoch": 0.01698841698841699, "grad_norm": 15.6875, "learning_rate": 7.142857142857143e-07, "loss": 1.2628, "step": 22 }, { "epoch": 0.01776061776061776, "grad_norm": 13.1875, "learning_rate": 7.467532467532468e-07, "loss": 1.3607, "step": 23 }, { "epoch": 0.018532818532818532, "grad_norm": 14.3125, "learning_rate": 7.792207792207792e-07, "loss": 1.3691, "step": 24 }, { "epoch": 0.019305019305019305, "grad_norm": 41.5, "learning_rate": 8.116883116883117e-07, "loss": 1.3236, "step": 25 }, { "epoch": 0.020077220077220077, "grad_norm": 12.0625, "learning_rate": 8.441558441558442e-07, "loss": 1.2468, "step": 26 }, { "epoch": 0.02084942084942085, "grad_norm": 21.625, "learning_rate": 8.766233766233767e-07, "loss": 1.2418, "step": 27 }, { "epoch": 0.021621621621621623, "grad_norm": 17.375, "learning_rate": 9.090909090909091e-07, "loss": 1.4311, "step": 28 }, { "epoch": 0.022393822393822392, "grad_norm": 31.625, "learning_rate": 9.415584415584417e-07, "loss": 1.4924, "step": 29 }, { "epoch": 0.023166023166023165, "grad_norm": 12.625, "learning_rate": 9.740259740259742e-07, "loss": 1.4778, "step": 30 }, { "epoch": 0.023938223938223938, "grad_norm": 34.5, "learning_rate": 1.0064935064935066e-06, "loss": 1.2457, "step": 31 }, { "epoch": 0.02471042471042471, "grad_norm": 13.0625, "learning_rate": 1.0389610389610392e-06, "loss": 1.3357, "step": 32 }, { "epoch": 0.025482625482625483, "grad_norm": 16.5, "learning_rate": 1.0714285714285714e-06, "loss": 1.2675, "step": 33 }, { "epoch": 0.026254826254826256, "grad_norm": 12.875, "learning_rate": 1.103896103896104e-06, "loss": 1.3332, "step": 34 }, { "epoch": 0.02702702702702703, "grad_norm": 21.875, "learning_rate": 1.1363636363636364e-06, "loss": 1.3432, "step": 35 }, { "epoch": 0.027799227799227798, "grad_norm": 26.875, "learning_rate": 1.168831168831169e-06, "loss": 1.3359, "step": 36 }, { "epoch": 0.02857142857142857, "grad_norm": 8.375, "learning_rate": 1.2012987012987014e-06, "loss": 1.1774, "step": 37 }, { "epoch": 0.029343629343629343, "grad_norm": 29.25, "learning_rate": 1.2337662337662338e-06, "loss": 1.2207, "step": 38 }, { "epoch": 0.030115830115830116, "grad_norm": 8.6875, "learning_rate": 1.2662337662337662e-06, "loss": 1.2534, "step": 39 }, { "epoch": 0.03088803088803089, "grad_norm": 12.5, "learning_rate": 1.2987012987012986e-06, "loss": 1.3108, "step": 40 }, { "epoch": 0.03166023166023166, "grad_norm": 10.6875, "learning_rate": 1.3311688311688312e-06, "loss": 1.2418, "step": 41 }, { "epoch": 0.032432432432432434, "grad_norm": 34.25, "learning_rate": 1.3636363636363636e-06, "loss": 1.244, "step": 42 }, { "epoch": 0.033204633204633204, "grad_norm": 12.6875, "learning_rate": 1.3961038961038962e-06, "loss": 1.17, "step": 43 }, { "epoch": 0.03397683397683398, "grad_norm": 7.40625, "learning_rate": 1.4285714285714286e-06, "loss": 1.3112, "step": 44 }, { "epoch": 0.03474903474903475, "grad_norm": 43.5, "learning_rate": 1.461038961038961e-06, "loss": 1.2921, "step": 45 }, { "epoch": 0.03552123552123552, "grad_norm": 8.9375, "learning_rate": 1.4935064935064936e-06, "loss": 1.2722, "step": 46 }, { "epoch": 0.036293436293436294, "grad_norm": 22.375, "learning_rate": 1.525974025974026e-06, "loss": 1.3692, "step": 47 }, { "epoch": 0.037065637065637064, "grad_norm": 8.875, "learning_rate": 1.5584415584415584e-06, "loss": 1.1782, "step": 48 }, { "epoch": 0.03783783783783784, "grad_norm": 12.0, "learning_rate": 1.590909090909091e-06, "loss": 1.1099, "step": 49 }, { "epoch": 0.03861003861003861, "grad_norm": 18.125, "learning_rate": 1.6233766233766235e-06, "loss": 1.1888, "step": 50 }, { "epoch": 0.039382239382239385, "grad_norm": 8.375, "learning_rate": 1.655844155844156e-06, "loss": 1.0458, "step": 51 }, { "epoch": 0.040154440154440155, "grad_norm": 5.6875, "learning_rate": 1.6883116883116885e-06, "loss": 1.064, "step": 52 }, { "epoch": 0.040926640926640924, "grad_norm": 16.0, "learning_rate": 1.7207792207792209e-06, "loss": 1.1945, "step": 53 }, { "epoch": 0.0416988416988417, "grad_norm": 11.25, "learning_rate": 1.7532467532467535e-06, "loss": 1.3159, "step": 54 }, { "epoch": 0.04247104247104247, "grad_norm": 16.75, "learning_rate": 1.7857142857142859e-06, "loss": 1.1229, "step": 55 }, { "epoch": 0.043243243243243246, "grad_norm": 38.75, "learning_rate": 1.8181818181818183e-06, "loss": 1.3574, "step": 56 }, { "epoch": 0.044015444015444015, "grad_norm": 6.09375, "learning_rate": 1.850649350649351e-06, "loss": 1.1454, "step": 57 }, { "epoch": 0.044787644787644784, "grad_norm": 8.3125, "learning_rate": 1.8831168831168833e-06, "loss": 1.1335, "step": 58 }, { "epoch": 0.04555984555984556, "grad_norm": 5.625, "learning_rate": 1.9155844155844157e-06, "loss": 1.1644, "step": 59 }, { "epoch": 0.04633204633204633, "grad_norm": 6.34375, "learning_rate": 1.9480519480519483e-06, "loss": 1.0538, "step": 60 }, { "epoch": 0.047104247104247106, "grad_norm": 20.75, "learning_rate": 1.980519480519481e-06, "loss": 1.0306, "step": 61 }, { "epoch": 0.047876447876447875, "grad_norm": 12.375, "learning_rate": 2.012987012987013e-06, "loss": 1.2176, "step": 62 }, { "epoch": 0.04864864864864865, "grad_norm": 7.3125, "learning_rate": 2.0454545454545457e-06, "loss": 1.2399, "step": 63 }, { "epoch": 0.04942084942084942, "grad_norm": 23.125, "learning_rate": 2.0779220779220784e-06, "loss": 1.2174, "step": 64 }, { "epoch": 0.05019305019305019, "grad_norm": 6.3125, "learning_rate": 2.1103896103896105e-06, "loss": 1.1111, "step": 65 }, { "epoch": 0.050965250965250966, "grad_norm": 8.875, "learning_rate": 2.1428571428571427e-06, "loss": 1.1853, "step": 66 }, { "epoch": 0.051737451737451735, "grad_norm": 23.875, "learning_rate": 2.1753246753246753e-06, "loss": 1.1374, "step": 67 }, { "epoch": 0.05250965250965251, "grad_norm": 53.25, "learning_rate": 2.207792207792208e-06, "loss": 1.3138, "step": 68 }, { "epoch": 0.05328185328185328, "grad_norm": 9.5, "learning_rate": 2.24025974025974e-06, "loss": 1.2263, "step": 69 }, { "epoch": 0.05405405405405406, "grad_norm": 19.375, "learning_rate": 2.2727272727272728e-06, "loss": 1.0593, "step": 70 }, { "epoch": 0.054826254826254826, "grad_norm": 15.375, "learning_rate": 2.3051948051948054e-06, "loss": 1.1565, "step": 71 }, { "epoch": 0.055598455598455596, "grad_norm": 7.90625, "learning_rate": 2.337662337662338e-06, "loss": 1.1925, "step": 72 }, { "epoch": 0.05637065637065637, "grad_norm": 5.46875, "learning_rate": 2.37012987012987e-06, "loss": 1.2136, "step": 73 }, { "epoch": 0.05714285714285714, "grad_norm": 16.75, "learning_rate": 2.402597402597403e-06, "loss": 1.1775, "step": 74 }, { "epoch": 0.05791505791505792, "grad_norm": 23.125, "learning_rate": 2.4350649350649354e-06, "loss": 1.2065, "step": 75 }, { "epoch": 0.05868725868725869, "grad_norm": 6.15625, "learning_rate": 2.4675324675324676e-06, "loss": 1.1728, "step": 76 }, { "epoch": 0.05945945945945946, "grad_norm": 7.0625, "learning_rate": 2.5e-06, "loss": 1.1926, "step": 77 }, { "epoch": 0.06023166023166023, "grad_norm": 18.875, "learning_rate": 2.499999023224565e-06, "loss": 1.2329, "step": 78 }, { "epoch": 0.061003861003861, "grad_norm": 7.46875, "learning_rate": 2.4999960928997866e-06, "loss": 1.1095, "step": 79 }, { "epoch": 0.06177606177606178, "grad_norm": 7.84375, "learning_rate": 2.499991209030244e-06, "loss": 0.9439, "step": 80 }, { "epoch": 0.06254826254826255, "grad_norm": 9.0, "learning_rate": 2.4999843716235704e-06, "loss": 1.1695, "step": 81 }, { "epoch": 0.06332046332046332, "grad_norm": 25.375, "learning_rate": 2.4999755806904514e-06, "loss": 1.1446, "step": 82 }, { "epoch": 0.06409266409266409, "grad_norm": 9.4375, "learning_rate": 2.4999648362446258e-06, "loss": 1.1367, "step": 83 }, { "epoch": 0.06486486486486487, "grad_norm": 5.75, "learning_rate": 2.499952138302885e-06, "loss": 0.9475, "step": 84 }, { "epoch": 0.06563706563706563, "grad_norm": 13.5, "learning_rate": 2.499937486885075e-06, "loss": 1.0359, "step": 85 }, { "epoch": 0.06640926640926641, "grad_norm": 8.8125, "learning_rate": 2.4999208820140926e-06, "loss": 1.1438, "step": 86 }, { "epoch": 0.06718146718146718, "grad_norm": 7.1875, "learning_rate": 2.4999023237158894e-06, "loss": 1.2295, "step": 87 }, { "epoch": 0.06795366795366796, "grad_norm": 4.03125, "learning_rate": 2.499881812019468e-06, "loss": 1.0299, "step": 88 }, { "epoch": 0.06872586872586872, "grad_norm": 3.546875, "learning_rate": 2.4998593469568856e-06, "loss": 0.9298, "step": 89 }, { "epoch": 0.0694980694980695, "grad_norm": 6.46875, "learning_rate": 2.4998349285632518e-06, "loss": 0.9896, "step": 90 }, { "epoch": 0.07027027027027027, "grad_norm": 4.75, "learning_rate": 2.499808556876728e-06, "loss": 1.0663, "step": 91 }, { "epoch": 0.07104247104247104, "grad_norm": 6.1875, "learning_rate": 2.499780231938529e-06, "loss": 1.0716, "step": 92 }, { "epoch": 0.07181467181467181, "grad_norm": 7.59375, "learning_rate": 2.499749953792923e-06, "loss": 1.1818, "step": 93 }, { "epoch": 0.07258687258687259, "grad_norm": 6.40625, "learning_rate": 2.4997177224872287e-06, "loss": 1.1997, "step": 94 }, { "epoch": 0.07335907335907337, "grad_norm": 4.625, "learning_rate": 2.4996835380718197e-06, "loss": 1.0715, "step": 95 }, { "epoch": 0.07413127413127413, "grad_norm": 6.625, "learning_rate": 2.4996474006001196e-06, "loss": 1.0548, "step": 96 }, { "epoch": 0.0749034749034749, "grad_norm": 6.25, "learning_rate": 2.4996093101286066e-06, "loss": 1.0603, "step": 97 }, { "epoch": 0.07567567567567568, "grad_norm": 22.875, "learning_rate": 2.499569266716809e-06, "loss": 1.1109, "step": 98 }, { "epoch": 0.07644787644787644, "grad_norm": 6.8125, "learning_rate": 2.4995272704273095e-06, "loss": 1.0048, "step": 99 }, { "epoch": 0.07722007722007722, "grad_norm": 10.4375, "learning_rate": 2.499483321325741e-06, "loss": 1.0712, "step": 100 }, { "epoch": 0.077992277992278, "grad_norm": 13.75, "learning_rate": 2.499437419480788e-06, "loss": 1.0816, "step": 101 }, { "epoch": 0.07876447876447877, "grad_norm": 5.78125, "learning_rate": 2.499389564964189e-06, "loss": 1.0226, "step": 102 }, { "epoch": 0.07953667953667953, "grad_norm": 5.40625, "learning_rate": 2.4993397578507333e-06, "loss": 0.8815, "step": 103 }, { "epoch": 0.08030888030888031, "grad_norm": 6.5, "learning_rate": 2.49928799821826e-06, "loss": 1.1842, "step": 104 }, { "epoch": 0.08108108108108109, "grad_norm": 4.5625, "learning_rate": 2.4992342861476626e-06, "loss": 1.103, "step": 105 }, { "epoch": 0.08185328185328185, "grad_norm": 6.4375, "learning_rate": 2.4991786217228837e-06, "loss": 1.1377, "step": 106 }, { "epoch": 0.08262548262548262, "grad_norm": 4.34375, "learning_rate": 2.4991210050309178e-06, "loss": 1.0884, "step": 107 }, { "epoch": 0.0833976833976834, "grad_norm": 8.5, "learning_rate": 2.499061436161811e-06, "loss": 1.0684, "step": 108 }, { "epoch": 0.08416988416988418, "grad_norm": 6.8125, "learning_rate": 2.49899991520866e-06, "loss": 1.0401, "step": 109 }, { "epoch": 0.08494208494208494, "grad_norm": 21.375, "learning_rate": 2.498936442267612e-06, "loss": 1.0401, "step": 110 }, { "epoch": 0.08571428571428572, "grad_norm": 4.6875, "learning_rate": 2.4988710174378658e-06, "loss": 1.0732, "step": 111 }, { "epoch": 0.08648648648648649, "grad_norm": 4.3125, "learning_rate": 2.498803640821669e-06, "loss": 1.0263, "step": 112 }, { "epoch": 0.08725868725868725, "grad_norm": 4.34375, "learning_rate": 2.498734312524321e-06, "loss": 1.0192, "step": 113 }, { "epoch": 0.08803088803088803, "grad_norm": 13.1875, "learning_rate": 2.4986630326541704e-06, "loss": 1.1032, "step": 114 }, { "epoch": 0.0888030888030888, "grad_norm": 4.3125, "learning_rate": 2.4985898013226173e-06, "loss": 1.1008, "step": 115 }, { "epoch": 0.08957528957528957, "grad_norm": 10.625, "learning_rate": 2.49851461864411e-06, "loss": 0.9732, "step": 116 }, { "epoch": 0.09034749034749034, "grad_norm": 4.21875, "learning_rate": 2.4984374847361467e-06, "loss": 0.9061, "step": 117 }, { "epoch": 0.09111969111969112, "grad_norm": 4.34375, "learning_rate": 2.498358399719276e-06, "loss": 0.9201, "step": 118 }, { "epoch": 0.0918918918918919, "grad_norm": 4.5625, "learning_rate": 2.4982773637170947e-06, "loss": 0.9597, "step": 119 }, { "epoch": 0.09266409266409266, "grad_norm": 7.09375, "learning_rate": 2.4981943768562495e-06, "loss": 1.0646, "step": 120 }, { "epoch": 0.09343629343629344, "grad_norm": 6.71875, "learning_rate": 2.498109439266436e-06, "loss": 0.9822, "step": 121 }, { "epoch": 0.09420849420849421, "grad_norm": 34.0, "learning_rate": 2.498022551080397e-06, "loss": 0.9686, "step": 122 }, { "epoch": 0.09498069498069497, "grad_norm": 4.40625, "learning_rate": 2.497933712433926e-06, "loss": 0.9359, "step": 123 }, { "epoch": 0.09575289575289575, "grad_norm": 6.59375, "learning_rate": 2.497842923465863e-06, "loss": 1.055, "step": 124 }, { "epoch": 0.09652509652509653, "grad_norm": 42.25, "learning_rate": 2.497750184318097e-06, "loss": 0.9127, "step": 125 }, { "epoch": 0.0972972972972973, "grad_norm": 7.84375, "learning_rate": 2.4976554951355645e-06, "loss": 1.0234, "step": 126 }, { "epoch": 0.09806949806949807, "grad_norm": 3.359375, "learning_rate": 2.4975588560662497e-06, "loss": 0.8025, "step": 127 }, { "epoch": 0.09884169884169884, "grad_norm": 4.90625, "learning_rate": 2.4974602672611832e-06, "loss": 0.8655, "step": 128 }, { "epoch": 0.09961389961389962, "grad_norm": 20.875, "learning_rate": 2.497359728874445e-06, "loss": 1.0199, "step": 129 }, { "epoch": 0.10038610038610038, "grad_norm": 3.765625, "learning_rate": 2.4972572410631594e-06, "loss": 0.8791, "step": 130 }, { "epoch": 0.10115830115830116, "grad_norm": 4.34375, "learning_rate": 2.4971528039874994e-06, "loss": 0.9124, "step": 131 }, { "epoch": 0.10193050193050193, "grad_norm": 5.21875, "learning_rate": 2.497046417810683e-06, "loss": 0.8891, "step": 132 }, { "epoch": 0.10270270270270271, "grad_norm": 4.21875, "learning_rate": 2.496938082698975e-06, "loss": 0.9044, "step": 133 }, { "epoch": 0.10347490347490347, "grad_norm": 7.375, "learning_rate": 2.496827798821686e-06, "loss": 0.9957, "step": 134 }, { "epoch": 0.10424710424710425, "grad_norm": 3.9375, "learning_rate": 2.4967155663511715e-06, "loss": 1.0356, "step": 135 }, { "epoch": 0.10501930501930502, "grad_norm": 4.40625, "learning_rate": 2.496601385462834e-06, "loss": 0.9208, "step": 136 }, { "epoch": 0.10579150579150579, "grad_norm": 4.96875, "learning_rate": 2.4964852563351193e-06, "loss": 1.0727, "step": 137 }, { "epoch": 0.10656370656370656, "grad_norm": 15.0625, "learning_rate": 2.496367179149519e-06, "loss": 1.0164, "step": 138 }, { "epoch": 0.10733590733590734, "grad_norm": 6.4375, "learning_rate": 2.496247154090569e-06, "loss": 0.9361, "step": 139 }, { "epoch": 0.10810810810810811, "grad_norm": 4.75, "learning_rate": 2.4961251813458494e-06, "loss": 0.8342, "step": 140 }, { "epoch": 0.10888030888030888, "grad_norm": 7.53125, "learning_rate": 2.496001261105984e-06, "loss": 1.0916, "step": 141 }, { "epoch": 0.10965250965250965, "grad_norm": 4.6875, "learning_rate": 2.4958753935646403e-06, "loss": 1.0878, "step": 142 }, { "epoch": 0.11042471042471043, "grad_norm": 8.0625, "learning_rate": 2.495747578918529e-06, "loss": 0.9419, "step": 143 }, { "epoch": 0.11119691119691119, "grad_norm": 5.375, "learning_rate": 2.4956178173674044e-06, "loss": 1.1948, "step": 144 }, { "epoch": 0.11196911196911197, "grad_norm": 7.46875, "learning_rate": 2.4954861091140636e-06, "loss": 1.0782, "step": 145 }, { "epoch": 0.11274131274131274, "grad_norm": 6.4375, "learning_rate": 2.4953524543643444e-06, "loss": 1.0481, "step": 146 }, { "epoch": 0.11351351351351352, "grad_norm": 51.0, "learning_rate": 2.495216853327129e-06, "loss": 0.9363, "step": 147 }, { "epoch": 0.11428571428571428, "grad_norm": 4.65625, "learning_rate": 2.4950793062143397e-06, "loss": 0.9846, "step": 148 }, { "epoch": 0.11505791505791506, "grad_norm": 6.1875, "learning_rate": 2.494939813240941e-06, "loss": 1.0972, "step": 149 }, { "epoch": 0.11583011583011583, "grad_norm": 7.46875, "learning_rate": 2.4947983746249376e-06, "loss": 0.9793, "step": 150 }, { "epoch": 0.1166023166023166, "grad_norm": 4.09375, "learning_rate": 2.4946549905873763e-06, "loss": 1.0116, "step": 151 }, { "epoch": 0.11737451737451737, "grad_norm": 4.0625, "learning_rate": 2.494509661352343e-06, "loss": 1.0275, "step": 152 }, { "epoch": 0.11814671814671815, "grad_norm": 3.453125, "learning_rate": 2.4943623871469647e-06, "loss": 0.687, "step": 153 }, { "epoch": 0.11891891891891893, "grad_norm": 3.84375, "learning_rate": 2.4942131682014063e-06, "loss": 0.868, "step": 154 }, { "epoch": 0.11969111969111969, "grad_norm": 4.59375, "learning_rate": 2.4940620047488747e-06, "loss": 0.977, "step": 155 }, { "epoch": 0.12046332046332046, "grad_norm": 6.34375, "learning_rate": 2.493908897025614e-06, "loss": 1.0145, "step": 156 }, { "epoch": 0.12123552123552124, "grad_norm": 3.84375, "learning_rate": 2.493753845270906e-06, "loss": 0.9569, "step": 157 }, { "epoch": 0.122007722007722, "grad_norm": 5.0625, "learning_rate": 2.493596849727073e-06, "loss": 0.8456, "step": 158 }, { "epoch": 0.12277992277992278, "grad_norm": 4.8125, "learning_rate": 2.4934379106394737e-06, "loss": 1.0846, "step": 159 }, { "epoch": 0.12355212355212356, "grad_norm": 6.875, "learning_rate": 2.493277028256505e-06, "loss": 1.0196, "step": 160 }, { "epoch": 0.12432432432432433, "grad_norm": 3.921875, "learning_rate": 2.4931142028295997e-06, "loss": 0.9135, "step": 161 }, { "epoch": 0.1250965250965251, "grad_norm": 4.3125, "learning_rate": 2.4929494346132284e-06, "loss": 1.0662, "step": 162 }, { "epoch": 0.12586872586872586, "grad_norm": 8.3125, "learning_rate": 2.492782723864898e-06, "loss": 1.0511, "step": 163 }, { "epoch": 0.12664092664092663, "grad_norm": 4.09375, "learning_rate": 2.4926140708451496e-06, "loss": 0.9705, "step": 164 }, { "epoch": 0.1274131274131274, "grad_norm": 14.6875, "learning_rate": 2.492443475817562e-06, "loss": 0.9048, "step": 165 }, { "epoch": 0.12818532818532818, "grad_norm": 5.15625, "learning_rate": 2.492270939048748e-06, "loss": 0.7544, "step": 166 }, { "epoch": 0.12895752895752896, "grad_norm": 5.15625, "learning_rate": 2.492096460808355e-06, "loss": 1.0298, "step": 167 }, { "epoch": 0.12972972972972974, "grad_norm": 3.96875, "learning_rate": 2.491920041369065e-06, "loss": 0.8717, "step": 168 }, { "epoch": 0.1305019305019305, "grad_norm": 5.75, "learning_rate": 2.491741681006592e-06, "loss": 1.0299, "step": 169 }, { "epoch": 0.13127413127413126, "grad_norm": 7.0, "learning_rate": 2.4915613799996867e-06, "loss": 0.9021, "step": 170 }, { "epoch": 0.13204633204633204, "grad_norm": 3.65625, "learning_rate": 2.49137913863013e-06, "loss": 0.8802, "step": 171 }, { "epoch": 0.13281853281853281, "grad_norm": 10.0625, "learning_rate": 2.491194957182736e-06, "loss": 0.8909, "step": 172 }, { "epoch": 0.1335907335907336, "grad_norm": 18.75, "learning_rate": 2.491008835945352e-06, "loss": 0.9, "step": 173 }, { "epoch": 0.13436293436293437, "grad_norm": 6.375, "learning_rate": 2.4908207752088538e-06, "loss": 1.0848, "step": 174 }, { "epoch": 0.13513513513513514, "grad_norm": 4.96875, "learning_rate": 2.490630775267152e-06, "loss": 0.9535, "step": 175 }, { "epoch": 0.13590733590733592, "grad_norm": 5.03125, "learning_rate": 2.490438836417186e-06, "loss": 0.9195, "step": 176 }, { "epoch": 0.13667953667953667, "grad_norm": 6.9375, "learning_rate": 2.4902449589589257e-06, "loss": 1.0811, "step": 177 }, { "epoch": 0.13745173745173744, "grad_norm": 4.8125, "learning_rate": 2.49004914319537e-06, "loss": 0.9316, "step": 178 }, { "epoch": 0.13822393822393822, "grad_norm": 6.34375, "learning_rate": 2.489851389432548e-06, "loss": 0.9471, "step": 179 }, { "epoch": 0.138996138996139, "grad_norm": 5.90625, "learning_rate": 2.4896516979795183e-06, "loss": 0.9443, "step": 180 }, { "epoch": 0.13976833976833977, "grad_norm": 4.53125, "learning_rate": 2.4894500691483655e-06, "loss": 0.9099, "step": 181 }, { "epoch": 0.14054054054054055, "grad_norm": 22.375, "learning_rate": 2.489246503254204e-06, "loss": 1.0664, "step": 182 }, { "epoch": 0.14131274131274132, "grad_norm": 11.4375, "learning_rate": 2.489041000615175e-06, "loss": 1.0281, "step": 183 }, { "epoch": 0.14208494208494207, "grad_norm": 3.796875, "learning_rate": 2.4888335615524458e-06, "loss": 0.8355, "step": 184 }, { "epoch": 0.14285714285714285, "grad_norm": 5.5, "learning_rate": 2.488624186390211e-06, "loss": 0.8602, "step": 185 }, { "epoch": 0.14362934362934363, "grad_norm": 4.5, "learning_rate": 2.4884128754556906e-06, "loss": 0.9809, "step": 186 }, { "epoch": 0.1444015444015444, "grad_norm": 4.46875, "learning_rate": 2.4881996290791306e-06, "loss": 0.9328, "step": 187 }, { "epoch": 0.14517374517374518, "grad_norm": 11.3125, "learning_rate": 2.4879844475938e-06, "loss": 1.1522, "step": 188 }, { "epoch": 0.14594594594594595, "grad_norm": 4.25, "learning_rate": 2.487767331335993e-06, "loss": 1.0641, "step": 189 }, { "epoch": 0.14671814671814673, "grad_norm": 5.0625, "learning_rate": 2.487548280645029e-06, "loss": 1.0248, "step": 190 }, { "epoch": 0.14749034749034748, "grad_norm": 4.84375, "learning_rate": 2.4873272958632484e-06, "loss": 0.8967, "step": 191 }, { "epoch": 0.14826254826254825, "grad_norm": 4.625, "learning_rate": 2.4871043773360154e-06, "loss": 1.0069, "step": 192 }, { "epoch": 0.14903474903474903, "grad_norm": 4.28125, "learning_rate": 2.4868795254117165e-06, "loss": 0.9017, "step": 193 }, { "epoch": 0.1498069498069498, "grad_norm": 3.9375, "learning_rate": 2.486652740441759e-06, "loss": 0.9137, "step": 194 }, { "epoch": 0.15057915057915058, "grad_norm": 4.625, "learning_rate": 2.486424022780572e-06, "loss": 0.9259, "step": 195 }, { "epoch": 0.15135135135135136, "grad_norm": 4.125, "learning_rate": 2.486193372785604e-06, "loss": 0.8902, "step": 196 }, { "epoch": 0.15212355212355214, "grad_norm": 3.671875, "learning_rate": 2.4859607908173257e-06, "loss": 0.9096, "step": 197 }, { "epoch": 0.15289575289575288, "grad_norm": 7.46875, "learning_rate": 2.4857262772392242e-06, "loss": 1.006, "step": 198 }, { "epoch": 0.15366795366795366, "grad_norm": 9.375, "learning_rate": 2.4854898324178076e-06, "loss": 0.8996, "step": 199 }, { "epoch": 0.15444015444015444, "grad_norm": 4.03125, "learning_rate": 2.4852514567226015e-06, "loss": 0.8006, "step": 200 }, { "epoch": 0.1552123552123552, "grad_norm": 4.9375, "learning_rate": 2.485011150526149e-06, "loss": 1.0241, "step": 201 }, { "epoch": 0.155984555984556, "grad_norm": 4.5, "learning_rate": 2.48476891420401e-06, "loss": 0.9775, "step": 202 }, { "epoch": 0.15675675675675677, "grad_norm": 5.1875, "learning_rate": 2.4845247481347613e-06, "loss": 1.0241, "step": 203 }, { "epoch": 0.15752895752895754, "grad_norm": 4.59375, "learning_rate": 2.4842786526999966e-06, "loss": 0.9228, "step": 204 }, { "epoch": 0.1583011583011583, "grad_norm": 27.375, "learning_rate": 2.484030628284323e-06, "loss": 0.9081, "step": 205 }, { "epoch": 0.15907335907335907, "grad_norm": 4.21875, "learning_rate": 2.4837806752753634e-06, "loss": 0.9936, "step": 206 }, { "epoch": 0.15984555984555984, "grad_norm": 5.0, "learning_rate": 2.483528794063754e-06, "loss": 0.9051, "step": 207 }, { "epoch": 0.16061776061776062, "grad_norm": 9.375, "learning_rate": 2.4832749850431464e-06, "loss": 0.9586, "step": 208 }, { "epoch": 0.1613899613899614, "grad_norm": 4.25, "learning_rate": 2.4830192486102023e-06, "loss": 0.9018, "step": 209 }, { "epoch": 0.16216216216216217, "grad_norm": 5.96875, "learning_rate": 2.4827615851645968e-06, "loss": 0.9251, "step": 210 }, { "epoch": 0.16293436293436295, "grad_norm": 6.3125, "learning_rate": 2.4825019951090183e-06, "loss": 0.9263, "step": 211 }, { "epoch": 0.1637065637065637, "grad_norm": 4.5625, "learning_rate": 2.482240478849164e-06, "loss": 0.9814, "step": 212 }, { "epoch": 0.16447876447876447, "grad_norm": 4.28125, "learning_rate": 2.4819770367937413e-06, "loss": 1.0005, "step": 213 }, { "epoch": 0.16525096525096525, "grad_norm": 4.0625, "learning_rate": 2.481711669354469e-06, "loss": 0.854, "step": 214 }, { "epoch": 0.16602316602316602, "grad_norm": 4.3125, "learning_rate": 2.4814443769460745e-06, "loss": 0.885, "step": 215 }, { "epoch": 0.1667953667953668, "grad_norm": 8.9375, "learning_rate": 2.481175159986292e-06, "loss": 0.9664, "step": 216 }, { "epoch": 0.16756756756756758, "grad_norm": 4.125, "learning_rate": 2.4809040188958663e-06, "loss": 0.9512, "step": 217 }, { "epoch": 0.16833976833976835, "grad_norm": 4.5625, "learning_rate": 2.4806309540985466e-06, "loss": 0.9311, "step": 218 }, { "epoch": 0.1691119691119691, "grad_norm": 4.34375, "learning_rate": 2.48035596602109e-06, "loss": 1.0632, "step": 219 }, { "epoch": 0.16988416988416988, "grad_norm": 6.125, "learning_rate": 2.480079055093259e-06, "loss": 0.8995, "step": 220 }, { "epoch": 0.17065637065637065, "grad_norm": 8.625, "learning_rate": 2.479800221747822e-06, "loss": 1.011, "step": 221 }, { "epoch": 0.17142857142857143, "grad_norm": 7.15625, "learning_rate": 2.4795194664205496e-06, "loss": 1.1351, "step": 222 }, { "epoch": 0.1722007722007722, "grad_norm": 28.5, "learning_rate": 2.4792367895502185e-06, "loss": 0.8636, "step": 223 }, { "epoch": 0.17297297297297298, "grad_norm": 5.28125, "learning_rate": 2.4789521915786076e-06, "loss": 0.8059, "step": 224 }, { "epoch": 0.17374517374517376, "grad_norm": 3.375, "learning_rate": 2.478665672950499e-06, "loss": 0.7674, "step": 225 }, { "epoch": 0.1745173745173745, "grad_norm": 4.9375, "learning_rate": 2.478377234113674e-06, "loss": 0.9757, "step": 226 }, { "epoch": 0.17528957528957528, "grad_norm": 5.0, "learning_rate": 2.4780868755189173e-06, "loss": 0.9446, "step": 227 }, { "epoch": 0.17606177606177606, "grad_norm": 4.375, "learning_rate": 2.4777945976200137e-06, "loss": 0.9549, "step": 228 }, { "epoch": 0.17683397683397684, "grad_norm": 75.0, "learning_rate": 2.4775004008737464e-06, "loss": 1.1002, "step": 229 }, { "epoch": 0.1776061776061776, "grad_norm": 43.75, "learning_rate": 2.477204285739898e-06, "loss": 1.0292, "step": 230 }, { "epoch": 0.1783783783783784, "grad_norm": 3.9375, "learning_rate": 2.4769062526812497e-06, "loss": 1.0109, "step": 231 }, { "epoch": 0.17915057915057914, "grad_norm": 6.46875, "learning_rate": 2.4766063021635798e-06, "loss": 1.0839, "step": 232 }, { "epoch": 0.1799227799227799, "grad_norm": 4.8125, "learning_rate": 2.4763044346556625e-06, "loss": 0.9943, "step": 233 }, { "epoch": 0.1806949806949807, "grad_norm": 4.09375, "learning_rate": 2.476000650629269e-06, "loss": 0.9996, "step": 234 }, { "epoch": 0.18146718146718147, "grad_norm": 4.15625, "learning_rate": 2.475694950559165e-06, "loss": 0.9701, "step": 235 }, { "epoch": 0.18223938223938224, "grad_norm": 4.15625, "learning_rate": 2.475387334923112e-06, "loss": 1.0469, "step": 236 }, { "epoch": 0.18301158301158302, "grad_norm": 6.78125, "learning_rate": 2.475077804201863e-06, "loss": 0.8802, "step": 237 }, { "epoch": 0.1837837837837838, "grad_norm": 3.875, "learning_rate": 2.4747663588791653e-06, "loss": 0.9032, "step": 238 }, { "epoch": 0.18455598455598454, "grad_norm": 7.625, "learning_rate": 2.4744529994417595e-06, "loss": 0.9486, "step": 239 }, { "epoch": 0.18532818532818532, "grad_norm": 3.921875, "learning_rate": 2.4741377263793755e-06, "loss": 0.9201, "step": 240 }, { "epoch": 0.1861003861003861, "grad_norm": 4.3125, "learning_rate": 2.4738205401847352e-06, "loss": 0.883, "step": 241 }, { "epoch": 0.18687258687258687, "grad_norm": 5.03125, "learning_rate": 2.47350144135355e-06, "loss": 0.9588, "step": 242 }, { "epoch": 0.18764478764478765, "grad_norm": 5.5625, "learning_rate": 2.473180430384521e-06, "loss": 0.7662, "step": 243 }, { "epoch": 0.18841698841698842, "grad_norm": 6.875, "learning_rate": 2.472857507779336e-06, "loss": 0.89, "step": 244 }, { "epoch": 0.1891891891891892, "grad_norm": 4.5, "learning_rate": 2.472532674042673e-06, "loss": 0.9595, "step": 245 }, { "epoch": 0.18996138996138995, "grad_norm": 4.0625, "learning_rate": 2.4722059296821944e-06, "loss": 0.8092, "step": 246 }, { "epoch": 0.19073359073359072, "grad_norm": 3.78125, "learning_rate": 2.47187727520855e-06, "loss": 0.7712, "step": 247 }, { "epoch": 0.1915057915057915, "grad_norm": 4.8125, "learning_rate": 2.471546711135374e-06, "loss": 1.0739, "step": 248 }, { "epoch": 0.19227799227799228, "grad_norm": 5.09375, "learning_rate": 2.4712142379792866e-06, "loss": 0.9746, "step": 249 }, { "epoch": 0.19305019305019305, "grad_norm": 8.375, "learning_rate": 2.4708798562598885e-06, "loss": 0.9217, "step": 250 }, { "epoch": 0.19382239382239383, "grad_norm": 4.25, "learning_rate": 2.470543566499766e-06, "loss": 0.8844, "step": 251 }, { "epoch": 0.1945945945945946, "grad_norm": 4.59375, "learning_rate": 2.470205369224487e-06, "loss": 0.9584, "step": 252 }, { "epoch": 0.19536679536679535, "grad_norm": 4.84375, "learning_rate": 2.4698652649625996e-06, "loss": 0.8192, "step": 253 }, { "epoch": 0.19613899613899613, "grad_norm": 4.75, "learning_rate": 2.469523254245632e-06, "loss": 0.8615, "step": 254 }, { "epoch": 0.1969111969111969, "grad_norm": 4.5625, "learning_rate": 2.469179337608093e-06, "loss": 0.8559, "step": 255 }, { "epoch": 0.19768339768339768, "grad_norm": 4.21875, "learning_rate": 2.468833515587469e-06, "loss": 1.0063, "step": 256 }, { "epoch": 0.19845559845559846, "grad_norm": 3.859375, "learning_rate": 2.4684857887242257e-06, "loss": 0.9519, "step": 257 }, { "epoch": 0.19922779922779923, "grad_norm": 4.65625, "learning_rate": 2.468136157561804e-06, "loss": 0.9062, "step": 258 }, { "epoch": 0.2, "grad_norm": 8.625, "learning_rate": 2.4677846226466222e-06, "loss": 0.9301, "step": 259 }, { "epoch": 0.20077220077220076, "grad_norm": 4.25, "learning_rate": 2.4674311845280724e-06, "loss": 0.8769, "step": 260 }, { "epoch": 0.20154440154440154, "grad_norm": 98.0, "learning_rate": 2.4670758437585233e-06, "loss": 0.891, "step": 261 }, { "epoch": 0.2023166023166023, "grad_norm": 4.03125, "learning_rate": 2.4667186008933154e-06, "loss": 0.9207, "step": 262 }, { "epoch": 0.2030888030888031, "grad_norm": 4.0625, "learning_rate": 2.466359456490762e-06, "loss": 0.9626, "step": 263 }, { "epoch": 0.20386100386100386, "grad_norm": 5.0625, "learning_rate": 2.4659984111121493e-06, "loss": 0.8968, "step": 264 }, { "epoch": 0.20463320463320464, "grad_norm": 4.21875, "learning_rate": 2.465635465321733e-06, "loss": 0.8895, "step": 265 }, { "epoch": 0.20540540540540542, "grad_norm": 4.28125, "learning_rate": 2.4652706196867406e-06, "loss": 0.9038, "step": 266 }, { "epoch": 0.20617760617760617, "grad_norm": 5.375, "learning_rate": 2.4649038747773664e-06, "loss": 0.9638, "step": 267 }, { "epoch": 0.20694980694980694, "grad_norm": 5.875, "learning_rate": 2.464535231166775e-06, "loss": 0.7915, "step": 268 }, { "epoch": 0.20772200772200772, "grad_norm": 3.875, "learning_rate": 2.4641646894310976e-06, "loss": 0.8623, "step": 269 }, { "epoch": 0.2084942084942085, "grad_norm": 4.1875, "learning_rate": 2.463792250149432e-06, "loss": 0.9715, "step": 270 }, { "epoch": 0.20926640926640927, "grad_norm": 4.625, "learning_rate": 2.463417913903841e-06, "loss": 0.8795, "step": 271 }, { "epoch": 0.21003861003861005, "grad_norm": 4.84375, "learning_rate": 2.463041681279353e-06, "loss": 0.9441, "step": 272 }, { "epoch": 0.21081081081081082, "grad_norm": 4.6875, "learning_rate": 2.4626635528639593e-06, "loss": 0.9686, "step": 273 }, { "epoch": 0.21158301158301157, "grad_norm": 5.625, "learning_rate": 2.4622835292486147e-06, "loss": 0.9567, "step": 274 }, { "epoch": 0.21235521235521235, "grad_norm": 5.46875, "learning_rate": 2.461901611027235e-06, "loss": 0.9869, "step": 275 }, { "epoch": 0.21312741312741312, "grad_norm": 8.6875, "learning_rate": 2.461517798796699e-06, "loss": 0.8519, "step": 276 }, { "epoch": 0.2138996138996139, "grad_norm": 7.84375, "learning_rate": 2.461132093156842e-06, "loss": 0.8663, "step": 277 }, { "epoch": 0.21467181467181468, "grad_norm": 8.75, "learning_rate": 2.4607444947104623e-06, "loss": 1.0405, "step": 278 }, { "epoch": 0.21544401544401545, "grad_norm": 4.625, "learning_rate": 2.460355004063313e-06, "loss": 0.9003, "step": 279 }, { "epoch": 0.21621621621621623, "grad_norm": 3.890625, "learning_rate": 2.4599636218241074e-06, "loss": 0.8359, "step": 280 }, { "epoch": 0.21698841698841698, "grad_norm": 4.25, "learning_rate": 2.4595703486045123e-06, "loss": 1.0473, "step": 281 }, { "epoch": 0.21776061776061775, "grad_norm": 4.34375, "learning_rate": 2.459175185019152e-06, "loss": 0.8225, "step": 282 }, { "epoch": 0.21853281853281853, "grad_norm": 3.828125, "learning_rate": 2.4587781316856033e-06, "loss": 1.1177, "step": 283 }, { "epoch": 0.2193050193050193, "grad_norm": 5.8125, "learning_rate": 2.4583791892243988e-06, "loss": 1.0003, "step": 284 }, { "epoch": 0.22007722007722008, "grad_norm": 3.65625, "learning_rate": 2.4579783582590206e-06, "loss": 0.8246, "step": 285 }, { "epoch": 0.22084942084942086, "grad_norm": 7.46875, "learning_rate": 2.4575756394159044e-06, "loss": 0.9186, "step": 286 }, { "epoch": 0.22162162162162163, "grad_norm": 3.59375, "learning_rate": 2.4571710333244346e-06, "loss": 0.8753, "step": 287 }, { "epoch": 0.22239382239382238, "grad_norm": 4.875, "learning_rate": 2.4567645406169475e-06, "loss": 0.9768, "step": 288 }, { "epoch": 0.22316602316602316, "grad_norm": 7.5, "learning_rate": 2.456356161928726e-06, "loss": 0.898, "step": 289 }, { "epoch": 0.22393822393822393, "grad_norm": 4.53125, "learning_rate": 2.4559458978979998e-06, "loss": 0.9229, "step": 290 }, { "epoch": 0.2247104247104247, "grad_norm": 3.78125, "learning_rate": 2.4555337491659477e-06, "loss": 0.9352, "step": 291 }, { "epoch": 0.2254826254826255, "grad_norm": 4.1875, "learning_rate": 2.455119716376692e-06, "loss": 0.7978, "step": 292 }, { "epoch": 0.22625482625482626, "grad_norm": 3.703125, "learning_rate": 2.4547038001773e-06, "loss": 0.9299, "step": 293 }, { "epoch": 0.22702702702702704, "grad_norm": 4.21875, "learning_rate": 2.454286001217782e-06, "loss": 0.8966, "step": 294 }, { "epoch": 0.2277992277992278, "grad_norm": 4.1875, "learning_rate": 2.4538663201510913e-06, "loss": 0.8692, "step": 295 }, { "epoch": 0.22857142857142856, "grad_norm": 7.4375, "learning_rate": 2.453444757633124e-06, "loss": 0.8863, "step": 296 }, { "epoch": 0.22934362934362934, "grad_norm": 3.828125, "learning_rate": 2.453021314322713e-06, "loss": 0.8632, "step": 297 }, { "epoch": 0.23011583011583012, "grad_norm": 4.09375, "learning_rate": 2.452595990881634e-06, "loss": 1.077, "step": 298 }, { "epoch": 0.2308880308880309, "grad_norm": 3.625, "learning_rate": 2.4521687879746e-06, "loss": 0.9034, "step": 299 }, { "epoch": 0.23166023166023167, "grad_norm": 4.4375, "learning_rate": 2.45173970626926e-06, "loss": 0.9585, "step": 300 }, { "epoch": 0.23243243243243245, "grad_norm": 4.46875, "learning_rate": 2.451308746436201e-06, "loss": 0.8263, "step": 301 }, { "epoch": 0.2332046332046332, "grad_norm": 6.5625, "learning_rate": 2.4508759091489447e-06, "loss": 0.9787, "step": 302 }, { "epoch": 0.23397683397683397, "grad_norm": 4.5625, "learning_rate": 2.450441195083947e-06, "loss": 0.8649, "step": 303 }, { "epoch": 0.23474903474903475, "grad_norm": 6.0625, "learning_rate": 2.4500046049205957e-06, "loss": 0.8496, "step": 304 }, { "epoch": 0.23552123552123552, "grad_norm": 23.875, "learning_rate": 2.4495661393412127e-06, "loss": 0.9051, "step": 305 }, { "epoch": 0.2362934362934363, "grad_norm": 22.125, "learning_rate": 2.4491257990310498e-06, "loss": 0.9017, "step": 306 }, { "epoch": 0.23706563706563707, "grad_norm": 22.125, "learning_rate": 2.448683584678288e-06, "loss": 0.9033, "step": 307 }, { "epoch": 0.23783783783783785, "grad_norm": 4.65625, "learning_rate": 2.4482394969740386e-06, "loss": 1.1217, "step": 308 }, { "epoch": 0.2386100386100386, "grad_norm": 4.40625, "learning_rate": 2.44779353661234e-06, "loss": 0.8801, "step": 309 }, { "epoch": 0.23938223938223938, "grad_norm": 3.75, "learning_rate": 2.4473457042901567e-06, "loss": 0.7884, "step": 310 }, { "epoch": 0.24015444015444015, "grad_norm": 4.71875, "learning_rate": 2.446896000707379e-06, "loss": 0.8898, "step": 311 }, { "epoch": 0.24092664092664093, "grad_norm": 4.09375, "learning_rate": 2.446444426566823e-06, "loss": 0.8278, "step": 312 }, { "epoch": 0.2416988416988417, "grad_norm": 4.34375, "learning_rate": 2.445990982574226e-06, "loss": 0.9677, "step": 313 }, { "epoch": 0.24247104247104248, "grad_norm": 4.125, "learning_rate": 2.44553566943825e-06, "loss": 0.8657, "step": 314 }, { "epoch": 0.24324324324324326, "grad_norm": 5.3125, "learning_rate": 2.445078487870476e-06, "loss": 0.8782, "step": 315 }, { "epoch": 0.244015444015444, "grad_norm": 4.5, "learning_rate": 2.444619438585406e-06, "loss": 0.7571, "step": 316 }, { "epoch": 0.24478764478764478, "grad_norm": 3.625, "learning_rate": 2.444158522300461e-06, "loss": 0.8337, "step": 317 }, { "epoch": 0.24555984555984556, "grad_norm": 4.71875, "learning_rate": 2.44369573973598e-06, "loss": 1.0684, "step": 318 }, { "epoch": 0.24633204633204633, "grad_norm": 4.03125, "learning_rate": 2.4432310916152186e-06, "loss": 0.8859, "step": 319 }, { "epoch": 0.2471042471042471, "grad_norm": 3.84375, "learning_rate": 2.4427645786643468e-06, "loss": 0.9775, "step": 320 }, { "epoch": 0.2478764478764479, "grad_norm": 4.34375, "learning_rate": 2.442296201612451e-06, "loss": 0.8991, "step": 321 }, { "epoch": 0.24864864864864866, "grad_norm": 3.75, "learning_rate": 2.4418259611915295e-06, "loss": 0.908, "step": 322 }, { "epoch": 0.2494208494208494, "grad_norm": 4.21875, "learning_rate": 2.441353858136493e-06, "loss": 1.021, "step": 323 }, { "epoch": 0.2501930501930502, "grad_norm": 4.03125, "learning_rate": 2.440879893185164e-06, "loss": 0.9707, "step": 324 }, { "epoch": 0.25096525096525096, "grad_norm": 4.03125, "learning_rate": 2.4404040670782735e-06, "loss": 0.9813, "step": 325 }, { "epoch": 0.2517374517374517, "grad_norm": 3.6875, "learning_rate": 2.439926380559462e-06, "loss": 0.9916, "step": 326 }, { "epoch": 0.2525096525096525, "grad_norm": 4.03125, "learning_rate": 2.439446834375278e-06, "loss": 0.8781, "step": 327 }, { "epoch": 0.25328185328185326, "grad_norm": 4.125, "learning_rate": 2.4389654292751753e-06, "loss": 0.8851, "step": 328 }, { "epoch": 0.25405405405405407, "grad_norm": 3.796875, "learning_rate": 2.4384821660115136e-06, "loss": 0.9075, "step": 329 }, { "epoch": 0.2548262548262548, "grad_norm": 6.375, "learning_rate": 2.4379970453395564e-06, "loss": 0.9684, "step": 330 }, { "epoch": 0.2555984555984556, "grad_norm": 6.53125, "learning_rate": 2.4375100680174697e-06, "loss": 0.8458, "step": 331 }, { "epoch": 0.25637065637065637, "grad_norm": 6.375, "learning_rate": 2.437021234806322e-06, "loss": 0.992, "step": 332 }, { "epoch": 0.2571428571428571, "grad_norm": 3.984375, "learning_rate": 2.436530546470081e-06, "loss": 0.9524, "step": 333 }, { "epoch": 0.2579150579150579, "grad_norm": 30.0, "learning_rate": 2.4360380037756153e-06, "loss": 1.0254, "step": 334 }, { "epoch": 0.25868725868725867, "grad_norm": 3.984375, "learning_rate": 2.4355436074926904e-06, "loss": 0.874, "step": 335 }, { "epoch": 0.2594594594594595, "grad_norm": 6.25, "learning_rate": 2.435047358393968e-06, "loss": 1.0239, "step": 336 }, { "epoch": 0.2602316602316602, "grad_norm": 4.15625, "learning_rate": 2.4345492572550077e-06, "loss": 0.9186, "step": 337 }, { "epoch": 0.261003861003861, "grad_norm": 4.34375, "learning_rate": 2.4340493048542616e-06, "loss": 0.9132, "step": 338 }, { "epoch": 0.2617760617760618, "grad_norm": 4.0, "learning_rate": 2.4335475019730756e-06, "loss": 0.8661, "step": 339 }, { "epoch": 0.2625482625482625, "grad_norm": 5.09375, "learning_rate": 2.433043849395688e-06, "loss": 0.8451, "step": 340 }, { "epoch": 0.2633204633204633, "grad_norm": 3.75, "learning_rate": 2.432538347909227e-06, "loss": 0.929, "step": 341 }, { "epoch": 0.2640926640926641, "grad_norm": 5.15625, "learning_rate": 2.4320309983037115e-06, "loss": 0.9172, "step": 342 }, { "epoch": 0.2648648648648649, "grad_norm": 29.375, "learning_rate": 2.431521801372048e-06, "loss": 0.8868, "step": 343 }, { "epoch": 0.26563706563706563, "grad_norm": 3.859375, "learning_rate": 2.4310107579100294e-06, "loss": 0.835, "step": 344 }, { "epoch": 0.26640926640926643, "grad_norm": 5.875, "learning_rate": 2.4304978687163368e-06, "loss": 0.9439, "step": 345 }, { "epoch": 0.2671814671814672, "grad_norm": 4.0625, "learning_rate": 2.4299831345925326e-06, "loss": 0.9663, "step": 346 }, { "epoch": 0.26795366795366793, "grad_norm": 3.953125, "learning_rate": 2.4294665563430653e-06, "loss": 0.8343, "step": 347 }, { "epoch": 0.26872586872586873, "grad_norm": 3.734375, "learning_rate": 2.4289481347752643e-06, "loss": 0.8639, "step": 348 }, { "epoch": 0.2694980694980695, "grad_norm": 6.0, "learning_rate": 2.428427870699339e-06, "loss": 1.0189, "step": 349 }, { "epoch": 0.2702702702702703, "grad_norm": 8.0, "learning_rate": 2.4279057649283805e-06, "loss": 0.801, "step": 350 }, { "epoch": 0.27104247104247103, "grad_norm": 4.15625, "learning_rate": 2.4273818182783564e-06, "loss": 0.9285, "step": 351 }, { "epoch": 0.27181467181467184, "grad_norm": 5.09375, "learning_rate": 2.426856031568112e-06, "loss": 0.9963, "step": 352 }, { "epoch": 0.2725868725868726, "grad_norm": 3.765625, "learning_rate": 2.4263284056193683e-06, "loss": 0.943, "step": 353 }, { "epoch": 0.27335907335907333, "grad_norm": 4.03125, "learning_rate": 2.4257989412567196e-06, "loss": 0.8907, "step": 354 }, { "epoch": 0.27413127413127414, "grad_norm": 4.0625, "learning_rate": 2.425267639307636e-06, "loss": 0.898, "step": 355 }, { "epoch": 0.2749034749034749, "grad_norm": 3.703125, "learning_rate": 2.424734500602456e-06, "loss": 0.8486, "step": 356 }, { "epoch": 0.2756756756756757, "grad_norm": 4.3125, "learning_rate": 2.424199525974392e-06, "loss": 0.9788, "step": 357 }, { "epoch": 0.27644787644787644, "grad_norm": 4.96875, "learning_rate": 2.4236627162595233e-06, "loss": 0.9183, "step": 358 }, { "epoch": 0.27722007722007724, "grad_norm": 3.859375, "learning_rate": 2.4231240722967984e-06, "loss": 0.8102, "step": 359 }, { "epoch": 0.277992277992278, "grad_norm": 3.53125, "learning_rate": 2.422583594928031e-06, "loss": 0.7856, "step": 360 }, { "epoch": 0.27876447876447874, "grad_norm": 4.09375, "learning_rate": 2.422041284997903e-06, "loss": 0.8568, "step": 361 }, { "epoch": 0.27953667953667954, "grad_norm": 56.5, "learning_rate": 2.421497143353957e-06, "loss": 0.8746, "step": 362 }, { "epoch": 0.2803088803088803, "grad_norm": 3.84375, "learning_rate": 2.4209511708466e-06, "loss": 0.902, "step": 363 }, { "epoch": 0.2810810810810811, "grad_norm": 5.90625, "learning_rate": 2.4204033683291e-06, "loss": 0.9323, "step": 364 }, { "epoch": 0.28185328185328185, "grad_norm": 4.46875, "learning_rate": 2.4198537366575854e-06, "loss": 1.017, "step": 365 }, { "epoch": 0.28262548262548265, "grad_norm": 5.03125, "learning_rate": 2.419302276691042e-06, "loss": 0.9913, "step": 366 }, { "epoch": 0.2833976833976834, "grad_norm": 5.78125, "learning_rate": 2.4187489892913153e-06, "loss": 1.0389, "step": 367 }, { "epoch": 0.28416988416988415, "grad_norm": 4.75, "learning_rate": 2.4181938753231044e-06, "loss": 1.0162, "step": 368 }, { "epoch": 0.28494208494208495, "grad_norm": 4.125, "learning_rate": 2.4176369356539643e-06, "loss": 0.8842, "step": 369 }, { "epoch": 0.2857142857142857, "grad_norm": 3.84375, "learning_rate": 2.417078171154303e-06, "loss": 0.8139, "step": 370 }, { "epoch": 0.2864864864864865, "grad_norm": 3.84375, "learning_rate": 2.41651758269738e-06, "loss": 0.9389, "step": 371 }, { "epoch": 0.28725868725868725, "grad_norm": 4.6875, "learning_rate": 2.4159551711593065e-06, "loss": 0.8194, "step": 372 }, { "epoch": 0.28803088803088805, "grad_norm": 4.0625, "learning_rate": 2.415390937419042e-06, "loss": 0.7892, "step": 373 }, { "epoch": 0.2888030888030888, "grad_norm": 8.25, "learning_rate": 2.414824882358393e-06, "loss": 0.9512, "step": 374 }, { "epoch": 0.28957528957528955, "grad_norm": 3.71875, "learning_rate": 2.414257006862015e-06, "loss": 0.8587, "step": 375 }, { "epoch": 0.29034749034749036, "grad_norm": 4.0, "learning_rate": 2.4136873118174047e-06, "loss": 0.8764, "step": 376 }, { "epoch": 0.2911196911196911, "grad_norm": 5.90625, "learning_rate": 2.4131157981149065e-06, "loss": 0.8886, "step": 377 }, { "epoch": 0.2918918918918919, "grad_norm": 4.9375, "learning_rate": 2.412542466647705e-06, "loss": 0.9227, "step": 378 }, { "epoch": 0.29266409266409266, "grad_norm": 3.78125, "learning_rate": 2.411967318311826e-06, "loss": 0.832, "step": 379 }, { "epoch": 0.29343629343629346, "grad_norm": 3.71875, "learning_rate": 2.411390354006134e-06, "loss": 0.7514, "step": 380 }, { "epoch": 0.2942084942084942, "grad_norm": 4.90625, "learning_rate": 2.410811574632333e-06, "loss": 0.897, "step": 381 }, { "epoch": 0.29498069498069496, "grad_norm": 4.59375, "learning_rate": 2.4102309810949624e-06, "loss": 0.9183, "step": 382 }, { "epoch": 0.29575289575289576, "grad_norm": 6.9375, "learning_rate": 2.4096485743013976e-06, "loss": 0.8497, "step": 383 }, { "epoch": 0.2965250965250965, "grad_norm": 5.375, "learning_rate": 2.409064355161848e-06, "loss": 0.8845, "step": 384 }, { "epoch": 0.2972972972972973, "grad_norm": 4.46875, "learning_rate": 2.408478324589355e-06, "loss": 0.8601, "step": 385 }, { "epoch": 0.29806949806949806, "grad_norm": 4.6875, "learning_rate": 2.4078904834997907e-06, "loss": 1.0236, "step": 386 }, { "epoch": 0.29884169884169887, "grad_norm": 3.859375, "learning_rate": 2.4073008328118572e-06, "loss": 0.8348, "step": 387 }, { "epoch": 0.2996138996138996, "grad_norm": 39.0, "learning_rate": 2.4067093734470845e-06, "loss": 1.0384, "step": 388 }, { "epoch": 0.30038610038610036, "grad_norm": 5.125, "learning_rate": 2.4061161063298295e-06, "loss": 1.0525, "step": 389 }, { "epoch": 0.30115830115830117, "grad_norm": 4.375, "learning_rate": 2.405521032387274e-06, "loss": 0.8524, "step": 390 }, { "epoch": 0.3019305019305019, "grad_norm": 3.671875, "learning_rate": 2.4049241525494245e-06, "loss": 0.8019, "step": 391 }, { "epoch": 0.3027027027027027, "grad_norm": 4.625, "learning_rate": 2.404325467749108e-06, "loss": 0.9597, "step": 392 }, { "epoch": 0.30347490347490347, "grad_norm": 4.875, "learning_rate": 2.403724978921974e-06, "loss": 1.0347, "step": 393 }, { "epoch": 0.30424710424710427, "grad_norm": 4.53125, "learning_rate": 2.403122687006491e-06, "loss": 1.0782, "step": 394 }, { "epoch": 0.305019305019305, "grad_norm": 4.1875, "learning_rate": 2.4025185929439453e-06, "loss": 0.8402, "step": 395 }, { "epoch": 0.30579150579150577, "grad_norm": 4.6875, "learning_rate": 2.4019126976784395e-06, "loss": 0.9656, "step": 396 }, { "epoch": 0.3065637065637066, "grad_norm": 3.796875, "learning_rate": 2.401305002156891e-06, "loss": 0.7803, "step": 397 }, { "epoch": 0.3073359073359073, "grad_norm": 5.90625, "learning_rate": 2.400695507329032e-06, "loss": 0.7754, "step": 398 }, { "epoch": 0.3081081081081081, "grad_norm": 4.6875, "learning_rate": 2.4000842141474056e-06, "loss": 0.9298, "step": 399 }, { "epoch": 0.3088803088803089, "grad_norm": 7.3125, "learning_rate": 2.3994711235673656e-06, "loss": 0.9038, "step": 400 }, { "epoch": 0.3096525096525097, "grad_norm": 4.59375, "learning_rate": 2.398856236547074e-06, "loss": 0.9207, "step": 401 }, { "epoch": 0.3104247104247104, "grad_norm": 3.984375, "learning_rate": 2.3982395540475023e-06, "loss": 1.0649, "step": 402 }, { "epoch": 0.3111969111969112, "grad_norm": 4.53125, "learning_rate": 2.397621077032427e-06, "loss": 1.1418, "step": 403 }, { "epoch": 0.311969111969112, "grad_norm": 6.8125, "learning_rate": 2.3970008064684285e-06, "loss": 0.8477, "step": 404 }, { "epoch": 0.3127413127413127, "grad_norm": 4.8125, "learning_rate": 2.3963787433248915e-06, "loss": 0.9938, "step": 405 }, { "epoch": 0.31351351351351353, "grad_norm": 3.75, "learning_rate": 2.395754888574001e-06, "loss": 0.8271, "step": 406 }, { "epoch": 0.3142857142857143, "grad_norm": 4.1875, "learning_rate": 2.395129243190743e-06, "loss": 1.0018, "step": 407 }, { "epoch": 0.3150579150579151, "grad_norm": 4.0625, "learning_rate": 2.3945018081529016e-06, "loss": 0.8369, "step": 408 }, { "epoch": 0.31583011583011583, "grad_norm": 3.6875, "learning_rate": 2.393872584441058e-06, "loss": 0.7837, "step": 409 }, { "epoch": 0.3166023166023166, "grad_norm": 3.859375, "learning_rate": 2.3932415730385876e-06, "loss": 0.6864, "step": 410 }, { "epoch": 0.3173745173745174, "grad_norm": 6.46875, "learning_rate": 2.392608774931662e-06, "loss": 0.9328, "step": 411 }, { "epoch": 0.31814671814671813, "grad_norm": 3.671875, "learning_rate": 2.391974191109243e-06, "loss": 0.7778, "step": 412 }, { "epoch": 0.31891891891891894, "grad_norm": 4.125, "learning_rate": 2.3913378225630847e-06, "loss": 0.8835, "step": 413 }, { "epoch": 0.3196911196911197, "grad_norm": 5.53125, "learning_rate": 2.3906996702877287e-06, "loss": 0.8366, "step": 414 }, { "epoch": 0.3204633204633205, "grad_norm": 4.84375, "learning_rate": 2.3900597352805068e-06, "loss": 0.8612, "step": 415 }, { "epoch": 0.32123552123552124, "grad_norm": 5.4375, "learning_rate": 2.389418018541534e-06, "loss": 0.8605, "step": 416 }, { "epoch": 0.322007722007722, "grad_norm": 4.6875, "learning_rate": 2.388774521073712e-06, "loss": 0.8301, "step": 417 }, { "epoch": 0.3227799227799228, "grad_norm": 6.90625, "learning_rate": 2.388129243882725e-06, "loss": 0.8409, "step": 418 }, { "epoch": 0.32355212355212354, "grad_norm": 4.90625, "learning_rate": 2.3874821879770384e-06, "loss": 0.9609, "step": 419 }, { "epoch": 0.32432432432432434, "grad_norm": 4.3125, "learning_rate": 2.386833354367897e-06, "loss": 1.0157, "step": 420 }, { "epoch": 0.3250965250965251, "grad_norm": 4.25, "learning_rate": 2.386182744069325e-06, "loss": 0.7306, "step": 421 }, { "epoch": 0.3258687258687259, "grad_norm": 3.84375, "learning_rate": 2.3855303580981225e-06, "loss": 0.9016, "step": 422 }, { "epoch": 0.32664092664092664, "grad_norm": 5.09375, "learning_rate": 2.3848761974738647e-06, "loss": 0.8901, "step": 423 }, { "epoch": 0.3274131274131274, "grad_norm": 4.03125, "learning_rate": 2.3842202632189007e-06, "loss": 0.8731, "step": 424 }, { "epoch": 0.3281853281853282, "grad_norm": 4.59375, "learning_rate": 2.383562556358351e-06, "loss": 0.9231, "step": 425 }, { "epoch": 0.32895752895752894, "grad_norm": 3.65625, "learning_rate": 2.382903077920106e-06, "loss": 1.0336, "step": 426 }, { "epoch": 0.32972972972972975, "grad_norm": 4.96875, "learning_rate": 2.382241828934827e-06, "loss": 0.9427, "step": 427 }, { "epoch": 0.3305019305019305, "grad_norm": 4.65625, "learning_rate": 2.3815788104359393e-06, "loss": 0.8664, "step": 428 }, { "epoch": 0.3312741312741313, "grad_norm": 4.40625, "learning_rate": 2.3809140234596366e-06, "loss": 1.0327, "step": 429 }, { "epoch": 0.33204633204633205, "grad_norm": 3.875, "learning_rate": 2.380247469044874e-06, "loss": 0.8935, "step": 430 }, { "epoch": 0.3328185328185328, "grad_norm": 3.796875, "learning_rate": 2.3795791482333697e-06, "loss": 0.8567, "step": 431 }, { "epoch": 0.3335907335907336, "grad_norm": 4.15625, "learning_rate": 2.3789090620696035e-06, "loss": 0.7822, "step": 432 }, { "epoch": 0.33436293436293435, "grad_norm": 24.625, "learning_rate": 2.3782372116008127e-06, "loss": 0.9705, "step": 433 }, { "epoch": 0.33513513513513515, "grad_norm": 4.03125, "learning_rate": 2.377563597876993e-06, "loss": 0.8513, "step": 434 }, { "epoch": 0.3359073359073359, "grad_norm": 6.75, "learning_rate": 2.3768882219508953e-06, "loss": 0.9354, "step": 435 }, { "epoch": 0.3366795366795367, "grad_norm": 6.78125, "learning_rate": 2.376211084878024e-06, "loss": 0.9943, "step": 436 }, { "epoch": 0.33745173745173745, "grad_norm": 5.09375, "learning_rate": 2.3755321877166372e-06, "loss": 0.8334, "step": 437 }, { "epoch": 0.3382239382239382, "grad_norm": 4.875, "learning_rate": 2.3748515315277426e-06, "loss": 0.9075, "step": 438 }, { "epoch": 0.338996138996139, "grad_norm": 3.90625, "learning_rate": 2.3741691173750973e-06, "loss": 0.7247, "step": 439 }, { "epoch": 0.33976833976833976, "grad_norm": 4.09375, "learning_rate": 2.373484946325207e-06, "loss": 0.9381, "step": 440 }, { "epoch": 0.34054054054054056, "grad_norm": 5.34375, "learning_rate": 2.37279901944732e-06, "loss": 1.0157, "step": 441 }, { "epoch": 0.3413127413127413, "grad_norm": 3.640625, "learning_rate": 2.3721113378134325e-06, "loss": 0.8448, "step": 442 }, { "epoch": 0.3420849420849421, "grad_norm": 4.09375, "learning_rate": 2.371421902498281e-06, "loss": 0.8529, "step": 443 }, { "epoch": 0.34285714285714286, "grad_norm": 4.03125, "learning_rate": 2.3707307145793424e-06, "loss": 0.9267, "step": 444 }, { "epoch": 0.3436293436293436, "grad_norm": 4.0625, "learning_rate": 2.3700377751368336e-06, "loss": 0.7751, "step": 445 }, { "epoch": 0.3444015444015444, "grad_norm": 4.21875, "learning_rate": 2.369343085253709e-06, "loss": 0.9323, "step": 446 }, { "epoch": 0.34517374517374516, "grad_norm": 4.28125, "learning_rate": 2.368646646015658e-06, "loss": 0.8029, "step": 447 }, { "epoch": 0.34594594594594597, "grad_norm": 4.0625, "learning_rate": 2.367948458511104e-06, "loss": 0.7469, "step": 448 }, { "epoch": 0.3467181467181467, "grad_norm": 4.625, "learning_rate": 2.3672485238312027e-06, "loss": 1.0202, "step": 449 }, { "epoch": 0.3474903474903475, "grad_norm": 4.375, "learning_rate": 2.366546843069841e-06, "loss": 0.9535, "step": 450 }, { "epoch": 0.34826254826254827, "grad_norm": 3.765625, "learning_rate": 2.365843417323634e-06, "loss": 0.8203, "step": 451 }, { "epoch": 0.349034749034749, "grad_norm": 4.21875, "learning_rate": 2.365138247691924e-06, "loss": 0.881, "step": 452 }, { "epoch": 0.3498069498069498, "grad_norm": 5.28125, "learning_rate": 2.364431335276778e-06, "loss": 1.001, "step": 453 }, { "epoch": 0.35057915057915057, "grad_norm": 7.78125, "learning_rate": 2.3637226811829887e-06, "loss": 0.8269, "step": 454 }, { "epoch": 0.35135135135135137, "grad_norm": 6.75, "learning_rate": 2.3630122865180695e-06, "loss": 0.8499, "step": 455 }, { "epoch": 0.3521235521235521, "grad_norm": 5.78125, "learning_rate": 2.3623001523922533e-06, "loss": 0.8538, "step": 456 }, { "epoch": 0.3528957528957529, "grad_norm": 3.71875, "learning_rate": 2.361586279918493e-06, "loss": 0.8859, "step": 457 }, { "epoch": 0.35366795366795367, "grad_norm": 8.125, "learning_rate": 2.3608706702124573e-06, "loss": 0.9498, "step": 458 }, { "epoch": 0.3544401544401544, "grad_norm": 4.15625, "learning_rate": 2.36015332439253e-06, "loss": 0.8066, "step": 459 }, { "epoch": 0.3552123552123552, "grad_norm": 4.46875, "learning_rate": 2.3594342435798085e-06, "loss": 0.9654, "step": 460 }, { "epoch": 0.355984555984556, "grad_norm": 4.78125, "learning_rate": 2.3587134288981014e-06, "loss": 0.9392, "step": 461 }, { "epoch": 0.3567567567567568, "grad_norm": 3.921875, "learning_rate": 2.357990881473927e-06, "loss": 0.8615, "step": 462 }, { "epoch": 0.3575289575289575, "grad_norm": 4.15625, "learning_rate": 2.3572666024365125e-06, "loss": 0.894, "step": 463 }, { "epoch": 0.3583011583011583, "grad_norm": 4.0625, "learning_rate": 2.3565405929177908e-06, "loss": 0.8433, "step": 464 }, { "epoch": 0.3590733590733591, "grad_norm": 4.25, "learning_rate": 2.355812854052398e-06, "loss": 0.8094, "step": 465 }, { "epoch": 0.3598455598455598, "grad_norm": 31.625, "learning_rate": 2.3550833869776753e-06, "loss": 0.858, "step": 466 }, { "epoch": 0.36061776061776063, "grad_norm": 5.78125, "learning_rate": 2.354352192833662e-06, "loss": 1.2154, "step": 467 }, { "epoch": 0.3613899613899614, "grad_norm": 4.65625, "learning_rate": 2.3536192727631e-06, "loss": 0.9887, "step": 468 }, { "epoch": 0.3621621621621622, "grad_norm": 4.75, "learning_rate": 2.352884627911425e-06, "loss": 0.8957, "step": 469 }, { "epoch": 0.36293436293436293, "grad_norm": 4.125, "learning_rate": 2.35214825942677e-06, "loss": 0.7956, "step": 470 }, { "epoch": 0.3637065637065637, "grad_norm": 3.75, "learning_rate": 2.3514101684599627e-06, "loss": 0.7734, "step": 471 }, { "epoch": 0.3644787644787645, "grad_norm": 3.828125, "learning_rate": 2.3506703561645207e-06, "loss": 0.7991, "step": 472 }, { "epoch": 0.36525096525096523, "grad_norm": 5.84375, "learning_rate": 2.3499288236966535e-06, "loss": 0.8616, "step": 473 }, { "epoch": 0.36602316602316604, "grad_norm": 4.75, "learning_rate": 2.349185572215258e-06, "loss": 1.0298, "step": 474 }, { "epoch": 0.3667953667953668, "grad_norm": 4.28125, "learning_rate": 2.348440602881917e-06, "loss": 1.1332, "step": 475 }, { "epoch": 0.3675675675675676, "grad_norm": 4.25, "learning_rate": 2.3476939168609005e-06, "loss": 0.9983, "step": 476 }, { "epoch": 0.36833976833976834, "grad_norm": 4.6875, "learning_rate": 2.346945515319158e-06, "loss": 0.8933, "step": 477 }, { "epoch": 0.3691119691119691, "grad_norm": 4.15625, "learning_rate": 2.346195399426323e-06, "loss": 0.9386, "step": 478 }, { "epoch": 0.3698841698841699, "grad_norm": 4.0625, "learning_rate": 2.3454435703547072e-06, "loss": 0.922, "step": 479 }, { "epoch": 0.37065637065637064, "grad_norm": 4.03125, "learning_rate": 2.3446900292792997e-06, "loss": 0.8974, "step": 480 }, { "epoch": 0.37142857142857144, "grad_norm": 5.21875, "learning_rate": 2.3439347773777644e-06, "loss": 0.938, "step": 481 }, { "epoch": 0.3722007722007722, "grad_norm": 3.859375, "learning_rate": 2.3431778158304404e-06, "loss": 0.8837, "step": 482 }, { "epoch": 0.372972972972973, "grad_norm": 4.84375, "learning_rate": 2.3424191458203375e-06, "loss": 1.014, "step": 483 }, { "epoch": 0.37374517374517374, "grad_norm": 4.21875, "learning_rate": 2.341658768533137e-06, "loss": 0.8851, "step": 484 }, { "epoch": 0.3745173745173745, "grad_norm": 4.0, "learning_rate": 2.340896685157186e-06, "loss": 0.9704, "step": 485 }, { "epoch": 0.3752895752895753, "grad_norm": 4.65625, "learning_rate": 2.340132896883501e-06, "loss": 0.9962, "step": 486 }, { "epoch": 0.37606177606177604, "grad_norm": 7.6875, "learning_rate": 2.33936740490576e-06, "loss": 0.904, "step": 487 }, { "epoch": 0.37683397683397685, "grad_norm": 4.625, "learning_rate": 2.338600210420306e-06, "loss": 0.8123, "step": 488 }, { "epoch": 0.3776061776061776, "grad_norm": 6.40625, "learning_rate": 2.3378313146261413e-06, "loss": 0.8026, "step": 489 }, { "epoch": 0.3783783783783784, "grad_norm": 4.9375, "learning_rate": 2.3370607187249277e-06, "loss": 0.8459, "step": 490 }, { "epoch": 0.37915057915057915, "grad_norm": 5.9375, "learning_rate": 2.3362884239209836e-06, "loss": 0.8981, "step": 491 }, { "epoch": 0.3799227799227799, "grad_norm": 4.5625, "learning_rate": 2.3355144314212835e-06, "loss": 0.862, "step": 492 }, { "epoch": 0.3806949806949807, "grad_norm": 4.15625, "learning_rate": 2.3347387424354533e-06, "loss": 1.0107, "step": 493 }, { "epoch": 0.38146718146718145, "grad_norm": 3.625, "learning_rate": 2.333961358175772e-06, "loss": 0.8362, "step": 494 }, { "epoch": 0.38223938223938225, "grad_norm": 4.03125, "learning_rate": 2.333182279857167e-06, "loss": 0.9245, "step": 495 }, { "epoch": 0.383011583011583, "grad_norm": 3.8125, "learning_rate": 2.332401508697214e-06, "loss": 0.9272, "step": 496 }, { "epoch": 0.3837837837837838, "grad_norm": 3.78125, "learning_rate": 2.3316190459161337e-06, "loss": 0.7451, "step": 497 }, { "epoch": 0.38455598455598455, "grad_norm": 4.875, "learning_rate": 2.330834892736791e-06, "loss": 0.9196, "step": 498 }, { "epoch": 0.3853281853281853, "grad_norm": 3.984375, "learning_rate": 2.330049050384692e-06, "loss": 0.7831, "step": 499 }, { "epoch": 0.3861003861003861, "grad_norm": 4.40625, "learning_rate": 2.329261520087983e-06, "loss": 0.9362, "step": 500 }, { "epoch": 0.38687258687258685, "grad_norm": 4.3125, "learning_rate": 2.328472303077449e-06, "loss": 0.9166, "step": 501 }, { "epoch": 0.38764478764478766, "grad_norm": 4.71875, "learning_rate": 2.3276814005865102e-06, "loss": 0.9465, "step": 502 }, { "epoch": 0.3884169884169884, "grad_norm": 3.875, "learning_rate": 2.326888813851221e-06, "loss": 0.879, "step": 503 }, { "epoch": 0.3891891891891892, "grad_norm": 3.734375, "learning_rate": 2.326094544110269e-06, "loss": 0.8614, "step": 504 }, { "epoch": 0.38996138996138996, "grad_norm": 5.625, "learning_rate": 2.32529859260497e-06, "loss": 0.9224, "step": 505 }, { "epoch": 0.3907335907335907, "grad_norm": 4.84375, "learning_rate": 2.32450096057927e-06, "loss": 0.9693, "step": 506 }, { "epoch": 0.3915057915057915, "grad_norm": 5.25, "learning_rate": 2.3237016492797413e-06, "loss": 1.0179, "step": 507 }, { "epoch": 0.39227799227799226, "grad_norm": 5.3125, "learning_rate": 2.3229006599555796e-06, "loss": 0.8103, "step": 508 }, { "epoch": 0.39305019305019306, "grad_norm": 5.0625, "learning_rate": 2.3220979938586034e-06, "loss": 0.8732, "step": 509 }, { "epoch": 0.3938223938223938, "grad_norm": 5.0, "learning_rate": 2.321293652243252e-06, "loss": 0.9533, "step": 510 }, { "epoch": 0.3945945945945946, "grad_norm": 4.09375, "learning_rate": 2.320487636366584e-06, "loss": 0.9828, "step": 511 }, { "epoch": 0.39536679536679536, "grad_norm": 4.5, "learning_rate": 2.319679947488273e-06, "loss": 0.9896, "step": 512 }, { "epoch": 0.3961389961389961, "grad_norm": 4.09375, "learning_rate": 2.3188705868706086e-06, "loss": 0.9307, "step": 513 }, { "epoch": 0.3969111969111969, "grad_norm": 4.21875, "learning_rate": 2.318059555778492e-06, "loss": 0.9179, "step": 514 }, { "epoch": 0.39768339768339767, "grad_norm": 4.125, "learning_rate": 2.3172468554794357e-06, "loss": 0.9221, "step": 515 }, { "epoch": 0.39845559845559847, "grad_norm": 4.1875, "learning_rate": 2.316432487243561e-06, "loss": 1.0201, "step": 516 }, { "epoch": 0.3992277992277992, "grad_norm": 3.90625, "learning_rate": 2.315616452343596e-06, "loss": 0.903, "step": 517 }, { "epoch": 0.4, "grad_norm": 4.28125, "learning_rate": 2.3147987520548725e-06, "loss": 0.9827, "step": 518 }, { "epoch": 0.40077220077220077, "grad_norm": 4.03125, "learning_rate": 2.313979387655326e-06, "loss": 0.8319, "step": 519 }, { "epoch": 0.4015444015444015, "grad_norm": 4.59375, "learning_rate": 2.3131583604254933e-06, "loss": 0.9477, "step": 520 }, { "epoch": 0.4023166023166023, "grad_norm": 3.78125, "learning_rate": 2.312335671648508e-06, "loss": 0.8499, "step": 521 }, { "epoch": 0.40308880308880307, "grad_norm": 3.875, "learning_rate": 2.3115113226101026e-06, "loss": 0.8115, "step": 522 }, { "epoch": 0.4038610038610039, "grad_norm": 4.09375, "learning_rate": 2.3106853145986025e-06, "loss": 0.9968, "step": 523 }, { "epoch": 0.4046332046332046, "grad_norm": 4.375, "learning_rate": 2.3098576489049274e-06, "loss": 0.8129, "step": 524 }, { "epoch": 0.40540540540540543, "grad_norm": 3.640625, "learning_rate": 2.309028326822586e-06, "loss": 0.7694, "step": 525 }, { "epoch": 0.4061776061776062, "grad_norm": 4.0, "learning_rate": 2.308197349647678e-06, "loss": 0.9073, "step": 526 }, { "epoch": 0.4069498069498069, "grad_norm": 4.75, "learning_rate": 2.3073647186788866e-06, "loss": 0.8578, "step": 527 }, { "epoch": 0.40772200772200773, "grad_norm": 4.3125, "learning_rate": 2.3065304352174827e-06, "loss": 0.8417, "step": 528 }, { "epoch": 0.4084942084942085, "grad_norm": 4.34375, "learning_rate": 2.3056945005673183e-06, "loss": 0.8781, "step": 529 }, { "epoch": 0.4092664092664093, "grad_norm": 7.90625, "learning_rate": 2.3048569160348255e-06, "loss": 0.722, "step": 530 }, { "epoch": 0.41003861003861003, "grad_norm": 4.03125, "learning_rate": 2.3040176829290154e-06, "loss": 0.9661, "step": 531 }, { "epoch": 0.41081081081081083, "grad_norm": 4.40625, "learning_rate": 2.303176802561477e-06, "loss": 1.0005, "step": 532 }, { "epoch": 0.4115830115830116, "grad_norm": 4.3125, "learning_rate": 2.3023342762463708e-06, "loss": 0.9264, "step": 533 }, { "epoch": 0.41235521235521233, "grad_norm": 7.625, "learning_rate": 2.301490105300432e-06, "loss": 0.9187, "step": 534 }, { "epoch": 0.41312741312741313, "grad_norm": 4.90625, "learning_rate": 2.3006442910429648e-06, "loss": 0.8986, "step": 535 }, { "epoch": 0.4138996138996139, "grad_norm": 5.34375, "learning_rate": 2.2997968347958427e-06, "loss": 0.9642, "step": 536 }, { "epoch": 0.4146718146718147, "grad_norm": 5.09375, "learning_rate": 2.2989477378835045e-06, "loss": 0.765, "step": 537 }, { "epoch": 0.41544401544401544, "grad_norm": 3.75, "learning_rate": 2.2980970016329535e-06, "loss": 0.9346, "step": 538 }, { "epoch": 0.41621621621621624, "grad_norm": 69.5, "learning_rate": 2.2972446273737554e-06, "loss": 0.8685, "step": 539 }, { "epoch": 0.416988416988417, "grad_norm": 5.375, "learning_rate": 2.296390616438034e-06, "loss": 0.7781, "step": 540 }, { "epoch": 0.41776061776061774, "grad_norm": 3.75, "learning_rate": 2.295534970160474e-06, "loss": 0.9369, "step": 541 }, { "epoch": 0.41853281853281854, "grad_norm": 4.90625, "learning_rate": 2.294677689878313e-06, "loss": 0.9454, "step": 542 }, { "epoch": 0.4193050193050193, "grad_norm": 4.84375, "learning_rate": 2.293818776931345e-06, "loss": 0.962, "step": 543 }, { "epoch": 0.4200772200772201, "grad_norm": 4.84375, "learning_rate": 2.292958232661912e-06, "loss": 1.0271, "step": 544 }, { "epoch": 0.42084942084942084, "grad_norm": 3.84375, "learning_rate": 2.292096058414909e-06, "loss": 0.7849, "step": 545 }, { "epoch": 0.42162162162162165, "grad_norm": 6.34375, "learning_rate": 2.2912322555377765e-06, "loss": 1.0011, "step": 546 }, { "epoch": 0.4223938223938224, "grad_norm": 4.34375, "learning_rate": 2.2903668253805016e-06, "loss": 0.9098, "step": 547 }, { "epoch": 0.42316602316602314, "grad_norm": 4.0625, "learning_rate": 2.289499769295613e-06, "loss": 1.0212, "step": 548 }, { "epoch": 0.42393822393822395, "grad_norm": 10.8125, "learning_rate": 2.2886310886381806e-06, "loss": 0.7441, "step": 549 }, { "epoch": 0.4247104247104247, "grad_norm": 4.15625, "learning_rate": 2.2877607847658156e-06, "loss": 0.813, "step": 550 }, { "epoch": 0.4254826254826255, "grad_norm": 4.03125, "learning_rate": 2.286888859038663e-06, "loss": 0.957, "step": 551 }, { "epoch": 0.42625482625482625, "grad_norm": 4.5, "learning_rate": 2.2860153128194043e-06, "loss": 1.0001, "step": 552 }, { "epoch": 0.42702702702702705, "grad_norm": 3.640625, "learning_rate": 2.285140147473253e-06, "loss": 0.8173, "step": 553 }, { "epoch": 0.4277992277992278, "grad_norm": 4.6875, "learning_rate": 2.2842633643679528e-06, "loss": 0.9316, "step": 554 }, { "epoch": 0.42857142857142855, "grad_norm": 4.375, "learning_rate": 2.283384964873777e-06, "loss": 0.9585, "step": 555 }, { "epoch": 0.42934362934362935, "grad_norm": 3.640625, "learning_rate": 2.2825049503635225e-06, "loss": 0.7838, "step": 556 }, { "epoch": 0.4301158301158301, "grad_norm": 4.96875, "learning_rate": 2.2816233222125134e-06, "loss": 0.9329, "step": 557 }, { "epoch": 0.4308880308880309, "grad_norm": 4.84375, "learning_rate": 2.280740081798593e-06, "loss": 0.9601, "step": 558 }, { "epoch": 0.43166023166023165, "grad_norm": 4.4375, "learning_rate": 2.2798552305021258e-06, "loss": 1.052, "step": 559 }, { "epoch": 0.43243243243243246, "grad_norm": 5.125, "learning_rate": 2.2789687697059937e-06, "loss": 0.913, "step": 560 }, { "epoch": 0.4332046332046332, "grad_norm": 4.625, "learning_rate": 2.2780807007955928e-06, "loss": 0.8333, "step": 561 }, { "epoch": 0.43397683397683395, "grad_norm": 5.5, "learning_rate": 2.2771910251588343e-06, "loss": 0.9745, "step": 562 }, { "epoch": 0.43474903474903476, "grad_norm": 3.8125, "learning_rate": 2.2762997441861385e-06, "loss": 0.8421, "step": 563 }, { "epoch": 0.4355212355212355, "grad_norm": 6.4375, "learning_rate": 2.2754068592704363e-06, "loss": 0.968, "step": 564 }, { "epoch": 0.4362934362934363, "grad_norm": 4.96875, "learning_rate": 2.2745123718071645e-06, "loss": 1.0062, "step": 565 }, { "epoch": 0.43706563706563706, "grad_norm": 4.71875, "learning_rate": 2.2736162831942645e-06, "loss": 0.9998, "step": 566 }, { "epoch": 0.43783783783783786, "grad_norm": 4.21875, "learning_rate": 2.27271859483218e-06, "loss": 0.8232, "step": 567 }, { "epoch": 0.4386100386100386, "grad_norm": 4.5, "learning_rate": 2.271819308123855e-06, "loss": 0.9366, "step": 568 }, { "epoch": 0.43938223938223936, "grad_norm": 3.84375, "learning_rate": 2.2709184244747307e-06, "loss": 0.9306, "step": 569 }, { "epoch": 0.44015444015444016, "grad_norm": 4.46875, "learning_rate": 2.2700159452927456e-06, "loss": 0.8685, "step": 570 }, { "epoch": 0.4409266409266409, "grad_norm": 4.125, "learning_rate": 2.2691118719883307e-06, "loss": 0.9678, "step": 571 }, { "epoch": 0.4416988416988417, "grad_norm": 11.4375, "learning_rate": 2.268206205974408e-06, "loss": 0.849, "step": 572 }, { "epoch": 0.44247104247104246, "grad_norm": 3.71875, "learning_rate": 2.2672989486663903e-06, "loss": 0.8055, "step": 573 }, { "epoch": 0.44324324324324327, "grad_norm": 3.671875, "learning_rate": 2.2663901014821748e-06, "loss": 0.8229, "step": 574 }, { "epoch": 0.444015444015444, "grad_norm": 4.34375, "learning_rate": 2.2654796658421464e-06, "loss": 0.8422, "step": 575 }, { "epoch": 0.44478764478764476, "grad_norm": 4.5625, "learning_rate": 2.2645676431691698e-06, "loss": 0.8994, "step": 576 }, { "epoch": 0.44555984555984557, "grad_norm": 4.46875, "learning_rate": 2.2636540348885913e-06, "loss": 0.8931, "step": 577 }, { "epoch": 0.4463320463320463, "grad_norm": 4.03125, "learning_rate": 2.2627388424282357e-06, "loss": 0.9011, "step": 578 }, { "epoch": 0.4471042471042471, "grad_norm": 4.0, "learning_rate": 2.2618220672184026e-06, "loss": 0.9412, "step": 579 }, { "epoch": 0.44787644787644787, "grad_norm": 4.59375, "learning_rate": 2.2609037106918658e-06, "loss": 0.787, "step": 580 }, { "epoch": 0.4486486486486487, "grad_norm": 4.03125, "learning_rate": 2.2599837742838697e-06, "loss": 1.0205, "step": 581 }, { "epoch": 0.4494208494208494, "grad_norm": 4.03125, "learning_rate": 2.259062259432129e-06, "loss": 0.879, "step": 582 }, { "epoch": 0.45019305019305017, "grad_norm": 5.03125, "learning_rate": 2.2581391675768247e-06, "loss": 1.0534, "step": 583 }, { "epoch": 0.450965250965251, "grad_norm": 12.0, "learning_rate": 2.2572145001606015e-06, "loss": 0.8696, "step": 584 }, { "epoch": 0.4517374517374517, "grad_norm": 6.40625, "learning_rate": 2.256288258628568e-06, "loss": 0.9971, "step": 585 }, { "epoch": 0.4525096525096525, "grad_norm": 3.9375, "learning_rate": 2.255360444428292e-06, "loss": 0.8969, "step": 586 }, { "epoch": 0.4532818532818533, "grad_norm": 4.125, "learning_rate": 2.2544310590097994e-06, "loss": 0.8744, "step": 587 }, { "epoch": 0.4540540540540541, "grad_norm": 3.953125, "learning_rate": 2.2535001038255714e-06, "loss": 0.9437, "step": 588 }, { "epoch": 0.4548262548262548, "grad_norm": 4.53125, "learning_rate": 2.2525675803305423e-06, "loss": 0.961, "step": 589 }, { "epoch": 0.4555984555984556, "grad_norm": 3.6875, "learning_rate": 2.2516334899820987e-06, "loss": 0.8956, "step": 590 }, { "epoch": 0.4563706563706564, "grad_norm": 4.0625, "learning_rate": 2.2506978342400736e-06, "loss": 0.9129, "step": 591 }, { "epoch": 0.45714285714285713, "grad_norm": 3.953125, "learning_rate": 2.2497606145667494e-06, "loss": 0.9902, "step": 592 }, { "epoch": 0.45791505791505793, "grad_norm": 3.703125, "learning_rate": 2.2488218324268504e-06, "loss": 0.8153, "step": 593 }, { "epoch": 0.4586872586872587, "grad_norm": 5.9375, "learning_rate": 2.247881489287543e-06, "loss": 0.8559, "step": 594 }, { "epoch": 0.4594594594594595, "grad_norm": 5.90625, "learning_rate": 2.2469395866184347e-06, "loss": 0.8832, "step": 595 }, { "epoch": 0.46023166023166023, "grad_norm": 4.34375, "learning_rate": 2.2459961258915687e-06, "loss": 0.9839, "step": 596 }, { "epoch": 0.461003861003861, "grad_norm": 4.09375, "learning_rate": 2.245051108581424e-06, "loss": 0.9553, "step": 597 }, { "epoch": 0.4617760617760618, "grad_norm": 4.71875, "learning_rate": 2.244104536164912e-06, "loss": 0.9535, "step": 598 }, { "epoch": 0.46254826254826253, "grad_norm": 4.21875, "learning_rate": 2.2431564101213756e-06, "loss": 0.9903, "step": 599 }, { "epoch": 0.46332046332046334, "grad_norm": 5.03125, "learning_rate": 2.242206731932583e-06, "loss": 1.006, "step": 600 }, { "epoch": 0.4640926640926641, "grad_norm": 4.1875, "learning_rate": 2.2412555030827315e-06, "loss": 0.908, "step": 601 }, { "epoch": 0.4648648648648649, "grad_norm": 4.25, "learning_rate": 2.2403027250584393e-06, "loss": 0.9642, "step": 602 }, { "epoch": 0.46563706563706564, "grad_norm": 4.0625, "learning_rate": 2.239348399348747e-06, "loss": 0.7784, "step": 603 }, { "epoch": 0.4664092664092664, "grad_norm": 3.890625, "learning_rate": 2.2383925274451134e-06, "loss": 0.9789, "step": 604 }, { "epoch": 0.4671814671814672, "grad_norm": 4.71875, "learning_rate": 2.237435110841414e-06, "loss": 0.9335, "step": 605 }, { "epoch": 0.46795366795366794, "grad_norm": 4.1875, "learning_rate": 2.2364761510339394e-06, "loss": 0.8024, "step": 606 }, { "epoch": 0.46872586872586874, "grad_norm": 7.84375, "learning_rate": 2.23551564952139e-06, "loss": 0.8114, "step": 607 }, { "epoch": 0.4694980694980695, "grad_norm": 3.8125, "learning_rate": 2.2345536078048767e-06, "loss": 0.8505, "step": 608 }, { "epoch": 0.4702702702702703, "grad_norm": 4.4375, "learning_rate": 2.233590027387918e-06, "loss": 0.7705, "step": 609 }, { "epoch": 0.47104247104247104, "grad_norm": 4.3125, "learning_rate": 2.2326249097764358e-06, "loss": 0.921, "step": 610 }, { "epoch": 0.4718146718146718, "grad_norm": 3.921875, "learning_rate": 2.231658256478756e-06, "loss": 0.8355, "step": 611 }, { "epoch": 0.4725868725868726, "grad_norm": 5.15625, "learning_rate": 2.230690069005603e-06, "loss": 0.9661, "step": 612 }, { "epoch": 0.47335907335907335, "grad_norm": 5.5625, "learning_rate": 2.2297203488701003e-06, "loss": 0.8782, "step": 613 }, { "epoch": 0.47413127413127415, "grad_norm": 4.375, "learning_rate": 2.2287490975877655e-06, "loss": 0.9556, "step": 614 }, { "epoch": 0.4749034749034749, "grad_norm": 4.625, "learning_rate": 2.2277763166765097e-06, "loss": 0.8952, "step": 615 }, { "epoch": 0.4756756756756757, "grad_norm": 4.40625, "learning_rate": 2.2268020076566345e-06, "loss": 0.8588, "step": 616 }, { "epoch": 0.47644787644787645, "grad_norm": 4.21875, "learning_rate": 2.22582617205083e-06, "loss": 0.922, "step": 617 }, { "epoch": 0.4772200772200772, "grad_norm": 4.0625, "learning_rate": 2.2248488113841714e-06, "loss": 0.9773, "step": 618 }, { "epoch": 0.477992277992278, "grad_norm": 4.3125, "learning_rate": 2.223869927184118e-06, "loss": 0.9647, "step": 619 }, { "epoch": 0.47876447876447875, "grad_norm": 4.53125, "learning_rate": 2.2228895209805097e-06, "loss": 1.0195, "step": 620 }, { "epoch": 0.47953667953667956, "grad_norm": 4.375, "learning_rate": 2.2219075943055647e-06, "loss": 0.8882, "step": 621 }, { "epoch": 0.4803088803088803, "grad_norm": 10.0625, "learning_rate": 2.220924148693879e-06, "loss": 0.8442, "step": 622 }, { "epoch": 0.4810810810810811, "grad_norm": 4.6875, "learning_rate": 2.219939185682421e-06, "loss": 0.9303, "step": 623 }, { "epoch": 0.48185328185328186, "grad_norm": 3.984375, "learning_rate": 2.2189527068105305e-06, "loss": 0.9598, "step": 624 }, { "epoch": 0.4826254826254826, "grad_norm": 5.21875, "learning_rate": 2.217964713619918e-06, "loss": 0.8684, "step": 625 }, { "epoch": 0.4833976833976834, "grad_norm": 3.921875, "learning_rate": 2.216975207654658e-06, "loss": 0.869, "step": 626 }, { "epoch": 0.48416988416988416, "grad_norm": 4.15625, "learning_rate": 2.2159841904611913e-06, "loss": 0.8563, "step": 627 }, { "epoch": 0.48494208494208496, "grad_norm": 4.40625, "learning_rate": 2.2149916635883206e-06, "loss": 0.893, "step": 628 }, { "epoch": 0.4857142857142857, "grad_norm": 4.0625, "learning_rate": 2.213997628587206e-06, "loss": 0.9176, "step": 629 }, { "epoch": 0.4864864864864865, "grad_norm": 4.65625, "learning_rate": 2.213002087011367e-06, "loss": 1.0375, "step": 630 }, { "epoch": 0.48725868725868726, "grad_norm": 4.90625, "learning_rate": 2.212005040416676e-06, "loss": 0.9166, "step": 631 }, { "epoch": 0.488030888030888, "grad_norm": 4.90625, "learning_rate": 2.2110064903613584e-06, "loss": 0.8619, "step": 632 }, { "epoch": 0.4888030888030888, "grad_norm": 3.5625, "learning_rate": 2.210006438405988e-06, "loss": 0.7963, "step": 633 }, { "epoch": 0.48957528957528956, "grad_norm": 4.71875, "learning_rate": 2.2090048861134873e-06, "loss": 0.8451, "step": 634 }, { "epoch": 0.49034749034749037, "grad_norm": 3.84375, "learning_rate": 2.2080018350491227e-06, "loss": 0.9564, "step": 635 }, { "epoch": 0.4911196911196911, "grad_norm": 3.796875, "learning_rate": 2.2069972867805035e-06, "loss": 0.8859, "step": 636 }, { "epoch": 0.4918918918918919, "grad_norm": 3.40625, "learning_rate": 2.205991242877578e-06, "loss": 0.8054, "step": 637 }, { "epoch": 0.49266409266409267, "grad_norm": 3.84375, "learning_rate": 2.2049837049126338e-06, "loss": 0.9111, "step": 638 }, { "epoch": 0.4934362934362934, "grad_norm": 4.4375, "learning_rate": 2.2039746744602915e-06, "loss": 0.8474, "step": 639 }, { "epoch": 0.4942084942084942, "grad_norm": 5.875, "learning_rate": 2.2029641530975043e-06, "loss": 0.9289, "step": 640 }, { "epoch": 0.49498069498069497, "grad_norm": 4.03125, "learning_rate": 2.201952142403557e-06, "loss": 0.7471, "step": 641 }, { "epoch": 0.4957528957528958, "grad_norm": 4.96875, "learning_rate": 2.200938643960061e-06, "loss": 0.7788, "step": 642 }, { "epoch": 0.4965250965250965, "grad_norm": 5.5625, "learning_rate": 2.1999236593509527e-06, "loss": 0.8834, "step": 643 }, { "epoch": 0.4972972972972973, "grad_norm": 14.1875, "learning_rate": 2.1989071901624913e-06, "loss": 1.0912, "step": 644 }, { "epoch": 0.4980694980694981, "grad_norm": 4.125, "learning_rate": 2.1978892379832566e-06, "loss": 0.9463, "step": 645 }, { "epoch": 0.4988416988416988, "grad_norm": 4.625, "learning_rate": 2.1968698044041455e-06, "loss": 0.8939, "step": 646 }, { "epoch": 0.4996138996138996, "grad_norm": 3.875, "learning_rate": 2.1958488910183703e-06, "loss": 0.7966, "step": 647 }, { "epoch": 0.5003861003861004, "grad_norm": 4.28125, "learning_rate": 2.194826499421455e-06, "loss": 0.885, "step": 648 }, { "epoch": 0.5011583011583012, "grad_norm": 3.609375, "learning_rate": 2.1938026312112364e-06, "loss": 0.847, "step": 649 }, { "epoch": 0.5019305019305019, "grad_norm": 4.125, "learning_rate": 2.1927772879878567e-06, "loss": 1.0307, "step": 650 }, { "epoch": 0.5027027027027027, "grad_norm": 4.15625, "learning_rate": 2.1917504713537637e-06, "loss": 0.9233, "step": 651 }, { "epoch": 0.5034749034749034, "grad_norm": 4.875, "learning_rate": 2.1907221829137087e-06, "loss": 0.783, "step": 652 }, { "epoch": 0.5042471042471043, "grad_norm": 4.34375, "learning_rate": 2.1896924242747424e-06, "loss": 1.0317, "step": 653 }, { "epoch": 0.505019305019305, "grad_norm": 3.78125, "learning_rate": 2.188661197046214e-06, "loss": 0.7628, "step": 654 }, { "epoch": 0.5057915057915058, "grad_norm": 3.796875, "learning_rate": 2.187628502839767e-06, "loss": 0.8157, "step": 655 }, { "epoch": 0.5065637065637065, "grad_norm": 4.1875, "learning_rate": 2.1865943432693377e-06, "loss": 0.8844, "step": 656 }, { "epoch": 0.5073359073359074, "grad_norm": 3.640625, "learning_rate": 2.1855587199511534e-06, "loss": 0.9356, "step": 657 }, { "epoch": 0.5081081081081081, "grad_norm": 4.875, "learning_rate": 2.1845216345037275e-06, "loss": 0.9348, "step": 658 }, { "epoch": 0.5088803088803089, "grad_norm": 4.09375, "learning_rate": 2.1834830885478605e-06, "loss": 0.7272, "step": 659 }, { "epoch": 0.5096525096525096, "grad_norm": 6.0, "learning_rate": 2.1824430837066327e-06, "loss": 0.9228, "step": 660 }, { "epoch": 0.5104247104247104, "grad_norm": 4.59375, "learning_rate": 2.1814016216054077e-06, "loss": 1.0445, "step": 661 }, { "epoch": 0.5111969111969112, "grad_norm": 4.3125, "learning_rate": 2.1803587038718236e-06, "loss": 0.9305, "step": 662 }, { "epoch": 0.511969111969112, "grad_norm": 3.984375, "learning_rate": 2.179314332135796e-06, "loss": 0.8631, "step": 663 }, { "epoch": 0.5127413127413127, "grad_norm": 4.78125, "learning_rate": 2.17826850802951e-06, "loss": 1.0349, "step": 664 }, { "epoch": 0.5135135135135135, "grad_norm": 3.546875, "learning_rate": 2.1772212331874225e-06, "loss": 0.8361, "step": 665 }, { "epoch": 0.5142857142857142, "grad_norm": 5.34375, "learning_rate": 2.176172509246258e-06, "loss": 1.017, "step": 666 }, { "epoch": 0.5150579150579151, "grad_norm": 3.546875, "learning_rate": 2.175122337845005e-06, "loss": 0.7892, "step": 667 }, { "epoch": 0.5158301158301158, "grad_norm": 4.75, "learning_rate": 2.1740707206249127e-06, "loss": 0.8844, "step": 668 }, { "epoch": 0.5166023166023166, "grad_norm": 28.125, "learning_rate": 2.1730176592294926e-06, "loss": 0.8989, "step": 669 }, { "epoch": 0.5173745173745173, "grad_norm": 4.0, "learning_rate": 2.1719631553045113e-06, "loss": 0.8026, "step": 670 }, { "epoch": 0.5181467181467182, "grad_norm": 4.3125, "learning_rate": 2.1709072104979905e-06, "loss": 0.796, "step": 671 }, { "epoch": 0.518918918918919, "grad_norm": 4.0625, "learning_rate": 2.1698498264602037e-06, "loss": 0.8918, "step": 672 }, { "epoch": 0.5196911196911197, "grad_norm": 4.65625, "learning_rate": 2.1687910048436743e-06, "loss": 0.8664, "step": 673 }, { "epoch": 0.5204633204633204, "grad_norm": 4.4375, "learning_rate": 2.167730747303171e-06, "loss": 0.9302, "step": 674 }, { "epoch": 0.5212355212355212, "grad_norm": 4.09375, "learning_rate": 2.1666690554957083e-06, "loss": 0.8622, "step": 675 }, { "epoch": 0.522007722007722, "grad_norm": 4.53125, "learning_rate": 2.1656059310805406e-06, "loss": 0.9829, "step": 676 }, { "epoch": 0.5227799227799228, "grad_norm": 7.84375, "learning_rate": 2.164541375719162e-06, "loss": 0.9373, "step": 677 }, { "epoch": 0.5235521235521235, "grad_norm": 5.0, "learning_rate": 2.1634753910753032e-06, "loss": 0.9508, "step": 678 }, { "epoch": 0.5243243243243243, "grad_norm": 3.8125, "learning_rate": 2.162407978814929e-06, "loss": 0.8652, "step": 679 }, { "epoch": 0.525096525096525, "grad_norm": 5.0, "learning_rate": 2.1613391406062334e-06, "loss": 0.8193, "step": 680 }, { "epoch": 0.5258687258687259, "grad_norm": 8.5, "learning_rate": 2.160268878119641e-06, "loss": 0.8315, "step": 681 }, { "epoch": 0.5266409266409267, "grad_norm": 6.96875, "learning_rate": 2.159197193027802e-06, "loss": 0.9212, "step": 682 }, { "epoch": 0.5274131274131274, "grad_norm": 4.9375, "learning_rate": 2.1581240870055887e-06, "loss": 0.7926, "step": 683 }, { "epoch": 0.5281853281853282, "grad_norm": 22.5, "learning_rate": 2.1570495617300953e-06, "loss": 0.9643, "step": 684 }, { "epoch": 0.528957528957529, "grad_norm": 5.84375, "learning_rate": 2.155973618880633e-06, "loss": 0.822, "step": 685 }, { "epoch": 0.5297297297297298, "grad_norm": 3.484375, "learning_rate": 2.15489626013873e-06, "loss": 0.819, "step": 686 }, { "epoch": 0.5305019305019305, "grad_norm": 4.3125, "learning_rate": 2.1538174871881256e-06, "loss": 0.894, "step": 687 }, { "epoch": 0.5312741312741313, "grad_norm": 16.0, "learning_rate": 2.1527373017147707e-06, "loss": 0.9325, "step": 688 }, { "epoch": 0.532046332046332, "grad_norm": 3.96875, "learning_rate": 2.151655705406823e-06, "loss": 0.9741, "step": 689 }, { "epoch": 0.5328185328185329, "grad_norm": 4.125, "learning_rate": 2.1505726999546446e-06, "loss": 0.8511, "step": 690 }, { "epoch": 0.5335907335907336, "grad_norm": 4.1875, "learning_rate": 2.149488287050801e-06, "loss": 0.8297, "step": 691 }, { "epoch": 0.5343629343629344, "grad_norm": 4.3125, "learning_rate": 2.148402468390057e-06, "loss": 0.8366, "step": 692 }, { "epoch": 0.5351351351351351, "grad_norm": 4.8125, "learning_rate": 2.147315245669374e-06, "loss": 0.882, "step": 693 }, { "epoch": 0.5359073359073359, "grad_norm": 4.78125, "learning_rate": 2.1462266205879077e-06, "loss": 0.8078, "step": 694 }, { "epoch": 0.5366795366795367, "grad_norm": 8.4375, "learning_rate": 2.145136594847006e-06, "loss": 0.9668, "step": 695 }, { "epoch": 0.5374517374517375, "grad_norm": 4.8125, "learning_rate": 2.144045170150205e-06, "loss": 0.8329, "step": 696 }, { "epoch": 0.5382239382239382, "grad_norm": 5.15625, "learning_rate": 2.142952348203229e-06, "loss": 1.0409, "step": 697 }, { "epoch": 0.538996138996139, "grad_norm": 5.71875, "learning_rate": 2.1418581307139824e-06, "loss": 0.7945, "step": 698 }, { "epoch": 0.5397683397683398, "grad_norm": 3.59375, "learning_rate": 2.1407625193925544e-06, "loss": 0.809, "step": 699 }, { "epoch": 0.5405405405405406, "grad_norm": 4.625, "learning_rate": 2.1396655159512108e-06, "loss": 0.9578, "step": 700 }, { "epoch": 0.5413127413127413, "grad_norm": 3.953125, "learning_rate": 2.1385671221043927e-06, "loss": 0.8875, "step": 701 }, { "epoch": 0.5420849420849421, "grad_norm": 8.0, "learning_rate": 2.137467339568715e-06, "loss": 0.8467, "step": 702 }, { "epoch": 0.5428571428571428, "grad_norm": 3.859375, "learning_rate": 2.136366170062963e-06, "loss": 0.8461, "step": 703 }, { "epoch": 0.5436293436293437, "grad_norm": 5.25, "learning_rate": 2.135263615308088e-06, "loss": 0.9254, "step": 704 }, { "epoch": 0.5444015444015444, "grad_norm": 15.25, "learning_rate": 2.134159677027209e-06, "loss": 0.8745, "step": 705 }, { "epoch": 0.5451737451737452, "grad_norm": 4.3125, "learning_rate": 2.1330543569456047e-06, "loss": 0.8502, "step": 706 }, { "epoch": 0.5459459459459459, "grad_norm": 9.5625, "learning_rate": 2.1319476567907145e-06, "loss": 0.852, "step": 707 }, { "epoch": 0.5467181467181467, "grad_norm": 4.3125, "learning_rate": 2.1308395782921342e-06, "loss": 0.9808, "step": 708 }, { "epoch": 0.5474903474903475, "grad_norm": 4.125, "learning_rate": 2.1297301231816147e-06, "loss": 1.0547, "step": 709 }, { "epoch": 0.5482625482625483, "grad_norm": 5.40625, "learning_rate": 2.1286192931930573e-06, "loss": 0.9482, "step": 710 }, { "epoch": 0.549034749034749, "grad_norm": 5.03125, "learning_rate": 2.1275070900625115e-06, "loss": 0.9112, "step": 711 }, { "epoch": 0.5498069498069498, "grad_norm": 3.71875, "learning_rate": 2.1263935155281746e-06, "loss": 0.8255, "step": 712 }, { "epoch": 0.5505791505791506, "grad_norm": 30.875, "learning_rate": 2.1252785713303857e-06, "loss": 0.8204, "step": 713 }, { "epoch": 0.5513513513513514, "grad_norm": 3.734375, "learning_rate": 2.1241622592116256e-06, "loss": 0.7733, "step": 714 }, { "epoch": 0.5521235521235521, "grad_norm": 3.96875, "learning_rate": 2.123044580916511e-06, "loss": 0.9166, "step": 715 }, { "epoch": 0.5528957528957529, "grad_norm": 4.5625, "learning_rate": 2.1219255381917967e-06, "loss": 0.8707, "step": 716 }, { "epoch": 0.5536679536679536, "grad_norm": 15.0, "learning_rate": 2.1208051327863672e-06, "loss": 0.8593, "step": 717 }, { "epoch": 0.5544401544401545, "grad_norm": 5.40625, "learning_rate": 2.1196833664512376e-06, "loss": 1.0338, "step": 718 }, { "epoch": 0.5552123552123552, "grad_norm": 6.875, "learning_rate": 2.1185602409395502e-06, "loss": 0.7999, "step": 719 }, { "epoch": 0.555984555984556, "grad_norm": 4.28125, "learning_rate": 2.1174357580065714e-06, "loss": 1.0591, "step": 720 }, { "epoch": 0.5567567567567567, "grad_norm": 3.796875, "learning_rate": 2.1163099194096887e-06, "loss": 0.9941, "step": 721 }, { "epoch": 0.5575289575289575, "grad_norm": 3.984375, "learning_rate": 2.1151827269084084e-06, "loss": 0.9156, "step": 722 }, { "epoch": 0.5583011583011583, "grad_norm": 3.765625, "learning_rate": 2.1140541822643533e-06, "loss": 0.9485, "step": 723 }, { "epoch": 0.5590733590733591, "grad_norm": 3.921875, "learning_rate": 2.1129242872412585e-06, "loss": 0.9341, "step": 724 }, { "epoch": 0.5598455598455598, "grad_norm": 4.21875, "learning_rate": 2.11179304360497e-06, "loss": 0.9612, "step": 725 }, { "epoch": 0.5606177606177606, "grad_norm": 5.71875, "learning_rate": 2.1106604531234415e-06, "loss": 0.6366, "step": 726 }, { "epoch": 0.5613899613899614, "grad_norm": 5.0, "learning_rate": 2.1095265175667314e-06, "loss": 0.875, "step": 727 }, { "epoch": 0.5621621621621622, "grad_norm": 4.96875, "learning_rate": 2.108391238707e-06, "loss": 0.8424, "step": 728 }, { "epoch": 0.5629343629343629, "grad_norm": 3.625, "learning_rate": 2.1072546183185073e-06, "loss": 0.8907, "step": 729 }, { "epoch": 0.5637065637065637, "grad_norm": 8.125, "learning_rate": 2.1061166581776106e-06, "loss": 0.8914, "step": 730 }, { "epoch": 0.5644787644787644, "grad_norm": 6.375, "learning_rate": 2.10497736006276e-06, "loss": 0.9157, "step": 731 }, { "epoch": 0.5652509652509653, "grad_norm": 4.40625, "learning_rate": 2.1038367257544965e-06, "loss": 1.0341, "step": 732 }, { "epoch": 0.566023166023166, "grad_norm": 4.03125, "learning_rate": 2.1026947570354506e-06, "loss": 1.0178, "step": 733 }, { "epoch": 0.5667953667953668, "grad_norm": 3.8125, "learning_rate": 2.101551455690337e-06, "loss": 0.8529, "step": 734 }, { "epoch": 0.5675675675675675, "grad_norm": 5.65625, "learning_rate": 2.1004068235059537e-06, "loss": 0.9114, "step": 735 }, { "epoch": 0.5683397683397683, "grad_norm": 3.40625, "learning_rate": 2.0992608622711785e-06, "loss": 0.7569, "step": 736 }, { "epoch": 0.5691119691119692, "grad_norm": 3.828125, "learning_rate": 2.098113573776966e-06, "loss": 0.9121, "step": 737 }, { "epoch": 0.5698841698841699, "grad_norm": 3.9375, "learning_rate": 2.0969649598163454e-06, "loss": 0.9351, "step": 738 }, { "epoch": 0.5706563706563706, "grad_norm": 4.53125, "learning_rate": 2.0958150221844177e-06, "loss": 0.8395, "step": 739 }, { "epoch": 0.5714285714285714, "grad_norm": 3.953125, "learning_rate": 2.0946637626783515e-06, "loss": 0.7678, "step": 740 }, { "epoch": 0.5722007722007721, "grad_norm": 4.25, "learning_rate": 2.093511183097383e-06, "loss": 0.8809, "step": 741 }, { "epoch": 0.572972972972973, "grad_norm": 4.40625, "learning_rate": 2.0923572852428096e-06, "loss": 0.933, "step": 742 }, { "epoch": 0.5737451737451738, "grad_norm": 4.34375, "learning_rate": 2.0912020709179907e-06, "loss": 0.9257, "step": 743 }, { "epoch": 0.5745173745173745, "grad_norm": 3.765625, "learning_rate": 2.0900455419283416e-06, "loss": 0.8285, "step": 744 }, { "epoch": 0.5752895752895753, "grad_norm": 3.984375, "learning_rate": 2.088887700081333e-06, "loss": 0.8101, "step": 745 }, { "epoch": 0.5760617760617761, "grad_norm": 3.984375, "learning_rate": 2.0877285471864873e-06, "loss": 0.8361, "step": 746 }, { "epoch": 0.5768339768339769, "grad_norm": 6.3125, "learning_rate": 2.086568085055376e-06, "loss": 0.9817, "step": 747 }, { "epoch": 0.5776061776061776, "grad_norm": 4.5, "learning_rate": 2.0854063155016156e-06, "loss": 0.7972, "step": 748 }, { "epoch": 0.5783783783783784, "grad_norm": 3.625, "learning_rate": 2.084243240340868e-06, "loss": 0.8469, "step": 749 }, { "epoch": 0.5791505791505791, "grad_norm": 4.875, "learning_rate": 2.0830788613908344e-06, "loss": 0.8831, "step": 750 }, { "epoch": 0.57992277992278, "grad_norm": 5.8125, "learning_rate": 2.0819131804712526e-06, "loss": 0.8178, "step": 751 }, { "epoch": 0.5806949806949807, "grad_norm": 5.21875, "learning_rate": 2.0807461994038973e-06, "loss": 0.8544, "step": 752 }, { "epoch": 0.5814671814671815, "grad_norm": 4.96875, "learning_rate": 2.079577920012573e-06, "loss": 1.0638, "step": 753 }, { "epoch": 0.5822393822393822, "grad_norm": 7.09375, "learning_rate": 2.078408344123115e-06, "loss": 0.9036, "step": 754 }, { "epoch": 0.583011583011583, "grad_norm": 4.125, "learning_rate": 2.0772374735633837e-06, "loss": 0.8084, "step": 755 }, { "epoch": 0.5837837837837838, "grad_norm": 4.34375, "learning_rate": 2.0760653101632636e-06, "loss": 0.9485, "step": 756 }, { "epoch": 0.5845559845559846, "grad_norm": 5.25, "learning_rate": 2.074891855754659e-06, "loss": 0.8077, "step": 757 }, { "epoch": 0.5853281853281853, "grad_norm": 4.09375, "learning_rate": 2.0737171121714924e-06, "loss": 0.7218, "step": 758 }, { "epoch": 0.5861003861003861, "grad_norm": 4.4375, "learning_rate": 2.072541081249701e-06, "loss": 0.9713, "step": 759 }, { "epoch": 0.5868725868725869, "grad_norm": 18.75, "learning_rate": 2.071363764827233e-06, "loss": 0.8354, "step": 760 }, { "epoch": 0.5876447876447877, "grad_norm": 5.28125, "learning_rate": 2.0701851647440478e-06, "loss": 0.8504, "step": 761 }, { "epoch": 0.5884169884169884, "grad_norm": 4.6875, "learning_rate": 2.0690052828421086e-06, "loss": 0.9009, "step": 762 }, { "epoch": 0.5891891891891892, "grad_norm": 5.65625, "learning_rate": 2.067824120965383e-06, "loss": 0.8744, "step": 763 }, { "epoch": 0.5899613899613899, "grad_norm": 4.21875, "learning_rate": 2.0666416809598394e-06, "loss": 0.9118, "step": 764 }, { "epoch": 0.5907335907335908, "grad_norm": 3.921875, "learning_rate": 2.0654579646734424e-06, "loss": 1.0176, "step": 765 }, { "epoch": 0.5915057915057915, "grad_norm": 4.5625, "learning_rate": 2.064272973956152e-06, "loss": 0.9366, "step": 766 }, { "epoch": 0.5922779922779923, "grad_norm": 4.21875, "learning_rate": 2.0630867106599205e-06, "loss": 0.7799, "step": 767 }, { "epoch": 0.593050193050193, "grad_norm": 4.46875, "learning_rate": 2.061899176638688e-06, "loss": 0.9611, "step": 768 }, { "epoch": 0.5938223938223938, "grad_norm": 5.5625, "learning_rate": 2.0607103737483814e-06, "loss": 0.9232, "step": 769 }, { "epoch": 0.5945945945945946, "grad_norm": 3.78125, "learning_rate": 2.0595203038469097e-06, "loss": 0.9249, "step": 770 }, { "epoch": 0.5953667953667954, "grad_norm": 4.875, "learning_rate": 2.058328968794163e-06, "loss": 0.7956, "step": 771 }, { "epoch": 0.5961389961389961, "grad_norm": 4.90625, "learning_rate": 2.057136370452008e-06, "loss": 0.8284, "step": 772 }, { "epoch": 0.5969111969111969, "grad_norm": 4.40625, "learning_rate": 2.055942510684286e-06, "loss": 0.7977, "step": 773 }, { "epoch": 0.5976833976833977, "grad_norm": 4.28125, "learning_rate": 2.0547473913568097e-06, "loss": 0.8815, "step": 774 }, { "epoch": 0.5984555984555985, "grad_norm": 3.984375, "learning_rate": 2.0535510143373598e-06, "loss": 0.851, "step": 775 }, { "epoch": 0.5992277992277992, "grad_norm": 4.375, "learning_rate": 2.052353381495684e-06, "loss": 1.0555, "step": 776 }, { "epoch": 0.6, "grad_norm": 3.984375, "learning_rate": 2.0511544947034908e-06, "loss": 0.7947, "step": 777 }, { "epoch": 0.6007722007722007, "grad_norm": 4.0, "learning_rate": 2.049954355834449e-06, "loss": 0.9793, "step": 778 }, { "epoch": 0.6015444015444016, "grad_norm": 4.34375, "learning_rate": 2.0487529667641855e-06, "loss": 0.8112, "step": 779 }, { "epoch": 0.6023166023166023, "grad_norm": 5.03125, "learning_rate": 2.0475503293702786e-06, "loss": 0.8879, "step": 780 }, { "epoch": 0.6030888030888031, "grad_norm": 4.03125, "learning_rate": 2.0463464455322603e-06, "loss": 0.9287, "step": 781 }, { "epoch": 0.6038610038610038, "grad_norm": 4.875, "learning_rate": 2.0451413171316086e-06, "loss": 0.7728, "step": 782 }, { "epoch": 0.6046332046332046, "grad_norm": 4.5, "learning_rate": 2.0439349460517477e-06, "loss": 0.9164, "step": 783 }, { "epoch": 0.6054054054054054, "grad_norm": 4.75, "learning_rate": 2.0427273341780427e-06, "loss": 0.9061, "step": 784 }, { "epoch": 0.6061776061776062, "grad_norm": 4.28125, "learning_rate": 2.0415184833977994e-06, "loss": 0.7774, "step": 785 }, { "epoch": 0.6069498069498069, "grad_norm": 4.5625, "learning_rate": 2.040308395600258e-06, "loss": 0.8666, "step": 786 }, { "epoch": 0.6077220077220077, "grad_norm": 3.015625, "learning_rate": 2.039097072676594e-06, "loss": 0.4609, "step": 787 }, { "epoch": 0.6084942084942085, "grad_norm": 4.25, "learning_rate": 2.0378845165199115e-06, "loss": 0.8307, "step": 788 }, { "epoch": 0.6092664092664093, "grad_norm": 5.25, "learning_rate": 2.0366707290252428e-06, "loss": 0.8246, "step": 789 }, { "epoch": 0.61003861003861, "grad_norm": 3.5, "learning_rate": 2.0354557120895446e-06, "loss": 0.8073, "step": 790 }, { "epoch": 0.6108108108108108, "grad_norm": 4.40625, "learning_rate": 2.0342394676116946e-06, "loss": 1.0326, "step": 791 }, { "epoch": 0.6115830115830115, "grad_norm": 3.75, "learning_rate": 2.0330219974924886e-06, "loss": 0.8795, "step": 792 }, { "epoch": 0.6123552123552124, "grad_norm": 4.21875, "learning_rate": 2.0318033036346394e-06, "loss": 0.9993, "step": 793 }, { "epoch": 0.6131274131274131, "grad_norm": 4.03125, "learning_rate": 2.030583387942771e-06, "loss": 0.8912, "step": 794 }, { "epoch": 0.6138996138996139, "grad_norm": 13.5, "learning_rate": 2.029362252323417e-06, "loss": 0.7856, "step": 795 }, { "epoch": 0.6146718146718146, "grad_norm": 6.8125, "learning_rate": 2.028139898685018e-06, "loss": 0.9797, "step": 796 }, { "epoch": 0.6154440154440154, "grad_norm": 5.6875, "learning_rate": 2.0269163289379187e-06, "loss": 0.8056, "step": 797 }, { "epoch": 0.6162162162162163, "grad_norm": 4.4375, "learning_rate": 2.025691544994363e-06, "loss": 0.8174, "step": 798 }, { "epoch": 0.616988416988417, "grad_norm": 4.96875, "learning_rate": 2.024465548768493e-06, "loss": 0.8792, "step": 799 }, { "epoch": 0.6177606177606177, "grad_norm": 4.40625, "learning_rate": 2.023238342176345e-06, "loss": 0.8667, "step": 800 }, { "epoch": 0.6185328185328185, "grad_norm": 5.125, "learning_rate": 2.022009927135849e-06, "loss": 0.9014, "step": 801 }, { "epoch": 0.6193050193050194, "grad_norm": 3.90625, "learning_rate": 2.020780305566821e-06, "loss": 0.8165, "step": 802 }, { "epoch": 0.6200772200772201, "grad_norm": 3.828125, "learning_rate": 2.0195494793909636e-06, "loss": 0.8877, "step": 803 }, { "epoch": 0.6208494208494209, "grad_norm": 3.71875, "learning_rate": 2.018317450531862e-06, "loss": 0.8869, "step": 804 }, { "epoch": 0.6216216216216216, "grad_norm": 30.0, "learning_rate": 2.0170842209149816e-06, "loss": 0.9187, "step": 805 }, { "epoch": 0.6223938223938223, "grad_norm": 3.984375, "learning_rate": 2.015849792467663e-06, "loss": 0.8403, "step": 806 }, { "epoch": 0.6231660231660232, "grad_norm": 10.5625, "learning_rate": 2.014614167119122e-06, "loss": 0.9074, "step": 807 }, { "epoch": 0.623938223938224, "grad_norm": 4.21875, "learning_rate": 2.0133773468004434e-06, "loss": 0.8268, "step": 808 }, { "epoch": 0.6247104247104247, "grad_norm": 4.6875, "learning_rate": 2.0121393334445814e-06, "loss": 0.7394, "step": 809 }, { "epoch": 0.6254826254826255, "grad_norm": 4.25, "learning_rate": 2.0109001289863526e-06, "loss": 0.9179, "step": 810 }, { "epoch": 0.6262548262548262, "grad_norm": 4.6875, "learning_rate": 2.009659735362437e-06, "loss": 0.8045, "step": 811 }, { "epoch": 0.6270270270270271, "grad_norm": 7.1875, "learning_rate": 2.0084181545113713e-06, "loss": 0.9381, "step": 812 }, { "epoch": 0.6277992277992278, "grad_norm": 4.4375, "learning_rate": 2.0071753883735497e-06, "loss": 0.8502, "step": 813 }, { "epoch": 0.6285714285714286, "grad_norm": 3.609375, "learning_rate": 2.0059314388912163e-06, "loss": 0.7951, "step": 814 }, { "epoch": 0.6293436293436293, "grad_norm": 8.375, "learning_rate": 2.0046863080084673e-06, "loss": 0.8453, "step": 815 }, { "epoch": 0.6301158301158302, "grad_norm": 5.65625, "learning_rate": 2.003439997671243e-06, "loss": 0.9853, "step": 816 }, { "epoch": 0.6308880308880309, "grad_norm": 44.75, "learning_rate": 2.0021925098273287e-06, "loss": 0.9861, "step": 817 }, { "epoch": 0.6316602316602317, "grad_norm": 4.3125, "learning_rate": 2.0009438464263487e-06, "loss": 0.769, "step": 818 }, { "epoch": 0.6324324324324324, "grad_norm": 4.53125, "learning_rate": 1.9996940094197652e-06, "loss": 1.0191, "step": 819 }, { "epoch": 0.6332046332046332, "grad_norm": 3.65625, "learning_rate": 1.9984430007608735e-06, "loss": 0.9312, "step": 820 }, { "epoch": 0.633976833976834, "grad_norm": 3.90625, "learning_rate": 1.997190822404802e-06, "loss": 0.9179, "step": 821 }, { "epoch": 0.6347490347490348, "grad_norm": 4.25, "learning_rate": 1.9959374763085057e-06, "loss": 0.9168, "step": 822 }, { "epoch": 0.6355212355212355, "grad_norm": 4.6875, "learning_rate": 1.9946829644307647e-06, "loss": 0.8352, "step": 823 }, { "epoch": 0.6362934362934363, "grad_norm": 3.5625, "learning_rate": 1.993427288732181e-06, "loss": 0.7746, "step": 824 }, { "epoch": 0.637065637065637, "grad_norm": 5.0, "learning_rate": 1.992170451175176e-06, "loss": 0.7366, "step": 825 }, { "epoch": 0.6378378378378379, "grad_norm": 8.125, "learning_rate": 1.990912453723986e-06, "loss": 0.9053, "step": 826 }, { "epoch": 0.6386100386100386, "grad_norm": 6.1875, "learning_rate": 1.989653298344662e-06, "loss": 0.7879, "step": 827 }, { "epoch": 0.6393822393822394, "grad_norm": 3.546875, "learning_rate": 1.988392987005062e-06, "loss": 0.8456, "step": 828 }, { "epoch": 0.6401544401544401, "grad_norm": 3.515625, "learning_rate": 1.9871315216748518e-06, "loss": 0.8126, "step": 829 }, { "epoch": 0.640926640926641, "grad_norm": 13.75, "learning_rate": 1.985868904325502e-06, "loss": 0.8984, "step": 830 }, { "epoch": 0.6416988416988417, "grad_norm": 3.828125, "learning_rate": 1.9846051369302806e-06, "loss": 0.8575, "step": 831 }, { "epoch": 0.6424710424710425, "grad_norm": 4.15625, "learning_rate": 1.983340221464256e-06, "loss": 0.9063, "step": 832 }, { "epoch": 0.6432432432432432, "grad_norm": 6.40625, "learning_rate": 1.98207415990429e-06, "loss": 0.912, "step": 833 }, { "epoch": 0.644015444015444, "grad_norm": 3.671875, "learning_rate": 1.9808069542290332e-06, "loss": 0.7003, "step": 834 }, { "epoch": 0.6447876447876448, "grad_norm": 4.28125, "learning_rate": 1.979538606418928e-06, "loss": 1.0557, "step": 835 }, { "epoch": 0.6455598455598456, "grad_norm": 4.0625, "learning_rate": 1.978269118456199e-06, "loss": 0.9036, "step": 836 }, { "epoch": 0.6463320463320463, "grad_norm": 4.46875, "learning_rate": 1.9769984923248544e-06, "loss": 0.9512, "step": 837 }, { "epoch": 0.6471042471042471, "grad_norm": 4.09375, "learning_rate": 1.97572673001068e-06, "loss": 0.7436, "step": 838 }, { "epoch": 0.6478764478764478, "grad_norm": 3.671875, "learning_rate": 1.9744538335012377e-06, "loss": 0.9873, "step": 839 }, { "epoch": 0.6486486486486487, "grad_norm": 17.0, "learning_rate": 1.9731798047858615e-06, "loss": 0.7909, "step": 840 }, { "epoch": 0.6494208494208494, "grad_norm": 4.53125, "learning_rate": 1.971904645855656e-06, "loss": 0.8845, "step": 841 }, { "epoch": 0.6501930501930502, "grad_norm": 5.46875, "learning_rate": 1.9706283587034918e-06, "loss": 0.7932, "step": 842 }, { "epoch": 0.6509652509652509, "grad_norm": 3.8125, "learning_rate": 1.969350945324002e-06, "loss": 0.8068, "step": 843 }, { "epoch": 0.6517374517374518, "grad_norm": 4.65625, "learning_rate": 1.9680724077135793e-06, "loss": 0.8812, "step": 844 }, { "epoch": 0.6525096525096525, "grad_norm": 4.0625, "learning_rate": 1.966792747870375e-06, "loss": 0.8205, "step": 845 }, { "epoch": 0.6532818532818533, "grad_norm": 3.96875, "learning_rate": 1.9655119677942945e-06, "loss": 0.9784, "step": 846 }, { "epoch": 0.654054054054054, "grad_norm": 4.34375, "learning_rate": 1.964230069486992e-06, "loss": 0.8317, "step": 847 }, { "epoch": 0.6548262548262548, "grad_norm": 3.765625, "learning_rate": 1.9629470549518703e-06, "loss": 0.9424, "step": 848 }, { "epoch": 0.6555984555984556, "grad_norm": 11.125, "learning_rate": 1.961662926194077e-06, "loss": 0.7892, "step": 849 }, { "epoch": 0.6563706563706564, "grad_norm": 6.03125, "learning_rate": 1.960377685220501e-06, "loss": 1.0857, "step": 850 }, { "epoch": 0.6571428571428571, "grad_norm": 3.71875, "learning_rate": 1.9590913340397687e-06, "loss": 0.7609, "step": 851 }, { "epoch": 0.6579150579150579, "grad_norm": 4.03125, "learning_rate": 1.9578038746622425e-06, "loss": 0.869, "step": 852 }, { "epoch": 0.6586872586872586, "grad_norm": 3.875, "learning_rate": 1.956515309100016e-06, "loss": 0.9776, "step": 853 }, { "epoch": 0.6594594594594595, "grad_norm": 4.09375, "learning_rate": 1.955225639366912e-06, "loss": 0.8745, "step": 854 }, { "epoch": 0.6602316602316602, "grad_norm": 4.28125, "learning_rate": 1.953934867478479e-06, "loss": 0.8343, "step": 855 }, { "epoch": 0.661003861003861, "grad_norm": 4.0625, "learning_rate": 1.9526429954519877e-06, "loss": 0.9388, "step": 856 }, { "epoch": 0.6617760617760617, "grad_norm": 3.90625, "learning_rate": 1.951350025306429e-06, "loss": 0.9081, "step": 857 }, { "epoch": 0.6625482625482626, "grad_norm": 3.75, "learning_rate": 1.9500559590625074e-06, "loss": 0.8194, "step": 858 }, { "epoch": 0.6633204633204633, "grad_norm": 4.21875, "learning_rate": 1.9487607987426444e-06, "loss": 0.9805, "step": 859 }, { "epoch": 0.6640926640926641, "grad_norm": 4.0, "learning_rate": 1.9474645463709677e-06, "loss": 0.8698, "step": 860 }, { "epoch": 0.6648648648648648, "grad_norm": 5.0, "learning_rate": 1.946167203973314e-06, "loss": 0.9805, "step": 861 }, { "epoch": 0.6656370656370656, "grad_norm": 4.0625, "learning_rate": 1.944868773577223e-06, "loss": 0.8833, "step": 862 }, { "epoch": 0.6664092664092665, "grad_norm": 4.40625, "learning_rate": 1.943569257211934e-06, "loss": 0.9686, "step": 863 }, { "epoch": 0.6671814671814672, "grad_norm": 5.625, "learning_rate": 1.9422686569083842e-06, "loss": 0.9212, "step": 864 }, { "epoch": 0.667953667953668, "grad_norm": 4.34375, "learning_rate": 1.940966974699205e-06, "loss": 0.8428, "step": 865 }, { "epoch": 0.6687258687258687, "grad_norm": 17.0, "learning_rate": 1.9396642126187178e-06, "loss": 1.001, "step": 866 }, { "epoch": 0.6694980694980694, "grad_norm": 4.46875, "learning_rate": 1.9383603727029325e-06, "loss": 1.0099, "step": 867 }, { "epoch": 0.6702702702702703, "grad_norm": 6.0625, "learning_rate": 1.9370554569895435e-06, "loss": 0.8546, "step": 868 }, { "epoch": 0.6710424710424711, "grad_norm": 4.0625, "learning_rate": 1.9357494675179254e-06, "loss": 1.0295, "step": 869 }, { "epoch": 0.6718146718146718, "grad_norm": 4.46875, "learning_rate": 1.934442406329133e-06, "loss": 0.9536, "step": 870 }, { "epoch": 0.6725868725868726, "grad_norm": 6.65625, "learning_rate": 1.9331342754658928e-06, "loss": 0.8847, "step": 871 }, { "epoch": 0.6733590733590734, "grad_norm": 3.984375, "learning_rate": 1.9318250769726066e-06, "loss": 0.9493, "step": 872 }, { "epoch": 0.6741312741312742, "grad_norm": 9.0, "learning_rate": 1.9305148128953426e-06, "loss": 0.2694, "step": 873 }, { "epoch": 0.6749034749034749, "grad_norm": 4.25, "learning_rate": 1.929203485281835e-06, "loss": 1.0014, "step": 874 }, { "epoch": 0.6756756756756757, "grad_norm": 3.984375, "learning_rate": 1.927891096181479e-06, "loss": 0.8919, "step": 875 }, { "epoch": 0.6764478764478764, "grad_norm": 3.921875, "learning_rate": 1.926577647645331e-06, "loss": 0.7825, "step": 876 }, { "epoch": 0.6772200772200773, "grad_norm": 4.0625, "learning_rate": 1.9252631417261004e-06, "loss": 0.6963, "step": 877 }, { "epoch": 0.677992277992278, "grad_norm": 4.34375, "learning_rate": 1.923947580478152e-06, "loss": 0.9595, "step": 878 }, { "epoch": 0.6787644787644788, "grad_norm": 4.8125, "learning_rate": 1.922630965957498e-06, "loss": 0.95, "step": 879 }, { "epoch": 0.6795366795366795, "grad_norm": 4.84375, "learning_rate": 1.9213133002217965e-06, "loss": 0.826, "step": 880 }, { "epoch": 0.6803088803088803, "grad_norm": 4.625, "learning_rate": 1.9199945853303496e-06, "loss": 0.963, "step": 881 }, { "epoch": 0.6810810810810811, "grad_norm": 3.890625, "learning_rate": 1.9186748233440993e-06, "loss": 0.9144, "step": 882 }, { "epoch": 0.6818532818532819, "grad_norm": 4.5, "learning_rate": 1.9173540163256216e-06, "loss": 0.9257, "step": 883 }, { "epoch": 0.6826254826254826, "grad_norm": 3.859375, "learning_rate": 1.916032166339129e-06, "loss": 0.8298, "step": 884 }, { "epoch": 0.6833976833976834, "grad_norm": 4.65625, "learning_rate": 1.9147092754504622e-06, "loss": 0.9003, "step": 885 }, { "epoch": 0.6841698841698842, "grad_norm": 4.125, "learning_rate": 1.9133853457270883e-06, "loss": 0.9569, "step": 886 }, { "epoch": 0.684942084942085, "grad_norm": 4.0625, "learning_rate": 1.912060379238099e-06, "loss": 0.7713, "step": 887 }, { "epoch": 0.6857142857142857, "grad_norm": 3.875, "learning_rate": 1.910734378054206e-06, "loss": 0.9624, "step": 888 }, { "epoch": 0.6864864864864865, "grad_norm": 3.71875, "learning_rate": 1.9094073442477375e-06, "loss": 0.8097, "step": 889 }, { "epoch": 0.6872586872586872, "grad_norm": 4.40625, "learning_rate": 1.9080792798926367e-06, "loss": 0.8396, "step": 890 }, { "epoch": 0.6880308880308881, "grad_norm": 4.0625, "learning_rate": 1.9067501870644553e-06, "loss": 0.8598, "step": 891 }, { "epoch": 0.6888030888030888, "grad_norm": 4.8125, "learning_rate": 1.9054200678403546e-06, "loss": 1.0012, "step": 892 }, { "epoch": 0.6895752895752896, "grad_norm": 3.609375, "learning_rate": 1.904088924299099e-06, "loss": 0.9291, "step": 893 }, { "epoch": 0.6903474903474903, "grad_norm": 4.65625, "learning_rate": 1.9027567585210532e-06, "loss": 0.8539, "step": 894 }, { "epoch": 0.6911196911196911, "grad_norm": 3.90625, "learning_rate": 1.9014235725881812e-06, "loss": 0.8699, "step": 895 }, { "epoch": 0.6918918918918919, "grad_norm": 3.71875, "learning_rate": 1.9000893685840392e-06, "loss": 0.8331, "step": 896 }, { "epoch": 0.6926640926640927, "grad_norm": 3.859375, "learning_rate": 1.898754148593776e-06, "loss": 0.9116, "step": 897 }, { "epoch": 0.6934362934362934, "grad_norm": 4.09375, "learning_rate": 1.8974179147041277e-06, "loss": 0.8476, "step": 898 }, { "epoch": 0.6942084942084942, "grad_norm": 5.5625, "learning_rate": 1.896080669003415e-06, "loss": 0.8158, "step": 899 }, { "epoch": 0.694980694980695, "grad_norm": 4.0625, "learning_rate": 1.8947424135815396e-06, "loss": 0.9805, "step": 900 }, { "epoch": 0.6957528957528958, "grad_norm": 4.09375, "learning_rate": 1.893403150529982e-06, "loss": 0.8148, "step": 901 }, { "epoch": 0.6965250965250965, "grad_norm": 3.6875, "learning_rate": 1.8920628819417969e-06, "loss": 0.755, "step": 902 }, { "epoch": 0.6972972972972973, "grad_norm": 5.59375, "learning_rate": 1.89072160991161e-06, "loss": 0.9566, "step": 903 }, { "epoch": 0.698069498069498, "grad_norm": 5.96875, "learning_rate": 1.8893793365356165e-06, "loss": 1.0, "step": 904 }, { "epoch": 0.6988416988416989, "grad_norm": 3.8125, "learning_rate": 1.8880360639115758e-06, "loss": 0.8501, "step": 905 }, { "epoch": 0.6996138996138996, "grad_norm": 3.625, "learning_rate": 1.8866917941388085e-06, "loss": 0.8662, "step": 906 }, { "epoch": 0.7003861003861004, "grad_norm": 4.5625, "learning_rate": 1.885346529318195e-06, "loss": 0.8457, "step": 907 }, { "epoch": 0.7011583011583011, "grad_norm": 3.859375, "learning_rate": 1.8840002715521688e-06, "loss": 0.8355, "step": 908 }, { "epoch": 0.7019305019305019, "grad_norm": 3.125, "learning_rate": 1.8826530229447177e-06, "loss": 0.7026, "step": 909 }, { "epoch": 0.7027027027027027, "grad_norm": 5.84375, "learning_rate": 1.8813047856013752e-06, "loss": 0.896, "step": 910 }, { "epoch": 0.7034749034749035, "grad_norm": 3.34375, "learning_rate": 1.8799555616292225e-06, "loss": 0.745, "step": 911 }, { "epoch": 0.7042471042471042, "grad_norm": 4.34375, "learning_rate": 1.8786053531368809e-06, "loss": 1.0545, "step": 912 }, { "epoch": 0.705019305019305, "grad_norm": 25.375, "learning_rate": 1.877254162234512e-06, "loss": 0.8466, "step": 913 }, { "epoch": 0.7057915057915058, "grad_norm": 3.984375, "learning_rate": 1.8759019910338115e-06, "loss": 0.9561, "step": 914 }, { "epoch": 0.7065637065637066, "grad_norm": 3.859375, "learning_rate": 1.8745488416480078e-06, "loss": 0.9712, "step": 915 }, { "epoch": 0.7073359073359073, "grad_norm": 5.125, "learning_rate": 1.8731947161918574e-06, "loss": 0.9822, "step": 916 }, { "epoch": 0.7081081081081081, "grad_norm": 4.78125, "learning_rate": 1.8718396167816432e-06, "loss": 0.6894, "step": 917 }, { "epoch": 0.7088803088803088, "grad_norm": 4.1875, "learning_rate": 1.8704835455351694e-06, "loss": 0.9087, "step": 918 }, { "epoch": 0.7096525096525097, "grad_norm": 4.21875, "learning_rate": 1.8691265045717596e-06, "loss": 0.9241, "step": 919 }, { "epoch": 0.7104247104247104, "grad_norm": 4.34375, "learning_rate": 1.8677684960122521e-06, "loss": 0.8938, "step": 920 }, { "epoch": 0.7111969111969112, "grad_norm": 4.21875, "learning_rate": 1.8664095219789985e-06, "loss": 0.9886, "step": 921 }, { "epoch": 0.711969111969112, "grad_norm": 4.8125, "learning_rate": 1.8650495845958585e-06, "loss": 0.8221, "step": 922 }, { "epoch": 0.7127413127413127, "grad_norm": 4.8125, "learning_rate": 1.8636886859881975e-06, "loss": 0.8824, "step": 923 }, { "epoch": 0.7135135135135136, "grad_norm": 5.65625, "learning_rate": 1.8623268282828832e-06, "loss": 0.9105, "step": 924 }, { "epoch": 0.7142857142857143, "grad_norm": 4.15625, "learning_rate": 1.8609640136082826e-06, "loss": 0.8499, "step": 925 }, { "epoch": 0.715057915057915, "grad_norm": 7.15625, "learning_rate": 1.8596002440942578e-06, "loss": 0.791, "step": 926 }, { "epoch": 0.7158301158301158, "grad_norm": 4.4375, "learning_rate": 1.8582355218721632e-06, "loss": 0.8639, "step": 927 }, { "epoch": 0.7166023166023165, "grad_norm": 4.46875, "learning_rate": 1.8568698490748419e-06, "loss": 0.9468, "step": 928 }, { "epoch": 0.7173745173745174, "grad_norm": 4.59375, "learning_rate": 1.8555032278366236e-06, "loss": 0.9931, "step": 929 }, { "epoch": 0.7181467181467182, "grad_norm": 4.21875, "learning_rate": 1.8541356602933192e-06, "loss": 0.9008, "step": 930 }, { "epoch": 0.7189189189189189, "grad_norm": 3.828125, "learning_rate": 1.852767148582219e-06, "loss": 0.8966, "step": 931 }, { "epoch": 0.7196911196911197, "grad_norm": 3.953125, "learning_rate": 1.8513976948420885e-06, "loss": 0.9121, "step": 932 }, { "epoch": 0.7204633204633205, "grad_norm": 5.5625, "learning_rate": 1.850027301213166e-06, "loss": 0.9909, "step": 933 }, { "epoch": 0.7212355212355213, "grad_norm": 4.875, "learning_rate": 1.8486559698371585e-06, "loss": 0.9098, "step": 934 }, { "epoch": 0.722007722007722, "grad_norm": 4.125, "learning_rate": 1.8472837028572383e-06, "loss": 0.9236, "step": 935 }, { "epoch": 0.7227799227799228, "grad_norm": 5.84375, "learning_rate": 1.8459105024180402e-06, "loss": 1.0365, "step": 936 }, { "epoch": 0.7235521235521235, "grad_norm": 4.96875, "learning_rate": 1.8445363706656577e-06, "loss": 0.8748, "step": 937 }, { "epoch": 0.7243243243243244, "grad_norm": 3.859375, "learning_rate": 1.84316130974764e-06, "loss": 0.7687, "step": 938 }, { "epoch": 0.7250965250965251, "grad_norm": 8.875, "learning_rate": 1.8417853218129883e-06, "loss": 1.0174, "step": 939 }, { "epoch": 0.7258687258687259, "grad_norm": 4.40625, "learning_rate": 1.8404084090121519e-06, "loss": 0.9296, "step": 940 }, { "epoch": 0.7266409266409266, "grad_norm": 3.75, "learning_rate": 1.8390305734970266e-06, "loss": 1.0601, "step": 941 }, { "epoch": 0.7274131274131274, "grad_norm": 4.34375, "learning_rate": 1.8376518174209502e-06, "loss": 0.9055, "step": 942 }, { "epoch": 0.7281853281853282, "grad_norm": 6.78125, "learning_rate": 1.8362721429386979e-06, "loss": 0.8865, "step": 943 }, { "epoch": 0.728957528957529, "grad_norm": 3.828125, "learning_rate": 1.834891552206482e-06, "loss": 0.8239, "step": 944 }, { "epoch": 0.7297297297297297, "grad_norm": 6.15625, "learning_rate": 1.8335100473819454e-06, "loss": 0.8347, "step": 945 }, { "epoch": 0.7305019305019305, "grad_norm": 4.125, "learning_rate": 1.8321276306241602e-06, "loss": 0.9364, "step": 946 }, { "epoch": 0.7312741312741313, "grad_norm": 3.859375, "learning_rate": 1.8307443040936237e-06, "loss": 0.8797, "step": 947 }, { "epoch": 0.7320463320463321, "grad_norm": 3.953125, "learning_rate": 1.8293600699522547e-06, "loss": 0.8882, "step": 948 }, { "epoch": 0.7328185328185328, "grad_norm": 4.34375, "learning_rate": 1.8279749303633907e-06, "loss": 0.8024, "step": 949 }, { "epoch": 0.7335907335907336, "grad_norm": 4.59375, "learning_rate": 1.8265888874917842e-06, "loss": 0.8986, "step": 950 }, { "epoch": 0.7343629343629343, "grad_norm": 7.78125, "learning_rate": 1.8252019435035995e-06, "loss": 0.8675, "step": 951 }, { "epoch": 0.7351351351351352, "grad_norm": 4.71875, "learning_rate": 1.823814100566409e-06, "loss": 0.8647, "step": 952 }, { "epoch": 0.7359073359073359, "grad_norm": 4.90625, "learning_rate": 1.8224253608491901e-06, "loss": 0.9373, "step": 953 }, { "epoch": 0.7366795366795367, "grad_norm": 7.0625, "learning_rate": 1.8210357265223216e-06, "loss": 0.7822, "step": 954 }, { "epoch": 0.7374517374517374, "grad_norm": 3.640625, "learning_rate": 1.8196451997575813e-06, "loss": 0.7796, "step": 955 }, { "epoch": 0.7382239382239382, "grad_norm": 7.25, "learning_rate": 1.8182537827281399e-06, "loss": 0.9519, "step": 956 }, { "epoch": 0.738996138996139, "grad_norm": 3.796875, "learning_rate": 1.8168614776085617e-06, "loss": 0.827, "step": 957 }, { "epoch": 0.7397683397683398, "grad_norm": 39.75, "learning_rate": 1.815468286574797e-06, "loss": 0.809, "step": 958 }, { "epoch": 0.7405405405405405, "grad_norm": 4.5625, "learning_rate": 1.8140742118041818e-06, "loss": 0.9136, "step": 959 }, { "epoch": 0.7413127413127413, "grad_norm": 3.5625, "learning_rate": 1.8126792554754327e-06, "loss": 0.8764, "step": 960 }, { "epoch": 0.7420849420849421, "grad_norm": 4.15625, "learning_rate": 1.8112834197686443e-06, "loss": 0.8365, "step": 961 }, { "epoch": 0.7428571428571429, "grad_norm": 4.03125, "learning_rate": 1.809886706865286e-06, "loss": 0.965, "step": 962 }, { "epoch": 0.7436293436293436, "grad_norm": 3.921875, "learning_rate": 1.8084891189481963e-06, "loss": 0.8435, "step": 963 }, { "epoch": 0.7444015444015444, "grad_norm": 4.15625, "learning_rate": 1.8070906582015833e-06, "loss": 0.8288, "step": 964 }, { "epoch": 0.7451737451737451, "grad_norm": 3.703125, "learning_rate": 1.8056913268110182e-06, "loss": 0.7691, "step": 965 }, { "epoch": 0.745945945945946, "grad_norm": 3.5625, "learning_rate": 1.8042911269634335e-06, "loss": 0.8656, "step": 966 }, { "epoch": 0.7467181467181467, "grad_norm": 4.0625, "learning_rate": 1.8028900608471182e-06, "loss": 0.9629, "step": 967 }, { "epoch": 0.7474903474903475, "grad_norm": 6.75, "learning_rate": 1.801488130651715e-06, "loss": 0.9233, "step": 968 }, { "epoch": 0.7482625482625482, "grad_norm": 4.21875, "learning_rate": 1.800085338568218e-06, "loss": 0.9006, "step": 969 }, { "epoch": 0.749034749034749, "grad_norm": 4.9375, "learning_rate": 1.7986816867889674e-06, "loss": 0.8507, "step": 970 }, { "epoch": 0.7498069498069498, "grad_norm": 8.25, "learning_rate": 1.7972771775076478e-06, "loss": 0.9288, "step": 971 }, { "epoch": 0.7505791505791506, "grad_norm": 4.34375, "learning_rate": 1.7958718129192828e-06, "loss": 0.9621, "step": 972 }, { "epoch": 0.7513513513513513, "grad_norm": 4.09375, "learning_rate": 1.7944655952202338e-06, "loss": 0.8752, "step": 973 }, { "epoch": 0.7521235521235521, "grad_norm": 4.53125, "learning_rate": 1.7930585266081952e-06, "loss": 0.8629, "step": 974 }, { "epoch": 0.752895752895753, "grad_norm": 4.15625, "learning_rate": 1.791650609282191e-06, "loss": 0.9228, "step": 975 }, { "epoch": 0.7536679536679537, "grad_norm": 10.25, "learning_rate": 1.790241845442571e-06, "loss": 0.8939, "step": 976 }, { "epoch": 0.7544401544401544, "grad_norm": 4.9375, "learning_rate": 1.7888322372910097e-06, "loss": 0.7511, "step": 977 }, { "epoch": 0.7552123552123552, "grad_norm": 4.125, "learning_rate": 1.7874217870304993e-06, "loss": 0.8543, "step": 978 }, { "epoch": 0.7559845559845559, "grad_norm": 4.96875, "learning_rate": 1.7860104968653497e-06, "loss": 0.9071, "step": 979 }, { "epoch": 0.7567567567567568, "grad_norm": 6.0625, "learning_rate": 1.7845983690011813e-06, "loss": 0.9613, "step": 980 }, { "epoch": 0.7575289575289575, "grad_norm": 5.21875, "learning_rate": 1.7831854056449267e-06, "loss": 0.8649, "step": 981 }, { "epoch": 0.7583011583011583, "grad_norm": 5.3125, "learning_rate": 1.781771609004821e-06, "loss": 0.8261, "step": 982 }, { "epoch": 0.759073359073359, "grad_norm": 3.953125, "learning_rate": 1.7803569812904036e-06, "loss": 0.9201, "step": 983 }, { "epoch": 0.7598455598455598, "grad_norm": 3.703125, "learning_rate": 1.7789415247125128e-06, "loss": 0.7362, "step": 984 }, { "epoch": 0.7606177606177607, "grad_norm": 3.84375, "learning_rate": 1.7775252414832811e-06, "loss": 0.8625, "step": 985 }, { "epoch": 0.7613899613899614, "grad_norm": 7.6875, "learning_rate": 1.776108133816134e-06, "loss": 0.8208, "step": 986 }, { "epoch": 0.7621621621621621, "grad_norm": 4.0, "learning_rate": 1.7746902039257848e-06, "loss": 0.8641, "step": 987 }, { "epoch": 0.7629343629343629, "grad_norm": 4.28125, "learning_rate": 1.773271454028232e-06, "loss": 0.8324, "step": 988 }, { "epoch": 0.7637065637065638, "grad_norm": 4.03125, "learning_rate": 1.7718518863407557e-06, "loss": 0.8395, "step": 989 }, { "epoch": 0.7644787644787645, "grad_norm": 4.15625, "learning_rate": 1.7704315030819147e-06, "loss": 0.8489, "step": 990 }, { "epoch": 0.7652509652509653, "grad_norm": 4.03125, "learning_rate": 1.7690103064715403e-06, "loss": 0.8573, "step": 991 }, { "epoch": 0.766023166023166, "grad_norm": 4.40625, "learning_rate": 1.7675882987307378e-06, "loss": 0.8898, "step": 992 }, { "epoch": 0.7667953667953668, "grad_norm": 28.75, "learning_rate": 1.7661654820818783e-06, "loss": 0.9599, "step": 993 }, { "epoch": 0.7675675675675676, "grad_norm": 4.65625, "learning_rate": 1.764741858748597e-06, "loss": 0.8899, "step": 994 }, { "epoch": 0.7683397683397684, "grad_norm": 4.75, "learning_rate": 1.763317430955791e-06, "loss": 1.0054, "step": 995 }, { "epoch": 0.7691119691119691, "grad_norm": 4.25, "learning_rate": 1.761892200929614e-06, "loss": 0.8155, "step": 996 }, { "epoch": 0.7698841698841699, "grad_norm": 3.96875, "learning_rate": 1.7604661708974738e-06, "loss": 0.8688, "step": 997 }, { "epoch": 0.7706563706563706, "grad_norm": 3.953125, "learning_rate": 1.7590393430880276e-06, "loss": 0.7937, "step": 998 }, { "epoch": 0.7714285714285715, "grad_norm": 4.78125, "learning_rate": 1.7576117197311798e-06, "loss": 0.8835, "step": 999 }, { "epoch": 0.7722007722007722, "grad_norm": 4.84375, "learning_rate": 1.7561833030580788e-06, "loss": 0.8861, "step": 1000 }, { "epoch": 0.772972972972973, "grad_norm": 4.09375, "learning_rate": 1.754754095301112e-06, "loss": 0.8273, "step": 1001 }, { "epoch": 0.7737451737451737, "grad_norm": 3.5, "learning_rate": 1.7533240986939038e-06, "loss": 0.813, "step": 1002 }, { "epoch": 0.7745173745173746, "grad_norm": 4.09375, "learning_rate": 1.7518933154713106e-06, "loss": 0.9778, "step": 1003 }, { "epoch": 0.7752895752895753, "grad_norm": 4.125, "learning_rate": 1.7504617478694188e-06, "loss": 0.8915, "step": 1004 }, { "epoch": 0.7760617760617761, "grad_norm": 4.03125, "learning_rate": 1.7490293981255407e-06, "loss": 0.9455, "step": 1005 }, { "epoch": 0.7768339768339768, "grad_norm": 4.875, "learning_rate": 1.747596268478211e-06, "loss": 0.7654, "step": 1006 }, { "epoch": 0.7776061776061776, "grad_norm": 4.15625, "learning_rate": 1.7461623611671823e-06, "loss": 0.8876, "step": 1007 }, { "epoch": 0.7783783783783784, "grad_norm": 3.8125, "learning_rate": 1.744727678433424e-06, "loss": 0.9562, "step": 1008 }, { "epoch": 0.7791505791505792, "grad_norm": 4.21875, "learning_rate": 1.7432922225191165e-06, "loss": 0.8959, "step": 1009 }, { "epoch": 0.7799227799227799, "grad_norm": 3.4375, "learning_rate": 1.7418559956676485e-06, "loss": 0.7994, "step": 1010 }, { "epoch": 0.7806949806949807, "grad_norm": 3.625, "learning_rate": 1.7404190001236134e-06, "loss": 0.7779, "step": 1011 }, { "epoch": 0.7814671814671814, "grad_norm": 3.625, "learning_rate": 1.7389812381328073e-06, "loss": 0.7948, "step": 1012 }, { "epoch": 0.7822393822393823, "grad_norm": 3.796875, "learning_rate": 1.7375427119422224e-06, "loss": 0.8677, "step": 1013 }, { "epoch": 0.783011583011583, "grad_norm": 3.5, "learning_rate": 1.7361034238000464e-06, "loss": 0.8321, "step": 1014 }, { "epoch": 0.7837837837837838, "grad_norm": 4.0, "learning_rate": 1.734663375955657e-06, "loss": 0.9345, "step": 1015 }, { "epoch": 0.7845559845559845, "grad_norm": 4.125, "learning_rate": 1.7332225706596196e-06, "loss": 0.9638, "step": 1016 }, { "epoch": 0.7853281853281854, "grad_norm": 5.0625, "learning_rate": 1.7317810101636838e-06, "loss": 0.897, "step": 1017 }, { "epoch": 0.7861003861003861, "grad_norm": 4.375, "learning_rate": 1.7303386967207788e-06, "loss": 0.8634, "step": 1018 }, { "epoch": 0.7868725868725869, "grad_norm": 6.9375, "learning_rate": 1.7288956325850104e-06, "loss": 0.9193, "step": 1019 }, { "epoch": 0.7876447876447876, "grad_norm": 4.53125, "learning_rate": 1.7274518200116585e-06, "loss": 0.8322, "step": 1020 }, { "epoch": 0.7884169884169884, "grad_norm": 4.71875, "learning_rate": 1.7260072612571717e-06, "loss": 0.9302, "step": 1021 }, { "epoch": 0.7891891891891892, "grad_norm": 4.03125, "learning_rate": 1.7245619585791656e-06, "loss": 0.8651, "step": 1022 }, { "epoch": 0.78996138996139, "grad_norm": 3.609375, "learning_rate": 1.7231159142364177e-06, "loss": 0.7516, "step": 1023 }, { "epoch": 0.7907335907335907, "grad_norm": 3.46875, "learning_rate": 1.721669130488865e-06, "loss": 0.7289, "step": 1024 }, { "epoch": 0.7915057915057915, "grad_norm": 3.8125, "learning_rate": 1.7202216095976002e-06, "loss": 0.7923, "step": 1025 }, { "epoch": 0.7922779922779922, "grad_norm": 4.4375, "learning_rate": 1.7187733538248678e-06, "loss": 0.8739, "step": 1026 }, { "epoch": 0.7930501930501931, "grad_norm": 3.8125, "learning_rate": 1.7173243654340604e-06, "loss": 0.7997, "step": 1027 }, { "epoch": 0.7938223938223938, "grad_norm": 3.9375, "learning_rate": 1.715874646689717e-06, "loss": 0.9286, "step": 1028 }, { "epoch": 0.7945945945945946, "grad_norm": 3.640625, "learning_rate": 1.714424199857516e-06, "loss": 0.8748, "step": 1029 }, { "epoch": 0.7953667953667953, "grad_norm": 15.0625, "learning_rate": 1.7129730272042758e-06, "loss": 0.8401, "step": 1030 }, { "epoch": 0.7961389961389962, "grad_norm": 4.0, "learning_rate": 1.7115211309979467e-06, "loss": 0.864, "step": 1031 }, { "epoch": 0.7969111969111969, "grad_norm": 4.15625, "learning_rate": 1.7100685135076128e-06, "loss": 0.9867, "step": 1032 }, { "epoch": 0.7976833976833977, "grad_norm": 3.78125, "learning_rate": 1.708615177003483e-06, "loss": 0.9566, "step": 1033 }, { "epoch": 0.7984555984555984, "grad_norm": 7.25, "learning_rate": 1.70716112375689e-06, "loss": 0.4817, "step": 1034 }, { "epoch": 0.7992277992277992, "grad_norm": 4.59375, "learning_rate": 1.7057063560402885e-06, "loss": 1.0959, "step": 1035 }, { "epoch": 0.8, "grad_norm": 3.96875, "learning_rate": 1.7042508761272483e-06, "loss": 0.9112, "step": 1036 }, { "epoch": 0.8007722007722008, "grad_norm": 4.59375, "learning_rate": 1.702794686292453e-06, "loss": 0.9115, "step": 1037 }, { "epoch": 0.8015444015444015, "grad_norm": 3.609375, "learning_rate": 1.7013377888116947e-06, "loss": 0.8883, "step": 1038 }, { "epoch": 0.8023166023166023, "grad_norm": 4.15625, "learning_rate": 1.6998801859618724e-06, "loss": 0.8486, "step": 1039 }, { "epoch": 0.803088803088803, "grad_norm": 4.59375, "learning_rate": 1.6984218800209872e-06, "loss": 0.8811, "step": 1040 }, { "epoch": 0.8038610038610039, "grad_norm": 4.3125, "learning_rate": 1.696962873268139e-06, "loss": 1.0043, "step": 1041 }, { "epoch": 0.8046332046332046, "grad_norm": 3.671875, "learning_rate": 1.6955031679835226e-06, "loss": 0.8322, "step": 1042 }, { "epoch": 0.8054054054054054, "grad_norm": 4.375, "learning_rate": 1.6940427664484249e-06, "loss": 0.9439, "step": 1043 }, { "epoch": 0.8061776061776061, "grad_norm": 3.984375, "learning_rate": 1.6925816709452211e-06, "loss": 0.9746, "step": 1044 }, { "epoch": 0.806949806949807, "grad_norm": 3.453125, "learning_rate": 1.691119883757371e-06, "loss": 0.7129, "step": 1045 }, { "epoch": 0.8077220077220078, "grad_norm": 3.65625, "learning_rate": 1.6896574071694145e-06, "loss": 0.7858, "step": 1046 }, { "epoch": 0.8084942084942085, "grad_norm": 4.09375, "learning_rate": 1.6881942434669697e-06, "loss": 0.9056, "step": 1047 }, { "epoch": 0.8092664092664092, "grad_norm": 4.75, "learning_rate": 1.6867303949367285e-06, "loss": 1.0173, "step": 1048 }, { "epoch": 0.81003861003861, "grad_norm": 4.0, "learning_rate": 1.685265863866453e-06, "loss": 0.8114, "step": 1049 }, { "epoch": 0.8108108108108109, "grad_norm": 3.96875, "learning_rate": 1.6838006525449716e-06, "loss": 0.7977, "step": 1050 }, { "epoch": 0.8115830115830116, "grad_norm": 4.90625, "learning_rate": 1.6823347632621764e-06, "loss": 0.9449, "step": 1051 }, { "epoch": 0.8123552123552124, "grad_norm": 3.90625, "learning_rate": 1.6808681983090191e-06, "loss": 0.8885, "step": 1052 }, { "epoch": 0.8131274131274131, "grad_norm": 6.9375, "learning_rate": 1.6794009599775069e-06, "loss": 0.8166, "step": 1053 }, { "epoch": 0.8138996138996138, "grad_norm": 4.34375, "learning_rate": 1.6779330505606988e-06, "loss": 0.9611, "step": 1054 }, { "epoch": 0.8146718146718147, "grad_norm": 4.40625, "learning_rate": 1.6764644723527046e-06, "loss": 0.9432, "step": 1055 }, { "epoch": 0.8154440154440155, "grad_norm": 4.5, "learning_rate": 1.6749952276486777e-06, "loss": 0.8541, "step": 1056 }, { "epoch": 0.8162162162162162, "grad_norm": 4.15625, "learning_rate": 1.6735253187448133e-06, "loss": 0.7722, "step": 1057 }, { "epoch": 0.816988416988417, "grad_norm": 3.8125, "learning_rate": 1.6720547479383451e-06, "loss": 0.8316, "step": 1058 }, { "epoch": 0.8177606177606178, "grad_norm": 3.703125, "learning_rate": 1.6705835175275406e-06, "loss": 0.7219, "step": 1059 }, { "epoch": 0.8185328185328186, "grad_norm": 4.75, "learning_rate": 1.6691116298116987e-06, "loss": 0.9476, "step": 1060 }, { "epoch": 0.8193050193050193, "grad_norm": 4.375, "learning_rate": 1.6676390870911464e-06, "loss": 0.7728, "step": 1061 }, { "epoch": 0.8200772200772201, "grad_norm": 4.28125, "learning_rate": 1.6661658916672319e-06, "loss": 1.03, "step": 1062 }, { "epoch": 0.8208494208494208, "grad_norm": 4.0, "learning_rate": 1.664692045842325e-06, "loss": 0.8441, "step": 1063 }, { "epoch": 0.8216216216216217, "grad_norm": 5.65625, "learning_rate": 1.6632175519198132e-06, "loss": 0.9804, "step": 1064 }, { "epoch": 0.8223938223938224, "grad_norm": 7.0625, "learning_rate": 1.6617424122040947e-06, "loss": 0.8568, "step": 1065 }, { "epoch": 0.8231660231660232, "grad_norm": 4.6875, "learning_rate": 1.6602666290005787e-06, "loss": 0.8954, "step": 1066 }, { "epoch": 0.8239382239382239, "grad_norm": 3.90625, "learning_rate": 1.658790204615678e-06, "loss": 0.8951, "step": 1067 }, { "epoch": 0.8247104247104247, "grad_norm": 4.03125, "learning_rate": 1.6573131413568094e-06, "loss": 0.8383, "step": 1068 }, { "epoch": 0.8254826254826255, "grad_norm": 5.3125, "learning_rate": 1.655835441532388e-06, "loss": 0.8631, "step": 1069 }, { "epoch": 0.8262548262548263, "grad_norm": 3.96875, "learning_rate": 1.6543571074518221e-06, "loss": 0.9402, "step": 1070 }, { "epoch": 0.827027027027027, "grad_norm": 4.25, "learning_rate": 1.6528781414255132e-06, "loss": 0.9119, "step": 1071 }, { "epoch": 0.8277992277992278, "grad_norm": 3.796875, "learning_rate": 1.6513985457648493e-06, "loss": 0.8971, "step": 1072 }, { "epoch": 0.8285714285714286, "grad_norm": 4.4375, "learning_rate": 1.6499183227822024e-06, "loss": 0.8849, "step": 1073 }, { "epoch": 0.8293436293436294, "grad_norm": 3.84375, "learning_rate": 1.648437474790926e-06, "loss": 0.9041, "step": 1074 }, { "epoch": 0.8301158301158301, "grad_norm": 4.34375, "learning_rate": 1.646956004105349e-06, "loss": 1.0281, "step": 1075 }, { "epoch": 0.8308880308880309, "grad_norm": 4.6875, "learning_rate": 1.6454739130407743e-06, "loss": 0.9392, "step": 1076 }, { "epoch": 0.8316602316602316, "grad_norm": 4.03125, "learning_rate": 1.6439912039134742e-06, "loss": 0.9809, "step": 1077 }, { "epoch": 0.8324324324324325, "grad_norm": 7.3125, "learning_rate": 1.6425078790406864e-06, "loss": 0.8327, "step": 1078 }, { "epoch": 0.8332046332046332, "grad_norm": 3.640625, "learning_rate": 1.641023940740612e-06, "loss": 0.7837, "step": 1079 }, { "epoch": 0.833976833976834, "grad_norm": 4.5625, "learning_rate": 1.6395393913324098e-06, "loss": 1.0539, "step": 1080 }, { "epoch": 0.8347490347490347, "grad_norm": 3.9375, "learning_rate": 1.638054233136194e-06, "loss": 0.6917, "step": 1081 }, { "epoch": 0.8355212355212355, "grad_norm": 6.34375, "learning_rate": 1.6365684684730303e-06, "loss": 0.8442, "step": 1082 }, { "epoch": 0.8362934362934363, "grad_norm": 4.71875, "learning_rate": 1.6350820996649325e-06, "loss": 0.857, "step": 1083 }, { "epoch": 0.8370656370656371, "grad_norm": 4.5, "learning_rate": 1.6335951290348579e-06, "loss": 0.8423, "step": 1084 }, { "epoch": 0.8378378378378378, "grad_norm": 7.125, "learning_rate": 1.6321075589067048e-06, "loss": 0.7711, "step": 1085 }, { "epoch": 0.8386100386100386, "grad_norm": 4.75, "learning_rate": 1.6306193916053084e-06, "loss": 0.8513, "step": 1086 }, { "epoch": 0.8393822393822394, "grad_norm": 5.5, "learning_rate": 1.6291306294564369e-06, "loss": 0.7782, "step": 1087 }, { "epoch": 0.8401544401544402, "grad_norm": 7.09375, "learning_rate": 1.6276412747867889e-06, "loss": 0.9183, "step": 1088 }, { "epoch": 0.8409266409266409, "grad_norm": 4.25, "learning_rate": 1.6261513299239882e-06, "loss": 0.98, "step": 1089 }, { "epoch": 0.8416988416988417, "grad_norm": 4.09375, "learning_rate": 1.6246607971965812e-06, "loss": 0.8163, "step": 1090 }, { "epoch": 0.8424710424710424, "grad_norm": 4.8125, "learning_rate": 1.6231696789340326e-06, "loss": 1.0246, "step": 1091 }, { "epoch": 0.8432432432432433, "grad_norm": 4.3125, "learning_rate": 1.6216779774667238e-06, "loss": 0.8445, "step": 1092 }, { "epoch": 0.844015444015444, "grad_norm": 4.78125, "learning_rate": 1.6201856951259454e-06, "loss": 0.9022, "step": 1093 }, { "epoch": 0.8447876447876448, "grad_norm": 3.859375, "learning_rate": 1.6186928342438982e-06, "loss": 0.7867, "step": 1094 }, { "epoch": 0.8455598455598455, "grad_norm": 4.65625, "learning_rate": 1.6171993971536848e-06, "loss": 0.9803, "step": 1095 }, { "epoch": 0.8463320463320463, "grad_norm": 3.765625, "learning_rate": 1.61570538618931e-06, "loss": 0.6843, "step": 1096 }, { "epoch": 0.8471042471042471, "grad_norm": 3.421875, "learning_rate": 1.6142108036856756e-06, "loss": 0.8811, "step": 1097 }, { "epoch": 0.8478764478764479, "grad_norm": 4.40625, "learning_rate": 1.6127156519785748e-06, "loss": 0.9467, "step": 1098 }, { "epoch": 0.8486486486486486, "grad_norm": 4.21875, "learning_rate": 1.6112199334046915e-06, "loss": 0.8105, "step": 1099 }, { "epoch": 0.8494208494208494, "grad_norm": 4.46875, "learning_rate": 1.6097236503015967e-06, "loss": 0.9599, "step": 1100 }, { "epoch": 0.8501930501930502, "grad_norm": 4.34375, "learning_rate": 1.6082268050077415e-06, "loss": 0.9437, "step": 1101 }, { "epoch": 0.850965250965251, "grad_norm": 4.03125, "learning_rate": 1.6067293998624572e-06, "loss": 0.9224, "step": 1102 }, { "epoch": 0.8517374517374517, "grad_norm": 56.0, "learning_rate": 1.6052314372059492e-06, "loss": 0.8686, "step": 1103 }, { "epoch": 0.8525096525096525, "grad_norm": 5.5625, "learning_rate": 1.6037329193792944e-06, "loss": 0.8486, "step": 1104 }, { "epoch": 0.8532818532818532, "grad_norm": 3.46875, "learning_rate": 1.6022338487244381e-06, "loss": 0.8508, "step": 1105 }, { "epoch": 0.8540540540540541, "grad_norm": 4.0625, "learning_rate": 1.6007342275841886e-06, "loss": 0.9727, "step": 1106 }, { "epoch": 0.8548262548262548, "grad_norm": 4.53125, "learning_rate": 1.5992340583022143e-06, "loss": 0.8997, "step": 1107 }, { "epoch": 0.8555984555984556, "grad_norm": 4.0625, "learning_rate": 1.5977333432230412e-06, "loss": 0.9246, "step": 1108 }, { "epoch": 0.8563706563706563, "grad_norm": 3.796875, "learning_rate": 1.5962320846920481e-06, "loss": 0.8548, "step": 1109 }, { "epoch": 0.8571428571428571, "grad_norm": 3.90625, "learning_rate": 1.594730285055463e-06, "loss": 0.4284, "step": 1110 }, { "epoch": 0.857915057915058, "grad_norm": 4.9375, "learning_rate": 1.593227946660359e-06, "loss": 0.8762, "step": 1111 }, { "epoch": 0.8586872586872587, "grad_norm": 3.859375, "learning_rate": 1.591725071854652e-06, "loss": 0.8533, "step": 1112 }, { "epoch": 0.8594594594594595, "grad_norm": 3.65625, "learning_rate": 1.5902216629870956e-06, "loss": 0.8737, "step": 1113 }, { "epoch": 0.8602316602316602, "grad_norm": 3.921875, "learning_rate": 1.588717722407279e-06, "loss": 0.7848, "step": 1114 }, { "epoch": 0.861003861003861, "grad_norm": 4.3125, "learning_rate": 1.5872132524656208e-06, "loss": 0.9142, "step": 1115 }, { "epoch": 0.8617760617760618, "grad_norm": 4.09375, "learning_rate": 1.585708255513369e-06, "loss": 0.8936, "step": 1116 }, { "epoch": 0.8625482625482626, "grad_norm": 4.25, "learning_rate": 1.5842027339025933e-06, "loss": 0.7935, "step": 1117 }, { "epoch": 0.8633204633204633, "grad_norm": 4.625, "learning_rate": 1.5826966899861842e-06, "loss": 0.8933, "step": 1118 }, { "epoch": 0.864092664092664, "grad_norm": 5.03125, "learning_rate": 1.5811901261178485e-06, "loss": 0.7957, "step": 1119 }, { "epoch": 0.8648648648648649, "grad_norm": 4.4375, "learning_rate": 1.5796830446521059e-06, "loss": 0.8232, "step": 1120 }, { "epoch": 0.8656370656370657, "grad_norm": 3.8125, "learning_rate": 1.578175447944284e-06, "loss": 0.8612, "step": 1121 }, { "epoch": 0.8664092664092664, "grad_norm": 4.0, "learning_rate": 1.5766673383505168e-06, "loss": 0.9304, "step": 1122 }, { "epoch": 0.8671814671814672, "grad_norm": 4.0, "learning_rate": 1.575158718227739e-06, "loss": 0.8592, "step": 1123 }, { "epoch": 0.8679536679536679, "grad_norm": 4.8125, "learning_rate": 1.5736495899336834e-06, "loss": 1.0185, "step": 1124 }, { "epoch": 0.8687258687258688, "grad_norm": 3.84375, "learning_rate": 1.5721399558268777e-06, "loss": 0.9777, "step": 1125 }, { "epoch": 0.8694980694980695, "grad_norm": 9.5625, "learning_rate": 1.5706298182666394e-06, "loss": 0.7969, "step": 1126 }, { "epoch": 0.8702702702702703, "grad_norm": 4.03125, "learning_rate": 1.5691191796130722e-06, "loss": 0.933, "step": 1127 }, { "epoch": 0.871042471042471, "grad_norm": 3.90625, "learning_rate": 1.5676080422270647e-06, "loss": 0.9443, "step": 1128 }, { "epoch": 0.8718146718146718, "grad_norm": 4.5625, "learning_rate": 1.5660964084702835e-06, "loss": 0.8685, "step": 1129 }, { "epoch": 0.8725868725868726, "grad_norm": 4.25, "learning_rate": 1.564584280705171e-06, "loss": 0.9638, "step": 1130 }, { "epoch": 0.8733590733590734, "grad_norm": 4.25, "learning_rate": 1.563071661294942e-06, "loss": 0.7558, "step": 1131 }, { "epoch": 0.8741312741312741, "grad_norm": 3.765625, "learning_rate": 1.5615585526035804e-06, "loss": 0.7694, "step": 1132 }, { "epoch": 0.8749034749034749, "grad_norm": 4.15625, "learning_rate": 1.5600449569958335e-06, "loss": 0.9961, "step": 1133 }, { "epoch": 0.8756756756756757, "grad_norm": 3.9375, "learning_rate": 1.55853087683721e-06, "loss": 0.747, "step": 1134 }, { "epoch": 0.8764478764478765, "grad_norm": 4.3125, "learning_rate": 1.5570163144939762e-06, "loss": 0.8064, "step": 1135 }, { "epoch": 0.8772200772200772, "grad_norm": 3.921875, "learning_rate": 1.555501272333152e-06, "loss": 0.855, "step": 1136 }, { "epoch": 0.877992277992278, "grad_norm": 4.53125, "learning_rate": 1.5539857527225067e-06, "loss": 0.9056, "step": 1137 }, { "epoch": 0.8787644787644787, "grad_norm": 4.0625, "learning_rate": 1.5524697580305558e-06, "loss": 0.945, "step": 1138 }, { "epoch": 0.8795366795366796, "grad_norm": 4.21875, "learning_rate": 1.550953290626558e-06, "loss": 0.937, "step": 1139 }, { "epoch": 0.8803088803088803, "grad_norm": 4.78125, "learning_rate": 1.5494363528805095e-06, "loss": 0.8816, "step": 1140 }, { "epoch": 0.8810810810810811, "grad_norm": 4.0625, "learning_rate": 1.5479189471631435e-06, "loss": 0.9474, "step": 1141 }, { "epoch": 0.8818532818532818, "grad_norm": 3.96875, "learning_rate": 1.5464010758459224e-06, "loss": 1.0079, "step": 1142 }, { "epoch": 0.8826254826254826, "grad_norm": 3.546875, "learning_rate": 1.5448827413010372e-06, "loss": 0.6841, "step": 1143 }, { "epoch": 0.8833976833976834, "grad_norm": 3.828125, "learning_rate": 1.5433639459014039e-06, "loss": 0.8736, "step": 1144 }, { "epoch": 0.8841698841698842, "grad_norm": 6.40625, "learning_rate": 1.541844692020657e-06, "loss": 0.8257, "step": 1145 }, { "epoch": 0.8849420849420849, "grad_norm": 4.46875, "learning_rate": 1.5403249820331483e-06, "loss": 0.8992, "step": 1146 }, { "epoch": 0.8857142857142857, "grad_norm": 3.96875, "learning_rate": 1.538804818313943e-06, "loss": 0.8815, "step": 1147 }, { "epoch": 0.8864864864864865, "grad_norm": 3.703125, "learning_rate": 1.537284203238814e-06, "loss": 0.9024, "step": 1148 }, { "epoch": 0.8872586872586873, "grad_norm": 4.15625, "learning_rate": 1.535763139184241e-06, "loss": 0.7519, "step": 1149 }, { "epoch": 0.888030888030888, "grad_norm": 4.3125, "learning_rate": 1.5342416285274052e-06, "loss": 0.8546, "step": 1150 }, { "epoch": 0.8888030888030888, "grad_norm": 6.5625, "learning_rate": 1.5327196736461841e-06, "loss": 0.8806, "step": 1151 }, { "epoch": 0.8895752895752895, "grad_norm": 3.609375, "learning_rate": 1.5311972769191516e-06, "loss": 0.8247, "step": 1152 }, { "epoch": 0.8903474903474904, "grad_norm": 4.375, "learning_rate": 1.5296744407255711e-06, "loss": 0.9334, "step": 1153 }, { "epoch": 0.8911196911196911, "grad_norm": 5.90625, "learning_rate": 1.528151167445393e-06, "loss": 0.9418, "step": 1154 }, { "epoch": 0.8918918918918919, "grad_norm": 4.03125, "learning_rate": 1.526627459459251e-06, "loss": 0.9717, "step": 1155 }, { "epoch": 0.8926640926640926, "grad_norm": 4.28125, "learning_rate": 1.5251033191484574e-06, "loss": 0.9234, "step": 1156 }, { "epoch": 0.8934362934362934, "grad_norm": 4.46875, "learning_rate": 1.5235787488950014e-06, "loss": 0.9141, "step": 1157 }, { "epoch": 0.8942084942084942, "grad_norm": 4.40625, "learning_rate": 1.5220537510815429e-06, "loss": 0.9426, "step": 1158 }, { "epoch": 0.894980694980695, "grad_norm": 3.59375, "learning_rate": 1.5205283280914105e-06, "loss": 0.8401, "step": 1159 }, { "epoch": 0.8957528957528957, "grad_norm": 4.46875, "learning_rate": 1.5190024823085975e-06, "loss": 0.8253, "step": 1160 }, { "epoch": 0.8965250965250965, "grad_norm": 12.5625, "learning_rate": 1.5174762161177581e-06, "loss": 0.8692, "step": 1161 }, { "epoch": 0.8972972972972973, "grad_norm": 3.625, "learning_rate": 1.5159495319042028e-06, "loss": 0.7199, "step": 1162 }, { "epoch": 0.8980694980694981, "grad_norm": 5.375, "learning_rate": 1.5144224320538958e-06, "loss": 0.9295, "step": 1163 }, { "epoch": 0.8988416988416988, "grad_norm": 3.828125, "learning_rate": 1.512894918953451e-06, "loss": 0.7217, "step": 1164 }, { "epoch": 0.8996138996138996, "grad_norm": 3.796875, "learning_rate": 1.5113669949901283e-06, "loss": 0.8152, "step": 1165 }, { "epoch": 0.9003861003861003, "grad_norm": 3.921875, "learning_rate": 1.509838662551829e-06, "loss": 0.8465, "step": 1166 }, { "epoch": 0.9011583011583012, "grad_norm": 4.34375, "learning_rate": 1.5083099240270934e-06, "loss": 0.7721, "step": 1167 }, { "epoch": 0.901930501930502, "grad_norm": 11.375, "learning_rate": 1.5067807818050967e-06, "loss": 0.8919, "step": 1168 }, { "epoch": 0.9027027027027027, "grad_norm": 4.25, "learning_rate": 1.505251238275644e-06, "loss": 0.8378, "step": 1169 }, { "epoch": 0.9034749034749034, "grad_norm": 4.1875, "learning_rate": 1.5037212958291686e-06, "loss": 0.8617, "step": 1170 }, { "epoch": 0.9042471042471042, "grad_norm": 6.125, "learning_rate": 1.5021909568567264e-06, "loss": 0.86, "step": 1171 }, { "epoch": 0.905019305019305, "grad_norm": 3.90625, "learning_rate": 1.5006602237499938e-06, "loss": 0.8505, "step": 1172 }, { "epoch": 0.9057915057915058, "grad_norm": 4.125, "learning_rate": 1.4991290989012622e-06, "loss": 0.9625, "step": 1173 }, { "epoch": 0.9065637065637066, "grad_norm": 3.875, "learning_rate": 1.497597584703437e-06, "loss": 0.8285, "step": 1174 }, { "epoch": 0.9073359073359073, "grad_norm": 4.03125, "learning_rate": 1.4960656835500296e-06, "loss": 0.8701, "step": 1175 }, { "epoch": 0.9081081081081082, "grad_norm": 3.75, "learning_rate": 1.4945333978351581e-06, "loss": 0.8545, "step": 1176 }, { "epoch": 0.9088803088803089, "grad_norm": 3.859375, "learning_rate": 1.4930007299535411e-06, "loss": 0.9657, "step": 1177 }, { "epoch": 0.9096525096525097, "grad_norm": 3.890625, "learning_rate": 1.4914676823004942e-06, "loss": 0.9906, "step": 1178 }, { "epoch": 0.9104247104247104, "grad_norm": 3.828125, "learning_rate": 1.4899342572719264e-06, "loss": 0.8909, "step": 1179 }, { "epoch": 0.9111969111969112, "grad_norm": 3.9375, "learning_rate": 1.488400457264337e-06, "loss": 0.8222, "step": 1180 }, { "epoch": 0.911969111969112, "grad_norm": 3.59375, "learning_rate": 1.4868662846748114e-06, "loss": 0.7885, "step": 1181 }, { "epoch": 0.9127413127413128, "grad_norm": 4.28125, "learning_rate": 1.4853317419010163e-06, "loss": 0.8413, "step": 1182 }, { "epoch": 0.9135135135135135, "grad_norm": 11.9375, "learning_rate": 1.4837968313411977e-06, "loss": 0.9131, "step": 1183 }, { "epoch": 0.9142857142857143, "grad_norm": 3.859375, "learning_rate": 1.4822615553941767e-06, "loss": 0.8431, "step": 1184 }, { "epoch": 0.915057915057915, "grad_norm": 6.28125, "learning_rate": 1.4807259164593445e-06, "loss": 0.7311, "step": 1185 }, { "epoch": 0.9158301158301159, "grad_norm": 3.921875, "learning_rate": 1.479189916936661e-06, "loss": 0.8836, "step": 1186 }, { "epoch": 0.9166023166023166, "grad_norm": 8.6875, "learning_rate": 1.4776535592266474e-06, "loss": 0.8414, "step": 1187 }, { "epoch": 0.9173745173745174, "grad_norm": 4.21875, "learning_rate": 1.4761168457303876e-06, "loss": 1.0189, "step": 1188 }, { "epoch": 0.9181467181467181, "grad_norm": 3.921875, "learning_rate": 1.474579778849519e-06, "loss": 0.8234, "step": 1189 }, { "epoch": 0.918918918918919, "grad_norm": 5.0, "learning_rate": 1.4730423609862323e-06, "loss": 0.9868, "step": 1190 }, { "epoch": 0.9196911196911197, "grad_norm": 5.53125, "learning_rate": 1.4715045945432672e-06, "loss": 0.9854, "step": 1191 }, { "epoch": 0.9204633204633205, "grad_norm": 6.46875, "learning_rate": 1.469966481923907e-06, "loss": 1.0622, "step": 1192 }, { "epoch": 0.9212355212355212, "grad_norm": 6.65625, "learning_rate": 1.4684280255319773e-06, "loss": 0.8818, "step": 1193 }, { "epoch": 0.922007722007722, "grad_norm": 4.5625, "learning_rate": 1.4668892277718402e-06, "loss": 1.024, "step": 1194 }, { "epoch": 0.9227799227799228, "grad_norm": 3.375, "learning_rate": 1.465350091048391e-06, "loss": 0.7364, "step": 1195 }, { "epoch": 0.9235521235521236, "grad_norm": 3.984375, "learning_rate": 1.463810617767056e-06, "loss": 0.8759, "step": 1196 }, { "epoch": 0.9243243243243243, "grad_norm": 10.8125, "learning_rate": 1.4622708103337866e-06, "loss": 0.8548, "step": 1197 }, { "epoch": 0.9250965250965251, "grad_norm": 3.984375, "learning_rate": 1.460730671155056e-06, "loss": 0.9098, "step": 1198 }, { "epoch": 0.9258687258687258, "grad_norm": 3.734375, "learning_rate": 1.459190202637856e-06, "loss": 0.7646, "step": 1199 }, { "epoch": 0.9266409266409267, "grad_norm": 4.1875, "learning_rate": 1.4576494071896945e-06, "loss": 0.8319, "step": 1200 }, { "epoch": 0.9274131274131274, "grad_norm": 4.90625, "learning_rate": 1.4561082872185894e-06, "loss": 0.844, "step": 1201 }, { "epoch": 0.9281853281853282, "grad_norm": 3.984375, "learning_rate": 1.454566845133065e-06, "loss": 0.9116, "step": 1202 }, { "epoch": 0.9289575289575289, "grad_norm": 3.9375, "learning_rate": 1.4530250833421494e-06, "loss": 0.817, "step": 1203 }, { "epoch": 0.9297297297297298, "grad_norm": 4.125, "learning_rate": 1.4514830042553715e-06, "loss": 0.8039, "step": 1204 }, { "epoch": 0.9305019305019305, "grad_norm": 3.90625, "learning_rate": 1.449940610282755e-06, "loss": 0.7683, "step": 1205 }, { "epoch": 0.9312741312741313, "grad_norm": 4.25, "learning_rate": 1.4483979038348165e-06, "loss": 0.8528, "step": 1206 }, { "epoch": 0.932046332046332, "grad_norm": 5.09375, "learning_rate": 1.446854887322559e-06, "loss": 0.9019, "step": 1207 }, { "epoch": 0.9328185328185328, "grad_norm": 4.3125, "learning_rate": 1.4453115631574732e-06, "loss": 0.9478, "step": 1208 }, { "epoch": 0.9335907335907336, "grad_norm": 15.4375, "learning_rate": 1.443767933751528e-06, "loss": 0.9549, "step": 1209 }, { "epoch": 0.9343629343629344, "grad_norm": 3.734375, "learning_rate": 1.4422240015171698e-06, "loss": 0.8541, "step": 1210 }, { "epoch": 0.9351351351351351, "grad_norm": 4.0625, "learning_rate": 1.4406797688673193e-06, "loss": 0.9445, "step": 1211 }, { "epoch": 0.9359073359073359, "grad_norm": 3.765625, "learning_rate": 1.4391352382153664e-06, "loss": 0.7821, "step": 1212 }, { "epoch": 0.9366795366795366, "grad_norm": 4.78125, "learning_rate": 1.437590411975166e-06, "loss": 0.9229, "step": 1213 }, { "epoch": 0.9374517374517375, "grad_norm": 4.40625, "learning_rate": 1.4360452925610357e-06, "loss": 0.8256, "step": 1214 }, { "epoch": 0.9382239382239382, "grad_norm": 4.65625, "learning_rate": 1.4344998823877507e-06, "loss": 0.779, "step": 1215 }, { "epoch": 0.938996138996139, "grad_norm": 4.96875, "learning_rate": 1.4329541838705413e-06, "loss": 0.8149, "step": 1216 }, { "epoch": 0.9397683397683397, "grad_norm": 4.0625, "learning_rate": 1.4314081994250879e-06, "loss": 0.9477, "step": 1217 }, { "epoch": 0.9405405405405406, "grad_norm": 4.375, "learning_rate": 1.4298619314675176e-06, "loss": 0.8179, "step": 1218 }, { "epoch": 0.9413127413127413, "grad_norm": 4.3125, "learning_rate": 1.4283153824144012e-06, "loss": 0.6796, "step": 1219 }, { "epoch": 0.9420849420849421, "grad_norm": 4.1875, "learning_rate": 1.4267685546827485e-06, "loss": 0.8524, "step": 1220 }, { "epoch": 0.9428571428571428, "grad_norm": 4.375, "learning_rate": 1.4252214506900047e-06, "loss": 0.7351, "step": 1221 }, { "epoch": 0.9436293436293436, "grad_norm": 4.53125, "learning_rate": 1.4236740728540468e-06, "loss": 0.9198, "step": 1222 }, { "epoch": 0.9444015444015444, "grad_norm": 3.5, "learning_rate": 1.4221264235931803e-06, "loss": 0.8073, "step": 1223 }, { "epoch": 0.9451737451737452, "grad_norm": 3.78125, "learning_rate": 1.4205785053261336e-06, "loss": 0.9701, "step": 1224 }, { "epoch": 0.9459459459459459, "grad_norm": 3.828125, "learning_rate": 1.4190303204720576e-06, "loss": 0.7613, "step": 1225 }, { "epoch": 0.9467181467181467, "grad_norm": 3.703125, "learning_rate": 1.4174818714505173e-06, "loss": 0.7194, "step": 1226 }, { "epoch": 0.9474903474903474, "grad_norm": 3.875, "learning_rate": 1.4159331606814927e-06, "loss": 0.9724, "step": 1227 }, { "epoch": 0.9482625482625483, "grad_norm": 3.65625, "learning_rate": 1.414384190585372e-06, "loss": 0.8003, "step": 1228 }, { "epoch": 0.949034749034749, "grad_norm": 3.875, "learning_rate": 1.4128349635829483e-06, "loss": 1.048, "step": 1229 }, { "epoch": 0.9498069498069498, "grad_norm": 3.859375, "learning_rate": 1.4112854820954166e-06, "loss": 0.9122, "step": 1230 }, { "epoch": 0.9505791505791505, "grad_norm": 5.21875, "learning_rate": 1.40973574854437e-06, "loss": 1.0198, "step": 1231 }, { "epoch": 0.9513513513513514, "grad_norm": 3.921875, "learning_rate": 1.4081857653517949e-06, "loss": 0.749, "step": 1232 }, { "epoch": 0.9521235521235522, "grad_norm": 5.875, "learning_rate": 1.4066355349400678e-06, "loss": 0.9066, "step": 1233 }, { "epoch": 0.9528957528957529, "grad_norm": 4.90625, "learning_rate": 1.405085059731953e-06, "loss": 0.787, "step": 1234 }, { "epoch": 0.9536679536679536, "grad_norm": 5.21875, "learning_rate": 1.4035343421505949e-06, "loss": 0.8944, "step": 1235 }, { "epoch": 0.9544401544401544, "grad_norm": 4.28125, "learning_rate": 1.4019833846195188e-06, "loss": 0.7879, "step": 1236 }, { "epoch": 0.9552123552123553, "grad_norm": 4.0, "learning_rate": 1.400432189562624e-06, "loss": 0.8001, "step": 1237 }, { "epoch": 0.955984555984556, "grad_norm": 4.3125, "learning_rate": 1.3988807594041815e-06, "loss": 0.9807, "step": 1238 }, { "epoch": 0.9567567567567568, "grad_norm": 3.859375, "learning_rate": 1.3973290965688295e-06, "loss": 0.8761, "step": 1239 }, { "epoch": 0.9575289575289575, "grad_norm": 4.59375, "learning_rate": 1.3957772034815694e-06, "loss": 0.994, "step": 1240 }, { "epoch": 0.9583011583011583, "grad_norm": 4.125, "learning_rate": 1.3942250825677633e-06, "loss": 0.7494, "step": 1241 }, { "epoch": 0.9590733590733591, "grad_norm": 4.75, "learning_rate": 1.3926727362531286e-06, "loss": 0.8959, "step": 1242 }, { "epoch": 0.9598455598455599, "grad_norm": 7.03125, "learning_rate": 1.3911201669637357e-06, "loss": 0.7723, "step": 1243 }, { "epoch": 0.9606177606177606, "grad_norm": 5.15625, "learning_rate": 1.3895673771260026e-06, "loss": 0.8449, "step": 1244 }, { "epoch": 0.9613899613899614, "grad_norm": 4.40625, "learning_rate": 1.3880143691666927e-06, "loss": 0.8938, "step": 1245 }, { "epoch": 0.9621621621621622, "grad_norm": 4.90625, "learning_rate": 1.3864611455129102e-06, "loss": 0.9568, "step": 1246 }, { "epoch": 0.962934362934363, "grad_norm": 4.28125, "learning_rate": 1.3849077085920958e-06, "loss": 0.933, "step": 1247 }, { "epoch": 0.9637065637065637, "grad_norm": 4.09375, "learning_rate": 1.3833540608320245e-06, "loss": 0.8169, "step": 1248 }, { "epoch": 0.9644787644787645, "grad_norm": 12.4375, "learning_rate": 1.3818002046608e-06, "loss": 0.9613, "step": 1249 }, { "epoch": 0.9652509652509652, "grad_norm": 4.15625, "learning_rate": 1.3802461425068514e-06, "loss": 0.8819, "step": 1250 }, { "epoch": 0.9660231660231661, "grad_norm": 3.609375, "learning_rate": 1.3786918767989312e-06, "loss": 0.7039, "step": 1251 }, { "epoch": 0.9667953667953668, "grad_norm": 6.25, "learning_rate": 1.3771374099661088e-06, "loss": 0.8769, "step": 1252 }, { "epoch": 0.9675675675675676, "grad_norm": 3.671875, "learning_rate": 1.375582744437768e-06, "loss": 1.0975, "step": 1253 }, { "epoch": 0.9683397683397683, "grad_norm": 39.0, "learning_rate": 1.3740278826436032e-06, "loss": 0.8262, "step": 1254 }, { "epoch": 0.9691119691119691, "grad_norm": 4.875, "learning_rate": 1.3724728270136158e-06, "loss": 0.8126, "step": 1255 }, { "epoch": 0.9698841698841699, "grad_norm": 4.5, "learning_rate": 1.3709175799781107e-06, "loss": 0.789, "step": 1256 }, { "epoch": 0.9706563706563707, "grad_norm": 4.03125, "learning_rate": 1.3693621439676906e-06, "loss": 0.8808, "step": 1257 }, { "epoch": 0.9714285714285714, "grad_norm": 3.84375, "learning_rate": 1.3678065214132541e-06, "loss": 0.8929, "step": 1258 }, { "epoch": 0.9722007722007722, "grad_norm": 5.15625, "learning_rate": 1.3662507147459922e-06, "loss": 0.8597, "step": 1259 }, { "epoch": 0.972972972972973, "grad_norm": 4.40625, "learning_rate": 1.364694726397382e-06, "loss": 0.9266, "step": 1260 }, { "epoch": 0.9737451737451738, "grad_norm": 67.5, "learning_rate": 1.3631385587991858e-06, "loss": 0.8152, "step": 1261 }, { "epoch": 0.9745173745173745, "grad_norm": 4.0625, "learning_rate": 1.3615822143834455e-06, "loss": 0.9074, "step": 1262 }, { "epoch": 0.9752895752895753, "grad_norm": 4.59375, "learning_rate": 1.3600256955824798e-06, "loss": 0.9443, "step": 1263 }, { "epoch": 0.976061776061776, "grad_norm": 4.03125, "learning_rate": 1.358469004828879e-06, "loss": 0.8632, "step": 1264 }, { "epoch": 0.9768339768339769, "grad_norm": 4.53125, "learning_rate": 1.3569121445555036e-06, "loss": 1.0257, "step": 1265 }, { "epoch": 0.9776061776061776, "grad_norm": 3.8125, "learning_rate": 1.3553551171954777e-06, "loss": 0.8153, "step": 1266 }, { "epoch": 0.9783783783783784, "grad_norm": 4.25, "learning_rate": 1.3537979251821873e-06, "loss": 0.9994, "step": 1267 }, { "epoch": 0.9791505791505791, "grad_norm": 3.90625, "learning_rate": 1.352240570949275e-06, "loss": 0.874, "step": 1268 }, { "epoch": 0.9799227799227799, "grad_norm": 4.875, "learning_rate": 1.3506830569306378e-06, "loss": 0.8746, "step": 1269 }, { "epoch": 0.9806949806949807, "grad_norm": 4.34375, "learning_rate": 1.3491253855604216e-06, "loss": 0.779, "step": 1270 }, { "epoch": 0.9814671814671815, "grad_norm": 4.6875, "learning_rate": 1.3475675592730192e-06, "loss": 0.8719, "step": 1271 }, { "epoch": 0.9822393822393822, "grad_norm": 3.859375, "learning_rate": 1.3460095805030642e-06, "loss": 0.8604, "step": 1272 }, { "epoch": 0.983011583011583, "grad_norm": 4.1875, "learning_rate": 1.3444514516854296e-06, "loss": 0.8011, "step": 1273 }, { "epoch": 0.9837837837837838, "grad_norm": 3.796875, "learning_rate": 1.3428931752552227e-06, "loss": 0.844, "step": 1274 }, { "epoch": 0.9845559845559846, "grad_norm": 4.84375, "learning_rate": 1.3413347536477809e-06, "loss": 0.9281, "step": 1275 }, { "epoch": 0.9853281853281853, "grad_norm": 4.03125, "learning_rate": 1.3397761892986693e-06, "loss": 0.7908, "step": 1276 }, { "epoch": 0.9861003861003861, "grad_norm": 4.03125, "learning_rate": 1.3382174846436756e-06, "loss": 0.8885, "step": 1277 }, { "epoch": 0.9868725868725868, "grad_norm": 3.890625, "learning_rate": 1.3366586421188065e-06, "loss": 0.8543, "step": 1278 }, { "epoch": 0.9876447876447877, "grad_norm": 4.5625, "learning_rate": 1.335099664160285e-06, "loss": 0.9925, "step": 1279 }, { "epoch": 0.9884169884169884, "grad_norm": 3.921875, "learning_rate": 1.3335405532045454e-06, "loss": 1.0096, "step": 1280 }, { "epoch": 0.9891891891891892, "grad_norm": 3.9375, "learning_rate": 1.3319813116882293e-06, "loss": 0.8494, "step": 1281 }, { "epoch": 0.9899613899613899, "grad_norm": 3.875, "learning_rate": 1.3304219420481826e-06, "loss": 0.8857, "step": 1282 }, { "epoch": 0.9907335907335907, "grad_norm": 3.578125, "learning_rate": 1.3288624467214525e-06, "loss": 0.7578, "step": 1283 }, { "epoch": 0.9915057915057915, "grad_norm": 3.859375, "learning_rate": 1.327302828145281e-06, "loss": 0.6938, "step": 1284 }, { "epoch": 0.9922779922779923, "grad_norm": 9.5, "learning_rate": 1.3257430887571038e-06, "loss": 0.9476, "step": 1285 }, { "epoch": 0.993050193050193, "grad_norm": 11.9375, "learning_rate": 1.324183230994545e-06, "loss": 0.789, "step": 1286 }, { "epoch": 0.9938223938223938, "grad_norm": 4.0625, "learning_rate": 1.322623257295414e-06, "loss": 1.036, "step": 1287 }, { "epoch": 0.9945945945945946, "grad_norm": 4.90625, "learning_rate": 1.3210631700977008e-06, "loss": 0.8297, "step": 1288 }, { "epoch": 0.9953667953667954, "grad_norm": 6.78125, "learning_rate": 1.3195029718395739e-06, "loss": 0.8711, "step": 1289 }, { "epoch": 0.9961389961389961, "grad_norm": 4.375, "learning_rate": 1.3179426649593735e-06, "loss": 0.9135, "step": 1290 }, { "epoch": 0.9969111969111969, "grad_norm": 4.71875, "learning_rate": 1.3163822518956115e-06, "loss": 0.8716, "step": 1291 }, { "epoch": 0.9976833976833976, "grad_norm": 7.25, "learning_rate": 1.314821735086965e-06, "loss": 0.881, "step": 1292 }, { "epoch": 0.9984555984555985, "grad_norm": 3.5625, "learning_rate": 1.3132611169722725e-06, "loss": 0.8443, "step": 1293 }, { "epoch": 0.9992277992277993, "grad_norm": 3.9375, "learning_rate": 1.311700399990532e-06, "loss": 0.8487, "step": 1294 }, { "epoch": 1.0, "grad_norm": 4.5, "learning_rate": 1.3101395865808956e-06, "loss": 0.9282, "step": 1295 }, { "epoch": 1.0, "eval_loss": 1.166237711906433, "eval_runtime": 117.1595, "eval_samples_per_second": 14.587, "eval_steps_per_second": 14.587, "step": 1295 }, { "epoch": 1.0007722007722009, "grad_norm": 3.953125, "learning_rate": 1.3085786791826657e-06, "loss": 0.8921, "step": 1296 }, { "epoch": 1.0015444015444015, "grad_norm": 3.859375, "learning_rate": 1.3070176802352923e-06, "loss": 0.8992, "step": 1297 }, { "epoch": 1.0023166023166024, "grad_norm": 4.1875, "learning_rate": 1.3054565921783675e-06, "loss": 0.7923, "step": 1298 }, { "epoch": 1.003088803088803, "grad_norm": 3.921875, "learning_rate": 1.303895417451624e-06, "loss": 0.9169, "step": 1299 }, { "epoch": 1.0038610038610039, "grad_norm": 3.921875, "learning_rate": 1.3023341584949284e-06, "loss": 0.9032, "step": 1300 }, { "epoch": 1.0046332046332047, "grad_norm": 3.5625, "learning_rate": 1.3007728177482804e-06, "loss": 0.8163, "step": 1301 }, { "epoch": 1.0054054054054054, "grad_norm": 4.09375, "learning_rate": 1.2992113976518062e-06, "loss": 0.8649, "step": 1302 }, { "epoch": 1.0061776061776062, "grad_norm": 4.9375, "learning_rate": 1.2976499006457574e-06, "loss": 1.0264, "step": 1303 }, { "epoch": 1.0069498069498068, "grad_norm": 4.9375, "learning_rate": 1.2960883291705042e-06, "loss": 0.8372, "step": 1304 }, { "epoch": 1.0077220077220077, "grad_norm": 4.4375, "learning_rate": 1.294526685666535e-06, "loss": 0.8845, "step": 1305 }, { "epoch": 1.0084942084942086, "grad_norm": 3.96875, "learning_rate": 1.2929649725744492e-06, "loss": 0.9275, "step": 1306 }, { "epoch": 1.0092664092664092, "grad_norm": 3.90625, "learning_rate": 1.291403192334956e-06, "loss": 0.8891, "step": 1307 }, { "epoch": 1.01003861003861, "grad_norm": 3.546875, "learning_rate": 1.2898413473888689e-06, "loss": 0.6692, "step": 1308 }, { "epoch": 1.0108108108108107, "grad_norm": 4.0, "learning_rate": 1.2882794401771027e-06, "loss": 0.7988, "step": 1309 }, { "epoch": 1.0115830115830116, "grad_norm": 6.3125, "learning_rate": 1.286717473140669e-06, "loss": 0.847, "step": 1310 }, { "epoch": 1.0123552123552124, "grad_norm": 3.921875, "learning_rate": 1.2851554487206746e-06, "loss": 0.7418, "step": 1311 }, { "epoch": 1.013127413127413, "grad_norm": 4.125, "learning_rate": 1.283593369358314e-06, "loss": 0.8644, "step": 1312 }, { "epoch": 1.013899613899614, "grad_norm": 3.828125, "learning_rate": 1.2820312374948687e-06, "loss": 0.9868, "step": 1313 }, { "epoch": 1.0146718146718148, "grad_norm": 4.34375, "learning_rate": 1.280469055571702e-06, "loss": 1.0081, "step": 1314 }, { "epoch": 1.0154440154440154, "grad_norm": 4.25, "learning_rate": 1.278906826030255e-06, "loss": 0.8124, "step": 1315 }, { "epoch": 1.0162162162162163, "grad_norm": 6.96875, "learning_rate": 1.2773445513120442e-06, "loss": 0.8898, "step": 1316 }, { "epoch": 1.016988416988417, "grad_norm": 3.640625, "learning_rate": 1.2757822338586557e-06, "loss": 0.7833, "step": 1317 }, { "epoch": 1.0177606177606178, "grad_norm": 3.609375, "learning_rate": 1.2742198761117428e-06, "loss": 0.7846, "step": 1318 }, { "epoch": 1.0185328185328186, "grad_norm": 4.90625, "learning_rate": 1.272657480513022e-06, "loss": 0.7721, "step": 1319 }, { "epoch": 1.0193050193050193, "grad_norm": 4.21875, "learning_rate": 1.2710950495042687e-06, "loss": 0.8946, "step": 1320 }, { "epoch": 1.0200772200772201, "grad_norm": 3.703125, "learning_rate": 1.2695325855273134e-06, "loss": 0.6866, "step": 1321 }, { "epoch": 1.0208494208494208, "grad_norm": 3.890625, "learning_rate": 1.2679700910240384e-06, "loss": 0.8033, "step": 1322 }, { "epoch": 1.0216216216216216, "grad_norm": 3.9375, "learning_rate": 1.2664075684363738e-06, "loss": 0.9072, "step": 1323 }, { "epoch": 1.0223938223938225, "grad_norm": 3.875, "learning_rate": 1.2648450202062936e-06, "loss": 0.905, "step": 1324 }, { "epoch": 1.0231660231660231, "grad_norm": 4.09375, "learning_rate": 1.263282448775812e-06, "loss": 0.7892, "step": 1325 }, { "epoch": 1.023938223938224, "grad_norm": 3.984375, "learning_rate": 1.2617198565869783e-06, "loss": 0.7828, "step": 1326 }, { "epoch": 1.0247104247104246, "grad_norm": 4.34375, "learning_rate": 1.2601572460818763e-06, "loss": 0.8467, "step": 1327 }, { "epoch": 1.0254826254826255, "grad_norm": 4.40625, "learning_rate": 1.2585946197026166e-06, "loss": 0.9286, "step": 1328 }, { "epoch": 1.0262548262548263, "grad_norm": 4.15625, "learning_rate": 1.2570319798913356e-06, "loss": 0.8534, "step": 1329 }, { "epoch": 1.027027027027027, "grad_norm": 4.1875, "learning_rate": 1.2554693290901899e-06, "loss": 0.9241, "step": 1330 }, { "epoch": 1.0277992277992278, "grad_norm": 4.15625, "learning_rate": 1.2539066697413543e-06, "loss": 0.9374, "step": 1331 }, { "epoch": 1.0285714285714285, "grad_norm": 5.09375, "learning_rate": 1.2523440042870163e-06, "loss": 0.8927, "step": 1332 }, { "epoch": 1.0293436293436293, "grad_norm": 6.21875, "learning_rate": 1.2507813351693728e-06, "loss": 0.752, "step": 1333 }, { "epoch": 1.0301158301158302, "grad_norm": 3.640625, "learning_rate": 1.2492186648306274e-06, "loss": 0.8318, "step": 1334 }, { "epoch": 1.0308880308880308, "grad_norm": 4.5, "learning_rate": 1.247655995712984e-06, "loss": 0.8396, "step": 1335 }, { "epoch": 1.0316602316602317, "grad_norm": 3.6875, "learning_rate": 1.2460933302586455e-06, "loss": 0.7458, "step": 1336 }, { "epoch": 1.0324324324324325, "grad_norm": 3.765625, "learning_rate": 1.2445306709098104e-06, "loss": 0.8077, "step": 1337 }, { "epoch": 1.0332046332046332, "grad_norm": 6.4375, "learning_rate": 1.2429680201086648e-06, "loss": 0.8131, "step": 1338 }, { "epoch": 1.033976833976834, "grad_norm": 4.0625, "learning_rate": 1.241405380297384e-06, "loss": 0.9696, "step": 1339 }, { "epoch": 1.0347490347490347, "grad_norm": 4.3125, "learning_rate": 1.2398427539181242e-06, "loss": 0.884, "step": 1340 }, { "epoch": 1.0355212355212355, "grad_norm": 5.28125, "learning_rate": 1.2382801434130219e-06, "loss": 0.8348, "step": 1341 }, { "epoch": 1.0362934362934364, "grad_norm": 4.96875, "learning_rate": 1.2367175512241889e-06, "loss": 0.9382, "step": 1342 }, { "epoch": 1.037065637065637, "grad_norm": 5.5, "learning_rate": 1.2351549797937068e-06, "loss": 0.9106, "step": 1343 }, { "epoch": 1.037837837837838, "grad_norm": 4.125, "learning_rate": 1.2335924315636264e-06, "loss": 0.8737, "step": 1344 }, { "epoch": 1.0386100386100385, "grad_norm": 3.59375, "learning_rate": 1.232029908975962e-06, "loss": 0.7547, "step": 1345 }, { "epoch": 1.0393822393822394, "grad_norm": 4.5625, "learning_rate": 1.230467414472687e-06, "loss": 0.9211, "step": 1346 }, { "epoch": 1.0401544401544403, "grad_norm": 4.1875, "learning_rate": 1.2289049504957321e-06, "loss": 0.8032, "step": 1347 }, { "epoch": 1.040926640926641, "grad_norm": 3.90625, "learning_rate": 1.2273425194869784e-06, "loss": 0.8994, "step": 1348 }, { "epoch": 1.0416988416988417, "grad_norm": 3.859375, "learning_rate": 1.2257801238882574e-06, "loss": 0.8932, "step": 1349 }, { "epoch": 1.0424710424710424, "grad_norm": 5.75, "learning_rate": 1.224217766141345e-06, "loss": 0.959, "step": 1350 }, { "epoch": 1.0432432432432432, "grad_norm": 4.0625, "learning_rate": 1.2226554486879563e-06, "loss": 0.9221, "step": 1351 }, { "epoch": 1.044015444015444, "grad_norm": 4.0625, "learning_rate": 1.221093173969745e-06, "loss": 0.8457, "step": 1352 }, { "epoch": 1.0447876447876447, "grad_norm": 4.625, "learning_rate": 1.2195309444282983e-06, "loss": 0.7228, "step": 1353 }, { "epoch": 1.0455598455598456, "grad_norm": 4.1875, "learning_rate": 1.2179687625051317e-06, "loss": 0.7437, "step": 1354 }, { "epoch": 1.0463320463320462, "grad_norm": 4.15625, "learning_rate": 1.2164066306416866e-06, "loss": 0.8908, "step": 1355 }, { "epoch": 1.047104247104247, "grad_norm": 4.53125, "learning_rate": 1.2148445512793258e-06, "loss": 0.7783, "step": 1356 }, { "epoch": 1.047876447876448, "grad_norm": 3.625, "learning_rate": 1.213282526859331e-06, "loss": 0.7671, "step": 1357 }, { "epoch": 1.0486486486486486, "grad_norm": 4.03125, "learning_rate": 1.2117205598228981e-06, "loss": 0.8526, "step": 1358 }, { "epoch": 1.0494208494208495, "grad_norm": 4.28125, "learning_rate": 1.2101586526111315e-06, "loss": 0.8799, "step": 1359 }, { "epoch": 1.05019305019305, "grad_norm": 4.46875, "learning_rate": 1.208596807665044e-06, "loss": 0.7966, "step": 1360 }, { "epoch": 1.050965250965251, "grad_norm": 3.875, "learning_rate": 1.2070350274255512e-06, "loss": 0.954, "step": 1361 }, { "epoch": 1.0517374517374518, "grad_norm": 3.625, "learning_rate": 1.2054733143334651e-06, "loss": 0.769, "step": 1362 }, { "epoch": 1.0525096525096524, "grad_norm": 4.4375, "learning_rate": 1.203911670829496e-06, "loss": 0.7462, "step": 1363 }, { "epoch": 1.0532818532818533, "grad_norm": 3.703125, "learning_rate": 1.202350099354243e-06, "loss": 0.8299, "step": 1364 }, { "epoch": 1.054054054054054, "grad_norm": 4.125, "learning_rate": 1.200788602348194e-06, "loss": 0.9225, "step": 1365 }, { "epoch": 1.0548262548262548, "grad_norm": 4.25, "learning_rate": 1.1992271822517202e-06, "loss": 0.7662, "step": 1366 }, { "epoch": 1.0555984555984557, "grad_norm": 4.125, "learning_rate": 1.197665841505072e-06, "loss": 0.9211, "step": 1367 }, { "epoch": 1.0563706563706563, "grad_norm": 3.859375, "learning_rate": 1.1961045825483763e-06, "loss": 0.7429, "step": 1368 }, { "epoch": 1.0571428571428572, "grad_norm": 3.875, "learning_rate": 1.1945434078216327e-06, "loss": 0.9716, "step": 1369 }, { "epoch": 1.057915057915058, "grad_norm": 4.125, "learning_rate": 1.192982319764708e-06, "loss": 0.8618, "step": 1370 }, { "epoch": 1.0586872586872587, "grad_norm": 4.25, "learning_rate": 1.191421320817335e-06, "loss": 0.891, "step": 1371 }, { "epoch": 1.0594594594594595, "grad_norm": 4.71875, "learning_rate": 1.1898604134191048e-06, "loss": 0.9193, "step": 1372 }, { "epoch": 1.0602316602316602, "grad_norm": 5.25, "learning_rate": 1.1882996000094683e-06, "loss": 0.9202, "step": 1373 }, { "epoch": 1.061003861003861, "grad_norm": 3.71875, "learning_rate": 1.186738883027728e-06, "loss": 0.8065, "step": 1374 }, { "epoch": 1.0617760617760619, "grad_norm": 3.5625, "learning_rate": 1.1851782649130355e-06, "loss": 0.8588, "step": 1375 }, { "epoch": 1.0625482625482625, "grad_norm": 3.859375, "learning_rate": 1.1836177481043885e-06, "loss": 0.8454, "step": 1376 }, { "epoch": 1.0633204633204634, "grad_norm": 3.984375, "learning_rate": 1.1820573350406269e-06, "loss": 0.8878, "step": 1377 }, { "epoch": 1.064092664092664, "grad_norm": 3.9375, "learning_rate": 1.1804970281604265e-06, "loss": 0.8874, "step": 1378 }, { "epoch": 1.0648648648648649, "grad_norm": 6.8125, "learning_rate": 1.1789368299022997e-06, "loss": 0.804, "step": 1379 }, { "epoch": 1.0656370656370657, "grad_norm": 3.625, "learning_rate": 1.1773767427045864e-06, "loss": 0.8393, "step": 1380 }, { "epoch": 1.0664092664092664, "grad_norm": 4.40625, "learning_rate": 1.175816769005455e-06, "loss": 1.0005, "step": 1381 }, { "epoch": 1.0671814671814672, "grad_norm": 5.90625, "learning_rate": 1.1742569112428969e-06, "loss": 0.8725, "step": 1382 }, { "epoch": 1.0679536679536679, "grad_norm": 5.125, "learning_rate": 1.1726971718547196e-06, "loss": 0.9454, "step": 1383 }, { "epoch": 1.0687258687258687, "grad_norm": 4.09375, "learning_rate": 1.171137553278548e-06, "loss": 0.8605, "step": 1384 }, { "epoch": 1.0694980694980696, "grad_norm": 3.90625, "learning_rate": 1.1695780579518176e-06, "loss": 0.9719, "step": 1385 }, { "epoch": 1.0702702702702702, "grad_norm": 3.75, "learning_rate": 1.1680186883117711e-06, "loss": 0.7308, "step": 1386 }, { "epoch": 1.071042471042471, "grad_norm": 3.765625, "learning_rate": 1.1664594467954552e-06, "loss": 0.8812, "step": 1387 }, { "epoch": 1.0718146718146717, "grad_norm": 3.953125, "learning_rate": 1.1649003358397153e-06, "loss": 0.9152, "step": 1388 }, { "epoch": 1.0725868725868726, "grad_norm": 3.875, "learning_rate": 1.1633413578811937e-06, "loss": 0.9577, "step": 1389 }, { "epoch": 1.0733590733590734, "grad_norm": 5.21875, "learning_rate": 1.161782515356325e-06, "loss": 0.8018, "step": 1390 }, { "epoch": 1.074131274131274, "grad_norm": 3.578125, "learning_rate": 1.160223810701331e-06, "loss": 0.7264, "step": 1391 }, { "epoch": 1.074903474903475, "grad_norm": 4.46875, "learning_rate": 1.1586652463522193e-06, "loss": 0.9539, "step": 1392 }, { "epoch": 1.0756756756756758, "grad_norm": 3.75, "learning_rate": 1.1571068247447777e-06, "loss": 0.7473, "step": 1393 }, { "epoch": 1.0764478764478764, "grad_norm": 4.59375, "learning_rate": 1.1555485483145708e-06, "loss": 0.9609, "step": 1394 }, { "epoch": 1.0772200772200773, "grad_norm": 4.21875, "learning_rate": 1.1539904194969364e-06, "loss": 0.9284, "step": 1395 }, { "epoch": 1.077992277992278, "grad_norm": 3.640625, "learning_rate": 1.1524324407269814e-06, "loss": 0.86, "step": 1396 }, { "epoch": 1.0787644787644788, "grad_norm": 4.84375, "learning_rate": 1.1508746144395784e-06, "loss": 0.8292, "step": 1397 }, { "epoch": 1.0795366795366794, "grad_norm": 13.0, "learning_rate": 1.1493169430693628e-06, "loss": 0.8311, "step": 1398 }, { "epoch": 1.0803088803088803, "grad_norm": 3.953125, "learning_rate": 1.1477594290507254e-06, "loss": 0.8723, "step": 1399 }, { "epoch": 1.0810810810810811, "grad_norm": 5.375, "learning_rate": 1.1462020748178129e-06, "loss": 0.9062, "step": 1400 }, { "epoch": 1.0818532818532818, "grad_norm": 4.09375, "learning_rate": 1.1446448828045227e-06, "loss": 0.9722, "step": 1401 }, { "epoch": 1.0826254826254826, "grad_norm": 3.34375, "learning_rate": 1.1430878554444966e-06, "loss": 0.6784, "step": 1402 }, { "epoch": 1.0833976833976835, "grad_norm": 4.1875, "learning_rate": 1.1415309951711213e-06, "loss": 0.9522, "step": 1403 }, { "epoch": 1.0841698841698841, "grad_norm": 5.3125, "learning_rate": 1.1399743044175206e-06, "loss": 0.8812, "step": 1404 }, { "epoch": 1.084942084942085, "grad_norm": 9.1875, "learning_rate": 1.1384177856165547e-06, "loss": 0.8771, "step": 1405 }, { "epoch": 1.0857142857142856, "grad_norm": 3.609375, "learning_rate": 1.1368614412008146e-06, "loss": 0.7735, "step": 1406 }, { "epoch": 1.0864864864864865, "grad_norm": 11.125, "learning_rate": 1.1353052736026183e-06, "loss": 0.8828, "step": 1407 }, { "epoch": 1.0872586872586874, "grad_norm": 6.03125, "learning_rate": 1.133749285254008e-06, "loss": 0.8755, "step": 1408 }, { "epoch": 1.088030888030888, "grad_norm": 3.65625, "learning_rate": 1.132193478586746e-06, "loss": 0.8297, "step": 1409 }, { "epoch": 1.0888030888030888, "grad_norm": 3.875, "learning_rate": 1.1306378560323096e-06, "loss": 0.7679, "step": 1410 }, { "epoch": 1.0895752895752895, "grad_norm": 3.8125, "learning_rate": 1.12908242002189e-06, "loss": 0.8329, "step": 1411 }, { "epoch": 1.0903474903474903, "grad_norm": 5.53125, "learning_rate": 1.1275271729863844e-06, "loss": 0.8805, "step": 1412 }, { "epoch": 1.0911196911196912, "grad_norm": 4.0, "learning_rate": 1.1259721173563972e-06, "loss": 0.8535, "step": 1413 }, { "epoch": 1.0918918918918918, "grad_norm": 3.953125, "learning_rate": 1.1244172555622327e-06, "loss": 0.8375, "step": 1414 }, { "epoch": 1.0926640926640927, "grad_norm": 4.21875, "learning_rate": 1.1228625900338916e-06, "loss": 0.8574, "step": 1415 }, { "epoch": 1.0934362934362933, "grad_norm": 10.1875, "learning_rate": 1.1213081232010688e-06, "loss": 0.6946, "step": 1416 }, { "epoch": 1.0942084942084942, "grad_norm": 5.84375, "learning_rate": 1.1197538574931488e-06, "loss": 0.9415, "step": 1417 }, { "epoch": 1.094980694980695, "grad_norm": 9.8125, "learning_rate": 1.1181997953392004e-06, "loss": 0.8089, "step": 1418 }, { "epoch": 1.0957528957528957, "grad_norm": 4.25, "learning_rate": 1.116645939167976e-06, "loss": 0.7994, "step": 1419 }, { "epoch": 1.0965250965250966, "grad_norm": 4.09375, "learning_rate": 1.1150922914079044e-06, "loss": 0.9021, "step": 1420 }, { "epoch": 1.0972972972972972, "grad_norm": 5.28125, "learning_rate": 1.11353885448709e-06, "loss": 0.8443, "step": 1421 }, { "epoch": 1.098069498069498, "grad_norm": 4.1875, "learning_rate": 1.111985630833308e-06, "loss": 0.8836, "step": 1422 }, { "epoch": 1.098841698841699, "grad_norm": 5.84375, "learning_rate": 1.1104326228739978e-06, "loss": 0.9469, "step": 1423 }, { "epoch": 1.0996138996138995, "grad_norm": 4.84375, "learning_rate": 1.1088798330362645e-06, "loss": 0.97, "step": 1424 }, { "epoch": 1.1003861003861004, "grad_norm": 4.90625, "learning_rate": 1.1073272637468716e-06, "loss": 0.9374, "step": 1425 }, { "epoch": 1.1011583011583013, "grad_norm": 4.0, "learning_rate": 1.105774917432237e-06, "loss": 0.826, "step": 1426 }, { "epoch": 1.101930501930502, "grad_norm": 3.953125, "learning_rate": 1.1042227965184312e-06, "loss": 0.8807, "step": 1427 }, { "epoch": 1.1027027027027028, "grad_norm": 3.859375, "learning_rate": 1.102670903431171e-06, "loss": 0.8662, "step": 1428 }, { "epoch": 1.1034749034749034, "grad_norm": 4.1875, "learning_rate": 1.1011192405958187e-06, "loss": 0.8221, "step": 1429 }, { "epoch": 1.1042471042471043, "grad_norm": 3.9375, "learning_rate": 1.0995678104373764e-06, "loss": 0.6403, "step": 1430 }, { "epoch": 1.1050193050193051, "grad_norm": 3.84375, "learning_rate": 1.0980166153804814e-06, "loss": 0.6884, "step": 1431 }, { "epoch": 1.1057915057915058, "grad_norm": 3.84375, "learning_rate": 1.0964656578494053e-06, "loss": 0.6648, "step": 1432 }, { "epoch": 1.1065637065637066, "grad_norm": 4.1875, "learning_rate": 1.0949149402680475e-06, "loss": 1.0424, "step": 1433 }, { "epoch": 1.1073359073359073, "grad_norm": 4.125, "learning_rate": 1.0933644650599324e-06, "loss": 0.8736, "step": 1434 }, { "epoch": 1.1081081081081081, "grad_norm": 3.828125, "learning_rate": 1.0918142346482056e-06, "loss": 0.8264, "step": 1435 }, { "epoch": 1.108880308880309, "grad_norm": 4.34375, "learning_rate": 1.0902642514556304e-06, "loss": 0.8705, "step": 1436 }, { "epoch": 1.1096525096525096, "grad_norm": 6.34375, "learning_rate": 1.0887145179045836e-06, "loss": 0.7945, "step": 1437 }, { "epoch": 1.1104247104247105, "grad_norm": 4.75, "learning_rate": 1.0871650364170523e-06, "loss": 0.8578, "step": 1438 }, { "epoch": 1.111196911196911, "grad_norm": 4.625, "learning_rate": 1.0856158094146284e-06, "loss": 0.9491, "step": 1439 }, { "epoch": 1.111969111969112, "grad_norm": 3.984375, "learning_rate": 1.0840668393185073e-06, "loss": 0.8704, "step": 1440 }, { "epoch": 1.1127413127413128, "grad_norm": 4.375, "learning_rate": 1.082518128549483e-06, "loss": 0.8792, "step": 1441 }, { "epoch": 1.1135135135135135, "grad_norm": 3.859375, "learning_rate": 1.0809696795279428e-06, "loss": 0.8743, "step": 1442 }, { "epoch": 1.1142857142857143, "grad_norm": 4.0625, "learning_rate": 1.0794214946738666e-06, "loss": 0.9235, "step": 1443 }, { "epoch": 1.115057915057915, "grad_norm": 3.578125, "learning_rate": 1.0778735764068201e-06, "loss": 0.7794, "step": 1444 }, { "epoch": 1.1158301158301158, "grad_norm": 5.125, "learning_rate": 1.0763259271459532e-06, "loss": 0.8958, "step": 1445 }, { "epoch": 1.1166023166023167, "grad_norm": 3.890625, "learning_rate": 1.074778549309996e-06, "loss": 0.8529, "step": 1446 }, { "epoch": 1.1173745173745173, "grad_norm": 3.8125, "learning_rate": 1.0732314453172517e-06, "loss": 0.8888, "step": 1447 }, { "epoch": 1.1181467181467182, "grad_norm": 4.0625, "learning_rate": 1.071684617585599e-06, "loss": 0.8904, "step": 1448 }, { "epoch": 1.118918918918919, "grad_norm": 4.15625, "learning_rate": 1.070138068532483e-06, "loss": 0.9224, "step": 1449 }, { "epoch": 1.1196911196911197, "grad_norm": 3.984375, "learning_rate": 1.0685918005749125e-06, "loss": 0.8516, "step": 1450 }, { "epoch": 1.1204633204633205, "grad_norm": 5.0, "learning_rate": 1.0670458161294587e-06, "loss": 0.767, "step": 1451 }, { "epoch": 1.1212355212355212, "grad_norm": 3.703125, "learning_rate": 1.0655001176122497e-06, "loss": 0.8514, "step": 1452 }, { "epoch": 1.122007722007722, "grad_norm": 3.8125, "learning_rate": 1.0639547074389647e-06, "loss": 0.914, "step": 1453 }, { "epoch": 1.1227799227799227, "grad_norm": 4.40625, "learning_rate": 1.0624095880248344e-06, "loss": 0.941, "step": 1454 }, { "epoch": 1.1235521235521235, "grad_norm": 3.671875, "learning_rate": 1.0608647617846338e-06, "loss": 0.784, "step": 1455 }, { "epoch": 1.1243243243243244, "grad_norm": 4.6875, "learning_rate": 1.0593202311326807e-06, "loss": 0.8191, "step": 1456 }, { "epoch": 1.125096525096525, "grad_norm": 4.34375, "learning_rate": 1.0577759984828308e-06, "loss": 0.8128, "step": 1457 }, { "epoch": 1.1258687258687259, "grad_norm": 3.609375, "learning_rate": 1.0562320662484727e-06, "loss": 0.7243, "step": 1458 }, { "epoch": 1.1266409266409267, "grad_norm": 4.53125, "learning_rate": 1.054688436842527e-06, "loss": 0.8614, "step": 1459 }, { "epoch": 1.1274131274131274, "grad_norm": 5.5, "learning_rate": 1.0531451126774412e-06, "loss": 0.9314, "step": 1460 }, { "epoch": 1.1281853281853282, "grad_norm": 3.875, "learning_rate": 1.051602096165184e-06, "loss": 0.9411, "step": 1461 }, { "epoch": 1.1289575289575289, "grad_norm": 4.28125, "learning_rate": 1.0500593897172453e-06, "loss": 0.8694, "step": 1462 }, { "epoch": 1.1297297297297297, "grad_norm": 4.3125, "learning_rate": 1.048516995744629e-06, "loss": 0.9183, "step": 1463 }, { "epoch": 1.1305019305019306, "grad_norm": 3.875, "learning_rate": 1.0469749166578508e-06, "loss": 0.7989, "step": 1464 }, { "epoch": 1.1312741312741312, "grad_norm": 4.21875, "learning_rate": 1.0454331548669359e-06, "loss": 0.9669, "step": 1465 }, { "epoch": 1.132046332046332, "grad_norm": 4.25, "learning_rate": 1.043891712781411e-06, "loss": 0.89, "step": 1466 }, { "epoch": 1.1328185328185327, "grad_norm": 3.8125, "learning_rate": 1.0423505928103053e-06, "loss": 0.8149, "step": 1467 }, { "epoch": 1.1335907335907336, "grad_norm": 4.5, "learning_rate": 1.0408097973621442e-06, "loss": 0.9398, "step": 1468 }, { "epoch": 1.1343629343629344, "grad_norm": 3.703125, "learning_rate": 1.0392693288449447e-06, "loss": 0.8588, "step": 1469 }, { "epoch": 1.135135135135135, "grad_norm": 3.65625, "learning_rate": 1.0377291896662142e-06, "loss": 0.8148, "step": 1470 }, { "epoch": 1.135907335907336, "grad_norm": 29.625, "learning_rate": 1.0361893822329442e-06, "loss": 0.8518, "step": 1471 }, { "epoch": 1.1366795366795366, "grad_norm": 3.71875, "learning_rate": 1.0346499089516091e-06, "loss": 0.81, "step": 1472 }, { "epoch": 1.1374517374517374, "grad_norm": 4.625, "learning_rate": 1.0331107722281602e-06, "loss": 0.8354, "step": 1473 }, { "epoch": 1.1382239382239383, "grad_norm": 12.5625, "learning_rate": 1.031571974468023e-06, "loss": 0.8967, "step": 1474 }, { "epoch": 1.138996138996139, "grad_norm": 4.25, "learning_rate": 1.0300335180760931e-06, "loss": 0.8727, "step": 1475 }, { "epoch": 1.1397683397683398, "grad_norm": 7.0, "learning_rate": 1.0284954054567334e-06, "loss": 0.9646, "step": 1476 }, { "epoch": 1.1405405405405404, "grad_norm": 5.03125, "learning_rate": 1.026957639013768e-06, "loss": 0.8463, "step": 1477 }, { "epoch": 1.1413127413127413, "grad_norm": 3.90625, "learning_rate": 1.0254202211504815e-06, "loss": 0.8022, "step": 1478 }, { "epoch": 1.1420849420849422, "grad_norm": 3.84375, "learning_rate": 1.0238831542696126e-06, "loss": 0.8863, "step": 1479 }, { "epoch": 1.1428571428571428, "grad_norm": 3.65625, "learning_rate": 1.0223464407733524e-06, "loss": 0.8564, "step": 1480 }, { "epoch": 1.1436293436293437, "grad_norm": 3.96875, "learning_rate": 1.0208100830633397e-06, "loss": 0.8922, "step": 1481 }, { "epoch": 1.1444015444015445, "grad_norm": 3.859375, "learning_rate": 1.0192740835406557e-06, "loss": 1.0242, "step": 1482 }, { "epoch": 1.1451737451737452, "grad_norm": 8.6875, "learning_rate": 1.0177384446058235e-06, "loss": 0.9454, "step": 1483 }, { "epoch": 1.145945945945946, "grad_norm": 4.71875, "learning_rate": 1.0162031686588025e-06, "loss": 0.8822, "step": 1484 }, { "epoch": 1.1467181467181466, "grad_norm": 3.9375, "learning_rate": 1.0146682580989841e-06, "loss": 0.8877, "step": 1485 }, { "epoch": 1.1474903474903475, "grad_norm": 4.53125, "learning_rate": 1.0131337153251892e-06, "loss": 0.8711, "step": 1486 }, { "epoch": 1.1482625482625481, "grad_norm": 4.6875, "learning_rate": 1.0115995427356632e-06, "loss": 0.8507, "step": 1487 }, { "epoch": 1.149034749034749, "grad_norm": 5.625, "learning_rate": 1.0100657427280736e-06, "loss": 0.7455, "step": 1488 }, { "epoch": 1.1498069498069499, "grad_norm": 4.8125, "learning_rate": 1.0085323176995063e-06, "loss": 1.0715, "step": 1489 }, { "epoch": 1.1505791505791505, "grad_norm": 11.4375, "learning_rate": 1.0069992700464593e-06, "loss": 0.8409, "step": 1490 }, { "epoch": 1.1513513513513514, "grad_norm": 4.25, "learning_rate": 1.005466602164842e-06, "loss": 0.9199, "step": 1491 }, { "epoch": 1.1521235521235522, "grad_norm": 4.46875, "learning_rate": 1.003934316449971e-06, "loss": 0.8802, "step": 1492 }, { "epoch": 1.1528957528957529, "grad_norm": 4.0625, "learning_rate": 1.0024024152965637e-06, "loss": 0.7694, "step": 1493 }, { "epoch": 1.1536679536679537, "grad_norm": 4.3125, "learning_rate": 1.000870901098738e-06, "loss": 0.8695, "step": 1494 }, { "epoch": 1.1544401544401544, "grad_norm": 4.28125, "learning_rate": 9.993397762500066e-07, "loss": 0.8026, "step": 1495 }, { "epoch": 1.1552123552123552, "grad_norm": 4.40625, "learning_rate": 9.978090431432738e-07, "loss": 0.9059, "step": 1496 }, { "epoch": 1.155984555984556, "grad_norm": 3.46875, "learning_rate": 9.96278704170832e-07, "loss": 0.7216, "step": 1497 }, { "epoch": 1.1567567567567567, "grad_norm": 3.734375, "learning_rate": 9.947487617243563e-07, "loss": 0.7665, "step": 1498 }, { "epoch": 1.1575289575289576, "grad_norm": 3.796875, "learning_rate": 9.932192181949035e-07, "loss": 0.8581, "step": 1499 }, { "epoch": 1.1583011583011582, "grad_norm": 5.6875, "learning_rate": 9.916900759729068e-07, "loss": 0.9369, "step": 1500 }, { "epoch": 1.159073359073359, "grad_norm": 3.765625, "learning_rate": 9.901613374481712e-07, "loss": 0.8171, "step": 1501 }, { "epoch": 1.15984555984556, "grad_norm": 5.40625, "learning_rate": 9.88633005009872e-07, "loss": 0.8772, "step": 1502 }, { "epoch": 1.1606177606177606, "grad_norm": 5.28125, "learning_rate": 9.871050810465495e-07, "loss": 0.9448, "step": 1503 }, { "epoch": 1.1613899613899614, "grad_norm": 3.765625, "learning_rate": 9.855775679461047e-07, "loss": 0.8648, "step": 1504 }, { "epoch": 1.1621621621621623, "grad_norm": 4.25, "learning_rate": 9.840504680957979e-07, "loss": 0.7913, "step": 1505 }, { "epoch": 1.162934362934363, "grad_norm": 4.0625, "learning_rate": 9.825237838822423e-07, "loss": 0.73, "step": 1506 }, { "epoch": 1.1637065637065638, "grad_norm": 6.71875, "learning_rate": 9.809975176914025e-07, "loss": 0.9936, "step": 1507 }, { "epoch": 1.1644787644787644, "grad_norm": 4.46875, "learning_rate": 9.7947167190859e-07, "loss": 0.9128, "step": 1508 }, { "epoch": 1.1652509652509653, "grad_norm": 4.3125, "learning_rate": 9.779462489184576e-07, "loss": 0.9951, "step": 1509 }, { "epoch": 1.166023166023166, "grad_norm": 4.09375, "learning_rate": 9.764212511049993e-07, "loss": 0.8522, "step": 1510 }, { "epoch": 1.1667953667953668, "grad_norm": 4.65625, "learning_rate": 9.748966808515428e-07, "loss": 0.7509, "step": 1511 }, { "epoch": 1.1675675675675676, "grad_norm": 16.75, "learning_rate": 9.733725405407492e-07, "loss": 0.7869, "step": 1512 }, { "epoch": 1.1683397683397683, "grad_norm": 4.9375, "learning_rate": 9.718488325546072e-07, "loss": 0.8801, "step": 1513 }, { "epoch": 1.1691119691119691, "grad_norm": 3.9375, "learning_rate": 9.703255592744293e-07, "loss": 0.688, "step": 1514 }, { "epoch": 1.16988416988417, "grad_norm": 3.953125, "learning_rate": 9.688027230808486e-07, "loss": 1.0034, "step": 1515 }, { "epoch": 1.1706563706563706, "grad_norm": 3.859375, "learning_rate": 9.672803263538163e-07, "loss": 0.7153, "step": 1516 }, { "epoch": 1.1714285714285715, "grad_norm": 4.0, "learning_rate": 9.657583714725953e-07, "loss": 0.7217, "step": 1517 }, { "epoch": 1.1722007722007721, "grad_norm": 3.765625, "learning_rate": 9.642368608157593e-07, "loss": 0.9013, "step": 1518 }, { "epoch": 1.172972972972973, "grad_norm": 12.5, "learning_rate": 9.627157967611862e-07, "loss": 0.8988, "step": 1519 }, { "epoch": 1.1737451737451738, "grad_norm": 3.90625, "learning_rate": 9.611951816860573e-07, "loss": 0.8784, "step": 1520 }, { "epoch": 1.1745173745173745, "grad_norm": 3.71875, "learning_rate": 9.596750179668521e-07, "loss": 0.9655, "step": 1521 }, { "epoch": 1.1752895752895753, "grad_norm": 3.75, "learning_rate": 9.581553079793435e-07, "loss": 0.8313, "step": 1522 }, { "epoch": 1.176061776061776, "grad_norm": 5.1875, "learning_rate": 9.566360540985963e-07, "loss": 0.8842, "step": 1523 }, { "epoch": 1.1768339768339768, "grad_norm": 3.75, "learning_rate": 9.55117258698963e-07, "loss": 0.8134, "step": 1524 }, { "epoch": 1.1776061776061777, "grad_norm": 3.84375, "learning_rate": 9.53598924154078e-07, "loss": 0.8214, "step": 1525 }, { "epoch": 1.1783783783783783, "grad_norm": 4.65625, "learning_rate": 9.520810528368573e-07, "loss": 0.8271, "step": 1526 }, { "epoch": 1.1791505791505792, "grad_norm": 3.890625, "learning_rate": 9.505636471194906e-07, "loss": 0.8063, "step": 1527 }, { "epoch": 1.1799227799227798, "grad_norm": 4.28125, "learning_rate": 9.490467093734424e-07, "loss": 0.7982, "step": 1528 }, { "epoch": 1.1806949806949807, "grad_norm": 3.203125, "learning_rate": 9.475302419694447e-07, "loss": 0.6696, "step": 1529 }, { "epoch": 1.1814671814671815, "grad_norm": 4.46875, "learning_rate": 9.460142472774937e-07, "loss": 0.7606, "step": 1530 }, { "epoch": 1.1822393822393822, "grad_norm": 3.9375, "learning_rate": 9.444987276668481e-07, "loss": 0.811, "step": 1531 }, { "epoch": 1.183011583011583, "grad_norm": 4.125, "learning_rate": 9.429836855060243e-07, "loss": 0.9347, "step": 1532 }, { "epoch": 1.1837837837837837, "grad_norm": 4.3125, "learning_rate": 9.414691231627903e-07, "loss": 0.8313, "step": 1533 }, { "epoch": 1.1845559845559845, "grad_norm": 4.09375, "learning_rate": 9.399550430041671e-07, "loss": 0.8187, "step": 1534 }, { "epoch": 1.1853281853281854, "grad_norm": 4.53125, "learning_rate": 9.384414473964199e-07, "loss": 0.7982, "step": 1535 }, { "epoch": 1.186100386100386, "grad_norm": 4.09375, "learning_rate": 9.369283387050579e-07, "loss": 0.8363, "step": 1536 }, { "epoch": 1.186872586872587, "grad_norm": 4.0, "learning_rate": 9.354157192948296e-07, "loss": 0.859, "step": 1537 }, { "epoch": 1.1876447876447878, "grad_norm": 3.828125, "learning_rate": 9.33903591529717e-07, "loss": 0.7797, "step": 1538 }, { "epoch": 1.1884169884169884, "grad_norm": 7.34375, "learning_rate": 9.323919577729354e-07, "loss": 0.7858, "step": 1539 }, { "epoch": 1.1891891891891893, "grad_norm": 3.953125, "learning_rate": 9.308808203869281e-07, "loss": 0.8566, "step": 1540 }, { "epoch": 1.18996138996139, "grad_norm": 4.1875, "learning_rate": 9.293701817333611e-07, "loss": 0.8175, "step": 1541 }, { "epoch": 1.1907335907335908, "grad_norm": 3.90625, "learning_rate": 9.278600441731226e-07, "loss": 0.7253, "step": 1542 }, { "epoch": 1.1915057915057914, "grad_norm": 4.28125, "learning_rate": 9.26350410066317e-07, "loss": 0.977, "step": 1543 }, { "epoch": 1.1922779922779922, "grad_norm": 4.1875, "learning_rate": 9.248412817722616e-07, "loss": 1.0162, "step": 1544 }, { "epoch": 1.193050193050193, "grad_norm": 52.5, "learning_rate": 9.233326616494839e-07, "loss": 0.9997, "step": 1545 }, { "epoch": 1.1938223938223937, "grad_norm": 4.25, "learning_rate": 9.218245520557165e-07, "loss": 0.8844, "step": 1546 }, { "epoch": 1.1945945945945946, "grad_norm": 4.15625, "learning_rate": 9.203169553478944e-07, "loss": 0.8462, "step": 1547 }, { "epoch": 1.1953667953667955, "grad_norm": 3.90625, "learning_rate": 9.188098738821519e-07, "loss": 0.8703, "step": 1548 }, { "epoch": 1.196138996138996, "grad_norm": 3.78125, "learning_rate": 9.173033100138161e-07, "loss": 0.8172, "step": 1549 }, { "epoch": 1.196911196911197, "grad_norm": 5.71875, "learning_rate": 9.157972660974074e-07, "loss": 1.0274, "step": 1550 }, { "epoch": 1.1976833976833976, "grad_norm": 3.5625, "learning_rate": 9.142917444866314e-07, "loss": 0.7796, "step": 1551 }, { "epoch": 1.1984555984555985, "grad_norm": 4.90625, "learning_rate": 9.127867475343792e-07, "loss": 0.8915, "step": 1552 }, { "epoch": 1.1992277992277993, "grad_norm": 4.28125, "learning_rate": 9.112822775927214e-07, "loss": 0.8632, "step": 1553 }, { "epoch": 1.2, "grad_norm": 3.828125, "learning_rate": 9.097783370129049e-07, "loss": 0.8926, "step": 1554 }, { "epoch": 1.2007722007722008, "grad_norm": 4.59375, "learning_rate": 9.082749281453485e-07, "loss": 0.8371, "step": 1555 }, { "epoch": 1.2015444015444015, "grad_norm": 4.53125, "learning_rate": 9.067720533396416e-07, "loss": 0.93, "step": 1556 }, { "epoch": 1.2023166023166023, "grad_norm": 4.53125, "learning_rate": 9.052697149445372e-07, "loss": 0.9408, "step": 1557 }, { "epoch": 1.2030888030888032, "grad_norm": 4.3125, "learning_rate": 9.037679153079523e-07, "loss": 0.9367, "step": 1558 }, { "epoch": 1.2038610038610038, "grad_norm": 4.84375, "learning_rate": 9.022666567769591e-07, "loss": 0.8528, "step": 1559 }, { "epoch": 1.2046332046332047, "grad_norm": 5.25, "learning_rate": 9.00765941697786e-07, "loss": 0.8943, "step": 1560 }, { "epoch": 1.2054054054054055, "grad_norm": 4.03125, "learning_rate": 8.992657724158121e-07, "loss": 0.8191, "step": 1561 }, { "epoch": 1.2061776061776062, "grad_norm": 5.75, "learning_rate": 8.977661512755623e-07, "loss": 0.8315, "step": 1562 }, { "epoch": 1.206949806949807, "grad_norm": 3.4375, "learning_rate": 8.962670806207055e-07, "loss": 0.7952, "step": 1563 }, { "epoch": 1.2077220077220077, "grad_norm": 3.953125, "learning_rate": 8.947685627940511e-07, "loss": 0.8334, "step": 1564 }, { "epoch": 1.2084942084942085, "grad_norm": 4.5, "learning_rate": 8.93270600137543e-07, "loss": 0.8956, "step": 1565 }, { "epoch": 1.2092664092664092, "grad_norm": 3.953125, "learning_rate": 8.917731949922589e-07, "loss": 1.0505, "step": 1566 }, { "epoch": 1.21003861003861, "grad_norm": 4.0, "learning_rate": 8.902763496984038e-07, "loss": 0.7873, "step": 1567 }, { "epoch": 1.2108108108108109, "grad_norm": 3.953125, "learning_rate": 8.887800665953086e-07, "loss": 0.9581, "step": 1568 }, { "epoch": 1.2115830115830115, "grad_norm": 3.84375, "learning_rate": 8.872843480214261e-07, "loss": 0.8607, "step": 1569 }, { "epoch": 1.2123552123552124, "grad_norm": 4.125, "learning_rate": 8.857891963143251e-07, "loss": 0.8901, "step": 1570 }, { "epoch": 1.2131274131274132, "grad_norm": 3.5625, "learning_rate": 8.842946138106899e-07, "loss": 0.7541, "step": 1571 }, { "epoch": 1.2138996138996139, "grad_norm": 3.546875, "learning_rate": 8.828006028463157e-07, "loss": 0.783, "step": 1572 }, { "epoch": 1.2146718146718147, "grad_norm": 3.6875, "learning_rate": 8.813071657561024e-07, "loss": 0.837, "step": 1573 }, { "epoch": 1.2154440154440154, "grad_norm": 6.59375, "learning_rate": 8.798143048740548e-07, "loss": 0.7662, "step": 1574 }, { "epoch": 1.2162162162162162, "grad_norm": 3.953125, "learning_rate": 8.783220225332767e-07, "loss": 0.9099, "step": 1575 }, { "epoch": 1.2169884169884169, "grad_norm": 6.40625, "learning_rate": 8.768303210659675e-07, "loss": 0.973, "step": 1576 }, { "epoch": 1.2177606177606177, "grad_norm": 4.9375, "learning_rate": 8.753392028034197e-07, "loss": 0.8732, "step": 1577 }, { "epoch": 1.2185328185328186, "grad_norm": 3.96875, "learning_rate": 8.738486700760123e-07, "loss": 0.7687, "step": 1578 }, { "epoch": 1.2193050193050192, "grad_norm": 4.4375, "learning_rate": 8.723587252132112e-07, "loss": 0.843, "step": 1579 }, { "epoch": 1.22007722007722, "grad_norm": 4.625, "learning_rate": 8.708693705435633e-07, "loss": 0.9488, "step": 1580 }, { "epoch": 1.220849420849421, "grad_norm": 3.640625, "learning_rate": 8.693806083946919e-07, "loss": 0.8171, "step": 1581 }, { "epoch": 1.2216216216216216, "grad_norm": 5.25, "learning_rate": 8.678924410932957e-07, "loss": 0.8199, "step": 1582 }, { "epoch": 1.2223938223938224, "grad_norm": 4.15625, "learning_rate": 8.664048709651427e-07, "loss": 0.9089, "step": 1583 }, { "epoch": 1.223166023166023, "grad_norm": 3.671875, "learning_rate": 8.649179003350679e-07, "loss": 0.7858, "step": 1584 }, { "epoch": 1.223938223938224, "grad_norm": 3.5, "learning_rate": 8.634315315269701e-07, "loss": 0.7933, "step": 1585 }, { "epoch": 1.2247104247104248, "grad_norm": 3.609375, "learning_rate": 8.619457668638062e-07, "loss": 0.8948, "step": 1586 }, { "epoch": 1.2254826254826254, "grad_norm": 5.65625, "learning_rate": 8.604606086675904e-07, "loss": 0.8229, "step": 1587 }, { "epoch": 1.2262548262548263, "grad_norm": 3.78125, "learning_rate": 8.589760592593885e-07, "loss": 0.772, "step": 1588 }, { "epoch": 1.227027027027027, "grad_norm": 3.6875, "learning_rate": 8.574921209593137e-07, "loss": 0.8532, "step": 1589 }, { "epoch": 1.2277992277992278, "grad_norm": 3.765625, "learning_rate": 8.560087960865265e-07, "loss": 0.8551, "step": 1590 }, { "epoch": 1.2285714285714286, "grad_norm": 4.15625, "learning_rate": 8.545260869592261e-07, "loss": 0.8908, "step": 1591 }, { "epoch": 1.2293436293436293, "grad_norm": 3.984375, "learning_rate": 8.530439958946511e-07, "loss": 0.8782, "step": 1592 }, { "epoch": 1.2301158301158301, "grad_norm": 4.03125, "learning_rate": 8.515625252090742e-07, "loss": 0.817, "step": 1593 }, { "epoch": 1.230888030888031, "grad_norm": 4.0625, "learning_rate": 8.500816772177977e-07, "loss": 0.888, "step": 1594 }, { "epoch": 1.2316602316602316, "grad_norm": 3.609375, "learning_rate": 8.486014542351511e-07, "loss": 0.8671, "step": 1595 }, { "epoch": 1.2324324324324325, "grad_norm": 9.5625, "learning_rate": 8.471218585744873e-07, "loss": 0.7616, "step": 1596 }, { "epoch": 1.2332046332046331, "grad_norm": 3.53125, "learning_rate": 8.45642892548178e-07, "loss": 0.6937, "step": 1597 }, { "epoch": 1.233976833976834, "grad_norm": 4.25, "learning_rate": 8.441645584676128e-07, "loss": 0.9343, "step": 1598 }, { "epoch": 1.2347490347490346, "grad_norm": 4.21875, "learning_rate": 8.426868586431907e-07, "loss": 0.9376, "step": 1599 }, { "epoch": 1.2355212355212355, "grad_norm": 3.96875, "learning_rate": 8.412097953843222e-07, "loss": 0.7981, "step": 1600 }, { "epoch": 1.2362934362934364, "grad_norm": 4.09375, "learning_rate": 8.397333709994221e-07, "loss": 0.8228, "step": 1601 }, { "epoch": 1.237065637065637, "grad_norm": 4.1875, "learning_rate": 8.382575877959054e-07, "loss": 0.9465, "step": 1602 }, { "epoch": 1.2378378378378379, "grad_norm": 5.59375, "learning_rate": 8.36782448080187e-07, "loss": 0.7132, "step": 1603 }, { "epoch": 1.2386100386100387, "grad_norm": 4.0625, "learning_rate": 8.35307954157675e-07, "loss": 1.0105, "step": 1604 }, { "epoch": 1.2393822393822393, "grad_norm": 5.0, "learning_rate": 8.338341083327687e-07, "loss": 0.8873, "step": 1605 }, { "epoch": 1.2401544401544402, "grad_norm": 6.21875, "learning_rate": 8.323609129088544e-07, "loss": 0.8504, "step": 1606 }, { "epoch": 1.2409266409266408, "grad_norm": 3.984375, "learning_rate": 8.308883701883014e-07, "loss": 0.9963, "step": 1607 }, { "epoch": 1.2416988416988417, "grad_norm": 3.921875, "learning_rate": 8.294164824724596e-07, "loss": 0.8879, "step": 1608 }, { "epoch": 1.2424710424710426, "grad_norm": 4.34375, "learning_rate": 8.279452520616554e-07, "loss": 0.8786, "step": 1609 }, { "epoch": 1.2432432432432432, "grad_norm": 4.0, "learning_rate": 8.264746812551869e-07, "loss": 0.8902, "step": 1610 }, { "epoch": 1.244015444015444, "grad_norm": 4.1875, "learning_rate": 8.250047723513225e-07, "loss": 0.9854, "step": 1611 }, { "epoch": 1.2447876447876447, "grad_norm": 4.34375, "learning_rate": 8.235355276472958e-07, "loss": 0.8599, "step": 1612 }, { "epoch": 1.2455598455598456, "grad_norm": 4.71875, "learning_rate": 8.220669494393014e-07, "loss": 0.8251, "step": 1613 }, { "epoch": 1.2463320463320464, "grad_norm": 4.875, "learning_rate": 8.20599040022494e-07, "loss": 0.857, "step": 1614 }, { "epoch": 1.247104247104247, "grad_norm": 4.34375, "learning_rate": 8.191318016909813e-07, "loss": 1.0047, "step": 1615 }, { "epoch": 1.247876447876448, "grad_norm": 3.890625, "learning_rate": 8.176652367378237e-07, "loss": 0.7959, "step": 1616 }, { "epoch": 1.2486486486486488, "grad_norm": 4.0, "learning_rate": 8.161993474550289e-07, "loss": 0.9926, "step": 1617 }, { "epoch": 1.2494208494208494, "grad_norm": 4.53125, "learning_rate": 8.147341361335476e-07, "loss": 0.8453, "step": 1618 }, { "epoch": 1.2501930501930503, "grad_norm": 4.125, "learning_rate": 8.132696050632717e-07, "loss": 1.054, "step": 1619 }, { "epoch": 1.250965250965251, "grad_norm": 3.828125, "learning_rate": 8.118057565330308e-07, "loss": 0.8955, "step": 1620 }, { "epoch": 1.2517374517374518, "grad_norm": 4.21875, "learning_rate": 8.103425928305858e-07, "loss": 0.8096, "step": 1621 }, { "epoch": 1.2525096525096524, "grad_norm": 4.6875, "learning_rate": 8.088801162426294e-07, "loss": 0.8714, "step": 1622 }, { "epoch": 1.2532818532818533, "grad_norm": 4.78125, "learning_rate": 8.074183290547791e-07, "loss": 0.9019, "step": 1623 }, { "epoch": 1.2540540540540541, "grad_norm": 4.15625, "learning_rate": 8.059572335515753e-07, "loss": 0.7973, "step": 1624 }, { "epoch": 1.2548262548262548, "grad_norm": 3.84375, "learning_rate": 8.04496832016478e-07, "loss": 0.8321, "step": 1625 }, { "epoch": 1.2555984555984556, "grad_norm": 4.46875, "learning_rate": 8.030371267318615e-07, "loss": 0.918, "step": 1626 }, { "epoch": 1.2563706563706565, "grad_norm": 3.65625, "learning_rate": 8.015781199790129e-07, "loss": 0.8293, "step": 1627 }, { "epoch": 1.2571428571428571, "grad_norm": 4.625, "learning_rate": 8.001198140381281e-07, "loss": 0.7854, "step": 1628 }, { "epoch": 1.257915057915058, "grad_norm": 3.859375, "learning_rate": 7.986622111883055e-07, "loss": 0.7296, "step": 1629 }, { "epoch": 1.2586872586872586, "grad_norm": 4.25, "learning_rate": 7.972053137075478e-07, "loss": 0.9205, "step": 1630 }, { "epoch": 1.2594594594594595, "grad_norm": 3.65625, "learning_rate": 7.957491238727519e-07, "loss": 0.8642, "step": 1631 }, { "epoch": 1.2602316602316601, "grad_norm": 4.09375, "learning_rate": 7.942936439597117e-07, "loss": 0.8891, "step": 1632 }, { "epoch": 1.261003861003861, "grad_norm": 3.65625, "learning_rate": 7.928388762431104e-07, "loss": 0.7832, "step": 1633 }, { "epoch": 1.2617760617760618, "grad_norm": 5.5, "learning_rate": 7.913848229965176e-07, "loss": 0.9185, "step": 1634 }, { "epoch": 1.2625482625482625, "grad_norm": 5.78125, "learning_rate": 7.899314864923877e-07, "loss": 1.1249, "step": 1635 }, { "epoch": 1.2633204633204633, "grad_norm": 4.53125, "learning_rate": 7.884788690020534e-07, "loss": 0.7625, "step": 1636 }, { "epoch": 1.2640926640926642, "grad_norm": 5.09375, "learning_rate": 7.870269727957247e-07, "loss": 0.845, "step": 1637 }, { "epoch": 1.2648648648648648, "grad_norm": 3.796875, "learning_rate": 7.855758001424844e-07, "loss": 0.8344, "step": 1638 }, { "epoch": 1.2656370656370657, "grad_norm": 3.59375, "learning_rate": 7.841253533102835e-07, "loss": 0.7226, "step": 1639 }, { "epoch": 1.2664092664092665, "grad_norm": 5.5625, "learning_rate": 7.826756345659397e-07, "loss": 0.9839, "step": 1640 }, { "epoch": 1.2671814671814672, "grad_norm": 5.875, "learning_rate": 7.812266461751331e-07, "loss": 0.791, "step": 1641 }, { "epoch": 1.2679536679536678, "grad_norm": 13.6875, "learning_rate": 7.797783904024004e-07, "loss": 0.8722, "step": 1642 }, { "epoch": 1.2687258687258687, "grad_norm": 3.4375, "learning_rate": 7.783308695111352e-07, "loss": 0.8444, "step": 1643 }, { "epoch": 1.2694980694980695, "grad_norm": 4.125, "learning_rate": 7.768840857635827e-07, "loss": 0.8271, "step": 1644 }, { "epoch": 1.2702702702702702, "grad_norm": 4.375, "learning_rate": 7.754380414208346e-07, "loss": 0.7724, "step": 1645 }, { "epoch": 1.271042471042471, "grad_norm": 3.6875, "learning_rate": 7.739927387428287e-07, "loss": 0.8161, "step": 1646 }, { "epoch": 1.271814671814672, "grad_norm": 4.84375, "learning_rate": 7.725481799883417e-07, "loss": 1.0044, "step": 1647 }, { "epoch": 1.2725868725868725, "grad_norm": 4.09375, "learning_rate": 7.711043674149896e-07, "loss": 0.8027, "step": 1648 }, { "epoch": 1.2733590733590734, "grad_norm": 3.828125, "learning_rate": 7.696613032792216e-07, "loss": 0.782, "step": 1649 }, { "epoch": 1.2741312741312742, "grad_norm": 3.578125, "learning_rate": 7.682189898363164e-07, "loss": 0.7417, "step": 1650 }, { "epoch": 1.2749034749034749, "grad_norm": 3.953125, "learning_rate": 7.667774293403804e-07, "loss": 0.8725, "step": 1651 }, { "epoch": 1.2756756756756757, "grad_norm": 4.375, "learning_rate": 7.653366240443435e-07, "loss": 0.9067, "step": 1652 }, { "epoch": 1.2764478764478764, "grad_norm": 7.34375, "learning_rate": 7.638965761999541e-07, "loss": 1.0468, "step": 1653 }, { "epoch": 1.2772200772200772, "grad_norm": 5.5625, "learning_rate": 7.62457288057778e-07, "loss": 0.7233, "step": 1654 }, { "epoch": 1.2779922779922779, "grad_norm": 4.375, "learning_rate": 7.610187618671929e-07, "loss": 0.9234, "step": 1655 }, { "epoch": 1.2787644787644787, "grad_norm": 4.78125, "learning_rate": 7.595809998763866e-07, "loss": 0.9241, "step": 1656 }, { "epoch": 1.2795366795366796, "grad_norm": 4.25, "learning_rate": 7.581440043323522e-07, "loss": 0.8604, "step": 1657 }, { "epoch": 1.2803088803088802, "grad_norm": 3.96875, "learning_rate": 7.567077774808839e-07, "loss": 0.8185, "step": 1658 }, { "epoch": 1.281081081081081, "grad_norm": 3.9375, "learning_rate": 7.552723215665761e-07, "loss": 0.7796, "step": 1659 }, { "epoch": 1.281853281853282, "grad_norm": 3.75, "learning_rate": 7.538376388328181e-07, "loss": 0.8645, "step": 1660 }, { "epoch": 1.2826254826254826, "grad_norm": 4.5625, "learning_rate": 7.524037315217893e-07, "loss": 0.7852, "step": 1661 }, { "epoch": 1.2833976833976835, "grad_norm": 5.03125, "learning_rate": 7.509706018744595e-07, "loss": 0.8679, "step": 1662 }, { "epoch": 1.284169884169884, "grad_norm": 3.4375, "learning_rate": 7.495382521305816e-07, "loss": 0.8314, "step": 1663 }, { "epoch": 1.284942084942085, "grad_norm": 4.375, "learning_rate": 7.481066845286898e-07, "loss": 0.8698, "step": 1664 }, { "epoch": 1.2857142857142856, "grad_norm": 5.4375, "learning_rate": 7.466759013060968e-07, "loss": 0.8465, "step": 1665 }, { "epoch": 1.2864864864864864, "grad_norm": 4.96875, "learning_rate": 7.452459046988883e-07, "loss": 1.0064, "step": 1666 }, { "epoch": 1.2872586872586873, "grad_norm": 7.65625, "learning_rate": 7.438166969419214e-07, "loss": 0.8278, "step": 1667 }, { "epoch": 1.288030888030888, "grad_norm": 3.78125, "learning_rate": 7.423882802688206e-07, "loss": 0.7834, "step": 1668 }, { "epoch": 1.2888030888030888, "grad_norm": 4.15625, "learning_rate": 7.409606569119729e-07, "loss": 1.0048, "step": 1669 }, { "epoch": 1.2895752895752897, "grad_norm": 4.125, "learning_rate": 7.395338291025269e-07, "loss": 0.8986, "step": 1670 }, { "epoch": 1.2903474903474903, "grad_norm": 3.875, "learning_rate": 7.381077990703861e-07, "loss": 0.7976, "step": 1671 }, { "epoch": 1.2911196911196912, "grad_norm": 3.828125, "learning_rate": 7.36682569044209e-07, "loss": 0.9736, "step": 1672 }, { "epoch": 1.291891891891892, "grad_norm": 3.71875, "learning_rate": 7.352581412514033e-07, "loss": 0.7533, "step": 1673 }, { "epoch": 1.2926640926640927, "grad_norm": 3.890625, "learning_rate": 7.338345179181222e-07, "loss": 0.955, "step": 1674 }, { "epoch": 1.2934362934362935, "grad_norm": 4.09375, "learning_rate": 7.324117012692625e-07, "loss": 0.8333, "step": 1675 }, { "epoch": 1.2942084942084942, "grad_norm": 4.40625, "learning_rate": 7.309896935284599e-07, "loss": 0.8148, "step": 1676 }, { "epoch": 1.294980694980695, "grad_norm": 3.859375, "learning_rate": 7.295684969180858e-07, "loss": 0.7954, "step": 1677 }, { "epoch": 1.2957528957528957, "grad_norm": 3.5, "learning_rate": 7.281481136592445e-07, "loss": 0.7964, "step": 1678 }, { "epoch": 1.2965250965250965, "grad_norm": 4.59375, "learning_rate": 7.267285459717682e-07, "loss": 0.969, "step": 1679 }, { "epoch": 1.2972972972972974, "grad_norm": 4.375, "learning_rate": 7.25309796074215e-07, "loss": 0.7696, "step": 1680 }, { "epoch": 1.298069498069498, "grad_norm": 3.765625, "learning_rate": 7.238918661838665e-07, "loss": 0.9975, "step": 1681 }, { "epoch": 1.2988416988416989, "grad_norm": 3.796875, "learning_rate": 7.224747585167189e-07, "loss": 0.856, "step": 1682 }, { "epoch": 1.2996138996138997, "grad_norm": 4.40625, "learning_rate": 7.210584752874874e-07, "loss": 0.9473, "step": 1683 }, { "epoch": 1.3003861003861004, "grad_norm": 3.375, "learning_rate": 7.196430187095965e-07, "loss": 0.7636, "step": 1684 }, { "epoch": 1.3011583011583012, "grad_norm": 4.625, "learning_rate": 7.182283909951796e-07, "loss": 1.0614, "step": 1685 }, { "epoch": 1.3019305019305019, "grad_norm": 3.703125, "learning_rate": 7.168145943550736e-07, "loss": 0.6863, "step": 1686 }, { "epoch": 1.3027027027027027, "grad_norm": 5.90625, "learning_rate": 7.154016309988191e-07, "loss": 0.9015, "step": 1687 }, { "epoch": 1.3034749034749034, "grad_norm": 4.1875, "learning_rate": 7.139895031346507e-07, "loss": 0.9181, "step": 1688 }, { "epoch": 1.3042471042471042, "grad_norm": 8.4375, "learning_rate": 7.125782129695008e-07, "loss": 0.7883, "step": 1689 }, { "epoch": 1.305019305019305, "grad_norm": 3.890625, "learning_rate": 7.111677627089906e-07, "loss": 0.8656, "step": 1690 }, { "epoch": 1.3057915057915057, "grad_norm": 3.875, "learning_rate": 7.097581545574289e-07, "loss": 0.8874, "step": 1691 }, { "epoch": 1.3065637065637066, "grad_norm": 5.4375, "learning_rate": 7.083493907178098e-07, "loss": 0.8312, "step": 1692 }, { "epoch": 1.3073359073359074, "grad_norm": 4.28125, "learning_rate": 7.06941473391805e-07, "loss": 0.8846, "step": 1693 }, { "epoch": 1.308108108108108, "grad_norm": 3.578125, "learning_rate": 7.055344047797663e-07, "loss": 0.8892, "step": 1694 }, { "epoch": 1.308880308880309, "grad_norm": 6.1875, "learning_rate": 7.041281870807176e-07, "loss": 0.7897, "step": 1695 }, { "epoch": 1.3096525096525098, "grad_norm": 7.125, "learning_rate": 7.027228224923528e-07, "loss": 0.9779, "step": 1696 }, { "epoch": 1.3104247104247104, "grad_norm": 6.3125, "learning_rate": 7.013183132110331e-07, "loss": 0.796, "step": 1697 }, { "epoch": 1.311196911196911, "grad_norm": 5.15625, "learning_rate": 6.999146614317827e-07, "loss": 0.9413, "step": 1698 }, { "epoch": 1.311969111969112, "grad_norm": 4.15625, "learning_rate": 6.985118693482853e-07, "loss": 0.8326, "step": 1699 }, { "epoch": 1.3127413127413128, "grad_norm": 4.46875, "learning_rate": 6.971099391528823e-07, "loss": 0.805, "step": 1700 }, { "epoch": 1.3135135135135134, "grad_norm": 4.21875, "learning_rate": 6.957088730365667e-07, "loss": 0.85, "step": 1701 }, { "epoch": 1.3142857142857143, "grad_norm": 3.796875, "learning_rate": 6.943086731889814e-07, "loss": 0.8742, "step": 1702 }, { "epoch": 1.3150579150579151, "grad_norm": 4.6875, "learning_rate": 6.929093417984172e-07, "loss": 0.8221, "step": 1703 }, { "epoch": 1.3158301158301158, "grad_norm": 4.09375, "learning_rate": 6.91510881051804e-07, "loss": 0.7977, "step": 1704 }, { "epoch": 1.3166023166023166, "grad_norm": 3.921875, "learning_rate": 6.901132931347151e-07, "loss": 0.875, "step": 1705 }, { "epoch": 1.3173745173745175, "grad_norm": 3.84375, "learning_rate": 6.887165802313558e-07, "loss": 0.9495, "step": 1706 }, { "epoch": 1.3181467181467181, "grad_norm": 4.1875, "learning_rate": 6.873207445245677e-07, "loss": 0.7844, "step": 1707 }, { "epoch": 1.318918918918919, "grad_norm": 4.03125, "learning_rate": 6.859257881958187e-07, "loss": 0.8096, "step": 1708 }, { "epoch": 1.3196911196911196, "grad_norm": 3.5, "learning_rate": 6.845317134252036e-07, "loss": 0.8774, "step": 1709 }, { "epoch": 1.3204633204633205, "grad_norm": 3.859375, "learning_rate": 6.831385223914385e-07, "loss": 0.7329, "step": 1710 }, { "epoch": 1.3212355212355211, "grad_norm": 3.40625, "learning_rate": 6.817462172718601e-07, "loss": 0.6888, "step": 1711 }, { "epoch": 1.322007722007722, "grad_norm": 15.6875, "learning_rate": 6.803548002424192e-07, "loss": 0.7833, "step": 1712 }, { "epoch": 1.3227799227799228, "grad_norm": 3.796875, "learning_rate": 6.789642734776786e-07, "loss": 0.9026, "step": 1713 }, { "epoch": 1.3235521235521235, "grad_norm": 4.375, "learning_rate": 6.775746391508105e-07, "loss": 0.8733, "step": 1714 }, { "epoch": 1.3243243243243243, "grad_norm": 4.125, "learning_rate": 6.761858994335913e-07, "loss": 0.8889, "step": 1715 }, { "epoch": 1.3250965250965252, "grad_norm": 15.5, "learning_rate": 6.747980564964012e-07, "loss": 0.9366, "step": 1716 }, { "epoch": 1.3258687258687258, "grad_norm": 3.640625, "learning_rate": 6.73411112508216e-07, "loss": 0.7715, "step": 1717 }, { "epoch": 1.3266409266409267, "grad_norm": 3.84375, "learning_rate": 6.720250696366096e-07, "loss": 0.8419, "step": 1718 }, { "epoch": 1.3274131274131273, "grad_norm": 4.15625, "learning_rate": 6.706399300477456e-07, "loss": 0.8913, "step": 1719 }, { "epoch": 1.3281853281853282, "grad_norm": 3.921875, "learning_rate": 6.692556959063763e-07, "loss": 0.8513, "step": 1720 }, { "epoch": 1.3289575289575288, "grad_norm": 3.640625, "learning_rate": 6.678723693758403e-07, "loss": 0.8583, "step": 1721 }, { "epoch": 1.3297297297297297, "grad_norm": 16.375, "learning_rate": 6.664899526180547e-07, "loss": 0.8391, "step": 1722 }, { "epoch": 1.3305019305019306, "grad_norm": 3.90625, "learning_rate": 6.651084477935183e-07, "loss": 0.897, "step": 1723 }, { "epoch": 1.3312741312741312, "grad_norm": 4.09375, "learning_rate": 6.637278570613024e-07, "loss": 0.8865, "step": 1724 }, { "epoch": 1.332046332046332, "grad_norm": 3.546875, "learning_rate": 6.623481825790503e-07, "loss": 0.8357, "step": 1725 }, { "epoch": 1.332818532818533, "grad_norm": 3.8125, "learning_rate": 6.609694265029734e-07, "loss": 0.8474, "step": 1726 }, { "epoch": 1.3335907335907335, "grad_norm": 4.0625, "learning_rate": 6.595915909878489e-07, "loss": 0.9265, "step": 1727 }, { "epoch": 1.3343629343629344, "grad_norm": 4.03125, "learning_rate": 6.582146781870122e-07, "loss": 0.8714, "step": 1728 }, { "epoch": 1.3351351351351353, "grad_norm": 3.859375, "learning_rate": 6.568386902523602e-07, "loss": 0.8234, "step": 1729 }, { "epoch": 1.335907335907336, "grad_norm": 3.65625, "learning_rate": 6.554636293343424e-07, "loss": 0.8039, "step": 1730 }, { "epoch": 1.3366795366795368, "grad_norm": 4.3125, "learning_rate": 6.540894975819597e-07, "loss": 0.7853, "step": 1731 }, { "epoch": 1.3374517374517374, "grad_norm": 4.75, "learning_rate": 6.527162971427623e-07, "loss": 0.9504, "step": 1732 }, { "epoch": 1.3382239382239383, "grad_norm": 4.09375, "learning_rate": 6.513440301628418e-07, "loss": 0.8849, "step": 1733 }, { "epoch": 1.338996138996139, "grad_norm": 3.84375, "learning_rate": 6.499726987868343e-07, "loss": 0.8974, "step": 1734 }, { "epoch": 1.3397683397683398, "grad_norm": 5.625, "learning_rate": 6.486023051579122e-07, "loss": 0.8387, "step": 1735 }, { "epoch": 1.3405405405405406, "grad_norm": 6.5625, "learning_rate": 6.472328514177817e-07, "loss": 0.9246, "step": 1736 }, { "epoch": 1.3413127413127413, "grad_norm": 4.90625, "learning_rate": 6.458643397066813e-07, "loss": 0.9701, "step": 1737 }, { "epoch": 1.3420849420849421, "grad_norm": 4.3125, "learning_rate": 6.44496772163377e-07, "loss": 0.8228, "step": 1738 }, { "epoch": 1.342857142857143, "grad_norm": 5.46875, "learning_rate": 6.431301509251581e-07, "loss": 0.759, "step": 1739 }, { "epoch": 1.3436293436293436, "grad_norm": 4.1875, "learning_rate": 6.41764478127837e-07, "loss": 0.8518, "step": 1740 }, { "epoch": 1.3444015444015445, "grad_norm": 3.453125, "learning_rate": 6.403997559057425e-07, "loss": 0.7614, "step": 1741 }, { "epoch": 1.345173745173745, "grad_norm": 4.09375, "learning_rate": 6.390359863917171e-07, "loss": 0.8831, "step": 1742 }, { "epoch": 1.345945945945946, "grad_norm": 3.96875, "learning_rate": 6.376731717171171e-07, "loss": 0.7221, "step": 1743 }, { "epoch": 1.3467181467181466, "grad_norm": 4.625, "learning_rate": 6.363113140118026e-07, "loss": 0.77, "step": 1744 }, { "epoch": 1.3474903474903475, "grad_norm": 4.0625, "learning_rate": 6.349504154041422e-07, "loss": 0.8971, "step": 1745 }, { "epoch": 1.3482625482625483, "grad_norm": 4.625, "learning_rate": 6.335904780210017e-07, "loss": 0.8318, "step": 1746 }, { "epoch": 1.349034749034749, "grad_norm": 4.0, "learning_rate": 6.322315039877483e-07, "loss": 0.9475, "step": 1747 }, { "epoch": 1.3498069498069498, "grad_norm": 3.765625, "learning_rate": 6.30873495428241e-07, "loss": 0.754, "step": 1748 }, { "epoch": 1.3505791505791507, "grad_norm": 3.625, "learning_rate": 6.295164544648311e-07, "loss": 0.6355, "step": 1749 }, { "epoch": 1.3513513513513513, "grad_norm": 4.21875, "learning_rate": 6.281603832183569e-07, "loss": 0.8965, "step": 1750 }, { "epoch": 1.3521235521235522, "grad_norm": 3.953125, "learning_rate": 6.268052838081428e-07, "loss": 0.8583, "step": 1751 }, { "epoch": 1.352895752895753, "grad_norm": 3.546875, "learning_rate": 6.254511583519926e-07, "loss": 0.7909, "step": 1752 }, { "epoch": 1.3536679536679537, "grad_norm": 4.09375, "learning_rate": 6.240980089661888e-07, "loss": 0.8789, "step": 1753 }, { "epoch": 1.3544401544401543, "grad_norm": 5.6875, "learning_rate": 6.227458377654885e-07, "loss": 0.9242, "step": 1754 }, { "epoch": 1.3552123552123552, "grad_norm": 4.0, "learning_rate": 6.213946468631192e-07, "loss": 0.9244, "step": 1755 }, { "epoch": 1.355984555984556, "grad_norm": 4.71875, "learning_rate": 6.200444383707783e-07, "loss": 0.9192, "step": 1756 }, { "epoch": 1.3567567567567567, "grad_norm": 3.921875, "learning_rate": 6.186952143986251e-07, "loss": 0.7152, "step": 1757 }, { "epoch": 1.3575289575289575, "grad_norm": 4.59375, "learning_rate": 6.173469770552828e-07, "loss": 0.7707, "step": 1758 }, { "epoch": 1.3583011583011584, "grad_norm": 3.59375, "learning_rate": 6.159997284478314e-07, "loss": 0.8576, "step": 1759 }, { "epoch": 1.359073359073359, "grad_norm": 3.921875, "learning_rate": 6.146534706818051e-07, "loss": 0.805, "step": 1760 }, { "epoch": 1.3598455598455599, "grad_norm": 4.125, "learning_rate": 6.133082058611918e-07, "loss": 0.8639, "step": 1761 }, { "epoch": 1.3606177606177607, "grad_norm": 4.0625, "learning_rate": 6.119639360884244e-07, "loss": 0.9422, "step": 1762 }, { "epoch": 1.3613899613899614, "grad_norm": 4.5625, "learning_rate": 6.106206634643837e-07, "loss": 0.9223, "step": 1763 }, { "epoch": 1.3621621621621622, "grad_norm": 3.828125, "learning_rate": 6.092783900883903e-07, "loss": 0.9274, "step": 1764 }, { "epoch": 1.3629343629343629, "grad_norm": 3.765625, "learning_rate": 6.079371180582038e-07, "loss": 0.8436, "step": 1765 }, { "epoch": 1.3637065637065637, "grad_norm": 4.6875, "learning_rate": 6.065968494700181e-07, "loss": 0.9433, "step": 1766 }, { "epoch": 1.3644787644787644, "grad_norm": 4.0, "learning_rate": 6.052575864184609e-07, "loss": 0.7827, "step": 1767 }, { "epoch": 1.3652509652509652, "grad_norm": 3.625, "learning_rate": 6.039193309965853e-07, "loss": 0.7521, "step": 1768 }, { "epoch": 1.366023166023166, "grad_norm": 3.6875, "learning_rate": 6.025820852958726e-07, "loss": 0.7728, "step": 1769 }, { "epoch": 1.3667953667953667, "grad_norm": 4.25, "learning_rate": 6.012458514062243e-07, "loss": 0.7562, "step": 1770 }, { "epoch": 1.3675675675675676, "grad_norm": 3.796875, "learning_rate": 5.999106314159607e-07, "loss": 0.8732, "step": 1771 }, { "epoch": 1.3683397683397684, "grad_norm": 4.21875, "learning_rate": 5.985764274118194e-07, "loss": 0.8962, "step": 1772 }, { "epoch": 1.369111969111969, "grad_norm": 3.71875, "learning_rate": 5.972432414789467e-07, "loss": 0.7849, "step": 1773 }, { "epoch": 1.36988416988417, "grad_norm": 4.78125, "learning_rate": 5.959110757009014e-07, "loss": 0.8967, "step": 1774 }, { "epoch": 1.3706563706563706, "grad_norm": 4.25, "learning_rate": 5.945799321596458e-07, "loss": 0.765, "step": 1775 }, { "epoch": 1.3714285714285714, "grad_norm": 3.953125, "learning_rate": 5.932498129355452e-07, "loss": 0.8683, "step": 1776 }, { "epoch": 1.372200772200772, "grad_norm": 4.75, "learning_rate": 5.919207201073642e-07, "loss": 0.8686, "step": 1777 }, { "epoch": 1.372972972972973, "grad_norm": 5.25, "learning_rate": 5.905926557522629e-07, "loss": 0.8266, "step": 1778 }, { "epoch": 1.3737451737451738, "grad_norm": 5.09375, "learning_rate": 5.892656219457942e-07, "loss": 0.8566, "step": 1779 }, { "epoch": 1.3745173745173744, "grad_norm": 4.0, "learning_rate": 5.87939620761901e-07, "loss": 0.8095, "step": 1780 }, { "epoch": 1.3752895752895753, "grad_norm": 3.90625, "learning_rate": 5.866146542729119e-07, "loss": 0.8584, "step": 1781 }, { "epoch": 1.3760617760617762, "grad_norm": 4.59375, "learning_rate": 5.852907245495378e-07, "loss": 0.7615, "step": 1782 }, { "epoch": 1.3768339768339768, "grad_norm": 3.765625, "learning_rate": 5.839678336608713e-07, "loss": 0.8035, "step": 1783 }, { "epoch": 1.3776061776061777, "grad_norm": 3.859375, "learning_rate": 5.826459836743784e-07, "loss": 0.8249, "step": 1784 }, { "epoch": 1.3783783783783785, "grad_norm": 4.5625, "learning_rate": 5.813251766559018e-07, "loss": 0.8488, "step": 1785 }, { "epoch": 1.3791505791505791, "grad_norm": 3.578125, "learning_rate": 5.800054146696506e-07, "loss": 0.7483, "step": 1786 }, { "epoch": 1.3799227799227798, "grad_norm": 4.8125, "learning_rate": 5.78686699778204e-07, "loss": 1.0012, "step": 1787 }, { "epoch": 1.3806949806949806, "grad_norm": 4.5625, "learning_rate": 5.773690340425027e-07, "loss": 0.8294, "step": 1788 }, { "epoch": 1.3814671814671815, "grad_norm": 3.65625, "learning_rate": 5.760524195218484e-07, "loss": 0.7572, "step": 1789 }, { "epoch": 1.3822393822393821, "grad_norm": 4.09375, "learning_rate": 5.747368582738995e-07, "loss": 0.812, "step": 1790 }, { "epoch": 1.383011583011583, "grad_norm": 5.21875, "learning_rate": 5.734223523546695e-07, "loss": 0.8795, "step": 1791 }, { "epoch": 1.3837837837837839, "grad_norm": 3.71875, "learning_rate": 5.721089038185213e-07, "loss": 0.7826, "step": 1792 }, { "epoch": 1.3845559845559845, "grad_norm": 4.5, "learning_rate": 5.707965147181657e-07, "loss": 0.8299, "step": 1793 }, { "epoch": 1.3853281853281854, "grad_norm": 5.125, "learning_rate": 5.694851871046578e-07, "loss": 0.9368, "step": 1794 }, { "epoch": 1.3861003861003862, "grad_norm": 4.46875, "learning_rate": 5.681749230273934e-07, "loss": 0.894, "step": 1795 }, { "epoch": 1.3868725868725869, "grad_norm": 4.9375, "learning_rate": 5.668657245341076e-07, "loss": 0.8951, "step": 1796 }, { "epoch": 1.3876447876447877, "grad_norm": 4.1875, "learning_rate": 5.655575936708676e-07, "loss": 0.9619, "step": 1797 }, { "epoch": 1.3884169884169884, "grad_norm": 3.734375, "learning_rate": 5.642505324820748e-07, "loss": 0.7664, "step": 1798 }, { "epoch": 1.3891891891891892, "grad_norm": 4.15625, "learning_rate": 5.62944543010457e-07, "loss": 0.7457, "step": 1799 }, { "epoch": 1.3899613899613898, "grad_norm": 4.15625, "learning_rate": 5.616396272970674e-07, "loss": 0.876, "step": 1800 }, { "epoch": 1.3907335907335907, "grad_norm": 4.3125, "learning_rate": 5.603357873812828e-07, "loss": 0.8922, "step": 1801 }, { "epoch": 1.3915057915057916, "grad_norm": 4.5, "learning_rate": 5.590330253007954e-07, "loss": 1.043, "step": 1802 }, { "epoch": 1.3922779922779922, "grad_norm": 4.9375, "learning_rate": 5.577313430916162e-07, "loss": 1.0113, "step": 1803 }, { "epoch": 1.393050193050193, "grad_norm": 3.828125, "learning_rate": 5.564307427880665e-07, "loss": 0.8597, "step": 1804 }, { "epoch": 1.393822393822394, "grad_norm": 11.125, "learning_rate": 5.551312264227776e-07, "loss": 0.8461, "step": 1805 }, { "epoch": 1.3945945945945946, "grad_norm": 5.5625, "learning_rate": 5.538327960266859e-07, "loss": 0.8142, "step": 1806 }, { "epoch": 1.3953667953667954, "grad_norm": 5.15625, "learning_rate": 5.525354536290328e-07, "loss": 0.9585, "step": 1807 }, { "epoch": 1.396138996138996, "grad_norm": 3.609375, "learning_rate": 5.51239201257356e-07, "loss": 0.7967, "step": 1808 }, { "epoch": 1.396911196911197, "grad_norm": 4.28125, "learning_rate": 5.499440409374928e-07, "loss": 0.8332, "step": 1809 }, { "epoch": 1.3976833976833976, "grad_norm": 3.921875, "learning_rate": 5.486499746935717e-07, "loss": 0.7873, "step": 1810 }, { "epoch": 1.3984555984555984, "grad_norm": 4.34375, "learning_rate": 5.47357004548012e-07, "loss": 0.8482, "step": 1811 }, { "epoch": 1.3992277992277993, "grad_norm": 4.0, "learning_rate": 5.460651325215214e-07, "loss": 0.8766, "step": 1812 }, { "epoch": 1.4, "grad_norm": 3.8125, "learning_rate": 5.44774360633088e-07, "loss": 0.822, "step": 1813 }, { "epoch": 1.4007722007722008, "grad_norm": 6.09375, "learning_rate": 5.434846908999842e-07, "loss": 0.9471, "step": 1814 }, { "epoch": 1.4015444015444016, "grad_norm": 3.421875, "learning_rate": 5.421961253377579e-07, "loss": 0.7615, "step": 1815 }, { "epoch": 1.4023166023166023, "grad_norm": 4.5, "learning_rate": 5.409086659602317e-07, "loss": 0.9039, "step": 1816 }, { "epoch": 1.4030888030888031, "grad_norm": 4.46875, "learning_rate": 5.396223147794996e-07, "loss": 0.8843, "step": 1817 }, { "epoch": 1.403861003861004, "grad_norm": 4.34375, "learning_rate": 5.383370738059235e-07, "loss": 0.9446, "step": 1818 }, { "epoch": 1.4046332046332046, "grad_norm": 3.71875, "learning_rate": 5.370529450481299e-07, "loss": 0.8393, "step": 1819 }, { "epoch": 1.4054054054054055, "grad_norm": 3.703125, "learning_rate": 5.357699305130085e-07, "loss": 0.9306, "step": 1820 }, { "epoch": 1.4061776061776061, "grad_norm": 4.0, "learning_rate": 5.344880322057058e-07, "loss": 0.8207, "step": 1821 }, { "epoch": 1.406949806949807, "grad_norm": 3.421875, "learning_rate": 5.332072521296246e-07, "loss": 0.7287, "step": 1822 }, { "epoch": 1.4077220077220076, "grad_norm": 3.671875, "learning_rate": 5.319275922864213e-07, "loss": 0.8051, "step": 1823 }, { "epoch": 1.4084942084942085, "grad_norm": 3.65625, "learning_rate": 5.306490546759985e-07, "loss": 0.8911, "step": 1824 }, { "epoch": 1.4092664092664093, "grad_norm": 3.546875, "learning_rate": 5.293716412965089e-07, "loss": 0.7776, "step": 1825 }, { "epoch": 1.41003861003861, "grad_norm": 4.4375, "learning_rate": 5.280953541443439e-07, "loss": 0.8904, "step": 1826 }, { "epoch": 1.4108108108108108, "grad_norm": 3.96875, "learning_rate": 5.268201952141387e-07, "loss": 0.7931, "step": 1827 }, { "epoch": 1.4115830115830117, "grad_norm": 5.84375, "learning_rate": 5.255461664987629e-07, "loss": 0.891, "step": 1828 }, { "epoch": 1.4123552123552123, "grad_norm": 3.46875, "learning_rate": 5.242732699893205e-07, "loss": 0.8085, "step": 1829 }, { "epoch": 1.4131274131274132, "grad_norm": 4.28125, "learning_rate": 5.230015076751456e-07, "loss": 0.7842, "step": 1830 }, { "epoch": 1.4138996138996138, "grad_norm": 4.71875, "learning_rate": 5.217308815438009e-07, "loss": 0.8685, "step": 1831 }, { "epoch": 1.4146718146718147, "grad_norm": 3.921875, "learning_rate": 5.204613935810723e-07, "loss": 0.8885, "step": 1832 }, { "epoch": 1.4154440154440153, "grad_norm": 4.3125, "learning_rate": 5.191930457709671e-07, "loss": 0.8169, "step": 1833 }, { "epoch": 1.4162162162162162, "grad_norm": 4.4375, "learning_rate": 5.179258400957109e-07, "loss": 0.7632, "step": 1834 }, { "epoch": 1.416988416988417, "grad_norm": 3.53125, "learning_rate": 5.166597785357439e-07, "loss": 0.7025, "step": 1835 }, { "epoch": 1.4177606177606177, "grad_norm": 3.984375, "learning_rate": 5.153948630697199e-07, "loss": 0.8342, "step": 1836 }, { "epoch": 1.4185328185328185, "grad_norm": 3.78125, "learning_rate": 5.141310956744987e-07, "loss": 0.8603, "step": 1837 }, { "epoch": 1.4193050193050194, "grad_norm": 5.03125, "learning_rate": 5.128684783251485e-07, "loss": 0.9533, "step": 1838 }, { "epoch": 1.42007722007722, "grad_norm": 3.765625, "learning_rate": 5.116070129949387e-07, "loss": 0.936, "step": 1839 }, { "epoch": 1.420849420849421, "grad_norm": 4.3125, "learning_rate": 5.103467016553382e-07, "loss": 0.9303, "step": 1840 }, { "epoch": 1.4216216216216218, "grad_norm": 5.875, "learning_rate": 5.090875462760143e-07, "loss": 0.9198, "step": 1841 }, { "epoch": 1.4223938223938224, "grad_norm": 3.984375, "learning_rate": 5.078295488248243e-07, "loss": 0.9924, "step": 1842 }, { "epoch": 1.423166023166023, "grad_norm": 3.375, "learning_rate": 5.065727112678194e-07, "loss": 0.7905, "step": 1843 }, { "epoch": 1.423938223938224, "grad_norm": 3.953125, "learning_rate": 5.053170355692358e-07, "loss": 0.931, "step": 1844 }, { "epoch": 1.4247104247104247, "grad_norm": 4.0, "learning_rate": 5.040625236914946e-07, "loss": 0.8642, "step": 1845 }, { "epoch": 1.4254826254826254, "grad_norm": 22.375, "learning_rate": 5.028091775951978e-07, "loss": 0.9582, "step": 1846 }, { "epoch": 1.4262548262548262, "grad_norm": 3.96875, "learning_rate": 5.015569992391269e-07, "loss": 0.9074, "step": 1847 }, { "epoch": 1.427027027027027, "grad_norm": 4.625, "learning_rate": 5.003059905802353e-07, "loss": 0.8879, "step": 1848 }, { "epoch": 1.4277992277992277, "grad_norm": 4.15625, "learning_rate": 4.990561535736516e-07, "loss": 0.9356, "step": 1849 }, { "epoch": 1.4285714285714286, "grad_norm": 3.8125, "learning_rate": 4.978074901726717e-07, "loss": 0.7466, "step": 1850 }, { "epoch": 1.4293436293436295, "grad_norm": 5.71875, "learning_rate": 4.965600023287569e-07, "loss": 0.9533, "step": 1851 }, { "epoch": 1.43011583011583, "grad_norm": 2.140625, "learning_rate": 4.953136919915333e-07, "loss": 0.3164, "step": 1852 }, { "epoch": 1.430888030888031, "grad_norm": 5.21875, "learning_rate": 4.940685611087838e-07, "loss": 0.9674, "step": 1853 }, { "epoch": 1.4316602316602316, "grad_norm": 4.15625, "learning_rate": 4.928246116264508e-07, "loss": 0.7411, "step": 1854 }, { "epoch": 1.4324324324324325, "grad_norm": 4.625, "learning_rate": 4.915818454886289e-07, "loss": 0.9046, "step": 1855 }, { "epoch": 1.433204633204633, "grad_norm": 3.953125, "learning_rate": 4.903402646375635e-07, "loss": 0.8816, "step": 1856 }, { "epoch": 1.433976833976834, "grad_norm": 3.65625, "learning_rate": 4.890998710136477e-07, "loss": 0.7849, "step": 1857 }, { "epoch": 1.4347490347490348, "grad_norm": 3.984375, "learning_rate": 4.878606665554192e-07, "loss": 0.8682, "step": 1858 }, { "epoch": 1.4355212355212355, "grad_norm": 4.0, "learning_rate": 4.866226531995565e-07, "loss": 0.8872, "step": 1859 }, { "epoch": 1.4362934362934363, "grad_norm": 4.21875, "learning_rate": 4.853858328808783e-07, "loss": 0.9324, "step": 1860 }, { "epoch": 1.4370656370656372, "grad_norm": 4.15625, "learning_rate": 4.841502075323372e-07, "loss": 0.8422, "step": 1861 }, { "epoch": 1.4378378378378378, "grad_norm": 3.703125, "learning_rate": 4.829157790850185e-07, "loss": 0.7943, "step": 1862 }, { "epoch": 1.4386100386100387, "grad_norm": 4.40625, "learning_rate": 4.816825494681384e-07, "loss": 0.791, "step": 1863 }, { "epoch": 1.4393822393822393, "grad_norm": 10.75, "learning_rate": 4.804505206090366e-07, "loss": 0.9508, "step": 1864 }, { "epoch": 1.4401544401544402, "grad_norm": 4.09375, "learning_rate": 4.792196944331796e-07, "loss": 0.8856, "step": 1865 }, { "epoch": 1.4409266409266408, "grad_norm": 4.5625, "learning_rate": 4.779900728641513e-07, "loss": 0.7784, "step": 1866 }, { "epoch": 1.4416988416988417, "grad_norm": 4.28125, "learning_rate": 4.76761657823655e-07, "loss": 0.8138, "step": 1867 }, { "epoch": 1.4424710424710425, "grad_norm": 4.3125, "learning_rate": 4.7553445123150773e-07, "loss": 0.9982, "step": 1868 }, { "epoch": 1.4432432432432432, "grad_norm": 4.78125, "learning_rate": 4.743084550056377e-07, "loss": 0.8931, "step": 1869 }, { "epoch": 1.444015444015444, "grad_norm": 3.703125, "learning_rate": 4.7308367106208145e-07, "loss": 0.871, "step": 1870 }, { "epoch": 1.4447876447876449, "grad_norm": 4.53125, "learning_rate": 4.718601013149819e-07, "loss": 0.9611, "step": 1871 }, { "epoch": 1.4455598455598455, "grad_norm": 4.46875, "learning_rate": 4.706377476765832e-07, "loss": 0.7958, "step": 1872 }, { "epoch": 1.4463320463320464, "grad_norm": 4.28125, "learning_rate": 4.694166120572294e-07, "loss": 0.9189, "step": 1873 }, { "epoch": 1.4471042471042472, "grad_norm": 3.640625, "learning_rate": 4.68196696365361e-07, "loss": 0.8188, "step": 1874 }, { "epoch": 1.4478764478764479, "grad_norm": 4.03125, "learning_rate": 4.6697800250751146e-07, "loss": 0.957, "step": 1875 }, { "epoch": 1.4486486486486487, "grad_norm": 4.5625, "learning_rate": 4.657605323883063e-07, "loss": 0.8007, "step": 1876 }, { "epoch": 1.4494208494208494, "grad_norm": 3.96875, "learning_rate": 4.6454428791045575e-07, "loss": 0.8779, "step": 1877 }, { "epoch": 1.4501930501930502, "grad_norm": 3.984375, "learning_rate": 4.6332927097475737e-07, "loss": 0.8227, "step": 1878 }, { "epoch": 1.4509652509652509, "grad_norm": 7.53125, "learning_rate": 4.621154834800887e-07, "loss": 0.7647, "step": 1879 }, { "epoch": 1.4517374517374517, "grad_norm": 4.21875, "learning_rate": 4.6090292732340584e-07, "loss": 0.8427, "step": 1880 }, { "epoch": 1.4525096525096526, "grad_norm": 4.96875, "learning_rate": 4.596916043997422e-07, "loss": 0.9078, "step": 1881 }, { "epoch": 1.4532818532818532, "grad_norm": 4.34375, "learning_rate": 4.584815166022008e-07, "loss": 0.9216, "step": 1882 }, { "epoch": 1.454054054054054, "grad_norm": 7.34375, "learning_rate": 4.5727266582195743e-07, "loss": 0.9028, "step": 1883 }, { "epoch": 1.454826254826255, "grad_norm": 3.796875, "learning_rate": 4.560650539482526e-07, "loss": 0.8112, "step": 1884 }, { "epoch": 1.4555984555984556, "grad_norm": 5.5625, "learning_rate": 4.5485868286839165e-07, "loss": 0.8558, "step": 1885 }, { "epoch": 1.4563706563706564, "grad_norm": 4.125, "learning_rate": 4.5365355446773966e-07, "loss": 1.0183, "step": 1886 }, { "epoch": 1.457142857142857, "grad_norm": 4.84375, "learning_rate": 4.5244967062972185e-07, "loss": 0.9008, "step": 1887 }, { "epoch": 1.457915057915058, "grad_norm": 4.0, "learning_rate": 4.5124703323581514e-07, "loss": 0.7499, "step": 1888 }, { "epoch": 1.4586872586872586, "grad_norm": 3.421875, "learning_rate": 4.500456441655514e-07, "loss": 0.7213, "step": 1889 }, { "epoch": 1.4594594594594594, "grad_norm": 4.0625, "learning_rate": 4.4884550529650986e-07, "loss": 0.8478, "step": 1890 }, { "epoch": 1.4602316602316603, "grad_norm": 4.21875, "learning_rate": 4.4764661850431614e-07, "loss": 0.9177, "step": 1891 }, { "epoch": 1.461003861003861, "grad_norm": 4.25, "learning_rate": 4.4644898566264044e-07, "loss": 0.8707, "step": 1892 }, { "epoch": 1.4617760617760618, "grad_norm": 126.5, "learning_rate": 4.452526086431905e-07, "loss": 0.7893, "step": 1893 }, { "epoch": 1.4625482625482626, "grad_norm": 2.78125, "learning_rate": 4.4405748931571413e-07, "loss": 0.5358, "step": 1894 }, { "epoch": 1.4633204633204633, "grad_norm": 3.90625, "learning_rate": 4.4286362954799227e-07, "loss": 0.8613, "step": 1895 }, { "epoch": 1.4640926640926641, "grad_norm": 4.9375, "learning_rate": 4.416710312058374e-07, "loss": 0.8728, "step": 1896 }, { "epoch": 1.464864864864865, "grad_norm": 4.96875, "learning_rate": 4.404796961530906e-07, "loss": 0.9573, "step": 1897 }, { "epoch": 1.4656370656370656, "grad_norm": 3.8125, "learning_rate": 4.3928962625161916e-07, "loss": 0.8432, "step": 1898 }, { "epoch": 1.4664092664092663, "grad_norm": 4.34375, "learning_rate": 4.381008233613121e-07, "loss": 0.9988, "step": 1899 }, { "epoch": 1.4671814671814671, "grad_norm": 4.5, "learning_rate": 4.3691328934007974e-07, "loss": 0.9599, "step": 1900 }, { "epoch": 1.467953667953668, "grad_norm": 4.4375, "learning_rate": 4.3572702604384815e-07, "loss": 0.9009, "step": 1901 }, { "epoch": 1.4687258687258686, "grad_norm": 3.90625, "learning_rate": 4.3454203532655767e-07, "loss": 0.7877, "step": 1902 }, { "epoch": 1.4694980694980695, "grad_norm": 3.875, "learning_rate": 4.3335831904016106e-07, "loss": 0.8482, "step": 1903 }, { "epoch": 1.4702702702702704, "grad_norm": 10.5625, "learning_rate": 4.3217587903461684e-07, "loss": 0.8729, "step": 1904 }, { "epoch": 1.471042471042471, "grad_norm": 4.09375, "learning_rate": 4.309947171578914e-07, "loss": 0.8453, "step": 1905 }, { "epoch": 1.4718146718146718, "grad_norm": 3.828125, "learning_rate": 4.2981483525595226e-07, "loss": 0.7928, "step": 1906 }, { "epoch": 1.4725868725868727, "grad_norm": 4.4375, "learning_rate": 4.28636235172767e-07, "loss": 0.8799, "step": 1907 }, { "epoch": 1.4733590733590733, "grad_norm": 3.65625, "learning_rate": 4.2745891875029965e-07, "loss": 0.7322, "step": 1908 }, { "epoch": 1.4741312741312742, "grad_norm": 3.84375, "learning_rate": 4.2628288782850814e-07, "loss": 0.7739, "step": 1909 }, { "epoch": 1.4749034749034748, "grad_norm": 5.09375, "learning_rate": 4.2510814424534126e-07, "loss": 0.9672, "step": 1910 }, { "epoch": 1.4756756756756757, "grad_norm": 4.15625, "learning_rate": 4.2393468983673676e-07, "loss": 0.9543, "step": 1911 }, { "epoch": 1.4764478764478763, "grad_norm": 3.828125, "learning_rate": 4.227625264366167e-07, "loss": 0.7517, "step": 1912 }, { "epoch": 1.4772200772200772, "grad_norm": 4.21875, "learning_rate": 4.215916558768851e-07, "loss": 0.8776, "step": 1913 }, { "epoch": 1.477992277992278, "grad_norm": 3.9375, "learning_rate": 4.2042207998742757e-07, "loss": 0.9591, "step": 1914 }, { "epoch": 1.4787644787644787, "grad_norm": 3.78125, "learning_rate": 4.1925380059610306e-07, "loss": 0.9216, "step": 1915 }, { "epoch": 1.4795366795366796, "grad_norm": 3.828125, "learning_rate": 4.18086819528748e-07, "loss": 0.827, "step": 1916 }, { "epoch": 1.4803088803088804, "grad_norm": 3.640625, "learning_rate": 4.169211386091661e-07, "loss": 0.9272, "step": 1917 }, { "epoch": 1.481081081081081, "grad_norm": 7.59375, "learning_rate": 4.157567596591322e-07, "loss": 0.8904, "step": 1918 }, { "epoch": 1.481853281853282, "grad_norm": 3.90625, "learning_rate": 4.1459368449838474e-07, "loss": 0.8289, "step": 1919 }, { "epoch": 1.4826254826254825, "grad_norm": 3.78125, "learning_rate": 4.1343191494462446e-07, "loss": 0.8783, "step": 1920 }, { "epoch": 1.4833976833976834, "grad_norm": 3.84375, "learning_rate": 4.12271452813513e-07, "loss": 0.8013, "step": 1921 }, { "epoch": 1.484169884169884, "grad_norm": 4.5, "learning_rate": 4.111122999186673e-07, "loss": 0.8684, "step": 1922 }, { "epoch": 1.484942084942085, "grad_norm": 4.28125, "learning_rate": 4.099544580716587e-07, "loss": 0.882, "step": 1923 }, { "epoch": 1.4857142857142858, "grad_norm": 4.125, "learning_rate": 4.087979290820096e-07, "loss": 0.8741, "step": 1924 }, { "epoch": 1.4864864864864864, "grad_norm": 10.375, "learning_rate": 4.076427147571904e-07, "loss": 0.8881, "step": 1925 }, { "epoch": 1.4872586872586873, "grad_norm": 4.125, "learning_rate": 4.0648881690261697e-07, "loss": 0.8169, "step": 1926 }, { "epoch": 1.4880308880308881, "grad_norm": 4.09375, "learning_rate": 4.053362373216488e-07, "loss": 0.9706, "step": 1927 }, { "epoch": 1.4888030888030888, "grad_norm": 3.671875, "learning_rate": 4.041849778155826e-07, "loss": 0.7698, "step": 1928 }, { "epoch": 1.4895752895752896, "grad_norm": 3.0625, "learning_rate": 4.0303504018365495e-07, "loss": 0.4142, "step": 1929 }, { "epoch": 1.4903474903474905, "grad_norm": 3.84375, "learning_rate": 4.018864262230346e-07, "loss": 0.7064, "step": 1930 }, { "epoch": 1.4911196911196911, "grad_norm": 4.25, "learning_rate": 4.007391377288217e-07, "loss": 0.8969, "step": 1931 }, { "epoch": 1.491891891891892, "grad_norm": 3.703125, "learning_rate": 3.995931764940468e-07, "loss": 0.7282, "step": 1932 }, { "epoch": 1.4926640926640926, "grad_norm": 3.890625, "learning_rate": 3.9844854430966305e-07, "loss": 0.8975, "step": 1933 }, { "epoch": 1.4934362934362935, "grad_norm": 8.1875, "learning_rate": 3.973052429645495e-07, "loss": 0.8187, "step": 1934 }, { "epoch": 1.494208494208494, "grad_norm": 3.796875, "learning_rate": 3.9616327424550365e-07, "loss": 0.8047, "step": 1935 }, { "epoch": 1.494980694980695, "grad_norm": 4.15625, "learning_rate": 3.9502263993724046e-07, "loss": 0.798, "step": 1936 }, { "epoch": 1.4957528957528958, "grad_norm": 4.03125, "learning_rate": 3.9388334182238945e-07, "loss": 0.9122, "step": 1937 }, { "epoch": 1.4965250965250965, "grad_norm": 10.625, "learning_rate": 3.9274538168149316e-07, "loss": 0.893, "step": 1938 }, { "epoch": 1.4972972972972973, "grad_norm": 3.671875, "learning_rate": 3.9160876129300056e-07, "loss": 0.8205, "step": 1939 }, { "epoch": 1.4980694980694982, "grad_norm": 3.921875, "learning_rate": 3.9047348243326927e-07, "loss": 0.9854, "step": 1940 }, { "epoch": 1.4988416988416988, "grad_norm": 85.5, "learning_rate": 3.89339546876559e-07, "loss": 0.8733, "step": 1941 }, { "epoch": 1.4996138996138997, "grad_norm": 3.953125, "learning_rate": 3.8820695639503e-07, "loss": 0.8645, "step": 1942 }, { "epoch": 1.5003861003861005, "grad_norm": 3.8125, "learning_rate": 3.8707571275874184e-07, "loss": 0.8467, "step": 1943 }, { "epoch": 1.5011583011583012, "grad_norm": 4.09375, "learning_rate": 3.8594581773564664e-07, "loss": 0.88, "step": 1944 }, { "epoch": 1.5019305019305018, "grad_norm": 4.28125, "learning_rate": 3.848172730915915e-07, "loss": 0.8751, "step": 1945 }, { "epoch": 1.5027027027027027, "grad_norm": 3.875, "learning_rate": 3.8369008059031154e-07, "loss": 0.85, "step": 1946 }, { "epoch": 1.5034749034749035, "grad_norm": 3.46875, "learning_rate": 3.825642419934289e-07, "loss": 0.8112, "step": 1947 }, { "epoch": 1.5042471042471042, "grad_norm": 9.4375, "learning_rate": 3.814397590604502e-07, "loss": 0.7744, "step": 1948 }, { "epoch": 1.505019305019305, "grad_norm": 4.1875, "learning_rate": 3.803166335487631e-07, "loss": 0.9275, "step": 1949 }, { "epoch": 1.505791505791506, "grad_norm": 5.53125, "learning_rate": 3.791948672136332e-07, "loss": 0.7586, "step": 1950 }, { "epoch": 1.5065637065637065, "grad_norm": 3.59375, "learning_rate": 3.7807446180820356e-07, "loss": 0.8529, "step": 1951 }, { "epoch": 1.5073359073359074, "grad_norm": 4.3125, "learning_rate": 3.7695541908348914e-07, "loss": 0.8423, "step": 1952 }, { "epoch": 1.5081081081081082, "grad_norm": 3.484375, "learning_rate": 3.7583774078837466e-07, "loss": 0.7884, "step": 1953 }, { "epoch": 1.5088803088803089, "grad_norm": 4.5, "learning_rate": 3.747214286696148e-07, "loss": 0.7954, "step": 1954 }, { "epoch": 1.5096525096525095, "grad_norm": 3.9375, "learning_rate": 3.736064844718257e-07, "loss": 0.9079, "step": 1955 }, { "epoch": 1.5104247104247104, "grad_norm": 3.65625, "learning_rate": 3.724929099374892e-07, "loss": 0.8001, "step": 1956 }, { "epoch": 1.5111969111969112, "grad_norm": 3.578125, "learning_rate": 3.713807068069433e-07, "loss": 0.6988, "step": 1957 }, { "epoch": 1.5119691119691119, "grad_norm": 3.953125, "learning_rate": 3.7026987681838573e-07, "loss": 0.8707, "step": 1958 }, { "epoch": 1.5127413127413127, "grad_norm": 4.3125, "learning_rate": 3.6916042170786607e-07, "loss": 0.8502, "step": 1959 }, { "epoch": 1.5135135135135136, "grad_norm": 12.4375, "learning_rate": 3.680523432092857e-07, "loss": 0.7906, "step": 1960 }, { "epoch": 1.5142857142857142, "grad_norm": 4.28125, "learning_rate": 3.669456430543955e-07, "loss": 1.0032, "step": 1961 }, { "epoch": 1.515057915057915, "grad_norm": 5.0, "learning_rate": 3.6584032297279135e-07, "loss": 0.9378, "step": 1962 }, { "epoch": 1.515830115830116, "grad_norm": 4.84375, "learning_rate": 3.647363846919121e-07, "loss": 0.8543, "step": 1963 }, { "epoch": 1.5166023166023166, "grad_norm": 4.34375, "learning_rate": 3.636338299370376e-07, "loss": 0.9627, "step": 1964 }, { "epoch": 1.5173745173745172, "grad_norm": 3.953125, "learning_rate": 3.6253266043128537e-07, "loss": 0.8982, "step": 1965 }, { "epoch": 1.5181467181467183, "grad_norm": 3.78125, "learning_rate": 3.614328778956075e-07, "loss": 0.7162, "step": 1966 }, { "epoch": 1.518918918918919, "grad_norm": 6.75, "learning_rate": 3.603344840487899e-07, "loss": 0.876, "step": 1967 }, { "epoch": 1.5196911196911196, "grad_norm": 8.8125, "learning_rate": 3.5923748060744586e-07, "loss": 0.8265, "step": 1968 }, { "epoch": 1.5204633204633204, "grad_norm": 3.734375, "learning_rate": 3.5814186928601794e-07, "loss": 0.7416, "step": 1969 }, { "epoch": 1.5212355212355213, "grad_norm": 6.1875, "learning_rate": 3.570476517967718e-07, "loss": 0.8194, "step": 1970 }, { "epoch": 1.522007722007722, "grad_norm": 4.1875, "learning_rate": 3.5595482984979475e-07, "loss": 0.8913, "step": 1971 }, { "epoch": 1.5227799227799228, "grad_norm": 5.03125, "learning_rate": 3.548634051529944e-07, "loss": 0.8912, "step": 1972 }, { "epoch": 1.5235521235521237, "grad_norm": 4.09375, "learning_rate": 3.5377337941209233e-07, "loss": 0.7951, "step": 1973 }, { "epoch": 1.5243243243243243, "grad_norm": 3.609375, "learning_rate": 3.526847543306261e-07, "loss": 0.803, "step": 1974 }, { "epoch": 1.525096525096525, "grad_norm": 3.375, "learning_rate": 3.5159753160994307e-07, "loss": 0.7604, "step": 1975 }, { "epoch": 1.525868725868726, "grad_norm": 5.25, "learning_rate": 3.5051171294919915e-07, "loss": 0.8083, "step": 1976 }, { "epoch": 1.5266409266409267, "grad_norm": 3.890625, "learning_rate": 3.494273000453553e-07, "loss": 0.841, "step": 1977 }, { "epoch": 1.5274131274131273, "grad_norm": 3.828125, "learning_rate": 3.4834429459317767e-07, "loss": 0.8076, "step": 1978 }, { "epoch": 1.5281853281853282, "grad_norm": 5.46875, "learning_rate": 3.472626982852294e-07, "loss": 0.8844, "step": 1979 }, { "epoch": 1.528957528957529, "grad_norm": 3.953125, "learning_rate": 3.461825128118745e-07, "loss": 0.7406, "step": 1980 }, { "epoch": 1.5297297297297296, "grad_norm": 4.09375, "learning_rate": 3.451037398612704e-07, "loss": 0.9509, "step": 1981 }, { "epoch": 1.5305019305019305, "grad_norm": 3.796875, "learning_rate": 3.44026381119367e-07, "loss": 0.8642, "step": 1982 }, { "epoch": 1.5312741312741314, "grad_norm": 3.546875, "learning_rate": 3.4295043826990545e-07, "loss": 0.8242, "step": 1983 }, { "epoch": 1.532046332046332, "grad_norm": 3.65625, "learning_rate": 3.418759129944116e-07, "loss": 0.8371, "step": 1984 }, { "epoch": 1.5328185328185329, "grad_norm": 3.859375, "learning_rate": 3.408028069721983e-07, "loss": 1.0067, "step": 1985 }, { "epoch": 1.5335907335907337, "grad_norm": 3.6875, "learning_rate": 3.3973112188035906e-07, "loss": 0.7583, "step": 1986 }, { "epoch": 1.5343629343629344, "grad_norm": 3.515625, "learning_rate": 3.3866085939376696e-07, "loss": 0.764, "step": 1987 }, { "epoch": 1.535135135135135, "grad_norm": 3.71875, "learning_rate": 3.375920211850717e-07, "loss": 0.7887, "step": 1988 }, { "epoch": 1.5359073359073359, "grad_norm": 4.40625, "learning_rate": 3.3652460892469707e-07, "loss": 0.821, "step": 1989 }, { "epoch": 1.5366795366795367, "grad_norm": 3.6875, "learning_rate": 3.3545862428083817e-07, "loss": 0.7202, "step": 1990 }, { "epoch": 1.5374517374517374, "grad_norm": 4.03125, "learning_rate": 3.3439406891945983e-07, "loss": 0.8774, "step": 1991 }, { "epoch": 1.5382239382239382, "grad_norm": 3.65625, "learning_rate": 3.3333094450429204e-07, "loss": 0.8679, "step": 1992 }, { "epoch": 1.538996138996139, "grad_norm": 6.8125, "learning_rate": 3.322692526968288e-07, "loss": 0.8909, "step": 1993 }, { "epoch": 1.5397683397683397, "grad_norm": 3.8125, "learning_rate": 3.3120899515632604e-07, "loss": 0.896, "step": 1994 }, { "epoch": 1.5405405405405406, "grad_norm": 4.125, "learning_rate": 3.301501735397962e-07, "loss": 0.7783, "step": 1995 }, { "epoch": 1.5413127413127414, "grad_norm": 4.15625, "learning_rate": 3.2909278950201017e-07, "loss": 0.8713, "step": 1996 }, { "epoch": 1.542084942084942, "grad_norm": 4.09375, "learning_rate": 3.280368446954892e-07, "loss": 0.8557, "step": 1997 }, { "epoch": 1.5428571428571427, "grad_norm": 4.125, "learning_rate": 3.269823407705079e-07, "loss": 0.9265, "step": 1998 }, { "epoch": 1.5436293436293438, "grad_norm": 4.3125, "learning_rate": 3.259292793750878e-07, "loss": 0.887, "step": 1999 }, { "epoch": 1.5444015444015444, "grad_norm": 3.5, "learning_rate": 3.248776621549954e-07, "loss": 0.9071, "step": 2000 }, { "epoch": 1.545173745173745, "grad_norm": 5.28125, "learning_rate": 3.2382749075374197e-07, "loss": 0.7407, "step": 2001 }, { "epoch": 1.545945945945946, "grad_norm": 3.921875, "learning_rate": 3.2277876681257745e-07, "loss": 0.9859, "step": 2002 }, { "epoch": 1.5467181467181468, "grad_norm": 3.8125, "learning_rate": 3.2173149197049045e-07, "loss": 0.8294, "step": 2003 }, { "epoch": 1.5474903474903474, "grad_norm": 4.09375, "learning_rate": 3.2068566786420453e-07, "loss": 0.8879, "step": 2004 }, { "epoch": 1.5482625482625483, "grad_norm": 8.25, "learning_rate": 3.196412961281764e-07, "loss": 0.8422, "step": 2005 }, { "epoch": 1.5490347490347491, "grad_norm": 3.625, "learning_rate": 3.185983783945923e-07, "loss": 0.7871, "step": 2006 }, { "epoch": 1.5498069498069498, "grad_norm": 19.25, "learning_rate": 3.1755691629336756e-07, "loss": 0.7753, "step": 2007 }, { "epoch": 1.5505791505791506, "grad_norm": 3.640625, "learning_rate": 3.1651691145214004e-07, "loss": 0.7977, "step": 2008 }, { "epoch": 1.5513513513513515, "grad_norm": 5.875, "learning_rate": 3.154783654962727e-07, "loss": 0.9963, "step": 2009 }, { "epoch": 1.5521235521235521, "grad_norm": 4.09375, "learning_rate": 3.144412800488471e-07, "loss": 0.8576, "step": 2010 }, { "epoch": 1.5528957528957528, "grad_norm": 4.0625, "learning_rate": 3.1340565673066234e-07, "loss": 0.8107, "step": 2011 }, { "epoch": 1.5536679536679536, "grad_norm": 6.5625, "learning_rate": 3.123714971602336e-07, "loss": 0.8451, "step": 2012 }, { "epoch": 1.5544401544401545, "grad_norm": 3.734375, "learning_rate": 3.113388029537862e-07, "loss": 0.8984, "step": 2013 }, { "epoch": 1.5552123552123551, "grad_norm": 4.1875, "learning_rate": 3.103075757252576e-07, "loss": 0.8164, "step": 2014 }, { "epoch": 1.555984555984556, "grad_norm": 4.4375, "learning_rate": 3.092778170862916e-07, "loss": 0.7757, "step": 2015 }, { "epoch": 1.5567567567567568, "grad_norm": 4.28125, "learning_rate": 3.082495286462367e-07, "loss": 0.9107, "step": 2016 }, { "epoch": 1.5575289575289575, "grad_norm": 4.65625, "learning_rate": 3.072227120121435e-07, "loss": 0.8618, "step": 2017 }, { "epoch": 1.5583011583011583, "grad_norm": 3.734375, "learning_rate": 3.0619736878876413e-07, "loss": 0.7495, "step": 2018 }, { "epoch": 1.5590733590733592, "grad_norm": 4.3125, "learning_rate": 3.05173500578545e-07, "loss": 0.9266, "step": 2019 }, { "epoch": 1.5598455598455598, "grad_norm": 3.875, "learning_rate": 3.0415110898163043e-07, "loss": 0.7565, "step": 2020 }, { "epoch": 1.5606177606177605, "grad_norm": 3.75, "learning_rate": 3.03130195595855e-07, "loss": 0.7987, "step": 2021 }, { "epoch": 1.5613899613899616, "grad_norm": 4.9375, "learning_rate": 3.0211076201674336e-07, "loss": 0.9621, "step": 2022 }, { "epoch": 1.5621621621621622, "grad_norm": 4.34375, "learning_rate": 3.010928098375089e-07, "loss": 1.0032, "step": 2023 }, { "epoch": 1.5629343629343628, "grad_norm": 3.578125, "learning_rate": 3.000763406490474e-07, "loss": 0.8574, "step": 2024 }, { "epoch": 1.5637065637065637, "grad_norm": 3.78125, "learning_rate": 2.990613560399391e-07, "loss": 0.6449, "step": 2025 }, { "epoch": 1.5644787644787645, "grad_norm": 3.859375, "learning_rate": 2.9804785759644323e-07, "loss": 0.8699, "step": 2026 }, { "epoch": 1.5652509652509652, "grad_norm": 4.40625, "learning_rate": 2.97035846902496e-07, "loss": 0.9597, "step": 2027 }, { "epoch": 1.566023166023166, "grad_norm": 6.625, "learning_rate": 2.960253255397093e-07, "loss": 0.9996, "step": 2028 }, { "epoch": 1.566795366795367, "grad_norm": 3.828125, "learning_rate": 2.950162950873667e-07, "loss": 0.7568, "step": 2029 }, { "epoch": 1.5675675675675675, "grad_norm": 4.3125, "learning_rate": 2.940087571224219e-07, "loss": 0.8639, "step": 2030 }, { "epoch": 1.5683397683397682, "grad_norm": 7.0, "learning_rate": 2.930027132194969e-07, "loss": 0.8313, "step": 2031 }, { "epoch": 1.5691119691119693, "grad_norm": 3.90625, "learning_rate": 2.9199816495087784e-07, "loss": 0.8174, "step": 2032 }, { "epoch": 1.56988416988417, "grad_norm": 4.59375, "learning_rate": 2.90995113886513e-07, "loss": 0.887, "step": 2033 }, { "epoch": 1.5706563706563705, "grad_norm": 4.59375, "learning_rate": 2.8999356159401263e-07, "loss": 0.9379, "step": 2034 }, { "epoch": 1.5714285714285714, "grad_norm": 7.875, "learning_rate": 2.8899350963864206e-07, "loss": 0.8935, "step": 2035 }, { "epoch": 1.5722007722007723, "grad_norm": 3.84375, "learning_rate": 2.879949595833244e-07, "loss": 0.8508, "step": 2036 }, { "epoch": 1.572972972972973, "grad_norm": 3.734375, "learning_rate": 2.8699791298863315e-07, "loss": 0.9406, "step": 2037 }, { "epoch": 1.5737451737451738, "grad_norm": 4.5625, "learning_rate": 2.8600237141279406e-07, "loss": 0.866, "step": 2038 }, { "epoch": 1.5745173745173746, "grad_norm": 3.484375, "learning_rate": 2.8500833641167986e-07, "loss": 0.772, "step": 2039 }, { "epoch": 1.5752895752895753, "grad_norm": 7.84375, "learning_rate": 2.8401580953880863e-07, "loss": 0.8205, "step": 2040 }, { "epoch": 1.576061776061776, "grad_norm": 3.90625, "learning_rate": 2.8302479234534234e-07, "loss": 0.8232, "step": 2041 }, { "epoch": 1.576833976833977, "grad_norm": 5.84375, "learning_rate": 2.8203528638008253e-07, "loss": 0.8926, "step": 2042 }, { "epoch": 1.5776061776061776, "grad_norm": 4.53125, "learning_rate": 2.8104729318946964e-07, "loss": 0.8505, "step": 2043 }, { "epoch": 1.5783783783783782, "grad_norm": 3.5, "learning_rate": 2.800608143175794e-07, "loss": 0.7754, "step": 2044 }, { "epoch": 1.579150579150579, "grad_norm": 3.859375, "learning_rate": 2.790758513061213e-07, "loss": 0.9453, "step": 2045 }, { "epoch": 1.57992277992278, "grad_norm": 4.9375, "learning_rate": 2.7809240569443537e-07, "loss": 0.9201, "step": 2046 }, { "epoch": 1.5806949806949806, "grad_norm": 4.28125, "learning_rate": 2.771104790194912e-07, "loss": 0.8929, "step": 2047 }, { "epoch": 1.5814671814671815, "grad_norm": 4.375, "learning_rate": 2.761300728158824e-07, "loss": 0.9495, "step": 2048 }, { "epoch": 1.5822393822393823, "grad_norm": 3.375, "learning_rate": 2.7515118861582895e-07, "loss": 0.743, "step": 2049 }, { "epoch": 1.583011583011583, "grad_norm": 3.765625, "learning_rate": 2.741738279491704e-07, "loss": 0.8024, "step": 2050 }, { "epoch": 1.5837837837837838, "grad_norm": 3.6875, "learning_rate": 2.7319799234336546e-07, "loss": 0.8314, "step": 2051 }, { "epoch": 1.5845559845559847, "grad_norm": 3.734375, "learning_rate": 2.722236833234908e-07, "loss": 0.8529, "step": 2052 }, { "epoch": 1.5853281853281853, "grad_norm": 5.03125, "learning_rate": 2.712509024122348e-07, "loss": 0.963, "step": 2053 }, { "epoch": 1.586100386100386, "grad_norm": 4.0625, "learning_rate": 2.702796511299e-07, "loss": 0.8962, "step": 2054 }, { "epoch": 1.586872586872587, "grad_norm": 4.375, "learning_rate": 2.6930993099439723e-07, "loss": 0.9535, "step": 2055 }, { "epoch": 1.5876447876447877, "grad_norm": 3.953125, "learning_rate": 2.6834174352124444e-07, "loss": 0.8317, "step": 2056 }, { "epoch": 1.5884169884169883, "grad_norm": 4.3125, "learning_rate": 2.673750902235643e-07, "loss": 0.7368, "step": 2057 }, { "epoch": 1.5891891891891892, "grad_norm": 3.984375, "learning_rate": 2.664099726120828e-07, "loss": 0.8885, "step": 2058 }, { "epoch": 1.58996138996139, "grad_norm": 4.1875, "learning_rate": 2.6544639219512365e-07, "loss": 0.7021, "step": 2059 }, { "epoch": 1.5907335907335907, "grad_norm": 5.09375, "learning_rate": 2.644843504786104e-07, "loss": 0.8797, "step": 2060 }, { "epoch": 1.5915057915057915, "grad_norm": 4.5, "learning_rate": 2.6352384896606104e-07, "loss": 0.765, "step": 2061 }, { "epoch": 1.5922779922779924, "grad_norm": 4.1875, "learning_rate": 2.625648891585858e-07, "loss": 0.9476, "step": 2062 }, { "epoch": 1.593050193050193, "grad_norm": 3.9375, "learning_rate": 2.616074725548871e-07, "loss": 0.697, "step": 2063 }, { "epoch": 1.5938223938223937, "grad_norm": 6.0, "learning_rate": 2.6065160065125343e-07, "loss": 0.8712, "step": 2064 }, { "epoch": 1.5945945945945947, "grad_norm": 3.734375, "learning_rate": 2.5969727494156107e-07, "loss": 0.9534, "step": 2065 }, { "epoch": 1.5953667953667954, "grad_norm": 4.0625, "learning_rate": 2.587444969172689e-07, "loss": 0.7774, "step": 2066 }, { "epoch": 1.596138996138996, "grad_norm": 3.65625, "learning_rate": 2.5779326806741724e-07, "loss": 0.8722, "step": 2067 }, { "epoch": 1.5969111969111969, "grad_norm": 5.6875, "learning_rate": 2.568435898786249e-07, "loss": 0.8801, "step": 2068 }, { "epoch": 1.5976833976833977, "grad_norm": 3.578125, "learning_rate": 2.5589546383508773e-07, "loss": 0.806, "step": 2069 }, { "epoch": 1.5984555984555984, "grad_norm": 3.640625, "learning_rate": 2.549488914185762e-07, "loss": 0.7331, "step": 2070 }, { "epoch": 1.5992277992277992, "grad_norm": 4.125, "learning_rate": 2.5400387410843165e-07, "loss": 0.898, "step": 2071 }, { "epoch": 1.6, "grad_norm": 4.25, "learning_rate": 2.5306041338156577e-07, "loss": 0.9019, "step": 2072 }, { "epoch": 1.6007722007722007, "grad_norm": 5.4375, "learning_rate": 2.5211851071245715e-07, "loss": 0.858, "step": 2073 }, { "epoch": 1.6015444015444016, "grad_norm": 3.875, "learning_rate": 2.511781675731505e-07, "loss": 0.7996, "step": 2074 }, { "epoch": 1.6023166023166024, "grad_norm": 4.75, "learning_rate": 2.502393854332509e-07, "loss": 0.9293, "step": 2075 }, { "epoch": 1.603088803088803, "grad_norm": 4.0625, "learning_rate": 2.493021657599268e-07, "loss": 0.7975, "step": 2076 }, { "epoch": 1.6038610038610037, "grad_norm": 4.3125, "learning_rate": 2.483665100179018e-07, "loss": 0.8481, "step": 2077 }, { "epoch": 1.6046332046332046, "grad_norm": 3.609375, "learning_rate": 2.4743241966945786e-07, "loss": 0.7867, "step": 2078 }, { "epoch": 1.6054054054054054, "grad_norm": 7.21875, "learning_rate": 2.46499896174429e-07, "loss": 0.8224, "step": 2079 }, { "epoch": 1.606177606177606, "grad_norm": 3.796875, "learning_rate": 2.4556894099020053e-07, "loss": 0.9208, "step": 2080 }, { "epoch": 1.606949806949807, "grad_norm": 4.09375, "learning_rate": 2.4463955557170793e-07, "loss": 0.8739, "step": 2081 }, { "epoch": 1.6077220077220078, "grad_norm": 4.125, "learning_rate": 2.4371174137143196e-07, "loss": 0.8571, "step": 2082 }, { "epoch": 1.6084942084942084, "grad_norm": 4.5625, "learning_rate": 2.427854998393986e-07, "loss": 0.9042, "step": 2083 }, { "epoch": 1.6092664092664093, "grad_norm": 4.0, "learning_rate": 2.4186083242317583e-07, "loss": 0.8372, "step": 2084 }, { "epoch": 1.6100386100386102, "grad_norm": 3.78125, "learning_rate": 2.409377405678713e-07, "loss": 0.8675, "step": 2085 }, { "epoch": 1.6108108108108108, "grad_norm": 4.5625, "learning_rate": 2.400162257161304e-07, "loss": 0.8134, "step": 2086 }, { "epoch": 1.6115830115830114, "grad_norm": 4.28125, "learning_rate": 2.39096289308135e-07, "loss": 0.9093, "step": 2087 }, { "epoch": 1.6123552123552125, "grad_norm": 3.96875, "learning_rate": 2.381779327815978e-07, "loss": 0.9224, "step": 2088 }, { "epoch": 1.6131274131274131, "grad_norm": 6.75, "learning_rate": 2.372611575717647e-07, "loss": 0.773, "step": 2089 }, { "epoch": 1.6138996138996138, "grad_norm": 4.03125, "learning_rate": 2.3634596511140902e-07, "loss": 0.9715, "step": 2090 }, { "epoch": 1.6146718146718146, "grad_norm": 4.09375, "learning_rate": 2.3543235683083046e-07, "loss": 0.8521, "step": 2091 }, { "epoch": 1.6154440154440155, "grad_norm": 4.1875, "learning_rate": 2.3452033415785434e-07, "loss": 0.9063, "step": 2092 }, { "epoch": 1.6162162162162161, "grad_norm": 6.6875, "learning_rate": 2.3360989851782521e-07, "loss": 0.7847, "step": 2093 }, { "epoch": 1.616988416988417, "grad_norm": 5.28125, "learning_rate": 2.3270105133361019e-07, "loss": 0.9389, "step": 2094 }, { "epoch": 1.6177606177606179, "grad_norm": 3.78125, "learning_rate": 2.317937940255921e-07, "loss": 0.764, "step": 2095 }, { "epoch": 1.6185328185328185, "grad_norm": 4.125, "learning_rate": 2.308881280116697e-07, "loss": 0.8059, "step": 2096 }, { "epoch": 1.6193050193050194, "grad_norm": 4.25, "learning_rate": 2.2998405470725447e-07, "loss": 0.8568, "step": 2097 }, { "epoch": 1.6200772200772202, "grad_norm": 4.65625, "learning_rate": 2.290815755252697e-07, "loss": 0.907, "step": 2098 }, { "epoch": 1.6208494208494209, "grad_norm": 4.46875, "learning_rate": 2.2818069187614548e-07, "loss": 0.8965, "step": 2099 }, { "epoch": 1.6216216216216215, "grad_norm": 5.125, "learning_rate": 2.272814051678203e-07, "loss": 0.7963, "step": 2100 }, { "epoch": 1.6223938223938223, "grad_norm": 27.75, "learning_rate": 2.2638371680573575e-07, "loss": 0.793, "step": 2101 }, { "epoch": 1.6231660231660232, "grad_norm": 4.78125, "learning_rate": 2.2548762819283546e-07, "loss": 0.7976, "step": 2102 }, { "epoch": 1.6239382239382238, "grad_norm": 4.78125, "learning_rate": 2.2459314072956402e-07, "loss": 0.8545, "step": 2103 }, { "epoch": 1.6247104247104247, "grad_norm": 3.484375, "learning_rate": 2.2370025581386165e-07, "loss": 0.6776, "step": 2104 }, { "epoch": 1.6254826254826256, "grad_norm": 5.5625, "learning_rate": 2.2280897484116632e-07, "loss": 0.8512, "step": 2105 }, { "epoch": 1.6262548262548262, "grad_norm": 12.0625, "learning_rate": 2.2191929920440752e-07, "loss": 0.8828, "step": 2106 }, { "epoch": 1.627027027027027, "grad_norm": 3.859375, "learning_rate": 2.210312302940068e-07, "loss": 0.9239, "step": 2107 }, { "epoch": 1.627799227799228, "grad_norm": 3.953125, "learning_rate": 2.201447694978745e-07, "loss": 0.9256, "step": 2108 }, { "epoch": 1.6285714285714286, "grad_norm": 6.59375, "learning_rate": 2.1925991820140698e-07, "loss": 0.9403, "step": 2109 }, { "epoch": 1.6293436293436292, "grad_norm": 3.796875, "learning_rate": 2.183766777874867e-07, "loss": 0.8205, "step": 2110 }, { "epoch": 1.6301158301158303, "grad_norm": 3.875, "learning_rate": 2.174950496364775e-07, "loss": 0.8662, "step": 2111 }, { "epoch": 1.630888030888031, "grad_norm": 3.90625, "learning_rate": 2.1661503512622348e-07, "loss": 0.827, "step": 2112 }, { "epoch": 1.6316602316602316, "grad_norm": 3.765625, "learning_rate": 2.15736635632047e-07, "loss": 0.8417, "step": 2113 }, { "epoch": 1.6324324324324324, "grad_norm": 4.34375, "learning_rate": 2.1485985252674742e-07, "loss": 0.7901, "step": 2114 }, { "epoch": 1.6332046332046333, "grad_norm": 6.71875, "learning_rate": 2.139846871805959e-07, "loss": 0.8608, "step": 2115 }, { "epoch": 1.633976833976834, "grad_norm": 4.59375, "learning_rate": 2.131111409613375e-07, "loss": 0.7209, "step": 2116 }, { "epoch": 1.6347490347490348, "grad_norm": 4.28125, "learning_rate": 2.122392152341847e-07, "loss": 0.8082, "step": 2117 }, { "epoch": 1.6355212355212356, "grad_norm": 4.875, "learning_rate": 2.1136891136181948e-07, "loss": 0.9169, "step": 2118 }, { "epoch": 1.6362934362934363, "grad_norm": 3.640625, "learning_rate": 2.1050023070438768e-07, "loss": 0.7894, "step": 2119 }, { "epoch": 1.637065637065637, "grad_norm": 4.09375, "learning_rate": 2.0963317461949865e-07, "loss": 0.8587, "step": 2120 }, { "epoch": 1.637837837837838, "grad_norm": 3.78125, "learning_rate": 2.0876774446222353e-07, "loss": 0.8834, "step": 2121 }, { "epoch": 1.6386100386100386, "grad_norm": 4.0, "learning_rate": 2.0790394158509127e-07, "loss": 0.8652, "step": 2122 }, { "epoch": 1.6393822393822393, "grad_norm": 3.890625, "learning_rate": 2.070417673380884e-07, "loss": 0.7897, "step": 2123 }, { "epoch": 1.6401544401544401, "grad_norm": 4.1875, "learning_rate": 2.0618122306865578e-07, "loss": 0.8949, "step": 2124 }, { "epoch": 1.640926640926641, "grad_norm": 3.859375, "learning_rate": 2.053223101216871e-07, "loss": 0.9243, "step": 2125 }, { "epoch": 1.6416988416988416, "grad_norm": 4.625, "learning_rate": 2.0446502983952606e-07, "loss": 0.9206, "step": 2126 }, { "epoch": 1.6424710424710425, "grad_norm": 4.25, "learning_rate": 2.0360938356196625e-07, "loss": 0.8421, "step": 2127 }, { "epoch": 1.6432432432432433, "grad_norm": 3.65625, "learning_rate": 2.0275537262624504e-07, "loss": 0.7157, "step": 2128 }, { "epoch": 1.644015444015444, "grad_norm": 3.953125, "learning_rate": 2.019029983670466e-07, "loss": 0.7937, "step": 2129 }, { "epoch": 1.6447876447876448, "grad_norm": 3.734375, "learning_rate": 2.010522621164958e-07, "loss": 0.7169, "step": 2130 }, { "epoch": 1.6455598455598457, "grad_norm": 4.5625, "learning_rate": 2.0020316520415736e-07, "loss": 1.0352, "step": 2131 }, { "epoch": 1.6463320463320463, "grad_norm": 3.640625, "learning_rate": 1.993557089570357e-07, "loss": 0.8419, "step": 2132 }, { "epoch": 1.647104247104247, "grad_norm": 5.1875, "learning_rate": 1.9850989469956852e-07, "loss": 0.8352, "step": 2133 }, { "epoch": 1.6478764478764478, "grad_norm": 3.921875, "learning_rate": 1.976657237536296e-07, "loss": 0.875, "step": 2134 }, { "epoch": 1.6486486486486487, "grad_norm": 3.890625, "learning_rate": 1.968231974385236e-07, "loss": 0.9168, "step": 2135 }, { "epoch": 1.6494208494208493, "grad_norm": 4.53125, "learning_rate": 1.9598231707098472e-07, "loss": 0.8112, "step": 2136 }, { "epoch": 1.6501930501930502, "grad_norm": 5.5, "learning_rate": 1.9514308396517467e-07, "loss": 0.9693, "step": 2137 }, { "epoch": 1.650965250965251, "grad_norm": 4.0, "learning_rate": 1.9430549943268235e-07, "loss": 0.7272, "step": 2138 }, { "epoch": 1.6517374517374517, "grad_norm": 3.703125, "learning_rate": 1.9346956478251744e-07, "loss": 0.6572, "step": 2139 }, { "epoch": 1.6525096525096525, "grad_norm": 3.53125, "learning_rate": 1.9263528132111354e-07, "loss": 0.7338, "step": 2140 }, { "epoch": 1.6532818532818534, "grad_norm": 6.9375, "learning_rate": 1.9180265035232265e-07, "loss": 0.8303, "step": 2141 }, { "epoch": 1.654054054054054, "grad_norm": 3.796875, "learning_rate": 1.9097167317741389e-07, "loss": 0.8474, "step": 2142 }, { "epoch": 1.6548262548262547, "grad_norm": 17.5, "learning_rate": 1.9014235109507327e-07, "loss": 0.9274, "step": 2143 }, { "epoch": 1.6555984555984558, "grad_norm": 4.59375, "learning_rate": 1.893146854013976e-07, "loss": 0.9039, "step": 2144 }, { "epoch": 1.6563706563706564, "grad_norm": 3.90625, "learning_rate": 1.884886773898978e-07, "loss": 0.8454, "step": 2145 }, { "epoch": 1.657142857142857, "grad_norm": 3.859375, "learning_rate": 1.8766432835149215e-07, "loss": 0.802, "step": 2146 }, { "epoch": 1.6579150579150579, "grad_norm": 3.859375, "learning_rate": 1.868416395745072e-07, "loss": 0.885, "step": 2147 }, { "epoch": 1.6586872586872587, "grad_norm": 4.78125, "learning_rate": 1.8602061234467382e-07, "loss": 0.7957, "step": 2148 }, { "epoch": 1.6594594594594594, "grad_norm": 4.09375, "learning_rate": 1.852012479451276e-07, "loss": 0.8754, "step": 2149 }, { "epoch": 1.6602316602316602, "grad_norm": 2.828125, "learning_rate": 1.843835476564043e-07, "loss": 0.4895, "step": 2150 }, { "epoch": 1.661003861003861, "grad_norm": 6.625, "learning_rate": 1.8356751275643904e-07, "loss": 0.9432, "step": 2151 }, { "epoch": 1.6617760617760617, "grad_norm": 3.71875, "learning_rate": 1.8275314452056436e-07, "loss": 0.8977, "step": 2152 }, { "epoch": 1.6625482625482626, "grad_norm": 4.3125, "learning_rate": 1.8194044422150795e-07, "loss": 0.9572, "step": 2153 }, { "epoch": 1.6633204633204635, "grad_norm": 4.125, "learning_rate": 1.811294131293917e-07, "loss": 0.9976, "step": 2154 }, { "epoch": 1.664092664092664, "grad_norm": 4.9375, "learning_rate": 1.8032005251172685e-07, "loss": 0.8503, "step": 2155 }, { "epoch": 1.6648648648648647, "grad_norm": 4.0625, "learning_rate": 1.7951236363341605e-07, "loss": 0.8538, "step": 2156 }, { "epoch": 1.6656370656370656, "grad_norm": 4.625, "learning_rate": 1.7870634775674786e-07, "loss": 0.8969, "step": 2157 }, { "epoch": 1.6664092664092665, "grad_norm": 4.0625, "learning_rate": 1.779020061413969e-07, "loss": 0.882, "step": 2158 }, { "epoch": 1.667181467181467, "grad_norm": 4.09375, "learning_rate": 1.7709934004442098e-07, "loss": 0.839, "step": 2159 }, { "epoch": 1.667953667953668, "grad_norm": 4.25, "learning_rate": 1.7629835072025896e-07, "loss": 0.7176, "step": 2160 }, { "epoch": 1.6687258687258688, "grad_norm": 5.0625, "learning_rate": 1.754990394207301e-07, "loss": 0.965, "step": 2161 }, { "epoch": 1.6694980694980694, "grad_norm": 4.3125, "learning_rate": 1.7470140739503038e-07, "loss": 0.7699, "step": 2162 }, { "epoch": 1.6702702702702703, "grad_norm": 3.9375, "learning_rate": 1.7390545588973156e-07, "loss": 0.8677, "step": 2163 }, { "epoch": 1.6710424710424712, "grad_norm": 4.0, "learning_rate": 1.7311118614877886e-07, "loss": 0.8506, "step": 2164 }, { "epoch": 1.6718146718146718, "grad_norm": 3.875, "learning_rate": 1.7231859941349017e-07, "loss": 0.8384, "step": 2165 }, { "epoch": 1.6725868725868724, "grad_norm": 4.34375, "learning_rate": 1.7152769692255113e-07, "loss": 0.7957, "step": 2166 }, { "epoch": 1.6733590733590735, "grad_norm": 4.3125, "learning_rate": 1.707384799120175e-07, "loss": 0.9705, "step": 2167 }, { "epoch": 1.6741312741312742, "grad_norm": 5.5, "learning_rate": 1.699509496153086e-07, "loss": 0.8714, "step": 2168 }, { "epoch": 1.6749034749034748, "grad_norm": 3.921875, "learning_rate": 1.691651072632096e-07, "loss": 0.9803, "step": 2169 }, { "epoch": 1.6756756756756757, "grad_norm": 4.5625, "learning_rate": 1.6838095408386671e-07, "loss": 0.9434, "step": 2170 }, { "epoch": 1.6764478764478765, "grad_norm": 3.640625, "learning_rate": 1.6759849130278617e-07, "loss": 0.8465, "step": 2171 }, { "epoch": 1.6772200772200772, "grad_norm": 17.375, "learning_rate": 1.668177201428331e-07, "loss": 0.7652, "step": 2172 }, { "epoch": 1.677992277992278, "grad_norm": 3.96875, "learning_rate": 1.6603864182422824e-07, "loss": 0.8606, "step": 2173 }, { "epoch": 1.6787644787644789, "grad_norm": 4.125, "learning_rate": 1.6526125756454695e-07, "loss": 0.8369, "step": 2174 }, { "epoch": 1.6795366795366795, "grad_norm": 4.8125, "learning_rate": 1.6448556857871695e-07, "loss": 0.9022, "step": 2175 }, { "epoch": 1.6803088803088801, "grad_norm": 4.34375, "learning_rate": 1.6371157607901645e-07, "loss": 0.9026, "step": 2176 }, { "epoch": 1.6810810810810812, "grad_norm": 5.65625, "learning_rate": 1.629392812750724e-07, "loss": 0.7781, "step": 2177 }, { "epoch": 1.6818532818532819, "grad_norm": 5.15625, "learning_rate": 1.621686853738591e-07, "loss": 0.8366, "step": 2178 }, { "epoch": 1.6826254826254825, "grad_norm": 3.71875, "learning_rate": 1.6139978957969413e-07, "loss": 0.8465, "step": 2179 }, { "epoch": 1.6833976833976834, "grad_norm": 3.796875, "learning_rate": 1.6063259509424014e-07, "loss": 0.8691, "step": 2180 }, { "epoch": 1.6841698841698842, "grad_norm": 3.75, "learning_rate": 1.5986710311649944e-07, "loss": 0.8843, "step": 2181 }, { "epoch": 1.6849420849420849, "grad_norm": 4.125, "learning_rate": 1.5910331484281395e-07, "loss": 0.9727, "step": 2182 }, { "epoch": 1.6857142857142857, "grad_norm": 4.28125, "learning_rate": 1.5834123146686364e-07, "loss": 0.8778, "step": 2183 }, { "epoch": 1.6864864864864866, "grad_norm": 5.03125, "learning_rate": 1.575808541796625e-07, "loss": 0.8711, "step": 2184 }, { "epoch": 1.6872586872586872, "grad_norm": 4.09375, "learning_rate": 1.5682218416955983e-07, "loss": 0.8311, "step": 2185 }, { "epoch": 1.688030888030888, "grad_norm": 3.984375, "learning_rate": 1.5606522262223575e-07, "loss": 0.8942, "step": 2186 }, { "epoch": 1.688803088803089, "grad_norm": 4.21875, "learning_rate": 1.5530997072070063e-07, "loss": 0.8781, "step": 2187 }, { "epoch": 1.6895752895752896, "grad_norm": 3.609375, "learning_rate": 1.545564296452927e-07, "loss": 0.7819, "step": 2188 }, { "epoch": 1.6903474903474902, "grad_norm": 3.640625, "learning_rate": 1.5380460057367694e-07, "loss": 0.7771, "step": 2189 }, { "epoch": 1.691119691119691, "grad_norm": 3.65625, "learning_rate": 1.5305448468084212e-07, "loss": 0.748, "step": 2190 }, { "epoch": 1.691891891891892, "grad_norm": 5.09375, "learning_rate": 1.5230608313910017e-07, "loss": 0.8736, "step": 2191 }, { "epoch": 1.6926640926640926, "grad_norm": 3.96875, "learning_rate": 1.515593971180833e-07, "loss": 0.9818, "step": 2192 }, { "epoch": 1.6934362934362934, "grad_norm": 4.03125, "learning_rate": 1.5081442778474244e-07, "loss": 0.8309, "step": 2193 }, { "epoch": 1.6942084942084943, "grad_norm": 4.4375, "learning_rate": 1.500711763033469e-07, "loss": 0.8696, "step": 2194 }, { "epoch": 1.694980694980695, "grad_norm": 3.984375, "learning_rate": 1.4932964383547937e-07, "loss": 0.7462, "step": 2195 }, { "epoch": 1.6957528957528958, "grad_norm": 4.28125, "learning_rate": 1.485898315400376e-07, "loss": 0.8239, "step": 2196 }, { "epoch": 1.6965250965250966, "grad_norm": 4.40625, "learning_rate": 1.478517405732302e-07, "loss": 0.9517, "step": 2197 }, { "epoch": 1.6972972972972973, "grad_norm": 3.8125, "learning_rate": 1.4711537208857567e-07, "loss": 0.8461, "step": 2198 }, { "epoch": 1.698069498069498, "grad_norm": 4.09375, "learning_rate": 1.463807272369007e-07, "loss": 0.912, "step": 2199 }, { "epoch": 1.698841698841699, "grad_norm": 4.25, "learning_rate": 1.4564780716633794e-07, "loss": 0.8851, "step": 2200 }, { "epoch": 1.6996138996138996, "grad_norm": 3.859375, "learning_rate": 1.4491661302232522e-07, "loss": 0.8122, "step": 2201 }, { "epoch": 1.7003861003861003, "grad_norm": 3.421875, "learning_rate": 1.4418714594760214e-07, "loss": 0.7587, "step": 2202 }, { "epoch": 1.7011583011583011, "grad_norm": 3.78125, "learning_rate": 1.4345940708220956e-07, "loss": 0.7805, "step": 2203 }, { "epoch": 1.701930501930502, "grad_norm": 4.03125, "learning_rate": 1.4273339756348731e-07, "loss": 0.9542, "step": 2204 }, { "epoch": 1.7027027027027026, "grad_norm": 6.28125, "learning_rate": 1.4200911852607315e-07, "loss": 0.8639, "step": 2205 }, { "epoch": 1.7034749034749035, "grad_norm": 12.5625, "learning_rate": 1.4128657110189888e-07, "loss": 0.8769, "step": 2206 }, { "epoch": 1.7042471042471043, "grad_norm": 3.78125, "learning_rate": 1.405657564201922e-07, "loss": 0.8546, "step": 2207 }, { "epoch": 1.705019305019305, "grad_norm": 3.375, "learning_rate": 1.3984667560747038e-07, "loss": 0.6528, "step": 2208 }, { "epoch": 1.7057915057915058, "grad_norm": 3.9375, "learning_rate": 1.3912932978754318e-07, "loss": 0.9005, "step": 2209 }, { "epoch": 1.7065637065637067, "grad_norm": 6.71875, "learning_rate": 1.3841372008150736e-07, "loss": 0.8884, "step": 2210 }, { "epoch": 1.7073359073359073, "grad_norm": 4.75, "learning_rate": 1.3769984760774667e-07, "loss": 0.8473, "step": 2211 }, { "epoch": 1.708108108108108, "grad_norm": 4.09375, "learning_rate": 1.3698771348193072e-07, "loss": 0.9144, "step": 2212 }, { "epoch": 1.7088803088803088, "grad_norm": 3.84375, "learning_rate": 1.362773188170112e-07, "loss": 0.8046, "step": 2213 }, { "epoch": 1.7096525096525097, "grad_norm": 3.703125, "learning_rate": 1.355686647232221e-07, "loss": 0.8214, "step": 2214 }, { "epoch": 1.7104247104247103, "grad_norm": 4.1875, "learning_rate": 1.348617523080767e-07, "loss": 0.7767, "step": 2215 }, { "epoch": 1.7111969111969112, "grad_norm": 4.53125, "learning_rate": 1.3415658267636647e-07, "loss": 0.9369, "step": 2216 }, { "epoch": 1.711969111969112, "grad_norm": 5.875, "learning_rate": 1.334531569301592e-07, "loss": 0.8951, "step": 2217 }, { "epoch": 1.7127413127413127, "grad_norm": 4.1875, "learning_rate": 1.327514761687977e-07, "loss": 0.8101, "step": 2218 }, { "epoch": 1.7135135135135136, "grad_norm": 4.1875, "learning_rate": 1.3205154148889635e-07, "loss": 0.9427, "step": 2219 }, { "epoch": 1.7142857142857144, "grad_norm": 3.796875, "learning_rate": 1.313533539843423e-07, "loss": 0.8344, "step": 2220 }, { "epoch": 1.715057915057915, "grad_norm": 3.609375, "learning_rate": 1.306569147462912e-07, "loss": 0.7608, "step": 2221 }, { "epoch": 1.7158301158301157, "grad_norm": 4.1875, "learning_rate": 1.2996222486316628e-07, "loss": 0.8361, "step": 2222 }, { "epoch": 1.7166023166023165, "grad_norm": 4.5, "learning_rate": 1.29269285420658e-07, "loss": 0.9703, "step": 2223 }, { "epoch": 1.7173745173745174, "grad_norm": 4.375, "learning_rate": 1.2857809750171938e-07, "loss": 0.9258, "step": 2224 }, { "epoch": 1.718146718146718, "grad_norm": 3.8125, "learning_rate": 1.2788866218656756e-07, "loss": 0.7688, "step": 2225 }, { "epoch": 1.718918918918919, "grad_norm": 4.21875, "learning_rate": 1.272009805526801e-07, "loss": 0.8871, "step": 2226 }, { "epoch": 1.7196911196911198, "grad_norm": 3.671875, "learning_rate": 1.2651505367479361e-07, "loss": 0.8744, "step": 2227 }, { "epoch": 1.7204633204633204, "grad_norm": 3.84375, "learning_rate": 1.2583088262490245e-07, "loss": 0.8991, "step": 2228 }, { "epoch": 1.7212355212355213, "grad_norm": 3.796875, "learning_rate": 1.2514846847225746e-07, "loss": 0.9063, "step": 2229 }, { "epoch": 1.7220077220077221, "grad_norm": 3.828125, "learning_rate": 1.24467812283363e-07, "loss": 0.8114, "step": 2230 }, { "epoch": 1.7227799227799228, "grad_norm": 4.21875, "learning_rate": 1.2378891512197622e-07, "loss": 0.9112, "step": 2231 }, { "epoch": 1.7235521235521234, "grad_norm": 3.6875, "learning_rate": 1.231117780491052e-07, "loss": 0.9379, "step": 2232 }, { "epoch": 1.7243243243243245, "grad_norm": 5.0, "learning_rate": 1.2243640212300705e-07, "loss": 0.8696, "step": 2233 }, { "epoch": 1.7250965250965251, "grad_norm": 3.5625, "learning_rate": 1.217627883991876e-07, "loss": 0.8425, "step": 2234 }, { "epoch": 1.7258687258687258, "grad_norm": 3.984375, "learning_rate": 1.2109093793039678e-07, "loss": 0.841, "step": 2235 }, { "epoch": 1.7266409266409266, "grad_norm": 3.71875, "learning_rate": 1.2042085176663055e-07, "loss": 0.8114, "step": 2236 }, { "epoch": 1.7274131274131275, "grad_norm": 3.859375, "learning_rate": 1.197525309551266e-07, "loss": 0.8823, "step": 2237 }, { "epoch": 1.728185328185328, "grad_norm": 4.03125, "learning_rate": 1.1908597654036388e-07, "loss": 0.8006, "step": 2238 }, { "epoch": 1.728957528957529, "grad_norm": 3.796875, "learning_rate": 1.184211895640608e-07, "loss": 0.8574, "step": 2239 }, { "epoch": 1.7297297297297298, "grad_norm": 4.71875, "learning_rate": 1.1775817106517318e-07, "loss": 1.0447, "step": 2240 }, { "epoch": 1.7305019305019305, "grad_norm": 7.59375, "learning_rate": 1.1709692207989399e-07, "loss": 0.9063, "step": 2241 }, { "epoch": 1.7312741312741313, "grad_norm": 4.21875, "learning_rate": 1.1643744364164947e-07, "loss": 0.8997, "step": 2242 }, { "epoch": 1.7320463320463322, "grad_norm": 3.875, "learning_rate": 1.1577973678109963e-07, "loss": 0.818, "step": 2243 }, { "epoch": 1.7328185328185328, "grad_norm": 4.0, "learning_rate": 1.1512380252613531e-07, "loss": 0.8874, "step": 2244 }, { "epoch": 1.7335907335907335, "grad_norm": 7.34375, "learning_rate": 1.1446964190187774e-07, "loss": 0.9153, "step": 2245 }, { "epoch": 1.7343629343629343, "grad_norm": 3.890625, "learning_rate": 1.1381725593067494e-07, "loss": 0.7484, "step": 2246 }, { "epoch": 1.7351351351351352, "grad_norm": 4.65625, "learning_rate": 1.131666456321033e-07, "loss": 0.8394, "step": 2247 }, { "epoch": 1.7359073359073358, "grad_norm": 4.4375, "learning_rate": 1.1251781202296197e-07, "loss": 0.7901, "step": 2248 }, { "epoch": 1.7366795366795367, "grad_norm": 8.125, "learning_rate": 1.1187075611727522e-07, "loss": 0.7334, "step": 2249 }, { "epoch": 1.7374517374517375, "grad_norm": 3.828125, "learning_rate": 1.1122547892628829e-07, "loss": 0.9073, "step": 2250 }, { "epoch": 1.7382239382239382, "grad_norm": 4.375, "learning_rate": 1.1058198145846632e-07, "loss": 0.7993, "step": 2251 }, { "epoch": 1.738996138996139, "grad_norm": 4.6875, "learning_rate": 1.0994026471949374e-07, "loss": 0.9642, "step": 2252 }, { "epoch": 1.7397683397683399, "grad_norm": 3.671875, "learning_rate": 1.0930032971227147e-07, "loss": 0.8579, "step": 2253 }, { "epoch": 1.7405405405405405, "grad_norm": 3.765625, "learning_rate": 1.0866217743691578e-07, "loss": 0.7705, "step": 2254 }, { "epoch": 1.7413127413127412, "grad_norm": 3.5625, "learning_rate": 1.0802580889075723e-07, "loss": 0.8159, "step": 2255 }, { "epoch": 1.7420849420849422, "grad_norm": 3.796875, "learning_rate": 1.073912250683383e-07, "loss": 0.8019, "step": 2256 }, { "epoch": 1.7428571428571429, "grad_norm": 3.96875, "learning_rate": 1.0675842696141234e-07, "loss": 0.9241, "step": 2257 }, { "epoch": 1.7436293436293435, "grad_norm": 4.03125, "learning_rate": 1.0612741555894271e-07, "loss": 0.9009, "step": 2258 }, { "epoch": 1.7444015444015444, "grad_norm": 5.15625, "learning_rate": 1.0549819184709862e-07, "loss": 0.99, "step": 2259 }, { "epoch": 1.7451737451737452, "grad_norm": 4.125, "learning_rate": 1.0487075680925719e-07, "loss": 0.9438, "step": 2260 }, { "epoch": 1.7459459459459459, "grad_norm": 4.09375, "learning_rate": 1.042451114259993e-07, "loss": 0.8868, "step": 2261 }, { "epoch": 1.7467181467181467, "grad_norm": 5.25, "learning_rate": 1.036212566751088e-07, "loss": 0.8821, "step": 2262 }, { "epoch": 1.7474903474903476, "grad_norm": 4.125, "learning_rate": 1.0299919353157189e-07, "loss": 0.806, "step": 2263 }, { "epoch": 1.7482625482625482, "grad_norm": 4.0, "learning_rate": 1.023789229675734e-07, "loss": 0.9001, "step": 2264 }, { "epoch": 1.7490347490347489, "grad_norm": 3.859375, "learning_rate": 1.017604459524979e-07, "loss": 0.798, "step": 2265 }, { "epoch": 1.74980694980695, "grad_norm": 3.78125, "learning_rate": 1.0114376345292623e-07, "loss": 0.7292, "step": 2266 }, { "epoch": 1.7505791505791506, "grad_norm": 3.75, "learning_rate": 1.0052887643263496e-07, "loss": 0.8359, "step": 2267 }, { "epoch": 1.7513513513513512, "grad_norm": 3.53125, "learning_rate": 9.99157858525944e-08, "loss": 0.8582, "step": 2268 }, { "epoch": 1.752123552123552, "grad_norm": 4.15625, "learning_rate": 9.930449267096784e-08, "loss": 0.9261, "step": 2269 }, { "epoch": 1.752895752895753, "grad_norm": 4.1875, "learning_rate": 9.869499784310884e-08, "loss": 0.9841, "step": 2270 }, { "epoch": 1.7536679536679536, "grad_norm": 8.625, "learning_rate": 9.808730232156074e-08, "loss": 0.7505, "step": 2271 }, { "epoch": 1.7544401544401544, "grad_norm": 3.890625, "learning_rate": 9.748140705605508e-08, "loss": 0.9413, "step": 2272 }, { "epoch": 1.7552123552123553, "grad_norm": 4.03125, "learning_rate": 9.68773129935091e-08, "loss": 0.8915, "step": 2273 }, { "epoch": 1.755984555984556, "grad_norm": 4.1875, "learning_rate": 9.627502107802624e-08, "loss": 0.8601, "step": 2274 }, { "epoch": 1.7567567567567568, "grad_norm": 4.40625, "learning_rate": 9.56745322508923e-08, "loss": 0.8339, "step": 2275 }, { "epoch": 1.7575289575289577, "grad_norm": 4.09375, "learning_rate": 9.5075847450576e-08, "loss": 0.8998, "step": 2276 }, { "epoch": 1.7583011583011583, "grad_norm": 4.6875, "learning_rate": 9.447896761272612e-08, "loss": 0.9489, "step": 2277 }, { "epoch": 1.759073359073359, "grad_norm": 3.8125, "learning_rate": 9.388389367017082e-08, "loss": 0.8747, "step": 2278 }, { "epoch": 1.7598455598455598, "grad_norm": 4.28125, "learning_rate": 9.329062655291585e-08, "loss": 0.8748, "step": 2279 }, { "epoch": 1.7606177606177607, "grad_norm": 3.921875, "learning_rate": 9.269916718814304e-08, "loss": 0.788, "step": 2280 }, { "epoch": 1.7613899613899613, "grad_norm": 4.71875, "learning_rate": 9.21095165002095e-08, "loss": 0.8681, "step": 2281 }, { "epoch": 1.7621621621621621, "grad_norm": 9.1875, "learning_rate": 9.152167541064516e-08, "loss": 0.8641, "step": 2282 }, { "epoch": 1.762934362934363, "grad_norm": 4.59375, "learning_rate": 9.093564483815196e-08, "loss": 0.8581, "step": 2283 }, { "epoch": 1.7637065637065636, "grad_norm": 3.9375, "learning_rate": 9.035142569860225e-08, "loss": 0.9453, "step": 2284 }, { "epoch": 1.7644787644787645, "grad_norm": 4.28125, "learning_rate": 8.976901890503792e-08, "loss": 0.8476, "step": 2285 }, { "epoch": 1.7652509652509654, "grad_norm": 3.765625, "learning_rate": 8.91884253676674e-08, "loss": 0.7917, "step": 2286 }, { "epoch": 1.766023166023166, "grad_norm": 8.0625, "learning_rate": 8.860964599386651e-08, "loss": 0.8317, "step": 2287 }, { "epoch": 1.7667953667953666, "grad_norm": 3.703125, "learning_rate": 8.803268168817437e-08, "loss": 0.9213, "step": 2288 }, { "epoch": 1.7675675675675677, "grad_norm": 3.875, "learning_rate": 8.745753335229506e-08, "loss": 0.8682, "step": 2289 }, { "epoch": 1.7683397683397684, "grad_norm": 4.125, "learning_rate": 8.68842018850935e-08, "loss": 0.791, "step": 2290 }, { "epoch": 1.769111969111969, "grad_norm": 3.953125, "learning_rate": 8.631268818259528e-08, "loss": 0.8312, "step": 2291 }, { "epoch": 1.7698841698841699, "grad_norm": 4.40625, "learning_rate": 8.574299313798579e-08, "loss": 0.9126, "step": 2292 }, { "epoch": 1.7706563706563707, "grad_norm": 4.25, "learning_rate": 8.517511764160714e-08, "loss": 0.8957, "step": 2293 }, { "epoch": 1.7714285714285714, "grad_norm": 4.875, "learning_rate": 8.460906258095842e-08, "loss": 0.9792, "step": 2294 }, { "epoch": 1.7722007722007722, "grad_norm": 4.0, "learning_rate": 8.404482884069351e-08, "loss": 0.8235, "step": 2295 }, { "epoch": 1.772972972972973, "grad_norm": 3.9375, "learning_rate": 8.348241730261996e-08, "loss": 0.7483, "step": 2296 }, { "epoch": 1.7737451737451737, "grad_norm": 4.625, "learning_rate": 8.292182884569705e-08, "loss": 0.9316, "step": 2297 }, { "epoch": 1.7745173745173746, "grad_norm": 4.375, "learning_rate": 8.236306434603603e-08, "loss": 0.8127, "step": 2298 }, { "epoch": 1.7752895752895754, "grad_norm": 3.8125, "learning_rate": 8.180612467689572e-08, "loss": 0.8582, "step": 2299 }, { "epoch": 1.776061776061776, "grad_norm": 4.71875, "learning_rate": 8.125101070868489e-08, "loss": 0.8289, "step": 2300 }, { "epoch": 1.7768339768339767, "grad_norm": 3.546875, "learning_rate": 8.069772330895814e-08, "loss": 0.7585, "step": 2301 }, { "epoch": 1.7776061776061776, "grad_norm": 3.859375, "learning_rate": 8.014626334241508e-08, "loss": 0.8401, "step": 2302 }, { "epoch": 1.7783783783783784, "grad_norm": 3.9375, "learning_rate": 7.959663167090043e-08, "loss": 0.9013, "step": 2303 }, { "epoch": 1.779150579150579, "grad_norm": 4.3125, "learning_rate": 7.904882915340028e-08, "loss": 0.8605, "step": 2304 }, { "epoch": 1.77992277992278, "grad_norm": 4.46875, "learning_rate": 7.85028566460433e-08, "loss": 0.9205, "step": 2305 }, { "epoch": 1.7806949806949808, "grad_norm": 4.53125, "learning_rate": 7.795871500209718e-08, "loss": 0.6442, "step": 2306 }, { "epoch": 1.7814671814671814, "grad_norm": 3.546875, "learning_rate": 7.741640507196876e-08, "loss": 0.6387, "step": 2307 }, { "epoch": 1.7822393822393823, "grad_norm": 9.375, "learning_rate": 7.68759277032019e-08, "loss": 0.7811, "step": 2308 }, { "epoch": 1.7830115830115831, "grad_norm": 3.703125, "learning_rate": 7.633728374047689e-08, "loss": 0.7603, "step": 2309 }, { "epoch": 1.7837837837837838, "grad_norm": 4.28125, "learning_rate": 7.580047402560826e-08, "loss": 0.9572, "step": 2310 }, { "epoch": 1.7845559845559844, "grad_norm": 4.75, "learning_rate": 7.526549939754421e-08, "loss": 0.8476, "step": 2311 }, { "epoch": 1.7853281853281855, "grad_norm": 4.0, "learning_rate": 7.473236069236472e-08, "loss": 0.8596, "step": 2312 }, { "epoch": 1.7861003861003861, "grad_norm": 4.03125, "learning_rate": 7.420105874328052e-08, "loss": 0.9109, "step": 2313 }, { "epoch": 1.7868725868725868, "grad_norm": 4.6875, "learning_rate": 7.367159438063237e-08, "loss": 0.8329, "step": 2314 }, { "epoch": 1.7876447876447876, "grad_norm": 15.125, "learning_rate": 7.314396843188826e-08, "loss": 0.9244, "step": 2315 }, { "epoch": 1.7884169884169885, "grad_norm": 4.0, "learning_rate": 7.26181817216437e-08, "loss": 0.8076, "step": 2316 }, { "epoch": 1.7891891891891891, "grad_norm": 3.859375, "learning_rate": 7.209423507161972e-08, "loss": 0.8227, "step": 2317 }, { "epoch": 1.78996138996139, "grad_norm": 4.09375, "learning_rate": 7.157212930066118e-08, "loss": 0.7887, "step": 2318 }, { "epoch": 1.7907335907335908, "grad_norm": 3.90625, "learning_rate": 7.105186522473634e-08, "loss": 0.8584, "step": 2319 }, { "epoch": 1.7915057915057915, "grad_norm": 3.734375, "learning_rate": 7.053344365693492e-08, "loss": 0.7546, "step": 2320 }, { "epoch": 1.7922779922779921, "grad_norm": 3.765625, "learning_rate": 7.00168654074676e-08, "loss": 0.8601, "step": 2321 }, { "epoch": 1.7930501930501932, "grad_norm": 14.375, "learning_rate": 6.95021312836637e-08, "loss": 0.8438, "step": 2322 }, { "epoch": 1.7938223938223938, "grad_norm": 7.09375, "learning_rate": 6.898924208997057e-08, "loss": 0.6985, "step": 2323 }, { "epoch": 1.7945945945945945, "grad_norm": 4.21875, "learning_rate": 6.84781986279523e-08, "loss": 0.8411, "step": 2324 }, { "epoch": 1.7953667953667953, "grad_norm": 4.0625, "learning_rate": 6.796900169628875e-08, "loss": 0.9035, "step": 2325 }, { "epoch": 1.7961389961389962, "grad_norm": 12.8125, "learning_rate": 6.74616520907731e-08, "loss": 0.8062, "step": 2326 }, { "epoch": 1.7969111969111968, "grad_norm": 4.40625, "learning_rate": 6.695615060431246e-08, "loss": 0.9718, "step": 2327 }, { "epoch": 1.7976833976833977, "grad_norm": 3.65625, "learning_rate": 6.645249802692461e-08, "loss": 0.8445, "step": 2328 }, { "epoch": 1.7984555984555985, "grad_norm": 4.375, "learning_rate": 6.595069514573879e-08, "loss": 0.918, "step": 2329 }, { "epoch": 1.7992277992277992, "grad_norm": 6.0625, "learning_rate": 6.545074274499255e-08, "loss": 0.9103, "step": 2330 }, { "epoch": 1.8, "grad_norm": 4.03125, "learning_rate": 6.495264160603199e-08, "loss": 0.8227, "step": 2331 }, { "epoch": 1.800772200772201, "grad_norm": 3.609375, "learning_rate": 6.445639250731009e-08, "loss": 0.7644, "step": 2332 }, { "epoch": 1.8015444015444015, "grad_norm": 3.671875, "learning_rate": 6.396199622438496e-08, "loss": 0.7791, "step": 2333 }, { "epoch": 1.8023166023166022, "grad_norm": 3.734375, "learning_rate": 6.346945352991907e-08, "loss": 0.7947, "step": 2334 }, { "epoch": 1.803088803088803, "grad_norm": 44.75, "learning_rate": 6.297876519367848e-08, "loss": 0.9314, "step": 2335 }, { "epoch": 1.803861003861004, "grad_norm": 3.984375, "learning_rate": 6.248993198253061e-08, "loss": 0.954, "step": 2336 }, { "epoch": 1.8046332046332045, "grad_norm": 3.984375, "learning_rate": 6.200295466044393e-08, "loss": 0.8747, "step": 2337 }, { "epoch": 1.8054054054054054, "grad_norm": 4.5625, "learning_rate": 6.151783398848685e-08, "loss": 0.9096, "step": 2338 }, { "epoch": 1.8061776061776063, "grad_norm": 5.40625, "learning_rate": 6.103457072482485e-08, "loss": 0.9559, "step": 2339 }, { "epoch": 1.806949806949807, "grad_norm": 7.34375, "learning_rate": 6.055316562472227e-08, "loss": 0.7932, "step": 2340 }, { "epoch": 1.8077220077220078, "grad_norm": 4.375, "learning_rate": 6.007361944053824e-08, "loss": 0.8236, "step": 2341 }, { "epoch": 1.8084942084942086, "grad_norm": 4.21875, "learning_rate": 5.9595932921726885e-08, "loss": 0.8103, "step": 2342 }, { "epoch": 1.8092664092664092, "grad_norm": 4.03125, "learning_rate": 5.9120106814836567e-08, "loss": 0.8816, "step": 2343 }, { "epoch": 1.8100386100386099, "grad_norm": 3.421875, "learning_rate": 5.864614186350717e-08, "loss": 0.7069, "step": 2344 }, { "epoch": 1.810810810810811, "grad_norm": 4.03125, "learning_rate": 5.8174038808470886e-08, "loss": 0.7335, "step": 2345 }, { "epoch": 1.8115830115830116, "grad_norm": 4.4375, "learning_rate": 5.770379838754933e-08, "loss": 0.8952, "step": 2346 }, { "epoch": 1.8123552123552122, "grad_norm": 3.75, "learning_rate": 5.7235421335653377e-08, "loss": 0.7115, "step": 2347 }, { "epoch": 1.813127413127413, "grad_norm": 4.59375, "learning_rate": 5.6768908384781664e-08, "loss": 0.9992, "step": 2348 }, { "epoch": 1.813899613899614, "grad_norm": 3.984375, "learning_rate": 5.630426026402e-08, "loss": 0.7469, "step": 2349 }, { "epoch": 1.8146718146718146, "grad_norm": 4.3125, "learning_rate": 5.58414776995389e-08, "loss": 0.94, "step": 2350 }, { "epoch": 1.8154440154440155, "grad_norm": 3.765625, "learning_rate": 5.5380561414594133e-08, "loss": 0.9436, "step": 2351 }, { "epoch": 1.8162162162162163, "grad_norm": 4.46875, "learning_rate": 5.4921512129524345e-08, "loss": 0.9655, "step": 2352 }, { "epoch": 1.816988416988417, "grad_norm": 3.859375, "learning_rate": 5.44643305617501e-08, "loss": 0.9032, "step": 2353 }, { "epoch": 1.8177606177606178, "grad_norm": 6.84375, "learning_rate": 5.400901742577402e-08, "loss": 0.8898, "step": 2354 }, { "epoch": 1.8185328185328187, "grad_norm": 3.640625, "learning_rate": 5.355557343317733e-08, "loss": 0.9172, "step": 2355 }, { "epoch": 1.8193050193050193, "grad_norm": 4.0625, "learning_rate": 5.3103999292621197e-08, "loss": 0.9436, "step": 2356 }, { "epoch": 1.82007722007722, "grad_norm": 10.1875, "learning_rate": 5.265429570984387e-08, "loss": 0.9237, "step": 2357 }, { "epoch": 1.8208494208494208, "grad_norm": 3.8125, "learning_rate": 5.2206463387660527e-08, "loss": 0.8051, "step": 2358 }, { "epoch": 1.8216216216216217, "grad_norm": 4.0625, "learning_rate": 5.1760503025961586e-08, "loss": 0.9169, "step": 2359 }, { "epoch": 1.8223938223938223, "grad_norm": 4.0625, "learning_rate": 5.1316415321712025e-08, "loss": 0.8305, "step": 2360 }, { "epoch": 1.8231660231660232, "grad_norm": 4.28125, "learning_rate": 5.087420096895057e-08, "loss": 0.9134, "step": 2361 }, { "epoch": 1.823938223938224, "grad_norm": 3.796875, "learning_rate": 5.043386065878744e-08, "loss": 0.8993, "step": 2362 }, { "epoch": 1.8247104247104247, "grad_norm": 4.125, "learning_rate": 4.999539507940451e-08, "loss": 0.8941, "step": 2363 }, { "epoch": 1.8254826254826255, "grad_norm": 4.125, "learning_rate": 4.955880491605336e-08, "loss": 0.9344, "step": 2364 }, { "epoch": 1.8262548262548264, "grad_norm": 4.5625, "learning_rate": 4.9124090851055546e-08, "loss": 0.9307, "step": 2365 }, { "epoch": 1.827027027027027, "grad_norm": 4.28125, "learning_rate": 4.869125356379914e-08, "loss": 0.8547, "step": 2366 }, { "epoch": 1.8277992277992277, "grad_norm": 3.78125, "learning_rate": 4.82602937307404e-08, "loss": 0.8502, "step": 2367 }, { "epoch": 1.8285714285714287, "grad_norm": 4.25, "learning_rate": 4.783121202540056e-08, "loss": 0.8965, "step": 2368 }, { "epoch": 1.8293436293436294, "grad_norm": 3.78125, "learning_rate": 4.740400911836626e-08, "loss": 0.8682, "step": 2369 }, { "epoch": 1.83011583011583, "grad_norm": 3.78125, "learning_rate": 4.6978685677287335e-08, "loss": 0.83, "step": 2370 }, { "epoch": 1.8308880308880309, "grad_norm": 4.1875, "learning_rate": 4.65552423668765e-08, "loss": 0.8382, "step": 2371 }, { "epoch": 1.8316602316602317, "grad_norm": 3.9375, "learning_rate": 4.613367984890857e-08, "loss": 0.842, "step": 2372 }, { "epoch": 1.8324324324324324, "grad_norm": 4.125, "learning_rate": 4.5713998782218325e-08, "loss": 0.9043, "step": 2373 }, { "epoch": 1.8332046332046332, "grad_norm": 3.765625, "learning_rate": 4.529619982270042e-08, "loss": 0.8713, "step": 2374 }, { "epoch": 1.833976833976834, "grad_norm": 5.0625, "learning_rate": 4.48802836233081e-08, "loss": 1.0113, "step": 2375 }, { "epoch": 1.8347490347490347, "grad_norm": 7.15625, "learning_rate": 4.446625083405237e-08, "loss": 0.9572, "step": 2376 }, { "epoch": 1.8355212355212354, "grad_norm": 3.890625, "learning_rate": 4.405410210200009e-08, "loss": 0.9597, "step": 2377 }, { "epoch": 1.8362934362934364, "grad_norm": 3.59375, "learning_rate": 4.364383807127462e-08, "loss": 0.8365, "step": 2378 }, { "epoch": 1.837065637065637, "grad_norm": 4.71875, "learning_rate": 4.323545938305265e-08, "loss": 1.0054, "step": 2379 }, { "epoch": 1.8378378378378377, "grad_norm": 4.21875, "learning_rate": 4.282896667556532e-08, "loss": 0.774, "step": 2380 }, { "epoch": 1.8386100386100386, "grad_norm": 4.15625, "learning_rate": 4.242436058409613e-08, "loss": 0.8431, "step": 2381 }, { "epoch": 1.8393822393822394, "grad_norm": 4.78125, "learning_rate": 4.202164174097953e-08, "loss": 0.9768, "step": 2382 }, { "epoch": 1.84015444015444, "grad_norm": 8.3125, "learning_rate": 4.162081077560151e-08, "loss": 0.8309, "step": 2383 }, { "epoch": 1.840926640926641, "grad_norm": 11.25, "learning_rate": 4.1221868314396535e-08, "loss": 0.8113, "step": 2384 }, { "epoch": 1.8416988416988418, "grad_norm": 6.65625, "learning_rate": 4.082481498084823e-08, "loss": 0.8466, "step": 2385 }, { "epoch": 1.8424710424710424, "grad_norm": 4.75, "learning_rate": 4.042965139548785e-08, "loss": 0.9088, "step": 2386 }, { "epoch": 1.8432432432432433, "grad_norm": 4.75, "learning_rate": 4.0036378175892926e-08, "loss": 0.9517, "step": 2387 }, { "epoch": 1.8440154440154441, "grad_norm": 4.0, "learning_rate": 3.96449959366868e-08, "loss": 0.759, "step": 2388 }, { "epoch": 1.8447876447876448, "grad_norm": 4.59375, "learning_rate": 3.925550528953798e-08, "loss": 1.0366, "step": 2389 }, { "epoch": 1.8455598455598454, "grad_norm": 4.25, "learning_rate": 3.886790684315786e-08, "loss": 0.9739, "step": 2390 }, { "epoch": 1.8463320463320463, "grad_norm": 4.0, "learning_rate": 3.848220120330134e-08, "loss": 0.8682, "step": 2391 }, { "epoch": 1.8471042471042471, "grad_norm": 3.875, "learning_rate": 3.809838897276472e-08, "loss": 0.8424, "step": 2392 }, { "epoch": 1.8478764478764478, "grad_norm": 5.71875, "learning_rate": 3.7716470751385374e-08, "loss": 0.8019, "step": 2393 }, { "epoch": 1.8486486486486486, "grad_norm": 4.5, "learning_rate": 3.7336447136041007e-08, "loss": 0.8498, "step": 2394 }, { "epoch": 1.8494208494208495, "grad_norm": 3.34375, "learning_rate": 3.695831872064723e-08, "loss": 0.7062, "step": 2395 }, { "epoch": 1.8501930501930501, "grad_norm": 3.734375, "learning_rate": 3.6582086096159234e-08, "loss": 0.9295, "step": 2396 }, { "epoch": 1.850965250965251, "grad_norm": 3.96875, "learning_rate": 3.6207749850568494e-08, "loss": 0.903, "step": 2397 }, { "epoch": 1.8517374517374519, "grad_norm": 4.09375, "learning_rate": 3.5835310568902713e-08, "loss": 0.9381, "step": 2398 }, { "epoch": 1.8525096525096525, "grad_norm": 4.1875, "learning_rate": 3.546476883322519e-08, "loss": 0.8258, "step": 2399 }, { "epoch": 1.8532818532818531, "grad_norm": 3.71875, "learning_rate": 3.509612522263395e-08, "loss": 0.9008, "step": 2400 }, { "epoch": 1.8540540540540542, "grad_norm": 3.953125, "learning_rate": 3.4729380313259924e-08, "loss": 0.9417, "step": 2401 }, { "epoch": 1.8548262548262548, "grad_norm": 3.984375, "learning_rate": 3.4364534678267e-08, "loss": 0.9332, "step": 2402 }, { "epoch": 1.8555984555984555, "grad_norm": 3.59375, "learning_rate": 3.400158888785102e-08, "loss": 0.7395, "step": 2403 }, { "epoch": 1.8563706563706563, "grad_norm": 3.984375, "learning_rate": 3.3640543509238094e-08, "loss": 0.8246, "step": 2404 }, { "epoch": 1.8571428571428572, "grad_norm": 3.859375, "learning_rate": 3.328139910668507e-08, "loss": 0.7664, "step": 2405 }, { "epoch": 1.8579150579150578, "grad_norm": 4.59375, "learning_rate": 3.292415624147685e-08, "loss": 0.8103, "step": 2406 }, { "epoch": 1.8586872586872587, "grad_norm": 3.828125, "learning_rate": 3.2568815471927665e-08, "loss": 0.7955, "step": 2407 }, { "epoch": 1.8594594594594596, "grad_norm": 4.125, "learning_rate": 3.221537735337829e-08, "loss": 0.8804, "step": 2408 }, { "epoch": 1.8602316602316602, "grad_norm": 3.90625, "learning_rate": 3.186384243819618e-08, "loss": 0.8576, "step": 2409 }, { "epoch": 1.8610038610038608, "grad_norm": 8.5, "learning_rate": 3.1514211275774503e-08, "loss": 0.9091, "step": 2410 }, { "epoch": 1.861776061776062, "grad_norm": 3.703125, "learning_rate": 3.11664844125309e-08, "loss": 0.7916, "step": 2411 }, { "epoch": 1.8625482625482626, "grad_norm": 3.84375, "learning_rate": 3.0820662391907176e-08, "loss": 0.8928, "step": 2412 }, { "epoch": 1.8633204633204632, "grad_norm": 3.890625, "learning_rate": 3.0476745754368235e-08, "loss": 0.842, "step": 2413 }, { "epoch": 1.864092664092664, "grad_norm": 4.40625, "learning_rate": 3.0134735037400663e-08, "loss": 0.8769, "step": 2414 }, { "epoch": 1.864864864864865, "grad_norm": 6.90625, "learning_rate": 2.9794630775512868e-08, "loss": 0.8839, "step": 2415 }, { "epoch": 1.8656370656370656, "grad_norm": 4.09375, "learning_rate": 2.9456433500233973e-08, "loss": 0.9127, "step": 2416 }, { "epoch": 1.8664092664092664, "grad_norm": 3.703125, "learning_rate": 2.9120143740111738e-08, "loss": 0.8311, "step": 2417 }, { "epoch": 1.8671814671814673, "grad_norm": 4.1875, "learning_rate": 2.8785762020714092e-08, "loss": 0.9866, "step": 2418 }, { "epoch": 1.867953667953668, "grad_norm": 3.59375, "learning_rate": 2.8453288864626062e-08, "loss": 0.787, "step": 2419 }, { "epoch": 1.8687258687258688, "grad_norm": 4.3125, "learning_rate": 2.812272479145034e-08, "loss": 0.8442, "step": 2420 }, { "epoch": 1.8694980694980696, "grad_norm": 4.375, "learning_rate": 2.7794070317805898e-08, "loss": 0.85, "step": 2421 }, { "epoch": 1.8702702702702703, "grad_norm": 3.71875, "learning_rate": 2.746732595732729e-08, "loss": 0.7732, "step": 2422 }, { "epoch": 1.871042471042471, "grad_norm": 4.15625, "learning_rate": 2.714249222066409e-08, "loss": 0.9734, "step": 2423 }, { "epoch": 1.8718146718146718, "grad_norm": 4.28125, "learning_rate": 2.6819569615479378e-08, "loss": 0.8161, "step": 2424 }, { "epoch": 1.8725868725868726, "grad_norm": 3.671875, "learning_rate": 2.6498558646450012e-08, "loss": 0.7557, "step": 2425 }, { "epoch": 1.8733590733590733, "grad_norm": 4.0, "learning_rate": 2.6179459815264956e-08, "loss": 0.7852, "step": 2426 }, { "epoch": 1.8741312741312741, "grad_norm": 4.0625, "learning_rate": 2.5862273620624602e-08, "loss": 0.8574, "step": 2427 }, { "epoch": 1.874903474903475, "grad_norm": 3.859375, "learning_rate": 2.5547000558240608e-08, "loss": 0.8337, "step": 2428 }, { "epoch": 1.8756756756756756, "grad_norm": 4.1875, "learning_rate": 2.523364112083468e-08, "loss": 0.8542, "step": 2429 }, { "epoch": 1.8764478764478765, "grad_norm": 3.859375, "learning_rate": 2.4922195798137566e-08, "loss": 0.8006, "step": 2430 }, { "epoch": 1.8772200772200773, "grad_norm": 4.03125, "learning_rate": 2.461266507688867e-08, "loss": 0.8639, "step": 2431 }, { "epoch": 1.877992277992278, "grad_norm": 3.625, "learning_rate": 2.4305049440835198e-08, "loss": 0.8467, "step": 2432 }, { "epoch": 1.8787644787644786, "grad_norm": 3.859375, "learning_rate": 2.399934937073134e-08, "loss": 0.8854, "step": 2433 }, { "epoch": 1.8795366795366797, "grad_norm": 4.09375, "learning_rate": 2.3695565344337838e-08, "loss": 0.7162, "step": 2434 }, { "epoch": 1.8803088803088803, "grad_norm": 4.375, "learning_rate": 2.339369783642048e-08, "loss": 0.7695, "step": 2435 }, { "epoch": 1.881081081081081, "grad_norm": 3.9375, "learning_rate": 2.309374731875022e-08, "loss": 0.7832, "step": 2436 }, { "epoch": 1.8818532818532818, "grad_norm": 5.09375, "learning_rate": 2.2795714260101937e-08, "loss": 0.7208, "step": 2437 }, { "epoch": 1.8826254826254827, "grad_norm": 4.0625, "learning_rate": 2.249959912625374e-08, "loss": 0.9753, "step": 2438 }, { "epoch": 1.8833976833976833, "grad_norm": 4.65625, "learning_rate": 2.220540237998642e-08, "loss": 0.9649, "step": 2439 }, { "epoch": 1.8841698841698842, "grad_norm": 4.28125, "learning_rate": 2.1913124481082737e-08, "loss": 0.8329, "step": 2440 }, { "epoch": 1.884942084942085, "grad_norm": 17.75, "learning_rate": 2.1622765886326335e-08, "loss": 0.7445, "step": 2441 }, { "epoch": 1.8857142857142857, "grad_norm": 4.75, "learning_rate": 2.1334327049501723e-08, "loss": 0.9578, "step": 2442 }, { "epoch": 1.8864864864864865, "grad_norm": 3.984375, "learning_rate": 2.1047808421392478e-08, "loss": 0.8387, "step": 2443 }, { "epoch": 1.8872586872586874, "grad_norm": 4.0625, "learning_rate": 2.076321044978166e-08, "loss": 0.9403, "step": 2444 }, { "epoch": 1.888030888030888, "grad_norm": 4.25, "learning_rate": 2.0480533579450845e-08, "loss": 0.7658, "step": 2445 }, { "epoch": 1.8888030888030887, "grad_norm": 5.59375, "learning_rate": 2.0199778252178587e-08, "loss": 0.8838, "step": 2446 }, { "epoch": 1.8895752895752895, "grad_norm": 20.125, "learning_rate": 1.9920944906741126e-08, "loss": 0.7474, "step": 2447 }, { "epoch": 1.8903474903474904, "grad_norm": 4.40625, "learning_rate": 1.9644033978910155e-08, "loss": 0.9608, "step": 2448 }, { "epoch": 1.891119691119691, "grad_norm": 4.75, "learning_rate": 1.9369045901453524e-08, "loss": 0.9065, "step": 2449 }, { "epoch": 1.8918918918918919, "grad_norm": 3.703125, "learning_rate": 1.9095981104133855e-08, "loss": 0.9058, "step": 2450 }, { "epoch": 1.8926640926640927, "grad_norm": 5.375, "learning_rate": 1.8824840013707822e-08, "loss": 0.8178, "step": 2451 }, { "epoch": 1.8934362934362934, "grad_norm": 3.984375, "learning_rate": 1.8555623053925777e-08, "loss": 0.8992, "step": 2452 }, { "epoch": 1.8942084942084942, "grad_norm": 7.0, "learning_rate": 1.828833064553101e-08, "loss": 0.7747, "step": 2453 }, { "epoch": 1.894980694980695, "grad_norm": 4.125, "learning_rate": 1.8022963206258958e-08, "loss": 0.9842, "step": 2454 }, { "epoch": 1.8957528957528957, "grad_norm": 3.890625, "learning_rate": 1.7759521150836485e-08, "loss": 0.9061, "step": 2455 }, { "epoch": 1.8965250965250964, "grad_norm": 28.625, "learning_rate": 1.7498004890981884e-08, "loss": 0.8093, "step": 2456 }, { "epoch": 1.8972972972972975, "grad_norm": 4.15625, "learning_rate": 1.723841483540309e-08, "loss": 0.9009, "step": 2457 }, { "epoch": 1.898069498069498, "grad_norm": 6.71875, "learning_rate": 1.698075138979821e-08, "loss": 1.0012, "step": 2458 }, { "epoch": 1.8988416988416987, "grad_norm": 3.78125, "learning_rate": 1.6725014956854025e-08, "loss": 0.8693, "step": 2459 }, { "epoch": 1.8996138996138996, "grad_norm": 4.15625, "learning_rate": 1.647120593624582e-08, "loss": 0.8883, "step": 2460 }, { "epoch": 1.9003861003861005, "grad_norm": 4.15625, "learning_rate": 1.621932472463686e-08, "loss": 0.8861, "step": 2461 }, { "epoch": 1.901158301158301, "grad_norm": 3.78125, "learning_rate": 1.596937171567711e-08, "loss": 0.8989, "step": 2462 }, { "epoch": 1.901930501930502, "grad_norm": 4.8125, "learning_rate": 1.572134730000355e-08, "loss": 0.9371, "step": 2463 }, { "epoch": 1.9027027027027028, "grad_norm": 5.4375, "learning_rate": 1.5475251865238738e-08, "loss": 0.7272, "step": 2464 }, { "epoch": 1.9034749034749034, "grad_norm": 4.0, "learning_rate": 1.5231085795990447e-08, "loss": 0.8842, "step": 2465 }, { "epoch": 1.904247104247104, "grad_norm": 6.59375, "learning_rate": 1.4988849473851626e-08, "loss": 0.6896, "step": 2466 }, { "epoch": 1.9050193050193052, "grad_norm": 4.09375, "learning_rate": 1.4748543277399024e-08, "loss": 0.8129, "step": 2467 }, { "epoch": 1.9057915057915058, "grad_norm": 5.40625, "learning_rate": 1.4510167582192647e-08, "loss": 0.8097, "step": 2468 }, { "epoch": 1.9065637065637064, "grad_norm": 3.859375, "learning_rate": 1.4273722760776015e-08, "loss": 0.8331, "step": 2469 }, { "epoch": 1.9073359073359073, "grad_norm": 3.703125, "learning_rate": 1.403920918267465e-08, "loss": 0.9189, "step": 2470 }, { "epoch": 1.9081081081081082, "grad_norm": 4.03125, "learning_rate": 1.3806627214395935e-08, "loss": 0.8558, "step": 2471 }, { "epoch": 1.9088803088803088, "grad_norm": 4.21875, "learning_rate": 1.3575977219428548e-08, "loss": 0.8054, "step": 2472 }, { "epoch": 1.9096525096525097, "grad_norm": 4.375, "learning_rate": 1.3347259558241233e-08, "loss": 0.7976, "step": 2473 }, { "epoch": 1.9104247104247105, "grad_norm": 4.21875, "learning_rate": 1.3120474588283888e-08, "loss": 0.8991, "step": 2474 }, { "epoch": 1.9111969111969112, "grad_norm": 6.40625, "learning_rate": 1.2895622663984808e-08, "loss": 0.877, "step": 2475 }, { "epoch": 1.911969111969112, "grad_norm": 5.15625, "learning_rate": 1.2672704136751923e-08, "loss": 0.8499, "step": 2476 }, { "epoch": 1.9127413127413129, "grad_norm": 4.59375, "learning_rate": 1.2451719354971414e-08, "loss": 1.0127, "step": 2477 }, { "epoch": 1.9135135135135135, "grad_norm": 3.625, "learning_rate": 1.2232668664007158e-08, "loss": 0.8199, "step": 2478 }, { "epoch": 1.9142857142857141, "grad_norm": 4.34375, "learning_rate": 1.2015552406200587e-08, "loss": 0.8732, "step": 2479 }, { "epoch": 1.915057915057915, "grad_norm": 4.15625, "learning_rate": 1.1800370920869858e-08, "loss": 0.9238, "step": 2480 }, { "epoch": 1.9158301158301159, "grad_norm": 4.15625, "learning_rate": 1.1587124544309297e-08, "loss": 0.7945, "step": 2481 }, { "epoch": 1.9166023166023165, "grad_norm": 3.765625, "learning_rate": 1.1375813609788983e-08, "loss": 0.8741, "step": 2482 }, { "epoch": 1.9173745173745174, "grad_norm": 3.703125, "learning_rate": 1.1166438447554329e-08, "loss": 0.7599, "step": 2483 }, { "epoch": 1.9181467181467182, "grad_norm": 3.921875, "learning_rate": 1.0958999384825253e-08, "loss": 0.7631, "step": 2484 }, { "epoch": 1.9189189189189189, "grad_norm": 4.65625, "learning_rate": 1.0753496745796176e-08, "loss": 0.9535, "step": 2485 }, { "epoch": 1.9196911196911197, "grad_norm": 4.09375, "learning_rate": 1.0549930851634638e-08, "loss": 0.8857, "step": 2486 }, { "epoch": 1.9204633204633206, "grad_norm": 3.71875, "learning_rate": 1.0348302020481981e-08, "loss": 0.7518, "step": 2487 }, { "epoch": 1.9212355212355212, "grad_norm": 3.875, "learning_rate": 1.0148610567451838e-08, "loss": 1.0302, "step": 2488 }, { "epoch": 1.9220077220077219, "grad_norm": 3.96875, "learning_rate": 9.950856804630261e-09, "loss": 0.7878, "step": 2489 }, { "epoch": 1.922779922779923, "grad_norm": 3.390625, "learning_rate": 9.755041041074747e-09, "loss": 0.7189, "step": 2490 }, { "epoch": 1.9235521235521236, "grad_norm": 5.625, "learning_rate": 9.561163582814115e-09, "loss": 0.7591, "step": 2491 }, { "epoch": 1.9243243243243242, "grad_norm": 5.125, "learning_rate": 9.36922473284807e-09, "loss": 0.9188, "step": 2492 }, { "epoch": 1.925096525096525, "grad_norm": 4.09375, "learning_rate": 9.179224791146384e-09, "loss": 0.9937, "step": 2493 }, { "epoch": 1.925868725868726, "grad_norm": 8.8125, "learning_rate": 8.991164054648754e-09, "loss": 1.0302, "step": 2494 }, { "epoch": 1.9266409266409266, "grad_norm": 5.53125, "learning_rate": 8.805042817263965e-09, "loss": 0.8391, "step": 2495 }, { "epoch": 1.9274131274131274, "grad_norm": 3.796875, "learning_rate": 8.62086136987017e-09, "loss": 0.7651, "step": 2496 }, { "epoch": 1.9281853281853283, "grad_norm": 4.4375, "learning_rate": 8.438620000313369e-09, "loss": 0.8548, "step": 2497 }, { "epoch": 1.928957528957529, "grad_norm": 3.6875, "learning_rate": 8.258318993407954e-09, "loss": 0.7979, "step": 2498 }, { "epoch": 1.9297297297297298, "grad_norm": 4.375, "learning_rate": 8.079958630935609e-09, "loss": 0.9846, "step": 2499 }, { "epoch": 1.9305019305019306, "grad_norm": 3.921875, "learning_rate": 7.90353919164516e-09, "loss": 0.8856, "step": 2500 }, { "epoch": 1.9312741312741313, "grad_norm": 3.703125, "learning_rate": 7.72906095125217e-09, "loss": 0.7484, "step": 2501 }, { "epoch": 1.932046332046332, "grad_norm": 4.5, "learning_rate": 7.556524182438102e-09, "loss": 0.819, "step": 2502 }, { "epoch": 1.9328185328185328, "grad_norm": 3.71875, "learning_rate": 7.385929154850591e-09, "loss": 0.7794, "step": 2503 }, { "epoch": 1.9335907335907336, "grad_norm": 5.28125, "learning_rate": 7.217276135102619e-09, "loss": 0.8431, "step": 2504 }, { "epoch": 1.9343629343629343, "grad_norm": 4.15625, "learning_rate": 7.050565386771818e-09, "loss": 0.8736, "step": 2505 }, { "epoch": 1.9351351351351351, "grad_norm": 3.65625, "learning_rate": 6.88579717040061e-09, "loss": 0.8211, "step": 2506 }, { "epoch": 1.935907335907336, "grad_norm": 3.984375, "learning_rate": 6.722971743495371e-09, "loss": 0.79, "step": 2507 }, { "epoch": 1.9366795366795366, "grad_norm": 5.65625, "learning_rate": 6.562089360526436e-09, "loss": 0.8262, "step": 2508 }, { "epoch": 1.9374517374517375, "grad_norm": 5.4375, "learning_rate": 6.403150272927261e-09, "loss": 0.7608, "step": 2509 }, { "epoch": 1.9382239382239383, "grad_norm": 4.0, "learning_rate": 6.2461547290942895e-09, "loss": 0.8638, "step": 2510 }, { "epoch": 1.938996138996139, "grad_norm": 4.0, "learning_rate": 6.09110297438667e-09, "loss": 0.8601, "step": 2511 }, { "epoch": 1.9397683397683396, "grad_norm": 7.09375, "learning_rate": 5.937995251125428e-09, "loss": 0.9826, "step": 2512 }, { "epoch": 1.9405405405405407, "grad_norm": 3.640625, "learning_rate": 5.7868317985936e-09, "loss": 0.9051, "step": 2513 }, { "epoch": 1.9413127413127413, "grad_norm": 5.75, "learning_rate": 5.6376128530358236e-09, "loss": 0.9166, "step": 2514 }, { "epoch": 1.942084942084942, "grad_norm": 3.75, "learning_rate": 5.490338647657078e-09, "loss": 0.8173, "step": 2515 }, { "epoch": 1.9428571428571428, "grad_norm": 4.34375, "learning_rate": 5.345009412623808e-09, "loss": 0.8759, "step": 2516 }, { "epoch": 1.9436293436293437, "grad_norm": 4.71875, "learning_rate": 5.201625375062386e-09, "loss": 0.8196, "step": 2517 }, { "epoch": 1.9444015444015443, "grad_norm": 3.921875, "learning_rate": 5.060186759059254e-09, "loss": 0.8597, "step": 2518 }, { "epoch": 1.9451737451737452, "grad_norm": 4.09375, "learning_rate": 4.920693785660374e-09, "loss": 0.9491, "step": 2519 }, { "epoch": 1.945945945945946, "grad_norm": 4.03125, "learning_rate": 4.783146672871081e-09, "loss": 0.9515, "step": 2520 }, { "epoch": 1.9467181467181467, "grad_norm": 5.21875, "learning_rate": 4.64754563565567e-09, "loss": 0.765, "step": 2521 }, { "epoch": 1.9474903474903473, "grad_norm": 3.453125, "learning_rate": 4.513890885936845e-09, "loss": 0.6928, "step": 2522 }, { "epoch": 1.9482625482625484, "grad_norm": 4.15625, "learning_rate": 4.382182632595711e-09, "loss": 1.1018, "step": 2523 }, { "epoch": 1.949034749034749, "grad_norm": 4.5625, "learning_rate": 4.2524210814712254e-09, "loss": 0.9071, "step": 2524 }, { "epoch": 1.9498069498069497, "grad_norm": 3.8125, "learning_rate": 4.124606435360195e-09, "loss": 0.8832, "step": 2525 }, { "epoch": 1.9505791505791505, "grad_norm": 3.78125, "learning_rate": 3.998738894016307e-09, "loss": 0.8615, "step": 2526 }, { "epoch": 1.9513513513513514, "grad_norm": 7.5625, "learning_rate": 3.874818654150681e-09, "loss": 0.8183, "step": 2527 }, { "epoch": 1.952123552123552, "grad_norm": 4.53125, "learning_rate": 3.7528459094308965e-09, "loss": 0.9636, "step": 2528 }, { "epoch": 1.952895752895753, "grad_norm": 6.40625, "learning_rate": 3.6328208504808613e-09, "loss": 0.9357, "step": 2529 }, { "epoch": 1.9536679536679538, "grad_norm": 3.640625, "learning_rate": 3.5147436648806654e-09, "loss": 0.8766, "step": 2530 }, { "epoch": 1.9544401544401544, "grad_norm": 3.890625, "learning_rate": 3.3986145371660284e-09, "loss": 0.8469, "step": 2531 }, { "epoch": 1.9552123552123553, "grad_norm": 6.25, "learning_rate": 3.2844336488284388e-09, "loss": 0.8244, "step": 2532 }, { "epoch": 1.9559845559845561, "grad_norm": 4.4375, "learning_rate": 3.1722011783143215e-09, "loss": 0.8024, "step": 2533 }, { "epoch": 1.9567567567567568, "grad_norm": 4.25, "learning_rate": 3.0619173010251746e-09, "loss": 0.8773, "step": 2534 }, { "epoch": 1.9575289575289574, "grad_norm": 3.625, "learning_rate": 2.953582189317017e-09, "loss": 0.7904, "step": 2535 }, { "epoch": 1.9583011583011583, "grad_norm": 5.3125, "learning_rate": 2.847196012500664e-09, "loss": 0.8692, "step": 2536 }, { "epoch": 1.959073359073359, "grad_norm": 3.8125, "learning_rate": 2.742758936840617e-09, "loss": 0.8276, "step": 2537 }, { "epoch": 1.9598455598455597, "grad_norm": 5.1875, "learning_rate": 2.640271125555205e-09, "loss": 0.8784, "step": 2538 }, { "epoch": 1.9606177606177606, "grad_norm": 6.21875, "learning_rate": 2.539732738816858e-09, "loss": 0.669, "step": 2539 }, { "epoch": 1.9613899613899615, "grad_norm": 3.875, "learning_rate": 2.4411439337508613e-09, "loss": 0.9392, "step": 2540 }, { "epoch": 1.962162162162162, "grad_norm": 4.3125, "learning_rate": 2.3445048644357706e-09, "loss": 0.8474, "step": 2541 }, { "epoch": 1.962934362934363, "grad_norm": 13.625, "learning_rate": 2.2498156819031335e-09, "loss": 0.884, "step": 2542 }, { "epoch": 1.9637065637065638, "grad_norm": 3.625, "learning_rate": 2.1570765341372148e-09, "loss": 0.8088, "step": 2543 }, { "epoch": 1.9644787644787645, "grad_norm": 3.609375, "learning_rate": 2.066287566074299e-09, "loss": 0.8558, "step": 2544 }, { "epoch": 1.965250965250965, "grad_norm": 4.625, "learning_rate": 1.977448919603109e-09, "loss": 0.8002, "step": 2545 }, { "epoch": 1.9660231660231662, "grad_norm": 4.125, "learning_rate": 1.8905607335643894e-09, "loss": 0.8289, "step": 2546 }, { "epoch": 1.9667953667953668, "grad_norm": 3.78125, "learning_rate": 1.805623143750629e-09, "loss": 0.8303, "step": 2547 }, { "epoch": 1.9675675675675675, "grad_norm": 5.15625, "learning_rate": 1.7226362829053655e-09, "loss": 0.9761, "step": 2548 }, { "epoch": 1.9683397683397683, "grad_norm": 3.875, "learning_rate": 1.6416002807242982e-09, "loss": 0.7829, "step": 2549 }, { "epoch": 1.9691119691119692, "grad_norm": 6.0625, "learning_rate": 1.5625152638534813e-09, "loss": 0.9263, "step": 2550 }, { "epoch": 1.9698841698841698, "grad_norm": 4.90625, "learning_rate": 1.4853813558902974e-09, "loss": 0.7838, "step": 2551 }, { "epoch": 1.9706563706563707, "grad_norm": 4.40625, "learning_rate": 1.4101986773827626e-09, "loss": 0.7068, "step": 2552 }, { "epoch": 1.9714285714285715, "grad_norm": 3.65625, "learning_rate": 1.3369673458295263e-09, "loss": 0.8388, "step": 2553 }, { "epoch": 1.9722007722007722, "grad_norm": 4.375, "learning_rate": 1.265687475679317e-09, "loss": 0.8349, "step": 2554 }, { "epoch": 1.972972972972973, "grad_norm": 4.15625, "learning_rate": 1.196359178331219e-09, "loss": 0.8109, "step": 2555 }, { "epoch": 1.9737451737451739, "grad_norm": 4.03125, "learning_rate": 1.1289825621343952e-09, "loss": 0.8246, "step": 2556 }, { "epoch": 1.9745173745173745, "grad_norm": 3.9375, "learning_rate": 1.0635577323878099e-09, "loss": 0.8109, "step": 2557 }, { "epoch": 1.9752895752895752, "grad_norm": 3.59375, "learning_rate": 1.000084791340089e-09, "loss": 0.8382, "step": 2558 }, { "epoch": 1.976061776061776, "grad_norm": 6.40625, "learning_rate": 9.385638381891048e-10, "loss": 0.7913, "step": 2559 }, { "epoch": 1.9768339768339769, "grad_norm": 3.71875, "learning_rate": 8.789949690823918e-10, "loss": 0.8052, "step": 2560 }, { "epoch": 1.9776061776061775, "grad_norm": 5.15625, "learning_rate": 8.2137827711673e-10, "loss": 0.8799, "step": 2561 }, { "epoch": 1.9783783783783784, "grad_norm": 4.25, "learning_rate": 7.657138523377295e-10, "loss": 0.8017, "step": 2562 }, { "epoch": 1.9791505791505792, "grad_norm": 4.15625, "learning_rate": 7.120017817401071e-10, "loss": 0.9139, "step": 2563 }, { "epoch": 1.9799227799227799, "grad_norm": 4.46875, "learning_rate": 6.602421492671318e-10, "loss": 0.8306, "step": 2564 }, { "epoch": 1.9806949806949807, "grad_norm": 3.5625, "learning_rate": 6.104350358109023e-10, "loss": 0.8273, "step": 2565 }, { "epoch": 1.9814671814671816, "grad_norm": 3.46875, "learning_rate": 5.625805192120693e-10, "loss": 0.7852, "step": 2566 }, { "epoch": 1.9822393822393822, "grad_norm": 4.9375, "learning_rate": 5.166786742595576e-10, "loss": 0.8287, "step": 2567 }, { "epoch": 1.9830115830115829, "grad_norm": 9.125, "learning_rate": 4.727295726907055e-10, "loss": 0.8372, "step": 2568 }, { "epoch": 1.983783783783784, "grad_norm": 3.25, "learning_rate": 4.307332831908484e-10, "loss": 0.7417, "step": 2569 }, { "epoch": 1.9845559845559846, "grad_norm": 4.65625, "learning_rate": 3.906898713935958e-10, "loss": 0.903, "step": 2570 }, { "epoch": 1.9853281853281852, "grad_norm": 4.1875, "learning_rate": 3.525993998804156e-10, "loss": 0.9255, "step": 2571 }, { "epoch": 1.986100386100386, "grad_norm": 3.59375, "learning_rate": 3.164619281806336e-10, "loss": 0.8034, "step": 2572 }, { "epoch": 1.986872586872587, "grad_norm": 4.21875, "learning_rate": 2.822775127712951e-10, "loss": 0.844, "step": 2573 }, { "epoch": 1.9876447876447876, "grad_norm": 4.1875, "learning_rate": 2.5004620707730356e-10, "loss": 0.9057, "step": 2574 }, { "epoch": 1.9884169884169884, "grad_norm": 3.828125, "learning_rate": 2.1976806147100405e-10, "loss": 0.8253, "step": 2575 }, { "epoch": 1.9891891891891893, "grad_norm": 4.5625, "learning_rate": 1.914431232721836e-10, "loss": 0.9996, "step": 2576 }, { "epoch": 1.98996138996139, "grad_norm": 4.28125, "learning_rate": 1.6507143674848736e-10, "loss": 0.7631, "step": 2577 }, { "epoch": 1.9907335907335906, "grad_norm": 4.0625, "learning_rate": 1.4065304311444706e-10, "loss": 0.9559, "step": 2578 }, { "epoch": 1.9915057915057917, "grad_norm": 3.953125, "learning_rate": 1.1818798053217505e-10, "loss": 0.9117, "step": 2579 }, { "epoch": 1.9922779922779923, "grad_norm": 4.5, "learning_rate": 9.767628411094798e-11, "loss": 0.8467, "step": 2580 }, { "epoch": 1.993050193050193, "grad_norm": 12.875, "learning_rate": 7.911798590748421e-11, "loss": 0.787, "step": 2581 }, { "epoch": 1.9938223938223938, "grad_norm": 6.15625, "learning_rate": 6.251311492511125e-11, "loss": 0.9207, "step": 2582 }, { "epoch": 1.9945945945945946, "grad_norm": 4.25, "learning_rate": 4.786169711487598e-11, "loss": 0.9311, "step": 2583 }, { "epoch": 1.9953667953667953, "grad_norm": 4.6875, "learning_rate": 3.5163755374434394e-11, "loss": 0.9866, "step": 2584 }, { "epoch": 1.9961389961389961, "grad_norm": 4.71875, "learning_rate": 2.4419309548884274e-11, "loss": 0.8525, "step": 2585 }, { "epoch": 1.996911196911197, "grad_norm": 3.765625, "learning_rate": 1.5628376429793758e-11, "loss": 0.7833, "step": 2586 }, { "epoch": 1.9976833976833976, "grad_norm": 3.671875, "learning_rate": 8.790969756033995e-12, "loss": 0.8259, "step": 2587 }, { "epoch": 1.9984555984555985, "grad_norm": 3.859375, "learning_rate": 3.90710021364038e-12, "loss": 0.8914, "step": 2588 }, { "epoch": 1.9992277992277994, "grad_norm": 3.875, "learning_rate": 9.767754349798709e-13, "loss": 0.9336, "step": 2589 }, { "epoch": 2.0, "grad_norm": 4.15625, "learning_rate": 0.0, "loss": 0.8784, "step": 2590 } ], "logging_steps": 1, "max_steps": 2590, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 648, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.89502808588288e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }