{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999605631581023, "eval_steps": 768, "global_step": 6339, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015774736759080333, "grad_norm": 10.605186462402344, "learning_rate": 7.8125e-09, "loss": 1.7188, "step": 1 }, { "epoch": 0.00031549473518160666, "grad_norm": 10.251529693603516, "learning_rate": 1.5625e-08, "loss": 1.7404, "step": 2 }, { "epoch": 0.00047324210277240996, "grad_norm": 8.667778015136719, "learning_rate": 2.34375e-08, "loss": 1.6564, "step": 3 }, { "epoch": 0.0006309894703632133, "grad_norm": 9.559566497802734, "learning_rate": 3.125e-08, "loss": 1.692, "step": 4 }, { "epoch": 0.0007887368379540167, "grad_norm": 9.882808685302734, "learning_rate": 3.9062499999999997e-08, "loss": 1.7072, "step": 5 }, { "epoch": 0.0009464842055448199, "grad_norm": 9.359736442565918, "learning_rate": 4.6875e-08, "loss": 1.699, "step": 6 }, { "epoch": 0.0011042315731356234, "grad_norm": 9.632488250732422, "learning_rate": 5.46875e-08, "loss": 1.713, "step": 7 }, { "epoch": 0.0012619789407264266, "grad_norm": 10.159477233886719, "learning_rate": 6.25e-08, "loss": 1.7004, "step": 8 }, { "epoch": 0.0014197263083172299, "grad_norm": 8.86357593536377, "learning_rate": 7.03125e-08, "loss": 1.6718, "step": 9 }, { "epoch": 0.0015774736759080333, "grad_norm": 9.487836837768555, "learning_rate": 7.812499999999999e-08, "loss": 1.6739, "step": 10 }, { "epoch": 0.0017352210434988366, "grad_norm": 11.373818397521973, "learning_rate": 8.59375e-08, "loss": 1.7022, "step": 11 }, { "epoch": 0.0018929684110896398, "grad_norm": 8.63364315032959, "learning_rate": 9.375e-08, "loss": 1.6508, "step": 12 }, { "epoch": 0.0020507157786804433, "grad_norm": 9.603653907775879, "learning_rate": 1.015625e-07, "loss": 1.7101, "step": 13 }, { "epoch": 0.0022084631462712468, "grad_norm": 9.238273620605469, "learning_rate": 1.09375e-07, "loss": 1.6795, "step": 14 }, { "epoch": 0.00236621051386205, "grad_norm": 9.683836936950684, "learning_rate": 1.1718749999999999e-07, "loss": 1.6917, "step": 15 }, { "epoch": 0.0025239578814528533, "grad_norm": 10.214293479919434, "learning_rate": 1.25e-07, "loss": 1.6798, "step": 16 }, { "epoch": 0.0026817052490436567, "grad_norm": 9.098894119262695, "learning_rate": 1.328125e-07, "loss": 1.6695, "step": 17 }, { "epoch": 0.0028394526166344598, "grad_norm": 9.310189247131348, "learning_rate": 1.40625e-07, "loss": 1.6801, "step": 18 }, { "epoch": 0.0029971999842252632, "grad_norm": 9.319585800170898, "learning_rate": 1.4843749999999998e-07, "loss": 1.6781, "step": 19 }, { "epoch": 0.0031549473518160667, "grad_norm": 9.21566104888916, "learning_rate": 1.5624999999999999e-07, "loss": 1.6803, "step": 20 }, { "epoch": 0.0033126947194068697, "grad_norm": 8.562273025512695, "learning_rate": 1.640625e-07, "loss": 1.6577, "step": 21 }, { "epoch": 0.003470442086997673, "grad_norm": 15.55296802520752, "learning_rate": 1.71875e-07, "loss": 1.6569, "step": 22 }, { "epoch": 0.0036281894545884767, "grad_norm": 9.303814888000488, "learning_rate": 1.796875e-07, "loss": 1.6736, "step": 23 }, { "epoch": 0.0037859368221792797, "grad_norm": 10.956175804138184, "learning_rate": 1.875e-07, "loss": 1.7147, "step": 24 }, { "epoch": 0.003943684189770084, "grad_norm": 8.357747077941895, "learning_rate": 1.9531249999999998e-07, "loss": 1.6438, "step": 25 }, { "epoch": 0.004101431557360887, "grad_norm": 9.625614166259766, "learning_rate": 2.03125e-07, "loss": 1.6928, "step": 26 }, { "epoch": 0.00425917892495169, "grad_norm": 9.122293472290039, "learning_rate": 2.109375e-07, "loss": 1.6726, "step": 27 }, { "epoch": 0.0044169262925424935, "grad_norm": 9.233643531799316, "learning_rate": 2.1875e-07, "loss": 1.6593, "step": 28 }, { "epoch": 0.004574673660133297, "grad_norm": 8.721983909606934, "learning_rate": 2.265625e-07, "loss": 1.6597, "step": 29 }, { "epoch": 0.0047324210277241, "grad_norm": 8.40044116973877, "learning_rate": 2.3437499999999998e-07, "loss": 1.635, "step": 30 }, { "epoch": 0.0048901683953149035, "grad_norm": 9.595640182495117, "learning_rate": 2.421875e-07, "loss": 1.6837, "step": 31 }, { "epoch": 0.0050479157629057065, "grad_norm": 8.767451286315918, "learning_rate": 2.5e-07, "loss": 1.6323, "step": 32 }, { "epoch": 0.0052056631304965096, "grad_norm": 8.943890571594238, "learning_rate": 2.5781249999999997e-07, "loss": 1.6379, "step": 33 }, { "epoch": 0.0053634104980873135, "grad_norm": 9.733226776123047, "learning_rate": 2.65625e-07, "loss": 1.65, "step": 34 }, { "epoch": 0.0055211578656781165, "grad_norm": 8.848211288452148, "learning_rate": 2.734375e-07, "loss": 1.636, "step": 35 }, { "epoch": 0.0056789052332689195, "grad_norm": 8.562799453735352, "learning_rate": 2.8125e-07, "loss": 1.6429, "step": 36 }, { "epoch": 0.005836652600859723, "grad_norm": 8.89713191986084, "learning_rate": 2.890625e-07, "loss": 1.6226, "step": 37 }, { "epoch": 0.0059943999684505265, "grad_norm": 9.156657218933105, "learning_rate": 2.9687499999999996e-07, "loss": 1.6312, "step": 38 }, { "epoch": 0.0061521473360413295, "grad_norm": 9.411709785461426, "learning_rate": 3.046875e-07, "loss": 1.6134, "step": 39 }, { "epoch": 0.006309894703632133, "grad_norm": 8.37478256225586, "learning_rate": 3.1249999999999997e-07, "loss": 1.6248, "step": 40 }, { "epoch": 0.006467642071222936, "grad_norm": 9.141951560974121, "learning_rate": 3.203125e-07, "loss": 1.6487, "step": 41 }, { "epoch": 0.0066253894388137394, "grad_norm": 9.102831840515137, "learning_rate": 3.28125e-07, "loss": 1.6006, "step": 42 }, { "epoch": 0.006783136806404543, "grad_norm": 8.59057903289795, "learning_rate": 3.3593749999999996e-07, "loss": 1.6389, "step": 43 }, { "epoch": 0.006940884173995346, "grad_norm": 7.083738803863525, "learning_rate": 3.4375e-07, "loss": 1.5743, "step": 44 }, { "epoch": 0.007098631541586149, "grad_norm": 7.837166786193848, "learning_rate": 3.5156249999999997e-07, "loss": 1.5881, "step": 45 }, { "epoch": 0.007256378909176953, "grad_norm": 9.42550277709961, "learning_rate": 3.59375e-07, "loss": 1.6255, "step": 46 }, { "epoch": 0.007414126276767756, "grad_norm": 9.34886646270752, "learning_rate": 3.671875e-07, "loss": 1.6094, "step": 47 }, { "epoch": 0.007571873644358559, "grad_norm": 8.45085334777832, "learning_rate": 3.75e-07, "loss": 1.5775, "step": 48 }, { "epoch": 0.007729621011949363, "grad_norm": 9.391676902770996, "learning_rate": 3.828125e-07, "loss": 1.6273, "step": 49 }, { "epoch": 0.007887368379540167, "grad_norm": 8.811574935913086, "learning_rate": 3.9062499999999997e-07, "loss": 1.5805, "step": 50 }, { "epoch": 0.00804511574713097, "grad_norm": 8.590651512145996, "learning_rate": 3.984375e-07, "loss": 1.6052, "step": 51 }, { "epoch": 0.008202863114721773, "grad_norm": 9.803258895874023, "learning_rate": 4.0625e-07, "loss": 1.5911, "step": 52 }, { "epoch": 0.008360610482312576, "grad_norm": 10.158894538879395, "learning_rate": 4.140625e-07, "loss": 1.6011, "step": 53 }, { "epoch": 0.00851835784990338, "grad_norm": 8.869569778442383, "learning_rate": 4.21875e-07, "loss": 1.5658, "step": 54 }, { "epoch": 0.008676105217494182, "grad_norm": 9.200840950012207, "learning_rate": 4.2968749999999996e-07, "loss": 1.5631, "step": 55 }, { "epoch": 0.008833852585084987, "grad_norm": 9.003901481628418, "learning_rate": 4.375e-07, "loss": 1.5568, "step": 56 }, { "epoch": 0.00899159995267579, "grad_norm": 10.028145790100098, "learning_rate": 4.4531249999999997e-07, "loss": 1.5362, "step": 57 }, { "epoch": 0.009149347320266593, "grad_norm": 8.84991455078125, "learning_rate": 4.53125e-07, "loss": 1.5227, "step": 58 }, { "epoch": 0.009307094687857396, "grad_norm": 9.381906509399414, "learning_rate": 4.609375e-07, "loss": 1.5435, "step": 59 }, { "epoch": 0.0094648420554482, "grad_norm": 9.109251976013184, "learning_rate": 4.6874999999999996e-07, "loss": 1.5378, "step": 60 }, { "epoch": 0.009622589423039002, "grad_norm": 8.686064720153809, "learning_rate": 4.765625e-07, "loss": 1.5229, "step": 61 }, { "epoch": 0.009780336790629807, "grad_norm": 8.528047561645508, "learning_rate": 4.84375e-07, "loss": 1.5224, "step": 62 }, { "epoch": 0.00993808415822061, "grad_norm": 9.493647575378418, "learning_rate": 4.921875e-07, "loss": 1.5203, "step": 63 }, { "epoch": 0.010095831525811413, "grad_norm": 8.9859037399292, "learning_rate": 5e-07, "loss": 1.5174, "step": 64 }, { "epoch": 0.010253578893402216, "grad_norm": 9.881206512451172, "learning_rate": 5.078125e-07, "loss": 1.4765, "step": 65 }, { "epoch": 0.010411326260993019, "grad_norm": 9.306422233581543, "learning_rate": 5.156249999999999e-07, "loss": 1.4665, "step": 66 }, { "epoch": 0.010569073628583822, "grad_norm": 9.779091835021973, "learning_rate": 5.234375e-07, "loss": 1.486, "step": 67 }, { "epoch": 0.010726820996174627, "grad_norm": 10.422354698181152, "learning_rate": 5.3125e-07, "loss": 1.4697, "step": 68 }, { "epoch": 0.01088456836376543, "grad_norm": 9.95809555053711, "learning_rate": 5.390625e-07, "loss": 1.453, "step": 69 }, { "epoch": 0.011042315731356233, "grad_norm": 11.298678398132324, "learning_rate": 5.46875e-07, "loss": 1.4186, "step": 70 }, { "epoch": 0.011200063098947036, "grad_norm": 10.737797737121582, "learning_rate": 5.546874999999999e-07, "loss": 1.43, "step": 71 }, { "epoch": 0.011357810466537839, "grad_norm": 10.393301010131836, "learning_rate": 5.625e-07, "loss": 1.4079, "step": 72 }, { "epoch": 0.011515557834128644, "grad_norm": 10.883868217468262, "learning_rate": 5.703125e-07, "loss": 1.4192, "step": 73 }, { "epoch": 0.011673305201719447, "grad_norm": 16.47789192199707, "learning_rate": 5.78125e-07, "loss": 1.3894, "step": 74 }, { "epoch": 0.01183105256931025, "grad_norm": 11.472434997558594, "learning_rate": 5.859375e-07, "loss": 1.3653, "step": 75 }, { "epoch": 0.011988799936901053, "grad_norm": 12.478020668029785, "learning_rate": 5.937499999999999e-07, "loss": 1.3557, "step": 76 }, { "epoch": 0.012146547304491856, "grad_norm": 14.180948257446289, "learning_rate": 6.015625e-07, "loss": 1.3193, "step": 77 }, { "epoch": 0.012304294672082659, "grad_norm": 12.813949584960938, "learning_rate": 6.09375e-07, "loss": 1.3267, "step": 78 }, { "epoch": 0.012462042039673464, "grad_norm": 11.938271522521973, "learning_rate": 6.171875e-07, "loss": 1.2896, "step": 79 }, { "epoch": 0.012619789407264267, "grad_norm": 12.180197715759277, "learning_rate": 6.249999999999999e-07, "loss": 1.2662, "step": 80 }, { "epoch": 0.01277753677485507, "grad_norm": 10.459107398986816, "learning_rate": 6.328124999999999e-07, "loss": 1.2404, "step": 81 }, { "epoch": 0.012935284142445873, "grad_norm": 12.862483024597168, "learning_rate": 6.40625e-07, "loss": 1.1687, "step": 82 }, { "epoch": 0.013093031510036676, "grad_norm": 8.410504341125488, "learning_rate": 6.484375e-07, "loss": 1.2366, "step": 83 }, { "epoch": 0.013250778877627479, "grad_norm": 708.03125, "learning_rate": 6.5625e-07, "loss": 1.2195, "step": 84 }, { "epoch": 0.013408526245218284, "grad_norm": 8.076370239257812, "learning_rate": 6.640624999999999e-07, "loss": 1.1526, "step": 85 }, { "epoch": 0.013566273612809087, "grad_norm": 7.006606101989746, "learning_rate": 6.718749999999999e-07, "loss": 1.2265, "step": 86 }, { "epoch": 0.01372402098039989, "grad_norm": 7.100209712982178, "learning_rate": 6.796875e-07, "loss": 1.1768, "step": 87 }, { "epoch": 0.013881768347990693, "grad_norm": 8.758187294006348, "learning_rate": 6.875e-07, "loss": 1.1765, "step": 88 }, { "epoch": 0.014039515715581496, "grad_norm": 8.036221504211426, "learning_rate": 6.953125e-07, "loss": 1.1236, "step": 89 }, { "epoch": 0.014197263083172299, "grad_norm": 6.954113006591797, "learning_rate": 7.031249999999999e-07, "loss": 1.1204, "step": 90 }, { "epoch": 0.014355010450763104, "grad_norm": 7.305809497833252, "learning_rate": 7.109374999999999e-07, "loss": 1.1557, "step": 91 }, { "epoch": 0.014512757818353907, "grad_norm": 5.923920631408691, "learning_rate": 7.1875e-07, "loss": 1.0172, "step": 92 }, { "epoch": 0.01467050518594471, "grad_norm": 8.816319465637207, "learning_rate": 7.265625e-07, "loss": 1.1, "step": 93 }, { "epoch": 0.014828252553535513, "grad_norm": 8.399360656738281, "learning_rate": 7.34375e-07, "loss": 1.1316, "step": 94 }, { "epoch": 0.014985999921126316, "grad_norm": 8.26098346710205, "learning_rate": 7.421874999999999e-07, "loss": 1.059, "step": 95 }, { "epoch": 0.015143747288717119, "grad_norm": 7.330342769622803, "learning_rate": 7.5e-07, "loss": 1.0831, "step": 96 }, { "epoch": 0.015301494656307923, "grad_norm": 7.985196113586426, "learning_rate": 7.578125e-07, "loss": 1.0961, "step": 97 }, { "epoch": 0.015459242023898727, "grad_norm": 17.839303970336914, "learning_rate": 7.65625e-07, "loss": 1.0891, "step": 98 }, { "epoch": 0.01561698939148953, "grad_norm": 6.162847995758057, "learning_rate": 7.734375e-07, "loss": 1.0715, "step": 99 }, { "epoch": 0.015774736759080334, "grad_norm": 120.85552978515625, "learning_rate": 7.812499999999999e-07, "loss": 1.0538, "step": 100 }, { "epoch": 0.015932484126671136, "grad_norm": 6.262155055999756, "learning_rate": 7.890625e-07, "loss": 0.9359, "step": 101 }, { "epoch": 0.01609023149426194, "grad_norm": 8.068511962890625, "learning_rate": 7.96875e-07, "loss": 1.0444, "step": 102 }, { "epoch": 0.01624797886185274, "grad_norm": 5.990355968475342, "learning_rate": 8.046875e-07, "loss": 0.9598, "step": 103 }, { "epoch": 0.016405726229443546, "grad_norm": 7.656558990478516, "learning_rate": 8.125e-07, "loss": 1.0295, "step": 104 }, { "epoch": 0.01656347359703435, "grad_norm": 8.8897123336792, "learning_rate": 8.203124999999999e-07, "loss": 0.9684, "step": 105 }, { "epoch": 0.016721220964625153, "grad_norm": 6.440278053283691, "learning_rate": 8.28125e-07, "loss": 0.9391, "step": 106 }, { "epoch": 0.016878968332215957, "grad_norm": 6.96622896194458, "learning_rate": 8.359375e-07, "loss": 0.9678, "step": 107 }, { "epoch": 0.01703671569980676, "grad_norm": 6.747005939483643, "learning_rate": 8.4375e-07, "loss": 0.9858, "step": 108 }, { "epoch": 0.017194463067397563, "grad_norm": 7.434170722961426, "learning_rate": 8.515624999999999e-07, "loss": 0.9164, "step": 109 }, { "epoch": 0.017352210434988365, "grad_norm": 5.481407165527344, "learning_rate": 8.593749999999999e-07, "loss": 0.9099, "step": 110 }, { "epoch": 0.01750995780257917, "grad_norm": 7.978529930114746, "learning_rate": 8.671875e-07, "loss": 0.8959, "step": 111 }, { "epoch": 0.017667705170169974, "grad_norm": 15.46996021270752, "learning_rate": 8.75e-07, "loss": 0.9717, "step": 112 }, { "epoch": 0.017825452537760775, "grad_norm": 6.016064643859863, "learning_rate": 8.828125e-07, "loss": 0.9127, "step": 113 }, { "epoch": 0.01798319990535158, "grad_norm": 7.6805009841918945, "learning_rate": 8.906249999999999e-07, "loss": 0.8833, "step": 114 }, { "epoch": 0.01814094727294238, "grad_norm": 5.962435722351074, "learning_rate": 8.984374999999999e-07, "loss": 0.9002, "step": 115 }, { "epoch": 0.018298694640533186, "grad_norm": 34.05613327026367, "learning_rate": 9.0625e-07, "loss": 0.8052, "step": 116 }, { "epoch": 0.01845644200812399, "grad_norm": 7.545765399932861, "learning_rate": 9.140625e-07, "loss": 0.8975, "step": 117 }, { "epoch": 0.018614189375714792, "grad_norm": 6.937775611877441, "learning_rate": 9.21875e-07, "loss": 0.8179, "step": 118 }, { "epoch": 0.018771936743305597, "grad_norm": 7.694420337677002, "learning_rate": 9.296874999999999e-07, "loss": 0.8432, "step": 119 }, { "epoch": 0.0189296841108964, "grad_norm": 6.258421897888184, "learning_rate": 9.374999999999999e-07, "loss": 0.8252, "step": 120 }, { "epoch": 0.019087431478487203, "grad_norm": 6.795842170715332, "learning_rate": 9.453125e-07, "loss": 0.7777, "step": 121 }, { "epoch": 0.019245178846078004, "grad_norm": 8.250446319580078, "learning_rate": 9.53125e-07, "loss": 0.923, "step": 122 }, { "epoch": 0.01940292621366881, "grad_norm": 5.323596477508545, "learning_rate": 9.609374999999999e-07, "loss": 0.7119, "step": 123 }, { "epoch": 0.019560673581259614, "grad_norm": 6.934390544891357, "learning_rate": 9.6875e-07, "loss": 0.8482, "step": 124 }, { "epoch": 0.019718420948850415, "grad_norm": 6.222138404846191, "learning_rate": 9.765625e-07, "loss": 0.8095, "step": 125 }, { "epoch": 0.01987616831644122, "grad_norm": 8.063146591186523, "learning_rate": 9.84375e-07, "loss": 0.7474, "step": 126 }, { "epoch": 0.02003391568403202, "grad_norm": 8.791608810424805, "learning_rate": 9.921875e-07, "loss": 0.7099, "step": 127 }, { "epoch": 0.020191663051622826, "grad_norm": 9.693697929382324, "learning_rate": 1e-06, "loss": 0.6944, "step": 128 }, { "epoch": 0.02034941041921363, "grad_norm": 9.193809509277344, "learning_rate": 9.998389953308647e-07, "loss": 0.7153, "step": 129 }, { "epoch": 0.020507157786804432, "grad_norm": 5.666736125946045, "learning_rate": 9.996779906617292e-07, "loss": 0.6735, "step": 130 }, { "epoch": 0.020664905154395237, "grad_norm": 8.746193885803223, "learning_rate": 9.995169859925937e-07, "loss": 0.7298, "step": 131 }, { "epoch": 0.020822652521986038, "grad_norm": 30.897165298461914, "learning_rate": 9.993559813234584e-07, "loss": 0.6898, "step": 132 }, { "epoch": 0.020980399889576843, "grad_norm": 5.508756160736084, "learning_rate": 9.991949766543229e-07, "loss": 0.6829, "step": 133 }, { "epoch": 0.021138147257167644, "grad_norm": 9.121854782104492, "learning_rate": 9.990339719851876e-07, "loss": 0.6911, "step": 134 }, { "epoch": 0.02129589462475845, "grad_norm": 4.720840930938721, "learning_rate": 9.98872967316052e-07, "loss": 0.5923, "step": 135 }, { "epoch": 0.021453641992349254, "grad_norm": 5.468916416168213, "learning_rate": 9.987119626469168e-07, "loss": 0.6364, "step": 136 }, { "epoch": 0.021611389359940055, "grad_norm": 7.017789840698242, "learning_rate": 9.985509579777813e-07, "loss": 0.6458, "step": 137 }, { "epoch": 0.02176913672753086, "grad_norm": 7.074862003326416, "learning_rate": 9.98389953308646e-07, "loss": 0.5727, "step": 138 }, { "epoch": 0.02192688409512166, "grad_norm": 5.27205753326416, "learning_rate": 9.982289486395105e-07, "loss": 0.5757, "step": 139 }, { "epoch": 0.022084631462712466, "grad_norm": 5.0011677742004395, "learning_rate": 9.98067943970375e-07, "loss": 0.5238, "step": 140 }, { "epoch": 0.02224237883030327, "grad_norm": 6.091749668121338, "learning_rate": 9.979069393012397e-07, "loss": 0.5863, "step": 141 }, { "epoch": 0.022400126197894072, "grad_norm": 10.343565940856934, "learning_rate": 9.977459346321042e-07, "loss": 0.5043, "step": 142 }, { "epoch": 0.022557873565484877, "grad_norm": 9.453807830810547, "learning_rate": 9.975849299629687e-07, "loss": 0.5515, "step": 143 }, { "epoch": 0.022715620933075678, "grad_norm": 17.887365341186523, "learning_rate": 9.974239252938334e-07, "loss": 0.5898, "step": 144 }, { "epoch": 0.022873368300666483, "grad_norm": 4.972782611846924, "learning_rate": 9.972629206246982e-07, "loss": 0.5454, "step": 145 }, { "epoch": 0.023031115668257288, "grad_norm": 4.8516526222229, "learning_rate": 9.971019159555627e-07, "loss": 0.539, "step": 146 }, { "epoch": 0.02318886303584809, "grad_norm": 11.199058532714844, "learning_rate": 9.969409112864272e-07, "loss": 0.4849, "step": 147 }, { "epoch": 0.023346610403438894, "grad_norm": 4.304190158843994, "learning_rate": 9.967799066172919e-07, "loss": 0.5409, "step": 148 }, { "epoch": 0.023504357771029695, "grad_norm": 6.043247222900391, "learning_rate": 9.966189019481564e-07, "loss": 0.4697, "step": 149 }, { "epoch": 0.0236621051386205, "grad_norm": 4.9490966796875, "learning_rate": 9.96457897279021e-07, "loss": 0.5018, "step": 150 }, { "epoch": 0.0238198525062113, "grad_norm": 8.612019538879395, "learning_rate": 9.962968926098856e-07, "loss": 0.537, "step": 151 }, { "epoch": 0.023977599873802106, "grad_norm": 5.846463203430176, "learning_rate": 9.961358879407503e-07, "loss": 0.5215, "step": 152 }, { "epoch": 0.02413534724139291, "grad_norm": 8.96881103515625, "learning_rate": 9.959748832716148e-07, "loss": 0.5182, "step": 153 }, { "epoch": 0.024293094608983712, "grad_norm": 5.526561260223389, "learning_rate": 9.958138786024795e-07, "loss": 0.4682, "step": 154 }, { "epoch": 0.024450841976574517, "grad_norm": 5.215470314025879, "learning_rate": 9.95652873933344e-07, "loss": 0.5077, "step": 155 }, { "epoch": 0.024608589344165318, "grad_norm": 7.315175533294678, "learning_rate": 9.954918692642085e-07, "loss": 0.4901, "step": 156 }, { "epoch": 0.024766336711756123, "grad_norm": 5.9891037940979, "learning_rate": 9.953308645950732e-07, "loss": 0.4722, "step": 157 }, { "epoch": 0.024924084079346927, "grad_norm": 10.207083702087402, "learning_rate": 9.95169859925938e-07, "loss": 0.4682, "step": 158 }, { "epoch": 0.02508183144693773, "grad_norm": 4.114821434020996, "learning_rate": 9.950088552568024e-07, "loss": 0.4392, "step": 159 }, { "epoch": 0.025239578814528534, "grad_norm": 4.719483375549316, "learning_rate": 9.94847850587667e-07, "loss": 0.4616, "step": 160 }, { "epoch": 0.025397326182119335, "grad_norm": 5.122009754180908, "learning_rate": 9.946868459185316e-07, "loss": 0.4797, "step": 161 }, { "epoch": 0.02555507354971014, "grad_norm": 6.682754039764404, "learning_rate": 9.945258412493961e-07, "loss": 0.3869, "step": 162 }, { "epoch": 0.02571282091730094, "grad_norm": 5.727705001831055, "learning_rate": 9.943648365802609e-07, "loss": 0.4822, "step": 163 }, { "epoch": 0.025870568284891746, "grad_norm": 5.69813346862793, "learning_rate": 9.942038319111254e-07, "loss": 0.4607, "step": 164 }, { "epoch": 0.02602831565248255, "grad_norm": 14.972919464111328, "learning_rate": 9.940428272419899e-07, "loss": 0.3776, "step": 165 }, { "epoch": 0.02618606302007335, "grad_norm": 4.726632118225098, "learning_rate": 9.938818225728546e-07, "loss": 0.401, "step": 166 }, { "epoch": 0.026343810387664156, "grad_norm": 4.312942028045654, "learning_rate": 9.937208179037193e-07, "loss": 0.3945, "step": 167 }, { "epoch": 0.026501557755254958, "grad_norm": 10.081215858459473, "learning_rate": 9.935598132345838e-07, "loss": 0.4185, "step": 168 }, { "epoch": 0.026659305122845763, "grad_norm": 6.823802947998047, "learning_rate": 9.933988085654483e-07, "loss": 0.3749, "step": 169 }, { "epoch": 0.026817052490436567, "grad_norm": 5.444103717803955, "learning_rate": 9.93237803896313e-07, "loss": 0.3603, "step": 170 }, { "epoch": 0.02697479985802737, "grad_norm": 7.1029887199401855, "learning_rate": 9.930767992271775e-07, "loss": 0.3454, "step": 171 }, { "epoch": 0.027132547225618173, "grad_norm": 19.086397171020508, "learning_rate": 9.92915794558042e-07, "loss": 0.3951, "step": 172 }, { "epoch": 0.027290294593208975, "grad_norm": 6.493707180023193, "learning_rate": 9.927547898889067e-07, "loss": 0.3904, "step": 173 }, { "epoch": 0.02744804196079978, "grad_norm": 15.91336441040039, "learning_rate": 9.925937852197714e-07, "loss": 0.4409, "step": 174 }, { "epoch": 0.027605789328390584, "grad_norm": 3.995539426803589, "learning_rate": 9.92432780550636e-07, "loss": 0.3163, "step": 175 }, { "epoch": 0.027763536695981385, "grad_norm": 13.101820945739746, "learning_rate": 9.922717758815006e-07, "loss": 0.3018, "step": 176 }, { "epoch": 0.02792128406357219, "grad_norm": 5.566201686859131, "learning_rate": 9.921107712123651e-07, "loss": 0.3522, "step": 177 }, { "epoch": 0.02807903143116299, "grad_norm": 4.240898609161377, "learning_rate": 9.919497665432296e-07, "loss": 0.3409, "step": 178 }, { "epoch": 0.028236778798753796, "grad_norm": 5.057911396026611, "learning_rate": 9.917887618740943e-07, "loss": 0.2774, "step": 179 }, { "epoch": 0.028394526166344598, "grad_norm": 10.959158897399902, "learning_rate": 9.916277572049588e-07, "loss": 0.292, "step": 180 }, { "epoch": 0.028552273533935402, "grad_norm": 10.364619255065918, "learning_rate": 9.914667525358236e-07, "loss": 0.3145, "step": 181 }, { "epoch": 0.028710020901526207, "grad_norm": 6.3560004234313965, "learning_rate": 9.91305747866688e-07, "loss": 0.3154, "step": 182 }, { "epoch": 0.02886776826911701, "grad_norm": 17.252134323120117, "learning_rate": 9.911447431975528e-07, "loss": 0.3492, "step": 183 }, { "epoch": 0.029025515636707813, "grad_norm": 5.359389305114746, "learning_rate": 9.909837385284173e-07, "loss": 0.2964, "step": 184 }, { "epoch": 0.029183263004298615, "grad_norm": 9.04397201538086, "learning_rate": 9.908227338592818e-07, "loss": 0.2977, "step": 185 }, { "epoch": 0.02934101037188942, "grad_norm": 13.794440269470215, "learning_rate": 9.906617291901465e-07, "loss": 0.2825, "step": 186 }, { "epoch": 0.029498757739480224, "grad_norm": 5.914454460144043, "learning_rate": 9.90500724521011e-07, "loss": 0.3258, "step": 187 }, { "epoch": 0.029656505107071025, "grad_norm": 4.176006317138672, "learning_rate": 9.903397198518757e-07, "loss": 0.2532, "step": 188 }, { "epoch": 0.02981425247466183, "grad_norm": 4.511537075042725, "learning_rate": 9.901787151827402e-07, "loss": 0.3124, "step": 189 }, { "epoch": 0.02997199984225263, "grad_norm": 12.462079048156738, "learning_rate": 9.90017710513605e-07, "loss": 0.252, "step": 190 }, { "epoch": 0.030129747209843436, "grad_norm": 8.845741271972656, "learning_rate": 9.898567058444694e-07, "loss": 0.3113, "step": 191 }, { "epoch": 0.030287494577434237, "grad_norm": 7.979472637176514, "learning_rate": 9.896957011753341e-07, "loss": 0.25, "step": 192 }, { "epoch": 0.030445241945025042, "grad_norm": 5.637496471405029, "learning_rate": 9.895346965061986e-07, "loss": 0.2759, "step": 193 }, { "epoch": 0.030602989312615847, "grad_norm": 6.822127819061279, "learning_rate": 9.893736918370631e-07, "loss": 0.2119, "step": 194 }, { "epoch": 0.03076073668020665, "grad_norm": 7.106624126434326, "learning_rate": 9.892126871679278e-07, "loss": 0.3029, "step": 195 }, { "epoch": 0.030918484047797453, "grad_norm": 8.90538501739502, "learning_rate": 9.890516824987925e-07, "loss": 0.3089, "step": 196 }, { "epoch": 0.031076231415388254, "grad_norm": 7.088816165924072, "learning_rate": 9.88890677829657e-07, "loss": 0.3151, "step": 197 }, { "epoch": 0.03123397878297906, "grad_norm": 3.6720592975616455, "learning_rate": 9.887296731605215e-07, "loss": 0.2142, "step": 198 }, { "epoch": 0.031391726150569864, "grad_norm": 4.644659042358398, "learning_rate": 9.885686684913863e-07, "loss": 0.1983, "step": 199 }, { "epoch": 0.03154947351816067, "grad_norm": 6.448646545410156, "learning_rate": 9.884076638222508e-07, "loss": 0.2224, "step": 200 }, { "epoch": 0.031707220885751466, "grad_norm": 6.825787544250488, "learning_rate": 9.882466591531155e-07, "loss": 0.2749, "step": 201 }, { "epoch": 0.03186496825334227, "grad_norm": 5.165946960449219, "learning_rate": 9.8808565448398e-07, "loss": 0.2671, "step": 202 }, { "epoch": 0.032022715620933076, "grad_norm": 5.763522148132324, "learning_rate": 9.879246498148445e-07, "loss": 0.2456, "step": 203 }, { "epoch": 0.03218046298852388, "grad_norm": 5.943336486816406, "learning_rate": 9.877636451457092e-07, "loss": 0.255, "step": 204 }, { "epoch": 0.032338210356114686, "grad_norm": 13.404995918273926, "learning_rate": 9.87602640476574e-07, "loss": 0.2258, "step": 205 }, { "epoch": 0.03249595772370548, "grad_norm": 6.111392498016357, "learning_rate": 9.874416358074384e-07, "loss": 0.1813, "step": 206 }, { "epoch": 0.03265370509129629, "grad_norm": 6.81652307510376, "learning_rate": 9.87280631138303e-07, "loss": 0.3109, "step": 207 }, { "epoch": 0.03281145245888709, "grad_norm": 6.360020637512207, "learning_rate": 9.871196264691676e-07, "loss": 0.23, "step": 208 }, { "epoch": 0.0329691998264779, "grad_norm": 9.0701904296875, "learning_rate": 9.869586218000321e-07, "loss": 0.2352, "step": 209 }, { "epoch": 0.0331269471940687, "grad_norm": 5.986306667327881, "learning_rate": 9.867976171308966e-07, "loss": 0.2041, "step": 210 }, { "epoch": 0.0332846945616595, "grad_norm": 8.649664878845215, "learning_rate": 9.866366124617613e-07, "loss": 0.2144, "step": 211 }, { "epoch": 0.033442441929250305, "grad_norm": 8.318222045898438, "learning_rate": 9.86475607792626e-07, "loss": 0.2599, "step": 212 }, { "epoch": 0.03360018929684111, "grad_norm": 4.279948711395264, "learning_rate": 9.863146031234905e-07, "loss": 0.2252, "step": 213 }, { "epoch": 0.033757936664431915, "grad_norm": 9.401074409484863, "learning_rate": 9.86153598454355e-07, "loss": 0.2168, "step": 214 }, { "epoch": 0.03391568403202271, "grad_norm": 8.848161697387695, "learning_rate": 9.859925937852198e-07, "loss": 0.2156, "step": 215 }, { "epoch": 0.03407343139961352, "grad_norm": 6.798628807067871, "learning_rate": 9.858315891160843e-07, "loss": 0.2374, "step": 216 }, { "epoch": 0.03423117876720432, "grad_norm": 8.470605850219727, "learning_rate": 9.85670584446949e-07, "loss": 0.1854, "step": 217 }, { "epoch": 0.03438892613479513, "grad_norm": 5.13956880569458, "learning_rate": 9.855095797778135e-07, "loss": 0.1908, "step": 218 }, { "epoch": 0.03454667350238593, "grad_norm": 6.615530967712402, "learning_rate": 9.853485751086782e-07, "loss": 0.1661, "step": 219 }, { "epoch": 0.03470442086997673, "grad_norm": 9.583069801330566, "learning_rate": 9.851875704395427e-07, "loss": 0.2093, "step": 220 }, { "epoch": 0.034862168237567534, "grad_norm": 6.675043106079102, "learning_rate": 9.850265657704074e-07, "loss": 0.1603, "step": 221 }, { "epoch": 0.03501991560515834, "grad_norm": 7.040927886962891, "learning_rate": 9.848655611012719e-07, "loss": 0.2393, "step": 222 }, { "epoch": 0.035177662972749144, "grad_norm": 5.56865930557251, "learning_rate": 9.847045564321364e-07, "loss": 0.2089, "step": 223 }, { "epoch": 0.03533541034033995, "grad_norm": 8.290128707885742, "learning_rate": 9.84543551763001e-07, "loss": 0.1896, "step": 224 }, { "epoch": 0.035493157707930746, "grad_norm": 6.122547626495361, "learning_rate": 9.843825470938656e-07, "loss": 0.2031, "step": 225 }, { "epoch": 0.03565090507552155, "grad_norm": 6.4090094566345215, "learning_rate": 9.842215424247303e-07, "loss": 0.1658, "step": 226 }, { "epoch": 0.035808652443112356, "grad_norm": 7.354506969451904, "learning_rate": 9.840605377555948e-07, "loss": 0.1616, "step": 227 }, { "epoch": 0.03596639981070316, "grad_norm": 4.053621768951416, "learning_rate": 9.838995330864595e-07, "loss": 0.1381, "step": 228 }, { "epoch": 0.036124147178293965, "grad_norm": 8.635991096496582, "learning_rate": 9.83738528417324e-07, "loss": 0.148, "step": 229 }, { "epoch": 0.03628189454588476, "grad_norm": 5.523995399475098, "learning_rate": 9.835775237481887e-07, "loss": 0.1908, "step": 230 }, { "epoch": 0.03643964191347557, "grad_norm": 10.925482749938965, "learning_rate": 9.834165190790532e-07, "loss": 0.1971, "step": 231 }, { "epoch": 0.03659738928106637, "grad_norm": 7.450096607208252, "learning_rate": 9.832555144099177e-07, "loss": 0.1653, "step": 232 }, { "epoch": 0.03675513664865718, "grad_norm": 7.926517486572266, "learning_rate": 9.830945097407825e-07, "loss": 0.1813, "step": 233 }, { "epoch": 0.03691288401624798, "grad_norm": 16.86855125427246, "learning_rate": 9.829335050716472e-07, "loss": 0.153, "step": 234 }, { "epoch": 0.03707063138383878, "grad_norm": 4.815715312957764, "learning_rate": 9.827725004025117e-07, "loss": 0.2121, "step": 235 }, { "epoch": 0.037228378751429585, "grad_norm": 5.170910358428955, "learning_rate": 9.826114957333762e-07, "loss": 0.1429, "step": 236 }, { "epoch": 0.03738612611902039, "grad_norm": 5.570741653442383, "learning_rate": 9.824504910642409e-07, "loss": 0.1527, "step": 237 }, { "epoch": 0.037543873486611194, "grad_norm": 8.256614685058594, "learning_rate": 9.822894863951054e-07, "loss": 0.1797, "step": 238 }, { "epoch": 0.03770162085420199, "grad_norm": 4.869175910949707, "learning_rate": 9.821284817259699e-07, "loss": 0.1974, "step": 239 }, { "epoch": 0.0378593682217928, "grad_norm": 4.715397834777832, "learning_rate": 9.819674770568346e-07, "loss": 0.1887, "step": 240 }, { "epoch": 0.0380171155893836, "grad_norm": 7.1969404220581055, "learning_rate": 9.818064723876993e-07, "loss": 0.1525, "step": 241 }, { "epoch": 0.038174862956974406, "grad_norm": 5.895638942718506, "learning_rate": 9.816454677185638e-07, "loss": 0.1723, "step": 242 }, { "epoch": 0.03833261032456521, "grad_norm": 5.916856288909912, "learning_rate": 9.814844630494283e-07, "loss": 0.1762, "step": 243 }, { "epoch": 0.03849035769215601, "grad_norm": 7.5934367179870605, "learning_rate": 9.81323458380293e-07, "loss": 0.1887, "step": 244 }, { "epoch": 0.038648105059746814, "grad_norm": 6.210063934326172, "learning_rate": 9.811624537111575e-07, "loss": 0.1499, "step": 245 }, { "epoch": 0.03880585242733762, "grad_norm": 4.598802089691162, "learning_rate": 9.810014490420222e-07, "loss": 0.1259, "step": 246 }, { "epoch": 0.03896359979492842, "grad_norm": 12.598335266113281, "learning_rate": 9.808404443728867e-07, "loss": 0.2052, "step": 247 }, { "epoch": 0.03912134716251923, "grad_norm": 7.708802700042725, "learning_rate": 9.806794397037512e-07, "loss": 0.2005, "step": 248 }, { "epoch": 0.039279094530110026, "grad_norm": 6.15869665145874, "learning_rate": 9.80518435034616e-07, "loss": 0.1366, "step": 249 }, { "epoch": 0.03943684189770083, "grad_norm": 5.237630367279053, "learning_rate": 9.803574303654807e-07, "loss": 0.1429, "step": 250 }, { "epoch": 0.039594589265291635, "grad_norm": 4.9005842208862305, "learning_rate": 9.801964256963452e-07, "loss": 0.1147, "step": 251 }, { "epoch": 0.03975233663288244, "grad_norm": 129.9768524169922, "learning_rate": 9.800354210272097e-07, "loss": 0.1916, "step": 252 }, { "epoch": 0.039910084000473245, "grad_norm": 6.41878604888916, "learning_rate": 9.798744163580744e-07, "loss": 0.1334, "step": 253 }, { "epoch": 0.04006783136806404, "grad_norm": 6.262964725494385, "learning_rate": 9.797134116889389e-07, "loss": 0.1779, "step": 254 }, { "epoch": 0.04022557873565485, "grad_norm": 6.25661563873291, "learning_rate": 9.795524070198036e-07, "loss": 0.1479, "step": 255 }, { "epoch": 0.04038332610324565, "grad_norm": 5.191387176513672, "learning_rate": 9.79391402350668e-07, "loss": 0.133, "step": 256 }, { "epoch": 0.04054107347083646, "grad_norm": 7.769615650177002, "learning_rate": 9.792303976815328e-07, "loss": 0.1854, "step": 257 }, { "epoch": 0.04069882083842726, "grad_norm": 7.134732723236084, "learning_rate": 9.790693930123973e-07, "loss": 0.1926, "step": 258 }, { "epoch": 0.04085656820601806, "grad_norm": 5.413559436798096, "learning_rate": 9.78908388343262e-07, "loss": 0.1899, "step": 259 }, { "epoch": 0.041014315573608864, "grad_norm": 4.809391975402832, "learning_rate": 9.787473836741265e-07, "loss": 0.1293, "step": 260 }, { "epoch": 0.04117206294119967, "grad_norm": 5.961005687713623, "learning_rate": 9.78586379004991e-07, "loss": 0.1242, "step": 261 }, { "epoch": 0.041329810308790474, "grad_norm": 6.710011959075928, "learning_rate": 9.784253743358557e-07, "loss": 0.2335, "step": 262 }, { "epoch": 0.04148755767638128, "grad_norm": 8.22215747833252, "learning_rate": 9.782643696667202e-07, "loss": 0.2066, "step": 263 }, { "epoch": 0.041645305043972076, "grad_norm": 7.861912250518799, "learning_rate": 9.78103364997585e-07, "loss": 0.148, "step": 264 }, { "epoch": 0.04180305241156288, "grad_norm": 7.59419584274292, "learning_rate": 9.779423603284494e-07, "loss": 0.1698, "step": 265 }, { "epoch": 0.041960799779153686, "grad_norm": 6.258653163909912, "learning_rate": 9.777813556593141e-07, "loss": 0.2071, "step": 266 }, { "epoch": 0.04211854714674449, "grad_norm": 9.125118255615234, "learning_rate": 9.776203509901786e-07, "loss": 0.1292, "step": 267 }, { "epoch": 0.04227629451433529, "grad_norm": 6.597840309143066, "learning_rate": 9.774593463210434e-07, "loss": 0.1656, "step": 268 }, { "epoch": 0.04243404188192609, "grad_norm": 5.397624492645264, "learning_rate": 9.772983416519079e-07, "loss": 0.1225, "step": 269 }, { "epoch": 0.0425917892495169, "grad_norm": 5.821804523468018, "learning_rate": 9.771373369827724e-07, "loss": 0.1419, "step": 270 }, { "epoch": 0.0427495366171077, "grad_norm": 8.308530807495117, "learning_rate": 9.76976332313637e-07, "loss": 0.1593, "step": 271 }, { "epoch": 0.04290728398469851, "grad_norm": 6.071272373199463, "learning_rate": 9.768153276445018e-07, "loss": 0.1264, "step": 272 }, { "epoch": 0.043065031352289306, "grad_norm": 5.041172027587891, "learning_rate": 9.766543229753663e-07, "loss": 0.1934, "step": 273 }, { "epoch": 0.04322277871988011, "grad_norm": 6.917977809906006, "learning_rate": 9.764933183062308e-07, "loss": 0.0926, "step": 274 }, { "epoch": 0.043380526087470915, "grad_norm": 8.757829666137695, "learning_rate": 9.763323136370955e-07, "loss": 0.1772, "step": 275 }, { "epoch": 0.04353827345506172, "grad_norm": 4.275970458984375, "learning_rate": 9.7617130896796e-07, "loss": 0.1311, "step": 276 }, { "epoch": 0.043696020822652525, "grad_norm": 3.909147262573242, "learning_rate": 9.760103042988245e-07, "loss": 0.1522, "step": 277 }, { "epoch": 0.04385376819024332, "grad_norm": 9.127202987670898, "learning_rate": 9.758492996296892e-07, "loss": 0.1836, "step": 278 }, { "epoch": 0.04401151555783413, "grad_norm": 59.553279876708984, "learning_rate": 9.75688294960554e-07, "loss": 0.1497, "step": 279 }, { "epoch": 0.04416926292542493, "grad_norm": 8.238790512084961, "learning_rate": 9.755272902914184e-07, "loss": 0.1657, "step": 280 }, { "epoch": 0.04432701029301574, "grad_norm": 6.173616409301758, "learning_rate": 9.75366285622283e-07, "loss": 0.1609, "step": 281 }, { "epoch": 0.04448475766060654, "grad_norm": 4.654531955718994, "learning_rate": 9.752052809531476e-07, "loss": 0.115, "step": 282 }, { "epoch": 0.04464250502819734, "grad_norm": 3.280255079269409, "learning_rate": 9.750442762840121e-07, "loss": 0.0819, "step": 283 }, { "epoch": 0.044800252395788144, "grad_norm": 14.191688537597656, "learning_rate": 9.748832716148768e-07, "loss": 0.1864, "step": 284 }, { "epoch": 0.04495799976337895, "grad_norm": 8.970497131347656, "learning_rate": 9.747222669457413e-07, "loss": 0.1204, "step": 285 }, { "epoch": 0.045115747130969754, "grad_norm": 3.3083322048187256, "learning_rate": 9.74561262276606e-07, "loss": 0.1487, "step": 286 }, { "epoch": 0.04527349449856056, "grad_norm": 10.429438591003418, "learning_rate": 9.744002576074706e-07, "loss": 0.1775, "step": 287 }, { "epoch": 0.045431241866151356, "grad_norm": 5.250263690948486, "learning_rate": 9.742392529383353e-07, "loss": 0.165, "step": 288 }, { "epoch": 0.04558898923374216, "grad_norm": 20.89002227783203, "learning_rate": 9.740782482691998e-07, "loss": 0.2032, "step": 289 }, { "epoch": 0.045746736601332966, "grad_norm": 5.709809303283691, "learning_rate": 9.739172436000643e-07, "loss": 0.145, "step": 290 }, { "epoch": 0.04590448396892377, "grad_norm": 13.901089668273926, "learning_rate": 9.73756238930929e-07, "loss": 0.0993, "step": 291 }, { "epoch": 0.046062231336514575, "grad_norm": 9.392309188842773, "learning_rate": 9.735952342617935e-07, "loss": 0.1707, "step": 292 }, { "epoch": 0.04621997870410537, "grad_norm": 11.4054594039917, "learning_rate": 9.734342295926582e-07, "loss": 0.1482, "step": 293 }, { "epoch": 0.04637772607169618, "grad_norm": 4.401249885559082, "learning_rate": 9.732732249235227e-07, "loss": 0.1087, "step": 294 }, { "epoch": 0.04653547343928698, "grad_norm": 3.186814069747925, "learning_rate": 9.731122202543874e-07, "loss": 0.1321, "step": 295 }, { "epoch": 0.04669322080687779, "grad_norm": 3.693132162094116, "learning_rate": 9.72951215585252e-07, "loss": 0.1089, "step": 296 }, { "epoch": 0.046850968174468585, "grad_norm": 8.566479682922363, "learning_rate": 9.727902109161166e-07, "loss": 0.1497, "step": 297 }, { "epoch": 0.04700871554205939, "grad_norm": 4.2073211669921875, "learning_rate": 9.726292062469811e-07, "loss": 0.1217, "step": 298 }, { "epoch": 0.047166462909650195, "grad_norm": 6.404031276702881, "learning_rate": 9.724682015778456e-07, "loss": 0.1168, "step": 299 }, { "epoch": 0.047324210277241, "grad_norm": 9.464739799499512, "learning_rate": 9.723071969087103e-07, "loss": 0.1161, "step": 300 }, { "epoch": 0.047481957644831804, "grad_norm": 5.61821985244751, "learning_rate": 9.72146192239575e-07, "loss": 0.1246, "step": 301 }, { "epoch": 0.0476397050124226, "grad_norm": 7.7798991203308105, "learning_rate": 9.719851875704395e-07, "loss": 0.1115, "step": 302 }, { "epoch": 0.04779745238001341, "grad_norm": 4.6815290451049805, "learning_rate": 9.71824182901304e-07, "loss": 0.1037, "step": 303 }, { "epoch": 0.04795519974760421, "grad_norm": 4.269176006317139, "learning_rate": 9.716631782321688e-07, "loss": 0.1384, "step": 304 }, { "epoch": 0.048112947115195016, "grad_norm": 7.13702917098999, "learning_rate": 9.715021735630333e-07, "loss": 0.1228, "step": 305 }, { "epoch": 0.04827069448278582, "grad_norm": 4.8622727394104, "learning_rate": 9.713411688938978e-07, "loss": 0.1779, "step": 306 }, { "epoch": 0.04842844185037662, "grad_norm": 4.0259013175964355, "learning_rate": 9.711801642247625e-07, "loss": 0.1285, "step": 307 }, { "epoch": 0.048586189217967424, "grad_norm": 11.432190895080566, "learning_rate": 9.71019159555627e-07, "loss": 0.1798, "step": 308 }, { "epoch": 0.04874393658555823, "grad_norm": 10.696998596191406, "learning_rate": 9.708581548864917e-07, "loss": 0.1703, "step": 309 }, { "epoch": 0.04890168395314903, "grad_norm": 7.325720310211182, "learning_rate": 9.706971502173562e-07, "loss": 0.1329, "step": 310 }, { "epoch": 0.04905943132073984, "grad_norm": 4.719336032867432, "learning_rate": 9.70536145548221e-07, "loss": 0.0911, "step": 311 }, { "epoch": 0.049217178688330636, "grad_norm": 3.4688565731048584, "learning_rate": 9.703751408790854e-07, "loss": 0.0969, "step": 312 }, { "epoch": 0.04937492605592144, "grad_norm": 5.5488433837890625, "learning_rate": 9.702141362099501e-07, "loss": 0.1138, "step": 313 }, { "epoch": 0.049532673423512245, "grad_norm": 10.009937286376953, "learning_rate": 9.700531315408146e-07, "loss": 0.1413, "step": 314 }, { "epoch": 0.04969042079110305, "grad_norm": 6.785848617553711, "learning_rate": 9.698921268716791e-07, "loss": 0.1184, "step": 315 }, { "epoch": 0.049848168158693855, "grad_norm": 15.140206336975098, "learning_rate": 9.697311222025438e-07, "loss": 0.1195, "step": 316 }, { "epoch": 0.05000591552628465, "grad_norm": 7.2620930671691895, "learning_rate": 9.695701175334085e-07, "loss": 0.1006, "step": 317 }, { "epoch": 0.05016366289387546, "grad_norm": 6.647378444671631, "learning_rate": 9.69409112864273e-07, "loss": 0.0997, "step": 318 }, { "epoch": 0.05032141026146626, "grad_norm": 3.3054749965667725, "learning_rate": 9.692481081951375e-07, "loss": 0.0707, "step": 319 }, { "epoch": 0.05047915762905707, "grad_norm": 8.351949691772461, "learning_rate": 9.690871035260022e-07, "loss": 0.1555, "step": 320 }, { "epoch": 0.05063690499664787, "grad_norm": 6.779848575592041, "learning_rate": 9.689260988568667e-07, "loss": 0.0918, "step": 321 }, { "epoch": 0.05079465236423867, "grad_norm": 4.860285758972168, "learning_rate": 9.687650941877315e-07, "loss": 0.1497, "step": 322 }, { "epoch": 0.050952399731829474, "grad_norm": 9.101405143737793, "learning_rate": 9.68604089518596e-07, "loss": 0.0948, "step": 323 }, { "epoch": 0.05111014709942028, "grad_norm": 7.681159019470215, "learning_rate": 9.684430848494607e-07, "loss": 0.1159, "step": 324 }, { "epoch": 0.051267894467011084, "grad_norm": 11.528342247009277, "learning_rate": 9.682820801803252e-07, "loss": 0.186, "step": 325 }, { "epoch": 0.05142564183460188, "grad_norm": 9.96857738494873, "learning_rate": 9.681210755111899e-07, "loss": 0.1376, "step": 326 }, { "epoch": 0.051583389202192687, "grad_norm": 4.946990966796875, "learning_rate": 9.679600708420544e-07, "loss": 0.1328, "step": 327 }, { "epoch": 0.05174113656978349, "grad_norm": 7.803967475891113, "learning_rate": 9.677990661729189e-07, "loss": 0.1327, "step": 328 }, { "epoch": 0.051898883937374296, "grad_norm": 7.988615036010742, "learning_rate": 9.676380615037836e-07, "loss": 0.1666, "step": 329 }, { "epoch": 0.0520566313049651, "grad_norm": 5.772839546203613, "learning_rate": 9.67477056834648e-07, "loss": 0.0978, "step": 330 }, { "epoch": 0.0522143786725559, "grad_norm": 10.387200355529785, "learning_rate": 9.673160521655128e-07, "loss": 0.1152, "step": 331 }, { "epoch": 0.0523721260401467, "grad_norm": 5.836713790893555, "learning_rate": 9.671550474963773e-07, "loss": 0.0987, "step": 332 }, { "epoch": 0.05252987340773751, "grad_norm": 9.30050277709961, "learning_rate": 9.66994042827242e-07, "loss": 0.1603, "step": 333 }, { "epoch": 0.05268762077532831, "grad_norm": 7.9058451652526855, "learning_rate": 9.668330381581065e-07, "loss": 0.1947, "step": 334 }, { "epoch": 0.05284536814291912, "grad_norm": 7.316274642944336, "learning_rate": 9.66672033488971e-07, "loss": 0.1462, "step": 335 }, { "epoch": 0.053003115510509916, "grad_norm": 4.279013156890869, "learning_rate": 9.665110288198357e-07, "loss": 0.1366, "step": 336 }, { "epoch": 0.05316086287810072, "grad_norm": 10.732933044433594, "learning_rate": 9.663500241507002e-07, "loss": 0.1455, "step": 337 }, { "epoch": 0.053318610245691525, "grad_norm": 3.3139238357543945, "learning_rate": 9.66189019481565e-07, "loss": 0.0694, "step": 338 }, { "epoch": 0.05347635761328233, "grad_norm": 6.18437385559082, "learning_rate": 9.660280148124297e-07, "loss": 0.082, "step": 339 }, { "epoch": 0.053634104980873135, "grad_norm": 12.412300109863281, "learning_rate": 9.658670101432942e-07, "loss": 0.1503, "step": 340 }, { "epoch": 0.05379185234846393, "grad_norm": 6.401512145996094, "learning_rate": 9.657060054741587e-07, "loss": 0.0825, "step": 341 }, { "epoch": 0.05394959971605474, "grad_norm": 7.424494743347168, "learning_rate": 9.655450008050234e-07, "loss": 0.0997, "step": 342 }, { "epoch": 0.05410734708364554, "grad_norm": 9.988298416137695, "learning_rate": 9.653839961358879e-07, "loss": 0.2279, "step": 343 }, { "epoch": 0.05426509445123635, "grad_norm": 10.520110130310059, "learning_rate": 9.652229914667524e-07, "loss": 0.1491, "step": 344 }, { "epoch": 0.05442284181882715, "grad_norm": 5.183448314666748, "learning_rate": 9.65061986797617e-07, "loss": 0.2024, "step": 345 }, { "epoch": 0.05458058918641795, "grad_norm": 4.082580089569092, "learning_rate": 9.649009821284818e-07, "loss": 0.1181, "step": 346 }, { "epoch": 0.054738336554008754, "grad_norm": 10.461071968078613, "learning_rate": 9.647399774593463e-07, "loss": 0.1077, "step": 347 }, { "epoch": 0.05489608392159956, "grad_norm": 4.259407997131348, "learning_rate": 9.645789727902108e-07, "loss": 0.1165, "step": 348 }, { "epoch": 0.055053831289190364, "grad_norm": 7.56532621383667, "learning_rate": 9.644179681210755e-07, "loss": 0.1025, "step": 349 }, { "epoch": 0.05521157865678117, "grad_norm": 4.267522811889648, "learning_rate": 9.6425696345194e-07, "loss": 0.1156, "step": 350 }, { "epoch": 0.055369326024371966, "grad_norm": 4.609867095947266, "learning_rate": 9.640959587828047e-07, "loss": 0.1236, "step": 351 }, { "epoch": 0.05552707339196277, "grad_norm": 12.157086372375488, "learning_rate": 9.639349541136692e-07, "loss": 0.1221, "step": 352 }, { "epoch": 0.055684820759553576, "grad_norm": 9.895761489868164, "learning_rate": 9.637739494445337e-07, "loss": 0.1346, "step": 353 }, { "epoch": 0.05584256812714438, "grad_norm": 10.083800315856934, "learning_rate": 9.636129447753984e-07, "loss": 0.1094, "step": 354 }, { "epoch": 0.05600031549473518, "grad_norm": 5.506272315979004, "learning_rate": 9.634519401062632e-07, "loss": 0.0883, "step": 355 }, { "epoch": 0.05615806286232598, "grad_norm": 7.488781452178955, "learning_rate": 9.632909354371277e-07, "loss": 0.1136, "step": 356 }, { "epoch": 0.05631581022991679, "grad_norm": 5.5719146728515625, "learning_rate": 9.631299307679922e-07, "loss": 0.1914, "step": 357 }, { "epoch": 0.05647355759750759, "grad_norm": 11.344675064086914, "learning_rate": 9.629689260988569e-07, "loss": 0.1, "step": 358 }, { "epoch": 0.0566313049650984, "grad_norm": 12.2915620803833, "learning_rate": 9.628079214297214e-07, "loss": 0.1473, "step": 359 }, { "epoch": 0.056789052332689195, "grad_norm": 7.278500080108643, "learning_rate": 9.626469167605859e-07, "loss": 0.1206, "step": 360 }, { "epoch": 0.05694679970028, "grad_norm": 10.03773307800293, "learning_rate": 9.624859120914506e-07, "loss": 0.1689, "step": 361 }, { "epoch": 0.057104547067870805, "grad_norm": 7.41524600982666, "learning_rate": 9.623249074223153e-07, "loss": 0.0993, "step": 362 }, { "epoch": 0.05726229443546161, "grad_norm": 8.416111946105957, "learning_rate": 9.621639027531798e-07, "loss": 0.1379, "step": 363 }, { "epoch": 0.057420041803052414, "grad_norm": 5.612109184265137, "learning_rate": 9.620028980840445e-07, "loss": 0.0697, "step": 364 }, { "epoch": 0.05757778917064321, "grad_norm": 5.46721887588501, "learning_rate": 9.61841893414909e-07, "loss": 0.0799, "step": 365 }, { "epoch": 0.05773553653823402, "grad_norm": 8.063959121704102, "learning_rate": 9.616808887457735e-07, "loss": 0.073, "step": 366 }, { "epoch": 0.05789328390582482, "grad_norm": 7.9931559562683105, "learning_rate": 9.615198840766382e-07, "loss": 0.1438, "step": 367 }, { "epoch": 0.058051031273415626, "grad_norm": 5.741889953613281, "learning_rate": 9.613588794075027e-07, "loss": 0.0832, "step": 368 }, { "epoch": 0.05820877864100643, "grad_norm": 4.304195880889893, "learning_rate": 9.611978747383674e-07, "loss": 0.1258, "step": 369 }, { "epoch": 0.05836652600859723, "grad_norm": 9.83571720123291, "learning_rate": 9.61036870069232e-07, "loss": 0.1328, "step": 370 }, { "epoch": 0.058524273376188034, "grad_norm": 4.696883201599121, "learning_rate": 9.608758654000966e-07, "loss": 0.1282, "step": 371 }, { "epoch": 0.05868202074377884, "grad_norm": 5.337697505950928, "learning_rate": 9.607148607309611e-07, "loss": 0.1203, "step": 372 }, { "epoch": 0.05883976811136964, "grad_norm": 6.526247501373291, "learning_rate": 9.605538560618256e-07, "loss": 0.1294, "step": 373 }, { "epoch": 0.05899751547896045, "grad_norm": 4.350911617279053, "learning_rate": 9.603928513926904e-07, "loss": 0.1189, "step": 374 }, { "epoch": 0.059155262846551246, "grad_norm": 6.42826509475708, "learning_rate": 9.602318467235549e-07, "loss": 0.0815, "step": 375 }, { "epoch": 0.05931301021414205, "grad_norm": 7.661411285400391, "learning_rate": 9.600708420544196e-07, "loss": 0.0944, "step": 376 }, { "epoch": 0.059470757581732855, "grad_norm": 5.3927764892578125, "learning_rate": 9.59909837385284e-07, "loss": 0.0781, "step": 377 }, { "epoch": 0.05962850494932366, "grad_norm": 5.518739223480225, "learning_rate": 9.597488327161488e-07, "loss": 0.0927, "step": 378 }, { "epoch": 0.059786252316914465, "grad_norm": 5.976505756378174, "learning_rate": 9.595878280470133e-07, "loss": 0.1466, "step": 379 }, { "epoch": 0.05994399968450526, "grad_norm": 5.26506233215332, "learning_rate": 9.59426823377878e-07, "loss": 0.1124, "step": 380 }, { "epoch": 0.06010174705209607, "grad_norm": 4.402028560638428, "learning_rate": 9.592658187087425e-07, "loss": 0.0617, "step": 381 }, { "epoch": 0.06025949441968687, "grad_norm": 5.284600257873535, "learning_rate": 9.59104814039607e-07, "loss": 0.0855, "step": 382 }, { "epoch": 0.06041724178727768, "grad_norm": 3.978062391281128, "learning_rate": 9.589438093704717e-07, "loss": 0.1116, "step": 383 }, { "epoch": 0.060574989154868475, "grad_norm": 4.164371490478516, "learning_rate": 9.587828047013364e-07, "loss": 0.1204, "step": 384 }, { "epoch": 0.06073273652245928, "grad_norm": 7.169449806213379, "learning_rate": 9.58621800032201e-07, "loss": 0.1236, "step": 385 }, { "epoch": 0.060890483890050084, "grad_norm": 6.995155334472656, "learning_rate": 9.584607953630654e-07, "loss": 0.1152, "step": 386 }, { "epoch": 0.06104823125764089, "grad_norm": 8.239535331726074, "learning_rate": 9.582997906939301e-07, "loss": 0.0693, "step": 387 }, { "epoch": 0.061205978625231694, "grad_norm": 4.845700263977051, "learning_rate": 9.581387860247946e-07, "loss": 0.1102, "step": 388 }, { "epoch": 0.06136372599282249, "grad_norm": 5.36713981628418, "learning_rate": 9.579777813556593e-07, "loss": 0.0923, "step": 389 }, { "epoch": 0.0615214733604133, "grad_norm": 7.444408416748047, "learning_rate": 9.578167766865238e-07, "loss": 0.1109, "step": 390 }, { "epoch": 0.0616792207280041, "grad_norm": 5.161166191101074, "learning_rate": 9.576557720173886e-07, "loss": 0.101, "step": 391 }, { "epoch": 0.061836968095594906, "grad_norm": 6.004199028015137, "learning_rate": 9.57494767348253e-07, "loss": 0.1083, "step": 392 }, { "epoch": 0.06199471546318571, "grad_norm": 9.09014892578125, "learning_rate": 9.573337626791178e-07, "loss": 0.1364, "step": 393 }, { "epoch": 0.06215246283077651, "grad_norm": 4.672868251800537, "learning_rate": 9.571727580099823e-07, "loss": 0.1331, "step": 394 }, { "epoch": 0.06231021019836731, "grad_norm": 8.290534019470215, "learning_rate": 9.570117533408468e-07, "loss": 0.1124, "step": 395 }, { "epoch": 0.06246795756595812, "grad_norm": 4.446094989776611, "learning_rate": 9.568507486717115e-07, "loss": 0.1108, "step": 396 }, { "epoch": 0.06262570493354892, "grad_norm": 11.480560302734375, "learning_rate": 9.56689744002576e-07, "loss": 0.1627, "step": 397 }, { "epoch": 0.06278345230113973, "grad_norm": 5.206017971038818, "learning_rate": 9.565287393334405e-07, "loss": 0.1112, "step": 398 }, { "epoch": 0.06294119966873053, "grad_norm": 7.868137836456299, "learning_rate": 9.563677346643052e-07, "loss": 0.1475, "step": 399 }, { "epoch": 0.06309894703632134, "grad_norm": 11.411048889160156, "learning_rate": 9.5620672999517e-07, "loss": 0.1333, "step": 400 }, { "epoch": 0.06325669440391213, "grad_norm": 6.512931823730469, "learning_rate": 9.560457253260344e-07, "loss": 0.1181, "step": 401 }, { "epoch": 0.06341444177150293, "grad_norm": 4.9762864112854, "learning_rate": 9.55884720656899e-07, "loss": 0.123, "step": 402 }, { "epoch": 0.06357218913909374, "grad_norm": 4.826909065246582, "learning_rate": 9.557237159877636e-07, "loss": 0.1399, "step": 403 }, { "epoch": 0.06372993650668454, "grad_norm": 23.437101364135742, "learning_rate": 9.555627113186281e-07, "loss": 0.1496, "step": 404 }, { "epoch": 0.06388768387427535, "grad_norm": 8.380960464477539, "learning_rate": 9.554017066494928e-07, "loss": 0.1632, "step": 405 }, { "epoch": 0.06404543124186615, "grad_norm": 6.791145324707031, "learning_rate": 9.552407019803573e-07, "loss": 0.1224, "step": 406 }, { "epoch": 0.06420317860945696, "grad_norm": 8.910951614379883, "learning_rate": 9.55079697311222e-07, "loss": 0.1235, "step": 407 }, { "epoch": 0.06436092597704776, "grad_norm": 6.297551155090332, "learning_rate": 9.549186926420865e-07, "loss": 0.0877, "step": 408 }, { "epoch": 0.06451867334463857, "grad_norm": 5.6142497062683105, "learning_rate": 9.547576879729513e-07, "loss": 0.0899, "step": 409 }, { "epoch": 0.06467642071222937, "grad_norm": 5.524415016174316, "learning_rate": 9.545966833038158e-07, "loss": 0.1246, "step": 410 }, { "epoch": 0.06483416807982016, "grad_norm": 6.010134220123291, "learning_rate": 9.544356786346803e-07, "loss": 0.0955, "step": 411 }, { "epoch": 0.06499191544741097, "grad_norm": 6.693892002105713, "learning_rate": 9.54274673965545e-07, "loss": 0.128, "step": 412 }, { "epoch": 0.06514966281500177, "grad_norm": 9.232756614685059, "learning_rate": 9.541136692964095e-07, "loss": 0.0881, "step": 413 }, { "epoch": 0.06530741018259258, "grad_norm": 6.193661689758301, "learning_rate": 9.539526646272742e-07, "loss": 0.1144, "step": 414 }, { "epoch": 0.06546515755018338, "grad_norm": 4.4394683837890625, "learning_rate": 9.537916599581387e-07, "loss": 0.0497, "step": 415 }, { "epoch": 0.06562290491777419, "grad_norm": 4.97087287902832, "learning_rate": 9.536306552890033e-07, "loss": 0.0984, "step": 416 }, { "epoch": 0.06578065228536499, "grad_norm": 3.874796152114868, "learning_rate": 9.534696506198679e-07, "loss": 0.1162, "step": 417 }, { "epoch": 0.0659383996529558, "grad_norm": 7.771538734436035, "learning_rate": 9.533086459507326e-07, "loss": 0.1724, "step": 418 }, { "epoch": 0.0660961470205466, "grad_norm": 5.318310260772705, "learning_rate": 9.531476412815971e-07, "loss": 0.1136, "step": 419 }, { "epoch": 0.0662538943881374, "grad_norm": 7.3136372566223145, "learning_rate": 9.529866366124617e-07, "loss": 0.104, "step": 420 }, { "epoch": 0.0664116417557282, "grad_norm": 10.768095970153809, "learning_rate": 9.528256319433263e-07, "loss": 0.125, "step": 421 }, { "epoch": 0.066569389123319, "grad_norm": 2.916339159011841, "learning_rate": 9.526646272741909e-07, "loss": 0.0639, "step": 422 }, { "epoch": 0.0667271364909098, "grad_norm": 9.273578643798828, "learning_rate": 9.525036226050554e-07, "loss": 0.11, "step": 423 }, { "epoch": 0.06688488385850061, "grad_norm": 4.831707000732422, "learning_rate": 9.5234261793592e-07, "loss": 0.1128, "step": 424 }, { "epoch": 0.06704263122609141, "grad_norm": 10.874245643615723, "learning_rate": 9.521816132667847e-07, "loss": 0.1701, "step": 425 }, { "epoch": 0.06720037859368222, "grad_norm": 5.706363201141357, "learning_rate": 9.520206085976494e-07, "loss": 0.1301, "step": 426 }, { "epoch": 0.06735812596127302, "grad_norm": 4.783963680267334, "learning_rate": 9.518596039285139e-07, "loss": 0.0934, "step": 427 }, { "epoch": 0.06751587332886383, "grad_norm": 8.126564979553223, "learning_rate": 9.516985992593785e-07, "loss": 0.1474, "step": 428 }, { "epoch": 0.06767362069645463, "grad_norm": 5.196165084838867, "learning_rate": 9.515375945902431e-07, "loss": 0.0903, "step": 429 }, { "epoch": 0.06783136806404542, "grad_norm": 15.124307632446289, "learning_rate": 9.513765899211077e-07, "loss": 0.1804, "step": 430 }, { "epoch": 0.06798911543163623, "grad_norm": 5.490636825561523, "learning_rate": 9.512155852519723e-07, "loss": 0.0729, "step": 431 }, { "epoch": 0.06814686279922703, "grad_norm": 6.503512859344482, "learning_rate": 9.510545805828368e-07, "loss": 0.1389, "step": 432 }, { "epoch": 0.06830461016681784, "grad_norm": 10.483194351196289, "learning_rate": 9.508935759137015e-07, "loss": 0.0925, "step": 433 }, { "epoch": 0.06846235753440864, "grad_norm": 6.307246208190918, "learning_rate": 9.507325712445661e-07, "loss": 0.0968, "step": 434 }, { "epoch": 0.06862010490199945, "grad_norm": 6.193094253540039, "learning_rate": 9.505715665754307e-07, "loss": 0.1141, "step": 435 }, { "epoch": 0.06877785226959025, "grad_norm": 68.03524017333984, "learning_rate": 9.504105619062952e-07, "loss": 0.1235, "step": 436 }, { "epoch": 0.06893559963718106, "grad_norm": 5.203610420227051, "learning_rate": 9.502495572371598e-07, "loss": 0.0933, "step": 437 }, { "epoch": 0.06909334700477186, "grad_norm": 7.363839626312256, "learning_rate": 9.500885525680244e-07, "loss": 0.1038, "step": 438 }, { "epoch": 0.06925109437236267, "grad_norm": 5.326133728027344, "learning_rate": 9.49927547898889e-07, "loss": 0.1036, "step": 439 }, { "epoch": 0.06940884173995346, "grad_norm": 7.377047061920166, "learning_rate": 9.497665432297536e-07, "loss": 0.0791, "step": 440 }, { "epoch": 0.06956658910754426, "grad_norm": 6.677900791168213, "learning_rate": 9.496055385606182e-07, "loss": 0.1382, "step": 441 }, { "epoch": 0.06972433647513507, "grad_norm": 4.663441181182861, "learning_rate": 9.494445338914828e-07, "loss": 0.0971, "step": 442 }, { "epoch": 0.06988208384272587, "grad_norm": 8.59867000579834, "learning_rate": 9.492835292223474e-07, "loss": 0.1171, "step": 443 }, { "epoch": 0.07003983121031668, "grad_norm": 7.579411029815674, "learning_rate": 9.49122524553212e-07, "loss": 0.1495, "step": 444 }, { "epoch": 0.07019757857790748, "grad_norm": 7.643759727478027, "learning_rate": 9.489615198840766e-07, "loss": 0.0885, "step": 445 }, { "epoch": 0.07035532594549829, "grad_norm": 9.882652282714844, "learning_rate": 9.488005152149412e-07, "loss": 0.0885, "step": 446 }, { "epoch": 0.07051307331308909, "grad_norm": 5.170772075653076, "learning_rate": 9.486395105458058e-07, "loss": 0.1332, "step": 447 }, { "epoch": 0.0706708206806799, "grad_norm": 6.967252254486084, "learning_rate": 9.484785058766704e-07, "loss": 0.0816, "step": 448 }, { "epoch": 0.0708285680482707, "grad_norm": 7.689639568328857, "learning_rate": 9.48317501207535e-07, "loss": 0.097, "step": 449 }, { "epoch": 0.07098631541586149, "grad_norm": 4.52255916595459, "learning_rate": 9.481564965383996e-07, "loss": 0.0501, "step": 450 }, { "epoch": 0.0711440627834523, "grad_norm": 6.25842809677124, "learning_rate": 9.479954918692642e-07, "loss": 0.1615, "step": 451 }, { "epoch": 0.0713018101510431, "grad_norm": 7.390439987182617, "learning_rate": 9.478344872001287e-07, "loss": 0.0939, "step": 452 }, { "epoch": 0.0714595575186339, "grad_norm": 6.982767581939697, "learning_rate": 9.476734825309933e-07, "loss": 0.057, "step": 453 }, { "epoch": 0.07161730488622471, "grad_norm": 5.735555171966553, "learning_rate": 9.475124778618579e-07, "loss": 0.0916, "step": 454 }, { "epoch": 0.07177505225381552, "grad_norm": 6.87061071395874, "learning_rate": 9.473514731927226e-07, "loss": 0.1062, "step": 455 }, { "epoch": 0.07193279962140632, "grad_norm": 4.926663875579834, "learning_rate": 9.471904685235872e-07, "loss": 0.0669, "step": 456 }, { "epoch": 0.07209054698899713, "grad_norm": 6.2761311531066895, "learning_rate": 9.470294638544517e-07, "loss": 0.1445, "step": 457 }, { "epoch": 0.07224829435658793, "grad_norm": 6.018259525299072, "learning_rate": 9.468684591853163e-07, "loss": 0.1123, "step": 458 }, { "epoch": 0.07240604172417872, "grad_norm": 6.722275257110596, "learning_rate": 9.467074545161809e-07, "loss": 0.1306, "step": 459 }, { "epoch": 0.07256378909176953, "grad_norm": 5.744132995605469, "learning_rate": 9.465464498470455e-07, "loss": 0.0828, "step": 460 }, { "epoch": 0.07272153645936033, "grad_norm": 3.327827215194702, "learning_rate": 9.4638544517791e-07, "loss": 0.0999, "step": 461 }, { "epoch": 0.07287928382695114, "grad_norm": 6.590383052825928, "learning_rate": 9.462244405087747e-07, "loss": 0.0878, "step": 462 }, { "epoch": 0.07303703119454194, "grad_norm": 4.286330223083496, "learning_rate": 9.460634358396394e-07, "loss": 0.0908, "step": 463 }, { "epoch": 0.07319477856213275, "grad_norm": 6.676567077636719, "learning_rate": 9.45902431170504e-07, "loss": 0.1406, "step": 464 }, { "epoch": 0.07335252592972355, "grad_norm": 52.79438018798828, "learning_rate": 9.457414265013685e-07, "loss": 0.1012, "step": 465 }, { "epoch": 0.07351027329731435, "grad_norm": 4.823317527770996, "learning_rate": 9.455804218322331e-07, "loss": 0.1164, "step": 466 }, { "epoch": 0.07366802066490516, "grad_norm": 4.268604755401611, "learning_rate": 9.454194171630977e-07, "loss": 0.0871, "step": 467 }, { "epoch": 0.07382576803249596, "grad_norm": 7.7923760414123535, "learning_rate": 9.452584124939623e-07, "loss": 0.0609, "step": 468 }, { "epoch": 0.07398351540008676, "grad_norm": 7.410219669342041, "learning_rate": 9.450974078248268e-07, "loss": 0.0679, "step": 469 }, { "epoch": 0.07414126276767756, "grad_norm": 4.796772480010986, "learning_rate": 9.449364031556915e-07, "loss": 0.0514, "step": 470 }, { "epoch": 0.07429901013526836, "grad_norm": 11.331911087036133, "learning_rate": 9.447753984865561e-07, "loss": 0.1235, "step": 471 }, { "epoch": 0.07445675750285917, "grad_norm": 6.432934284210205, "learning_rate": 9.446143938174207e-07, "loss": 0.1039, "step": 472 }, { "epoch": 0.07461450487044997, "grad_norm": 6.318420886993408, "learning_rate": 9.444533891482852e-07, "loss": 0.1477, "step": 473 }, { "epoch": 0.07477225223804078, "grad_norm": 12.588046073913574, "learning_rate": 9.442923844791498e-07, "loss": 0.0832, "step": 474 }, { "epoch": 0.07492999960563158, "grad_norm": 6.165432453155518, "learning_rate": 9.441313798100144e-07, "loss": 0.1009, "step": 475 }, { "epoch": 0.07508774697322239, "grad_norm": 5.55441951751709, "learning_rate": 9.43970375140879e-07, "loss": 0.0968, "step": 476 }, { "epoch": 0.0752454943408132, "grad_norm": 11.043139457702637, "learning_rate": 9.438093704717436e-07, "loss": 0.0829, "step": 477 }, { "epoch": 0.07540324170840398, "grad_norm": 4.340719699859619, "learning_rate": 9.436483658026082e-07, "loss": 0.1333, "step": 478 }, { "epoch": 0.07556098907599479, "grad_norm": 4.043093204498291, "learning_rate": 9.434873611334729e-07, "loss": 0.1048, "step": 479 }, { "epoch": 0.0757187364435856, "grad_norm": 7.1659417152404785, "learning_rate": 9.433263564643375e-07, "loss": 0.0923, "step": 480 }, { "epoch": 0.0758764838111764, "grad_norm": 12.38833999633789, "learning_rate": 9.431653517952021e-07, "loss": 0.1171, "step": 481 }, { "epoch": 0.0760342311787672, "grad_norm": 8.507014274597168, "learning_rate": 9.430043471260666e-07, "loss": 0.077, "step": 482 }, { "epoch": 0.07619197854635801, "grad_norm": 3.959944725036621, "learning_rate": 9.428433424569312e-07, "loss": 0.0501, "step": 483 }, { "epoch": 0.07634972591394881, "grad_norm": 4.138267993927002, "learning_rate": 9.426823377877958e-07, "loss": 0.1075, "step": 484 }, { "epoch": 0.07650747328153962, "grad_norm": 6.563967704772949, "learning_rate": 9.425213331186605e-07, "loss": 0.1285, "step": 485 }, { "epoch": 0.07666522064913042, "grad_norm": 5.721851825714111, "learning_rate": 9.42360328449525e-07, "loss": 0.0935, "step": 486 }, { "epoch": 0.07682296801672123, "grad_norm": 4.658355712890625, "learning_rate": 9.421993237803896e-07, "loss": 0.0714, "step": 487 }, { "epoch": 0.07698071538431202, "grad_norm": 7.072540760040283, "learning_rate": 9.420383191112542e-07, "loss": 0.1746, "step": 488 }, { "epoch": 0.07713846275190282, "grad_norm": 8.619388580322266, "learning_rate": 9.418773144421188e-07, "loss": 0.0629, "step": 489 }, { "epoch": 0.07729621011949363, "grad_norm": 4.010007858276367, "learning_rate": 9.417163097729833e-07, "loss": 0.0591, "step": 490 }, { "epoch": 0.07745395748708443, "grad_norm": 8.81683349609375, "learning_rate": 9.415553051038479e-07, "loss": 0.1324, "step": 491 }, { "epoch": 0.07761170485467524, "grad_norm": 5.686316013336182, "learning_rate": 9.413943004347125e-07, "loss": 0.0603, "step": 492 }, { "epoch": 0.07776945222226604, "grad_norm": 14.068903923034668, "learning_rate": 9.412332957655772e-07, "loss": 0.12, "step": 493 }, { "epoch": 0.07792719958985685, "grad_norm": 9.79300594329834, "learning_rate": 9.410722910964417e-07, "loss": 0.0626, "step": 494 }, { "epoch": 0.07808494695744765, "grad_norm": 3.8315680027008057, "learning_rate": 9.409112864273063e-07, "loss": 0.0477, "step": 495 }, { "epoch": 0.07824269432503846, "grad_norm": 6.023341178894043, "learning_rate": 9.40750281758171e-07, "loss": 0.1123, "step": 496 }, { "epoch": 0.07840044169262926, "grad_norm": 7.395686626434326, "learning_rate": 9.405892770890356e-07, "loss": 0.1365, "step": 497 }, { "epoch": 0.07855818906022005, "grad_norm": 3.773207426071167, "learning_rate": 9.404282724199001e-07, "loss": 0.0772, "step": 498 }, { "epoch": 0.07871593642781086, "grad_norm": 5.029270648956299, "learning_rate": 9.402672677507647e-07, "loss": 0.0928, "step": 499 }, { "epoch": 0.07887368379540166, "grad_norm": 8.06304931640625, "learning_rate": 9.401062630816294e-07, "loss": 0.0926, "step": 500 }, { "epoch": 0.07903143116299247, "grad_norm": 8.494391441345215, "learning_rate": 9.39945258412494e-07, "loss": 0.0712, "step": 501 }, { "epoch": 0.07918917853058327, "grad_norm": 19.12388038635254, "learning_rate": 9.397842537433586e-07, "loss": 0.1247, "step": 502 }, { "epoch": 0.07934692589817408, "grad_norm": 4.007694721221924, "learning_rate": 9.396232490742231e-07, "loss": 0.0871, "step": 503 }, { "epoch": 0.07950467326576488, "grad_norm": 4.083090305328369, "learning_rate": 9.394622444050877e-07, "loss": 0.0929, "step": 504 }, { "epoch": 0.07966242063335569, "grad_norm": 5.226316452026367, "learning_rate": 9.393012397359523e-07, "loss": 0.0758, "step": 505 }, { "epoch": 0.07982016800094649, "grad_norm": 5.83637809753418, "learning_rate": 9.391402350668169e-07, "loss": 0.1174, "step": 506 }, { "epoch": 0.07997791536853728, "grad_norm": 10.742006301879883, "learning_rate": 9.389792303976814e-07, "loss": 0.0948, "step": 507 }, { "epoch": 0.08013566273612809, "grad_norm": 11.444242477416992, "learning_rate": 9.388182257285461e-07, "loss": 0.1807, "step": 508 }, { "epoch": 0.08029341010371889, "grad_norm": 7.078170299530029, "learning_rate": 9.386572210594107e-07, "loss": 0.0794, "step": 509 }, { "epoch": 0.0804511574713097, "grad_norm": 4.803638935089111, "learning_rate": 9.384962163902753e-07, "loss": 0.1344, "step": 510 }, { "epoch": 0.0806089048389005, "grad_norm": 6.092920303344727, "learning_rate": 9.383352117211398e-07, "loss": 0.0774, "step": 511 }, { "epoch": 0.0807666522064913, "grad_norm": 17.177648544311523, "learning_rate": 9.381742070520044e-07, "loss": 0.1563, "step": 512 }, { "epoch": 0.08092439957408211, "grad_norm": 11.093384742736816, "learning_rate": 9.38013202382869e-07, "loss": 0.0795, "step": 513 }, { "epoch": 0.08108214694167291, "grad_norm": 9.071053504943848, "learning_rate": 9.378521977137337e-07, "loss": 0.1089, "step": 514 }, { "epoch": 0.08123989430926372, "grad_norm": 6.980352401733398, "learning_rate": 9.376911930445983e-07, "loss": 0.1469, "step": 515 }, { "epoch": 0.08139764167685452, "grad_norm": 6.9618988037109375, "learning_rate": 9.375301883754629e-07, "loss": 0.1253, "step": 516 }, { "epoch": 0.08155538904444531, "grad_norm": 6.925991535186768, "learning_rate": 9.373691837063275e-07, "loss": 0.1222, "step": 517 }, { "epoch": 0.08171313641203612, "grad_norm": 4.175725936889648, "learning_rate": 9.372081790371921e-07, "loss": 0.055, "step": 518 }, { "epoch": 0.08187088377962692, "grad_norm": 9.227218627929688, "learning_rate": 9.370471743680566e-07, "loss": 0.127, "step": 519 }, { "epoch": 0.08202863114721773, "grad_norm": 4.0665459632873535, "learning_rate": 9.368861696989212e-07, "loss": 0.0779, "step": 520 }, { "epoch": 0.08218637851480853, "grad_norm": 11.169240951538086, "learning_rate": 9.367251650297858e-07, "loss": 0.1303, "step": 521 }, { "epoch": 0.08234412588239934, "grad_norm": 9.30782413482666, "learning_rate": 9.365641603606504e-07, "loss": 0.0757, "step": 522 }, { "epoch": 0.08250187324999014, "grad_norm": 15.76601791381836, "learning_rate": 9.36403155691515e-07, "loss": 0.0687, "step": 523 }, { "epoch": 0.08265962061758095, "grad_norm": 4.672974109649658, "learning_rate": 9.362421510223796e-07, "loss": 0.1032, "step": 524 }, { "epoch": 0.08281736798517175, "grad_norm": 5.200424671173096, "learning_rate": 9.360811463532442e-07, "loss": 0.0743, "step": 525 }, { "epoch": 0.08297511535276256, "grad_norm": 7.3553900718688965, "learning_rate": 9.359201416841088e-07, "loss": 0.0567, "step": 526 }, { "epoch": 0.08313286272035335, "grad_norm": 4.422732353210449, "learning_rate": 9.357591370149734e-07, "loss": 0.0665, "step": 527 }, { "epoch": 0.08329061008794415, "grad_norm": 7.514669418334961, "learning_rate": 9.355981323458379e-07, "loss": 0.0975, "step": 528 }, { "epoch": 0.08344835745553496, "grad_norm": 8.483025550842285, "learning_rate": 9.354371276767025e-07, "loss": 0.0869, "step": 529 }, { "epoch": 0.08360610482312576, "grad_norm": 13.895788192749023, "learning_rate": 9.352761230075672e-07, "loss": 0.1096, "step": 530 }, { "epoch": 0.08376385219071657, "grad_norm": 5.938143730163574, "learning_rate": 9.351151183384319e-07, "loss": 0.0868, "step": 531 }, { "epoch": 0.08392159955830737, "grad_norm": 6.248524188995361, "learning_rate": 9.349541136692964e-07, "loss": 0.1188, "step": 532 }, { "epoch": 0.08407934692589818, "grad_norm": 4.3282856941223145, "learning_rate": 9.34793109000161e-07, "loss": 0.0617, "step": 533 }, { "epoch": 0.08423709429348898, "grad_norm": 12.2025785446167, "learning_rate": 9.346321043310256e-07, "loss": 0.1276, "step": 534 }, { "epoch": 0.08439484166107979, "grad_norm": 7.238867282867432, "learning_rate": 9.344710996618902e-07, "loss": 0.1071, "step": 535 }, { "epoch": 0.08455258902867058, "grad_norm": 3.5196988582611084, "learning_rate": 9.343100949927547e-07, "loss": 0.0471, "step": 536 }, { "epoch": 0.08471033639626138, "grad_norm": 4.482356071472168, "learning_rate": 9.341490903236193e-07, "loss": 0.0739, "step": 537 }, { "epoch": 0.08486808376385219, "grad_norm": 3.579318046569824, "learning_rate": 9.33988085654484e-07, "loss": 0.0514, "step": 538 }, { "epoch": 0.08502583113144299, "grad_norm": 5.4972639083862305, "learning_rate": 9.338270809853486e-07, "loss": 0.0577, "step": 539 }, { "epoch": 0.0851835784990338, "grad_norm": 4.826931476593018, "learning_rate": 9.336660763162131e-07, "loss": 0.0622, "step": 540 }, { "epoch": 0.0853413258666246, "grad_norm": 4.429808139801025, "learning_rate": 9.335050716470777e-07, "loss": 0.067, "step": 541 }, { "epoch": 0.0854990732342154, "grad_norm": 5.156866073608398, "learning_rate": 9.333440669779423e-07, "loss": 0.0924, "step": 542 }, { "epoch": 0.08565682060180621, "grad_norm": 3.470766067504883, "learning_rate": 9.331830623088069e-07, "loss": 0.082, "step": 543 }, { "epoch": 0.08581456796939702, "grad_norm": 8.749771118164062, "learning_rate": 9.330220576396714e-07, "loss": 0.0935, "step": 544 }, { "epoch": 0.08597231533698782, "grad_norm": 8.987846374511719, "learning_rate": 9.328610529705361e-07, "loss": 0.1161, "step": 545 }, { "epoch": 0.08613006270457861, "grad_norm": 7.917525291442871, "learning_rate": 9.327000483014007e-07, "loss": 0.068, "step": 546 }, { "epoch": 0.08628781007216942, "grad_norm": 7.498785018920898, "learning_rate": 9.325390436322653e-07, "loss": 0.1017, "step": 547 }, { "epoch": 0.08644555743976022, "grad_norm": 6.345965385437012, "learning_rate": 9.3237803896313e-07, "loss": 0.1245, "step": 548 }, { "epoch": 0.08660330480735103, "grad_norm": 5.038855075836182, "learning_rate": 9.322170342939944e-07, "loss": 0.0764, "step": 549 }, { "epoch": 0.08676105217494183, "grad_norm": 7.756494045257568, "learning_rate": 9.320560296248591e-07, "loss": 0.0708, "step": 550 }, { "epoch": 0.08691879954253263, "grad_norm": 9.325003623962402, "learning_rate": 9.318950249557237e-07, "loss": 0.0832, "step": 551 }, { "epoch": 0.08707654691012344, "grad_norm": 8.301737785339355, "learning_rate": 9.317340202865883e-07, "loss": 0.0767, "step": 552 }, { "epoch": 0.08723429427771424, "grad_norm": 8.105409622192383, "learning_rate": 9.315730156174529e-07, "loss": 0.1151, "step": 553 }, { "epoch": 0.08739204164530505, "grad_norm": 7.55850887298584, "learning_rate": 9.314120109483175e-07, "loss": 0.0657, "step": 554 }, { "epoch": 0.08754978901289585, "grad_norm": 7.7853851318359375, "learning_rate": 9.312510062791821e-07, "loss": 0.1194, "step": 555 }, { "epoch": 0.08770753638048664, "grad_norm": 5.793752193450928, "learning_rate": 9.310900016100467e-07, "loss": 0.0739, "step": 556 }, { "epoch": 0.08786528374807745, "grad_norm": 7.338489532470703, "learning_rate": 9.309289969409112e-07, "loss": 0.118, "step": 557 }, { "epoch": 0.08802303111566825, "grad_norm": 5.680853843688965, "learning_rate": 9.307679922717758e-07, "loss": 0.0967, "step": 558 }, { "epoch": 0.08818077848325906, "grad_norm": 4.661684989929199, "learning_rate": 9.306069876026404e-07, "loss": 0.0893, "step": 559 }, { "epoch": 0.08833852585084986, "grad_norm": 8.23542594909668, "learning_rate": 9.304459829335051e-07, "loss": 0.1228, "step": 560 }, { "epoch": 0.08849627321844067, "grad_norm": 4.448554515838623, "learning_rate": 9.302849782643696e-07, "loss": 0.1152, "step": 561 }, { "epoch": 0.08865402058603147, "grad_norm": 3.9955506324768066, "learning_rate": 9.301239735952342e-07, "loss": 0.0488, "step": 562 }, { "epoch": 0.08881176795362228, "grad_norm": 4.207185745239258, "learning_rate": 9.299629689260988e-07, "loss": 0.0612, "step": 563 }, { "epoch": 0.08896951532121308, "grad_norm": 3.6789309978485107, "learning_rate": 9.298019642569634e-07, "loss": 0.0847, "step": 564 }, { "epoch": 0.08912726268880387, "grad_norm": 3.0475735664367676, "learning_rate": 9.296409595878279e-07, "loss": 0.0582, "step": 565 }, { "epoch": 0.08928501005639468, "grad_norm": 5.452734470367432, "learning_rate": 9.294799549186925e-07, "loss": 0.0614, "step": 566 }, { "epoch": 0.08944275742398548, "grad_norm": 8.011670112609863, "learning_rate": 9.293189502495572e-07, "loss": 0.0633, "step": 567 }, { "epoch": 0.08960050479157629, "grad_norm": 5.0398993492126465, "learning_rate": 9.291579455804219e-07, "loss": 0.0647, "step": 568 }, { "epoch": 0.08975825215916709, "grad_norm": 3.8250749111175537, "learning_rate": 9.289969409112864e-07, "loss": 0.0717, "step": 569 }, { "epoch": 0.0899159995267579, "grad_norm": 3.7562711238861084, "learning_rate": 9.28835936242151e-07, "loss": 0.0505, "step": 570 }, { "epoch": 0.0900737468943487, "grad_norm": 8.792325973510742, "learning_rate": 9.286749315730156e-07, "loss": 0.1248, "step": 571 }, { "epoch": 0.09023149426193951, "grad_norm": 6.8114118576049805, "learning_rate": 9.285139269038802e-07, "loss": 0.0419, "step": 572 }, { "epoch": 0.09038924162953031, "grad_norm": 5.860121726989746, "learning_rate": 9.283529222347448e-07, "loss": 0.0777, "step": 573 }, { "epoch": 0.09054698899712112, "grad_norm": 4.751367092132568, "learning_rate": 9.281919175656093e-07, "loss": 0.0588, "step": 574 }, { "epoch": 0.09070473636471191, "grad_norm": 5.606316566467285, "learning_rate": 9.28030912896474e-07, "loss": 0.0737, "step": 575 }, { "epoch": 0.09086248373230271, "grad_norm": 5.022028923034668, "learning_rate": 9.278699082273386e-07, "loss": 0.078, "step": 576 }, { "epoch": 0.09102023109989352, "grad_norm": 7.044178485870361, "learning_rate": 9.277089035582032e-07, "loss": 0.1334, "step": 577 }, { "epoch": 0.09117797846748432, "grad_norm": 3.5324037075042725, "learning_rate": 9.275478988890677e-07, "loss": 0.0626, "step": 578 }, { "epoch": 0.09133572583507513, "grad_norm": 5.3293962478637695, "learning_rate": 9.273868942199323e-07, "loss": 0.0676, "step": 579 }, { "epoch": 0.09149347320266593, "grad_norm": 5.908418655395508, "learning_rate": 9.272258895507969e-07, "loss": 0.095, "step": 580 }, { "epoch": 0.09165122057025674, "grad_norm": 5.927475452423096, "learning_rate": 9.270648848816615e-07, "loss": 0.1069, "step": 581 }, { "epoch": 0.09180896793784754, "grad_norm": 9.431535720825195, "learning_rate": 9.26903880212526e-07, "loss": 0.0812, "step": 582 }, { "epoch": 0.09196671530543835, "grad_norm": 4.260199546813965, "learning_rate": 9.267428755433907e-07, "loss": 0.0458, "step": 583 }, { "epoch": 0.09212446267302915, "grad_norm": 6.684591770172119, "learning_rate": 9.265818708742554e-07, "loss": 0.1157, "step": 584 }, { "epoch": 0.09228221004061994, "grad_norm": 9.132654190063477, "learning_rate": 9.2642086620512e-07, "loss": 0.1202, "step": 585 }, { "epoch": 0.09243995740821075, "grad_norm": 5.602100372314453, "learning_rate": 9.262598615359845e-07, "loss": 0.0618, "step": 586 }, { "epoch": 0.09259770477580155, "grad_norm": 5.033472537994385, "learning_rate": 9.260988568668491e-07, "loss": 0.0484, "step": 587 }, { "epoch": 0.09275545214339236, "grad_norm": 7.528144359588623, "learning_rate": 9.259378521977137e-07, "loss": 0.0992, "step": 588 }, { "epoch": 0.09291319951098316, "grad_norm": 10.023838996887207, "learning_rate": 9.257768475285783e-07, "loss": 0.0755, "step": 589 }, { "epoch": 0.09307094687857397, "grad_norm": 5.312619209289551, "learning_rate": 9.256158428594428e-07, "loss": 0.0948, "step": 590 }, { "epoch": 0.09322869424616477, "grad_norm": 3.866603136062622, "learning_rate": 9.254548381903075e-07, "loss": 0.0632, "step": 591 }, { "epoch": 0.09338644161375557, "grad_norm": 5.804541110992432, "learning_rate": 9.252938335211721e-07, "loss": 0.1006, "step": 592 }, { "epoch": 0.09354418898134638, "grad_norm": 9.800780296325684, "learning_rate": 9.251328288520367e-07, "loss": 0.0794, "step": 593 }, { "epoch": 0.09370193634893717, "grad_norm": 6.569456577301025, "learning_rate": 9.249718241829013e-07, "loss": 0.0752, "step": 594 }, { "epoch": 0.09385968371652798, "grad_norm": 4.535599708557129, "learning_rate": 9.248108195137658e-07, "loss": 0.0926, "step": 595 }, { "epoch": 0.09401743108411878, "grad_norm": 5.393555164337158, "learning_rate": 9.246498148446304e-07, "loss": 0.0546, "step": 596 }, { "epoch": 0.09417517845170958, "grad_norm": 9.806357383728027, "learning_rate": 9.24488810175495e-07, "loss": 0.0646, "step": 597 }, { "epoch": 0.09433292581930039, "grad_norm": 4.138769626617432, "learning_rate": 9.243278055063597e-07, "loss": 0.0725, "step": 598 }, { "epoch": 0.0944906731868912, "grad_norm": 4.142092227935791, "learning_rate": 9.241668008372242e-07, "loss": 0.1253, "step": 599 }, { "epoch": 0.094648420554482, "grad_norm": 6.697494029998779, "learning_rate": 9.240057961680888e-07, "loss": 0.119, "step": 600 }, { "epoch": 0.0948061679220728, "grad_norm": 4.450563430786133, "learning_rate": 9.238447914989534e-07, "loss": 0.1033, "step": 601 }, { "epoch": 0.09496391528966361, "grad_norm": 8.81712532043457, "learning_rate": 9.236837868298181e-07, "loss": 0.0871, "step": 602 }, { "epoch": 0.09512166265725441, "grad_norm": 2.71614933013916, "learning_rate": 9.235227821606826e-07, "loss": 0.0315, "step": 603 }, { "epoch": 0.0952794100248452, "grad_norm": 5.211971282958984, "learning_rate": 9.233617774915472e-07, "loss": 0.1361, "step": 604 }, { "epoch": 0.09543715739243601, "grad_norm": 4.793868541717529, "learning_rate": 9.232007728224119e-07, "loss": 0.0529, "step": 605 }, { "epoch": 0.09559490476002681, "grad_norm": 5.744448661804199, "learning_rate": 9.230397681532765e-07, "loss": 0.0878, "step": 606 }, { "epoch": 0.09575265212761762, "grad_norm": 7.048700332641602, "learning_rate": 9.22878763484141e-07, "loss": 0.0548, "step": 607 }, { "epoch": 0.09591039949520842, "grad_norm": 3.6247646808624268, "learning_rate": 9.227177588150056e-07, "loss": 0.0841, "step": 608 }, { "epoch": 0.09606814686279923, "grad_norm": 15.860011100769043, "learning_rate": 9.225567541458702e-07, "loss": 0.1282, "step": 609 }, { "epoch": 0.09622589423039003, "grad_norm": 4.523679733276367, "learning_rate": 9.223957494767348e-07, "loss": 0.0486, "step": 610 }, { "epoch": 0.09638364159798084, "grad_norm": 5.463552951812744, "learning_rate": 9.222347448075993e-07, "loss": 0.1049, "step": 611 }, { "epoch": 0.09654138896557164, "grad_norm": 6.040287017822266, "learning_rate": 9.220737401384639e-07, "loss": 0.1149, "step": 612 }, { "epoch": 0.09669913633316245, "grad_norm": 5.69699764251709, "learning_rate": 9.219127354693286e-07, "loss": 0.0711, "step": 613 }, { "epoch": 0.09685688370075324, "grad_norm": 8.530750274658203, "learning_rate": 9.217517308001932e-07, "loss": 0.056, "step": 614 }, { "epoch": 0.09701463106834404, "grad_norm": 5.582493305206299, "learning_rate": 9.215907261310577e-07, "loss": 0.0793, "step": 615 }, { "epoch": 0.09717237843593485, "grad_norm": 13.381341934204102, "learning_rate": 9.214297214619223e-07, "loss": 0.1297, "step": 616 }, { "epoch": 0.09733012580352565, "grad_norm": 8.435693740844727, "learning_rate": 9.212687167927869e-07, "loss": 0.1178, "step": 617 }, { "epoch": 0.09748787317111646, "grad_norm": 7.7765703201293945, "learning_rate": 9.211077121236515e-07, "loss": 0.1133, "step": 618 }, { "epoch": 0.09764562053870726, "grad_norm": 6.829525947570801, "learning_rate": 9.209467074545161e-07, "loss": 0.124, "step": 619 }, { "epoch": 0.09780336790629807, "grad_norm": 7.434627532958984, "learning_rate": 9.207857027853806e-07, "loss": 0.0649, "step": 620 }, { "epoch": 0.09796111527388887, "grad_norm": 8.623050689697266, "learning_rate": 9.206246981162454e-07, "loss": 0.1165, "step": 621 }, { "epoch": 0.09811886264147968, "grad_norm": 6.121469974517822, "learning_rate": 9.2046369344711e-07, "loss": 0.0831, "step": 622 }, { "epoch": 0.09827661000907047, "grad_norm": 6.457248210906982, "learning_rate": 9.203026887779746e-07, "loss": 0.0733, "step": 623 }, { "epoch": 0.09843435737666127, "grad_norm": 6.442403316497803, "learning_rate": 9.201416841088391e-07, "loss": 0.0985, "step": 624 }, { "epoch": 0.09859210474425208, "grad_norm": 6.161303997039795, "learning_rate": 9.199806794397037e-07, "loss": 0.082, "step": 625 }, { "epoch": 0.09874985211184288, "grad_norm": 7.090782642364502, "learning_rate": 9.198196747705683e-07, "loss": 0.0338, "step": 626 }, { "epoch": 0.09890759947943369, "grad_norm": 6.823625087738037, "learning_rate": 9.196586701014329e-07, "loss": 0.0977, "step": 627 }, { "epoch": 0.09906534684702449, "grad_norm": 6.264141082763672, "learning_rate": 9.194976654322975e-07, "loss": 0.0556, "step": 628 }, { "epoch": 0.0992230942146153, "grad_norm": 4.8218841552734375, "learning_rate": 9.193366607631621e-07, "loss": 0.0698, "step": 629 }, { "epoch": 0.0993808415822061, "grad_norm": 5.638354778289795, "learning_rate": 9.191756560940267e-07, "loss": 0.0851, "step": 630 }, { "epoch": 0.0995385889497969, "grad_norm": 4.278707981109619, "learning_rate": 9.190146514248913e-07, "loss": 0.0892, "step": 631 }, { "epoch": 0.09969633631738771, "grad_norm": 5.753275394439697, "learning_rate": 9.188536467557558e-07, "loss": 0.083, "step": 632 }, { "epoch": 0.0998540836849785, "grad_norm": 5.237423419952393, "learning_rate": 9.186926420866204e-07, "loss": 0.0786, "step": 633 }, { "epoch": 0.1000118310525693, "grad_norm": 12.854137420654297, "learning_rate": 9.18531637417485e-07, "loss": 0.125, "step": 634 }, { "epoch": 0.10016957842016011, "grad_norm": 8.807177543640137, "learning_rate": 9.183706327483497e-07, "loss": 0.1198, "step": 635 }, { "epoch": 0.10032732578775092, "grad_norm": 7.128739833831787, "learning_rate": 9.182096280792142e-07, "loss": 0.0891, "step": 636 }, { "epoch": 0.10048507315534172, "grad_norm": 5.885706424713135, "learning_rate": 9.180486234100789e-07, "loss": 0.0565, "step": 637 }, { "epoch": 0.10064282052293252, "grad_norm": 11.299050331115723, "learning_rate": 9.178876187409435e-07, "loss": 0.1356, "step": 638 }, { "epoch": 0.10080056789052333, "grad_norm": 6.594298839569092, "learning_rate": 9.177266140718081e-07, "loss": 0.0788, "step": 639 }, { "epoch": 0.10095831525811413, "grad_norm": 5.020556926727295, "learning_rate": 9.175656094026727e-07, "loss": 0.0933, "step": 640 }, { "epoch": 0.10111606262570494, "grad_norm": 7.084664344787598, "learning_rate": 9.174046047335372e-07, "loss": 0.099, "step": 641 }, { "epoch": 0.10127380999329574, "grad_norm": 6.887861728668213, "learning_rate": 9.172436000644018e-07, "loss": 0.0652, "step": 642 }, { "epoch": 0.10143155736088653, "grad_norm": 9.95553207397461, "learning_rate": 9.170825953952665e-07, "loss": 0.105, "step": 643 }, { "epoch": 0.10158930472847734, "grad_norm": 52.05778121948242, "learning_rate": 9.169215907261311e-07, "loss": 0.0901, "step": 644 }, { "epoch": 0.10174705209606814, "grad_norm": 7.226211071014404, "learning_rate": 9.167605860569956e-07, "loss": 0.0862, "step": 645 }, { "epoch": 0.10190479946365895, "grad_norm": 4.614170551300049, "learning_rate": 9.165995813878602e-07, "loss": 0.0907, "step": 646 }, { "epoch": 0.10206254683124975, "grad_norm": 3.3297107219696045, "learning_rate": 9.164385767187248e-07, "loss": 0.0389, "step": 647 }, { "epoch": 0.10222029419884056, "grad_norm": 8.3242826461792, "learning_rate": 9.162775720495894e-07, "loss": 0.0745, "step": 648 }, { "epoch": 0.10237804156643136, "grad_norm": 4.23464822769165, "learning_rate": 9.161165673804539e-07, "loss": 0.0714, "step": 649 }, { "epoch": 0.10253578893402217, "grad_norm": 5.633603096008301, "learning_rate": 9.159555627113185e-07, "loss": 0.0813, "step": 650 }, { "epoch": 0.10269353630161297, "grad_norm": 10.278568267822266, "learning_rate": 9.157945580421832e-07, "loss": 0.1132, "step": 651 }, { "epoch": 0.10285128366920376, "grad_norm": 22.042381286621094, "learning_rate": 9.156335533730478e-07, "loss": 0.0874, "step": 652 }, { "epoch": 0.10300903103679457, "grad_norm": 9.171586990356445, "learning_rate": 9.154725487039123e-07, "loss": 0.1028, "step": 653 }, { "epoch": 0.10316677840438537, "grad_norm": 8.835233688354492, "learning_rate": 9.153115440347769e-07, "loss": 0.0658, "step": 654 }, { "epoch": 0.10332452577197618, "grad_norm": 4.944420337677002, "learning_rate": 9.151505393656416e-07, "loss": 0.083, "step": 655 }, { "epoch": 0.10348227313956698, "grad_norm": 7.005538463592529, "learning_rate": 9.149895346965062e-07, "loss": 0.1406, "step": 656 }, { "epoch": 0.10364002050715779, "grad_norm": 11.50197982788086, "learning_rate": 9.148285300273707e-07, "loss": 0.1425, "step": 657 }, { "epoch": 0.10379776787474859, "grad_norm": 3.9614639282226562, "learning_rate": 9.146675253582354e-07, "loss": 0.0601, "step": 658 }, { "epoch": 0.1039555152423394, "grad_norm": 3.4335761070251465, "learning_rate": 9.145065206891e-07, "loss": 0.0668, "step": 659 }, { "epoch": 0.1041132626099302, "grad_norm": 6.671053886413574, "learning_rate": 9.143455160199646e-07, "loss": 0.0929, "step": 660 }, { "epoch": 0.104271009977521, "grad_norm": 3.743391990661621, "learning_rate": 9.141845113508291e-07, "loss": 0.0758, "step": 661 }, { "epoch": 0.1044287573451118, "grad_norm": 6.328144073486328, "learning_rate": 9.140235066816937e-07, "loss": 0.0597, "step": 662 }, { "epoch": 0.1045865047127026, "grad_norm": 5.457065105438232, "learning_rate": 9.138625020125583e-07, "loss": 0.0971, "step": 663 }, { "epoch": 0.1047442520802934, "grad_norm": 5.440207481384277, "learning_rate": 9.137014973434229e-07, "loss": 0.1172, "step": 664 }, { "epoch": 0.10490199944788421, "grad_norm": 3.1148476600646973, "learning_rate": 9.135404926742876e-07, "loss": 0.0433, "step": 665 }, { "epoch": 0.10505974681547502, "grad_norm": 3.905869483947754, "learning_rate": 9.133794880051521e-07, "loss": 0.0362, "step": 666 }, { "epoch": 0.10521749418306582, "grad_norm": 6.365758895874023, "learning_rate": 9.132184833360167e-07, "loss": 0.0725, "step": 667 }, { "epoch": 0.10537524155065663, "grad_norm": 7.652137756347656, "learning_rate": 9.130574786668813e-07, "loss": 0.0773, "step": 668 }, { "epoch": 0.10553298891824743, "grad_norm": 4.283510208129883, "learning_rate": 9.128964739977459e-07, "loss": 0.0729, "step": 669 }, { "epoch": 0.10569073628583824, "grad_norm": 6.367171764373779, "learning_rate": 9.127354693286104e-07, "loss": 0.1088, "step": 670 }, { "epoch": 0.10584848365342904, "grad_norm": 6.150599479675293, "learning_rate": 9.12574464659475e-07, "loss": 0.062, "step": 671 }, { "epoch": 0.10600623102101983, "grad_norm": 5.077731132507324, "learning_rate": 9.124134599903396e-07, "loss": 0.0469, "step": 672 }, { "epoch": 0.10616397838861064, "grad_norm": 4.419490337371826, "learning_rate": 9.122524553212044e-07, "loss": 0.0919, "step": 673 }, { "epoch": 0.10632172575620144, "grad_norm": 5.757754802703857, "learning_rate": 9.120914506520689e-07, "loss": 0.0529, "step": 674 }, { "epoch": 0.10647947312379225, "grad_norm": 5.260188579559326, "learning_rate": 9.119304459829335e-07, "loss": 0.0714, "step": 675 }, { "epoch": 0.10663722049138305, "grad_norm": 1.9488763809204102, "learning_rate": 9.117694413137981e-07, "loss": 0.0251, "step": 676 }, { "epoch": 0.10679496785897385, "grad_norm": 8.528887748718262, "learning_rate": 9.116084366446627e-07, "loss": 0.1339, "step": 677 }, { "epoch": 0.10695271522656466, "grad_norm": 8.196385383605957, "learning_rate": 9.114474319755272e-07, "loss": 0.1058, "step": 678 }, { "epoch": 0.10711046259415546, "grad_norm": 7.687489032745361, "learning_rate": 9.112864273063918e-07, "loss": 0.0996, "step": 679 }, { "epoch": 0.10726820996174627, "grad_norm": 3.2332191467285156, "learning_rate": 9.111254226372565e-07, "loss": 0.0705, "step": 680 }, { "epoch": 0.10742595732933706, "grad_norm": 6.897693634033203, "learning_rate": 9.109644179681211e-07, "loss": 0.1039, "step": 681 }, { "epoch": 0.10758370469692786, "grad_norm": 6.623891830444336, "learning_rate": 9.108034132989856e-07, "loss": 0.1054, "step": 682 }, { "epoch": 0.10774145206451867, "grad_norm": 5.592535018920898, "learning_rate": 9.106424086298502e-07, "loss": 0.0701, "step": 683 }, { "epoch": 0.10789919943210947, "grad_norm": 11.811220169067383, "learning_rate": 9.104814039607148e-07, "loss": 0.1015, "step": 684 }, { "epoch": 0.10805694679970028, "grad_norm": 4.838003158569336, "learning_rate": 9.103203992915794e-07, "loss": 0.0752, "step": 685 }, { "epoch": 0.10821469416729108, "grad_norm": 5.717372894287109, "learning_rate": 9.101593946224439e-07, "loss": 0.1081, "step": 686 }, { "epoch": 0.10837244153488189, "grad_norm": 6.643096923828125, "learning_rate": 9.099983899533085e-07, "loss": 0.0901, "step": 687 }, { "epoch": 0.1085301889024727, "grad_norm": 7.170506477355957, "learning_rate": 9.098373852841732e-07, "loss": 0.0657, "step": 688 }, { "epoch": 0.1086879362700635, "grad_norm": 5.268221378326416, "learning_rate": 9.096763806150379e-07, "loss": 0.0793, "step": 689 }, { "epoch": 0.1088456836376543, "grad_norm": 5.993264198303223, "learning_rate": 9.095153759459025e-07, "loss": 0.0979, "step": 690 }, { "epoch": 0.1090034310052451, "grad_norm": 6.888185501098633, "learning_rate": 9.09354371276767e-07, "loss": 0.1079, "step": 691 }, { "epoch": 0.1091611783728359, "grad_norm": 3.7119643688201904, "learning_rate": 9.091933666076316e-07, "loss": 0.0458, "step": 692 }, { "epoch": 0.1093189257404267, "grad_norm": 8.08841609954834, "learning_rate": 9.090323619384962e-07, "loss": 0.0827, "step": 693 }, { "epoch": 0.10947667310801751, "grad_norm": 5.303411960601807, "learning_rate": 9.088713572693608e-07, "loss": 0.0643, "step": 694 }, { "epoch": 0.10963442047560831, "grad_norm": 7.4574971199035645, "learning_rate": 9.087103526002253e-07, "loss": 0.1011, "step": 695 }, { "epoch": 0.10979216784319912, "grad_norm": 7.693200588226318, "learning_rate": 9.0854934793109e-07, "loss": 0.0873, "step": 696 }, { "epoch": 0.10994991521078992, "grad_norm": 5.869884014129639, "learning_rate": 9.083883432619546e-07, "loss": 0.0639, "step": 697 }, { "epoch": 0.11010766257838073, "grad_norm": 5.635510444641113, "learning_rate": 9.082273385928192e-07, "loss": 0.0461, "step": 698 }, { "epoch": 0.11026540994597153, "grad_norm": 5.684610366821289, "learning_rate": 9.080663339236837e-07, "loss": 0.0598, "step": 699 }, { "epoch": 0.11042315731356234, "grad_norm": 8.58759880065918, "learning_rate": 9.079053292545483e-07, "loss": 0.0769, "step": 700 }, { "epoch": 0.11058090468115313, "grad_norm": 6.110040664672852, "learning_rate": 9.077443245854129e-07, "loss": 0.076, "step": 701 }, { "epoch": 0.11073865204874393, "grad_norm": 5.440070152282715, "learning_rate": 9.075833199162775e-07, "loss": 0.0723, "step": 702 }, { "epoch": 0.11089639941633474, "grad_norm": 7.805794715881348, "learning_rate": 9.074223152471421e-07, "loss": 0.0898, "step": 703 }, { "epoch": 0.11105414678392554, "grad_norm": 7.114494323730469, "learning_rate": 9.072613105780067e-07, "loss": 0.0765, "step": 704 }, { "epoch": 0.11121189415151635, "grad_norm": 6.303781986236572, "learning_rate": 9.071003059088713e-07, "loss": 0.1273, "step": 705 }, { "epoch": 0.11136964151910715, "grad_norm": 3.669435977935791, "learning_rate": 9.069393012397359e-07, "loss": 0.0703, "step": 706 }, { "epoch": 0.11152738888669796, "grad_norm": 4.772210121154785, "learning_rate": 9.067782965706004e-07, "loss": 0.0752, "step": 707 }, { "epoch": 0.11168513625428876, "grad_norm": 3.1551268100738525, "learning_rate": 9.06617291901465e-07, "loss": 0.0615, "step": 708 }, { "epoch": 0.11184288362187957, "grad_norm": 3.9344019889831543, "learning_rate": 9.064562872323297e-07, "loss": 0.0664, "step": 709 }, { "epoch": 0.11200063098947036, "grad_norm": 3.228260040283203, "learning_rate": 9.062952825631944e-07, "loss": 0.0814, "step": 710 }, { "epoch": 0.11215837835706116, "grad_norm": 5.874837875366211, "learning_rate": 9.06134277894059e-07, "loss": 0.0872, "step": 711 }, { "epoch": 0.11231612572465197, "grad_norm": 9.931432723999023, "learning_rate": 9.059732732249235e-07, "loss": 0.0745, "step": 712 }, { "epoch": 0.11247387309224277, "grad_norm": 10.438408851623535, "learning_rate": 9.058122685557881e-07, "loss": 0.1201, "step": 713 }, { "epoch": 0.11263162045983358, "grad_norm": 5.636549949645996, "learning_rate": 9.056512638866527e-07, "loss": 0.066, "step": 714 }, { "epoch": 0.11278936782742438, "grad_norm": 5.628836154937744, "learning_rate": 9.054902592175173e-07, "loss": 0.0778, "step": 715 }, { "epoch": 0.11294711519501519, "grad_norm": 3.7619338035583496, "learning_rate": 9.053292545483818e-07, "loss": 0.0459, "step": 716 }, { "epoch": 0.11310486256260599, "grad_norm": 6.656398773193359, "learning_rate": 9.051682498792464e-07, "loss": 0.1077, "step": 717 }, { "epoch": 0.1132626099301968, "grad_norm": 7.641002655029297, "learning_rate": 9.050072452101111e-07, "loss": 0.1254, "step": 718 }, { "epoch": 0.1134203572977876, "grad_norm": 6.015817642211914, "learning_rate": 9.048462405409757e-07, "loss": 0.0674, "step": 719 }, { "epoch": 0.11357810466537839, "grad_norm": 7.3636698722839355, "learning_rate": 9.046852358718402e-07, "loss": 0.0895, "step": 720 }, { "epoch": 0.1137358520329692, "grad_norm": 10.179230690002441, "learning_rate": 9.045242312027048e-07, "loss": 0.1238, "step": 721 }, { "epoch": 0.11389359940056, "grad_norm": 3.5914306640625, "learning_rate": 9.043632265335694e-07, "loss": 0.0765, "step": 722 }, { "epoch": 0.1140513467681508, "grad_norm": 4.742732048034668, "learning_rate": 9.04202221864434e-07, "loss": 0.0714, "step": 723 }, { "epoch": 0.11420909413574161, "grad_norm": 5.424478530883789, "learning_rate": 9.040412171952985e-07, "loss": 0.0517, "step": 724 }, { "epoch": 0.11436684150333241, "grad_norm": 11.305291175842285, "learning_rate": 9.038802125261631e-07, "loss": 0.1395, "step": 725 }, { "epoch": 0.11452458887092322, "grad_norm": 3.5070836544036865, "learning_rate": 9.037192078570279e-07, "loss": 0.0719, "step": 726 }, { "epoch": 0.11468233623851402, "grad_norm": 10.200228691101074, "learning_rate": 9.035582031878925e-07, "loss": 0.1091, "step": 727 }, { "epoch": 0.11484008360610483, "grad_norm": 3.7730553150177, "learning_rate": 9.03397198518757e-07, "loss": 0.0639, "step": 728 }, { "epoch": 0.11499783097369563, "grad_norm": 5.975857257843018, "learning_rate": 9.032361938496216e-07, "loss": 0.0646, "step": 729 }, { "epoch": 0.11515557834128642, "grad_norm": 4.199368476867676, "learning_rate": 9.030751891804862e-07, "loss": 0.1113, "step": 730 }, { "epoch": 0.11531332570887723, "grad_norm": 7.727891445159912, "learning_rate": 9.029141845113508e-07, "loss": 0.1398, "step": 731 }, { "epoch": 0.11547107307646803, "grad_norm": 4.718742847442627, "learning_rate": 9.027531798422153e-07, "loss": 0.0419, "step": 732 }, { "epoch": 0.11562882044405884, "grad_norm": 4.954360008239746, "learning_rate": 9.0259217517308e-07, "loss": 0.1133, "step": 733 }, { "epoch": 0.11578656781164964, "grad_norm": 2.603778839111328, "learning_rate": 9.024311705039446e-07, "loss": 0.0611, "step": 734 }, { "epoch": 0.11594431517924045, "grad_norm": 6.089944362640381, "learning_rate": 9.022701658348092e-07, "loss": 0.0555, "step": 735 }, { "epoch": 0.11610206254683125, "grad_norm": 3.6806843280792236, "learning_rate": 9.021091611656738e-07, "loss": 0.0568, "step": 736 }, { "epoch": 0.11625980991442206, "grad_norm": 7.865287780761719, "learning_rate": 9.019481564965383e-07, "loss": 0.0858, "step": 737 }, { "epoch": 0.11641755728201286, "grad_norm": 4.575242042541504, "learning_rate": 9.017871518274029e-07, "loss": 0.0885, "step": 738 }, { "epoch": 0.11657530464960365, "grad_norm": 7.171676158905029, "learning_rate": 9.016261471582675e-07, "loss": 0.0523, "step": 739 }, { "epoch": 0.11673305201719446, "grad_norm": 4.008410930633545, "learning_rate": 9.014651424891322e-07, "loss": 0.0532, "step": 740 }, { "epoch": 0.11689079938478526, "grad_norm": 6.986484050750732, "learning_rate": 9.013041378199967e-07, "loss": 0.0436, "step": 741 }, { "epoch": 0.11704854675237607, "grad_norm": 5.582924842834473, "learning_rate": 9.011431331508613e-07, "loss": 0.0587, "step": 742 }, { "epoch": 0.11720629411996687, "grad_norm": 3.2100069522857666, "learning_rate": 9.00982128481726e-07, "loss": 0.064, "step": 743 }, { "epoch": 0.11736404148755768, "grad_norm": 4.56215238571167, "learning_rate": 9.008211238125906e-07, "loss": 0.0859, "step": 744 }, { "epoch": 0.11752178885514848, "grad_norm": 6.167077541351318, "learning_rate": 9.006601191434551e-07, "loss": 0.0664, "step": 745 }, { "epoch": 0.11767953622273929, "grad_norm": 7.485244274139404, "learning_rate": 9.004991144743197e-07, "loss": 0.1289, "step": 746 }, { "epoch": 0.11783728359033009, "grad_norm": 10.55856990814209, "learning_rate": 9.003381098051843e-07, "loss": 0.08, "step": 747 }, { "epoch": 0.1179950309579209, "grad_norm": 15.230066299438477, "learning_rate": 9.00177105136049e-07, "loss": 0.0937, "step": 748 }, { "epoch": 0.11815277832551169, "grad_norm": 5.164748191833496, "learning_rate": 9.000161004669135e-07, "loss": 0.0805, "step": 749 }, { "epoch": 0.11831052569310249, "grad_norm": 7.331662178039551, "learning_rate": 8.998550957977781e-07, "loss": 0.056, "step": 750 }, { "epoch": 0.1184682730606933, "grad_norm": 7.451263904571533, "learning_rate": 8.996940911286427e-07, "loss": 0.0618, "step": 751 }, { "epoch": 0.1186260204282841, "grad_norm": 3.639286994934082, "learning_rate": 8.995330864595073e-07, "loss": 0.0487, "step": 752 }, { "epoch": 0.1187837677958749, "grad_norm": 5.050006866455078, "learning_rate": 8.993720817903718e-07, "loss": 0.0723, "step": 753 }, { "epoch": 0.11894151516346571, "grad_norm": 5.0852837562561035, "learning_rate": 8.992110771212364e-07, "loss": 0.1024, "step": 754 }, { "epoch": 0.11909926253105652, "grad_norm": 5.076583385467529, "learning_rate": 8.99050072452101e-07, "loss": 0.1035, "step": 755 }, { "epoch": 0.11925700989864732, "grad_norm": 8.556181907653809, "learning_rate": 8.988890677829657e-07, "loss": 0.0744, "step": 756 }, { "epoch": 0.11941475726623813, "grad_norm": 4.467113018035889, "learning_rate": 8.987280631138303e-07, "loss": 0.0343, "step": 757 }, { "epoch": 0.11957250463382893, "grad_norm": 5.956662178039551, "learning_rate": 8.985670584446948e-07, "loss": 0.0757, "step": 758 }, { "epoch": 0.11973025200141972, "grad_norm": 9.729576110839844, "learning_rate": 8.984060537755594e-07, "loss": 0.0774, "step": 759 }, { "epoch": 0.11988799936901053, "grad_norm": 5.245589256286621, "learning_rate": 8.98245049106424e-07, "loss": 0.082, "step": 760 }, { "epoch": 0.12004574673660133, "grad_norm": 9.502345085144043, "learning_rate": 8.980840444372887e-07, "loss": 0.1337, "step": 761 }, { "epoch": 0.12020349410419214, "grad_norm": 6.284233570098877, "learning_rate": 8.979230397681532e-07, "loss": 0.0802, "step": 762 }, { "epoch": 0.12036124147178294, "grad_norm": 4.558141231536865, "learning_rate": 8.977620350990179e-07, "loss": 0.0639, "step": 763 }, { "epoch": 0.12051898883937374, "grad_norm": 3.1438100337982178, "learning_rate": 8.976010304298825e-07, "loss": 0.0714, "step": 764 }, { "epoch": 0.12067673620696455, "grad_norm": 5.920631408691406, "learning_rate": 8.974400257607471e-07, "loss": 0.1316, "step": 765 }, { "epoch": 0.12083448357455535, "grad_norm": 4.774268627166748, "learning_rate": 8.972790210916116e-07, "loss": 0.0631, "step": 766 }, { "epoch": 0.12099223094214616, "grad_norm": 4.985866069793701, "learning_rate": 8.971180164224762e-07, "loss": 0.1117, "step": 767 }, { "epoch": 0.12114997830973695, "grad_norm": 4.642230987548828, "learning_rate": 8.969570117533408e-07, "loss": 0.0664, "step": 768 }, { "epoch": 0.12114997830973695, "eval_accuracy": 0.980177959626334, "eval_f1": 0.980177959626334, "eval_loss": 0.068110391497612, "eval_runtime": 4727.7868, "eval_samples_per_second": 42.907, "eval_steps_per_second": 2.682, "step": 768 }, { "epoch": 0.12130772567732775, "grad_norm": 6.041478633880615, "learning_rate": 8.967960070842054e-07, "loss": 0.0674, "step": 769 }, { "epoch": 0.12146547304491856, "grad_norm": 5.090635776519775, "learning_rate": 8.966350024150699e-07, "loss": 0.043, "step": 770 }, { "epoch": 0.12162322041250936, "grad_norm": 8.246053695678711, "learning_rate": 8.964739977459346e-07, "loss": 0.0305, "step": 771 }, { "epoch": 0.12178096778010017, "grad_norm": 8.57614517211914, "learning_rate": 8.963129930767992e-07, "loss": 0.1028, "step": 772 }, { "epoch": 0.12193871514769097, "grad_norm": 7.169241428375244, "learning_rate": 8.961519884076638e-07, "loss": 0.0653, "step": 773 }, { "epoch": 0.12209646251528178, "grad_norm": 5.16396951675415, "learning_rate": 8.959909837385283e-07, "loss": 0.0891, "step": 774 }, { "epoch": 0.12225420988287258, "grad_norm": 6.805628299713135, "learning_rate": 8.958299790693929e-07, "loss": 0.0815, "step": 775 }, { "epoch": 0.12241195725046339, "grad_norm": 7.052548885345459, "learning_rate": 8.956689744002575e-07, "loss": 0.0992, "step": 776 }, { "epoch": 0.12256970461805419, "grad_norm": 3.9773361682891846, "learning_rate": 8.955079697311221e-07, "loss": 0.0624, "step": 777 }, { "epoch": 0.12272745198564498, "grad_norm": 6.479729175567627, "learning_rate": 8.953469650619868e-07, "loss": 0.0691, "step": 778 }, { "epoch": 0.12288519935323579, "grad_norm": 7.385898113250732, "learning_rate": 8.951859603928514e-07, "loss": 0.1458, "step": 779 }, { "epoch": 0.1230429467208266, "grad_norm": 6.460968017578125, "learning_rate": 8.95024955723716e-07, "loss": 0.0868, "step": 780 }, { "epoch": 0.1232006940884174, "grad_norm": 7.318996906280518, "learning_rate": 8.948639510545806e-07, "loss": 0.088, "step": 781 }, { "epoch": 0.1233584414560082, "grad_norm": 7.164853572845459, "learning_rate": 8.947029463854452e-07, "loss": 0.0804, "step": 782 }, { "epoch": 0.12351618882359901, "grad_norm": 11.578691482543945, "learning_rate": 8.945419417163097e-07, "loss": 0.075, "step": 783 }, { "epoch": 0.12367393619118981, "grad_norm": 2.8673319816589355, "learning_rate": 8.943809370471743e-07, "loss": 0.0567, "step": 784 }, { "epoch": 0.12383168355878062, "grad_norm": 7.566540241241455, "learning_rate": 8.942199323780389e-07, "loss": 0.0914, "step": 785 }, { "epoch": 0.12398943092637142, "grad_norm": 5.447342395782471, "learning_rate": 8.940589277089036e-07, "loss": 0.0883, "step": 786 }, { "epoch": 0.12414717829396221, "grad_norm": 3.222623348236084, "learning_rate": 8.938979230397681e-07, "loss": 0.0385, "step": 787 }, { "epoch": 0.12430492566155302, "grad_norm": 12.599985122680664, "learning_rate": 8.937369183706327e-07, "loss": 0.1081, "step": 788 }, { "epoch": 0.12446267302914382, "grad_norm": 4.441560745239258, "learning_rate": 8.935759137014973e-07, "loss": 0.0628, "step": 789 }, { "epoch": 0.12462042039673463, "grad_norm": 5.921494483947754, "learning_rate": 8.934149090323619e-07, "loss": 0.1036, "step": 790 }, { "epoch": 0.12477816776432543, "grad_norm": 8.730775833129883, "learning_rate": 8.932539043632264e-07, "loss": 0.0895, "step": 791 }, { "epoch": 0.12493591513191624, "grad_norm": 4.8975911140441895, "learning_rate": 8.93092899694091e-07, "loss": 0.0645, "step": 792 }, { "epoch": 0.12509366249950704, "grad_norm": 4.241909503936768, "learning_rate": 8.929318950249557e-07, "loss": 0.0622, "step": 793 }, { "epoch": 0.12525140986709785, "grad_norm": 6.301782131195068, "learning_rate": 8.927708903558203e-07, "loss": 0.0722, "step": 794 }, { "epoch": 0.12540915723468865, "grad_norm": 2.9589102268218994, "learning_rate": 8.926098856866848e-07, "loss": 0.0362, "step": 795 }, { "epoch": 0.12556690460227946, "grad_norm": 6.449861526489258, "learning_rate": 8.924488810175495e-07, "loss": 0.0808, "step": 796 }, { "epoch": 0.12572465196987026, "grad_norm": 4.228991508483887, "learning_rate": 8.922878763484141e-07, "loss": 0.0605, "step": 797 }, { "epoch": 0.12588239933746107, "grad_norm": 9.525351524353027, "learning_rate": 8.921268716792787e-07, "loss": 0.1215, "step": 798 }, { "epoch": 0.12604014670505187, "grad_norm": 5.587832450866699, "learning_rate": 8.919658670101432e-07, "loss": 0.0933, "step": 799 }, { "epoch": 0.12619789407264267, "grad_norm": 3.7256529331207275, "learning_rate": 8.918048623410078e-07, "loss": 0.0882, "step": 800 }, { "epoch": 0.12635564144023348, "grad_norm": 4.9349236488342285, "learning_rate": 8.916438576718725e-07, "loss": 0.0573, "step": 801 }, { "epoch": 0.12651338880782426, "grad_norm": 7.595797061920166, "learning_rate": 8.914828530027371e-07, "loss": 0.0727, "step": 802 }, { "epoch": 0.12667113617541506, "grad_norm": 8.809452056884766, "learning_rate": 8.913218483336017e-07, "loss": 0.098, "step": 803 }, { "epoch": 0.12682888354300587, "grad_norm": 4.205000877380371, "learning_rate": 8.911608436644662e-07, "loss": 0.0784, "step": 804 }, { "epoch": 0.12698663091059667, "grad_norm": 3.946079969406128, "learning_rate": 8.909998389953308e-07, "loss": 0.0487, "step": 805 }, { "epoch": 0.12714437827818748, "grad_norm": 7.7502570152282715, "learning_rate": 8.908388343261954e-07, "loss": 0.0747, "step": 806 }, { "epoch": 0.12730212564577828, "grad_norm": 4.870746612548828, "learning_rate": 8.9067782965706e-07, "loss": 0.0653, "step": 807 }, { "epoch": 0.12745987301336908, "grad_norm": 5.179272174835205, "learning_rate": 8.905168249879246e-07, "loss": 0.0597, "step": 808 }, { "epoch": 0.1276176203809599, "grad_norm": 5.307404518127441, "learning_rate": 8.903558203187892e-07, "loss": 0.0847, "step": 809 }, { "epoch": 0.1277753677485507, "grad_norm": 6.88541841506958, "learning_rate": 8.901948156496538e-07, "loss": 0.0781, "step": 810 }, { "epoch": 0.1279331151161415, "grad_norm": 10.111740112304688, "learning_rate": 8.900338109805184e-07, "loss": 0.0569, "step": 811 }, { "epoch": 0.1280908624837323, "grad_norm": 6.572051048278809, "learning_rate": 8.898728063113829e-07, "loss": 0.0949, "step": 812 }, { "epoch": 0.1282486098513231, "grad_norm": 7.902374744415283, "learning_rate": 8.897118016422476e-07, "loss": 0.0693, "step": 813 }, { "epoch": 0.1284063572189139, "grad_norm": 7.52032995223999, "learning_rate": 8.895507969731122e-07, "loss": 0.046, "step": 814 }, { "epoch": 0.12856410458650472, "grad_norm": 4.0166239738464355, "learning_rate": 8.893897923039768e-07, "loss": 0.0889, "step": 815 }, { "epoch": 0.12872185195409552, "grad_norm": 7.301949977874756, "learning_rate": 8.892287876348414e-07, "loss": 0.0758, "step": 816 }, { "epoch": 0.12887959932168633, "grad_norm": 3.707253932952881, "learning_rate": 8.89067782965706e-07, "loss": 0.0677, "step": 817 }, { "epoch": 0.12903734668927713, "grad_norm": 4.05390739440918, "learning_rate": 8.889067782965706e-07, "loss": 0.0543, "step": 818 }, { "epoch": 0.12919509405686794, "grad_norm": 3.006601095199585, "learning_rate": 8.887457736274352e-07, "loss": 0.031, "step": 819 }, { "epoch": 0.12935284142445874, "grad_norm": 6.196488857269287, "learning_rate": 8.885847689582997e-07, "loss": 0.129, "step": 820 }, { "epoch": 0.12951058879204952, "grad_norm": 4.589459419250488, "learning_rate": 8.884237642891643e-07, "loss": 0.0635, "step": 821 }, { "epoch": 0.12966833615964032, "grad_norm": 7.604683876037598, "learning_rate": 8.882627596200289e-07, "loss": 0.0734, "step": 822 }, { "epoch": 0.12982608352723113, "grad_norm": 6.847631454467773, "learning_rate": 8.881017549508936e-07, "loss": 0.0493, "step": 823 }, { "epoch": 0.12998383089482193, "grad_norm": 5.853129863739014, "learning_rate": 8.879407502817581e-07, "loss": 0.0716, "step": 824 }, { "epoch": 0.13014157826241274, "grad_norm": 8.411120414733887, "learning_rate": 8.877797456126227e-07, "loss": 0.111, "step": 825 }, { "epoch": 0.13029932563000354, "grad_norm": 10.987421989440918, "learning_rate": 8.876187409434873e-07, "loss": 0.118, "step": 826 }, { "epoch": 0.13045707299759435, "grad_norm": 4.021767616271973, "learning_rate": 8.874577362743519e-07, "loss": 0.066, "step": 827 }, { "epoch": 0.13061482036518515, "grad_norm": 9.605419158935547, "learning_rate": 8.872967316052165e-07, "loss": 0.0765, "step": 828 }, { "epoch": 0.13077256773277596, "grad_norm": 7.201488018035889, "learning_rate": 8.87135726936081e-07, "loss": 0.0735, "step": 829 }, { "epoch": 0.13093031510036676, "grad_norm": 5.675152778625488, "learning_rate": 8.869747222669456e-07, "loss": 0.0825, "step": 830 }, { "epoch": 0.13108806246795757, "grad_norm": 6.44282865524292, "learning_rate": 8.868137175978104e-07, "loss": 0.0875, "step": 831 }, { "epoch": 0.13124580983554837, "grad_norm": 5.081498146057129, "learning_rate": 8.86652712928675e-07, "loss": 0.0719, "step": 832 }, { "epoch": 0.13140355720313918, "grad_norm": 7.329372882843018, "learning_rate": 8.864917082595395e-07, "loss": 0.0707, "step": 833 }, { "epoch": 0.13156130457072998, "grad_norm": 5.028716564178467, "learning_rate": 8.863307035904041e-07, "loss": 0.0587, "step": 834 }, { "epoch": 0.13171905193832079, "grad_norm": 4.705847263336182, "learning_rate": 8.861696989212687e-07, "loss": 0.0481, "step": 835 }, { "epoch": 0.1318767993059116, "grad_norm": 7.79337215423584, "learning_rate": 8.860086942521333e-07, "loss": 0.1407, "step": 836 }, { "epoch": 0.1320345466735024, "grad_norm": 4.879453182220459, "learning_rate": 8.858476895829978e-07, "loss": 0.0852, "step": 837 }, { "epoch": 0.1321922940410932, "grad_norm": 4.048136234283447, "learning_rate": 8.856866849138625e-07, "loss": 0.0419, "step": 838 }, { "epoch": 0.132350041408684, "grad_norm": 5.209859371185303, "learning_rate": 8.855256802447271e-07, "loss": 0.0496, "step": 839 }, { "epoch": 0.1325077887762748, "grad_norm": 6.314807891845703, "learning_rate": 8.853646755755917e-07, "loss": 0.1217, "step": 840 }, { "epoch": 0.1326655361438656, "grad_norm": 5.608424663543701, "learning_rate": 8.852036709064562e-07, "loss": 0.0795, "step": 841 }, { "epoch": 0.1328232835114564, "grad_norm": 8.808143615722656, "learning_rate": 8.850426662373208e-07, "loss": 0.0573, "step": 842 }, { "epoch": 0.1329810308790472, "grad_norm": 6.290168762207031, "learning_rate": 8.848816615681854e-07, "loss": 0.0691, "step": 843 }, { "epoch": 0.133138778246638, "grad_norm": 7.928153038024902, "learning_rate": 8.8472065689905e-07, "loss": 0.0669, "step": 844 }, { "epoch": 0.1332965256142288, "grad_norm": 10.569119453430176, "learning_rate": 8.845596522299145e-07, "loss": 0.1071, "step": 845 }, { "epoch": 0.1334542729818196, "grad_norm": 8.861801147460938, "learning_rate": 8.843986475607792e-07, "loss": 0.071, "step": 846 }, { "epoch": 0.13361202034941042, "grad_norm": 8.680996894836426, "learning_rate": 8.842376428916438e-07, "loss": 0.0885, "step": 847 }, { "epoch": 0.13376976771700122, "grad_norm": 4.941028118133545, "learning_rate": 8.840766382225085e-07, "loss": 0.0677, "step": 848 }, { "epoch": 0.13392751508459202, "grad_norm": 4.489545822143555, "learning_rate": 8.83915633553373e-07, "loss": 0.04, "step": 849 }, { "epoch": 0.13408526245218283, "grad_norm": 10.692946434020996, "learning_rate": 8.837546288842376e-07, "loss": 0.0641, "step": 850 }, { "epoch": 0.13424300981977363, "grad_norm": 7.841832160949707, "learning_rate": 8.835936242151022e-07, "loss": 0.0606, "step": 851 }, { "epoch": 0.13440075718736444, "grad_norm": 7.537064552307129, "learning_rate": 8.834326195459668e-07, "loss": 0.0785, "step": 852 }, { "epoch": 0.13455850455495524, "grad_norm": 8.915136337280273, "learning_rate": 8.832716148768315e-07, "loss": 0.0517, "step": 853 }, { "epoch": 0.13471625192254605, "grad_norm": 7.323801517486572, "learning_rate": 8.83110610207696e-07, "loss": 0.0766, "step": 854 }, { "epoch": 0.13487399929013685, "grad_norm": 4.316955089569092, "learning_rate": 8.829496055385606e-07, "loss": 0.0428, "step": 855 }, { "epoch": 0.13503174665772766, "grad_norm": 8.194013595581055, "learning_rate": 8.827886008694252e-07, "loss": 0.0681, "step": 856 }, { "epoch": 0.13518949402531846, "grad_norm": 6.799248218536377, "learning_rate": 8.826275962002898e-07, "loss": 0.0697, "step": 857 }, { "epoch": 0.13534724139290927, "grad_norm": 11.271206855773926, "learning_rate": 8.824665915311543e-07, "loss": 0.0967, "step": 858 }, { "epoch": 0.13550498876050007, "grad_norm": 4.689577102661133, "learning_rate": 8.823055868620189e-07, "loss": 0.0548, "step": 859 }, { "epoch": 0.13566273612809085, "grad_norm": 5.299933910369873, "learning_rate": 8.821445821928835e-07, "loss": 0.0993, "step": 860 }, { "epoch": 0.13582048349568165, "grad_norm": 6.0095601081848145, "learning_rate": 8.819835775237482e-07, "loss": 0.0766, "step": 861 }, { "epoch": 0.13597823086327246, "grad_norm": 9.537050247192383, "learning_rate": 8.818225728546127e-07, "loss": 0.0891, "step": 862 }, { "epoch": 0.13613597823086326, "grad_norm": 5.553176403045654, "learning_rate": 8.816615681854773e-07, "loss": 0.0674, "step": 863 }, { "epoch": 0.13629372559845407, "grad_norm": 4.781669616699219, "learning_rate": 8.815005635163419e-07, "loss": 0.03, "step": 864 }, { "epoch": 0.13645147296604487, "grad_norm": 5.221872329711914, "learning_rate": 8.813395588472065e-07, "loss": 0.0742, "step": 865 }, { "epoch": 0.13660922033363568, "grad_norm": 4.754615783691406, "learning_rate": 8.81178554178071e-07, "loss": 0.0593, "step": 866 }, { "epoch": 0.13676696770122648, "grad_norm": 3.2132647037506104, "learning_rate": 8.810175495089357e-07, "loss": 0.0935, "step": 867 }, { "epoch": 0.1369247150688173, "grad_norm": 3.5838680267333984, "learning_rate": 8.808565448398004e-07, "loss": 0.0559, "step": 868 }, { "epoch": 0.1370824624364081, "grad_norm": 2.9936325550079346, "learning_rate": 8.80695540170665e-07, "loss": 0.0246, "step": 869 }, { "epoch": 0.1372402098039989, "grad_norm": 10.4288330078125, "learning_rate": 8.805345355015295e-07, "loss": 0.1194, "step": 870 }, { "epoch": 0.1373979571715897, "grad_norm": 5.2311110496521, "learning_rate": 8.803735308323941e-07, "loss": 0.0953, "step": 871 }, { "epoch": 0.1375557045391805, "grad_norm": 1.4418354034423828, "learning_rate": 8.802125261632587e-07, "loss": 0.0196, "step": 872 }, { "epoch": 0.1377134519067713, "grad_norm": 5.885601997375488, "learning_rate": 8.800515214941233e-07, "loss": 0.1315, "step": 873 }, { "epoch": 0.13787119927436212, "grad_norm": 4.334197998046875, "learning_rate": 8.798905168249879e-07, "loss": 0.0832, "step": 874 }, { "epoch": 0.13802894664195292, "grad_norm": 4.670304775238037, "learning_rate": 8.797295121558524e-07, "loss": 0.0383, "step": 875 }, { "epoch": 0.13818669400954373, "grad_norm": 7.2617669105529785, "learning_rate": 8.795685074867171e-07, "loss": 0.1078, "step": 876 }, { "epoch": 0.13834444137713453, "grad_norm": 5.498252868652344, "learning_rate": 8.794075028175817e-07, "loss": 0.0972, "step": 877 }, { "epoch": 0.13850218874472534, "grad_norm": 6.531783103942871, "learning_rate": 8.792464981484463e-07, "loss": 0.0993, "step": 878 }, { "epoch": 0.1386599361123161, "grad_norm": 4.48145055770874, "learning_rate": 8.790854934793108e-07, "loss": 0.0363, "step": 879 }, { "epoch": 0.13881768347990692, "grad_norm": 3.881742000579834, "learning_rate": 8.789244888101754e-07, "loss": 0.0352, "step": 880 }, { "epoch": 0.13897543084749772, "grad_norm": 2.8471145629882812, "learning_rate": 8.7876348414104e-07, "loss": 0.0559, "step": 881 }, { "epoch": 0.13913317821508853, "grad_norm": 4.34542179107666, "learning_rate": 8.786024794719046e-07, "loss": 0.037, "step": 882 }, { "epoch": 0.13929092558267933, "grad_norm": 7.625082015991211, "learning_rate": 8.784414748027693e-07, "loss": 0.061, "step": 883 }, { "epoch": 0.13944867295027014, "grad_norm": 5.236334323883057, "learning_rate": 8.782804701336339e-07, "loss": 0.0552, "step": 884 }, { "epoch": 0.13960642031786094, "grad_norm": 5.2504963874816895, "learning_rate": 8.781194654644985e-07, "loss": 0.0959, "step": 885 }, { "epoch": 0.13976416768545175, "grad_norm": 4.942289352416992, "learning_rate": 8.779584607953631e-07, "loss": 0.0733, "step": 886 }, { "epoch": 0.13992191505304255, "grad_norm": 5.178107738494873, "learning_rate": 8.777974561262276e-07, "loss": 0.0996, "step": 887 }, { "epoch": 0.14007966242063336, "grad_norm": 5.820649147033691, "learning_rate": 8.776364514570922e-07, "loss": 0.0443, "step": 888 }, { "epoch": 0.14023740978822416, "grad_norm": 3.677860736846924, "learning_rate": 8.774754467879568e-07, "loss": 0.055, "step": 889 }, { "epoch": 0.14039515715581496, "grad_norm": 3.097083568572998, "learning_rate": 8.773144421188214e-07, "loss": 0.0293, "step": 890 }, { "epoch": 0.14055290452340577, "grad_norm": 5.426209926605225, "learning_rate": 8.77153437449686e-07, "loss": 0.0501, "step": 891 }, { "epoch": 0.14071065189099657, "grad_norm": 8.975652694702148, "learning_rate": 8.769924327805506e-07, "loss": 0.0475, "step": 892 }, { "epoch": 0.14086839925858738, "grad_norm": 3.8866279125213623, "learning_rate": 8.768314281114152e-07, "loss": 0.0776, "step": 893 }, { "epoch": 0.14102614662617818, "grad_norm": 5.4157280921936035, "learning_rate": 8.766704234422798e-07, "loss": 0.0702, "step": 894 }, { "epoch": 0.141183893993769, "grad_norm": 7.1885600090026855, "learning_rate": 8.765094187731443e-07, "loss": 0.0748, "step": 895 }, { "epoch": 0.1413416413613598, "grad_norm": 10.391587257385254, "learning_rate": 8.763484141040089e-07, "loss": 0.1019, "step": 896 }, { "epoch": 0.1414993887289506, "grad_norm": 8.128127098083496, "learning_rate": 8.761874094348735e-07, "loss": 0.1077, "step": 897 }, { "epoch": 0.1416571360965414, "grad_norm": 7.078885078430176, "learning_rate": 8.760264047657382e-07, "loss": 0.0775, "step": 898 }, { "epoch": 0.14181488346413218, "grad_norm": 5.855307579040527, "learning_rate": 8.758654000966028e-07, "loss": 0.0534, "step": 899 }, { "epoch": 0.14197263083172298, "grad_norm": 8.8378324508667, "learning_rate": 8.757043954274673e-07, "loss": 0.1473, "step": 900 }, { "epoch": 0.1421303781993138, "grad_norm": 6.6874518394470215, "learning_rate": 8.75543390758332e-07, "loss": 0.0767, "step": 901 }, { "epoch": 0.1422881255669046, "grad_norm": 4.136755466461182, "learning_rate": 8.753823860891966e-07, "loss": 0.0864, "step": 902 }, { "epoch": 0.1424458729344954, "grad_norm": 10.590389251708984, "learning_rate": 8.752213814200612e-07, "loss": 0.0944, "step": 903 }, { "epoch": 0.1426036203020862, "grad_norm": 9.044832229614258, "learning_rate": 8.750603767509257e-07, "loss": 0.1435, "step": 904 }, { "epoch": 0.142761367669677, "grad_norm": 6.6368021965026855, "learning_rate": 8.748993720817903e-07, "loss": 0.0932, "step": 905 }, { "epoch": 0.1429191150372678, "grad_norm": 6.0270562171936035, "learning_rate": 8.74738367412655e-07, "loss": 0.0668, "step": 906 }, { "epoch": 0.14307686240485862, "grad_norm": 5.822785377502441, "learning_rate": 8.745773627435196e-07, "loss": 0.0734, "step": 907 }, { "epoch": 0.14323460977244942, "grad_norm": 5.617862701416016, "learning_rate": 8.744163580743841e-07, "loss": 0.0667, "step": 908 }, { "epoch": 0.14339235714004023, "grad_norm": 7.765786647796631, "learning_rate": 8.742553534052487e-07, "loss": 0.0538, "step": 909 }, { "epoch": 0.14355010450763103, "grad_norm": 5.429433822631836, "learning_rate": 8.740943487361133e-07, "loss": 0.0769, "step": 910 }, { "epoch": 0.14370785187522184, "grad_norm": 4.444888591766357, "learning_rate": 8.739333440669779e-07, "loss": 0.053, "step": 911 }, { "epoch": 0.14386559924281264, "grad_norm": 8.31460189819336, "learning_rate": 8.737723393978424e-07, "loss": 0.096, "step": 912 }, { "epoch": 0.14402334661040345, "grad_norm": 8.856794357299805, "learning_rate": 8.736113347287071e-07, "loss": 0.084, "step": 913 }, { "epoch": 0.14418109397799425, "grad_norm": 5.17337703704834, "learning_rate": 8.734503300595717e-07, "loss": 0.0826, "step": 914 }, { "epoch": 0.14433884134558506, "grad_norm": 5.695919990539551, "learning_rate": 8.732893253904363e-07, "loss": 0.0729, "step": 915 }, { "epoch": 0.14449658871317586, "grad_norm": 2.785297393798828, "learning_rate": 8.731283207213008e-07, "loss": 0.0503, "step": 916 }, { "epoch": 0.14465433608076667, "grad_norm": 4.552045822143555, "learning_rate": 8.729673160521654e-07, "loss": 0.0618, "step": 917 }, { "epoch": 0.14481208344835744, "grad_norm": 11.460582733154297, "learning_rate": 8.7280631138303e-07, "loss": 0.1406, "step": 918 }, { "epoch": 0.14496983081594825, "grad_norm": 7.194328308105469, "learning_rate": 8.726453067138947e-07, "loss": 0.1099, "step": 919 }, { "epoch": 0.14512757818353905, "grad_norm": 5.801459312438965, "learning_rate": 8.724843020447593e-07, "loss": 0.0688, "step": 920 }, { "epoch": 0.14528532555112986, "grad_norm": 8.886085510253906, "learning_rate": 8.723232973756239e-07, "loss": 0.1025, "step": 921 }, { "epoch": 0.14544307291872066, "grad_norm": 6.849817276000977, "learning_rate": 8.721622927064885e-07, "loss": 0.1059, "step": 922 }, { "epoch": 0.14560082028631147, "grad_norm": 3.839834213256836, "learning_rate": 8.720012880373531e-07, "loss": 0.0568, "step": 923 }, { "epoch": 0.14575856765390227, "grad_norm": 11.36760139465332, "learning_rate": 8.718402833682177e-07, "loss": 0.1034, "step": 924 }, { "epoch": 0.14591631502149308, "grad_norm": 6.963069915771484, "learning_rate": 8.716792786990822e-07, "loss": 0.0885, "step": 925 }, { "epoch": 0.14607406238908388, "grad_norm": 13.822622299194336, "learning_rate": 8.715182740299468e-07, "loss": 0.0581, "step": 926 }, { "epoch": 0.14623180975667469, "grad_norm": 3.9957492351531982, "learning_rate": 8.713572693608114e-07, "loss": 0.0641, "step": 927 }, { "epoch": 0.1463895571242655, "grad_norm": 5.5053229331970215, "learning_rate": 8.711962646916761e-07, "loss": 0.0436, "step": 928 }, { "epoch": 0.1465473044918563, "grad_norm": 5.216220378875732, "learning_rate": 8.710352600225406e-07, "loss": 0.0793, "step": 929 }, { "epoch": 0.1467050518594471, "grad_norm": 5.825531005859375, "learning_rate": 8.708742553534052e-07, "loss": 0.1178, "step": 930 }, { "epoch": 0.1468627992270379, "grad_norm": 6.612665176391602, "learning_rate": 8.707132506842698e-07, "loss": 0.0723, "step": 931 }, { "epoch": 0.1470205465946287, "grad_norm": 25.09065055847168, "learning_rate": 8.705522460151344e-07, "loss": 0.0635, "step": 932 }, { "epoch": 0.14717829396221951, "grad_norm": 5.503942966461182, "learning_rate": 8.703912413459989e-07, "loss": 0.0533, "step": 933 }, { "epoch": 0.14733604132981032, "grad_norm": 5.840237140655518, "learning_rate": 8.702302366768635e-07, "loss": 0.0705, "step": 934 }, { "epoch": 0.14749378869740112, "grad_norm": 4.470847129821777, "learning_rate": 8.700692320077281e-07, "loss": 0.0629, "step": 935 }, { "epoch": 0.14765153606499193, "grad_norm": 13.115463256835938, "learning_rate": 8.699082273385929e-07, "loss": 0.1077, "step": 936 }, { "epoch": 0.1478092834325827, "grad_norm": 7.014602184295654, "learning_rate": 8.697472226694574e-07, "loss": 0.0464, "step": 937 }, { "epoch": 0.1479670308001735, "grad_norm": 10.690193176269531, "learning_rate": 8.69586218000322e-07, "loss": 0.1457, "step": 938 }, { "epoch": 0.14812477816776431, "grad_norm": 7.957321643829346, "learning_rate": 8.694252133311866e-07, "loss": 0.0614, "step": 939 }, { "epoch": 0.14828252553535512, "grad_norm": 5.306655406951904, "learning_rate": 8.692642086620512e-07, "loss": 0.0963, "step": 940 }, { "epoch": 0.14844027290294592, "grad_norm": 6.521378040313721, "learning_rate": 8.691032039929157e-07, "loss": 0.0836, "step": 941 }, { "epoch": 0.14859802027053673, "grad_norm": 6.258962631225586, "learning_rate": 8.689421993237803e-07, "loss": 0.0904, "step": 942 }, { "epoch": 0.14875576763812753, "grad_norm": 5.522530555725098, "learning_rate": 8.68781194654645e-07, "loss": 0.0627, "step": 943 }, { "epoch": 0.14891351500571834, "grad_norm": 3.896024465560913, "learning_rate": 8.686201899855096e-07, "loss": 0.0576, "step": 944 }, { "epoch": 0.14907126237330914, "grad_norm": 8.584480285644531, "learning_rate": 8.684591853163742e-07, "loss": 0.0483, "step": 945 }, { "epoch": 0.14922900974089995, "grad_norm": 4.397729873657227, "learning_rate": 8.682981806472387e-07, "loss": 0.053, "step": 946 }, { "epoch": 0.14938675710849075, "grad_norm": 4.222675323486328, "learning_rate": 8.681371759781033e-07, "loss": 0.0363, "step": 947 }, { "epoch": 0.14954450447608156, "grad_norm": 6.897716522216797, "learning_rate": 8.679761713089679e-07, "loss": 0.0565, "step": 948 }, { "epoch": 0.14970225184367236, "grad_norm": 3.7198383808135986, "learning_rate": 8.678151666398325e-07, "loss": 0.0737, "step": 949 }, { "epoch": 0.14985999921126317, "grad_norm": 5.227583885192871, "learning_rate": 8.67654161970697e-07, "loss": 0.0567, "step": 950 }, { "epoch": 0.15001774657885397, "grad_norm": 4.30453634262085, "learning_rate": 8.674931573015617e-07, "loss": 0.0723, "step": 951 }, { "epoch": 0.15017549394644478, "grad_norm": 7.840729713439941, "learning_rate": 8.673321526324263e-07, "loss": 0.0971, "step": 952 }, { "epoch": 0.15033324131403558, "grad_norm": 5.92392110824585, "learning_rate": 8.67171147963291e-07, "loss": 0.0314, "step": 953 }, { "epoch": 0.1504909886816264, "grad_norm": 4.194683074951172, "learning_rate": 8.670101432941555e-07, "loss": 0.0544, "step": 954 }, { "epoch": 0.1506487360492172, "grad_norm": 2.73091721534729, "learning_rate": 8.668491386250201e-07, "loss": 0.0282, "step": 955 }, { "epoch": 0.15080648341680797, "grad_norm": 4.007771015167236, "learning_rate": 8.666881339558847e-07, "loss": 0.0445, "step": 956 }, { "epoch": 0.15096423078439877, "grad_norm": 4.4001054763793945, "learning_rate": 8.665271292867493e-07, "loss": 0.0619, "step": 957 }, { "epoch": 0.15112197815198958, "grad_norm": 5.6491522789001465, "learning_rate": 8.663661246176138e-07, "loss": 0.0527, "step": 958 }, { "epoch": 0.15127972551958038, "grad_norm": 4.243597984313965, "learning_rate": 8.662051199484785e-07, "loss": 0.0589, "step": 959 }, { "epoch": 0.1514374728871712, "grad_norm": 3.485116720199585, "learning_rate": 8.660441152793431e-07, "loss": 0.0274, "step": 960 }, { "epoch": 0.151595220254762, "grad_norm": 4.097177505493164, "learning_rate": 8.658831106102077e-07, "loss": 0.0724, "step": 961 }, { "epoch": 0.1517529676223528, "grad_norm": 3.5488181114196777, "learning_rate": 8.657221059410722e-07, "loss": 0.0375, "step": 962 }, { "epoch": 0.1519107149899436, "grad_norm": 3.5617055892944336, "learning_rate": 8.655611012719368e-07, "loss": 0.0635, "step": 963 }, { "epoch": 0.1520684623575344, "grad_norm": 5.917942047119141, "learning_rate": 8.654000966028014e-07, "loss": 0.0732, "step": 964 }, { "epoch": 0.1522262097251252, "grad_norm": 7.556006908416748, "learning_rate": 8.65239091933666e-07, "loss": 0.0382, "step": 965 }, { "epoch": 0.15238395709271602, "grad_norm": 10.01443862915039, "learning_rate": 8.650780872645307e-07, "loss": 0.0893, "step": 966 }, { "epoch": 0.15254170446030682, "grad_norm": 8.719974517822266, "learning_rate": 8.649170825953952e-07, "loss": 0.0729, "step": 967 }, { "epoch": 0.15269945182789763, "grad_norm": 9.003377914428711, "learning_rate": 8.647560779262598e-07, "loss": 0.1022, "step": 968 }, { "epoch": 0.15285719919548843, "grad_norm": 6.390311241149902, "learning_rate": 8.645950732571244e-07, "loss": 0.1335, "step": 969 }, { "epoch": 0.15301494656307923, "grad_norm": 14.419649124145508, "learning_rate": 8.64434068587989e-07, "loss": 0.1022, "step": 970 }, { "epoch": 0.15317269393067004, "grad_norm": 5.124694347381592, "learning_rate": 8.642730639188535e-07, "loss": 0.1289, "step": 971 }, { "epoch": 0.15333044129826084, "grad_norm": 8.405393600463867, "learning_rate": 8.641120592497182e-07, "loss": 0.0545, "step": 972 }, { "epoch": 0.15348818866585165, "grad_norm": 7.67399263381958, "learning_rate": 8.639510545805829e-07, "loss": 0.0632, "step": 973 }, { "epoch": 0.15364593603344245, "grad_norm": 6.533464431762695, "learning_rate": 8.637900499114475e-07, "loss": 0.0677, "step": 974 }, { "epoch": 0.15380368340103326, "grad_norm": 9.284419059753418, "learning_rate": 8.63629045242312e-07, "loss": 0.087, "step": 975 }, { "epoch": 0.15396143076862404, "grad_norm": 4.728756904602051, "learning_rate": 8.634680405731766e-07, "loss": 0.0651, "step": 976 }, { "epoch": 0.15411917813621484, "grad_norm": 8.1788969039917, "learning_rate": 8.633070359040412e-07, "loss": 0.0738, "step": 977 }, { "epoch": 0.15427692550380565, "grad_norm": 13.664631843566895, "learning_rate": 8.631460312349058e-07, "loss": 0.0805, "step": 978 }, { "epoch": 0.15443467287139645, "grad_norm": 10.1797456741333, "learning_rate": 8.629850265657703e-07, "loss": 0.0571, "step": 979 }, { "epoch": 0.15459242023898725, "grad_norm": 4.150187015533447, "learning_rate": 8.628240218966349e-07, "loss": 0.0303, "step": 980 }, { "epoch": 0.15475016760657806, "grad_norm": 3.7078182697296143, "learning_rate": 8.626630172274996e-07, "loss": 0.0406, "step": 981 }, { "epoch": 0.15490791497416886, "grad_norm": 7.318813323974609, "learning_rate": 8.625020125583642e-07, "loss": 0.0502, "step": 982 }, { "epoch": 0.15506566234175967, "grad_norm": 6.322595596313477, "learning_rate": 8.623410078892287e-07, "loss": 0.0582, "step": 983 }, { "epoch": 0.15522340970935047, "grad_norm": 5.230018615722656, "learning_rate": 8.621800032200933e-07, "loss": 0.0487, "step": 984 }, { "epoch": 0.15538115707694128, "grad_norm": 5.482806205749512, "learning_rate": 8.620189985509579e-07, "loss": 0.0419, "step": 985 }, { "epoch": 0.15553890444453208, "grad_norm": 6.209947109222412, "learning_rate": 8.618579938818225e-07, "loss": 0.0504, "step": 986 }, { "epoch": 0.1556966518121229, "grad_norm": 8.243134498596191, "learning_rate": 8.61696989212687e-07, "loss": 0.0585, "step": 987 }, { "epoch": 0.1558543991797137, "grad_norm": 4.481822490692139, "learning_rate": 8.615359845435516e-07, "loss": 0.0529, "step": 988 }, { "epoch": 0.1560121465473045, "grad_norm": 7.807392597198486, "learning_rate": 8.613749798744164e-07, "loss": 0.0348, "step": 989 }, { "epoch": 0.1561698939148953, "grad_norm": 9.053619384765625, "learning_rate": 8.61213975205281e-07, "loss": 0.0759, "step": 990 }, { "epoch": 0.1563276412824861, "grad_norm": 9.76534366607666, "learning_rate": 8.610529705361456e-07, "loss": 0.0552, "step": 991 }, { "epoch": 0.1564853886500769, "grad_norm": 9.809041976928711, "learning_rate": 8.608919658670101e-07, "loss": 0.0747, "step": 992 }, { "epoch": 0.15664313601766772, "grad_norm": 6.268856048583984, "learning_rate": 8.607309611978747e-07, "loss": 0.0861, "step": 993 }, { "epoch": 0.15680088338525852, "grad_norm": 142.48008728027344, "learning_rate": 8.605699565287393e-07, "loss": 0.1093, "step": 994 }, { "epoch": 0.1569586307528493, "grad_norm": 4.752850532531738, "learning_rate": 8.604089518596039e-07, "loss": 0.1125, "step": 995 }, { "epoch": 0.1571163781204401, "grad_norm": 6.792891025543213, "learning_rate": 8.602479471904685e-07, "loss": 0.0506, "step": 996 }, { "epoch": 0.1572741254880309, "grad_norm": 5.308335304260254, "learning_rate": 8.600869425213331e-07, "loss": 0.037, "step": 997 }, { "epoch": 0.1574318728556217, "grad_norm": 4.207362651824951, "learning_rate": 8.599259378521977e-07, "loss": 0.052, "step": 998 }, { "epoch": 0.15758962022321252, "grad_norm": 6.1989922523498535, "learning_rate": 8.597649331830623e-07, "loss": 0.0505, "step": 999 }, { "epoch": 0.15774736759080332, "grad_norm": 6.644219398498535, "learning_rate": 8.596039285139268e-07, "loss": 0.0708, "step": 1000 }, { "epoch": 0.15790511495839413, "grad_norm": 8.120538711547852, "learning_rate": 8.594429238447914e-07, "loss": 0.0471, "step": 1001 }, { "epoch": 0.15806286232598493, "grad_norm": 3.678455352783203, "learning_rate": 8.59281919175656e-07, "loss": 0.0294, "step": 1002 }, { "epoch": 0.15822060969357574, "grad_norm": 6.847627639770508, "learning_rate": 8.591209145065207e-07, "loss": 0.1008, "step": 1003 }, { "epoch": 0.15837835706116654, "grad_norm": 3.4793381690979004, "learning_rate": 8.589599098373852e-07, "loss": 0.0329, "step": 1004 }, { "epoch": 0.15853610442875735, "grad_norm": 7.70238733291626, "learning_rate": 8.587989051682498e-07, "loss": 0.0634, "step": 1005 }, { "epoch": 0.15869385179634815, "grad_norm": 4.5630106925964355, "learning_rate": 8.586379004991145e-07, "loss": 0.0774, "step": 1006 }, { "epoch": 0.15885159916393896, "grad_norm": 8.479558944702148, "learning_rate": 8.584768958299791e-07, "loss": 0.0769, "step": 1007 }, { "epoch": 0.15900934653152976, "grad_norm": 8.264494895935059, "learning_rate": 8.583158911608436e-07, "loss": 0.0663, "step": 1008 }, { "epoch": 0.15916709389912057, "grad_norm": 4.046088218688965, "learning_rate": 8.581548864917082e-07, "loss": 0.036, "step": 1009 }, { "epoch": 0.15932484126671137, "grad_norm": 7.706320762634277, "learning_rate": 8.579938818225728e-07, "loss": 0.0693, "step": 1010 }, { "epoch": 0.15948258863430217, "grad_norm": 3.29152250289917, "learning_rate": 8.578328771534375e-07, "loss": 0.0391, "step": 1011 }, { "epoch": 0.15964033600189298, "grad_norm": 3.251323699951172, "learning_rate": 8.576718724843021e-07, "loss": 0.0414, "step": 1012 }, { "epoch": 0.15979808336948378, "grad_norm": 5.912606716156006, "learning_rate": 8.575108678151666e-07, "loss": 0.067, "step": 1013 }, { "epoch": 0.15995583073707456, "grad_norm": 5.435115814208984, "learning_rate": 8.573498631460312e-07, "loss": 0.0865, "step": 1014 }, { "epoch": 0.16011357810466537, "grad_norm": 4.942360877990723, "learning_rate": 8.571888584768958e-07, "loss": 0.1016, "step": 1015 }, { "epoch": 0.16027132547225617, "grad_norm": 4.604283332824707, "learning_rate": 8.570278538077604e-07, "loss": 0.0371, "step": 1016 }, { "epoch": 0.16042907283984698, "grad_norm": 6.982027530670166, "learning_rate": 8.568668491386249e-07, "loss": 0.0997, "step": 1017 }, { "epoch": 0.16058682020743778, "grad_norm": 6.0777668952941895, "learning_rate": 8.567058444694896e-07, "loss": 0.0893, "step": 1018 }, { "epoch": 0.16074456757502859, "grad_norm": 3.114046573638916, "learning_rate": 8.565448398003542e-07, "loss": 0.0447, "step": 1019 }, { "epoch": 0.1609023149426194, "grad_norm": 5.120523452758789, "learning_rate": 8.563838351312188e-07, "loss": 0.0506, "step": 1020 }, { "epoch": 0.1610600623102102, "grad_norm": 6.959108352661133, "learning_rate": 8.562228304620833e-07, "loss": 0.0949, "step": 1021 }, { "epoch": 0.161217809677801, "grad_norm": 6.767091751098633, "learning_rate": 8.560618257929479e-07, "loss": 0.1029, "step": 1022 }, { "epoch": 0.1613755570453918, "grad_norm": 6.899235725402832, "learning_rate": 8.559008211238125e-07, "loss": 0.0677, "step": 1023 }, { "epoch": 0.1615333044129826, "grad_norm": 7.200080871582031, "learning_rate": 8.557398164546772e-07, "loss": 0.0737, "step": 1024 }, { "epoch": 0.1616910517805734, "grad_norm": 4.691421985626221, "learning_rate": 8.555788117855417e-07, "loss": 0.0601, "step": 1025 }, { "epoch": 0.16184879914816422, "grad_norm": 6.024470806121826, "learning_rate": 8.554178071164064e-07, "loss": 0.0558, "step": 1026 }, { "epoch": 0.16200654651575502, "grad_norm": 4.782193183898926, "learning_rate": 8.55256802447271e-07, "loss": 0.0446, "step": 1027 }, { "epoch": 0.16216429388334583, "grad_norm": 4.7500176429748535, "learning_rate": 8.550957977781356e-07, "loss": 0.0442, "step": 1028 }, { "epoch": 0.16232204125093663, "grad_norm": 6.006361961364746, "learning_rate": 8.549347931090001e-07, "loss": 0.0875, "step": 1029 }, { "epoch": 0.16247978861852744, "grad_norm": 5.781008720397949, "learning_rate": 8.547737884398647e-07, "loss": 0.0889, "step": 1030 }, { "epoch": 0.16263753598611824, "grad_norm": 6.284361362457275, "learning_rate": 8.546127837707293e-07, "loss": 0.063, "step": 1031 }, { "epoch": 0.16279528335370905, "grad_norm": 15.59167766571045, "learning_rate": 8.544517791015939e-07, "loss": 0.0708, "step": 1032 }, { "epoch": 0.16295303072129985, "grad_norm": 5.2419304847717285, "learning_rate": 8.542907744324584e-07, "loss": 0.0629, "step": 1033 }, { "epoch": 0.16311077808889063, "grad_norm": 5.976012229919434, "learning_rate": 8.541297697633231e-07, "loss": 0.0489, "step": 1034 }, { "epoch": 0.16326852545648143, "grad_norm": 6.138513088226318, "learning_rate": 8.539687650941877e-07, "loss": 0.0612, "step": 1035 }, { "epoch": 0.16342627282407224, "grad_norm": 14.731651306152344, "learning_rate": 8.538077604250523e-07, "loss": 0.06, "step": 1036 }, { "epoch": 0.16358402019166304, "grad_norm": 7.656831741333008, "learning_rate": 8.536467557559169e-07, "loss": 0.1064, "step": 1037 }, { "epoch": 0.16374176755925385, "grad_norm": 5.557590484619141, "learning_rate": 8.534857510867814e-07, "loss": 0.1172, "step": 1038 }, { "epoch": 0.16389951492684465, "grad_norm": 5.93520450592041, "learning_rate": 8.53324746417646e-07, "loss": 0.1034, "step": 1039 }, { "epoch": 0.16405726229443546, "grad_norm": 9.557320594787598, "learning_rate": 8.531637417485106e-07, "loss": 0.0541, "step": 1040 }, { "epoch": 0.16421500966202626, "grad_norm": 5.482620716094971, "learning_rate": 8.530027370793754e-07, "loss": 0.0638, "step": 1041 }, { "epoch": 0.16437275702961707, "grad_norm": 7.604795455932617, "learning_rate": 8.528417324102399e-07, "loss": 0.0622, "step": 1042 }, { "epoch": 0.16453050439720787, "grad_norm": 5.067941188812256, "learning_rate": 8.526807277411045e-07, "loss": 0.0624, "step": 1043 }, { "epoch": 0.16468825176479868, "grad_norm": 7.364793300628662, "learning_rate": 8.525197230719691e-07, "loss": 0.0741, "step": 1044 }, { "epoch": 0.16484599913238948, "grad_norm": 5.77717399597168, "learning_rate": 8.523587184028337e-07, "loss": 0.0471, "step": 1045 }, { "epoch": 0.16500374649998029, "grad_norm": 7.496662139892578, "learning_rate": 8.521977137336982e-07, "loss": 0.0667, "step": 1046 }, { "epoch": 0.1651614938675711, "grad_norm": 8.605033874511719, "learning_rate": 8.520367090645628e-07, "loss": 0.0818, "step": 1047 }, { "epoch": 0.1653192412351619, "grad_norm": 6.1481781005859375, "learning_rate": 8.518757043954275e-07, "loss": 0.0884, "step": 1048 }, { "epoch": 0.1654769886027527, "grad_norm": 4.9542059898376465, "learning_rate": 8.517146997262921e-07, "loss": 0.0778, "step": 1049 }, { "epoch": 0.1656347359703435, "grad_norm": 7.27318000793457, "learning_rate": 8.515536950571566e-07, "loss": 0.0653, "step": 1050 }, { "epoch": 0.1657924833379343, "grad_norm": 5.166891098022461, "learning_rate": 8.513926903880212e-07, "loss": 0.0554, "step": 1051 }, { "epoch": 0.16595023070552511, "grad_norm": 9.332530975341797, "learning_rate": 8.512316857188858e-07, "loss": 0.0753, "step": 1052 }, { "epoch": 0.1661079780731159, "grad_norm": 4.665710926055908, "learning_rate": 8.510706810497504e-07, "loss": 0.0662, "step": 1053 }, { "epoch": 0.1662657254407067, "grad_norm": 5.497588634490967, "learning_rate": 8.509096763806149e-07, "loss": 0.0852, "step": 1054 }, { "epoch": 0.1664234728082975, "grad_norm": 5.279402732849121, "learning_rate": 8.507486717114795e-07, "loss": 0.0511, "step": 1055 }, { "epoch": 0.1665812201758883, "grad_norm": 4.8032097816467285, "learning_rate": 8.505876670423442e-07, "loss": 0.0743, "step": 1056 }, { "epoch": 0.1667389675434791, "grad_norm": 7.598001480102539, "learning_rate": 8.504266623732088e-07, "loss": 0.0997, "step": 1057 }, { "epoch": 0.16689671491106992, "grad_norm": 6.238626956939697, "learning_rate": 8.502656577040733e-07, "loss": 0.0529, "step": 1058 }, { "epoch": 0.16705446227866072, "grad_norm": 9.464849472045898, "learning_rate": 8.50104653034938e-07, "loss": 0.0858, "step": 1059 }, { "epoch": 0.16721220964625153, "grad_norm": 4.487782955169678, "learning_rate": 8.499436483658026e-07, "loss": 0.0772, "step": 1060 }, { "epoch": 0.16736995701384233, "grad_norm": 6.810683250427246, "learning_rate": 8.497826436966672e-07, "loss": 0.1174, "step": 1061 }, { "epoch": 0.16752770438143313, "grad_norm": 4.956742286682129, "learning_rate": 8.496216390275318e-07, "loss": 0.09, "step": 1062 }, { "epoch": 0.16768545174902394, "grad_norm": 6.974895477294922, "learning_rate": 8.494606343583963e-07, "loss": 0.0801, "step": 1063 }, { "epoch": 0.16784319911661474, "grad_norm": 4.244524002075195, "learning_rate": 8.49299629689261e-07, "loss": 0.0587, "step": 1064 }, { "epoch": 0.16800094648420555, "grad_norm": 3.368680715560913, "learning_rate": 8.491386250201256e-07, "loss": 0.0819, "step": 1065 }, { "epoch": 0.16815869385179635, "grad_norm": 6.945837020874023, "learning_rate": 8.489776203509902e-07, "loss": 0.115, "step": 1066 }, { "epoch": 0.16831644121938716, "grad_norm": 10.10990047454834, "learning_rate": 8.488166156818547e-07, "loss": 0.104, "step": 1067 }, { "epoch": 0.16847418858697796, "grad_norm": 4.6603007316589355, "learning_rate": 8.486556110127193e-07, "loss": 0.0301, "step": 1068 }, { "epoch": 0.16863193595456877, "grad_norm": 4.7445969581604, "learning_rate": 8.484946063435839e-07, "loss": 0.0885, "step": 1069 }, { "epoch": 0.16878968332215957, "grad_norm": 7.254500389099121, "learning_rate": 8.483336016744485e-07, "loss": 0.0929, "step": 1070 }, { "epoch": 0.16894743068975038, "grad_norm": 3.6324713230133057, "learning_rate": 8.481725970053131e-07, "loss": 0.0365, "step": 1071 }, { "epoch": 0.16910517805734115, "grad_norm": 8.888936042785645, "learning_rate": 8.480115923361777e-07, "loss": 0.0819, "step": 1072 }, { "epoch": 0.16926292542493196, "grad_norm": 11.244965553283691, "learning_rate": 8.478505876670423e-07, "loss": 0.0637, "step": 1073 }, { "epoch": 0.16942067279252276, "grad_norm": 7.391339302062988, "learning_rate": 8.476895829979069e-07, "loss": 0.08, "step": 1074 }, { "epoch": 0.16957842016011357, "grad_norm": 12.288595199584961, "learning_rate": 8.475285783287714e-07, "loss": 0.0703, "step": 1075 }, { "epoch": 0.16973616752770437, "grad_norm": 5.894357681274414, "learning_rate": 8.47367573659636e-07, "loss": 0.0843, "step": 1076 }, { "epoch": 0.16989391489529518, "grad_norm": 8.744696617126465, "learning_rate": 8.472065689905007e-07, "loss": 0.0985, "step": 1077 }, { "epoch": 0.17005166226288598, "grad_norm": 6.621856212615967, "learning_rate": 8.470455643213654e-07, "loss": 0.0672, "step": 1078 }, { "epoch": 0.1702094096304768, "grad_norm": 6.276869297027588, "learning_rate": 8.468845596522299e-07, "loss": 0.0482, "step": 1079 }, { "epoch": 0.1703671569980676, "grad_norm": 5.934649467468262, "learning_rate": 8.467235549830945e-07, "loss": 0.0662, "step": 1080 }, { "epoch": 0.1705249043656584, "grad_norm": 4.509552478790283, "learning_rate": 8.465625503139591e-07, "loss": 0.0608, "step": 1081 }, { "epoch": 0.1706826517332492, "grad_norm": 5.281670570373535, "learning_rate": 8.464015456448237e-07, "loss": 0.0638, "step": 1082 }, { "epoch": 0.17084039910084, "grad_norm": 4.402935981750488, "learning_rate": 8.462405409756883e-07, "loss": 0.0386, "step": 1083 }, { "epoch": 0.1709981464684308, "grad_norm": 4.183838367462158, "learning_rate": 8.460795363065528e-07, "loss": 0.0955, "step": 1084 }, { "epoch": 0.17115589383602162, "grad_norm": 5.913980960845947, "learning_rate": 8.459185316374174e-07, "loss": 0.0793, "step": 1085 }, { "epoch": 0.17131364120361242, "grad_norm": 8.913019180297852, "learning_rate": 8.457575269682821e-07, "loss": 0.0787, "step": 1086 }, { "epoch": 0.17147138857120323, "grad_norm": 5.881781578063965, "learning_rate": 8.455965222991467e-07, "loss": 0.0781, "step": 1087 }, { "epoch": 0.17162913593879403, "grad_norm": 7.500895023345947, "learning_rate": 8.454355176300112e-07, "loss": 0.0884, "step": 1088 }, { "epoch": 0.17178688330638484, "grad_norm": 8.74000358581543, "learning_rate": 8.452745129608758e-07, "loss": 0.0661, "step": 1089 }, { "epoch": 0.17194463067397564, "grad_norm": 7.282954692840576, "learning_rate": 8.451135082917404e-07, "loss": 0.1357, "step": 1090 }, { "epoch": 0.17210237804156645, "grad_norm": 3.7355973720550537, "learning_rate": 8.44952503622605e-07, "loss": 0.0493, "step": 1091 }, { "epoch": 0.17226012540915722, "grad_norm": 3.9269330501556396, "learning_rate": 8.447914989534695e-07, "loss": 0.0325, "step": 1092 }, { "epoch": 0.17241787277674803, "grad_norm": 6.021385192871094, "learning_rate": 8.446304942843341e-07, "loss": 0.1451, "step": 1093 }, { "epoch": 0.17257562014433883, "grad_norm": 3.8144242763519287, "learning_rate": 8.444694896151989e-07, "loss": 0.0452, "step": 1094 }, { "epoch": 0.17273336751192964, "grad_norm": 6.527584075927734, "learning_rate": 8.443084849460635e-07, "loss": 0.0695, "step": 1095 }, { "epoch": 0.17289111487952044, "grad_norm": 3.24444317817688, "learning_rate": 8.44147480276928e-07, "loss": 0.0555, "step": 1096 }, { "epoch": 0.17304886224711125, "grad_norm": 6.450403213500977, "learning_rate": 8.439864756077926e-07, "loss": 0.0499, "step": 1097 }, { "epoch": 0.17320660961470205, "grad_norm": 6.687957286834717, "learning_rate": 8.438254709386572e-07, "loss": 0.0643, "step": 1098 }, { "epoch": 0.17336435698229286, "grad_norm": 7.201638698577881, "learning_rate": 8.436644662695218e-07, "loss": 0.0889, "step": 1099 }, { "epoch": 0.17352210434988366, "grad_norm": 3.677889347076416, "learning_rate": 8.435034616003863e-07, "loss": 0.0551, "step": 1100 }, { "epoch": 0.17367985171747446, "grad_norm": 6.6136627197265625, "learning_rate": 8.43342456931251e-07, "loss": 0.0787, "step": 1101 }, { "epoch": 0.17383759908506527, "grad_norm": 2.215071678161621, "learning_rate": 8.431814522621156e-07, "loss": 0.0369, "step": 1102 }, { "epoch": 0.17399534645265607, "grad_norm": 5.720557689666748, "learning_rate": 8.430204475929802e-07, "loss": 0.0522, "step": 1103 }, { "epoch": 0.17415309382024688, "grad_norm": 3.361826181411743, "learning_rate": 8.428594429238447e-07, "loss": 0.032, "step": 1104 }, { "epoch": 0.17431084118783768, "grad_norm": 4.106998920440674, "learning_rate": 8.426984382547093e-07, "loss": 0.0742, "step": 1105 }, { "epoch": 0.1744685885554285, "grad_norm": 5.182785987854004, "learning_rate": 8.425374335855739e-07, "loss": 0.0779, "step": 1106 }, { "epoch": 0.1746263359230193, "grad_norm": 5.918130397796631, "learning_rate": 8.423764289164385e-07, "loss": 0.0382, "step": 1107 }, { "epoch": 0.1747840832906101, "grad_norm": 8.346705436706543, "learning_rate": 8.422154242473032e-07, "loss": 0.0692, "step": 1108 }, { "epoch": 0.1749418306582009, "grad_norm": 5.706119060516357, "learning_rate": 8.420544195781677e-07, "loss": 0.0713, "step": 1109 }, { "epoch": 0.1750995780257917, "grad_norm": 5.7744927406311035, "learning_rate": 8.418934149090323e-07, "loss": 0.0614, "step": 1110 }, { "epoch": 0.17525732539338248, "grad_norm": 5.5424394607543945, "learning_rate": 8.41732410239897e-07, "loss": 0.0786, "step": 1111 }, { "epoch": 0.1754150727609733, "grad_norm": 8.933950424194336, "learning_rate": 8.415714055707616e-07, "loss": 0.0916, "step": 1112 }, { "epoch": 0.1755728201285641, "grad_norm": 4.873121738433838, "learning_rate": 8.414104009016261e-07, "loss": 0.0791, "step": 1113 }, { "epoch": 0.1757305674961549, "grad_norm": 11.223404884338379, "learning_rate": 8.412493962324907e-07, "loss": 0.0894, "step": 1114 }, { "epoch": 0.1758883148637457, "grad_norm": 8.899439811706543, "learning_rate": 8.410883915633553e-07, "loss": 0.0627, "step": 1115 }, { "epoch": 0.1760460622313365, "grad_norm": 8.759340286254883, "learning_rate": 8.4092738689422e-07, "loss": 0.0678, "step": 1116 }, { "epoch": 0.1762038095989273, "grad_norm": 8.697700500488281, "learning_rate": 8.407663822250845e-07, "loss": 0.0842, "step": 1117 }, { "epoch": 0.17636155696651812, "grad_norm": 5.81970739364624, "learning_rate": 8.406053775559491e-07, "loss": 0.0806, "step": 1118 }, { "epoch": 0.17651930433410892, "grad_norm": 3.0905957221984863, "learning_rate": 8.404443728868137e-07, "loss": 0.0616, "step": 1119 }, { "epoch": 0.17667705170169973, "grad_norm": 5.192538261413574, "learning_rate": 8.402833682176783e-07, "loss": 0.068, "step": 1120 }, { "epoch": 0.17683479906929053, "grad_norm": 5.564635276794434, "learning_rate": 8.401223635485428e-07, "loss": 0.0597, "step": 1121 }, { "epoch": 0.17699254643688134, "grad_norm": 7.305998802185059, "learning_rate": 8.399613588794074e-07, "loss": 0.092, "step": 1122 }, { "epoch": 0.17715029380447214, "grad_norm": 8.529744148254395, "learning_rate": 8.39800354210272e-07, "loss": 0.0492, "step": 1123 }, { "epoch": 0.17730804117206295, "grad_norm": 6.663406848907471, "learning_rate": 8.396393495411367e-07, "loss": 0.058, "step": 1124 }, { "epoch": 0.17746578853965375, "grad_norm": 5.353763580322266, "learning_rate": 8.394783448720012e-07, "loss": 0.0416, "step": 1125 }, { "epoch": 0.17762353590724456, "grad_norm": 5.439206123352051, "learning_rate": 8.393173402028658e-07, "loss": 0.034, "step": 1126 }, { "epoch": 0.17778128327483536, "grad_norm": 5.433595180511475, "learning_rate": 8.391563355337304e-07, "loss": 0.0859, "step": 1127 }, { "epoch": 0.17793903064242617, "grad_norm": 5.620081901550293, "learning_rate": 8.38995330864595e-07, "loss": 0.0746, "step": 1128 }, { "epoch": 0.17809677801001697, "grad_norm": 5.438671112060547, "learning_rate": 8.388343261954597e-07, "loss": 0.051, "step": 1129 }, { "epoch": 0.17825452537760775, "grad_norm": 5.7582221031188965, "learning_rate": 8.386733215263242e-07, "loss": 0.0663, "step": 1130 }, { "epoch": 0.17841227274519855, "grad_norm": 3.84271240234375, "learning_rate": 8.385123168571889e-07, "loss": 0.0603, "step": 1131 }, { "epoch": 0.17857002011278936, "grad_norm": 10.873653411865234, "learning_rate": 8.383513121880535e-07, "loss": 0.0839, "step": 1132 }, { "epoch": 0.17872776748038016, "grad_norm": 6.634921073913574, "learning_rate": 8.381903075189181e-07, "loss": 0.0687, "step": 1133 }, { "epoch": 0.17888551484797097, "grad_norm": 9.545279502868652, "learning_rate": 8.380293028497826e-07, "loss": 0.0471, "step": 1134 }, { "epoch": 0.17904326221556177, "grad_norm": 5.004254341125488, "learning_rate": 8.378682981806472e-07, "loss": 0.0417, "step": 1135 }, { "epoch": 0.17920100958315258, "grad_norm": 6.581553936004639, "learning_rate": 8.377072935115118e-07, "loss": 0.0982, "step": 1136 }, { "epoch": 0.17935875695074338, "grad_norm": 6.660752296447754, "learning_rate": 8.375462888423764e-07, "loss": 0.0807, "step": 1137 }, { "epoch": 0.17951650431833419, "grad_norm": 3.5822994709014893, "learning_rate": 8.373852841732409e-07, "loss": 0.0605, "step": 1138 }, { "epoch": 0.179674251685925, "grad_norm": 7.432138442993164, "learning_rate": 8.372242795041056e-07, "loss": 0.0533, "step": 1139 }, { "epoch": 0.1798319990535158, "grad_norm": 5.05093240737915, "learning_rate": 8.370632748349702e-07, "loss": 0.0563, "step": 1140 }, { "epoch": 0.1799897464211066, "grad_norm": 3.2812182903289795, "learning_rate": 8.369022701658348e-07, "loss": 0.0371, "step": 1141 }, { "epoch": 0.1801474937886974, "grad_norm": 5.70815372467041, "learning_rate": 8.367412654966993e-07, "loss": 0.0748, "step": 1142 }, { "epoch": 0.1803052411562882, "grad_norm": 5.593191146850586, "learning_rate": 8.365802608275639e-07, "loss": 0.0634, "step": 1143 }, { "epoch": 0.18046298852387901, "grad_norm": 5.0286478996276855, "learning_rate": 8.364192561584285e-07, "loss": 0.0528, "step": 1144 }, { "epoch": 0.18062073589146982, "grad_norm": 4.4398698806762695, "learning_rate": 8.362582514892931e-07, "loss": 0.0998, "step": 1145 }, { "epoch": 0.18077848325906062, "grad_norm": 4.408929824829102, "learning_rate": 8.360972468201577e-07, "loss": 0.0658, "step": 1146 }, { "epoch": 0.18093623062665143, "grad_norm": 3.4798245429992676, "learning_rate": 8.359362421510224e-07, "loss": 0.057, "step": 1147 }, { "epoch": 0.18109397799424223, "grad_norm": 2.3720192909240723, "learning_rate": 8.35775237481887e-07, "loss": 0.023, "step": 1148 }, { "epoch": 0.18125172536183304, "grad_norm": 1.9957060813903809, "learning_rate": 8.356142328127516e-07, "loss": 0.022, "step": 1149 }, { "epoch": 0.18140947272942382, "grad_norm": 6.715579509735107, "learning_rate": 8.354532281436161e-07, "loss": 0.139, "step": 1150 }, { "epoch": 0.18156722009701462, "grad_norm": 6.875566005706787, "learning_rate": 8.352922234744807e-07, "loss": 0.0692, "step": 1151 }, { "epoch": 0.18172496746460542, "grad_norm": 3.6216447353363037, "learning_rate": 8.351312188053453e-07, "loss": 0.0336, "step": 1152 }, { "epoch": 0.18188271483219623, "grad_norm": 4.116306781768799, "learning_rate": 8.349702141362099e-07, "loss": 0.0441, "step": 1153 }, { "epoch": 0.18204046219978703, "grad_norm": 6.943159580230713, "learning_rate": 8.348092094670746e-07, "loss": 0.0913, "step": 1154 }, { "epoch": 0.18219820956737784, "grad_norm": 3.5021755695343018, "learning_rate": 8.346482047979391e-07, "loss": 0.0607, "step": 1155 }, { "epoch": 0.18235595693496864, "grad_norm": 5.7585649490356445, "learning_rate": 8.344872001288037e-07, "loss": 0.0786, "step": 1156 }, { "epoch": 0.18251370430255945, "grad_norm": 5.901808738708496, "learning_rate": 8.343261954596683e-07, "loss": 0.0541, "step": 1157 }, { "epoch": 0.18267145167015025, "grad_norm": 8.584582328796387, "learning_rate": 8.341651907905329e-07, "loss": 0.0558, "step": 1158 }, { "epoch": 0.18282919903774106, "grad_norm": 6.354942798614502, "learning_rate": 8.340041861213974e-07, "loss": 0.0514, "step": 1159 }, { "epoch": 0.18298694640533186, "grad_norm": 5.005200386047363, "learning_rate": 8.33843181452262e-07, "loss": 0.081, "step": 1160 }, { "epoch": 0.18314469377292267, "grad_norm": 4.33692741394043, "learning_rate": 8.336821767831267e-07, "loss": 0.0676, "step": 1161 }, { "epoch": 0.18330244114051347, "grad_norm": 5.106905460357666, "learning_rate": 8.335211721139913e-07, "loss": 0.0686, "step": 1162 }, { "epoch": 0.18346018850810428, "grad_norm": 6.16445255279541, "learning_rate": 8.333601674448558e-07, "loss": 0.0909, "step": 1163 }, { "epoch": 0.18361793587569508, "grad_norm": 3.997206211090088, "learning_rate": 8.331991627757204e-07, "loss": 0.0472, "step": 1164 }, { "epoch": 0.1837756832432859, "grad_norm": 4.126903533935547, "learning_rate": 8.330381581065851e-07, "loss": 0.0588, "step": 1165 }, { "epoch": 0.1839334306108767, "grad_norm": 10.28675365447998, "learning_rate": 8.328771534374497e-07, "loss": 0.0675, "step": 1166 }, { "epoch": 0.1840911779784675, "grad_norm": 7.186713695526123, "learning_rate": 8.327161487683142e-07, "loss": 0.0862, "step": 1167 }, { "epoch": 0.1842489253460583, "grad_norm": 5.181392192840576, "learning_rate": 8.325551440991788e-07, "loss": 0.0534, "step": 1168 }, { "epoch": 0.18440667271364908, "grad_norm": 4.184267044067383, "learning_rate": 8.323941394300435e-07, "loss": 0.0542, "step": 1169 }, { "epoch": 0.18456442008123988, "grad_norm": 8.53089714050293, "learning_rate": 8.322331347609081e-07, "loss": 0.067, "step": 1170 }, { "epoch": 0.1847221674488307, "grad_norm": 6.048624515533447, "learning_rate": 8.320721300917726e-07, "loss": 0.1088, "step": 1171 }, { "epoch": 0.1848799148164215, "grad_norm": 6.11414098739624, "learning_rate": 8.319111254226372e-07, "loss": 0.0945, "step": 1172 }, { "epoch": 0.1850376621840123, "grad_norm": 8.064154624938965, "learning_rate": 8.317501207535018e-07, "loss": 0.0806, "step": 1173 }, { "epoch": 0.1851954095516031, "grad_norm": 4.403115272521973, "learning_rate": 8.315891160843664e-07, "loss": 0.0323, "step": 1174 }, { "epoch": 0.1853531569191939, "grad_norm": 3.644397497177124, "learning_rate": 8.31428111415231e-07, "loss": 0.0336, "step": 1175 }, { "epoch": 0.1855109042867847, "grad_norm": 2.348759412765503, "learning_rate": 8.312671067460956e-07, "loss": 0.0217, "step": 1176 }, { "epoch": 0.18566865165437552, "grad_norm": 8.147252082824707, "learning_rate": 8.311061020769602e-07, "loss": 0.0559, "step": 1177 }, { "epoch": 0.18582639902196632, "grad_norm": 3.9271461963653564, "learning_rate": 8.309450974078248e-07, "loss": 0.0753, "step": 1178 }, { "epoch": 0.18598414638955713, "grad_norm": 3.8691606521606445, "learning_rate": 8.307840927386894e-07, "loss": 0.063, "step": 1179 }, { "epoch": 0.18614189375714793, "grad_norm": 7.3613152503967285, "learning_rate": 8.306230880695539e-07, "loss": 0.0477, "step": 1180 }, { "epoch": 0.18629964112473874, "grad_norm": 6.239074230194092, "learning_rate": 8.304620834004185e-07, "loss": 0.0313, "step": 1181 }, { "epoch": 0.18645738849232954, "grad_norm": 6.47470235824585, "learning_rate": 8.303010787312832e-07, "loss": 0.0405, "step": 1182 }, { "epoch": 0.18661513585992034, "grad_norm": 14.254313468933105, "learning_rate": 8.301400740621478e-07, "loss": 0.1435, "step": 1183 }, { "epoch": 0.18677288322751115, "grad_norm": 4.88731050491333, "learning_rate": 8.299790693930124e-07, "loss": 0.0442, "step": 1184 }, { "epoch": 0.18693063059510195, "grad_norm": 4.566811561584473, "learning_rate": 8.29818064723877e-07, "loss": 0.0496, "step": 1185 }, { "epoch": 0.18708837796269276, "grad_norm": 5.288136005401611, "learning_rate": 8.296570600547416e-07, "loss": 0.0606, "step": 1186 }, { "epoch": 0.18724612533028356, "grad_norm": 14.072057723999023, "learning_rate": 8.294960553856062e-07, "loss": 0.0592, "step": 1187 }, { "epoch": 0.18740387269787434, "grad_norm": 3.6332509517669678, "learning_rate": 8.293350507164707e-07, "loss": 0.091, "step": 1188 }, { "epoch": 0.18756162006546515, "grad_norm": 7.204614162445068, "learning_rate": 8.291740460473353e-07, "loss": 0.0825, "step": 1189 }, { "epoch": 0.18771936743305595, "grad_norm": 5.5783867835998535, "learning_rate": 8.290130413781999e-07, "loss": 0.0624, "step": 1190 }, { "epoch": 0.18787711480064676, "grad_norm": 8.766402244567871, "learning_rate": 8.288520367090646e-07, "loss": 0.0828, "step": 1191 }, { "epoch": 0.18803486216823756, "grad_norm": 4.802553176879883, "learning_rate": 8.286910320399291e-07, "loss": 0.074, "step": 1192 }, { "epoch": 0.18819260953582836, "grad_norm": 11.380311965942383, "learning_rate": 8.285300273707937e-07, "loss": 0.0936, "step": 1193 }, { "epoch": 0.18835035690341917, "grad_norm": 9.392463684082031, "learning_rate": 8.283690227016583e-07, "loss": 0.094, "step": 1194 }, { "epoch": 0.18850810427100997, "grad_norm": 7.408383369445801, "learning_rate": 8.282080180325229e-07, "loss": 0.0684, "step": 1195 }, { "epoch": 0.18866585163860078, "grad_norm": 6.016057014465332, "learning_rate": 8.280470133633874e-07, "loss": 0.0684, "step": 1196 }, { "epoch": 0.18882359900619158, "grad_norm": 4.20705509185791, "learning_rate": 8.27886008694252e-07, "loss": 0.0592, "step": 1197 }, { "epoch": 0.1889813463737824, "grad_norm": 6.25236701965332, "learning_rate": 8.277250040251166e-07, "loss": 0.109, "step": 1198 }, { "epoch": 0.1891390937413732, "grad_norm": 8.855053901672363, "learning_rate": 8.275639993559814e-07, "loss": 0.0567, "step": 1199 }, { "epoch": 0.189296841108964, "grad_norm": 7.839231014251709, "learning_rate": 8.27402994686846e-07, "loss": 0.08, "step": 1200 }, { "epoch": 0.1894545884765548, "grad_norm": 5.923698902130127, "learning_rate": 8.272419900177105e-07, "loss": 0.0526, "step": 1201 }, { "epoch": 0.1896123358441456, "grad_norm": 3.367675542831421, "learning_rate": 8.270809853485751e-07, "loss": 0.032, "step": 1202 }, { "epoch": 0.1897700832117364, "grad_norm": 7.34796667098999, "learning_rate": 8.269199806794397e-07, "loss": 0.0781, "step": 1203 }, { "epoch": 0.18992783057932722, "grad_norm": 4.488637447357178, "learning_rate": 8.267589760103043e-07, "loss": 0.0655, "step": 1204 }, { "epoch": 0.19008557794691802, "grad_norm": 7.351264476776123, "learning_rate": 8.265979713411688e-07, "loss": 0.0438, "step": 1205 }, { "epoch": 0.19024332531450883, "grad_norm": 6.292731761932373, "learning_rate": 8.264369666720335e-07, "loss": 0.053, "step": 1206 }, { "epoch": 0.19040107268209963, "grad_norm": 9.767741203308105, "learning_rate": 8.262759620028981e-07, "loss": 0.066, "step": 1207 }, { "epoch": 0.1905588200496904, "grad_norm": 6.028949737548828, "learning_rate": 8.261149573337627e-07, "loss": 0.053, "step": 1208 }, { "epoch": 0.1907165674172812, "grad_norm": 9.830450057983398, "learning_rate": 8.259539526646272e-07, "loss": 0.0771, "step": 1209 }, { "epoch": 0.19087431478487202, "grad_norm": 6.367222309112549, "learning_rate": 8.257929479954918e-07, "loss": 0.0933, "step": 1210 }, { "epoch": 0.19103206215246282, "grad_norm": 3.649705410003662, "learning_rate": 8.256319433263564e-07, "loss": 0.0728, "step": 1211 }, { "epoch": 0.19118980952005363, "grad_norm": 6.903989315032959, "learning_rate": 8.25470938657221e-07, "loss": 0.1195, "step": 1212 }, { "epoch": 0.19134755688764443, "grad_norm": 9.6062593460083, "learning_rate": 8.253099339880855e-07, "loss": 0.1184, "step": 1213 }, { "epoch": 0.19150530425523524, "grad_norm": 4.583438873291016, "learning_rate": 8.251489293189502e-07, "loss": 0.0317, "step": 1214 }, { "epoch": 0.19166305162282604, "grad_norm": 3.6098432540893555, "learning_rate": 8.249879246498148e-07, "loss": 0.0639, "step": 1215 }, { "epoch": 0.19182079899041685, "grad_norm": 5.552302360534668, "learning_rate": 8.248269199806794e-07, "loss": 0.0482, "step": 1216 }, { "epoch": 0.19197854635800765, "grad_norm": 4.402776718139648, "learning_rate": 8.24665915311544e-07, "loss": 0.0933, "step": 1217 }, { "epoch": 0.19213629372559846, "grad_norm": 6.575020790100098, "learning_rate": 8.245049106424086e-07, "loss": 0.0679, "step": 1218 }, { "epoch": 0.19229404109318926, "grad_norm": 6.469968795776367, "learning_rate": 8.243439059732732e-07, "loss": 0.0729, "step": 1219 }, { "epoch": 0.19245178846078007, "grad_norm": 5.68861722946167, "learning_rate": 8.241829013041378e-07, "loss": 0.1167, "step": 1220 }, { "epoch": 0.19260953582837087, "grad_norm": 8.055570602416992, "learning_rate": 8.240218966350024e-07, "loss": 0.0498, "step": 1221 }, { "epoch": 0.19276728319596168, "grad_norm": 4.277229309082031, "learning_rate": 8.23860891965867e-07, "loss": 0.03, "step": 1222 }, { "epoch": 0.19292503056355248, "grad_norm": 5.417783737182617, "learning_rate": 8.236998872967316e-07, "loss": 0.0699, "step": 1223 }, { "epoch": 0.19308277793114328, "grad_norm": 5.803797245025635, "learning_rate": 8.235388826275962e-07, "loss": 0.0586, "step": 1224 }, { "epoch": 0.1932405252987341, "grad_norm": 3.9364662170410156, "learning_rate": 8.233778779584608e-07, "loss": 0.0489, "step": 1225 }, { "epoch": 0.1933982726663249, "grad_norm": 7.794634819030762, "learning_rate": 8.232168732893253e-07, "loss": 0.1499, "step": 1226 }, { "epoch": 0.19355602003391567, "grad_norm": 3.498791456222534, "learning_rate": 8.230558686201899e-07, "loss": 0.0737, "step": 1227 }, { "epoch": 0.19371376740150648, "grad_norm": 6.459693908691406, "learning_rate": 8.228948639510545e-07, "loss": 0.0896, "step": 1228 }, { "epoch": 0.19387151476909728, "grad_norm": 4.633272171020508, "learning_rate": 8.227338592819192e-07, "loss": 0.0434, "step": 1229 }, { "epoch": 0.19402926213668809, "grad_norm": 6.239524841308594, "learning_rate": 8.225728546127837e-07, "loss": 0.0456, "step": 1230 }, { "epoch": 0.1941870095042789, "grad_norm": 3.814872980117798, "learning_rate": 8.224118499436483e-07, "loss": 0.06, "step": 1231 }, { "epoch": 0.1943447568718697, "grad_norm": 4.028870582580566, "learning_rate": 8.222508452745129e-07, "loss": 0.0408, "step": 1232 }, { "epoch": 0.1945025042394605, "grad_norm": 12.71030330657959, "learning_rate": 8.220898406053775e-07, "loss": 0.0828, "step": 1233 }, { "epoch": 0.1946602516070513, "grad_norm": 2.8151774406433105, "learning_rate": 8.21928835936242e-07, "loss": 0.0308, "step": 1234 }, { "epoch": 0.1948179989746421, "grad_norm": 8.508167266845703, "learning_rate": 8.217678312671067e-07, "loss": 0.0902, "step": 1235 }, { "epoch": 0.19497574634223291, "grad_norm": 7.739496231079102, "learning_rate": 8.216068265979714e-07, "loss": 0.0605, "step": 1236 }, { "epoch": 0.19513349370982372, "grad_norm": 5.401721477508545, "learning_rate": 8.21445821928836e-07, "loss": 0.0559, "step": 1237 }, { "epoch": 0.19529124107741452, "grad_norm": 3.6133265495300293, "learning_rate": 8.212848172597005e-07, "loss": 0.0669, "step": 1238 }, { "epoch": 0.19544898844500533, "grad_norm": 6.507582664489746, "learning_rate": 8.211238125905651e-07, "loss": 0.0815, "step": 1239 }, { "epoch": 0.19560673581259613, "grad_norm": 3.1642470359802246, "learning_rate": 8.209628079214297e-07, "loss": 0.031, "step": 1240 }, { "epoch": 0.19576448318018694, "grad_norm": 4.73131799697876, "learning_rate": 8.208018032522943e-07, "loss": 0.0462, "step": 1241 }, { "epoch": 0.19592223054777774, "grad_norm": 8.675853729248047, "learning_rate": 8.206407985831588e-07, "loss": 0.1086, "step": 1242 }, { "epoch": 0.19607997791536855, "grad_norm": 3.636523962020874, "learning_rate": 8.204797939140234e-07, "loss": 0.0459, "step": 1243 }, { "epoch": 0.19623772528295935, "grad_norm": 8.941878318786621, "learning_rate": 8.203187892448881e-07, "loss": 0.0617, "step": 1244 }, { "epoch": 0.19639547265055016, "grad_norm": 1.8642269372940063, "learning_rate": 8.201577845757527e-07, "loss": 0.0185, "step": 1245 }, { "epoch": 0.19655322001814093, "grad_norm": 11.972249984741211, "learning_rate": 8.199967799066173e-07, "loss": 0.0908, "step": 1246 }, { "epoch": 0.19671096738573174, "grad_norm": 3.977522850036621, "learning_rate": 8.198357752374818e-07, "loss": 0.0561, "step": 1247 }, { "epoch": 0.19686871475332254, "grad_norm": 7.002127647399902, "learning_rate": 8.196747705683464e-07, "loss": 0.1224, "step": 1248 }, { "epoch": 0.19702646212091335, "grad_norm": 3.3856682777404785, "learning_rate": 8.19513765899211e-07, "loss": 0.0444, "step": 1249 }, { "epoch": 0.19718420948850415, "grad_norm": 7.356138706207275, "learning_rate": 8.193527612300756e-07, "loss": 0.0639, "step": 1250 }, { "epoch": 0.19734195685609496, "grad_norm": 5.135031700134277, "learning_rate": 8.191917565609402e-07, "loss": 0.0695, "step": 1251 }, { "epoch": 0.19749970422368576, "grad_norm": 5.035935878753662, "learning_rate": 8.190307518918049e-07, "loss": 0.0493, "step": 1252 }, { "epoch": 0.19765745159127657, "grad_norm": 3.736492872238159, "learning_rate": 8.188697472226695e-07, "loss": 0.0306, "step": 1253 }, { "epoch": 0.19781519895886737, "grad_norm": 8.930371284484863, "learning_rate": 8.187087425535341e-07, "loss": 0.087, "step": 1254 }, { "epoch": 0.19797294632645818, "grad_norm": 7.436129570007324, "learning_rate": 8.185477378843986e-07, "loss": 0.0928, "step": 1255 }, { "epoch": 0.19813069369404898, "grad_norm": 9.20053768157959, "learning_rate": 8.183867332152632e-07, "loss": 0.1073, "step": 1256 }, { "epoch": 0.1982884410616398, "grad_norm": 9.247213363647461, "learning_rate": 8.182257285461278e-07, "loss": 0.0914, "step": 1257 }, { "epoch": 0.1984461884292306, "grad_norm": 5.168614387512207, "learning_rate": 8.180647238769924e-07, "loss": 0.0525, "step": 1258 }, { "epoch": 0.1986039357968214, "grad_norm": 4.512944221496582, "learning_rate": 8.17903719207857e-07, "loss": 0.0874, "step": 1259 }, { "epoch": 0.1987616831644122, "grad_norm": 4.579786777496338, "learning_rate": 8.177427145387216e-07, "loss": 0.0633, "step": 1260 }, { "epoch": 0.198919430532003, "grad_norm": 7.017485618591309, "learning_rate": 8.175817098695862e-07, "loss": 0.072, "step": 1261 }, { "epoch": 0.1990771778995938, "grad_norm": 6.463603496551514, "learning_rate": 8.174207052004508e-07, "loss": 0.0697, "step": 1262 }, { "epoch": 0.19923492526718461, "grad_norm": 5.57963752746582, "learning_rate": 8.172597005313153e-07, "loss": 0.1092, "step": 1263 }, { "epoch": 0.19939267263477542, "grad_norm": 7.083054065704346, "learning_rate": 8.170986958621799e-07, "loss": 0.0666, "step": 1264 }, { "epoch": 0.1995504200023662, "grad_norm": 8.915925979614258, "learning_rate": 8.169376911930445e-07, "loss": 0.1097, "step": 1265 }, { "epoch": 0.199708167369957, "grad_norm": 6.554561614990234, "learning_rate": 8.167766865239092e-07, "loss": 0.0687, "step": 1266 }, { "epoch": 0.1998659147375478, "grad_norm": 7.839459419250488, "learning_rate": 8.166156818547737e-07, "loss": 0.0548, "step": 1267 }, { "epoch": 0.2000236621051386, "grad_norm": 6.417100429534912, "learning_rate": 8.164546771856383e-07, "loss": 0.0362, "step": 1268 }, { "epoch": 0.20018140947272942, "grad_norm": 6.159739017486572, "learning_rate": 8.16293672516503e-07, "loss": 0.0846, "step": 1269 }, { "epoch": 0.20033915684032022, "grad_norm": 3.370793342590332, "learning_rate": 8.161326678473676e-07, "loss": 0.0212, "step": 1270 }, { "epoch": 0.20049690420791103, "grad_norm": 7.1980204582214355, "learning_rate": 8.159716631782322e-07, "loss": 0.0891, "step": 1271 }, { "epoch": 0.20065465157550183, "grad_norm": 6.110872268676758, "learning_rate": 8.158106585090967e-07, "loss": 0.051, "step": 1272 }, { "epoch": 0.20081239894309263, "grad_norm": 3.0168726444244385, "learning_rate": 8.156496538399613e-07, "loss": 0.0416, "step": 1273 }, { "epoch": 0.20097014631068344, "grad_norm": 6.868875980377197, "learning_rate": 8.15488649170826e-07, "loss": 0.0664, "step": 1274 }, { "epoch": 0.20112789367827424, "grad_norm": 4.881394863128662, "learning_rate": 8.153276445016906e-07, "loss": 0.066, "step": 1275 }, { "epoch": 0.20128564104586505, "grad_norm": 8.150283813476562, "learning_rate": 8.151666398325551e-07, "loss": 0.1026, "step": 1276 }, { "epoch": 0.20144338841345585, "grad_norm": 8.353609085083008, "learning_rate": 8.150056351634197e-07, "loss": 0.0588, "step": 1277 }, { "epoch": 0.20160113578104666, "grad_norm": 8.24417781829834, "learning_rate": 8.148446304942843e-07, "loss": 0.0824, "step": 1278 }, { "epoch": 0.20175888314863746, "grad_norm": 3.7639694213867188, "learning_rate": 8.146836258251489e-07, "loss": 0.0743, "step": 1279 }, { "epoch": 0.20191663051622827, "grad_norm": 4.55227518081665, "learning_rate": 8.145226211560134e-07, "loss": 0.0685, "step": 1280 }, { "epoch": 0.20207437788381907, "grad_norm": 5.401048183441162, "learning_rate": 8.143616164868781e-07, "loss": 0.0287, "step": 1281 }, { "epoch": 0.20223212525140988, "grad_norm": 5.142983913421631, "learning_rate": 8.142006118177427e-07, "loss": 0.0818, "step": 1282 }, { "epoch": 0.20238987261900068, "grad_norm": 5.776869773864746, "learning_rate": 8.140396071486073e-07, "loss": 0.0632, "step": 1283 }, { "epoch": 0.2025476199865915, "grad_norm": 4.408137321472168, "learning_rate": 8.138786024794718e-07, "loss": 0.0448, "step": 1284 }, { "epoch": 0.20270536735418226, "grad_norm": 7.559304237365723, "learning_rate": 8.137175978103364e-07, "loss": 0.0956, "step": 1285 }, { "epoch": 0.20286311472177307, "grad_norm": 5.189079284667969, "learning_rate": 8.13556593141201e-07, "loss": 0.0409, "step": 1286 }, { "epoch": 0.20302086208936387, "grad_norm": 4.444090843200684, "learning_rate": 8.133955884720657e-07, "loss": 0.0585, "step": 1287 }, { "epoch": 0.20317860945695468, "grad_norm": 2.579726457595825, "learning_rate": 8.132345838029302e-07, "loss": 0.0307, "step": 1288 }, { "epoch": 0.20333635682454548, "grad_norm": 9.362491607666016, "learning_rate": 8.130735791337949e-07, "loss": 0.0689, "step": 1289 }, { "epoch": 0.2034941041921363, "grad_norm": 6.831413745880127, "learning_rate": 8.129125744646595e-07, "loss": 0.0439, "step": 1290 }, { "epoch": 0.2036518515597271, "grad_norm": 2.6339004039764404, "learning_rate": 8.127515697955241e-07, "loss": 0.028, "step": 1291 }, { "epoch": 0.2038095989273179, "grad_norm": 7.570190906524658, "learning_rate": 8.125905651263887e-07, "loss": 0.0692, "step": 1292 }, { "epoch": 0.2039673462949087, "grad_norm": 4.936234474182129, "learning_rate": 8.124295604572532e-07, "loss": 0.0602, "step": 1293 }, { "epoch": 0.2041250936624995, "grad_norm": 4.387109756469727, "learning_rate": 8.122685557881178e-07, "loss": 0.0481, "step": 1294 }, { "epoch": 0.2042828410300903, "grad_norm": 5.723708152770996, "learning_rate": 8.121075511189824e-07, "loss": 0.0404, "step": 1295 }, { "epoch": 0.20444058839768112, "grad_norm": 3.7193493843078613, "learning_rate": 8.119465464498471e-07, "loss": 0.0392, "step": 1296 }, { "epoch": 0.20459833576527192, "grad_norm": 8.105212211608887, "learning_rate": 8.117855417807116e-07, "loss": 0.0664, "step": 1297 }, { "epoch": 0.20475608313286273, "grad_norm": 5.5150628089904785, "learning_rate": 8.116245371115762e-07, "loss": 0.0491, "step": 1298 }, { "epoch": 0.20491383050045353, "grad_norm": 7.704408645629883, "learning_rate": 8.114635324424408e-07, "loss": 0.0674, "step": 1299 }, { "epoch": 0.20507157786804434, "grad_norm": 4.652005672454834, "learning_rate": 8.113025277733054e-07, "loss": 0.0475, "step": 1300 }, { "epoch": 0.20522932523563514, "grad_norm": 6.252089023590088, "learning_rate": 8.111415231041699e-07, "loss": 0.051, "step": 1301 }, { "epoch": 0.20538707260322595, "grad_norm": 8.732755661010742, "learning_rate": 8.109805184350345e-07, "loss": 0.0956, "step": 1302 }, { "epoch": 0.20554481997081675, "grad_norm": 5.888700485229492, "learning_rate": 8.108195137658991e-07, "loss": 0.1075, "step": 1303 }, { "epoch": 0.20570256733840753, "grad_norm": 5.2593674659729, "learning_rate": 8.106585090967639e-07, "loss": 0.0555, "step": 1304 }, { "epoch": 0.20586031470599833, "grad_norm": 7.008612155914307, "learning_rate": 8.104975044276284e-07, "loss": 0.0594, "step": 1305 }, { "epoch": 0.20601806207358914, "grad_norm": 9.058858871459961, "learning_rate": 8.10336499758493e-07, "loss": 0.0775, "step": 1306 }, { "epoch": 0.20617580944117994, "grad_norm": 5.718960762023926, "learning_rate": 8.101754950893576e-07, "loss": 0.0513, "step": 1307 }, { "epoch": 0.20633355680877075, "grad_norm": 4.485774040222168, "learning_rate": 8.100144904202222e-07, "loss": 0.0474, "step": 1308 }, { "epoch": 0.20649130417636155, "grad_norm": 10.537572860717773, "learning_rate": 8.098534857510867e-07, "loss": 0.0431, "step": 1309 }, { "epoch": 0.20664905154395236, "grad_norm": 5.365055561065674, "learning_rate": 8.096924810819513e-07, "loss": 0.0398, "step": 1310 }, { "epoch": 0.20680679891154316, "grad_norm": 8.628813743591309, "learning_rate": 8.09531476412816e-07, "loss": 0.0722, "step": 1311 }, { "epoch": 0.20696454627913397, "grad_norm": 6.088448524475098, "learning_rate": 8.093704717436806e-07, "loss": 0.1014, "step": 1312 }, { "epoch": 0.20712229364672477, "grad_norm": 3.6947081089019775, "learning_rate": 8.092094670745451e-07, "loss": 0.0686, "step": 1313 }, { "epoch": 0.20728004101431557, "grad_norm": 3.5171139240264893, "learning_rate": 8.090484624054097e-07, "loss": 0.038, "step": 1314 }, { "epoch": 0.20743778838190638, "grad_norm": 6.769224166870117, "learning_rate": 8.088874577362743e-07, "loss": 0.0867, "step": 1315 }, { "epoch": 0.20759553574949718, "grad_norm": 5.466546058654785, "learning_rate": 8.087264530671389e-07, "loss": 0.0652, "step": 1316 }, { "epoch": 0.207753283117088, "grad_norm": 3.244213581085205, "learning_rate": 8.085654483980035e-07, "loss": 0.0783, "step": 1317 }, { "epoch": 0.2079110304846788, "grad_norm": 3.2699074745178223, "learning_rate": 8.08404443728868e-07, "loss": 0.0478, "step": 1318 }, { "epoch": 0.2080687778522696, "grad_norm": 7.128431797027588, "learning_rate": 8.082434390597327e-07, "loss": 0.046, "step": 1319 }, { "epoch": 0.2082265252198604, "grad_norm": 7.5372467041015625, "learning_rate": 8.080824343905973e-07, "loss": 0.0688, "step": 1320 }, { "epoch": 0.2083842725874512, "grad_norm": 2.642981767654419, "learning_rate": 8.07921429721462e-07, "loss": 0.0256, "step": 1321 }, { "epoch": 0.208542019955042, "grad_norm": 7.668417930603027, "learning_rate": 8.077604250523264e-07, "loss": 0.0748, "step": 1322 }, { "epoch": 0.2086997673226328, "grad_norm": 8.645655632019043, "learning_rate": 8.075994203831911e-07, "loss": 0.0862, "step": 1323 }, { "epoch": 0.2088575146902236, "grad_norm": 2.2922632694244385, "learning_rate": 8.074384157140557e-07, "loss": 0.0268, "step": 1324 }, { "epoch": 0.2090152620578144, "grad_norm": 5.980663776397705, "learning_rate": 8.072774110449203e-07, "loss": 0.0716, "step": 1325 }, { "epoch": 0.2091730094254052, "grad_norm": 4.555629730224609, "learning_rate": 8.071164063757848e-07, "loss": 0.0333, "step": 1326 }, { "epoch": 0.209330756792996, "grad_norm": 4.295013427734375, "learning_rate": 8.069554017066495e-07, "loss": 0.025, "step": 1327 }, { "epoch": 0.2094885041605868, "grad_norm": 7.122177600860596, "learning_rate": 8.067943970375141e-07, "loss": 0.047, "step": 1328 }, { "epoch": 0.20964625152817762, "grad_norm": 5.486662864685059, "learning_rate": 8.066333923683787e-07, "loss": 0.0724, "step": 1329 }, { "epoch": 0.20980399889576842, "grad_norm": 3.3127543926239014, "learning_rate": 8.064723876992432e-07, "loss": 0.0235, "step": 1330 }, { "epoch": 0.20996174626335923, "grad_norm": 4.185500621795654, "learning_rate": 8.063113830301078e-07, "loss": 0.0151, "step": 1331 }, { "epoch": 0.21011949363095003, "grad_norm": 6.932736873626709, "learning_rate": 8.061503783609724e-07, "loss": 0.1283, "step": 1332 }, { "epoch": 0.21027724099854084, "grad_norm": 3.9714646339416504, "learning_rate": 8.05989373691837e-07, "loss": 0.0159, "step": 1333 }, { "epoch": 0.21043498836613164, "grad_norm": 6.8552350997924805, "learning_rate": 8.058283690227016e-07, "loss": 0.0685, "step": 1334 }, { "epoch": 0.21059273573372245, "grad_norm": 5.01610803604126, "learning_rate": 8.056673643535662e-07, "loss": 0.0534, "step": 1335 }, { "epoch": 0.21075048310131325, "grad_norm": 5.589885711669922, "learning_rate": 8.055063596844308e-07, "loss": 0.0417, "step": 1336 }, { "epoch": 0.21090823046890406, "grad_norm": 5.473964214324951, "learning_rate": 8.053453550152954e-07, "loss": 0.0433, "step": 1337 }, { "epoch": 0.21106597783649486, "grad_norm": 2.978450298309326, "learning_rate": 8.0518435034616e-07, "loss": 0.054, "step": 1338 }, { "epoch": 0.21122372520408567, "grad_norm": 8.028962135314941, "learning_rate": 8.050233456770245e-07, "loss": 0.0431, "step": 1339 }, { "epoch": 0.21138147257167647, "grad_norm": 4.471803188323975, "learning_rate": 8.048623410078891e-07, "loss": 0.0774, "step": 1340 }, { "epoch": 0.21153921993926728, "grad_norm": 6.427778720855713, "learning_rate": 8.047013363387539e-07, "loss": 0.0652, "step": 1341 }, { "epoch": 0.21169696730685808, "grad_norm": 4.534457206726074, "learning_rate": 8.045403316696185e-07, "loss": 0.061, "step": 1342 }, { "epoch": 0.21185471467444886, "grad_norm": 5.0695576667785645, "learning_rate": 8.04379327000483e-07, "loss": 0.0607, "step": 1343 }, { "epoch": 0.21201246204203966, "grad_norm": 7.301562786102295, "learning_rate": 8.042183223313476e-07, "loss": 0.0793, "step": 1344 }, { "epoch": 0.21217020940963047, "grad_norm": 5.415078639984131, "learning_rate": 8.040573176622122e-07, "loss": 0.0633, "step": 1345 }, { "epoch": 0.21232795677722127, "grad_norm": 7.417275428771973, "learning_rate": 8.038963129930768e-07, "loss": 0.0794, "step": 1346 }, { "epoch": 0.21248570414481208, "grad_norm": 5.172476291656494, "learning_rate": 8.037353083239413e-07, "loss": 0.0586, "step": 1347 }, { "epoch": 0.21264345151240288, "grad_norm": 4.431525230407715, "learning_rate": 8.035743036548059e-07, "loss": 0.0413, "step": 1348 }, { "epoch": 0.21280119887999369, "grad_norm": 5.791142463684082, "learning_rate": 8.034132989856706e-07, "loss": 0.0636, "step": 1349 }, { "epoch": 0.2129589462475845, "grad_norm": 4.702826023101807, "learning_rate": 8.032522943165352e-07, "loss": 0.0536, "step": 1350 }, { "epoch": 0.2131166936151753, "grad_norm": 8.566353797912598, "learning_rate": 8.030912896473997e-07, "loss": 0.0381, "step": 1351 }, { "epoch": 0.2132744409827661, "grad_norm": 6.175302028656006, "learning_rate": 8.029302849782643e-07, "loss": 0.0686, "step": 1352 }, { "epoch": 0.2134321883503569, "grad_norm": 4.509758472442627, "learning_rate": 8.027692803091289e-07, "loss": 0.0375, "step": 1353 }, { "epoch": 0.2135899357179477, "grad_norm": 9.859237670898438, "learning_rate": 8.026082756399935e-07, "loss": 0.0583, "step": 1354 }, { "epoch": 0.21374768308553851, "grad_norm": 3.6694836616516113, "learning_rate": 8.02447270970858e-07, "loss": 0.0303, "step": 1355 }, { "epoch": 0.21390543045312932, "grad_norm": 4.9124579429626465, "learning_rate": 8.022862663017226e-07, "loss": 0.0792, "step": 1356 }, { "epoch": 0.21406317782072012, "grad_norm": 4.5436110496521, "learning_rate": 8.021252616325874e-07, "loss": 0.075, "step": 1357 }, { "epoch": 0.21422092518831093, "grad_norm": 7.841489791870117, "learning_rate": 8.01964256963452e-07, "loss": 0.0614, "step": 1358 }, { "epoch": 0.21437867255590173, "grad_norm": 10.84503173828125, "learning_rate": 8.018032522943165e-07, "loss": 0.1507, "step": 1359 }, { "epoch": 0.21453641992349254, "grad_norm": 5.0677103996276855, "learning_rate": 8.016422476251811e-07, "loss": 0.058, "step": 1360 }, { "epoch": 0.21469416729108334, "grad_norm": 5.633657932281494, "learning_rate": 8.014812429560457e-07, "loss": 0.0513, "step": 1361 }, { "epoch": 0.21485191465867412, "grad_norm": 6.73211669921875, "learning_rate": 8.013202382869103e-07, "loss": 0.0705, "step": 1362 }, { "epoch": 0.21500966202626492, "grad_norm": 10.13681697845459, "learning_rate": 8.011592336177749e-07, "loss": 0.0725, "step": 1363 }, { "epoch": 0.21516740939385573, "grad_norm": 8.70879077911377, "learning_rate": 8.009982289486395e-07, "loss": 0.0515, "step": 1364 }, { "epoch": 0.21532515676144653, "grad_norm": 4.02731466293335, "learning_rate": 8.008372242795041e-07, "loss": 0.0526, "step": 1365 }, { "epoch": 0.21548290412903734, "grad_norm": 6.5245466232299805, "learning_rate": 8.006762196103687e-07, "loss": 0.087, "step": 1366 }, { "epoch": 0.21564065149662814, "grad_norm": 5.074929237365723, "learning_rate": 8.005152149412333e-07, "loss": 0.0595, "step": 1367 }, { "epoch": 0.21579839886421895, "grad_norm": 4.632253170013428, "learning_rate": 8.003542102720978e-07, "loss": 0.0488, "step": 1368 }, { "epoch": 0.21595614623180975, "grad_norm": 4.121384620666504, "learning_rate": 8.001932056029624e-07, "loss": 0.0288, "step": 1369 }, { "epoch": 0.21611389359940056, "grad_norm": 6.601824760437012, "learning_rate": 8.00032200933827e-07, "loss": 0.0931, "step": 1370 }, { "epoch": 0.21627164096699136, "grad_norm": 6.737264633178711, "learning_rate": 7.998711962646917e-07, "loss": 0.04, "step": 1371 }, { "epoch": 0.21642938833458217, "grad_norm": 7.603921413421631, "learning_rate": 7.997101915955562e-07, "loss": 0.05, "step": 1372 }, { "epoch": 0.21658713570217297, "grad_norm": 4.314795017242432, "learning_rate": 7.995491869264208e-07, "loss": 0.0385, "step": 1373 }, { "epoch": 0.21674488306976378, "grad_norm": 7.3531174659729, "learning_rate": 7.993881822572854e-07, "loss": 0.0681, "step": 1374 }, { "epoch": 0.21690263043735458, "grad_norm": 14.709685325622559, "learning_rate": 7.9922717758815e-07, "loss": 0.0719, "step": 1375 }, { "epoch": 0.2170603778049454, "grad_norm": 6.657068252563477, "learning_rate": 7.990661729190146e-07, "loss": 0.0705, "step": 1376 }, { "epoch": 0.2172181251725362, "grad_norm": 5.743774890899658, "learning_rate": 7.989051682498792e-07, "loss": 0.0613, "step": 1377 }, { "epoch": 0.217375872540127, "grad_norm": 4.418606281280518, "learning_rate": 7.987441635807438e-07, "loss": 0.0876, "step": 1378 }, { "epoch": 0.2175336199077178, "grad_norm": 7.777172088623047, "learning_rate": 7.985831589116085e-07, "loss": 0.0558, "step": 1379 }, { "epoch": 0.2176913672753086, "grad_norm": 1.5176472663879395, "learning_rate": 7.98422154242473e-07, "loss": 0.0112, "step": 1380 }, { "epoch": 0.21784911464289938, "grad_norm": 5.456096172332764, "learning_rate": 7.982611495733376e-07, "loss": 0.043, "step": 1381 }, { "epoch": 0.2180068620104902, "grad_norm": 11.711322784423828, "learning_rate": 7.981001449042022e-07, "loss": 0.1085, "step": 1382 }, { "epoch": 0.218164609378081, "grad_norm": 3.6353113651275635, "learning_rate": 7.979391402350668e-07, "loss": 0.0245, "step": 1383 }, { "epoch": 0.2183223567456718, "grad_norm": 8.09837818145752, "learning_rate": 7.977781355659313e-07, "loss": 0.0565, "step": 1384 }, { "epoch": 0.2184801041132626, "grad_norm": 6.092850208282471, "learning_rate": 7.976171308967959e-07, "loss": 0.0585, "step": 1385 }, { "epoch": 0.2186378514808534, "grad_norm": 7.001495361328125, "learning_rate": 7.974561262276606e-07, "loss": 0.0659, "step": 1386 }, { "epoch": 0.2187955988484442, "grad_norm": 5.3207197189331055, "learning_rate": 7.972951215585252e-07, "loss": 0.0883, "step": 1387 }, { "epoch": 0.21895334621603502, "grad_norm": 5.347443103790283, "learning_rate": 7.971341168893898e-07, "loss": 0.0562, "step": 1388 }, { "epoch": 0.21911109358362582, "grad_norm": 6.29995059967041, "learning_rate": 7.969731122202543e-07, "loss": 0.0423, "step": 1389 }, { "epoch": 0.21926884095121663, "grad_norm": 2.837951183319092, "learning_rate": 7.968121075511189e-07, "loss": 0.0388, "step": 1390 }, { "epoch": 0.21942658831880743, "grad_norm": 4.2165985107421875, "learning_rate": 7.966511028819835e-07, "loss": 0.0278, "step": 1391 }, { "epoch": 0.21958433568639824, "grad_norm": 4.890751361846924, "learning_rate": 7.964900982128481e-07, "loss": 0.0794, "step": 1392 }, { "epoch": 0.21974208305398904, "grad_norm": 5.265214920043945, "learning_rate": 7.963290935437126e-07, "loss": 0.0353, "step": 1393 }, { "epoch": 0.21989983042157984, "grad_norm": 3.7690224647521973, "learning_rate": 7.961680888745774e-07, "loss": 0.0438, "step": 1394 }, { "epoch": 0.22005757778917065, "grad_norm": 5.014218330383301, "learning_rate": 7.96007084205442e-07, "loss": 0.0512, "step": 1395 }, { "epoch": 0.22021532515676145, "grad_norm": 9.617183685302734, "learning_rate": 7.958460795363066e-07, "loss": 0.0949, "step": 1396 }, { "epoch": 0.22037307252435226, "grad_norm": 2.744088649749756, "learning_rate": 7.956850748671711e-07, "loss": 0.0335, "step": 1397 }, { "epoch": 0.22053081989194306, "grad_norm": 3.987654447555542, "learning_rate": 7.955240701980357e-07, "loss": 0.0356, "step": 1398 }, { "epoch": 0.22068856725953387, "grad_norm": 8.036004066467285, "learning_rate": 7.953630655289003e-07, "loss": 0.0422, "step": 1399 }, { "epoch": 0.22084631462712467, "grad_norm": 6.1750030517578125, "learning_rate": 7.952020608597649e-07, "loss": 0.0475, "step": 1400 }, { "epoch": 0.22100406199471545, "grad_norm": 5.037681579589844, "learning_rate": 7.950410561906294e-07, "loss": 0.0372, "step": 1401 }, { "epoch": 0.22116180936230626, "grad_norm": 3.706131935119629, "learning_rate": 7.948800515214941e-07, "loss": 0.0447, "step": 1402 }, { "epoch": 0.22131955672989706, "grad_norm": 3.950047731399536, "learning_rate": 7.947190468523587e-07, "loss": 0.0664, "step": 1403 }, { "epoch": 0.22147730409748786, "grad_norm": 3.1588940620422363, "learning_rate": 7.945580421832233e-07, "loss": 0.0358, "step": 1404 }, { "epoch": 0.22163505146507867, "grad_norm": 3.942070722579956, "learning_rate": 7.943970375140878e-07, "loss": 0.0551, "step": 1405 }, { "epoch": 0.22179279883266947, "grad_norm": 4.074398994445801, "learning_rate": 7.942360328449524e-07, "loss": 0.0352, "step": 1406 }, { "epoch": 0.22195054620026028, "grad_norm": 4.072987079620361, "learning_rate": 7.94075028175817e-07, "loss": 0.0698, "step": 1407 }, { "epoch": 0.22210829356785108, "grad_norm": 6.349443435668945, "learning_rate": 7.939140235066816e-07, "loss": 0.0742, "step": 1408 }, { "epoch": 0.2222660409354419, "grad_norm": 5.44382381439209, "learning_rate": 7.937530188375463e-07, "loss": 0.0634, "step": 1409 }, { "epoch": 0.2224237883030327, "grad_norm": 5.967154502868652, "learning_rate": 7.935920141684109e-07, "loss": 0.0765, "step": 1410 }, { "epoch": 0.2225815356706235, "grad_norm": 7.5302839279174805, "learning_rate": 7.934310094992755e-07, "loss": 0.0446, "step": 1411 }, { "epoch": 0.2227392830382143, "grad_norm": 3.2310304641723633, "learning_rate": 7.932700048301401e-07, "loss": 0.0167, "step": 1412 }, { "epoch": 0.2228970304058051, "grad_norm": 7.038295745849609, "learning_rate": 7.931090001610047e-07, "loss": 0.0896, "step": 1413 }, { "epoch": 0.2230547777733959, "grad_norm": 6.531866550445557, "learning_rate": 7.929479954918692e-07, "loss": 0.0629, "step": 1414 }, { "epoch": 0.22321252514098672, "grad_norm": 8.073969841003418, "learning_rate": 7.927869908227338e-07, "loss": 0.0976, "step": 1415 }, { "epoch": 0.22337027250857752, "grad_norm": 4.7604851722717285, "learning_rate": 7.926259861535985e-07, "loss": 0.0232, "step": 1416 }, { "epoch": 0.22352801987616833, "grad_norm": 6.101916790008545, "learning_rate": 7.924649814844631e-07, "loss": 0.0616, "step": 1417 }, { "epoch": 0.22368576724375913, "grad_norm": 6.440509796142578, "learning_rate": 7.923039768153276e-07, "loss": 0.0754, "step": 1418 }, { "epoch": 0.22384351461134994, "grad_norm": 8.148072242736816, "learning_rate": 7.921429721461922e-07, "loss": 0.0685, "step": 1419 }, { "epoch": 0.2240012619789407, "grad_norm": 10.16176986694336, "learning_rate": 7.919819674770568e-07, "loss": 0.0519, "step": 1420 }, { "epoch": 0.22415900934653152, "grad_norm": 6.655317306518555, "learning_rate": 7.918209628079214e-07, "loss": 0.0949, "step": 1421 }, { "epoch": 0.22431675671412232, "grad_norm": 7.478221416473389, "learning_rate": 7.916599581387859e-07, "loss": 0.0734, "step": 1422 }, { "epoch": 0.22447450408171313, "grad_norm": 5.7581281661987305, "learning_rate": 7.914989534696505e-07, "loss": 0.0766, "step": 1423 }, { "epoch": 0.22463225144930393, "grad_norm": 10.845542907714844, "learning_rate": 7.913379488005152e-07, "loss": 0.0732, "step": 1424 }, { "epoch": 0.22478999881689474, "grad_norm": 8.282747268676758, "learning_rate": 7.911769441313798e-07, "loss": 0.0962, "step": 1425 }, { "epoch": 0.22494774618448554, "grad_norm": 5.4240193367004395, "learning_rate": 7.910159394622443e-07, "loss": 0.0231, "step": 1426 }, { "epoch": 0.22510549355207635, "grad_norm": 7.236989498138428, "learning_rate": 7.908549347931089e-07, "loss": 0.0724, "step": 1427 }, { "epoch": 0.22526324091966715, "grad_norm": 11.251788139343262, "learning_rate": 7.906939301239736e-07, "loss": 0.0582, "step": 1428 }, { "epoch": 0.22542098828725796, "grad_norm": 5.494878768920898, "learning_rate": 7.905329254548382e-07, "loss": 0.0589, "step": 1429 }, { "epoch": 0.22557873565484876, "grad_norm": 8.22620677947998, "learning_rate": 7.903719207857027e-07, "loss": 0.0915, "step": 1430 }, { "epoch": 0.22573648302243957, "grad_norm": 9.50594711303711, "learning_rate": 7.902109161165673e-07, "loss": 0.0721, "step": 1431 }, { "epoch": 0.22589423039003037, "grad_norm": 7.40491247177124, "learning_rate": 7.90049911447432e-07, "loss": 0.0521, "step": 1432 }, { "epoch": 0.22605197775762118, "grad_norm": 13.662528991699219, "learning_rate": 7.898889067782966e-07, "loss": 0.117, "step": 1433 }, { "epoch": 0.22620972512521198, "grad_norm": 7.314889430999756, "learning_rate": 7.897279021091612e-07, "loss": 0.0799, "step": 1434 }, { "epoch": 0.22636747249280278, "grad_norm": 6.094870567321777, "learning_rate": 7.895668974400257e-07, "loss": 0.0715, "step": 1435 }, { "epoch": 0.2265252198603936, "grad_norm": 2.415949583053589, "learning_rate": 7.894058927708903e-07, "loss": 0.0255, "step": 1436 }, { "epoch": 0.2266829672279844, "grad_norm": 3.3921942710876465, "learning_rate": 7.892448881017549e-07, "loss": 0.0576, "step": 1437 }, { "epoch": 0.2268407145955752, "grad_norm": 6.06171178817749, "learning_rate": 7.890838834326195e-07, "loss": 0.0725, "step": 1438 }, { "epoch": 0.22699846196316598, "grad_norm": 6.461246490478516, "learning_rate": 7.889228787634841e-07, "loss": 0.0768, "step": 1439 }, { "epoch": 0.22715620933075678, "grad_norm": 5.678943157196045, "learning_rate": 7.887618740943487e-07, "loss": 0.0812, "step": 1440 }, { "epoch": 0.22731395669834759, "grad_norm": 4.532436370849609, "learning_rate": 7.886008694252133e-07, "loss": 0.0417, "step": 1441 }, { "epoch": 0.2274717040659384, "grad_norm": 7.049404144287109, "learning_rate": 7.884398647560779e-07, "loss": 0.1101, "step": 1442 }, { "epoch": 0.2276294514335292, "grad_norm": 5.901423931121826, "learning_rate": 7.882788600869424e-07, "loss": 0.1286, "step": 1443 }, { "epoch": 0.22778719880112, "grad_norm": 7.790620803833008, "learning_rate": 7.88117855417807e-07, "loss": 0.0405, "step": 1444 }, { "epoch": 0.2279449461687108, "grad_norm": 5.334627628326416, "learning_rate": 7.879568507486716e-07, "loss": 0.0479, "step": 1445 }, { "epoch": 0.2281026935363016, "grad_norm": 4.426734924316406, "learning_rate": 7.877958460795364e-07, "loss": 0.0293, "step": 1446 }, { "epoch": 0.22826044090389241, "grad_norm": 4.468956470489502, "learning_rate": 7.876348414104009e-07, "loss": 0.0251, "step": 1447 }, { "epoch": 0.22841818827148322, "grad_norm": 5.049088954925537, "learning_rate": 7.874738367412655e-07, "loss": 0.0462, "step": 1448 }, { "epoch": 0.22857593563907402, "grad_norm": 8.321356773376465, "learning_rate": 7.873128320721301e-07, "loss": 0.0737, "step": 1449 }, { "epoch": 0.22873368300666483, "grad_norm": 7.106385231018066, "learning_rate": 7.871518274029947e-07, "loss": 0.0603, "step": 1450 }, { "epoch": 0.22889143037425563, "grad_norm": 5.824122428894043, "learning_rate": 7.869908227338592e-07, "loss": 0.0539, "step": 1451 }, { "epoch": 0.22904917774184644, "grad_norm": 6.70082950592041, "learning_rate": 7.868298180647238e-07, "loss": 0.1212, "step": 1452 }, { "epoch": 0.22920692510943724, "grad_norm": 4.612785339355469, "learning_rate": 7.866688133955884e-07, "loss": 0.0456, "step": 1453 }, { "epoch": 0.22936467247702805, "grad_norm": 9.111032485961914, "learning_rate": 7.865078087264531e-07, "loss": 0.0929, "step": 1454 }, { "epoch": 0.22952241984461885, "grad_norm": 5.721334457397461, "learning_rate": 7.863468040573177e-07, "loss": 0.1015, "step": 1455 }, { "epoch": 0.22968016721220966, "grad_norm": 4.232248783111572, "learning_rate": 7.861857993881822e-07, "loss": 0.0533, "step": 1456 }, { "epoch": 0.22983791457980046, "grad_norm": 8.858681678771973, "learning_rate": 7.860247947190468e-07, "loss": 0.1098, "step": 1457 }, { "epoch": 0.22999566194739127, "grad_norm": 4.750298023223877, "learning_rate": 7.858637900499114e-07, "loss": 0.0663, "step": 1458 }, { "epoch": 0.23015340931498204, "grad_norm": 5.576642036437988, "learning_rate": 7.85702785380776e-07, "loss": 0.1292, "step": 1459 }, { "epoch": 0.23031115668257285, "grad_norm": 4.532076835632324, "learning_rate": 7.855417807116405e-07, "loss": 0.0765, "step": 1460 }, { "epoch": 0.23046890405016365, "grad_norm": 4.859536647796631, "learning_rate": 7.853807760425051e-07, "loss": 0.0325, "step": 1461 }, { "epoch": 0.23062665141775446, "grad_norm": 3.2009525299072266, "learning_rate": 7.852197713733698e-07, "loss": 0.0568, "step": 1462 }, { "epoch": 0.23078439878534526, "grad_norm": 8.067152976989746, "learning_rate": 7.850587667042345e-07, "loss": 0.0931, "step": 1463 }, { "epoch": 0.23094214615293607, "grad_norm": 6.133688449859619, "learning_rate": 7.84897762035099e-07, "loss": 0.0596, "step": 1464 }, { "epoch": 0.23109989352052687, "grad_norm": 5.80610466003418, "learning_rate": 7.847367573659636e-07, "loss": 0.0347, "step": 1465 }, { "epoch": 0.23125764088811768, "grad_norm": 8.714088439941406, "learning_rate": 7.845757526968282e-07, "loss": 0.1071, "step": 1466 }, { "epoch": 0.23141538825570848, "grad_norm": 6.408808708190918, "learning_rate": 7.844147480276928e-07, "loss": 0.0635, "step": 1467 }, { "epoch": 0.2315731356232993, "grad_norm": 5.011511325836182, "learning_rate": 7.842537433585573e-07, "loss": 0.0445, "step": 1468 }, { "epoch": 0.2317308829908901, "grad_norm": 7.067837715148926, "learning_rate": 7.84092738689422e-07, "loss": 0.0737, "step": 1469 }, { "epoch": 0.2318886303584809, "grad_norm": 4.492575645446777, "learning_rate": 7.839317340202866e-07, "loss": 0.0341, "step": 1470 }, { "epoch": 0.2320463777260717, "grad_norm": 2.238722085952759, "learning_rate": 7.837707293511512e-07, "loss": 0.0211, "step": 1471 }, { "epoch": 0.2322041250936625, "grad_norm": 1.7708102464675903, "learning_rate": 7.836097246820157e-07, "loss": 0.0257, "step": 1472 }, { "epoch": 0.2323618724612533, "grad_norm": 7.906129360198975, "learning_rate": 7.834487200128803e-07, "loss": 0.0535, "step": 1473 }, { "epoch": 0.23251961982884412, "grad_norm": 5.424860954284668, "learning_rate": 7.832877153437449e-07, "loss": 0.0592, "step": 1474 }, { "epoch": 0.23267736719643492, "grad_norm": 6.904181003570557, "learning_rate": 7.831267106746095e-07, "loss": 0.0919, "step": 1475 }, { "epoch": 0.23283511456402572, "grad_norm": 2.7707386016845703, "learning_rate": 7.82965706005474e-07, "loss": 0.0337, "step": 1476 }, { "epoch": 0.23299286193161653, "grad_norm": 5.375368595123291, "learning_rate": 7.828047013363387e-07, "loss": 0.0896, "step": 1477 }, { "epoch": 0.2331506092992073, "grad_norm": 6.6298017501831055, "learning_rate": 7.826436966672033e-07, "loss": 0.076, "step": 1478 }, { "epoch": 0.2333083566667981, "grad_norm": 3.2984378337860107, "learning_rate": 7.824826919980679e-07, "loss": 0.0453, "step": 1479 }, { "epoch": 0.23346610403438892, "grad_norm": 3.9714133739471436, "learning_rate": 7.823216873289326e-07, "loss": 0.0458, "step": 1480 }, { "epoch": 0.23362385140197972, "grad_norm": 4.95203161239624, "learning_rate": 7.82160682659797e-07, "loss": 0.0577, "step": 1481 }, { "epoch": 0.23378159876957053, "grad_norm": 6.471922874450684, "learning_rate": 7.819996779906617e-07, "loss": 0.0984, "step": 1482 }, { "epoch": 0.23393934613716133, "grad_norm": 10.565709114074707, "learning_rate": 7.818386733215263e-07, "loss": 0.0924, "step": 1483 }, { "epoch": 0.23409709350475214, "grad_norm": 5.166487216949463, "learning_rate": 7.81677668652391e-07, "loss": 0.0671, "step": 1484 }, { "epoch": 0.23425484087234294, "grad_norm": 4.182351112365723, "learning_rate": 7.815166639832555e-07, "loss": 0.0479, "step": 1485 }, { "epoch": 0.23441258823993374, "grad_norm": 6.305501937866211, "learning_rate": 7.813556593141201e-07, "loss": 0.044, "step": 1486 }, { "epoch": 0.23457033560752455, "grad_norm": 11.590909004211426, "learning_rate": 7.811946546449847e-07, "loss": 0.0694, "step": 1487 }, { "epoch": 0.23472808297511535, "grad_norm": 5.491660118103027, "learning_rate": 7.810336499758493e-07, "loss": 0.0617, "step": 1488 }, { "epoch": 0.23488583034270616, "grad_norm": 7.87093448638916, "learning_rate": 7.808726453067138e-07, "loss": 0.0719, "step": 1489 }, { "epoch": 0.23504357771029696, "grad_norm": 4.106072902679443, "learning_rate": 7.807116406375784e-07, "loss": 0.0308, "step": 1490 }, { "epoch": 0.23520132507788777, "grad_norm": 5.372685432434082, "learning_rate": 7.80550635968443e-07, "loss": 0.0881, "step": 1491 }, { "epoch": 0.23535907244547857, "grad_norm": 4.103046894073486, "learning_rate": 7.803896312993077e-07, "loss": 0.0203, "step": 1492 }, { "epoch": 0.23551681981306938, "grad_norm": 5.021068096160889, "learning_rate": 7.802286266301722e-07, "loss": 0.0348, "step": 1493 }, { "epoch": 0.23567456718066018, "grad_norm": 5.834413528442383, "learning_rate": 7.800676219610368e-07, "loss": 0.0302, "step": 1494 }, { "epoch": 0.235832314548251, "grad_norm": 7.424976348876953, "learning_rate": 7.799066172919014e-07, "loss": 0.0567, "step": 1495 }, { "epoch": 0.2359900619158418, "grad_norm": 2.5958831310272217, "learning_rate": 7.79745612622766e-07, "loss": 0.0213, "step": 1496 }, { "epoch": 0.23614780928343257, "grad_norm": 5.730758190155029, "learning_rate": 7.795846079536305e-07, "loss": 0.0216, "step": 1497 }, { "epoch": 0.23630555665102337, "grad_norm": 5.889350891113281, "learning_rate": 7.794236032844951e-07, "loss": 0.0231, "step": 1498 }, { "epoch": 0.23646330401861418, "grad_norm": 3.6521618366241455, "learning_rate": 7.792625986153599e-07, "loss": 0.0447, "step": 1499 }, { "epoch": 0.23662105138620498, "grad_norm": 5.1871161460876465, "learning_rate": 7.791015939462245e-07, "loss": 0.0608, "step": 1500 }, { "epoch": 0.2367787987537958, "grad_norm": 7.352590084075928, "learning_rate": 7.789405892770891e-07, "loss": 0.0774, "step": 1501 }, { "epoch": 0.2369365461213866, "grad_norm": 8.463130950927734, "learning_rate": 7.787795846079536e-07, "loss": 0.0804, "step": 1502 }, { "epoch": 0.2370942934889774, "grad_norm": 4.437898635864258, "learning_rate": 7.786185799388182e-07, "loss": 0.047, "step": 1503 }, { "epoch": 0.2372520408565682, "grad_norm": 4.352744102478027, "learning_rate": 7.784575752696828e-07, "loss": 0.0437, "step": 1504 }, { "epoch": 0.237409788224159, "grad_norm": 4.607551097869873, "learning_rate": 7.782965706005474e-07, "loss": 0.0632, "step": 1505 }, { "epoch": 0.2375675355917498, "grad_norm": 4.5728325843811035, "learning_rate": 7.781355659314119e-07, "loss": 0.048, "step": 1506 }, { "epoch": 0.23772528295934062, "grad_norm": 4.126950740814209, "learning_rate": 7.779745612622766e-07, "loss": 0.0274, "step": 1507 }, { "epoch": 0.23788303032693142, "grad_norm": 7.351589679718018, "learning_rate": 7.778135565931412e-07, "loss": 0.0453, "step": 1508 }, { "epoch": 0.23804077769452223, "grad_norm": 4.392864227294922, "learning_rate": 7.776525519240058e-07, "loss": 0.098, "step": 1509 }, { "epoch": 0.23819852506211303, "grad_norm": 3.9069530963897705, "learning_rate": 7.774915472548703e-07, "loss": 0.0409, "step": 1510 }, { "epoch": 0.23835627242970384, "grad_norm": 6.062881946563721, "learning_rate": 7.773305425857349e-07, "loss": 0.0518, "step": 1511 }, { "epoch": 0.23851401979729464, "grad_norm": 6.978428840637207, "learning_rate": 7.771695379165995e-07, "loss": 0.1017, "step": 1512 }, { "epoch": 0.23867176716488545, "grad_norm": 5.4202656745910645, "learning_rate": 7.770085332474641e-07, "loss": 0.043, "step": 1513 }, { "epoch": 0.23882951453247625, "grad_norm": 2.6532363891601562, "learning_rate": 7.768475285783287e-07, "loss": 0.0538, "step": 1514 }, { "epoch": 0.23898726190006706, "grad_norm": 4.850531101226807, "learning_rate": 7.766865239091933e-07, "loss": 0.0668, "step": 1515 }, { "epoch": 0.23914500926765786, "grad_norm": 5.29673957824707, "learning_rate": 7.76525519240058e-07, "loss": 0.0392, "step": 1516 }, { "epoch": 0.23930275663524864, "grad_norm": 2.9764435291290283, "learning_rate": 7.763645145709226e-07, "loss": 0.0531, "step": 1517 }, { "epoch": 0.23946050400283944, "grad_norm": 13.339690208435059, "learning_rate": 7.762035099017871e-07, "loss": 0.0657, "step": 1518 }, { "epoch": 0.23961825137043025, "grad_norm": 2.644252300262451, "learning_rate": 7.760425052326517e-07, "loss": 0.0242, "step": 1519 }, { "epoch": 0.23977599873802105, "grad_norm": 4.701986312866211, "learning_rate": 7.758815005635163e-07, "loss": 0.0519, "step": 1520 }, { "epoch": 0.23993374610561186, "grad_norm": 3.8381757736206055, "learning_rate": 7.757204958943809e-07, "loss": 0.0239, "step": 1521 }, { "epoch": 0.24009149347320266, "grad_norm": 4.815766334533691, "learning_rate": 7.755594912252455e-07, "loss": 0.0388, "step": 1522 }, { "epoch": 0.24024924084079347, "grad_norm": 9.112265586853027, "learning_rate": 7.753984865561101e-07, "loss": 0.0802, "step": 1523 }, { "epoch": 0.24040698820838427, "grad_norm": 3.9090425968170166, "learning_rate": 7.752374818869747e-07, "loss": 0.0289, "step": 1524 }, { "epoch": 0.24056473557597507, "grad_norm": 5.512587070465088, "learning_rate": 7.750764772178393e-07, "loss": 0.0351, "step": 1525 }, { "epoch": 0.24072248294356588, "grad_norm": 10.256319999694824, "learning_rate": 7.749154725487039e-07, "loss": 0.0861, "step": 1526 }, { "epoch": 0.24088023031115668, "grad_norm": 6.231867790222168, "learning_rate": 7.747544678795684e-07, "loss": 0.0685, "step": 1527 }, { "epoch": 0.2410379776787475, "grad_norm": 6.662831783294678, "learning_rate": 7.74593463210433e-07, "loss": 0.0743, "step": 1528 }, { "epoch": 0.2411957250463383, "grad_norm": 4.1656646728515625, "learning_rate": 7.744324585412977e-07, "loss": 0.0482, "step": 1529 }, { "epoch": 0.2413534724139291, "grad_norm": 4.898168087005615, "learning_rate": 7.742714538721623e-07, "loss": 0.0809, "step": 1530 }, { "epoch": 0.2415112197815199, "grad_norm": 4.342949390411377, "learning_rate": 7.741104492030268e-07, "loss": 0.0342, "step": 1531 }, { "epoch": 0.2416689671491107, "grad_norm": 8.505386352539062, "learning_rate": 7.739494445338914e-07, "loss": 0.1033, "step": 1532 }, { "epoch": 0.2418267145167015, "grad_norm": 3.9101195335388184, "learning_rate": 7.73788439864756e-07, "loss": 0.0373, "step": 1533 }, { "epoch": 0.24198446188429232, "grad_norm": 4.149836540222168, "learning_rate": 7.736274351956207e-07, "loss": 0.0493, "step": 1534 }, { "epoch": 0.24214220925188312, "grad_norm": 3.7258353233337402, "learning_rate": 7.734664305264852e-07, "loss": 0.0283, "step": 1535 }, { "epoch": 0.2422999566194739, "grad_norm": 15.704261779785156, "learning_rate": 7.733054258573498e-07, "loss": 0.0666, "step": 1536 }, { "epoch": 0.2422999566194739, "eval_accuracy": 0.9838110965960908, "eval_f1": 0.9838110965960908, "eval_loss": 0.05488148331642151, "eval_runtime": 4723.7889, "eval_samples_per_second": 42.943, "eval_steps_per_second": 2.684, "step": 1536 }, { "epoch": 0.2424577039870647, "grad_norm": 5.802548408508301, "learning_rate": 7.731444211882145e-07, "loss": 0.0763, "step": 1537 }, { "epoch": 0.2426154513546555, "grad_norm": 8.643044471740723, "learning_rate": 7.729834165190791e-07, "loss": 0.0463, "step": 1538 }, { "epoch": 0.24277319872224631, "grad_norm": 8.600137710571289, "learning_rate": 7.728224118499436e-07, "loss": 0.0757, "step": 1539 }, { "epoch": 0.24293094608983712, "grad_norm": 6.230480670928955, "learning_rate": 7.726614071808082e-07, "loss": 0.0523, "step": 1540 }, { "epoch": 0.24308869345742792, "grad_norm": 2.5582406520843506, "learning_rate": 7.725004025116728e-07, "loss": 0.0326, "step": 1541 }, { "epoch": 0.24324644082501873, "grad_norm": 10.910799980163574, "learning_rate": 7.723393978425374e-07, "loss": 0.0665, "step": 1542 }, { "epoch": 0.24340418819260953, "grad_norm": 6.541328430175781, "learning_rate": 7.721783931734019e-07, "loss": 0.09, "step": 1543 }, { "epoch": 0.24356193556020034, "grad_norm": 4.641464710235596, "learning_rate": 7.720173885042666e-07, "loss": 0.0548, "step": 1544 }, { "epoch": 0.24371968292779114, "grad_norm": 4.99911642074585, "learning_rate": 7.718563838351312e-07, "loss": 0.0283, "step": 1545 }, { "epoch": 0.24387743029538195, "grad_norm": 6.955615520477295, "learning_rate": 7.716953791659958e-07, "loss": 0.1133, "step": 1546 }, { "epoch": 0.24403517766297275, "grad_norm": 5.527281284332275, "learning_rate": 7.715343744968604e-07, "loss": 0.0799, "step": 1547 }, { "epoch": 0.24419292503056356, "grad_norm": 5.46390962600708, "learning_rate": 7.713733698277249e-07, "loss": 0.0429, "step": 1548 }, { "epoch": 0.24435067239815436, "grad_norm": 3.071164608001709, "learning_rate": 7.712123651585895e-07, "loss": 0.0365, "step": 1549 }, { "epoch": 0.24450841976574517, "grad_norm": 5.630229949951172, "learning_rate": 7.710513604894541e-07, "loss": 0.0669, "step": 1550 }, { "epoch": 0.24466616713333597, "grad_norm": 6.220067501068115, "learning_rate": 7.708903558203188e-07, "loss": 0.0709, "step": 1551 }, { "epoch": 0.24482391450092678, "grad_norm": 7.007501125335693, "learning_rate": 7.707293511511834e-07, "loss": 0.0735, "step": 1552 }, { "epoch": 0.24498166186851758, "grad_norm": 6.205754280090332, "learning_rate": 7.70568346482048e-07, "loss": 0.1477, "step": 1553 }, { "epoch": 0.24513940923610839, "grad_norm": 5.855749130249023, "learning_rate": 7.704073418129126e-07, "loss": 0.0297, "step": 1554 }, { "epoch": 0.24529715660369916, "grad_norm": 4.720376014709473, "learning_rate": 7.702463371437772e-07, "loss": 0.0562, "step": 1555 }, { "epoch": 0.24545490397128997, "grad_norm": 4.498953342437744, "learning_rate": 7.700853324746417e-07, "loss": 0.09, "step": 1556 }, { "epoch": 0.24561265133888077, "grad_norm": 3.5153050422668457, "learning_rate": 7.699243278055063e-07, "loss": 0.0588, "step": 1557 }, { "epoch": 0.24577039870647158, "grad_norm": 5.03090238571167, "learning_rate": 7.697633231363709e-07, "loss": 0.0915, "step": 1558 }, { "epoch": 0.24592814607406238, "grad_norm": 5.303913116455078, "learning_rate": 7.696023184672356e-07, "loss": 0.0934, "step": 1559 }, { "epoch": 0.2460858934416532, "grad_norm": 6.614214897155762, "learning_rate": 7.694413137981001e-07, "loss": 0.0796, "step": 1560 }, { "epoch": 0.246243640809244, "grad_norm": 5.710455417633057, "learning_rate": 7.692803091289647e-07, "loss": 0.0503, "step": 1561 }, { "epoch": 0.2464013881768348, "grad_norm": 3.4222452640533447, "learning_rate": 7.691193044598293e-07, "loss": 0.024, "step": 1562 }, { "epoch": 0.2465591355444256, "grad_norm": 3.0048317909240723, "learning_rate": 7.689582997906939e-07, "loss": 0.0356, "step": 1563 }, { "epoch": 0.2467168829120164, "grad_norm": 5.317868232727051, "learning_rate": 7.687972951215584e-07, "loss": 0.0575, "step": 1564 }, { "epoch": 0.2468746302796072, "grad_norm": 6.72121524810791, "learning_rate": 7.68636290452423e-07, "loss": 0.0526, "step": 1565 }, { "epoch": 0.24703237764719801, "grad_norm": 2.8865790367126465, "learning_rate": 7.684752857832876e-07, "loss": 0.0194, "step": 1566 }, { "epoch": 0.24719012501478882, "grad_norm": 6.781136989593506, "learning_rate": 7.683142811141523e-07, "loss": 0.0424, "step": 1567 }, { "epoch": 0.24734787238237962, "grad_norm": 5.859102725982666, "learning_rate": 7.681532764450168e-07, "loss": 0.0393, "step": 1568 }, { "epoch": 0.24750561974997043, "grad_norm": 4.861766338348389, "learning_rate": 7.679922717758815e-07, "loss": 0.0692, "step": 1569 }, { "epoch": 0.24766336711756123, "grad_norm": 5.112473011016846, "learning_rate": 7.678312671067461e-07, "loss": 0.046, "step": 1570 }, { "epoch": 0.24782111448515204, "grad_norm": 4.80613899230957, "learning_rate": 7.676702624376107e-07, "loss": 0.0207, "step": 1571 }, { "epoch": 0.24797886185274284, "grad_norm": 4.593803882598877, "learning_rate": 7.675092577684753e-07, "loss": 0.025, "step": 1572 }, { "epoch": 0.24813660922033365, "grad_norm": 4.530654430389404, "learning_rate": 7.673482530993398e-07, "loss": 0.0583, "step": 1573 }, { "epoch": 0.24829435658792443, "grad_norm": 5.102366924285889, "learning_rate": 7.671872484302045e-07, "loss": 0.0383, "step": 1574 }, { "epoch": 0.24845210395551523, "grad_norm": 7.638766288757324, "learning_rate": 7.670262437610691e-07, "loss": 0.0549, "step": 1575 }, { "epoch": 0.24860985132310603, "grad_norm": 9.579071998596191, "learning_rate": 7.668652390919337e-07, "loss": 0.0531, "step": 1576 }, { "epoch": 0.24876759869069684, "grad_norm": 8.592358589172363, "learning_rate": 7.667042344227982e-07, "loss": 0.082, "step": 1577 }, { "epoch": 0.24892534605828764, "grad_norm": 5.915648937225342, "learning_rate": 7.665432297536628e-07, "loss": 0.0534, "step": 1578 }, { "epoch": 0.24908309342587845, "grad_norm": 7.780115127563477, "learning_rate": 7.663822250845274e-07, "loss": 0.0693, "step": 1579 }, { "epoch": 0.24924084079346925, "grad_norm": 2.9398467540740967, "learning_rate": 7.66221220415392e-07, "loss": 0.0384, "step": 1580 }, { "epoch": 0.24939858816106006, "grad_norm": 6.325009346008301, "learning_rate": 7.660602157462565e-07, "loss": 0.034, "step": 1581 }, { "epoch": 0.24955633552865086, "grad_norm": 4.324703216552734, "learning_rate": 7.658992110771212e-07, "loss": 0.0769, "step": 1582 }, { "epoch": 0.24971408289624167, "grad_norm": 6.651468276977539, "learning_rate": 7.657382064079858e-07, "loss": 0.0459, "step": 1583 }, { "epoch": 0.24987183026383247, "grad_norm": 4.541459083557129, "learning_rate": 7.655772017388504e-07, "loss": 0.0362, "step": 1584 }, { "epoch": 0.25002957763142325, "grad_norm": 6.376233100891113, "learning_rate": 7.654161970697149e-07, "loss": 0.0735, "step": 1585 }, { "epoch": 0.2501873249990141, "grad_norm": 2.1708052158355713, "learning_rate": 7.652551924005796e-07, "loss": 0.0235, "step": 1586 }, { "epoch": 0.25034507236660486, "grad_norm": 2.9557206630706787, "learning_rate": 7.650941877314442e-07, "loss": 0.027, "step": 1587 }, { "epoch": 0.2505028197341957, "grad_norm": 5.037871360778809, "learning_rate": 7.649331830623088e-07, "loss": 0.0485, "step": 1588 }, { "epoch": 0.25066056710178647, "grad_norm": 4.366393566131592, "learning_rate": 7.647721783931734e-07, "loss": 0.0365, "step": 1589 }, { "epoch": 0.2508183144693773, "grad_norm": 2.030135154724121, "learning_rate": 7.64611173724038e-07, "loss": 0.0278, "step": 1590 }, { "epoch": 0.2509760618369681, "grad_norm": 8.954212188720703, "learning_rate": 7.644501690549026e-07, "loss": 0.1446, "step": 1591 }, { "epoch": 0.2511338092045589, "grad_norm": 3.5425732135772705, "learning_rate": 7.642891643857672e-07, "loss": 0.0451, "step": 1592 }, { "epoch": 0.2512915565721497, "grad_norm": 3.917052745819092, "learning_rate": 7.641281597166317e-07, "loss": 0.0396, "step": 1593 }, { "epoch": 0.2514493039397405, "grad_norm": 5.786766529083252, "learning_rate": 7.639671550474963e-07, "loss": 0.0824, "step": 1594 }, { "epoch": 0.2516070513073313, "grad_norm": 4.057224750518799, "learning_rate": 7.638061503783609e-07, "loss": 0.0378, "step": 1595 }, { "epoch": 0.25176479867492213, "grad_norm": 5.296603679656982, "learning_rate": 7.636451457092255e-07, "loss": 0.0246, "step": 1596 }, { "epoch": 0.2519225460425129, "grad_norm": 6.155651569366455, "learning_rate": 7.634841410400902e-07, "loss": 0.0818, "step": 1597 }, { "epoch": 0.25208029341010374, "grad_norm": 4.915137767791748, "learning_rate": 7.633231363709547e-07, "loss": 0.0715, "step": 1598 }, { "epoch": 0.2522380407776945, "grad_norm": 5.413578510284424, "learning_rate": 7.631621317018193e-07, "loss": 0.0527, "step": 1599 }, { "epoch": 0.25239578814528535, "grad_norm": 4.126835346221924, "learning_rate": 7.630011270326839e-07, "loss": 0.0705, "step": 1600 }, { "epoch": 0.2525535355128761, "grad_norm": 6.050610542297363, "learning_rate": 7.628401223635485e-07, "loss": 0.081, "step": 1601 }, { "epoch": 0.25271128288046696, "grad_norm": 4.388615608215332, "learning_rate": 7.62679117694413e-07, "loss": 0.0301, "step": 1602 }, { "epoch": 0.25286903024805774, "grad_norm": 11.63635540008545, "learning_rate": 7.625181130252776e-07, "loss": 0.0639, "step": 1603 }, { "epoch": 0.2530267776156485, "grad_norm": 3.628242254257202, "learning_rate": 7.623571083561424e-07, "loss": 0.0335, "step": 1604 }, { "epoch": 0.25318452498323935, "grad_norm": 10.505038261413574, "learning_rate": 7.62196103687007e-07, "loss": 0.0713, "step": 1605 }, { "epoch": 0.2533422723508301, "grad_norm": 12.409416198730469, "learning_rate": 7.620350990178715e-07, "loss": 0.1076, "step": 1606 }, { "epoch": 0.25350001971842095, "grad_norm": 4.488946914672852, "learning_rate": 7.618740943487361e-07, "loss": 0.0381, "step": 1607 }, { "epoch": 0.25365776708601173, "grad_norm": 10.696093559265137, "learning_rate": 7.617130896796007e-07, "loss": 0.0654, "step": 1608 }, { "epoch": 0.25381551445360256, "grad_norm": 5.161859035491943, "learning_rate": 7.615520850104653e-07, "loss": 0.0353, "step": 1609 }, { "epoch": 0.25397326182119334, "grad_norm": 5.989293098449707, "learning_rate": 7.613910803413298e-07, "loss": 0.059, "step": 1610 }, { "epoch": 0.2541310091887842, "grad_norm": 5.287972927093506, "learning_rate": 7.612300756721944e-07, "loss": 0.0252, "step": 1611 }, { "epoch": 0.25428875655637495, "grad_norm": 3.6004478931427, "learning_rate": 7.610690710030591e-07, "loss": 0.0318, "step": 1612 }, { "epoch": 0.2544465039239658, "grad_norm": 6.214033603668213, "learning_rate": 7.609080663339237e-07, "loss": 0.0651, "step": 1613 }, { "epoch": 0.25460425129155656, "grad_norm": 5.563488006591797, "learning_rate": 7.607470616647882e-07, "loss": 0.0827, "step": 1614 }, { "epoch": 0.2547619986591474, "grad_norm": 7.151546001434326, "learning_rate": 7.605860569956528e-07, "loss": 0.0592, "step": 1615 }, { "epoch": 0.25491974602673817, "grad_norm": 6.595117092132568, "learning_rate": 7.604250523265174e-07, "loss": 0.0646, "step": 1616 }, { "epoch": 0.255077493394329, "grad_norm": 5.908832550048828, "learning_rate": 7.60264047657382e-07, "loss": 0.0643, "step": 1617 }, { "epoch": 0.2552352407619198, "grad_norm": 6.553600788116455, "learning_rate": 7.601030429882466e-07, "loss": 0.039, "step": 1618 }, { "epoch": 0.2553929881295106, "grad_norm": 2.677720069885254, "learning_rate": 7.599420383191112e-07, "loss": 0.0396, "step": 1619 }, { "epoch": 0.2555507354971014, "grad_norm": 6.571342468261719, "learning_rate": 7.597810336499758e-07, "loss": 0.1106, "step": 1620 }, { "epoch": 0.2557084828646922, "grad_norm": 11.148405075073242, "learning_rate": 7.596200289808405e-07, "loss": 0.0835, "step": 1621 }, { "epoch": 0.255866230232283, "grad_norm": 4.706027984619141, "learning_rate": 7.594590243117051e-07, "loss": 0.036, "step": 1622 }, { "epoch": 0.2560239775998738, "grad_norm": 4.21353816986084, "learning_rate": 7.592980196425696e-07, "loss": 0.0497, "step": 1623 }, { "epoch": 0.2561817249674646, "grad_norm": 8.491364479064941, "learning_rate": 7.591370149734342e-07, "loss": 0.0477, "step": 1624 }, { "epoch": 0.2563394723350554, "grad_norm": 4.532919406890869, "learning_rate": 7.589760103042988e-07, "loss": 0.0794, "step": 1625 }, { "epoch": 0.2564972197026462, "grad_norm": 4.545327663421631, "learning_rate": 7.588150056351634e-07, "loss": 0.0403, "step": 1626 }, { "epoch": 0.256654967070237, "grad_norm": 3.378826379776001, "learning_rate": 7.58654000966028e-07, "loss": 0.0306, "step": 1627 }, { "epoch": 0.2568127144378278, "grad_norm": 3.9472615718841553, "learning_rate": 7.584929962968926e-07, "loss": 0.0343, "step": 1628 }, { "epoch": 0.2569704618054186, "grad_norm": 5.402244567871094, "learning_rate": 7.583319916277572e-07, "loss": 0.0513, "step": 1629 }, { "epoch": 0.25712820917300944, "grad_norm": 5.572901725769043, "learning_rate": 7.581709869586218e-07, "loss": 0.0621, "step": 1630 }, { "epoch": 0.2572859565406002, "grad_norm": 4.3812713623046875, "learning_rate": 7.580099822894863e-07, "loss": 0.032, "step": 1631 }, { "epoch": 0.25744370390819105, "grad_norm": 7.747415065765381, "learning_rate": 7.578489776203509e-07, "loss": 0.0566, "step": 1632 }, { "epoch": 0.2576014512757818, "grad_norm": 1.9615048170089722, "learning_rate": 7.576879729512155e-07, "loss": 0.0216, "step": 1633 }, { "epoch": 0.25775919864337266, "grad_norm": 4.1552653312683105, "learning_rate": 7.575269682820802e-07, "loss": 0.0268, "step": 1634 }, { "epoch": 0.25791694601096343, "grad_norm": 5.072568893432617, "learning_rate": 7.573659636129447e-07, "loss": 0.0688, "step": 1635 }, { "epoch": 0.25807469337855427, "grad_norm": 2.2559962272644043, "learning_rate": 7.572049589438093e-07, "loss": 0.0331, "step": 1636 }, { "epoch": 0.25823244074614504, "grad_norm": 4.044931888580322, "learning_rate": 7.570439542746739e-07, "loss": 0.0391, "step": 1637 }, { "epoch": 0.2583901881137359, "grad_norm": 5.895998001098633, "learning_rate": 7.568829496055385e-07, "loss": 0.0722, "step": 1638 }, { "epoch": 0.25854793548132665, "grad_norm": 6.41295051574707, "learning_rate": 7.56721944936403e-07, "loss": 0.0673, "step": 1639 }, { "epoch": 0.2587056828489175, "grad_norm": 3.9313929080963135, "learning_rate": 7.565609402672677e-07, "loss": 0.0268, "step": 1640 }, { "epoch": 0.25886343021650826, "grad_norm": 6.33918571472168, "learning_rate": 7.563999355981323e-07, "loss": 0.0792, "step": 1641 }, { "epoch": 0.25902117758409904, "grad_norm": 5.593765735626221, "learning_rate": 7.56238930928997e-07, "loss": 0.0371, "step": 1642 }, { "epoch": 0.25917892495168987, "grad_norm": 4.316140174865723, "learning_rate": 7.560779262598616e-07, "loss": 0.0639, "step": 1643 }, { "epoch": 0.25933667231928065, "grad_norm": 5.701627731323242, "learning_rate": 7.559169215907261e-07, "loss": 0.0407, "step": 1644 }, { "epoch": 0.2594944196868715, "grad_norm": 4.160895347595215, "learning_rate": 7.557559169215907e-07, "loss": 0.0504, "step": 1645 }, { "epoch": 0.25965216705446226, "grad_norm": 3.8167333602905273, "learning_rate": 7.555949122524553e-07, "loss": 0.0584, "step": 1646 }, { "epoch": 0.2598099144220531, "grad_norm": 6.050884246826172, "learning_rate": 7.554339075833199e-07, "loss": 0.0618, "step": 1647 }, { "epoch": 0.25996766178964387, "grad_norm": 7.4451422691345215, "learning_rate": 7.552729029141844e-07, "loss": 0.0527, "step": 1648 }, { "epoch": 0.2601254091572347, "grad_norm": 2.439089059829712, "learning_rate": 7.551118982450491e-07, "loss": 0.0596, "step": 1649 }, { "epoch": 0.2602831565248255, "grad_norm": 2.6400222778320312, "learning_rate": 7.549508935759137e-07, "loss": 0.0165, "step": 1650 }, { "epoch": 0.2604409038924163, "grad_norm": 4.077780723571777, "learning_rate": 7.547898889067783e-07, "loss": 0.0783, "step": 1651 }, { "epoch": 0.2605986512600071, "grad_norm": 5.45765495300293, "learning_rate": 7.546288842376428e-07, "loss": 0.0693, "step": 1652 }, { "epoch": 0.2607563986275979, "grad_norm": 5.8802289962768555, "learning_rate": 7.544678795685074e-07, "loss": 0.0947, "step": 1653 }, { "epoch": 0.2609141459951887, "grad_norm": 11.156391143798828, "learning_rate": 7.54306874899372e-07, "loss": 0.0526, "step": 1654 }, { "epoch": 0.26107189336277953, "grad_norm": 4.166376113891602, "learning_rate": 7.541458702302366e-07, "loss": 0.0389, "step": 1655 }, { "epoch": 0.2612296407303703, "grad_norm": 7.223203659057617, "learning_rate": 7.539848655611011e-07, "loss": 0.0954, "step": 1656 }, { "epoch": 0.26138738809796114, "grad_norm": 8.58761215209961, "learning_rate": 7.538238608919659e-07, "loss": 0.1029, "step": 1657 }, { "epoch": 0.2615451354655519, "grad_norm": 3.298081159591675, "learning_rate": 7.536628562228305e-07, "loss": 0.0364, "step": 1658 }, { "epoch": 0.26170288283314275, "grad_norm": 5.4777350425720215, "learning_rate": 7.535018515536951e-07, "loss": 0.0827, "step": 1659 }, { "epoch": 0.2618606302007335, "grad_norm": 5.848228931427002, "learning_rate": 7.533408468845596e-07, "loss": 0.0533, "step": 1660 }, { "epoch": 0.2620183775683243, "grad_norm": 7.866878986358643, "learning_rate": 7.531798422154242e-07, "loss": 0.0796, "step": 1661 }, { "epoch": 0.26217612493591513, "grad_norm": 2.7035603523254395, "learning_rate": 7.530188375462888e-07, "loss": 0.0213, "step": 1662 }, { "epoch": 0.2623338723035059, "grad_norm": 5.373941898345947, "learning_rate": 7.528578328771534e-07, "loss": 0.0618, "step": 1663 }, { "epoch": 0.26249161967109674, "grad_norm": 8.629209518432617, "learning_rate": 7.526968282080181e-07, "loss": 0.0901, "step": 1664 }, { "epoch": 0.2626493670386875, "grad_norm": 5.4822001457214355, "learning_rate": 7.525358235388826e-07, "loss": 0.0914, "step": 1665 }, { "epoch": 0.26280711440627835, "grad_norm": 4.280784606933594, "learning_rate": 7.523748188697472e-07, "loss": 0.0918, "step": 1666 }, { "epoch": 0.26296486177386913, "grad_norm": 3.0485968589782715, "learning_rate": 7.522138142006118e-07, "loss": 0.0525, "step": 1667 }, { "epoch": 0.26312260914145996, "grad_norm": 7.273502826690674, "learning_rate": 7.520528095314764e-07, "loss": 0.0763, "step": 1668 }, { "epoch": 0.26328035650905074, "grad_norm": 6.036385536193848, "learning_rate": 7.518918048623409e-07, "loss": 0.0641, "step": 1669 }, { "epoch": 0.26343810387664157, "grad_norm": 5.04290771484375, "learning_rate": 7.517308001932055e-07, "loss": 0.1048, "step": 1670 }, { "epoch": 0.26359585124423235, "grad_norm": 5.335455417633057, "learning_rate": 7.515697955240701e-07, "loss": 0.0769, "step": 1671 }, { "epoch": 0.2637535986118232, "grad_norm": 5.425839424133301, "learning_rate": 7.514087908549348e-07, "loss": 0.0333, "step": 1672 }, { "epoch": 0.26391134597941396, "grad_norm": 5.6087260246276855, "learning_rate": 7.512477861857993e-07, "loss": 0.061, "step": 1673 }, { "epoch": 0.2640690933470048, "grad_norm": 8.458930969238281, "learning_rate": 7.51086781516664e-07, "loss": 0.042, "step": 1674 }, { "epoch": 0.26422684071459557, "grad_norm": 9.07277774810791, "learning_rate": 7.509257768475286e-07, "loss": 0.0753, "step": 1675 }, { "epoch": 0.2643845880821864, "grad_norm": 5.768187522888184, "learning_rate": 7.507647721783932e-07, "loss": 0.026, "step": 1676 }, { "epoch": 0.2645423354497772, "grad_norm": 4.110996723175049, "learning_rate": 7.506037675092577e-07, "loss": 0.0463, "step": 1677 }, { "epoch": 0.264700082817368, "grad_norm": 2.877336025238037, "learning_rate": 7.504427628401223e-07, "loss": 0.0265, "step": 1678 }, { "epoch": 0.2648578301849588, "grad_norm": 7.911360740661621, "learning_rate": 7.50281758170987e-07, "loss": 0.0371, "step": 1679 }, { "epoch": 0.2650155775525496, "grad_norm": 6.589733600616455, "learning_rate": 7.501207535018516e-07, "loss": 0.0654, "step": 1680 }, { "epoch": 0.2651733249201404, "grad_norm": 5.051817893981934, "learning_rate": 7.499597488327161e-07, "loss": 0.0715, "step": 1681 }, { "epoch": 0.2653310722877312, "grad_norm": 5.483920574188232, "learning_rate": 7.497987441635807e-07, "loss": 0.0436, "step": 1682 }, { "epoch": 0.265488819655322, "grad_norm": 4.896912097930908, "learning_rate": 7.496377394944453e-07, "loss": 0.0575, "step": 1683 }, { "epoch": 0.2656465670229128, "grad_norm": 7.662925720214844, "learning_rate": 7.494767348253099e-07, "loss": 0.0479, "step": 1684 }, { "epoch": 0.2658043143905036, "grad_norm": 4.1565260887146, "learning_rate": 7.493157301561744e-07, "loss": 0.0353, "step": 1685 }, { "epoch": 0.2659620617580944, "grad_norm": 5.179498195648193, "learning_rate": 7.49154725487039e-07, "loss": 0.034, "step": 1686 }, { "epoch": 0.2661198091256852, "grad_norm": 5.550738334655762, "learning_rate": 7.489937208179037e-07, "loss": 0.0402, "step": 1687 }, { "epoch": 0.266277556493276, "grad_norm": 4.709192752838135, "learning_rate": 7.488327161487683e-07, "loss": 0.0757, "step": 1688 }, { "epoch": 0.26643530386086683, "grad_norm": 5.049219131469727, "learning_rate": 7.486717114796329e-07, "loss": 0.0789, "step": 1689 }, { "epoch": 0.2665930512284576, "grad_norm": 3.1105828285217285, "learning_rate": 7.485107068104974e-07, "loss": 0.0318, "step": 1690 }, { "epoch": 0.26675079859604844, "grad_norm": 4.199319362640381, "learning_rate": 7.48349702141362e-07, "loss": 0.0842, "step": 1691 }, { "epoch": 0.2669085459636392, "grad_norm": 4.327901840209961, "learning_rate": 7.481886974722267e-07, "loss": 0.045, "step": 1692 }, { "epoch": 0.26706629333123005, "grad_norm": 11.174415588378906, "learning_rate": 7.480276928030913e-07, "loss": 0.1128, "step": 1693 }, { "epoch": 0.26722404069882083, "grad_norm": 5.058723449707031, "learning_rate": 7.478666881339558e-07, "loss": 0.071, "step": 1694 }, { "epoch": 0.26738178806641166, "grad_norm": 4.530946731567383, "learning_rate": 7.477056834648205e-07, "loss": 0.0458, "step": 1695 }, { "epoch": 0.26753953543400244, "grad_norm": 8.258978843688965, "learning_rate": 7.475446787956851e-07, "loss": 0.0286, "step": 1696 }, { "epoch": 0.2676972828015933, "grad_norm": 2.9627537727355957, "learning_rate": 7.473836741265497e-07, "loss": 0.0275, "step": 1697 }, { "epoch": 0.26785503016918405, "grad_norm": 6.273197174072266, "learning_rate": 7.472226694574142e-07, "loss": 0.0506, "step": 1698 }, { "epoch": 0.2680127775367749, "grad_norm": 5.220770835876465, "learning_rate": 7.470616647882788e-07, "loss": 0.0442, "step": 1699 }, { "epoch": 0.26817052490436566, "grad_norm": 8.129053115844727, "learning_rate": 7.469006601191434e-07, "loss": 0.0536, "step": 1700 }, { "epoch": 0.26832827227195644, "grad_norm": 5.066961765289307, "learning_rate": 7.46739655450008e-07, "loss": 0.0518, "step": 1701 }, { "epoch": 0.26848601963954727, "grad_norm": 5.97332763671875, "learning_rate": 7.465786507808726e-07, "loss": 0.112, "step": 1702 }, { "epoch": 0.26864376700713805, "grad_norm": 4.293923854827881, "learning_rate": 7.464176461117372e-07, "loss": 0.0402, "step": 1703 }, { "epoch": 0.2688015143747289, "grad_norm": 4.722505569458008, "learning_rate": 7.462566414426018e-07, "loss": 0.0439, "step": 1704 }, { "epoch": 0.26895926174231966, "grad_norm": 4.975640296936035, "learning_rate": 7.460956367734664e-07, "loss": 0.0282, "step": 1705 }, { "epoch": 0.2691170091099105, "grad_norm": 19.83115005493164, "learning_rate": 7.459346321043309e-07, "loss": 0.0639, "step": 1706 }, { "epoch": 0.26927475647750126, "grad_norm": 7.357720851898193, "learning_rate": 7.457736274351955e-07, "loss": 0.0362, "step": 1707 }, { "epoch": 0.2694325038450921, "grad_norm": 10.837241172790527, "learning_rate": 7.456126227660601e-07, "loss": 0.0884, "step": 1708 }, { "epoch": 0.2695902512126829, "grad_norm": 3.5052623748779297, "learning_rate": 7.454516180969249e-07, "loss": 0.0322, "step": 1709 }, { "epoch": 0.2697479985802737, "grad_norm": 10.239457130432129, "learning_rate": 7.452906134277895e-07, "loss": 0.0449, "step": 1710 }, { "epoch": 0.2699057459478645, "grad_norm": 4.1607985496521, "learning_rate": 7.45129608758654e-07, "loss": 0.0814, "step": 1711 }, { "epoch": 0.2700634933154553, "grad_norm": 4.808945178985596, "learning_rate": 7.449686040895186e-07, "loss": 0.0529, "step": 1712 }, { "epoch": 0.2702212406830461, "grad_norm": 3.835298776626587, "learning_rate": 7.448075994203832e-07, "loss": 0.0599, "step": 1713 }, { "epoch": 0.2703789880506369, "grad_norm": 8.88935375213623, "learning_rate": 7.446465947512478e-07, "loss": 0.0413, "step": 1714 }, { "epoch": 0.2705367354182277, "grad_norm": 16.979846954345703, "learning_rate": 7.444855900821123e-07, "loss": 0.073, "step": 1715 }, { "epoch": 0.27069448278581854, "grad_norm": 4.93610954284668, "learning_rate": 7.443245854129769e-07, "loss": 0.1082, "step": 1716 }, { "epoch": 0.2708522301534093, "grad_norm": 5.720614910125732, "learning_rate": 7.441635807438416e-07, "loss": 0.063, "step": 1717 }, { "epoch": 0.27100997752100014, "grad_norm": 6.2367329597473145, "learning_rate": 7.440025760747062e-07, "loss": 0.0386, "step": 1718 }, { "epoch": 0.2711677248885909, "grad_norm": 6.605106830596924, "learning_rate": 7.438415714055707e-07, "loss": 0.0822, "step": 1719 }, { "epoch": 0.2713254722561817, "grad_norm": 11.338903427124023, "learning_rate": 7.436805667364353e-07, "loss": 0.0729, "step": 1720 }, { "epoch": 0.27148321962377253, "grad_norm": 8.502854347229004, "learning_rate": 7.435195620672999e-07, "loss": 0.0316, "step": 1721 }, { "epoch": 0.2716409669913633, "grad_norm": 5.40214729309082, "learning_rate": 7.433585573981645e-07, "loss": 0.0399, "step": 1722 }, { "epoch": 0.27179871435895414, "grad_norm": 5.6454548835754395, "learning_rate": 7.43197552729029e-07, "loss": 0.0589, "step": 1723 }, { "epoch": 0.2719564617265449, "grad_norm": 3.156519651412964, "learning_rate": 7.430365480598937e-07, "loss": 0.0531, "step": 1724 }, { "epoch": 0.27211420909413575, "grad_norm": 4.796392917633057, "learning_rate": 7.428755433907583e-07, "loss": 0.0282, "step": 1725 }, { "epoch": 0.2722719564617265, "grad_norm": 4.910034656524658, "learning_rate": 7.42714538721623e-07, "loss": 0.0437, "step": 1726 }, { "epoch": 0.27242970382931736, "grad_norm": 7.996124267578125, "learning_rate": 7.425535340524875e-07, "loss": 0.082, "step": 1727 }, { "epoch": 0.27258745119690814, "grad_norm": 9.234084129333496, "learning_rate": 7.423925293833521e-07, "loss": 0.0594, "step": 1728 }, { "epoch": 0.27274519856449897, "grad_norm": 7.054522514343262, "learning_rate": 7.422315247142167e-07, "loss": 0.0441, "step": 1729 }, { "epoch": 0.27290294593208975, "grad_norm": 5.047627925872803, "learning_rate": 7.420705200450813e-07, "loss": 0.0526, "step": 1730 }, { "epoch": 0.2730606932996806, "grad_norm": 5.253188610076904, "learning_rate": 7.419095153759458e-07, "loss": 0.0338, "step": 1731 }, { "epoch": 0.27321844066727136, "grad_norm": 8.9965238571167, "learning_rate": 7.417485107068105e-07, "loss": 0.0817, "step": 1732 }, { "epoch": 0.2733761880348622, "grad_norm": 3.8212831020355225, "learning_rate": 7.415875060376751e-07, "loss": 0.0258, "step": 1733 }, { "epoch": 0.27353393540245297, "grad_norm": 1.7131558656692505, "learning_rate": 7.414265013685397e-07, "loss": 0.0187, "step": 1734 }, { "epoch": 0.2736916827700438, "grad_norm": 2.771113634109497, "learning_rate": 7.412654966994043e-07, "loss": 0.032, "step": 1735 }, { "epoch": 0.2738494301376346, "grad_norm": 3.408644199371338, "learning_rate": 7.411044920302688e-07, "loss": 0.0673, "step": 1736 }, { "epoch": 0.2740071775052254, "grad_norm": 8.400445938110352, "learning_rate": 7.409434873611334e-07, "loss": 0.0787, "step": 1737 }, { "epoch": 0.2741649248728162, "grad_norm": 6.512462615966797, "learning_rate": 7.40782482691998e-07, "loss": 0.0315, "step": 1738 }, { "epoch": 0.27432267224040696, "grad_norm": 6.375429153442383, "learning_rate": 7.406214780228627e-07, "loss": 0.1009, "step": 1739 }, { "epoch": 0.2744804196079978, "grad_norm": 3.734159231185913, "learning_rate": 7.404604733537272e-07, "loss": 0.0553, "step": 1740 }, { "epoch": 0.27463816697558857, "grad_norm": 2.839085340499878, "learning_rate": 7.402994686845918e-07, "loss": 0.0152, "step": 1741 }, { "epoch": 0.2747959143431794, "grad_norm": 7.769985675811768, "learning_rate": 7.401384640154564e-07, "loss": 0.0598, "step": 1742 }, { "epoch": 0.2749536617107702, "grad_norm": 9.466279029846191, "learning_rate": 7.39977459346321e-07, "loss": 0.0468, "step": 1743 }, { "epoch": 0.275111409078361, "grad_norm": 4.23649263381958, "learning_rate": 7.398164546771855e-07, "loss": 0.0529, "step": 1744 }, { "epoch": 0.2752691564459518, "grad_norm": 2.059992790222168, "learning_rate": 7.396554500080502e-07, "loss": 0.0523, "step": 1745 }, { "epoch": 0.2754269038135426, "grad_norm": 8.96181869506836, "learning_rate": 7.394944453389148e-07, "loss": 0.0925, "step": 1746 }, { "epoch": 0.2755846511811334, "grad_norm": 4.507099628448486, "learning_rate": 7.393334406697795e-07, "loss": 0.0548, "step": 1747 }, { "epoch": 0.27574239854872423, "grad_norm": 5.502170562744141, "learning_rate": 7.39172436000644e-07, "loss": 0.0999, "step": 1748 }, { "epoch": 0.275900145916315, "grad_norm": 3.1486527919769287, "learning_rate": 7.390114313315086e-07, "loss": 0.0366, "step": 1749 }, { "epoch": 0.27605789328390584, "grad_norm": 7.729282379150391, "learning_rate": 7.388504266623732e-07, "loss": 0.0846, "step": 1750 }, { "epoch": 0.2762156406514966, "grad_norm": 5.723428249359131, "learning_rate": 7.386894219932378e-07, "loss": 0.0655, "step": 1751 }, { "epoch": 0.27637338801908745, "grad_norm": 9.416695594787598, "learning_rate": 7.385284173241023e-07, "loss": 0.0954, "step": 1752 }, { "epoch": 0.27653113538667823, "grad_norm": 6.169697284698486, "learning_rate": 7.383674126549669e-07, "loss": 0.0746, "step": 1753 }, { "epoch": 0.27668888275426906, "grad_norm": 4.9604973793029785, "learning_rate": 7.382064079858316e-07, "loss": 0.0756, "step": 1754 }, { "epoch": 0.27684663012185984, "grad_norm": 6.345669269561768, "learning_rate": 7.380454033166962e-07, "loss": 0.0553, "step": 1755 }, { "epoch": 0.27700437748945067, "grad_norm": 6.011808395385742, "learning_rate": 7.378843986475607e-07, "loss": 0.0634, "step": 1756 }, { "epoch": 0.27716212485704145, "grad_norm": 9.210107803344727, "learning_rate": 7.377233939784253e-07, "loss": 0.0733, "step": 1757 }, { "epoch": 0.2773198722246322, "grad_norm": 5.413785934448242, "learning_rate": 7.375623893092899e-07, "loss": 0.031, "step": 1758 }, { "epoch": 0.27747761959222306, "grad_norm": 5.12843132019043, "learning_rate": 7.374013846401545e-07, "loss": 0.082, "step": 1759 }, { "epoch": 0.27763536695981383, "grad_norm": 3.3309543132781982, "learning_rate": 7.372403799710191e-07, "loss": 0.0643, "step": 1760 }, { "epoch": 0.27779311432740467, "grad_norm": 6.6534953117370605, "learning_rate": 7.370793753018836e-07, "loss": 0.085, "step": 1761 }, { "epoch": 0.27795086169499544, "grad_norm": 7.947381496429443, "learning_rate": 7.369183706327484e-07, "loss": 0.086, "step": 1762 }, { "epoch": 0.2781086090625863, "grad_norm": 4.031906604766846, "learning_rate": 7.36757365963613e-07, "loss": 0.0528, "step": 1763 }, { "epoch": 0.27826635643017705, "grad_norm": 4.700561046600342, "learning_rate": 7.365963612944776e-07, "loss": 0.0422, "step": 1764 }, { "epoch": 0.2784241037977679, "grad_norm": 4.3197021484375, "learning_rate": 7.364353566253421e-07, "loss": 0.0754, "step": 1765 }, { "epoch": 0.27858185116535866, "grad_norm": 4.787306308746338, "learning_rate": 7.362743519562067e-07, "loss": 0.0489, "step": 1766 }, { "epoch": 0.2787395985329495, "grad_norm": 11.220168113708496, "learning_rate": 7.361133472870713e-07, "loss": 0.1168, "step": 1767 }, { "epoch": 0.27889734590054027, "grad_norm": 9.612006187438965, "learning_rate": 7.359523426179359e-07, "loss": 0.0759, "step": 1768 }, { "epoch": 0.2790550932681311, "grad_norm": 3.2283174991607666, "learning_rate": 7.357913379488004e-07, "loss": 0.0286, "step": 1769 }, { "epoch": 0.2792128406357219, "grad_norm": 1.8329448699951172, "learning_rate": 7.356303332796651e-07, "loss": 0.0155, "step": 1770 }, { "epoch": 0.2793705880033127, "grad_norm": 6.679264545440674, "learning_rate": 7.354693286105297e-07, "loss": 0.03, "step": 1771 }, { "epoch": 0.2795283353709035, "grad_norm": 6.550128936767578, "learning_rate": 7.353083239413943e-07, "loss": 0.0833, "step": 1772 }, { "epoch": 0.2796860827384943, "grad_norm": 6.05768346786499, "learning_rate": 7.351473192722588e-07, "loss": 0.0443, "step": 1773 }, { "epoch": 0.2798438301060851, "grad_norm": 6.712532043457031, "learning_rate": 7.349863146031234e-07, "loss": 0.0761, "step": 1774 }, { "epoch": 0.28000157747367593, "grad_norm": 3.4038772583007812, "learning_rate": 7.34825309933988e-07, "loss": 0.018, "step": 1775 }, { "epoch": 0.2801593248412667, "grad_norm": 5.155402183532715, "learning_rate": 7.346643052648526e-07, "loss": 0.041, "step": 1776 }, { "epoch": 0.2803170722088575, "grad_norm": 5.3723530769348145, "learning_rate": 7.345033005957172e-07, "loss": 0.0416, "step": 1777 }, { "epoch": 0.2804748195764483, "grad_norm": 6.428321361541748, "learning_rate": 7.343422959265818e-07, "loss": 0.0798, "step": 1778 }, { "epoch": 0.2806325669440391, "grad_norm": 4.262994766235352, "learning_rate": 7.341812912574465e-07, "loss": 0.0375, "step": 1779 }, { "epoch": 0.28079031431162993, "grad_norm": 5.236276626586914, "learning_rate": 7.340202865883111e-07, "loss": 0.0359, "step": 1780 }, { "epoch": 0.2809480616792207, "grad_norm": 4.757218837738037, "learning_rate": 7.338592819191757e-07, "loss": 0.0331, "step": 1781 }, { "epoch": 0.28110580904681154, "grad_norm": 4.1465678215026855, "learning_rate": 7.336982772500402e-07, "loss": 0.0346, "step": 1782 }, { "epoch": 0.2812635564144023, "grad_norm": 3.4765822887420654, "learning_rate": 7.335372725809048e-07, "loss": 0.0375, "step": 1783 }, { "epoch": 0.28142130378199315, "grad_norm": 6.680180549621582, "learning_rate": 7.333762679117695e-07, "loss": 0.0249, "step": 1784 }, { "epoch": 0.2815790511495839, "grad_norm": 6.115419864654541, "learning_rate": 7.332152632426341e-07, "loss": 0.0497, "step": 1785 }, { "epoch": 0.28173679851717476, "grad_norm": 11.377446174621582, "learning_rate": 7.330542585734986e-07, "loss": 0.0664, "step": 1786 }, { "epoch": 0.28189454588476553, "grad_norm": 4.166162490844727, "learning_rate": 7.328932539043632e-07, "loss": 0.0746, "step": 1787 }, { "epoch": 0.28205229325235637, "grad_norm": 3.433807849884033, "learning_rate": 7.327322492352278e-07, "loss": 0.0693, "step": 1788 }, { "epoch": 0.28221004061994714, "grad_norm": 4.4710693359375, "learning_rate": 7.325712445660924e-07, "loss": 0.0485, "step": 1789 }, { "epoch": 0.282367787987538, "grad_norm": 4.173577785491943, "learning_rate": 7.324102398969569e-07, "loss": 0.0441, "step": 1790 }, { "epoch": 0.28252553535512875, "grad_norm": 7.301423072814941, "learning_rate": 7.322492352278215e-07, "loss": 0.0499, "step": 1791 }, { "epoch": 0.2826832827227196, "grad_norm": 8.932049751281738, "learning_rate": 7.320882305586862e-07, "loss": 0.0746, "step": 1792 }, { "epoch": 0.28284103009031036, "grad_norm": 6.043249130249023, "learning_rate": 7.319272258895508e-07, "loss": 0.0605, "step": 1793 }, { "epoch": 0.2829987774579012, "grad_norm": 4.231344223022461, "learning_rate": 7.317662212204153e-07, "loss": 0.0488, "step": 1794 }, { "epoch": 0.283156524825492, "grad_norm": 8.559317588806152, "learning_rate": 7.316052165512799e-07, "loss": 0.1268, "step": 1795 }, { "epoch": 0.2833142721930828, "grad_norm": 11.188226699829102, "learning_rate": 7.314442118821445e-07, "loss": 0.0812, "step": 1796 }, { "epoch": 0.2834720195606736, "grad_norm": 5.2370500564575195, "learning_rate": 7.312832072130092e-07, "loss": 0.0689, "step": 1797 }, { "epoch": 0.28362976692826436, "grad_norm": 2.636385679244995, "learning_rate": 7.311222025438737e-07, "loss": 0.0198, "step": 1798 }, { "epoch": 0.2837875142958552, "grad_norm": 7.2173895835876465, "learning_rate": 7.309611978747383e-07, "loss": 0.0523, "step": 1799 }, { "epoch": 0.28394526166344597, "grad_norm": 4.4801411628723145, "learning_rate": 7.30800193205603e-07, "loss": 0.0791, "step": 1800 }, { "epoch": 0.2841030090310368, "grad_norm": 2.042889356613159, "learning_rate": 7.306391885364676e-07, "loss": 0.0115, "step": 1801 }, { "epoch": 0.2842607563986276, "grad_norm": 6.690401077270508, "learning_rate": 7.304781838673321e-07, "loss": 0.0838, "step": 1802 }, { "epoch": 0.2844185037662184, "grad_norm": 3.1295642852783203, "learning_rate": 7.303171791981967e-07, "loss": 0.0304, "step": 1803 }, { "epoch": 0.2845762511338092, "grad_norm": 6.915700912475586, "learning_rate": 7.301561745290613e-07, "loss": 0.0409, "step": 1804 }, { "epoch": 0.2847339985014, "grad_norm": 4.392620086669922, "learning_rate": 7.299951698599259e-07, "loss": 0.114, "step": 1805 }, { "epoch": 0.2848917458689908, "grad_norm": 7.388136863708496, "learning_rate": 7.298341651907905e-07, "loss": 0.0718, "step": 1806 }, { "epoch": 0.28504949323658163, "grad_norm": 10.599838256835938, "learning_rate": 7.296731605216551e-07, "loss": 0.0849, "step": 1807 }, { "epoch": 0.2852072406041724, "grad_norm": 4.139700412750244, "learning_rate": 7.295121558525197e-07, "loss": 0.0318, "step": 1808 }, { "epoch": 0.28536498797176324, "grad_norm": 5.016416072845459, "learning_rate": 7.293511511833843e-07, "loss": 0.0412, "step": 1809 }, { "epoch": 0.285522735339354, "grad_norm": 11.005081176757812, "learning_rate": 7.291901465142489e-07, "loss": 0.0772, "step": 1810 }, { "epoch": 0.28568048270694485, "grad_norm": 3.681155204772949, "learning_rate": 7.290291418451134e-07, "loss": 0.0408, "step": 1811 }, { "epoch": 0.2858382300745356, "grad_norm": 6.3219146728515625, "learning_rate": 7.28868137175978e-07, "loss": 0.0363, "step": 1812 }, { "epoch": 0.28599597744212646, "grad_norm": 6.1140522956848145, "learning_rate": 7.287071325068426e-07, "loss": 0.0821, "step": 1813 }, { "epoch": 0.28615372480971724, "grad_norm": 8.338149070739746, "learning_rate": 7.285461278377074e-07, "loss": 0.0644, "step": 1814 }, { "epoch": 0.28631147217730807, "grad_norm": 8.887350082397461, "learning_rate": 7.283851231685719e-07, "loss": 0.0998, "step": 1815 }, { "epoch": 0.28646921954489885, "grad_norm": 3.9195213317871094, "learning_rate": 7.282241184994365e-07, "loss": 0.0464, "step": 1816 }, { "epoch": 0.2866269669124896, "grad_norm": 6.6678996086120605, "learning_rate": 7.280631138303011e-07, "loss": 0.0583, "step": 1817 }, { "epoch": 0.28678471428008045, "grad_norm": 6.425255298614502, "learning_rate": 7.279021091611657e-07, "loss": 0.0823, "step": 1818 }, { "epoch": 0.28694246164767123, "grad_norm": 3.6952288150787354, "learning_rate": 7.277411044920302e-07, "loss": 0.0389, "step": 1819 }, { "epoch": 0.28710020901526206, "grad_norm": 10.184599876403809, "learning_rate": 7.275800998228948e-07, "loss": 0.0359, "step": 1820 }, { "epoch": 0.28725795638285284, "grad_norm": 4.51015567779541, "learning_rate": 7.274190951537594e-07, "loss": 0.0753, "step": 1821 }, { "epoch": 0.2874157037504437, "grad_norm": 20.935108184814453, "learning_rate": 7.272580904846241e-07, "loss": 0.0988, "step": 1822 }, { "epoch": 0.28757345111803445, "grad_norm": 4.455567359924316, "learning_rate": 7.270970858154886e-07, "loss": 0.0348, "step": 1823 }, { "epoch": 0.2877311984856253, "grad_norm": 3.6792962551116943, "learning_rate": 7.269360811463532e-07, "loss": 0.0244, "step": 1824 }, { "epoch": 0.28788894585321606, "grad_norm": 4.358467102050781, "learning_rate": 7.267750764772178e-07, "loss": 0.0365, "step": 1825 }, { "epoch": 0.2880466932208069, "grad_norm": 10.892244338989258, "learning_rate": 7.266140718080824e-07, "loss": 0.0535, "step": 1826 }, { "epoch": 0.28820444058839767, "grad_norm": 3.952486991882324, "learning_rate": 7.26453067138947e-07, "loss": 0.0583, "step": 1827 }, { "epoch": 0.2883621879559885, "grad_norm": 2.9721004962921143, "learning_rate": 7.262920624698115e-07, "loss": 0.0282, "step": 1828 }, { "epoch": 0.2885199353235793, "grad_norm": 8.202765464782715, "learning_rate": 7.261310578006761e-07, "loss": 0.0941, "step": 1829 }, { "epoch": 0.2886776826911701, "grad_norm": 6.014126777648926, "learning_rate": 7.259700531315408e-07, "loss": 0.0527, "step": 1830 }, { "epoch": 0.2888354300587609, "grad_norm": 4.782886505126953, "learning_rate": 7.258090484624055e-07, "loss": 0.0494, "step": 1831 }, { "epoch": 0.2889931774263517, "grad_norm": 2.926882028579712, "learning_rate": 7.2564804379327e-07, "loss": 0.0217, "step": 1832 }, { "epoch": 0.2891509247939425, "grad_norm": 5.702027797698975, "learning_rate": 7.254870391241346e-07, "loss": 0.0771, "step": 1833 }, { "epoch": 0.28930867216153333, "grad_norm": 7.120554447174072, "learning_rate": 7.253260344549992e-07, "loss": 0.0353, "step": 1834 }, { "epoch": 0.2894664195291241, "grad_norm": 23.75204849243164, "learning_rate": 7.251650297858638e-07, "loss": 0.0424, "step": 1835 }, { "epoch": 0.2896241668967149, "grad_norm": 6.956763744354248, "learning_rate": 7.250040251167283e-07, "loss": 0.0921, "step": 1836 }, { "epoch": 0.2897819142643057, "grad_norm": 3.723970651626587, "learning_rate": 7.24843020447593e-07, "loss": 0.0291, "step": 1837 }, { "epoch": 0.2899396616318965, "grad_norm": 5.069478511810303, "learning_rate": 7.246820157784576e-07, "loss": 0.0632, "step": 1838 }, { "epoch": 0.2900974089994873, "grad_norm": 5.687950134277344, "learning_rate": 7.245210111093222e-07, "loss": 0.0449, "step": 1839 }, { "epoch": 0.2902551563670781, "grad_norm": 4.0997114181518555, "learning_rate": 7.243600064401867e-07, "loss": 0.0797, "step": 1840 }, { "epoch": 0.29041290373466894, "grad_norm": 5.048920154571533, "learning_rate": 7.241990017710513e-07, "loss": 0.0234, "step": 1841 }, { "epoch": 0.2905706511022597, "grad_norm": 7.273456573486328, "learning_rate": 7.240379971019159e-07, "loss": 0.0473, "step": 1842 }, { "epoch": 0.29072839846985055, "grad_norm": 6.181704998016357, "learning_rate": 7.238769924327805e-07, "loss": 0.0879, "step": 1843 }, { "epoch": 0.2908861458374413, "grad_norm": 5.6075310707092285, "learning_rate": 7.23715987763645e-07, "loss": 0.0599, "step": 1844 }, { "epoch": 0.29104389320503216, "grad_norm": 6.415543079376221, "learning_rate": 7.235549830945097e-07, "loss": 0.065, "step": 1845 }, { "epoch": 0.29120164057262293, "grad_norm": 4.180621147155762, "learning_rate": 7.233939784253743e-07, "loss": 0.0609, "step": 1846 }, { "epoch": 0.29135938794021377, "grad_norm": 3.543720006942749, "learning_rate": 7.232329737562389e-07, "loss": 0.0442, "step": 1847 }, { "epoch": 0.29151713530780454, "grad_norm": 8.813889503479004, "learning_rate": 7.230719690871034e-07, "loss": 0.1239, "step": 1848 }, { "epoch": 0.2916748826753954, "grad_norm": 7.704339981079102, "learning_rate": 7.22910964417968e-07, "loss": 0.0891, "step": 1849 }, { "epoch": 0.29183263004298615, "grad_norm": 7.240747928619385, "learning_rate": 7.227499597488327e-07, "loss": 0.0787, "step": 1850 }, { "epoch": 0.291990377410577, "grad_norm": 4.727330684661865, "learning_rate": 7.225889550796973e-07, "loss": 0.0357, "step": 1851 }, { "epoch": 0.29214812477816776, "grad_norm": 4.392090797424316, "learning_rate": 7.22427950410562e-07, "loss": 0.0307, "step": 1852 }, { "epoch": 0.2923058721457586, "grad_norm": 4.477465629577637, "learning_rate": 7.222669457414265e-07, "loss": 0.073, "step": 1853 }, { "epoch": 0.29246361951334937, "grad_norm": 9.257769584655762, "learning_rate": 7.221059410722911e-07, "loss": 0.0808, "step": 1854 }, { "epoch": 0.29262136688094015, "grad_norm": 4.829099655151367, "learning_rate": 7.219449364031557e-07, "loss": 0.0491, "step": 1855 }, { "epoch": 0.292779114248531, "grad_norm": 6.342634677886963, "learning_rate": 7.217839317340203e-07, "loss": 0.0632, "step": 1856 }, { "epoch": 0.29293686161612176, "grad_norm": 3.0647029876708984, "learning_rate": 7.216229270648848e-07, "loss": 0.0425, "step": 1857 }, { "epoch": 0.2930946089837126, "grad_norm": 4.071573257446289, "learning_rate": 7.214619223957494e-07, "loss": 0.0321, "step": 1858 }, { "epoch": 0.29325235635130337, "grad_norm": 5.210506439208984, "learning_rate": 7.21300917726614e-07, "loss": 0.0836, "step": 1859 }, { "epoch": 0.2934101037188942, "grad_norm": 9.28438663482666, "learning_rate": 7.211399130574787e-07, "loss": 0.0645, "step": 1860 }, { "epoch": 0.293567851086485, "grad_norm": 6.821744918823242, "learning_rate": 7.209789083883432e-07, "loss": 0.0871, "step": 1861 }, { "epoch": 0.2937255984540758, "grad_norm": 3.003976821899414, "learning_rate": 7.208179037192078e-07, "loss": 0.0582, "step": 1862 }, { "epoch": 0.2938833458216666, "grad_norm": 6.32629919052124, "learning_rate": 7.206568990500724e-07, "loss": 0.042, "step": 1863 }, { "epoch": 0.2940410931892574, "grad_norm": 8.073723793029785, "learning_rate": 7.20495894380937e-07, "loss": 0.0911, "step": 1864 }, { "epoch": 0.2941988405568482, "grad_norm": 7.266777038574219, "learning_rate": 7.203348897118015e-07, "loss": 0.1142, "step": 1865 }, { "epoch": 0.29435658792443903, "grad_norm": 6.750946044921875, "learning_rate": 7.201738850426661e-07, "loss": 0.0666, "step": 1866 }, { "epoch": 0.2945143352920298, "grad_norm": 3.6444435119628906, "learning_rate": 7.200128803735309e-07, "loss": 0.0288, "step": 1867 }, { "epoch": 0.29467208265962064, "grad_norm": 3.0052032470703125, "learning_rate": 7.198518757043955e-07, "loss": 0.0189, "step": 1868 }, { "epoch": 0.2948298300272114, "grad_norm": 5.2682719230651855, "learning_rate": 7.1969087103526e-07, "loss": 0.0834, "step": 1869 }, { "epoch": 0.29498757739480225, "grad_norm": 5.890918254852295, "learning_rate": 7.195298663661246e-07, "loss": 0.0704, "step": 1870 }, { "epoch": 0.295145324762393, "grad_norm": 3.2959954738616943, "learning_rate": 7.193688616969892e-07, "loss": 0.0186, "step": 1871 }, { "epoch": 0.29530307212998386, "grad_norm": 3.670823097229004, "learning_rate": 7.192078570278538e-07, "loss": 0.0651, "step": 1872 }, { "epoch": 0.29546081949757463, "grad_norm": 5.776731491088867, "learning_rate": 7.190468523587184e-07, "loss": 0.0644, "step": 1873 }, { "epoch": 0.2956185668651654, "grad_norm": 2.765413761138916, "learning_rate": 7.188858476895829e-07, "loss": 0.0554, "step": 1874 }, { "epoch": 0.29577631423275624, "grad_norm": 7.823866844177246, "learning_rate": 7.187248430204476e-07, "loss": 0.0641, "step": 1875 }, { "epoch": 0.295934061600347, "grad_norm": 3.9487831592559814, "learning_rate": 7.185638383513122e-07, "loss": 0.0637, "step": 1876 }, { "epoch": 0.29609180896793785, "grad_norm": 3.203082323074341, "learning_rate": 7.184028336821768e-07, "loss": 0.0401, "step": 1877 }, { "epoch": 0.29624955633552863, "grad_norm": 3.9091076850891113, "learning_rate": 7.182418290130413e-07, "loss": 0.0448, "step": 1878 }, { "epoch": 0.29640730370311946, "grad_norm": 4.002121925354004, "learning_rate": 7.180808243439059e-07, "loss": 0.0873, "step": 1879 }, { "epoch": 0.29656505107071024, "grad_norm": 6.394353866577148, "learning_rate": 7.179198196747705e-07, "loss": 0.0338, "step": 1880 }, { "epoch": 0.29672279843830107, "grad_norm": 6.340281963348389, "learning_rate": 7.177588150056351e-07, "loss": 0.0635, "step": 1881 }, { "epoch": 0.29688054580589185, "grad_norm": 4.5851640701293945, "learning_rate": 7.175978103364997e-07, "loss": 0.0468, "step": 1882 }, { "epoch": 0.2970382931734827, "grad_norm": 5.020273685455322, "learning_rate": 7.174368056673643e-07, "loss": 0.0341, "step": 1883 }, { "epoch": 0.29719604054107346, "grad_norm": 11.44133186340332, "learning_rate": 7.17275800998229e-07, "loss": 0.0875, "step": 1884 }, { "epoch": 0.2973537879086643, "grad_norm": 4.677864074707031, "learning_rate": 7.171147963290936e-07, "loss": 0.0216, "step": 1885 }, { "epoch": 0.29751153527625507, "grad_norm": 10.180984497070312, "learning_rate": 7.169537916599581e-07, "loss": 0.076, "step": 1886 }, { "epoch": 0.2976692826438459, "grad_norm": 3.750933885574341, "learning_rate": 7.167927869908227e-07, "loss": 0.0374, "step": 1887 }, { "epoch": 0.2978270300114367, "grad_norm": 6.10151481628418, "learning_rate": 7.166317823216873e-07, "loss": 0.0442, "step": 1888 }, { "epoch": 0.2979847773790275, "grad_norm": 8.401910781860352, "learning_rate": 7.164707776525519e-07, "loss": 0.0532, "step": 1889 }, { "epoch": 0.2981425247466183, "grad_norm": 4.196383476257324, "learning_rate": 7.163097729834165e-07, "loss": 0.0568, "step": 1890 }, { "epoch": 0.2983002721142091, "grad_norm": 3.7810068130493164, "learning_rate": 7.161487683142811e-07, "loss": 0.0284, "step": 1891 }, { "epoch": 0.2984580194817999, "grad_norm": 8.036553382873535, "learning_rate": 7.159877636451457e-07, "loss": 0.0707, "step": 1892 }, { "epoch": 0.2986157668493907, "grad_norm": 5.693993091583252, "learning_rate": 7.158267589760103e-07, "loss": 0.0834, "step": 1893 }, { "epoch": 0.2987735142169815, "grad_norm": 5.394157409667969, "learning_rate": 7.156657543068748e-07, "loss": 0.0629, "step": 1894 }, { "epoch": 0.2989312615845723, "grad_norm": 7.30886173248291, "learning_rate": 7.155047496377394e-07, "loss": 0.1106, "step": 1895 }, { "epoch": 0.2990890089521631, "grad_norm": 6.162013053894043, "learning_rate": 7.15343744968604e-07, "loss": 0.057, "step": 1896 }, { "epoch": 0.2992467563197539, "grad_norm": 6.323513507843018, "learning_rate": 7.151827402994687e-07, "loss": 0.0469, "step": 1897 }, { "epoch": 0.2994045036873447, "grad_norm": 4.471866607666016, "learning_rate": 7.150217356303333e-07, "loss": 0.0706, "step": 1898 }, { "epoch": 0.2995622510549355, "grad_norm": 7.489285469055176, "learning_rate": 7.148607309611978e-07, "loss": 0.1049, "step": 1899 }, { "epoch": 0.29971999842252633, "grad_norm": 4.253059387207031, "learning_rate": 7.146997262920624e-07, "loss": 0.0361, "step": 1900 }, { "epoch": 0.2998777457901171, "grad_norm": 8.71242904663086, "learning_rate": 7.14538721622927e-07, "loss": 0.1012, "step": 1901 }, { "epoch": 0.30003549315770794, "grad_norm": 6.077664852142334, "learning_rate": 7.143777169537917e-07, "loss": 0.0932, "step": 1902 }, { "epoch": 0.3001932405252987, "grad_norm": 3.9309680461883545, "learning_rate": 7.142167122846562e-07, "loss": 0.0244, "step": 1903 }, { "epoch": 0.30035098789288955, "grad_norm": 6.765064716339111, "learning_rate": 7.140557076155208e-07, "loss": 0.0232, "step": 1904 }, { "epoch": 0.30050873526048033, "grad_norm": 6.2488884925842285, "learning_rate": 7.138947029463855e-07, "loss": 0.0723, "step": 1905 }, { "epoch": 0.30066648262807116, "grad_norm": 5.702681064605713, "learning_rate": 7.137336982772501e-07, "loss": 0.0544, "step": 1906 }, { "epoch": 0.30082422999566194, "grad_norm": 5.73229455947876, "learning_rate": 7.135726936081146e-07, "loss": 0.082, "step": 1907 }, { "epoch": 0.3009819773632528, "grad_norm": 7.134407043457031, "learning_rate": 7.134116889389792e-07, "loss": 0.0366, "step": 1908 }, { "epoch": 0.30113972473084355, "grad_norm": 3.8968775272369385, "learning_rate": 7.132506842698438e-07, "loss": 0.0505, "step": 1909 }, { "epoch": 0.3012974720984344, "grad_norm": 7.5660929679870605, "learning_rate": 7.130896796007084e-07, "loss": 0.0701, "step": 1910 }, { "epoch": 0.30145521946602516, "grad_norm": 11.128520011901855, "learning_rate": 7.129286749315729e-07, "loss": 0.0387, "step": 1911 }, { "epoch": 0.30161296683361594, "grad_norm": 4.411167144775391, "learning_rate": 7.127676702624376e-07, "loss": 0.0559, "step": 1912 }, { "epoch": 0.30177071420120677, "grad_norm": 5.5612263679504395, "learning_rate": 7.126066655933022e-07, "loss": 0.0235, "step": 1913 }, { "epoch": 0.30192846156879755, "grad_norm": 3.3991105556488037, "learning_rate": 7.124456609241668e-07, "loss": 0.0544, "step": 1914 }, { "epoch": 0.3020862089363884, "grad_norm": 3.648372173309326, "learning_rate": 7.122846562550313e-07, "loss": 0.0262, "step": 1915 }, { "epoch": 0.30224395630397916, "grad_norm": 2.522200345993042, "learning_rate": 7.121236515858959e-07, "loss": 0.0177, "step": 1916 }, { "epoch": 0.30240170367157, "grad_norm": 3.3338704109191895, "learning_rate": 7.119626469167605e-07, "loss": 0.0337, "step": 1917 }, { "epoch": 0.30255945103916076, "grad_norm": 4.939285755157471, "learning_rate": 7.118016422476251e-07, "loss": 0.0648, "step": 1918 }, { "epoch": 0.3027171984067516, "grad_norm": 3.7481281757354736, "learning_rate": 7.116406375784896e-07, "loss": 0.058, "step": 1919 }, { "epoch": 0.3028749457743424, "grad_norm": 2.6262574195861816, "learning_rate": 7.114796329093544e-07, "loss": 0.0416, "step": 1920 }, { "epoch": 0.3030326931419332, "grad_norm": 7.282456398010254, "learning_rate": 7.11318628240219e-07, "loss": 0.0788, "step": 1921 }, { "epoch": 0.303190440509524, "grad_norm": 4.289928913116455, "learning_rate": 7.111576235710836e-07, "loss": 0.0326, "step": 1922 }, { "epoch": 0.3033481878771148, "grad_norm": 6.543341159820557, "learning_rate": 7.109966189019482e-07, "loss": 0.0668, "step": 1923 }, { "epoch": 0.3035059352447056, "grad_norm": 10.349213600158691, "learning_rate": 7.108356142328127e-07, "loss": 0.0905, "step": 1924 }, { "epoch": 0.3036636826122964, "grad_norm": 7.131984233856201, "learning_rate": 7.106746095636773e-07, "loss": 0.0634, "step": 1925 }, { "epoch": 0.3038214299798872, "grad_norm": 4.043402671813965, "learning_rate": 7.105136048945419e-07, "loss": 0.0415, "step": 1926 }, { "epoch": 0.30397917734747804, "grad_norm": 2.7604012489318848, "learning_rate": 7.103526002254066e-07, "loss": 0.0153, "step": 1927 }, { "epoch": 0.3041369247150688, "grad_norm": 7.04880952835083, "learning_rate": 7.101915955562711e-07, "loss": 0.1251, "step": 1928 }, { "epoch": 0.30429467208265965, "grad_norm": 4.090600967407227, "learning_rate": 7.100305908871357e-07, "loss": 0.0533, "step": 1929 }, { "epoch": 0.3044524194502504, "grad_norm": 4.4699177742004395, "learning_rate": 7.098695862180003e-07, "loss": 0.0567, "step": 1930 }, { "epoch": 0.30461016681784125, "grad_norm": 53.74309539794922, "learning_rate": 7.097085815488649e-07, "loss": 0.1118, "step": 1931 }, { "epoch": 0.30476791418543203, "grad_norm": 5.222226142883301, "learning_rate": 7.095475768797294e-07, "loss": 0.0486, "step": 1932 }, { "epoch": 0.3049256615530228, "grad_norm": 4.8929266929626465, "learning_rate": 7.09386572210594e-07, "loss": 0.0416, "step": 1933 }, { "epoch": 0.30508340892061364, "grad_norm": 4.212865829467773, "learning_rate": 7.092255675414586e-07, "loss": 0.0697, "step": 1934 }, { "epoch": 0.3052411562882044, "grad_norm": 4.160580635070801, "learning_rate": 7.090645628723233e-07, "loss": 0.0234, "step": 1935 }, { "epoch": 0.30539890365579525, "grad_norm": 4.000143527984619, "learning_rate": 7.089035582031878e-07, "loss": 0.0592, "step": 1936 }, { "epoch": 0.30555665102338603, "grad_norm": 9.259430885314941, "learning_rate": 7.087425535340524e-07, "loss": 0.0591, "step": 1937 }, { "epoch": 0.30571439839097686, "grad_norm": 8.322980880737305, "learning_rate": 7.085815488649171e-07, "loss": 0.0901, "step": 1938 }, { "epoch": 0.30587214575856764, "grad_norm": 6.6517133712768555, "learning_rate": 7.084205441957817e-07, "loss": 0.0672, "step": 1939 }, { "epoch": 0.30602989312615847, "grad_norm": 8.325651168823242, "learning_rate": 7.082595395266462e-07, "loss": 0.0746, "step": 1940 }, { "epoch": 0.30618764049374925, "grad_norm": 5.0230393409729, "learning_rate": 7.080985348575108e-07, "loss": 0.0477, "step": 1941 }, { "epoch": 0.3063453878613401, "grad_norm": 2.52583909034729, "learning_rate": 7.079375301883755e-07, "loss": 0.0285, "step": 1942 }, { "epoch": 0.30650313522893086, "grad_norm": 5.606345176696777, "learning_rate": 7.077765255192401e-07, "loss": 0.072, "step": 1943 }, { "epoch": 0.3066608825965217, "grad_norm": 7.923591613769531, "learning_rate": 7.076155208501047e-07, "loss": 0.0639, "step": 1944 }, { "epoch": 0.30681862996411247, "grad_norm": 4.813804626464844, "learning_rate": 7.074545161809692e-07, "loss": 0.0549, "step": 1945 }, { "epoch": 0.3069763773317033, "grad_norm": 7.287686347961426, "learning_rate": 7.072935115118338e-07, "loss": 0.0409, "step": 1946 }, { "epoch": 0.3071341246992941, "grad_norm": 4.390560150146484, "learning_rate": 7.071325068426984e-07, "loss": 0.0591, "step": 1947 }, { "epoch": 0.3072918720668849, "grad_norm": 3.507387161254883, "learning_rate": 7.06971502173563e-07, "loss": 0.0565, "step": 1948 }, { "epoch": 0.3074496194344757, "grad_norm": 8.796087265014648, "learning_rate": 7.068104975044275e-07, "loss": 0.0663, "step": 1949 }, { "epoch": 0.3076073668020665, "grad_norm": 5.937930583953857, "learning_rate": 7.066494928352922e-07, "loss": 0.0525, "step": 1950 }, { "epoch": 0.3077651141696573, "grad_norm": 5.049323558807373, "learning_rate": 7.064884881661568e-07, "loss": 0.0718, "step": 1951 }, { "epoch": 0.30792286153724807, "grad_norm": 7.667891502380371, "learning_rate": 7.063274834970214e-07, "loss": 0.0425, "step": 1952 }, { "epoch": 0.3080806089048389, "grad_norm": 6.416477680206299, "learning_rate": 7.061664788278859e-07, "loss": 0.0658, "step": 1953 }, { "epoch": 0.3082383562724297, "grad_norm": 11.372632026672363, "learning_rate": 7.060054741587505e-07, "loss": 0.0367, "step": 1954 }, { "epoch": 0.3083961036400205, "grad_norm": 3.07737135887146, "learning_rate": 7.058444694896152e-07, "loss": 0.0255, "step": 1955 }, { "epoch": 0.3085538510076113, "grad_norm": 6.110159397125244, "learning_rate": 7.056834648204798e-07, "loss": 0.1018, "step": 1956 }, { "epoch": 0.3087115983752021, "grad_norm": 12.801488876342773, "learning_rate": 7.055224601513444e-07, "loss": 0.076, "step": 1957 }, { "epoch": 0.3088693457427929, "grad_norm": 5.292741298675537, "learning_rate": 7.05361455482209e-07, "loss": 0.0743, "step": 1958 }, { "epoch": 0.30902709311038373, "grad_norm": 4.4499430656433105, "learning_rate": 7.052004508130736e-07, "loss": 0.0944, "step": 1959 }, { "epoch": 0.3091848404779745, "grad_norm": 6.13045072555542, "learning_rate": 7.050394461439382e-07, "loss": 0.0544, "step": 1960 }, { "epoch": 0.30934258784556534, "grad_norm": 8.339452743530273, "learning_rate": 7.048784414748027e-07, "loss": 0.0413, "step": 1961 }, { "epoch": 0.3095003352131561, "grad_norm": 7.852583408355713, "learning_rate": 7.047174368056673e-07, "loss": 0.0628, "step": 1962 }, { "epoch": 0.30965808258074695, "grad_norm": 6.660775184631348, "learning_rate": 7.045564321365319e-07, "loss": 0.0393, "step": 1963 }, { "epoch": 0.30981582994833773, "grad_norm": 4.972191333770752, "learning_rate": 7.043954274673965e-07, "loss": 0.0336, "step": 1964 }, { "epoch": 0.30997357731592856, "grad_norm": 2.4434168338775635, "learning_rate": 7.042344227982611e-07, "loss": 0.027, "step": 1965 }, { "epoch": 0.31013132468351934, "grad_norm": 4.470157146453857, "learning_rate": 7.040734181291257e-07, "loss": 0.0742, "step": 1966 }, { "epoch": 0.31028907205111017, "grad_norm": 7.267810344696045, "learning_rate": 7.039124134599903e-07, "loss": 0.1028, "step": 1967 }, { "epoch": 0.31044681941870095, "grad_norm": 6.394923686981201, "learning_rate": 7.037514087908549e-07, "loss": 0.0615, "step": 1968 }, { "epoch": 0.3106045667862918, "grad_norm": 6.019500732421875, "learning_rate": 7.035904041217195e-07, "loss": 0.0602, "step": 1969 }, { "epoch": 0.31076231415388256, "grad_norm": 4.828829765319824, "learning_rate": 7.03429399452584e-07, "loss": 0.0455, "step": 1970 }, { "epoch": 0.31092006152147333, "grad_norm": 9.272650718688965, "learning_rate": 7.032683947834486e-07, "loss": 0.0343, "step": 1971 }, { "epoch": 0.31107780888906417, "grad_norm": 5.205511093139648, "learning_rate": 7.031073901143134e-07, "loss": 0.0532, "step": 1972 }, { "epoch": 0.31123555625665494, "grad_norm": 5.682951927185059, "learning_rate": 7.02946385445178e-07, "loss": 0.045, "step": 1973 }, { "epoch": 0.3113933036242458, "grad_norm": 4.142481327056885, "learning_rate": 7.027853807760425e-07, "loss": 0.0319, "step": 1974 }, { "epoch": 0.31155105099183655, "grad_norm": 6.376063823699951, "learning_rate": 7.026243761069071e-07, "loss": 0.0631, "step": 1975 }, { "epoch": 0.3117087983594274, "grad_norm": 7.768744468688965, "learning_rate": 7.024633714377717e-07, "loss": 0.0538, "step": 1976 }, { "epoch": 0.31186654572701816, "grad_norm": 7.69910192489624, "learning_rate": 7.023023667686363e-07, "loss": 0.0719, "step": 1977 }, { "epoch": 0.312024293094609, "grad_norm": 3.8771684169769287, "learning_rate": 7.021413620995008e-07, "loss": 0.0442, "step": 1978 }, { "epoch": 0.3121820404621998, "grad_norm": 11.783376693725586, "learning_rate": 7.019803574303654e-07, "loss": 0.0493, "step": 1979 }, { "epoch": 0.3123397878297906, "grad_norm": 5.049818992614746, "learning_rate": 7.018193527612301e-07, "loss": 0.0325, "step": 1980 }, { "epoch": 0.3124975351973814, "grad_norm": 5.231262683868408, "learning_rate": 7.016583480920947e-07, "loss": 0.0374, "step": 1981 }, { "epoch": 0.3126552825649722, "grad_norm": 2.590451955795288, "learning_rate": 7.014973434229592e-07, "loss": 0.0494, "step": 1982 }, { "epoch": 0.312813029932563, "grad_norm": 4.199472427368164, "learning_rate": 7.013363387538238e-07, "loss": 0.0495, "step": 1983 }, { "epoch": 0.3129707773001538, "grad_norm": 3.365995168685913, "learning_rate": 7.011753340846884e-07, "loss": 0.0169, "step": 1984 }, { "epoch": 0.3131285246677446, "grad_norm": 7.31893253326416, "learning_rate": 7.01014329415553e-07, "loss": 0.1077, "step": 1985 }, { "epoch": 0.31328627203533543, "grad_norm": 4.630749225616455, "learning_rate": 7.008533247464175e-07, "loss": 0.0563, "step": 1986 }, { "epoch": 0.3134440194029262, "grad_norm": 4.555026054382324, "learning_rate": 7.006923200772822e-07, "loss": 0.0628, "step": 1987 }, { "epoch": 0.31360176677051704, "grad_norm": 3.6789534091949463, "learning_rate": 7.005313154081468e-07, "loss": 0.0419, "step": 1988 }, { "epoch": 0.3137595141381078, "grad_norm": 5.142852306365967, "learning_rate": 7.003703107390114e-07, "loss": 0.0742, "step": 1989 }, { "epoch": 0.3139172615056986, "grad_norm": 5.34221076965332, "learning_rate": 7.002093060698761e-07, "loss": 0.029, "step": 1990 }, { "epoch": 0.31407500887328943, "grad_norm": 6.0126261711120605, "learning_rate": 7.000483014007406e-07, "loss": 0.0674, "step": 1991 }, { "epoch": 0.3142327562408802, "grad_norm": 6.048847198486328, "learning_rate": 6.998872967316052e-07, "loss": 0.0903, "step": 1992 }, { "epoch": 0.31439050360847104, "grad_norm": 5.697565078735352, "learning_rate": 6.997262920624698e-07, "loss": 0.0496, "step": 1993 }, { "epoch": 0.3145482509760618, "grad_norm": 2.6553454399108887, "learning_rate": 6.995652873933344e-07, "loss": 0.0292, "step": 1994 }, { "epoch": 0.31470599834365265, "grad_norm": 3.822661876678467, "learning_rate": 6.99404282724199e-07, "loss": 0.0382, "step": 1995 }, { "epoch": 0.3148637457112434, "grad_norm": 5.864170074462891, "learning_rate": 6.992432780550636e-07, "loss": 0.0975, "step": 1996 }, { "epoch": 0.31502149307883426, "grad_norm": 2.8939883708953857, "learning_rate": 6.990822733859282e-07, "loss": 0.0157, "step": 1997 }, { "epoch": 0.31517924044642504, "grad_norm": 6.828817367553711, "learning_rate": 6.989212687167928e-07, "loss": 0.0615, "step": 1998 }, { "epoch": 0.31533698781401587, "grad_norm": 6.745560646057129, "learning_rate": 6.987602640476573e-07, "loss": 0.0683, "step": 1999 }, { "epoch": 0.31549473518160664, "grad_norm": 7.2444539070129395, "learning_rate": 6.985992593785219e-07, "loss": 0.0238, "step": 2000 }, { "epoch": 0.3156524825491975, "grad_norm": 2.476250648498535, "learning_rate": 6.984382547093865e-07, "loss": 0.0303, "step": 2001 }, { "epoch": 0.31581022991678825, "grad_norm": 3.634782075881958, "learning_rate": 6.982772500402512e-07, "loss": 0.0332, "step": 2002 }, { "epoch": 0.3159679772843791, "grad_norm": 6.093087673187256, "learning_rate": 6.981162453711157e-07, "loss": 0.0594, "step": 2003 }, { "epoch": 0.31612572465196986, "grad_norm": 4.593890190124512, "learning_rate": 6.979552407019803e-07, "loss": 0.0619, "step": 2004 }, { "epoch": 0.3162834720195607, "grad_norm": 6.26422643661499, "learning_rate": 6.977942360328449e-07, "loss": 0.0453, "step": 2005 }, { "epoch": 0.3164412193871515, "grad_norm": 4.618278503417969, "learning_rate": 6.976332313637095e-07, "loss": 0.0746, "step": 2006 }, { "epoch": 0.3165989667547423, "grad_norm": 3.2007484436035156, "learning_rate": 6.97472226694574e-07, "loss": 0.0333, "step": 2007 }, { "epoch": 0.3167567141223331, "grad_norm": 4.1666154861450195, "learning_rate": 6.973112220254387e-07, "loss": 0.0394, "step": 2008 }, { "epoch": 0.31691446148992386, "grad_norm": 2.576082468032837, "learning_rate": 6.971502173563033e-07, "loss": 0.0605, "step": 2009 }, { "epoch": 0.3170722088575147, "grad_norm": 10.811822891235352, "learning_rate": 6.96989212687168e-07, "loss": 0.0873, "step": 2010 }, { "epoch": 0.31722995622510547, "grad_norm": 4.870930194854736, "learning_rate": 6.968282080180325e-07, "loss": 0.0439, "step": 2011 }, { "epoch": 0.3173877035926963, "grad_norm": 5.0976643562316895, "learning_rate": 6.966672033488971e-07, "loss": 0.0692, "step": 2012 }, { "epoch": 0.3175454509602871, "grad_norm": 6.9528656005859375, "learning_rate": 6.965061986797617e-07, "loss": 0.0664, "step": 2013 }, { "epoch": 0.3177031983278779, "grad_norm": 7.9897589683532715, "learning_rate": 6.963451940106263e-07, "loss": 0.0913, "step": 2014 }, { "epoch": 0.3178609456954687, "grad_norm": 6.256965637207031, "learning_rate": 6.961841893414909e-07, "loss": 0.0549, "step": 2015 }, { "epoch": 0.3180186930630595, "grad_norm": 7.676211833953857, "learning_rate": 6.960231846723554e-07, "loss": 0.0549, "step": 2016 }, { "epoch": 0.3181764404306503, "grad_norm": 4.518301486968994, "learning_rate": 6.958621800032201e-07, "loss": 0.0432, "step": 2017 }, { "epoch": 0.31833418779824113, "grad_norm": 10.347399711608887, "learning_rate": 6.957011753340847e-07, "loss": 0.0594, "step": 2018 }, { "epoch": 0.3184919351658319, "grad_norm": 5.0956926345825195, "learning_rate": 6.955401706649493e-07, "loss": 0.0662, "step": 2019 }, { "epoch": 0.31864968253342274, "grad_norm": 5.088799476623535, "learning_rate": 6.953791659958138e-07, "loss": 0.0416, "step": 2020 }, { "epoch": 0.3188074299010135, "grad_norm": 3.090458869934082, "learning_rate": 6.952181613266784e-07, "loss": 0.0323, "step": 2021 }, { "epoch": 0.31896517726860435, "grad_norm": 5.809577465057373, "learning_rate": 6.95057156657543e-07, "loss": 0.0568, "step": 2022 }, { "epoch": 0.3191229246361951, "grad_norm": 4.143522262573242, "learning_rate": 6.948961519884076e-07, "loss": 0.0466, "step": 2023 }, { "epoch": 0.31928067200378596, "grad_norm": 6.0525126457214355, "learning_rate": 6.947351473192721e-07, "loss": 0.056, "step": 2024 }, { "epoch": 0.31943841937137674, "grad_norm": 4.867067337036133, "learning_rate": 6.945741426501369e-07, "loss": 0.0598, "step": 2025 }, { "epoch": 0.31959616673896757, "grad_norm": 4.798086643218994, "learning_rate": 6.944131379810015e-07, "loss": 0.0743, "step": 2026 }, { "epoch": 0.31975391410655835, "grad_norm": 4.135916233062744, "learning_rate": 6.942521333118661e-07, "loss": 0.057, "step": 2027 }, { "epoch": 0.3199116614741491, "grad_norm": 4.452865123748779, "learning_rate": 6.940911286427306e-07, "loss": 0.0386, "step": 2028 }, { "epoch": 0.32006940884173996, "grad_norm": 7.033433437347412, "learning_rate": 6.939301239735952e-07, "loss": 0.0528, "step": 2029 }, { "epoch": 0.32022715620933073, "grad_norm": 14.667048454284668, "learning_rate": 6.937691193044598e-07, "loss": 0.0659, "step": 2030 }, { "epoch": 0.32038490357692156, "grad_norm": 3.494577407836914, "learning_rate": 6.936081146353244e-07, "loss": 0.0451, "step": 2031 }, { "epoch": 0.32054265094451234, "grad_norm": 3.9050979614257812, "learning_rate": 6.934471099661889e-07, "loss": 0.0464, "step": 2032 }, { "epoch": 0.3207003983121032, "grad_norm": 8.84255599975586, "learning_rate": 6.932861052970536e-07, "loss": 0.101, "step": 2033 }, { "epoch": 0.32085814567969395, "grad_norm": 6.650998115539551, "learning_rate": 6.931251006279182e-07, "loss": 0.0759, "step": 2034 }, { "epoch": 0.3210158930472848, "grad_norm": 5.78363037109375, "learning_rate": 6.929640959587828e-07, "loss": 0.0775, "step": 2035 }, { "epoch": 0.32117364041487556, "grad_norm": 5.970606803894043, "learning_rate": 6.928030912896474e-07, "loss": 0.0682, "step": 2036 }, { "epoch": 0.3213313877824664, "grad_norm": 7.525320529937744, "learning_rate": 6.926420866205119e-07, "loss": 0.0684, "step": 2037 }, { "epoch": 0.32148913515005717, "grad_norm": 7.529545783996582, "learning_rate": 6.924810819513765e-07, "loss": 0.0678, "step": 2038 }, { "epoch": 0.321646882517648, "grad_norm": 6.4445366859436035, "learning_rate": 6.923200772822411e-07, "loss": 0.0823, "step": 2039 }, { "epoch": 0.3218046298852388, "grad_norm": 3.999107599258423, "learning_rate": 6.921590726131058e-07, "loss": 0.0734, "step": 2040 }, { "epoch": 0.3219623772528296, "grad_norm": 3.815798282623291, "learning_rate": 6.919980679439703e-07, "loss": 0.0503, "step": 2041 }, { "epoch": 0.3221201246204204, "grad_norm": 4.9361958503723145, "learning_rate": 6.91837063274835e-07, "loss": 0.0446, "step": 2042 }, { "epoch": 0.3222778719880112, "grad_norm": 7.048590660095215, "learning_rate": 6.916760586056996e-07, "loss": 0.0762, "step": 2043 }, { "epoch": 0.322435619355602, "grad_norm": 10.876458168029785, "learning_rate": 6.915150539365642e-07, "loss": 0.0634, "step": 2044 }, { "epoch": 0.32259336672319283, "grad_norm": 9.228300094604492, "learning_rate": 6.913540492674287e-07, "loss": 0.0479, "step": 2045 }, { "epoch": 0.3227511140907836, "grad_norm": 5.568813323974609, "learning_rate": 6.911930445982933e-07, "loss": 0.0547, "step": 2046 }, { "epoch": 0.32290886145837444, "grad_norm": 7.572629451751709, "learning_rate": 6.91032039929158e-07, "loss": 0.0675, "step": 2047 }, { "epoch": 0.3230666088259652, "grad_norm": 5.944652080535889, "learning_rate": 6.908710352600226e-07, "loss": 0.0353, "step": 2048 }, { "epoch": 0.323224356193556, "grad_norm": 8.71141242980957, "learning_rate": 6.907100305908871e-07, "loss": 0.0571, "step": 2049 }, { "epoch": 0.3233821035611468, "grad_norm": 7.83239221572876, "learning_rate": 6.905490259217517e-07, "loss": 0.0259, "step": 2050 }, { "epoch": 0.3235398509287376, "grad_norm": 5.153806209564209, "learning_rate": 6.903880212526163e-07, "loss": 0.064, "step": 2051 }, { "epoch": 0.32369759829632844, "grad_norm": 4.7819623947143555, "learning_rate": 6.902270165834809e-07, "loss": 0.0314, "step": 2052 }, { "epoch": 0.3238553456639192, "grad_norm": 6.805207252502441, "learning_rate": 6.900660119143454e-07, "loss": 0.0347, "step": 2053 }, { "epoch": 0.32401309303151005, "grad_norm": 3.347442626953125, "learning_rate": 6.8990500724521e-07, "loss": 0.0356, "step": 2054 }, { "epoch": 0.3241708403991008, "grad_norm": 5.032720565795898, "learning_rate": 6.897440025760747e-07, "loss": 0.0661, "step": 2055 }, { "epoch": 0.32432858776669166, "grad_norm": 5.20153284072876, "learning_rate": 6.895829979069393e-07, "loss": 0.0597, "step": 2056 }, { "epoch": 0.32448633513428243, "grad_norm": 6.703059196472168, "learning_rate": 6.894219932378038e-07, "loss": 0.0479, "step": 2057 }, { "epoch": 0.32464408250187327, "grad_norm": 7.035309791564941, "learning_rate": 6.892609885686684e-07, "loss": 0.0238, "step": 2058 }, { "epoch": 0.32480182986946404, "grad_norm": 2.6613142490386963, "learning_rate": 6.89099983899533e-07, "loss": 0.0788, "step": 2059 }, { "epoch": 0.3249595772370549, "grad_norm": 4.252102851867676, "learning_rate": 6.889389792303976e-07, "loss": 0.0417, "step": 2060 }, { "epoch": 0.32511732460464565, "grad_norm": 2.8893206119537354, "learning_rate": 6.887779745612623e-07, "loss": 0.015, "step": 2061 }, { "epoch": 0.3252750719722365, "grad_norm": 7.74230432510376, "learning_rate": 6.886169698921268e-07, "loss": 0.0819, "step": 2062 }, { "epoch": 0.32543281933982726, "grad_norm": 3.7796642780303955, "learning_rate": 6.884559652229915e-07, "loss": 0.0581, "step": 2063 }, { "epoch": 0.3255905667074181, "grad_norm": 7.510829448699951, "learning_rate": 6.882949605538561e-07, "loss": 0.0723, "step": 2064 }, { "epoch": 0.32574831407500887, "grad_norm": 7.586187362670898, "learning_rate": 6.881339558847207e-07, "loss": 0.0408, "step": 2065 }, { "epoch": 0.3259060614425997, "grad_norm": 5.421665191650391, "learning_rate": 6.879729512155852e-07, "loss": 0.0322, "step": 2066 }, { "epoch": 0.3260638088101905, "grad_norm": 6.571201324462891, "learning_rate": 6.878119465464498e-07, "loss": 0.0686, "step": 2067 }, { "epoch": 0.32622155617778126, "grad_norm": 5.433219909667969, "learning_rate": 6.876509418773144e-07, "loss": 0.0555, "step": 2068 }, { "epoch": 0.3263793035453721, "grad_norm": 8.206216812133789, "learning_rate": 6.87489937208179e-07, "loss": 0.079, "step": 2069 }, { "epoch": 0.32653705091296287, "grad_norm": 5.314496040344238, "learning_rate": 6.873289325390436e-07, "loss": 0.0483, "step": 2070 }, { "epoch": 0.3266947982805537, "grad_norm": 3.5508322715759277, "learning_rate": 6.871679278699082e-07, "loss": 0.0287, "step": 2071 }, { "epoch": 0.3268525456481445, "grad_norm": 3.366732597351074, "learning_rate": 6.870069232007728e-07, "loss": 0.0152, "step": 2072 }, { "epoch": 0.3270102930157353, "grad_norm": 5.403148651123047, "learning_rate": 6.868459185316374e-07, "loss": 0.0548, "step": 2073 }, { "epoch": 0.3271680403833261, "grad_norm": 15.252777099609375, "learning_rate": 6.866849138625019e-07, "loss": 0.1246, "step": 2074 }, { "epoch": 0.3273257877509169, "grad_norm": 3.936343193054199, "learning_rate": 6.865239091933665e-07, "loss": 0.0413, "step": 2075 }, { "epoch": 0.3274835351185077, "grad_norm": 3.2121474742889404, "learning_rate": 6.863629045242311e-07, "loss": 0.0259, "step": 2076 }, { "epoch": 0.32764128248609853, "grad_norm": 3.9176294803619385, "learning_rate": 6.862018998550959e-07, "loss": 0.0343, "step": 2077 }, { "epoch": 0.3277990298536893, "grad_norm": 6.870373249053955, "learning_rate": 6.860408951859604e-07, "loss": 0.0602, "step": 2078 }, { "epoch": 0.32795677722128014, "grad_norm": 4.5205817222595215, "learning_rate": 6.85879890516825e-07, "loss": 0.0696, "step": 2079 }, { "epoch": 0.3281145245888709, "grad_norm": 4.577205657958984, "learning_rate": 6.857188858476896e-07, "loss": 0.0316, "step": 2080 }, { "epoch": 0.32827227195646175, "grad_norm": 10.280963897705078, "learning_rate": 6.855578811785542e-07, "loss": 0.0953, "step": 2081 }, { "epoch": 0.3284300193240525, "grad_norm": 6.28256368637085, "learning_rate": 6.853968765094187e-07, "loss": 0.0338, "step": 2082 }, { "epoch": 0.32858776669164336, "grad_norm": 9.391764640808105, "learning_rate": 6.852358718402833e-07, "loss": 0.0908, "step": 2083 }, { "epoch": 0.32874551405923413, "grad_norm": 5.13310432434082, "learning_rate": 6.850748671711479e-07, "loss": 0.0657, "step": 2084 }, { "epoch": 0.32890326142682497, "grad_norm": 2.295919179916382, "learning_rate": 6.849138625020126e-07, "loss": 0.0181, "step": 2085 }, { "epoch": 0.32906100879441574, "grad_norm": 9.700862884521484, "learning_rate": 6.847528578328772e-07, "loss": 0.042, "step": 2086 }, { "epoch": 0.3292187561620065, "grad_norm": 15.523784637451172, "learning_rate": 6.845918531637417e-07, "loss": 0.111, "step": 2087 }, { "epoch": 0.32937650352959735, "grad_norm": 9.510817527770996, "learning_rate": 6.844308484946063e-07, "loss": 0.0721, "step": 2088 }, { "epoch": 0.32953425089718813, "grad_norm": 3.8640923500061035, "learning_rate": 6.842698438254709e-07, "loss": 0.0596, "step": 2089 }, { "epoch": 0.32969199826477896, "grad_norm": 10.860950469970703, "learning_rate": 6.841088391563355e-07, "loss": 0.0594, "step": 2090 }, { "epoch": 0.32984974563236974, "grad_norm": 11.403396606445312, "learning_rate": 6.839478344872e-07, "loss": 0.0427, "step": 2091 }, { "epoch": 0.33000749299996057, "grad_norm": 3.601780652999878, "learning_rate": 6.837868298180647e-07, "loss": 0.0308, "step": 2092 }, { "epoch": 0.33016524036755135, "grad_norm": 1.7029801607131958, "learning_rate": 6.836258251489293e-07, "loss": 0.0165, "step": 2093 }, { "epoch": 0.3303229877351422, "grad_norm": 4.657985687255859, "learning_rate": 6.834648204797939e-07, "loss": 0.0248, "step": 2094 }, { "epoch": 0.33048073510273296, "grad_norm": 8.094704627990723, "learning_rate": 6.833038158106584e-07, "loss": 0.0859, "step": 2095 }, { "epoch": 0.3306384824703238, "grad_norm": 4.248706340789795, "learning_rate": 6.831428111415231e-07, "loss": 0.0423, "step": 2096 }, { "epoch": 0.33079622983791457, "grad_norm": 4.41000509262085, "learning_rate": 6.829818064723877e-07, "loss": 0.0379, "step": 2097 }, { "epoch": 0.3309539772055054, "grad_norm": 7.829663276672363, "learning_rate": 6.828208018032523e-07, "loss": 0.0746, "step": 2098 }, { "epoch": 0.3311117245730962, "grad_norm": 4.953271865844727, "learning_rate": 6.826597971341168e-07, "loss": 0.0817, "step": 2099 }, { "epoch": 0.331269471940687, "grad_norm": 5.108656883239746, "learning_rate": 6.824987924649815e-07, "loss": 0.0614, "step": 2100 }, { "epoch": 0.3314272193082778, "grad_norm": 5.445789337158203, "learning_rate": 6.823377877958461e-07, "loss": 0.0454, "step": 2101 }, { "epoch": 0.3315849666758686, "grad_norm": 5.153439521789551, "learning_rate": 6.821767831267107e-07, "loss": 0.0689, "step": 2102 }, { "epoch": 0.3317427140434594, "grad_norm": 4.924667835235596, "learning_rate": 6.820157784575752e-07, "loss": 0.0417, "step": 2103 }, { "epoch": 0.33190046141105023, "grad_norm": 7.9560418128967285, "learning_rate": 6.818547737884398e-07, "loss": 0.0388, "step": 2104 }, { "epoch": 0.332058208778641, "grad_norm": 5.098888397216797, "learning_rate": 6.816937691193044e-07, "loss": 0.0628, "step": 2105 }, { "epoch": 0.3322159561462318, "grad_norm": 5.160165309906006, "learning_rate": 6.81532764450169e-07, "loss": 0.035, "step": 2106 }, { "epoch": 0.3323737035138226, "grad_norm": 7.469124794006348, "learning_rate": 6.813717597810337e-07, "loss": 0.0266, "step": 2107 }, { "epoch": 0.3325314508814134, "grad_norm": 8.128046035766602, "learning_rate": 6.812107551118982e-07, "loss": 0.0644, "step": 2108 }, { "epoch": 0.3326891982490042, "grad_norm": 5.858038425445557, "learning_rate": 6.810497504427628e-07, "loss": 0.0426, "step": 2109 }, { "epoch": 0.332846945616595, "grad_norm": 7.542439937591553, "learning_rate": 6.808887457736274e-07, "loss": 0.0654, "step": 2110 }, { "epoch": 0.33300469298418583, "grad_norm": 5.132564544677734, "learning_rate": 6.80727741104492e-07, "loss": 0.0339, "step": 2111 }, { "epoch": 0.3331624403517766, "grad_norm": 5.81063175201416, "learning_rate": 6.805667364353565e-07, "loss": 0.0678, "step": 2112 }, { "epoch": 0.33332018771936744, "grad_norm": 2.5806543827056885, "learning_rate": 6.804057317662211e-07, "loss": 0.03, "step": 2113 }, { "epoch": 0.3334779350869582, "grad_norm": 2.1244475841522217, "learning_rate": 6.802447270970858e-07, "loss": 0.0173, "step": 2114 }, { "epoch": 0.33363568245454905, "grad_norm": 4.667263507843018, "learning_rate": 6.800837224279505e-07, "loss": 0.0466, "step": 2115 }, { "epoch": 0.33379342982213983, "grad_norm": 7.054257392883301, "learning_rate": 6.79922717758815e-07, "loss": 0.0268, "step": 2116 }, { "epoch": 0.33395117718973066, "grad_norm": 4.43825101852417, "learning_rate": 6.797617130896796e-07, "loss": 0.0492, "step": 2117 }, { "epoch": 0.33410892455732144, "grad_norm": 5.486238479614258, "learning_rate": 6.796007084205442e-07, "loss": 0.0377, "step": 2118 }, { "epoch": 0.3342666719249123, "grad_norm": 4.388202667236328, "learning_rate": 6.794397037514088e-07, "loss": 0.0197, "step": 2119 }, { "epoch": 0.33442441929250305, "grad_norm": 6.367566108703613, "learning_rate": 6.792786990822733e-07, "loss": 0.0588, "step": 2120 }, { "epoch": 0.3345821666600939, "grad_norm": 9.597057342529297, "learning_rate": 6.791176944131379e-07, "loss": 0.0593, "step": 2121 }, { "epoch": 0.33473991402768466, "grad_norm": 3.9881293773651123, "learning_rate": 6.789566897440026e-07, "loss": 0.0538, "step": 2122 }, { "epoch": 0.3348976613952755, "grad_norm": 8.376191139221191, "learning_rate": 6.787956850748672e-07, "loss": 0.0617, "step": 2123 }, { "epoch": 0.33505540876286627, "grad_norm": 1.8899438381195068, "learning_rate": 6.786346804057317e-07, "loss": 0.0139, "step": 2124 }, { "epoch": 0.33521315613045705, "grad_norm": 8.477391242980957, "learning_rate": 6.784736757365963e-07, "loss": 0.0518, "step": 2125 }, { "epoch": 0.3353709034980479, "grad_norm": 9.249786376953125, "learning_rate": 6.783126710674609e-07, "loss": 0.0346, "step": 2126 }, { "epoch": 0.33552865086563866, "grad_norm": 8.91741943359375, "learning_rate": 6.781516663983255e-07, "loss": 0.0738, "step": 2127 }, { "epoch": 0.3356863982332295, "grad_norm": 6.2260942459106445, "learning_rate": 6.7799066172919e-07, "loss": 0.062, "step": 2128 }, { "epoch": 0.33584414560082027, "grad_norm": 6.918528079986572, "learning_rate": 6.778296570600546e-07, "loss": 0.0304, "step": 2129 }, { "epoch": 0.3360018929684111, "grad_norm": 7.220362186431885, "learning_rate": 6.776686523909194e-07, "loss": 0.0494, "step": 2130 }, { "epoch": 0.3361596403360019, "grad_norm": 3.4409945011138916, "learning_rate": 6.77507647721784e-07, "loss": 0.0577, "step": 2131 }, { "epoch": 0.3363173877035927, "grad_norm": 5.775416851043701, "learning_rate": 6.773466430526486e-07, "loss": 0.0523, "step": 2132 }, { "epoch": 0.3364751350711835, "grad_norm": 5.859851360321045, "learning_rate": 6.771856383835131e-07, "loss": 0.0648, "step": 2133 }, { "epoch": 0.3366328824387743, "grad_norm": 4.478068828582764, "learning_rate": 6.770246337143777e-07, "loss": 0.0491, "step": 2134 }, { "epoch": 0.3367906298063651, "grad_norm": 8.653120994567871, "learning_rate": 6.768636290452423e-07, "loss": 0.081, "step": 2135 }, { "epoch": 0.3369483771739559, "grad_norm": 6.8522868156433105, "learning_rate": 6.767026243761069e-07, "loss": 0.0527, "step": 2136 }, { "epoch": 0.3371061245415467, "grad_norm": 6.237538814544678, "learning_rate": 6.765416197069714e-07, "loss": 0.0711, "step": 2137 }, { "epoch": 0.33726387190913754, "grad_norm": 4.676800727844238, "learning_rate": 6.763806150378361e-07, "loss": 0.0688, "step": 2138 }, { "epoch": 0.3374216192767283, "grad_norm": 6.894059181213379, "learning_rate": 6.762196103687007e-07, "loss": 0.0857, "step": 2139 }, { "epoch": 0.33757936664431915, "grad_norm": 5.634133815765381, "learning_rate": 6.760586056995653e-07, "loss": 0.0701, "step": 2140 }, { "epoch": 0.3377371140119099, "grad_norm": 8.917547225952148, "learning_rate": 6.758976010304298e-07, "loss": 0.0747, "step": 2141 }, { "epoch": 0.33789486137950075, "grad_norm": 10.881660461425781, "learning_rate": 6.757365963612944e-07, "loss": 0.0441, "step": 2142 }, { "epoch": 0.33805260874709153, "grad_norm": 7.538335800170898, "learning_rate": 6.75575591692159e-07, "loss": 0.0394, "step": 2143 }, { "epoch": 0.3382103561146823, "grad_norm": 8.540990829467773, "learning_rate": 6.754145870230236e-07, "loss": 0.0725, "step": 2144 }, { "epoch": 0.33836810348227314, "grad_norm": 7.7875189781188965, "learning_rate": 6.752535823538882e-07, "loss": 0.0582, "step": 2145 }, { "epoch": 0.3385258508498639, "grad_norm": 4.958803176879883, "learning_rate": 6.750925776847528e-07, "loss": 0.0378, "step": 2146 }, { "epoch": 0.33868359821745475, "grad_norm": 5.001570224761963, "learning_rate": 6.749315730156174e-07, "loss": 0.0312, "step": 2147 }, { "epoch": 0.33884134558504553, "grad_norm": 12.070775985717773, "learning_rate": 6.74770568346482e-07, "loss": 0.0415, "step": 2148 }, { "epoch": 0.33899909295263636, "grad_norm": 9.16423225402832, "learning_rate": 6.746095636773466e-07, "loss": 0.0561, "step": 2149 }, { "epoch": 0.33915684032022714, "grad_norm": 4.144524574279785, "learning_rate": 6.744485590082112e-07, "loss": 0.0262, "step": 2150 }, { "epoch": 0.33931458768781797, "grad_norm": 12.374677658081055, "learning_rate": 6.742875543390758e-07, "loss": 0.0611, "step": 2151 }, { "epoch": 0.33947233505540875, "grad_norm": 9.37118148803711, "learning_rate": 6.741265496699405e-07, "loss": 0.0578, "step": 2152 }, { "epoch": 0.3396300824229996, "grad_norm": 9.701456069946289, "learning_rate": 6.739655450008051e-07, "loss": 0.0411, "step": 2153 }, { "epoch": 0.33978782979059036, "grad_norm": 8.668558120727539, "learning_rate": 6.738045403316696e-07, "loss": 0.0331, "step": 2154 }, { "epoch": 0.3399455771581812, "grad_norm": 4.742426872253418, "learning_rate": 6.736435356625342e-07, "loss": 0.0515, "step": 2155 }, { "epoch": 0.34010332452577197, "grad_norm": 4.369631767272949, "learning_rate": 6.734825309933988e-07, "loss": 0.0561, "step": 2156 }, { "epoch": 0.3402610718933628, "grad_norm": 7.481386661529541, "learning_rate": 6.733215263242634e-07, "loss": 0.1153, "step": 2157 }, { "epoch": 0.3404188192609536, "grad_norm": 5.711971282958984, "learning_rate": 6.731605216551279e-07, "loss": 0.0301, "step": 2158 }, { "epoch": 0.3405765666285444, "grad_norm": 4.880026340484619, "learning_rate": 6.729995169859925e-07, "loss": 0.0467, "step": 2159 }, { "epoch": 0.3407343139961352, "grad_norm": 3.742311954498291, "learning_rate": 6.728385123168572e-07, "loss": 0.0437, "step": 2160 }, { "epoch": 0.340892061363726, "grad_norm": 3.4775466918945312, "learning_rate": 6.726775076477218e-07, "loss": 0.0258, "step": 2161 }, { "epoch": 0.3410498087313168, "grad_norm": 8.224218368530273, "learning_rate": 6.725165029785863e-07, "loss": 0.0583, "step": 2162 }, { "epoch": 0.34120755609890757, "grad_norm": 5.834990501403809, "learning_rate": 6.723554983094509e-07, "loss": 0.0643, "step": 2163 }, { "epoch": 0.3413653034664984, "grad_norm": 3.2723138332366943, "learning_rate": 6.721944936403155e-07, "loss": 0.035, "step": 2164 }, { "epoch": 0.3415230508340892, "grad_norm": 4.137942790985107, "learning_rate": 6.720334889711801e-07, "loss": 0.0326, "step": 2165 }, { "epoch": 0.34168079820168, "grad_norm": 9.54454517364502, "learning_rate": 6.718724843020446e-07, "loss": 0.085, "step": 2166 }, { "epoch": 0.3418385455692708, "grad_norm": 5.521014213562012, "learning_rate": 6.717114796329093e-07, "loss": 0.0306, "step": 2167 }, { "epoch": 0.3419962929368616, "grad_norm": 5.938337802886963, "learning_rate": 6.71550474963774e-07, "loss": 0.0266, "step": 2168 }, { "epoch": 0.3421540403044524, "grad_norm": 2.8671207427978516, "learning_rate": 6.713894702946386e-07, "loss": 0.0155, "step": 2169 }, { "epoch": 0.34231178767204323, "grad_norm": 2.2926113605499268, "learning_rate": 6.712284656255031e-07, "loss": 0.0418, "step": 2170 }, { "epoch": 0.342469535039634, "grad_norm": 4.5548553466796875, "learning_rate": 6.710674609563677e-07, "loss": 0.0669, "step": 2171 }, { "epoch": 0.34262728240722484, "grad_norm": 5.34604024887085, "learning_rate": 6.709064562872323e-07, "loss": 0.0505, "step": 2172 }, { "epoch": 0.3427850297748156, "grad_norm": 5.072606563568115, "learning_rate": 6.707454516180969e-07, "loss": 0.0515, "step": 2173 }, { "epoch": 0.34294277714240645, "grad_norm": 13.048532485961914, "learning_rate": 6.705844469489614e-07, "loss": 0.0748, "step": 2174 }, { "epoch": 0.34310052450999723, "grad_norm": 6.24669075012207, "learning_rate": 6.704234422798261e-07, "loss": 0.0358, "step": 2175 }, { "epoch": 0.34325827187758806, "grad_norm": 4.102876663208008, "learning_rate": 6.702624376106907e-07, "loss": 0.0476, "step": 2176 }, { "epoch": 0.34341601924517884, "grad_norm": 3.69059681892395, "learning_rate": 6.701014329415553e-07, "loss": 0.0222, "step": 2177 }, { "epoch": 0.34357376661276967, "grad_norm": 3.0619795322418213, "learning_rate": 6.699404282724199e-07, "loss": 0.0407, "step": 2178 }, { "epoch": 0.34373151398036045, "grad_norm": 8.129076957702637, "learning_rate": 6.697794236032844e-07, "loss": 0.0876, "step": 2179 }, { "epoch": 0.3438892613479513, "grad_norm": 3.9379684925079346, "learning_rate": 6.69618418934149e-07, "loss": 0.024, "step": 2180 }, { "epoch": 0.34404700871554206, "grad_norm": 6.143450736999512, "learning_rate": 6.694574142650136e-07, "loss": 0.0589, "step": 2181 }, { "epoch": 0.3442047560831329, "grad_norm": 4.693875312805176, "learning_rate": 6.692964095958783e-07, "loss": 0.0715, "step": 2182 }, { "epoch": 0.34436250345072367, "grad_norm": 5.9264750480651855, "learning_rate": 6.691354049267428e-07, "loss": 0.0389, "step": 2183 }, { "epoch": 0.34452025081831444, "grad_norm": 5.538174629211426, "learning_rate": 6.689744002576075e-07, "loss": 0.0425, "step": 2184 }, { "epoch": 0.3446779981859053, "grad_norm": 8.8797607421875, "learning_rate": 6.688133955884721e-07, "loss": 0.0537, "step": 2185 }, { "epoch": 0.34483574555349605, "grad_norm": 5.80300760269165, "learning_rate": 6.686523909193367e-07, "loss": 0.0605, "step": 2186 }, { "epoch": 0.3449934929210869, "grad_norm": 3.030482053756714, "learning_rate": 6.684913862502012e-07, "loss": 0.0254, "step": 2187 }, { "epoch": 0.34515124028867766, "grad_norm": 3.2663259506225586, "learning_rate": 6.683303815810658e-07, "loss": 0.0215, "step": 2188 }, { "epoch": 0.3453089876562685, "grad_norm": 3.619297504425049, "learning_rate": 6.681693769119304e-07, "loss": 0.0278, "step": 2189 }, { "epoch": 0.3454667350238593, "grad_norm": 1.7165285348892212, "learning_rate": 6.680083722427951e-07, "loss": 0.0126, "step": 2190 }, { "epoch": 0.3456244823914501, "grad_norm": 5.1468892097473145, "learning_rate": 6.678473675736596e-07, "loss": 0.0386, "step": 2191 }, { "epoch": 0.3457822297590409, "grad_norm": 5.578167915344238, "learning_rate": 6.676863629045242e-07, "loss": 0.0546, "step": 2192 }, { "epoch": 0.3459399771266317, "grad_norm": 8.097320556640625, "learning_rate": 6.675253582353888e-07, "loss": 0.0392, "step": 2193 }, { "epoch": 0.3460977244942225, "grad_norm": 6.397778034210205, "learning_rate": 6.673643535662534e-07, "loss": 0.0714, "step": 2194 }, { "epoch": 0.3462554718618133, "grad_norm": 7.528390407562256, "learning_rate": 6.672033488971179e-07, "loss": 0.0277, "step": 2195 }, { "epoch": 0.3464132192294041, "grad_norm": 7.353246212005615, "learning_rate": 6.670423442279825e-07, "loss": 0.0518, "step": 2196 }, { "epoch": 0.34657096659699493, "grad_norm": 5.379136562347412, "learning_rate": 6.668813395588471e-07, "loss": 0.0307, "step": 2197 }, { "epoch": 0.3467287139645857, "grad_norm": 7.021356582641602, "learning_rate": 6.667203348897118e-07, "loss": 0.0911, "step": 2198 }, { "epoch": 0.34688646133217654, "grad_norm": 8.127105712890625, "learning_rate": 6.665593302205764e-07, "loss": 0.086, "step": 2199 }, { "epoch": 0.3470442086997673, "grad_norm": 7.807665824890137, "learning_rate": 6.663983255514409e-07, "loss": 0.072, "step": 2200 }, { "epoch": 0.34720195606735815, "grad_norm": 5.4244184494018555, "learning_rate": 6.662373208823056e-07, "loss": 0.0761, "step": 2201 }, { "epoch": 0.34735970343494893, "grad_norm": 2.1045219898223877, "learning_rate": 6.660763162131702e-07, "loss": 0.0111, "step": 2202 }, { "epoch": 0.3475174508025397, "grad_norm": 6.108265399932861, "learning_rate": 6.659153115440348e-07, "loss": 0.0861, "step": 2203 }, { "epoch": 0.34767519817013054, "grad_norm": 3.343996524810791, "learning_rate": 6.657543068748993e-07, "loss": 0.0218, "step": 2204 }, { "epoch": 0.3478329455377213, "grad_norm": 4.500611305236816, "learning_rate": 6.65593302205764e-07, "loss": 0.0495, "step": 2205 }, { "epoch": 0.34799069290531215, "grad_norm": 7.43673038482666, "learning_rate": 6.654322975366286e-07, "loss": 0.0949, "step": 2206 }, { "epoch": 0.3481484402729029, "grad_norm": 10.482646942138672, "learning_rate": 6.652712928674932e-07, "loss": 0.1303, "step": 2207 }, { "epoch": 0.34830618764049376, "grad_norm": 11.60330867767334, "learning_rate": 6.651102881983577e-07, "loss": 0.1006, "step": 2208 }, { "epoch": 0.34846393500808454, "grad_norm": 5.609463691711426, "learning_rate": 6.649492835292223e-07, "loss": 0.0344, "step": 2209 }, { "epoch": 0.34862168237567537, "grad_norm": 6.145679473876953, "learning_rate": 6.647882788600869e-07, "loss": 0.0276, "step": 2210 }, { "epoch": 0.34877942974326615, "grad_norm": 6.1248369216918945, "learning_rate": 6.646272741909515e-07, "loss": 0.0551, "step": 2211 }, { "epoch": 0.348937177110857, "grad_norm": 5.781682968139648, "learning_rate": 6.64466269521816e-07, "loss": 0.0471, "step": 2212 }, { "epoch": 0.34909492447844775, "grad_norm": 6.478988170623779, "learning_rate": 6.643052648526807e-07, "loss": 0.0834, "step": 2213 }, { "epoch": 0.3492526718460386, "grad_norm": 4.307476043701172, "learning_rate": 6.641442601835453e-07, "loss": 0.054, "step": 2214 }, { "epoch": 0.34941041921362936, "grad_norm": 4.316431522369385, "learning_rate": 6.639832555144099e-07, "loss": 0.0367, "step": 2215 }, { "epoch": 0.3495681665812202, "grad_norm": 9.591572761535645, "learning_rate": 6.638222508452744e-07, "loss": 0.0952, "step": 2216 }, { "epoch": 0.349725913948811, "grad_norm": 6.154400825500488, "learning_rate": 6.63661246176139e-07, "loss": 0.0371, "step": 2217 }, { "epoch": 0.3498836613164018, "grad_norm": 4.875076770782471, "learning_rate": 6.635002415070036e-07, "loss": 0.066, "step": 2218 }, { "epoch": 0.3500414086839926, "grad_norm": 3.7643606662750244, "learning_rate": 6.633392368378683e-07, "loss": 0.0476, "step": 2219 }, { "epoch": 0.3501991560515834, "grad_norm": 4.582226753234863, "learning_rate": 6.631782321687329e-07, "loss": 0.0952, "step": 2220 }, { "epoch": 0.3503569034191742, "grad_norm": 4.8520989418029785, "learning_rate": 6.630172274995975e-07, "loss": 0.0425, "step": 2221 }, { "epoch": 0.35051465078676497, "grad_norm": 7.374439239501953, "learning_rate": 6.628562228304621e-07, "loss": 0.0993, "step": 2222 }, { "epoch": 0.3506723981543558, "grad_norm": 3.0643153190612793, "learning_rate": 6.626952181613267e-07, "loss": 0.0241, "step": 2223 }, { "epoch": 0.3508301455219466, "grad_norm": 5.558948040008545, "learning_rate": 6.625342134921913e-07, "loss": 0.0403, "step": 2224 }, { "epoch": 0.3509878928895374, "grad_norm": 7.733405590057373, "learning_rate": 6.623732088230558e-07, "loss": 0.0742, "step": 2225 }, { "epoch": 0.3511456402571282, "grad_norm": 5.296177864074707, "learning_rate": 6.622122041539204e-07, "loss": 0.0502, "step": 2226 }, { "epoch": 0.351303387624719, "grad_norm": 3.479292869567871, "learning_rate": 6.62051199484785e-07, "loss": 0.0352, "step": 2227 }, { "epoch": 0.3514611349923098, "grad_norm": 9.535543441772461, "learning_rate": 6.618901948156497e-07, "loss": 0.0658, "step": 2228 }, { "epoch": 0.35161888235990063, "grad_norm": 3.5360796451568604, "learning_rate": 6.617291901465142e-07, "loss": 0.0382, "step": 2229 }, { "epoch": 0.3517766297274914, "grad_norm": 3.81123948097229, "learning_rate": 6.615681854773788e-07, "loss": 0.0648, "step": 2230 }, { "epoch": 0.35193437709508224, "grad_norm": 4.938370227813721, "learning_rate": 6.614071808082434e-07, "loss": 0.046, "step": 2231 }, { "epoch": 0.352092124462673, "grad_norm": 7.3840179443359375, "learning_rate": 6.61246176139108e-07, "loss": 0.1033, "step": 2232 }, { "epoch": 0.35224987183026385, "grad_norm": 2.876157522201538, "learning_rate": 6.610851714699725e-07, "loss": 0.0433, "step": 2233 }, { "epoch": 0.3524076191978546, "grad_norm": 7.421253204345703, "learning_rate": 6.609241668008371e-07, "loss": 0.0561, "step": 2234 }, { "epoch": 0.35256536656544546, "grad_norm": 5.2663044929504395, "learning_rate": 6.607631621317018e-07, "loss": 0.053, "step": 2235 }, { "epoch": 0.35272311393303624, "grad_norm": 7.223364353179932, "learning_rate": 6.606021574625665e-07, "loss": 0.1221, "step": 2236 }, { "epoch": 0.35288086130062707, "grad_norm": 6.649539470672607, "learning_rate": 6.60441152793431e-07, "loss": 0.0576, "step": 2237 }, { "epoch": 0.35303860866821785, "grad_norm": 6.069492340087891, "learning_rate": 6.602801481242956e-07, "loss": 0.055, "step": 2238 }, { "epoch": 0.3531963560358087, "grad_norm": 6.612998962402344, "learning_rate": 6.601191434551602e-07, "loss": 0.0765, "step": 2239 }, { "epoch": 0.35335410340339946, "grad_norm": 4.131660461425781, "learning_rate": 6.599581387860248e-07, "loss": 0.0332, "step": 2240 }, { "epoch": 0.35351185077099023, "grad_norm": 2.3796064853668213, "learning_rate": 6.597971341168893e-07, "loss": 0.022, "step": 2241 }, { "epoch": 0.35366959813858106, "grad_norm": 7.201269149780273, "learning_rate": 6.596361294477539e-07, "loss": 0.0531, "step": 2242 }, { "epoch": 0.35382734550617184, "grad_norm": 6.009885311126709, "learning_rate": 6.594751247786186e-07, "loss": 0.0577, "step": 2243 }, { "epoch": 0.3539850928737627, "grad_norm": 7.4139299392700195, "learning_rate": 6.593141201094832e-07, "loss": 0.0682, "step": 2244 }, { "epoch": 0.35414284024135345, "grad_norm": 4.9820756912231445, "learning_rate": 6.591531154403478e-07, "loss": 0.0236, "step": 2245 }, { "epoch": 0.3543005876089443, "grad_norm": 5.673033237457275, "learning_rate": 6.589921107712123e-07, "loss": 0.0483, "step": 2246 }, { "epoch": 0.35445833497653506, "grad_norm": 3.904254198074341, "learning_rate": 6.588311061020769e-07, "loss": 0.0353, "step": 2247 }, { "epoch": 0.3546160823441259, "grad_norm": 4.530683994293213, "learning_rate": 6.586701014329415e-07, "loss": 0.0606, "step": 2248 }, { "epoch": 0.35477382971171667, "grad_norm": 4.462540149688721, "learning_rate": 6.585090967638061e-07, "loss": 0.0283, "step": 2249 }, { "epoch": 0.3549315770793075, "grad_norm": 4.321871757507324, "learning_rate": 6.583480920946707e-07, "loss": 0.0524, "step": 2250 }, { "epoch": 0.3550893244468983, "grad_norm": 2.95967435836792, "learning_rate": 6.581870874255353e-07, "loss": 0.0286, "step": 2251 }, { "epoch": 0.3552470718144891, "grad_norm": 6.752710342407227, "learning_rate": 6.580260827563999e-07, "loss": 0.0682, "step": 2252 }, { "epoch": 0.3554048191820799, "grad_norm": 7.123131275177002, "learning_rate": 6.578650780872646e-07, "loss": 0.0915, "step": 2253 }, { "epoch": 0.3555625665496707, "grad_norm": 7.508081912994385, "learning_rate": 6.57704073418129e-07, "loss": 0.0787, "step": 2254 }, { "epoch": 0.3557203139172615, "grad_norm": 12.460843086242676, "learning_rate": 6.575430687489937e-07, "loss": 0.0714, "step": 2255 }, { "epoch": 0.35587806128485233, "grad_norm": 7.170982360839844, "learning_rate": 6.573820640798583e-07, "loss": 0.0487, "step": 2256 }, { "epoch": 0.3560358086524431, "grad_norm": 5.3124165534973145, "learning_rate": 6.572210594107229e-07, "loss": 0.0449, "step": 2257 }, { "epoch": 0.35619355602003394, "grad_norm": 10.241024017333984, "learning_rate": 6.570600547415875e-07, "loss": 0.0925, "step": 2258 }, { "epoch": 0.3563513033876247, "grad_norm": 7.437023639678955, "learning_rate": 6.568990500724521e-07, "loss": 0.0456, "step": 2259 }, { "epoch": 0.3565090507552155, "grad_norm": 4.243247032165527, "learning_rate": 6.567380454033167e-07, "loss": 0.0599, "step": 2260 }, { "epoch": 0.35666679812280633, "grad_norm": 8.64215087890625, "learning_rate": 6.565770407341813e-07, "loss": 0.0651, "step": 2261 }, { "epoch": 0.3568245454903971, "grad_norm": 4.423739433288574, "learning_rate": 6.564160360650458e-07, "loss": 0.044, "step": 2262 }, { "epoch": 0.35698229285798794, "grad_norm": 4.106833457946777, "learning_rate": 6.562550313959104e-07, "loss": 0.0575, "step": 2263 }, { "epoch": 0.3571400402255787, "grad_norm": 7.103672504425049, "learning_rate": 6.56094026726775e-07, "loss": 0.0622, "step": 2264 }, { "epoch": 0.35729778759316955, "grad_norm": 2.7301411628723145, "learning_rate": 6.559330220576397e-07, "loss": 0.0235, "step": 2265 }, { "epoch": 0.3574555349607603, "grad_norm": 6.06307315826416, "learning_rate": 6.557720173885042e-07, "loss": 0.0643, "step": 2266 }, { "epoch": 0.35761328232835116, "grad_norm": 3.6312015056610107, "learning_rate": 6.556110127193688e-07, "loss": 0.0479, "step": 2267 }, { "epoch": 0.35777102969594193, "grad_norm": 5.553654670715332, "learning_rate": 6.554500080502334e-07, "loss": 0.0557, "step": 2268 }, { "epoch": 0.35792877706353277, "grad_norm": 8.613818168640137, "learning_rate": 6.55289003381098e-07, "loss": 0.0714, "step": 2269 }, { "epoch": 0.35808652443112354, "grad_norm": 4.059633255004883, "learning_rate": 6.551279987119626e-07, "loss": 0.0385, "step": 2270 }, { "epoch": 0.3582442717987144, "grad_norm": 4.890695095062256, "learning_rate": 6.549669940428271e-07, "loss": 0.0311, "step": 2271 }, { "epoch": 0.35840201916630515, "grad_norm": 5.212426662445068, "learning_rate": 6.548059893736918e-07, "loss": 0.0394, "step": 2272 }, { "epoch": 0.358559766533896, "grad_norm": 3.5437400341033936, "learning_rate": 6.546449847045565e-07, "loss": 0.0366, "step": 2273 }, { "epoch": 0.35871751390148676, "grad_norm": 24.547109603881836, "learning_rate": 6.544839800354211e-07, "loss": 0.12, "step": 2274 }, { "epoch": 0.3588752612690776, "grad_norm": 7.82403039932251, "learning_rate": 6.543229753662856e-07, "loss": 0.073, "step": 2275 }, { "epoch": 0.35903300863666837, "grad_norm": 3.1485228538513184, "learning_rate": 6.541619706971502e-07, "loss": 0.0837, "step": 2276 }, { "epoch": 0.3591907560042592, "grad_norm": 11.192998886108398, "learning_rate": 6.540009660280148e-07, "loss": 0.053, "step": 2277 }, { "epoch": 0.35934850337185, "grad_norm": 5.769438743591309, "learning_rate": 6.538399613588794e-07, "loss": 0.0845, "step": 2278 }, { "epoch": 0.35950625073944076, "grad_norm": 6.332764148712158, "learning_rate": 6.536789566897439e-07, "loss": 0.0645, "step": 2279 }, { "epoch": 0.3596639981070316, "grad_norm": 1.8677188158035278, "learning_rate": 6.535179520206086e-07, "loss": 0.0168, "step": 2280 }, { "epoch": 0.35982174547462237, "grad_norm": 2.9039602279663086, "learning_rate": 6.533569473514732e-07, "loss": 0.0204, "step": 2281 }, { "epoch": 0.3599794928422132, "grad_norm": 6.307854175567627, "learning_rate": 6.531959426823378e-07, "loss": 0.0508, "step": 2282 }, { "epoch": 0.360137240209804, "grad_norm": 4.860239028930664, "learning_rate": 6.530349380132023e-07, "loss": 0.06, "step": 2283 }, { "epoch": 0.3602949875773948, "grad_norm": 6.497222423553467, "learning_rate": 6.528739333440669e-07, "loss": 0.0523, "step": 2284 }, { "epoch": 0.3604527349449856, "grad_norm": 6.6986002922058105, "learning_rate": 6.527129286749315e-07, "loss": 0.1066, "step": 2285 }, { "epoch": 0.3606104823125764, "grad_norm": 4.625892162322998, "learning_rate": 6.525519240057961e-07, "loss": 0.0288, "step": 2286 }, { "epoch": 0.3607682296801672, "grad_norm": 6.019299507141113, "learning_rate": 6.523909193366606e-07, "loss": 0.0382, "step": 2287 }, { "epoch": 0.36092597704775803, "grad_norm": 5.150125503540039, "learning_rate": 6.522299146675253e-07, "loss": 0.0304, "step": 2288 }, { "epoch": 0.3610837244153488, "grad_norm": 9.732718467712402, "learning_rate": 6.5206890999839e-07, "loss": 0.0852, "step": 2289 }, { "epoch": 0.36124147178293964, "grad_norm": 5.909388542175293, "learning_rate": 6.519079053292546e-07, "loss": 0.0282, "step": 2290 }, { "epoch": 0.3613992191505304, "grad_norm": 5.037200450897217, "learning_rate": 6.517469006601191e-07, "loss": 0.0325, "step": 2291 }, { "epoch": 0.36155696651812125, "grad_norm": 9.236132621765137, "learning_rate": 6.515858959909837e-07, "loss": 0.0724, "step": 2292 }, { "epoch": 0.361714713885712, "grad_norm": 4.459693908691406, "learning_rate": 6.514248913218483e-07, "loss": 0.0343, "step": 2293 }, { "epoch": 0.36187246125330286, "grad_norm": 5.420177936553955, "learning_rate": 6.512638866527129e-07, "loss": 0.0282, "step": 2294 }, { "epoch": 0.36203020862089363, "grad_norm": 3.4490017890930176, "learning_rate": 6.511028819835776e-07, "loss": 0.0183, "step": 2295 }, { "epoch": 0.36218795598848447, "grad_norm": 3.956369400024414, "learning_rate": 6.509418773144421e-07, "loss": 0.0497, "step": 2296 }, { "epoch": 0.36234570335607524, "grad_norm": 6.420275688171387, "learning_rate": 6.507808726453067e-07, "loss": 0.0508, "step": 2297 }, { "epoch": 0.3625034507236661, "grad_norm": 7.002887725830078, "learning_rate": 6.506198679761713e-07, "loss": 0.039, "step": 2298 }, { "epoch": 0.36266119809125685, "grad_norm": 6.247574806213379, "learning_rate": 6.504588633070359e-07, "loss": 0.025, "step": 2299 }, { "epoch": 0.36281894545884763, "grad_norm": 3.8853237628936768, "learning_rate": 6.502978586379004e-07, "loss": 0.0901, "step": 2300 }, { "epoch": 0.36297669282643846, "grad_norm": 5.598296642303467, "learning_rate": 6.50136853968765e-07, "loss": 0.0545, "step": 2301 }, { "epoch": 0.36313444019402924, "grad_norm": 3.694289445877075, "learning_rate": 6.499758492996296e-07, "loss": 0.0464, "step": 2302 }, { "epoch": 0.3632921875616201, "grad_norm": 6.625827312469482, "learning_rate": 6.498148446304943e-07, "loss": 0.0307, "step": 2303 }, { "epoch": 0.36344993492921085, "grad_norm": 3.9422924518585205, "learning_rate": 6.496538399613588e-07, "loss": 0.0507, "step": 2304 }, { "epoch": 0.36344993492921085, "eval_accuracy": 0.9849744891671391, "eval_f1": 0.9849744891671391, "eval_loss": 0.05127612501382828, "eval_runtime": 4719.973, "eval_samples_per_second": 42.978, "eval_steps_per_second": 2.686, "step": 2304 }, { "epoch": 0.3636076822968017, "grad_norm": 6.840505599975586, "learning_rate": 6.494928352922234e-07, "loss": 0.1002, "step": 2305 }, { "epoch": 0.36376542966439246, "grad_norm": 3.1464266777038574, "learning_rate": 6.49331830623088e-07, "loss": 0.0337, "step": 2306 }, { "epoch": 0.3639231770319833, "grad_norm": 5.0393595695495605, "learning_rate": 6.491708259539527e-07, "loss": 0.0617, "step": 2307 }, { "epoch": 0.36408092439957407, "grad_norm": 6.115160942077637, "learning_rate": 6.490098212848172e-07, "loss": 0.0848, "step": 2308 }, { "epoch": 0.3642386717671649, "grad_norm": 4.03957462310791, "learning_rate": 6.488488166156818e-07, "loss": 0.0258, "step": 2309 }, { "epoch": 0.3643964191347557, "grad_norm": 4.076841831207275, "learning_rate": 6.486878119465465e-07, "loss": 0.0319, "step": 2310 }, { "epoch": 0.3645541665023465, "grad_norm": 2.939751386642456, "learning_rate": 6.485268072774111e-07, "loss": 0.0257, "step": 2311 }, { "epoch": 0.3647119138699373, "grad_norm": 8.846038818359375, "learning_rate": 6.483658026082756e-07, "loss": 0.0496, "step": 2312 }, { "epoch": 0.3648696612375281, "grad_norm": 4.721747398376465, "learning_rate": 6.482047979391402e-07, "loss": 0.0591, "step": 2313 }, { "epoch": 0.3650274086051189, "grad_norm": 6.604150295257568, "learning_rate": 6.480437932700048e-07, "loss": 0.1203, "step": 2314 }, { "epoch": 0.36518515597270973, "grad_norm": 5.340870380401611, "learning_rate": 6.478827886008694e-07, "loss": 0.0305, "step": 2315 }, { "epoch": 0.3653429033403005, "grad_norm": 5.70967435836792, "learning_rate": 6.47721783931734e-07, "loss": 0.0727, "step": 2316 }, { "epoch": 0.36550065070789134, "grad_norm": 7.697112083435059, "learning_rate": 6.475607792625985e-07, "loss": 0.0524, "step": 2317 }, { "epoch": 0.3656583980754821, "grad_norm": 3.6987898349761963, "learning_rate": 6.473997745934632e-07, "loss": 0.0369, "step": 2318 }, { "epoch": 0.3658161454430729, "grad_norm": 5.392115592956543, "learning_rate": 6.472387699243278e-07, "loss": 0.0531, "step": 2319 }, { "epoch": 0.3659738928106637, "grad_norm": 6.497808456420898, "learning_rate": 6.470777652551924e-07, "loss": 0.0477, "step": 2320 }, { "epoch": 0.3661316401782545, "grad_norm": 4.177874565124512, "learning_rate": 6.469167605860569e-07, "loss": 0.0568, "step": 2321 }, { "epoch": 0.36628938754584534, "grad_norm": 8.207234382629395, "learning_rate": 6.467557559169215e-07, "loss": 0.0627, "step": 2322 }, { "epoch": 0.3664471349134361, "grad_norm": 6.176510810852051, "learning_rate": 6.465947512477861e-07, "loss": 0.0439, "step": 2323 }, { "epoch": 0.36660488228102694, "grad_norm": 3.681514024734497, "learning_rate": 6.464337465786508e-07, "loss": 0.0454, "step": 2324 }, { "epoch": 0.3667626296486177, "grad_norm": 6.141844272613525, "learning_rate": 6.462727419095154e-07, "loss": 0.0576, "step": 2325 }, { "epoch": 0.36692037701620855, "grad_norm": 3.4974746704101562, "learning_rate": 6.4611173724038e-07, "loss": 0.0504, "step": 2326 }, { "epoch": 0.36707812438379933, "grad_norm": 2.7725894451141357, "learning_rate": 6.459507325712446e-07, "loss": 0.0173, "step": 2327 }, { "epoch": 0.36723587175139016, "grad_norm": 9.339814186096191, "learning_rate": 6.457897279021092e-07, "loss": 0.1016, "step": 2328 }, { "epoch": 0.36739361911898094, "grad_norm": 5.565303802490234, "learning_rate": 6.456287232329737e-07, "loss": 0.0458, "step": 2329 }, { "epoch": 0.3675513664865718, "grad_norm": 4.297898292541504, "learning_rate": 6.454677185638383e-07, "loss": 0.0355, "step": 2330 }, { "epoch": 0.36770911385416255, "grad_norm": 4.813329219818115, "learning_rate": 6.453067138947029e-07, "loss": 0.0243, "step": 2331 }, { "epoch": 0.3678668612217534, "grad_norm": 5.359735488891602, "learning_rate": 6.451457092255675e-07, "loss": 0.0619, "step": 2332 }, { "epoch": 0.36802460858934416, "grad_norm": 4.5004563331604, "learning_rate": 6.449847045564321e-07, "loss": 0.0238, "step": 2333 }, { "epoch": 0.368182355956935, "grad_norm": 3.844249725341797, "learning_rate": 6.448236998872967e-07, "loss": 0.0561, "step": 2334 }, { "epoch": 0.36834010332452577, "grad_norm": 4.186479091644287, "learning_rate": 6.446626952181613e-07, "loss": 0.0609, "step": 2335 }, { "epoch": 0.3684978506921166, "grad_norm": 4.213637351989746, "learning_rate": 6.445016905490259e-07, "loss": 0.0343, "step": 2336 }, { "epoch": 0.3686555980597074, "grad_norm": 6.416787624359131, "learning_rate": 6.443406858798904e-07, "loss": 0.0787, "step": 2337 }, { "epoch": 0.36881334542729816, "grad_norm": 7.132354259490967, "learning_rate": 6.44179681210755e-07, "loss": 0.0677, "step": 2338 }, { "epoch": 0.368971092794889, "grad_norm": 9.036219596862793, "learning_rate": 6.440186765416196e-07, "loss": 0.0536, "step": 2339 }, { "epoch": 0.36912884016247977, "grad_norm": 3.9812729358673096, "learning_rate": 6.438576718724843e-07, "loss": 0.0452, "step": 2340 }, { "epoch": 0.3692865875300706, "grad_norm": 7.19404935836792, "learning_rate": 6.43696667203349e-07, "loss": 0.0442, "step": 2341 }, { "epoch": 0.3694443348976614, "grad_norm": 6.11970329284668, "learning_rate": 6.435356625342135e-07, "loss": 0.06, "step": 2342 }, { "epoch": 0.3696020822652522, "grad_norm": 3.8445143699645996, "learning_rate": 6.433746578650781e-07, "loss": 0.0497, "step": 2343 }, { "epoch": 0.369759829632843, "grad_norm": 5.607524394989014, "learning_rate": 6.432136531959427e-07, "loss": 0.0453, "step": 2344 }, { "epoch": 0.3699175770004338, "grad_norm": 6.61794376373291, "learning_rate": 6.430526485268073e-07, "loss": 0.0903, "step": 2345 }, { "epoch": 0.3700753243680246, "grad_norm": 6.52325963973999, "learning_rate": 6.428916438576718e-07, "loss": 0.0769, "step": 2346 }, { "epoch": 0.3702330717356154, "grad_norm": 4.8962082862854, "learning_rate": 6.427306391885364e-07, "loss": 0.0378, "step": 2347 }, { "epoch": 0.3703908191032062, "grad_norm": 4.14684534072876, "learning_rate": 6.425696345194011e-07, "loss": 0.0498, "step": 2348 }, { "epoch": 0.37054856647079704, "grad_norm": 6.0636820793151855, "learning_rate": 6.424086298502657e-07, "loss": 0.0654, "step": 2349 }, { "epoch": 0.3707063138383878, "grad_norm": 5.343741416931152, "learning_rate": 6.422476251811302e-07, "loss": 0.0331, "step": 2350 }, { "epoch": 0.37086406120597865, "grad_norm": 2.8783695697784424, "learning_rate": 6.420866205119948e-07, "loss": 0.0281, "step": 2351 }, { "epoch": 0.3710218085735694, "grad_norm": 2.5364882946014404, "learning_rate": 6.419256158428594e-07, "loss": 0.0251, "step": 2352 }, { "epoch": 0.37117955594116026, "grad_norm": 4.4716033935546875, "learning_rate": 6.41764611173724e-07, "loss": 0.0267, "step": 2353 }, { "epoch": 0.37133730330875103, "grad_norm": 7.375990390777588, "learning_rate": 6.416036065045885e-07, "loss": 0.0686, "step": 2354 }, { "epoch": 0.37149505067634186, "grad_norm": 2.959165096282959, "learning_rate": 6.414426018354532e-07, "loss": 0.0246, "step": 2355 }, { "epoch": 0.37165279804393264, "grad_norm": 5.672486305236816, "learning_rate": 6.412815971663178e-07, "loss": 0.0935, "step": 2356 }, { "epoch": 0.3718105454115234, "grad_norm": 3.0972232818603516, "learning_rate": 6.411205924971824e-07, "loss": 0.0622, "step": 2357 }, { "epoch": 0.37196829277911425, "grad_norm": 5.868617534637451, "learning_rate": 6.409595878280469e-07, "loss": 0.0666, "step": 2358 }, { "epoch": 0.37212604014670503, "grad_norm": 1.829486608505249, "learning_rate": 6.407985831589115e-07, "loss": 0.0112, "step": 2359 }, { "epoch": 0.37228378751429586, "grad_norm": 3.74314022064209, "learning_rate": 6.406375784897762e-07, "loss": 0.044, "step": 2360 }, { "epoch": 0.37244153488188664, "grad_norm": 4.790905475616455, "learning_rate": 6.404765738206408e-07, "loss": 0.0469, "step": 2361 }, { "epoch": 0.37259928224947747, "grad_norm": 4.5560760498046875, "learning_rate": 6.403155691515054e-07, "loss": 0.0396, "step": 2362 }, { "epoch": 0.37275702961706825, "grad_norm": 3.961839437484741, "learning_rate": 6.4015456448237e-07, "loss": 0.0653, "step": 2363 }, { "epoch": 0.3729147769846591, "grad_norm": 6.641368389129639, "learning_rate": 6.399935598132346e-07, "loss": 0.0646, "step": 2364 }, { "epoch": 0.37307252435224986, "grad_norm": 6.974590301513672, "learning_rate": 6.398325551440992e-07, "loss": 0.0984, "step": 2365 }, { "epoch": 0.3732302717198407, "grad_norm": 7.7075395584106445, "learning_rate": 6.396715504749638e-07, "loss": 0.031, "step": 2366 }, { "epoch": 0.37338801908743147, "grad_norm": 6.126717567443848, "learning_rate": 6.395105458058283e-07, "loss": 0.0444, "step": 2367 }, { "epoch": 0.3735457664550223, "grad_norm": 11.811305046081543, "learning_rate": 6.393495411366929e-07, "loss": 0.0598, "step": 2368 }, { "epoch": 0.3737035138226131, "grad_norm": 4.188122272491455, "learning_rate": 6.391885364675575e-07, "loss": 0.0436, "step": 2369 }, { "epoch": 0.3738612611902039, "grad_norm": 4.03995943069458, "learning_rate": 6.390275317984222e-07, "loss": 0.0401, "step": 2370 }, { "epoch": 0.3740190085577947, "grad_norm": 8.820528030395508, "learning_rate": 6.388665271292867e-07, "loss": 0.063, "step": 2371 }, { "epoch": 0.3741767559253855, "grad_norm": 10.953715324401855, "learning_rate": 6.387055224601513e-07, "loss": 0.0597, "step": 2372 }, { "epoch": 0.3743345032929763, "grad_norm": 4.182105541229248, "learning_rate": 6.385445177910159e-07, "loss": 0.0396, "step": 2373 }, { "epoch": 0.3744922506605671, "grad_norm": 5.633617877960205, "learning_rate": 6.383835131218805e-07, "loss": 0.0612, "step": 2374 }, { "epoch": 0.3746499980281579, "grad_norm": 6.203413963317871, "learning_rate": 6.38222508452745e-07, "loss": 0.0568, "step": 2375 }, { "epoch": 0.3748077453957487, "grad_norm": 4.6891584396362305, "learning_rate": 6.380615037836096e-07, "loss": 0.0448, "step": 2376 }, { "epoch": 0.3749654927633395, "grad_norm": 3.4268014430999756, "learning_rate": 6.379004991144743e-07, "loss": 0.0294, "step": 2377 }, { "epoch": 0.3751232401309303, "grad_norm": 6.151249408721924, "learning_rate": 6.37739494445339e-07, "loss": 0.0439, "step": 2378 }, { "epoch": 0.3752809874985211, "grad_norm": 5.077561855316162, "learning_rate": 6.375784897762035e-07, "loss": 0.0379, "step": 2379 }, { "epoch": 0.3754387348661119, "grad_norm": 7.715423583984375, "learning_rate": 6.374174851070681e-07, "loss": 0.032, "step": 2380 }, { "epoch": 0.37559648223370273, "grad_norm": 5.999269962310791, "learning_rate": 6.372564804379327e-07, "loss": 0.052, "step": 2381 }, { "epoch": 0.3757542296012935, "grad_norm": 6.657078742980957, "learning_rate": 6.370954757687973e-07, "loss": 0.072, "step": 2382 }, { "epoch": 0.37591197696888434, "grad_norm": 6.159919261932373, "learning_rate": 6.369344710996618e-07, "loss": 0.0386, "step": 2383 }, { "epoch": 0.3760697243364751, "grad_norm": 8.939737319946289, "learning_rate": 6.367734664305264e-07, "loss": 0.0474, "step": 2384 }, { "epoch": 0.37622747170406595, "grad_norm": 3.6807610988616943, "learning_rate": 6.366124617613911e-07, "loss": 0.026, "step": 2385 }, { "epoch": 0.37638521907165673, "grad_norm": 5.3889665603637695, "learning_rate": 6.364514570922557e-07, "loss": 0.0222, "step": 2386 }, { "epoch": 0.37654296643924756, "grad_norm": 5.128452777862549, "learning_rate": 6.362904524231203e-07, "loss": 0.0553, "step": 2387 }, { "epoch": 0.37670071380683834, "grad_norm": 4.838190078735352, "learning_rate": 6.361294477539848e-07, "loss": 0.047, "step": 2388 }, { "epoch": 0.37685846117442917, "grad_norm": 3.7896814346313477, "learning_rate": 6.359684430848494e-07, "loss": 0.052, "step": 2389 }, { "epoch": 0.37701620854201995, "grad_norm": 1.4873887300491333, "learning_rate": 6.35807438415714e-07, "loss": 0.0108, "step": 2390 }, { "epoch": 0.3771739559096108, "grad_norm": 5.165665626525879, "learning_rate": 6.356464337465786e-07, "loss": 0.0651, "step": 2391 }, { "epoch": 0.37733170327720156, "grad_norm": 6.901916980743408, "learning_rate": 6.354854290774431e-07, "loss": 0.0468, "step": 2392 }, { "epoch": 0.3774894506447924, "grad_norm": 3.8376920223236084, "learning_rate": 6.353244244083078e-07, "loss": 0.046, "step": 2393 }, { "epoch": 0.37764719801238317, "grad_norm": 3.073216199874878, "learning_rate": 6.351634197391725e-07, "loss": 0.056, "step": 2394 }, { "epoch": 0.37780494537997394, "grad_norm": 7.888088226318359, "learning_rate": 6.350024150700371e-07, "loss": 0.0889, "step": 2395 }, { "epoch": 0.3779626927475648, "grad_norm": 7.4682393074035645, "learning_rate": 6.348414104009016e-07, "loss": 0.0683, "step": 2396 }, { "epoch": 0.37812044011515555, "grad_norm": 5.220827102661133, "learning_rate": 6.346804057317662e-07, "loss": 0.0423, "step": 2397 }, { "epoch": 0.3782781874827464, "grad_norm": 4.824035167694092, "learning_rate": 6.345194010626308e-07, "loss": 0.0361, "step": 2398 }, { "epoch": 0.37843593485033716, "grad_norm": 3.3719117641448975, "learning_rate": 6.343583963934954e-07, "loss": 0.0251, "step": 2399 }, { "epoch": 0.378593682217928, "grad_norm": 3.443359375, "learning_rate": 6.341973917243599e-07, "loss": 0.0138, "step": 2400 }, { "epoch": 0.3787514295855188, "grad_norm": 4.2096781730651855, "learning_rate": 6.340363870552246e-07, "loss": 0.04, "step": 2401 }, { "epoch": 0.3789091769531096, "grad_norm": 5.901605606079102, "learning_rate": 6.338753823860892e-07, "loss": 0.0486, "step": 2402 }, { "epoch": 0.3790669243207004, "grad_norm": 8.777657508850098, "learning_rate": 6.337143777169538e-07, "loss": 0.047, "step": 2403 }, { "epoch": 0.3792246716882912, "grad_norm": 6.585122585296631, "learning_rate": 6.335533730478183e-07, "loss": 0.0529, "step": 2404 }, { "epoch": 0.379382419055882, "grad_norm": 7.888032913208008, "learning_rate": 6.333923683786829e-07, "loss": 0.0799, "step": 2405 }, { "epoch": 0.3795401664234728, "grad_norm": 5.162048816680908, "learning_rate": 6.332313637095475e-07, "loss": 0.0865, "step": 2406 }, { "epoch": 0.3796979137910636, "grad_norm": 4.420879364013672, "learning_rate": 6.330703590404121e-07, "loss": 0.0229, "step": 2407 }, { "epoch": 0.37985566115865443, "grad_norm": 5.623416423797607, "learning_rate": 6.329093543712768e-07, "loss": 0.0429, "step": 2408 }, { "epoch": 0.3800134085262452, "grad_norm": 5.633999824523926, "learning_rate": 6.327483497021413e-07, "loss": 0.0733, "step": 2409 }, { "epoch": 0.38017115589383604, "grad_norm": 2.145211935043335, "learning_rate": 6.325873450330059e-07, "loss": 0.0268, "step": 2410 }, { "epoch": 0.3803289032614268, "grad_norm": 7.416122913360596, "learning_rate": 6.324263403638705e-07, "loss": 0.0431, "step": 2411 }, { "epoch": 0.38048665062901765, "grad_norm": 5.949075698852539, "learning_rate": 6.322653356947352e-07, "loss": 0.0566, "step": 2412 }, { "epoch": 0.38064439799660843, "grad_norm": 4.465049743652344, "learning_rate": 6.321043310255997e-07, "loss": 0.0542, "step": 2413 }, { "epoch": 0.38080214536419926, "grad_norm": 5.267343521118164, "learning_rate": 6.319433263564643e-07, "loss": 0.0359, "step": 2414 }, { "epoch": 0.38095989273179004, "grad_norm": 14.05527114868164, "learning_rate": 6.31782321687329e-07, "loss": 0.0905, "step": 2415 }, { "epoch": 0.3811176400993808, "grad_norm": 7.6798224449157715, "learning_rate": 6.316213170181936e-07, "loss": 0.0884, "step": 2416 }, { "epoch": 0.38127538746697165, "grad_norm": 9.165621757507324, "learning_rate": 6.314603123490581e-07, "loss": 0.0417, "step": 2417 }, { "epoch": 0.3814331348345624, "grad_norm": 11.236653327941895, "learning_rate": 6.312993076799227e-07, "loss": 0.0908, "step": 2418 }, { "epoch": 0.38159088220215326, "grad_norm": 4.317572593688965, "learning_rate": 6.311383030107873e-07, "loss": 0.0681, "step": 2419 }, { "epoch": 0.38174862956974404, "grad_norm": 6.037778377532959, "learning_rate": 6.309772983416519e-07, "loss": 0.0733, "step": 2420 }, { "epoch": 0.38190637693733487, "grad_norm": 4.881504535675049, "learning_rate": 6.308162936725164e-07, "loss": 0.0721, "step": 2421 }, { "epoch": 0.38206412430492565, "grad_norm": 4.847290992736816, "learning_rate": 6.30655289003381e-07, "loss": 0.0366, "step": 2422 }, { "epoch": 0.3822218716725165, "grad_norm": 9.645156860351562, "learning_rate": 6.304942843342457e-07, "loss": 0.0441, "step": 2423 }, { "epoch": 0.38237961904010725, "grad_norm": 4.416476249694824, "learning_rate": 6.303332796651103e-07, "loss": 0.0503, "step": 2424 }, { "epoch": 0.3825373664076981, "grad_norm": 7.714351177215576, "learning_rate": 6.301722749959748e-07, "loss": 0.0625, "step": 2425 }, { "epoch": 0.38269511377528886, "grad_norm": 7.266255855560303, "learning_rate": 6.300112703268394e-07, "loss": 0.0309, "step": 2426 }, { "epoch": 0.3828528611428797, "grad_norm": 2.3878281116485596, "learning_rate": 6.29850265657704e-07, "loss": 0.0249, "step": 2427 }, { "epoch": 0.3830106085104705, "grad_norm": 6.210450649261475, "learning_rate": 6.296892609885686e-07, "loss": 0.0318, "step": 2428 }, { "epoch": 0.3831683558780613, "grad_norm": 3.793093204498291, "learning_rate": 6.295282563194331e-07, "loss": 0.0483, "step": 2429 }, { "epoch": 0.3833261032456521, "grad_norm": 5.334296703338623, "learning_rate": 6.293672516502979e-07, "loss": 0.0808, "step": 2430 }, { "epoch": 0.3834838506132429, "grad_norm": 6.60718297958374, "learning_rate": 6.292062469811625e-07, "loss": 0.0304, "step": 2431 }, { "epoch": 0.3836415979808337, "grad_norm": 3.425261974334717, "learning_rate": 6.290452423120271e-07, "loss": 0.0645, "step": 2432 }, { "epoch": 0.3837993453484245, "grad_norm": 3.1075892448425293, "learning_rate": 6.288842376428917e-07, "loss": 0.0284, "step": 2433 }, { "epoch": 0.3839570927160153, "grad_norm": 5.557161331176758, "learning_rate": 6.287232329737562e-07, "loss": 0.0459, "step": 2434 }, { "epoch": 0.3841148400836061, "grad_norm": 5.449305534362793, "learning_rate": 6.285622283046208e-07, "loss": 0.0582, "step": 2435 }, { "epoch": 0.3842725874511969, "grad_norm": 2.887645721435547, "learning_rate": 6.284012236354854e-07, "loss": 0.0209, "step": 2436 }, { "epoch": 0.3844303348187877, "grad_norm": 4.608367919921875, "learning_rate": 6.2824021896635e-07, "loss": 0.017, "step": 2437 }, { "epoch": 0.3845880821863785, "grad_norm": 10.7024564743042, "learning_rate": 6.280792142972146e-07, "loss": 0.0878, "step": 2438 }, { "epoch": 0.3847458295539693, "grad_norm": 4.104433059692383, "learning_rate": 6.279182096280792e-07, "loss": 0.0483, "step": 2439 }, { "epoch": 0.38490357692156013, "grad_norm": 7.577397346496582, "learning_rate": 6.277572049589438e-07, "loss": 0.0798, "step": 2440 }, { "epoch": 0.3850613242891509, "grad_norm": 4.732962608337402, "learning_rate": 6.275962002898084e-07, "loss": 0.0842, "step": 2441 }, { "epoch": 0.38521907165674174, "grad_norm": 5.1730852127075195, "learning_rate": 6.274351956206729e-07, "loss": 0.0453, "step": 2442 }, { "epoch": 0.3853768190243325, "grad_norm": 3.8691246509552, "learning_rate": 6.272741909515375e-07, "loss": 0.0496, "step": 2443 }, { "epoch": 0.38553456639192335, "grad_norm": 2.767871618270874, "learning_rate": 6.271131862824021e-07, "loss": 0.0443, "step": 2444 }, { "epoch": 0.3856923137595141, "grad_norm": 5.965371131896973, "learning_rate": 6.269521816132668e-07, "loss": 0.0637, "step": 2445 }, { "epoch": 0.38585006112710496, "grad_norm": 13.240874290466309, "learning_rate": 6.267911769441313e-07, "loss": 0.0599, "step": 2446 }, { "epoch": 0.38600780849469574, "grad_norm": 3.1452207565307617, "learning_rate": 6.26630172274996e-07, "loss": 0.0299, "step": 2447 }, { "epoch": 0.38616555586228657, "grad_norm": 8.212433815002441, "learning_rate": 6.264691676058606e-07, "loss": 0.0393, "step": 2448 }, { "epoch": 0.38632330322987735, "grad_norm": 6.709705829620361, "learning_rate": 6.263081629367252e-07, "loss": 0.0846, "step": 2449 }, { "epoch": 0.3864810505974682, "grad_norm": 4.421270847320557, "learning_rate": 6.261471582675897e-07, "loss": 0.0293, "step": 2450 }, { "epoch": 0.38663879796505896, "grad_norm": 4.260890483856201, "learning_rate": 6.259861535984543e-07, "loss": 0.0533, "step": 2451 }, { "epoch": 0.3867965453326498, "grad_norm": 7.362264156341553, "learning_rate": 6.258251489293189e-07, "loss": 0.0538, "step": 2452 }, { "epoch": 0.38695429270024057, "grad_norm": 6.69329309463501, "learning_rate": 6.256641442601836e-07, "loss": 0.0681, "step": 2453 }, { "epoch": 0.38711204006783134, "grad_norm": 3.216433048248291, "learning_rate": 6.255031395910481e-07, "loss": 0.0334, "step": 2454 }, { "epoch": 0.3872697874354222, "grad_norm": 4.169561386108398, "learning_rate": 6.253421349219127e-07, "loss": 0.0167, "step": 2455 }, { "epoch": 0.38742753480301295, "grad_norm": 10.017484664916992, "learning_rate": 6.251811302527773e-07, "loss": 0.09, "step": 2456 }, { "epoch": 0.3875852821706038, "grad_norm": 8.480804443359375, "learning_rate": 6.250201255836419e-07, "loss": 0.0646, "step": 2457 }, { "epoch": 0.38774302953819456, "grad_norm": 6.587338447570801, "learning_rate": 6.248591209145065e-07, "loss": 0.0596, "step": 2458 }, { "epoch": 0.3879007769057854, "grad_norm": 4.2713751792907715, "learning_rate": 6.24698116245371e-07, "loss": 0.054, "step": 2459 }, { "epoch": 0.38805852427337617, "grad_norm": 3.821355104446411, "learning_rate": 6.245371115762357e-07, "loss": 0.0197, "step": 2460 }, { "epoch": 0.388216271640967, "grad_norm": 4.111475467681885, "learning_rate": 6.243761069071003e-07, "loss": 0.0562, "step": 2461 }, { "epoch": 0.3883740190085578, "grad_norm": 3.1913421154022217, "learning_rate": 6.242151022379649e-07, "loss": 0.021, "step": 2462 }, { "epoch": 0.3885317663761486, "grad_norm": 4.067933559417725, "learning_rate": 6.240540975688294e-07, "loss": 0.0314, "step": 2463 }, { "epoch": 0.3886895137437394, "grad_norm": 6.808985233306885, "learning_rate": 6.23893092899694e-07, "loss": 0.112, "step": 2464 }, { "epoch": 0.3888472611113302, "grad_norm": 8.162635803222656, "learning_rate": 6.237320882305587e-07, "loss": 0.06, "step": 2465 }, { "epoch": 0.389005008478921, "grad_norm": 6.6350483894348145, "learning_rate": 6.235710835614233e-07, "loss": 0.035, "step": 2466 }, { "epoch": 0.38916275584651183, "grad_norm": 6.872002601623535, "learning_rate": 6.234100788922878e-07, "loss": 0.051, "step": 2467 }, { "epoch": 0.3893205032141026, "grad_norm": 7.840211391448975, "learning_rate": 6.232490742231525e-07, "loss": 0.0767, "step": 2468 }, { "epoch": 0.38947825058169344, "grad_norm": 3.9296979904174805, "learning_rate": 6.230880695540171e-07, "loss": 0.0416, "step": 2469 }, { "epoch": 0.3896359979492842, "grad_norm": 3.726431369781494, "learning_rate": 6.229270648848817e-07, "loss": 0.0314, "step": 2470 }, { "epoch": 0.38979374531687505, "grad_norm": 4.923587799072266, "learning_rate": 6.227660602157462e-07, "loss": 0.0323, "step": 2471 }, { "epoch": 0.38995149268446583, "grad_norm": 6.875946044921875, "learning_rate": 6.226050555466108e-07, "loss": 0.1052, "step": 2472 }, { "epoch": 0.3901092400520566, "grad_norm": 10.48278522491455, "learning_rate": 6.224440508774754e-07, "loss": 0.1129, "step": 2473 }, { "epoch": 0.39026698741964744, "grad_norm": 7.798339366912842, "learning_rate": 6.2228304620834e-07, "loss": 0.0719, "step": 2474 }, { "epoch": 0.3904247347872382, "grad_norm": 4.402515888214111, "learning_rate": 6.221220415392045e-07, "loss": 0.0881, "step": 2475 }, { "epoch": 0.39058248215482905, "grad_norm": 6.922409534454346, "learning_rate": 6.219610368700692e-07, "loss": 0.0552, "step": 2476 }, { "epoch": 0.3907402295224198, "grad_norm": 4.888563632965088, "learning_rate": 6.218000322009338e-07, "loss": 0.0481, "step": 2477 }, { "epoch": 0.39089797689001066, "grad_norm": 4.72125768661499, "learning_rate": 6.216390275317984e-07, "loss": 0.0999, "step": 2478 }, { "epoch": 0.39105572425760143, "grad_norm": 4.6083526611328125, "learning_rate": 6.21478022862663e-07, "loss": 0.0648, "step": 2479 }, { "epoch": 0.39121347162519227, "grad_norm": 2.519092559814453, "learning_rate": 6.213170181935275e-07, "loss": 0.0162, "step": 2480 }, { "epoch": 0.39137121899278304, "grad_norm": 2.008596658706665, "learning_rate": 6.211560135243921e-07, "loss": 0.0214, "step": 2481 }, { "epoch": 0.3915289663603739, "grad_norm": 3.965940237045288, "learning_rate": 6.209950088552567e-07, "loss": 0.0445, "step": 2482 }, { "epoch": 0.39168671372796465, "grad_norm": 4.958406448364258, "learning_rate": 6.208340041861215e-07, "loss": 0.0409, "step": 2483 }, { "epoch": 0.3918444610955555, "grad_norm": 7.141300678253174, "learning_rate": 6.20672999516986e-07, "loss": 0.0733, "step": 2484 }, { "epoch": 0.39200220846314626, "grad_norm": 3.656527280807495, "learning_rate": 6.205119948478506e-07, "loss": 0.0803, "step": 2485 }, { "epoch": 0.3921599558307371, "grad_norm": 8.250334739685059, "learning_rate": 6.203509901787152e-07, "loss": 0.0888, "step": 2486 }, { "epoch": 0.39231770319832787, "grad_norm": 4.159690856933594, "learning_rate": 6.201899855095798e-07, "loss": 0.0697, "step": 2487 }, { "epoch": 0.3924754505659187, "grad_norm": 4.09848690032959, "learning_rate": 6.200289808404443e-07, "loss": 0.0268, "step": 2488 }, { "epoch": 0.3926331979335095, "grad_norm": 4.453529357910156, "learning_rate": 6.198679761713089e-07, "loss": 0.0382, "step": 2489 }, { "epoch": 0.3927909453011003, "grad_norm": 7.759806156158447, "learning_rate": 6.197069715021736e-07, "loss": 0.0614, "step": 2490 }, { "epoch": 0.3929486926686911, "grad_norm": 4.390666961669922, "learning_rate": 6.195459668330382e-07, "loss": 0.064, "step": 2491 }, { "epoch": 0.39310644003628187, "grad_norm": 3.7275447845458984, "learning_rate": 6.193849621639027e-07, "loss": 0.0275, "step": 2492 }, { "epoch": 0.3932641874038727, "grad_norm": 3.584869384765625, "learning_rate": 6.192239574947673e-07, "loss": 0.0412, "step": 2493 }, { "epoch": 0.3934219347714635, "grad_norm": 2.2914278507232666, "learning_rate": 6.190629528256319e-07, "loss": 0.0227, "step": 2494 }, { "epoch": 0.3935796821390543, "grad_norm": 4.413097858428955, "learning_rate": 6.189019481564965e-07, "loss": 0.0276, "step": 2495 }, { "epoch": 0.3937374295066451, "grad_norm": 6.845357894897461, "learning_rate": 6.18740943487361e-07, "loss": 0.0717, "step": 2496 }, { "epoch": 0.3938951768742359, "grad_norm": 6.780287742614746, "learning_rate": 6.185799388182256e-07, "loss": 0.0354, "step": 2497 }, { "epoch": 0.3940529242418267, "grad_norm": 4.552005290985107, "learning_rate": 6.184189341490903e-07, "loss": 0.0259, "step": 2498 }, { "epoch": 0.39421067160941753, "grad_norm": 3.2987165451049805, "learning_rate": 6.18257929479955e-07, "loss": 0.0228, "step": 2499 }, { "epoch": 0.3943684189770083, "grad_norm": 8.419435501098633, "learning_rate": 6.180969248108195e-07, "loss": 0.0468, "step": 2500 }, { "epoch": 0.39452616634459914, "grad_norm": 3.106199026107788, "learning_rate": 6.179359201416841e-07, "loss": 0.0242, "step": 2501 }, { "epoch": 0.3946839137121899, "grad_norm": 6.093924045562744, "learning_rate": 6.177749154725487e-07, "loss": 0.0662, "step": 2502 }, { "epoch": 0.39484166107978075, "grad_norm": 7.137367248535156, "learning_rate": 6.176139108034133e-07, "loss": 0.0656, "step": 2503 }, { "epoch": 0.3949994084473715, "grad_norm": 2.205962657928467, "learning_rate": 6.174529061342779e-07, "loss": 0.0266, "step": 2504 }, { "epoch": 0.39515715581496236, "grad_norm": 5.25399923324585, "learning_rate": 6.172919014651424e-07, "loss": 0.0439, "step": 2505 }, { "epoch": 0.39531490318255313, "grad_norm": 3.3727121353149414, "learning_rate": 6.171308967960071e-07, "loss": 0.0462, "step": 2506 }, { "epoch": 0.39547265055014397, "grad_norm": 7.983486175537109, "learning_rate": 6.169698921268717e-07, "loss": 0.0919, "step": 2507 }, { "epoch": 0.39563039791773474, "grad_norm": 4.4629998207092285, "learning_rate": 6.168088874577363e-07, "loss": 0.0512, "step": 2508 }, { "epoch": 0.3957881452853256, "grad_norm": 6.60710334777832, "learning_rate": 6.166478827886008e-07, "loss": 0.06, "step": 2509 }, { "epoch": 0.39594589265291635, "grad_norm": 6.631765842437744, "learning_rate": 6.164868781194654e-07, "loss": 0.0604, "step": 2510 }, { "epoch": 0.39610364002050713, "grad_norm": 3.6337196826934814, "learning_rate": 6.1632587345033e-07, "loss": 0.0427, "step": 2511 }, { "epoch": 0.39626138738809796, "grad_norm": 3.338477849960327, "learning_rate": 6.161648687811946e-07, "loss": 0.0232, "step": 2512 }, { "epoch": 0.39641913475568874, "grad_norm": 2.910904884338379, "learning_rate": 6.160038641120592e-07, "loss": 0.0176, "step": 2513 }, { "epoch": 0.3965768821232796, "grad_norm": 3.373647689819336, "learning_rate": 6.158428594429238e-07, "loss": 0.0555, "step": 2514 }, { "epoch": 0.39673462949087035, "grad_norm": 7.375838756561279, "learning_rate": 6.156818547737884e-07, "loss": 0.0661, "step": 2515 }, { "epoch": 0.3968923768584612, "grad_norm": 7.225650310516357, "learning_rate": 6.15520850104653e-07, "loss": 0.0495, "step": 2516 }, { "epoch": 0.39705012422605196, "grad_norm": 3.6008362770080566, "learning_rate": 6.153598454355175e-07, "loss": 0.0244, "step": 2517 }, { "epoch": 0.3972078715936428, "grad_norm": 5.5542097091674805, "learning_rate": 6.151988407663822e-07, "loss": 0.0395, "step": 2518 }, { "epoch": 0.39736561896123357, "grad_norm": 7.228381156921387, "learning_rate": 6.150378360972468e-07, "loss": 0.0545, "step": 2519 }, { "epoch": 0.3975233663288244, "grad_norm": 3.6476194858551025, "learning_rate": 6.148768314281115e-07, "loss": 0.0329, "step": 2520 }, { "epoch": 0.3976811136964152, "grad_norm": 6.134904384613037, "learning_rate": 6.14715826758976e-07, "loss": 0.0756, "step": 2521 }, { "epoch": 0.397838861064006, "grad_norm": 5.202237129211426, "learning_rate": 6.145548220898406e-07, "loss": 0.0278, "step": 2522 }, { "epoch": 0.3979966084315968, "grad_norm": 2.656900644302368, "learning_rate": 6.143938174207052e-07, "loss": 0.0261, "step": 2523 }, { "epoch": 0.3981543557991876, "grad_norm": 7.788497447967529, "learning_rate": 6.142328127515698e-07, "loss": 0.1033, "step": 2524 }, { "epoch": 0.3983121031667784, "grad_norm": 5.942392349243164, "learning_rate": 6.140718080824344e-07, "loss": 0.0538, "step": 2525 }, { "epoch": 0.39846985053436923, "grad_norm": 6.155275344848633, "learning_rate": 6.139108034132989e-07, "loss": 0.025, "step": 2526 }, { "epoch": 0.39862759790196, "grad_norm": 6.200692653656006, "learning_rate": 6.137497987441635e-07, "loss": 0.031, "step": 2527 }, { "epoch": 0.39878534526955084, "grad_norm": 5.335763931274414, "learning_rate": 6.135887940750282e-07, "loss": 0.0535, "step": 2528 }, { "epoch": 0.3989430926371416, "grad_norm": 3.2778477668762207, "learning_rate": 6.134277894058928e-07, "loss": 0.0251, "step": 2529 }, { "epoch": 0.3991008400047324, "grad_norm": 5.646211624145508, "learning_rate": 6.132667847367573e-07, "loss": 0.0627, "step": 2530 }, { "epoch": 0.3992585873723232, "grad_norm": 5.945457935333252, "learning_rate": 6.131057800676219e-07, "loss": 0.0752, "step": 2531 }, { "epoch": 0.399416334739914, "grad_norm": 6.385833740234375, "learning_rate": 6.129447753984865e-07, "loss": 0.0846, "step": 2532 }, { "epoch": 0.39957408210750484, "grad_norm": 3.2346737384796143, "learning_rate": 6.127837707293511e-07, "loss": 0.0308, "step": 2533 }, { "epoch": 0.3997318294750956, "grad_norm": 6.200742721557617, "learning_rate": 6.126227660602156e-07, "loss": 0.0366, "step": 2534 }, { "epoch": 0.39988957684268644, "grad_norm": 4.204428195953369, "learning_rate": 6.124617613910802e-07, "loss": 0.0508, "step": 2535 }, { "epoch": 0.4000473242102772, "grad_norm": 3.469731330871582, "learning_rate": 6.12300756721945e-07, "loss": 0.0241, "step": 2536 }, { "epoch": 0.40020507157786805, "grad_norm": 4.995206356048584, "learning_rate": 6.121397520528096e-07, "loss": 0.0272, "step": 2537 }, { "epoch": 0.40036281894545883, "grad_norm": 4.293266296386719, "learning_rate": 6.119787473836741e-07, "loss": 0.0607, "step": 2538 }, { "epoch": 0.40052056631304966, "grad_norm": 3.424766778945923, "learning_rate": 6.118177427145387e-07, "loss": 0.0165, "step": 2539 }, { "epoch": 0.40067831368064044, "grad_norm": 10.498326301574707, "learning_rate": 6.116567380454033e-07, "loss": 0.0671, "step": 2540 }, { "epoch": 0.4008360610482313, "grad_norm": 6.873740196228027, "learning_rate": 6.114957333762679e-07, "loss": 0.0668, "step": 2541 }, { "epoch": 0.40099380841582205, "grad_norm": 10.00786018371582, "learning_rate": 6.113347287071324e-07, "loss": 0.051, "step": 2542 }, { "epoch": 0.4011515557834129, "grad_norm": 3.9746105670928955, "learning_rate": 6.111737240379971e-07, "loss": 0.033, "step": 2543 }, { "epoch": 0.40130930315100366, "grad_norm": 4.75880241394043, "learning_rate": 6.110127193688617e-07, "loss": 0.1254, "step": 2544 }, { "epoch": 0.4014670505185945, "grad_norm": 6.771790981292725, "learning_rate": 6.108517146997263e-07, "loss": 0.1063, "step": 2545 }, { "epoch": 0.40162479788618527, "grad_norm": 7.923023223876953, "learning_rate": 6.106907100305908e-07, "loss": 0.1006, "step": 2546 }, { "epoch": 0.4017825452537761, "grad_norm": 8.773269653320312, "learning_rate": 6.105297053614554e-07, "loss": 0.0558, "step": 2547 }, { "epoch": 0.4019402926213669, "grad_norm": 4.2924394607543945, "learning_rate": 6.1036870069232e-07, "loss": 0.0447, "step": 2548 }, { "epoch": 0.4020980399889577, "grad_norm": 5.82992696762085, "learning_rate": 6.102076960231846e-07, "loss": 0.052, "step": 2549 }, { "epoch": 0.4022557873565485, "grad_norm": 6.2320146560668945, "learning_rate": 6.100466913540493e-07, "loss": 0.0841, "step": 2550 }, { "epoch": 0.40241353472413927, "grad_norm": 2.127149820327759, "learning_rate": 6.098856866849138e-07, "loss": 0.0128, "step": 2551 }, { "epoch": 0.4025712820917301, "grad_norm": 6.342316150665283, "learning_rate": 6.097246820157785e-07, "loss": 0.0536, "step": 2552 }, { "epoch": 0.4027290294593209, "grad_norm": 3.6407859325408936, "learning_rate": 6.095636773466431e-07, "loss": 0.0214, "step": 2553 }, { "epoch": 0.4028867768269117, "grad_norm": 7.800268650054932, "learning_rate": 6.094026726775077e-07, "loss": 0.0891, "step": 2554 }, { "epoch": 0.4030445241945025, "grad_norm": 5.903280735015869, "learning_rate": 6.092416680083722e-07, "loss": 0.0631, "step": 2555 }, { "epoch": 0.4032022715620933, "grad_norm": 6.41977071762085, "learning_rate": 6.090806633392368e-07, "loss": 0.0454, "step": 2556 }, { "epoch": 0.4033600189296841, "grad_norm": 4.016145706176758, "learning_rate": 6.089196586701014e-07, "loss": 0.0651, "step": 2557 }, { "epoch": 0.4035177662972749, "grad_norm": 3.340258836746216, "learning_rate": 6.087586540009661e-07, "loss": 0.0342, "step": 2558 }, { "epoch": 0.4036755136648657, "grad_norm": 3.5221705436706543, "learning_rate": 6.085976493318306e-07, "loss": 0.0483, "step": 2559 }, { "epoch": 0.40383326103245654, "grad_norm": 5.156766414642334, "learning_rate": 6.084366446626952e-07, "loss": 0.0859, "step": 2560 }, { "epoch": 0.4039910084000473, "grad_norm": 7.001066207885742, "learning_rate": 6.082756399935598e-07, "loss": 0.0256, "step": 2561 }, { "epoch": 0.40414875576763815, "grad_norm": 6.314644813537598, "learning_rate": 6.081146353244244e-07, "loss": 0.0437, "step": 2562 }, { "epoch": 0.4043065031352289, "grad_norm": 4.919968128204346, "learning_rate": 6.079536306552889e-07, "loss": 0.0605, "step": 2563 }, { "epoch": 0.40446425050281976, "grad_norm": 2.208249092102051, "learning_rate": 6.077926259861535e-07, "loss": 0.0173, "step": 2564 }, { "epoch": 0.40462199787041053, "grad_norm": 5.167953014373779, "learning_rate": 6.076316213170181e-07, "loss": 0.0134, "step": 2565 }, { "epoch": 0.40477974523800136, "grad_norm": 4.2204508781433105, "learning_rate": 6.074706166478828e-07, "loss": 0.0451, "step": 2566 }, { "epoch": 0.40493749260559214, "grad_norm": 6.463189125061035, "learning_rate": 6.073096119787473e-07, "loss": 0.0691, "step": 2567 }, { "epoch": 0.405095239973183, "grad_norm": 5.235622406005859, "learning_rate": 6.071486073096119e-07, "loss": 0.0357, "step": 2568 }, { "epoch": 0.40525298734077375, "grad_norm": 5.960876941680908, "learning_rate": 6.069876026404765e-07, "loss": 0.0904, "step": 2569 }, { "epoch": 0.40541073470836453, "grad_norm": 6.063017845153809, "learning_rate": 6.068265979713412e-07, "loss": 0.0544, "step": 2570 }, { "epoch": 0.40556848207595536, "grad_norm": 7.733518600463867, "learning_rate": 6.066655933022058e-07, "loss": 0.0542, "step": 2571 }, { "epoch": 0.40572622944354614, "grad_norm": 4.373366355895996, "learning_rate": 6.065045886330703e-07, "loss": 0.0462, "step": 2572 }, { "epoch": 0.40588397681113697, "grad_norm": 5.196827411651611, "learning_rate": 6.06343583963935e-07, "loss": 0.0704, "step": 2573 }, { "epoch": 0.40604172417872775, "grad_norm": 6.645310401916504, "learning_rate": 6.061825792947996e-07, "loss": 0.0962, "step": 2574 }, { "epoch": 0.4061994715463186, "grad_norm": 5.274722576141357, "learning_rate": 6.060215746256642e-07, "loss": 0.0731, "step": 2575 }, { "epoch": 0.40635721891390936, "grad_norm": 6.908938884735107, "learning_rate": 6.058605699565287e-07, "loss": 0.1043, "step": 2576 }, { "epoch": 0.4065149662815002, "grad_norm": 5.653018951416016, "learning_rate": 6.056995652873933e-07, "loss": 0.0202, "step": 2577 }, { "epoch": 0.40667271364909097, "grad_norm": 6.999205589294434, "learning_rate": 6.055385606182579e-07, "loss": 0.0547, "step": 2578 }, { "epoch": 0.4068304610166818, "grad_norm": 4.579869270324707, "learning_rate": 6.053775559491225e-07, "loss": 0.0541, "step": 2579 }, { "epoch": 0.4069882083842726, "grad_norm": 9.910844802856445, "learning_rate": 6.05216551279987e-07, "loss": 0.0433, "step": 2580 }, { "epoch": 0.4071459557518634, "grad_norm": 4.5902838706970215, "learning_rate": 6.050555466108517e-07, "loss": 0.0377, "step": 2581 }, { "epoch": 0.4073037031194542, "grad_norm": 4.332019329071045, "learning_rate": 6.048945419417163e-07, "loss": 0.0365, "step": 2582 }, { "epoch": 0.407461450487045, "grad_norm": 3.858030319213867, "learning_rate": 6.047335372725809e-07, "loss": 0.048, "step": 2583 }, { "epoch": 0.4076191978546358, "grad_norm": 1.9246933460235596, "learning_rate": 6.045725326034454e-07, "loss": 0.0315, "step": 2584 }, { "epoch": 0.40777694522222663, "grad_norm": 4.131516456604004, "learning_rate": 6.0441152793431e-07, "loss": 0.0313, "step": 2585 }, { "epoch": 0.4079346925898174, "grad_norm": 15.895787239074707, "learning_rate": 6.042505232651746e-07, "loss": 0.0437, "step": 2586 }, { "epoch": 0.40809243995740824, "grad_norm": 3.734407424926758, "learning_rate": 6.040895185960392e-07, "loss": 0.0283, "step": 2587 }, { "epoch": 0.408250187324999, "grad_norm": 5.200179100036621, "learning_rate": 6.039285139269039e-07, "loss": 0.0238, "step": 2588 }, { "epoch": 0.4084079346925898, "grad_norm": 3.5924243927001953, "learning_rate": 6.037675092577685e-07, "loss": 0.0358, "step": 2589 }, { "epoch": 0.4085656820601806, "grad_norm": 3.209386110305786, "learning_rate": 6.036065045886331e-07, "loss": 0.0237, "step": 2590 }, { "epoch": 0.4087234294277714, "grad_norm": 3.4143080711364746, "learning_rate": 6.034454999194977e-07, "loss": 0.0161, "step": 2591 }, { "epoch": 0.40888117679536223, "grad_norm": 6.51207160949707, "learning_rate": 6.032844952503622e-07, "loss": 0.0492, "step": 2592 }, { "epoch": 0.409038924162953, "grad_norm": 9.512336730957031, "learning_rate": 6.031234905812268e-07, "loss": 0.0506, "step": 2593 }, { "epoch": 0.40919667153054384, "grad_norm": 4.175022602081299, "learning_rate": 6.029624859120914e-07, "loss": 0.0726, "step": 2594 }, { "epoch": 0.4093544188981346, "grad_norm": 3.06833553314209, "learning_rate": 6.02801481242956e-07, "loss": 0.0467, "step": 2595 }, { "epoch": 0.40951216626572545, "grad_norm": 4.335803508758545, "learning_rate": 6.026404765738207e-07, "loss": 0.053, "step": 2596 }, { "epoch": 0.40966991363331623, "grad_norm": 7.851492404937744, "learning_rate": 6.024794719046852e-07, "loss": 0.0855, "step": 2597 }, { "epoch": 0.40982766100090706, "grad_norm": 6.55589485168457, "learning_rate": 6.023184672355498e-07, "loss": 0.033, "step": 2598 }, { "epoch": 0.40998540836849784, "grad_norm": 2.5253841876983643, "learning_rate": 6.021574625664144e-07, "loss": 0.0139, "step": 2599 }, { "epoch": 0.41014315573608867, "grad_norm": 3.553863763809204, "learning_rate": 6.01996457897279e-07, "loss": 0.0537, "step": 2600 }, { "epoch": 0.41030090310367945, "grad_norm": 4.84967041015625, "learning_rate": 6.018354532281435e-07, "loss": 0.0793, "step": 2601 }, { "epoch": 0.4104586504712703, "grad_norm": 7.575808048248291, "learning_rate": 6.016744485590081e-07, "loss": 0.0572, "step": 2602 }, { "epoch": 0.41061639783886106, "grad_norm": 8.631465911865234, "learning_rate": 6.015134438898728e-07, "loss": 0.0718, "step": 2603 }, { "epoch": 0.4107741452064519, "grad_norm": 4.6790056228637695, "learning_rate": 6.013524392207374e-07, "loss": 0.0554, "step": 2604 }, { "epoch": 0.41093189257404267, "grad_norm": 4.117552280426025, "learning_rate": 6.01191434551602e-07, "loss": 0.0465, "step": 2605 }, { "epoch": 0.4110896399416335, "grad_norm": 6.920306205749512, "learning_rate": 6.010304298824666e-07, "loss": 0.0572, "step": 2606 }, { "epoch": 0.4112473873092243, "grad_norm": 3.1878037452697754, "learning_rate": 6.008694252133312e-07, "loss": 0.0251, "step": 2607 }, { "epoch": 0.41140513467681505, "grad_norm": 6.447464466094971, "learning_rate": 6.007084205441958e-07, "loss": 0.0467, "step": 2608 }, { "epoch": 0.4115628820444059, "grad_norm": 7.5503363609313965, "learning_rate": 6.005474158750603e-07, "loss": 0.0475, "step": 2609 }, { "epoch": 0.41172062941199666, "grad_norm": 4.07452917098999, "learning_rate": 6.003864112059249e-07, "loss": 0.0227, "step": 2610 }, { "epoch": 0.4118783767795875, "grad_norm": 5.6531524658203125, "learning_rate": 6.002254065367896e-07, "loss": 0.045, "step": 2611 }, { "epoch": 0.4120361241471783, "grad_norm": 4.197863578796387, "learning_rate": 6.000644018676542e-07, "loss": 0.0225, "step": 2612 }, { "epoch": 0.4121938715147691, "grad_norm": 5.841701030731201, "learning_rate": 5.999033971985187e-07, "loss": 0.0415, "step": 2613 }, { "epoch": 0.4123516188823599, "grad_norm": 3.9903364181518555, "learning_rate": 5.997423925293833e-07, "loss": 0.0393, "step": 2614 }, { "epoch": 0.4125093662499507, "grad_norm": 3.155518054962158, "learning_rate": 5.995813878602479e-07, "loss": 0.0145, "step": 2615 }, { "epoch": 0.4126671136175415, "grad_norm": 4.071169853210449, "learning_rate": 5.994203831911125e-07, "loss": 0.0456, "step": 2616 }, { "epoch": 0.4128248609851323, "grad_norm": 3.968475818634033, "learning_rate": 5.99259378521977e-07, "loss": 0.0245, "step": 2617 }, { "epoch": 0.4129826083527231, "grad_norm": 6.552299976348877, "learning_rate": 5.990983738528417e-07, "loss": 0.0944, "step": 2618 }, { "epoch": 0.41314035572031393, "grad_norm": 8.035197257995605, "learning_rate": 5.989373691837063e-07, "loss": 0.1221, "step": 2619 }, { "epoch": 0.4132981030879047, "grad_norm": 8.004145622253418, "learning_rate": 5.987763645145709e-07, "loss": 0.0546, "step": 2620 }, { "epoch": 0.41345585045549554, "grad_norm": 2.6789891719818115, "learning_rate": 5.986153598454355e-07, "loss": 0.0318, "step": 2621 }, { "epoch": 0.4136135978230863, "grad_norm": 5.315359115600586, "learning_rate": 5.984543551763e-07, "loss": 0.0578, "step": 2622 }, { "epoch": 0.41377134519067715, "grad_norm": 4.650211811065674, "learning_rate": 5.982933505071647e-07, "loss": 0.067, "step": 2623 }, { "epoch": 0.41392909255826793, "grad_norm": 4.35648250579834, "learning_rate": 5.981323458380293e-07, "loss": 0.0275, "step": 2624 }, { "epoch": 0.41408683992585876, "grad_norm": 6.478522300720215, "learning_rate": 5.979713411688939e-07, "loss": 0.0403, "step": 2625 }, { "epoch": 0.41424458729344954, "grad_norm": 5.901090621948242, "learning_rate": 5.978103364997585e-07, "loss": 0.0356, "step": 2626 }, { "epoch": 0.4144023346610403, "grad_norm": 12.616568565368652, "learning_rate": 5.976493318306231e-07, "loss": 0.0691, "step": 2627 }, { "epoch": 0.41456008202863115, "grad_norm": 5.331027984619141, "learning_rate": 5.974883271614877e-07, "loss": 0.0284, "step": 2628 }, { "epoch": 0.4147178293962219, "grad_norm": 5.250926494598389, "learning_rate": 5.973273224923523e-07, "loss": 0.0418, "step": 2629 }, { "epoch": 0.41487557676381276, "grad_norm": 4.558555603027344, "learning_rate": 5.971663178232168e-07, "loss": 0.0379, "step": 2630 }, { "epoch": 0.41503332413140354, "grad_norm": 5.668402671813965, "learning_rate": 5.970053131540814e-07, "loss": 0.0452, "step": 2631 }, { "epoch": 0.41519107149899437, "grad_norm": 5.460912227630615, "learning_rate": 5.96844308484946e-07, "loss": 0.0203, "step": 2632 }, { "epoch": 0.41534881886658515, "grad_norm": 5.2398271560668945, "learning_rate": 5.966833038158107e-07, "loss": 0.0538, "step": 2633 }, { "epoch": 0.415506566234176, "grad_norm": 11.468836784362793, "learning_rate": 5.965222991466752e-07, "loss": 0.0935, "step": 2634 }, { "epoch": 0.41566431360176676, "grad_norm": 7.559725761413574, "learning_rate": 5.963612944775398e-07, "loss": 0.0673, "step": 2635 }, { "epoch": 0.4158220609693576, "grad_norm": 5.004055976867676, "learning_rate": 5.962002898084044e-07, "loss": 0.0191, "step": 2636 }, { "epoch": 0.41597980833694836, "grad_norm": 4.78782320022583, "learning_rate": 5.96039285139269e-07, "loss": 0.0261, "step": 2637 }, { "epoch": 0.4161375557045392, "grad_norm": 8.144420623779297, "learning_rate": 5.958782804701335e-07, "loss": 0.0584, "step": 2638 }, { "epoch": 0.41629530307213, "grad_norm": 3.625196695327759, "learning_rate": 5.957172758009981e-07, "loss": 0.0481, "step": 2639 }, { "epoch": 0.4164530504397208, "grad_norm": 4.784508228302002, "learning_rate": 5.955562711318627e-07, "loss": 0.0469, "step": 2640 }, { "epoch": 0.4166107978073116, "grad_norm": 5.008841514587402, "learning_rate": 5.953952664627275e-07, "loss": 0.0434, "step": 2641 }, { "epoch": 0.4167685451749024, "grad_norm": 4.575802803039551, "learning_rate": 5.952342617935921e-07, "loss": 0.0395, "step": 2642 }, { "epoch": 0.4169262925424932, "grad_norm": 4.045619964599609, "learning_rate": 5.950732571244566e-07, "loss": 0.0249, "step": 2643 }, { "epoch": 0.417084039910084, "grad_norm": 4.226224422454834, "learning_rate": 5.949122524553212e-07, "loss": 0.0365, "step": 2644 }, { "epoch": 0.4172417872776748, "grad_norm": 9.37044906616211, "learning_rate": 5.947512477861858e-07, "loss": 0.0638, "step": 2645 }, { "epoch": 0.4173995346452656, "grad_norm": 6.7245001792907715, "learning_rate": 5.945902431170504e-07, "loss": 0.0299, "step": 2646 }, { "epoch": 0.4175572820128564, "grad_norm": 9.082123756408691, "learning_rate": 5.944292384479149e-07, "loss": 0.0668, "step": 2647 }, { "epoch": 0.4177150293804472, "grad_norm": 7.415581226348877, "learning_rate": 5.942682337787796e-07, "loss": 0.1205, "step": 2648 }, { "epoch": 0.417872776748038, "grad_norm": 2.4082369804382324, "learning_rate": 5.941072291096442e-07, "loss": 0.0129, "step": 2649 }, { "epoch": 0.4180305241156288, "grad_norm": 8.189739227294922, "learning_rate": 5.939462244405088e-07, "loss": 0.1089, "step": 2650 }, { "epoch": 0.41818827148321963, "grad_norm": 4.564749240875244, "learning_rate": 5.937852197713733e-07, "loss": 0.061, "step": 2651 }, { "epoch": 0.4183460188508104, "grad_norm": 2.6857526302337646, "learning_rate": 5.936242151022379e-07, "loss": 0.0141, "step": 2652 }, { "epoch": 0.41850376621840124, "grad_norm": 5.747800827026367, "learning_rate": 5.934632104331025e-07, "loss": 0.0419, "step": 2653 }, { "epoch": 0.418661513585992, "grad_norm": 3.823982000350952, "learning_rate": 5.933022057639671e-07, "loss": 0.0235, "step": 2654 }, { "epoch": 0.41881926095358285, "grad_norm": 6.821854591369629, "learning_rate": 5.931412010948316e-07, "loss": 0.057, "step": 2655 }, { "epoch": 0.4189770083211736, "grad_norm": 9.907840728759766, "learning_rate": 5.929801964256963e-07, "loss": 0.0684, "step": 2656 }, { "epoch": 0.41913475568876446, "grad_norm": 6.711484909057617, "learning_rate": 5.92819191756561e-07, "loss": 0.0456, "step": 2657 }, { "epoch": 0.41929250305635524, "grad_norm": 7.317268371582031, "learning_rate": 5.926581870874256e-07, "loss": 0.0809, "step": 2658 }, { "epoch": 0.41945025042394607, "grad_norm": 6.277600288391113, "learning_rate": 5.924971824182901e-07, "loss": 0.057, "step": 2659 }, { "epoch": 0.41960799779153685, "grad_norm": 5.619283676147461, "learning_rate": 5.923361777491547e-07, "loss": 0.0484, "step": 2660 }, { "epoch": 0.4197657451591277, "grad_norm": 6.379857540130615, "learning_rate": 5.921751730800193e-07, "loss": 0.0595, "step": 2661 }, { "epoch": 0.41992349252671846, "grad_norm": 8.349502563476562, "learning_rate": 5.920141684108839e-07, "loss": 0.1024, "step": 2662 }, { "epoch": 0.4200812398943093, "grad_norm": 6.646510601043701, "learning_rate": 5.918531637417485e-07, "loss": 0.0624, "step": 2663 }, { "epoch": 0.42023898726190007, "grad_norm": 10.572782516479492, "learning_rate": 5.916921590726131e-07, "loss": 0.1357, "step": 2664 }, { "epoch": 0.4203967346294909, "grad_norm": 6.063112735748291, "learning_rate": 5.915311544034777e-07, "loss": 0.0458, "step": 2665 }, { "epoch": 0.4205544819970817, "grad_norm": 3.0055274963378906, "learning_rate": 5.913701497343423e-07, "loss": 0.0322, "step": 2666 }, { "epoch": 0.42071222936467245, "grad_norm": 4.2171711921691895, "learning_rate": 5.912091450652069e-07, "loss": 0.0556, "step": 2667 }, { "epoch": 0.4208699767322633, "grad_norm": 8.459450721740723, "learning_rate": 5.910481403960714e-07, "loss": 0.0401, "step": 2668 }, { "epoch": 0.42102772409985406, "grad_norm": 2.972092866897583, "learning_rate": 5.90887135726936e-07, "loss": 0.0295, "step": 2669 }, { "epoch": 0.4211854714674449, "grad_norm": 7.867188930511475, "learning_rate": 5.907261310578006e-07, "loss": 0.07, "step": 2670 }, { "epoch": 0.42134321883503567, "grad_norm": 5.135982513427734, "learning_rate": 5.905651263886653e-07, "loss": 0.0521, "step": 2671 }, { "epoch": 0.4215009662026265, "grad_norm": 4.097264766693115, "learning_rate": 5.904041217195298e-07, "loss": 0.034, "step": 2672 }, { "epoch": 0.4216587135702173, "grad_norm": 3.2581069469451904, "learning_rate": 5.902431170503944e-07, "loss": 0.031, "step": 2673 }, { "epoch": 0.4218164609378081, "grad_norm": 4.152832508087158, "learning_rate": 5.90082112381259e-07, "loss": 0.0388, "step": 2674 }, { "epoch": 0.4219742083053989, "grad_norm": 7.619700908660889, "learning_rate": 5.899211077121237e-07, "loss": 0.0503, "step": 2675 }, { "epoch": 0.4221319556729897, "grad_norm": 3.360625743865967, "learning_rate": 5.897601030429882e-07, "loss": 0.0218, "step": 2676 }, { "epoch": 0.4222897030405805, "grad_norm": 4.949860572814941, "learning_rate": 5.895990983738528e-07, "loss": 0.0276, "step": 2677 }, { "epoch": 0.42244745040817133, "grad_norm": 5.772740840911865, "learning_rate": 5.894380937047175e-07, "loss": 0.0352, "step": 2678 }, { "epoch": 0.4226051977757621, "grad_norm": 4.35075569152832, "learning_rate": 5.892770890355821e-07, "loss": 0.0484, "step": 2679 }, { "epoch": 0.42276294514335294, "grad_norm": 2.8517985343933105, "learning_rate": 5.891160843664466e-07, "loss": 0.0601, "step": 2680 }, { "epoch": 0.4229206925109437, "grad_norm": 2.550802230834961, "learning_rate": 5.889550796973112e-07, "loss": 0.0134, "step": 2681 }, { "epoch": 0.42307843987853455, "grad_norm": 3.6672005653381348, "learning_rate": 5.887940750281758e-07, "loss": 0.0518, "step": 2682 }, { "epoch": 0.42323618724612533, "grad_norm": 6.912148475646973, "learning_rate": 5.886330703590404e-07, "loss": 0.0822, "step": 2683 }, { "epoch": 0.42339393461371616, "grad_norm": 5.84259033203125, "learning_rate": 5.884720656899049e-07, "loss": 0.0667, "step": 2684 }, { "epoch": 0.42355168198130694, "grad_norm": 4.002005100250244, "learning_rate": 5.883110610207695e-07, "loss": 0.0494, "step": 2685 }, { "epoch": 0.4237094293488977, "grad_norm": 5.479091167449951, "learning_rate": 5.881500563516342e-07, "loss": 0.0369, "step": 2686 }, { "epoch": 0.42386717671648855, "grad_norm": 4.5175652503967285, "learning_rate": 5.879890516824988e-07, "loss": 0.0381, "step": 2687 }, { "epoch": 0.4240249240840793, "grad_norm": 3.294260025024414, "learning_rate": 5.878280470133634e-07, "loss": 0.0269, "step": 2688 }, { "epoch": 0.42418267145167016, "grad_norm": 3.613359212875366, "learning_rate": 5.876670423442279e-07, "loss": 0.0457, "step": 2689 }, { "epoch": 0.42434041881926093, "grad_norm": 5.717574596405029, "learning_rate": 5.875060376750925e-07, "loss": 0.0528, "step": 2690 }, { "epoch": 0.42449816618685177, "grad_norm": 3.467639207839966, "learning_rate": 5.873450330059571e-07, "loss": 0.0303, "step": 2691 }, { "epoch": 0.42465591355444254, "grad_norm": 35.10895538330078, "learning_rate": 5.871840283368217e-07, "loss": 0.0447, "step": 2692 }, { "epoch": 0.4248136609220334, "grad_norm": 7.386102676391602, "learning_rate": 5.870230236676864e-07, "loss": 0.0437, "step": 2693 }, { "epoch": 0.42497140828962415, "grad_norm": 4.7817816734313965, "learning_rate": 5.86862018998551e-07, "loss": 0.0501, "step": 2694 }, { "epoch": 0.425129155657215, "grad_norm": 6.132143974304199, "learning_rate": 5.867010143294156e-07, "loss": 0.0751, "step": 2695 }, { "epoch": 0.42528690302480576, "grad_norm": 4.20089054107666, "learning_rate": 5.865400096602802e-07, "loss": 0.0579, "step": 2696 }, { "epoch": 0.4254446503923966, "grad_norm": 1.796299695968628, "learning_rate": 5.863790049911447e-07, "loss": 0.0098, "step": 2697 }, { "epoch": 0.42560239775998737, "grad_norm": 3.0523252487182617, "learning_rate": 5.862180003220093e-07, "loss": 0.024, "step": 2698 }, { "epoch": 0.4257601451275782, "grad_norm": 4.074103355407715, "learning_rate": 5.860569956528739e-07, "loss": 0.0611, "step": 2699 }, { "epoch": 0.425917892495169, "grad_norm": 7.769861221313477, "learning_rate": 5.858959909837385e-07, "loss": 0.1272, "step": 2700 }, { "epoch": 0.4260756398627598, "grad_norm": 6.271566867828369, "learning_rate": 5.857349863146031e-07, "loss": 0.0559, "step": 2701 }, { "epoch": 0.4262333872303506, "grad_norm": 6.888360977172852, "learning_rate": 5.855739816454677e-07, "loss": 0.0598, "step": 2702 }, { "epoch": 0.4263911345979414, "grad_norm": 8.045308113098145, "learning_rate": 5.854129769763323e-07, "loss": 0.0598, "step": 2703 }, { "epoch": 0.4265488819655322, "grad_norm": 5.838111877441406, "learning_rate": 5.852519723071969e-07, "loss": 0.051, "step": 2704 }, { "epoch": 0.426706629333123, "grad_norm": 7.2040557861328125, "learning_rate": 5.850909676380614e-07, "loss": 0.0525, "step": 2705 }, { "epoch": 0.4268643767007138, "grad_norm": 3.799769878387451, "learning_rate": 5.84929962968926e-07, "loss": 0.0349, "step": 2706 }, { "epoch": 0.4270221240683046, "grad_norm": 3.744792938232422, "learning_rate": 5.847689582997906e-07, "loss": 0.0325, "step": 2707 }, { "epoch": 0.4271798714358954, "grad_norm": 7.798043727874756, "learning_rate": 5.846079536306553e-07, "loss": 0.0347, "step": 2708 }, { "epoch": 0.4273376188034862, "grad_norm": 4.0187764167785645, "learning_rate": 5.844469489615198e-07, "loss": 0.0298, "step": 2709 }, { "epoch": 0.42749536617107703, "grad_norm": 8.834007263183594, "learning_rate": 5.842859442923844e-07, "loss": 0.034, "step": 2710 }, { "epoch": 0.4276531135386678, "grad_norm": 7.793360710144043, "learning_rate": 5.841249396232491e-07, "loss": 0.0674, "step": 2711 }, { "epoch": 0.42781086090625864, "grad_norm": 1.7781180143356323, "learning_rate": 5.839639349541137e-07, "loss": 0.0082, "step": 2712 }, { "epoch": 0.4279686082738494, "grad_norm": 6.2372941970825195, "learning_rate": 5.838029302849783e-07, "loss": 0.0319, "step": 2713 }, { "epoch": 0.42812635564144025, "grad_norm": 24.141183853149414, "learning_rate": 5.836419256158428e-07, "loss": 0.0079, "step": 2714 }, { "epoch": 0.428284103009031, "grad_norm": 13.051074028015137, "learning_rate": 5.834809209467074e-07, "loss": 0.0286, "step": 2715 }, { "epoch": 0.42844185037662186, "grad_norm": 11.537595748901367, "learning_rate": 5.833199162775721e-07, "loss": 0.0876, "step": 2716 }, { "epoch": 0.42859959774421263, "grad_norm": 3.638516664505005, "learning_rate": 5.831589116084367e-07, "loss": 0.0311, "step": 2717 }, { "epoch": 0.42875734511180347, "grad_norm": 8.744967460632324, "learning_rate": 5.829979069393012e-07, "loss": 0.0528, "step": 2718 }, { "epoch": 0.42891509247939424, "grad_norm": 12.257424354553223, "learning_rate": 5.828369022701658e-07, "loss": 0.0731, "step": 2719 }, { "epoch": 0.4290728398469851, "grad_norm": 6.0300116539001465, "learning_rate": 5.826758976010304e-07, "loss": 0.0724, "step": 2720 }, { "epoch": 0.42923058721457585, "grad_norm": 8.850271224975586, "learning_rate": 5.82514892931895e-07, "loss": 0.0757, "step": 2721 }, { "epoch": 0.4293883345821667, "grad_norm": 8.085939407348633, "learning_rate": 5.823538882627595e-07, "loss": 0.0785, "step": 2722 }, { "epoch": 0.42954608194975746, "grad_norm": 2.521941900253296, "learning_rate": 5.821928835936242e-07, "loss": 0.0375, "step": 2723 }, { "epoch": 0.42970382931734824, "grad_norm": 4.878946781158447, "learning_rate": 5.820318789244888e-07, "loss": 0.0276, "step": 2724 }, { "epoch": 0.4298615766849391, "grad_norm": 5.004683494567871, "learning_rate": 5.818708742553534e-07, "loss": 0.0587, "step": 2725 }, { "epoch": 0.43001932405252985, "grad_norm": 3.161196231842041, "learning_rate": 5.817098695862179e-07, "loss": 0.0352, "step": 2726 }, { "epoch": 0.4301770714201207, "grad_norm": 4.550516128540039, "learning_rate": 5.815488649170825e-07, "loss": 0.0362, "step": 2727 }, { "epoch": 0.43033481878771146, "grad_norm": 3.5981874465942383, "learning_rate": 5.813878602479471e-07, "loss": 0.0324, "step": 2728 }, { "epoch": 0.4304925661553023, "grad_norm": 7.343104839324951, "learning_rate": 5.812268555788118e-07, "loss": 0.0488, "step": 2729 }, { "epoch": 0.43065031352289307, "grad_norm": 4.300084114074707, "learning_rate": 5.810658509096763e-07, "loss": 0.064, "step": 2730 }, { "epoch": 0.4308080608904839, "grad_norm": 6.5787153244018555, "learning_rate": 5.80904846240541e-07, "loss": 0.0865, "step": 2731 }, { "epoch": 0.4309658082580747, "grad_norm": 6.296169281005859, "learning_rate": 5.807438415714056e-07, "loss": 0.0759, "step": 2732 }, { "epoch": 0.4311235556256655, "grad_norm": 4.846346378326416, "learning_rate": 5.805828369022702e-07, "loss": 0.0779, "step": 2733 }, { "epoch": 0.4312813029932563, "grad_norm": 3.2218270301818848, "learning_rate": 5.804218322331348e-07, "loss": 0.0375, "step": 2734 }, { "epoch": 0.4314390503608471, "grad_norm": 5.525953769683838, "learning_rate": 5.802608275639993e-07, "loss": 0.0571, "step": 2735 }, { "epoch": 0.4315967977284379, "grad_norm": 6.556750774383545, "learning_rate": 5.800998228948639e-07, "loss": 0.0584, "step": 2736 }, { "epoch": 0.43175454509602873, "grad_norm": 4.103269577026367, "learning_rate": 5.799388182257285e-07, "loss": 0.0402, "step": 2737 }, { "epoch": 0.4319122924636195, "grad_norm": 6.050023078918457, "learning_rate": 5.797778135565932e-07, "loss": 0.0632, "step": 2738 }, { "epoch": 0.43207003983121034, "grad_norm": 3.4731900691986084, "learning_rate": 5.796168088874577e-07, "loss": 0.0248, "step": 2739 }, { "epoch": 0.4322277871988011, "grad_norm": 10.451546669006348, "learning_rate": 5.794558042183223e-07, "loss": 0.0933, "step": 2740 }, { "epoch": 0.43238553456639195, "grad_norm": 3.39365291595459, "learning_rate": 5.792947995491869e-07, "loss": 0.0462, "step": 2741 }, { "epoch": 0.4325432819339827, "grad_norm": 3.792912483215332, "learning_rate": 5.791337948800515e-07, "loss": 0.0165, "step": 2742 }, { "epoch": 0.4327010293015735, "grad_norm": 2.6316964626312256, "learning_rate": 5.78972790210916e-07, "loss": 0.0131, "step": 2743 }, { "epoch": 0.43285877666916434, "grad_norm": 6.959858417510986, "learning_rate": 5.788117855417806e-07, "loss": 0.0567, "step": 2744 }, { "epoch": 0.4330165240367551, "grad_norm": 10.351690292358398, "learning_rate": 5.786507808726452e-07, "loss": 0.0672, "step": 2745 }, { "epoch": 0.43317427140434595, "grad_norm": 5.9646830558776855, "learning_rate": 5.7848977620351e-07, "loss": 0.0437, "step": 2746 }, { "epoch": 0.4333320187719367, "grad_norm": 5.861454010009766, "learning_rate": 5.783287715343745e-07, "loss": 0.0524, "step": 2747 }, { "epoch": 0.43348976613952755, "grad_norm": 8.58069896697998, "learning_rate": 5.781677668652391e-07, "loss": 0.0347, "step": 2748 }, { "epoch": 0.43364751350711833, "grad_norm": 5.208137035369873, "learning_rate": 5.780067621961037e-07, "loss": 0.0622, "step": 2749 }, { "epoch": 0.43380526087470916, "grad_norm": 3.031008243560791, "learning_rate": 5.778457575269683e-07, "loss": 0.0423, "step": 2750 }, { "epoch": 0.43396300824229994, "grad_norm": 6.715963840484619, "learning_rate": 5.776847528578328e-07, "loss": 0.0508, "step": 2751 }, { "epoch": 0.4341207556098908, "grad_norm": 3.5705153942108154, "learning_rate": 5.775237481886974e-07, "loss": 0.0146, "step": 2752 }, { "epoch": 0.43427850297748155, "grad_norm": 4.775838375091553, "learning_rate": 5.773627435195621e-07, "loss": 0.1016, "step": 2753 }, { "epoch": 0.4344362503450724, "grad_norm": 6.519713401794434, "learning_rate": 5.772017388504267e-07, "loss": 0.0768, "step": 2754 }, { "epoch": 0.43459399771266316, "grad_norm": 12.248930931091309, "learning_rate": 5.770407341812912e-07, "loss": 0.0351, "step": 2755 }, { "epoch": 0.434751745080254, "grad_norm": 2.580695152282715, "learning_rate": 5.768797295121558e-07, "loss": 0.0195, "step": 2756 }, { "epoch": 0.43490949244784477, "grad_norm": 9.897636413574219, "learning_rate": 5.767187248430204e-07, "loss": 0.0939, "step": 2757 }, { "epoch": 0.4350672398154356, "grad_norm": 26.89352798461914, "learning_rate": 5.76557720173885e-07, "loss": 0.0545, "step": 2758 }, { "epoch": 0.4352249871830264, "grad_norm": 4.561646461486816, "learning_rate": 5.763967155047496e-07, "loss": 0.0573, "step": 2759 }, { "epoch": 0.4353827345506172, "grad_norm": 2.932121753692627, "learning_rate": 5.762357108356141e-07, "loss": 0.03, "step": 2760 }, { "epoch": 0.435540481918208, "grad_norm": 7.722933769226074, "learning_rate": 5.760747061664788e-07, "loss": 0.0513, "step": 2761 }, { "epoch": 0.43569822928579877, "grad_norm": 4.556098937988281, "learning_rate": 5.759137014973434e-07, "loss": 0.0715, "step": 2762 }, { "epoch": 0.4358559766533896, "grad_norm": 5.859991073608398, "learning_rate": 5.757526968282081e-07, "loss": 0.071, "step": 2763 }, { "epoch": 0.4360137240209804, "grad_norm": 7.509644508361816, "learning_rate": 5.755916921590726e-07, "loss": 0.084, "step": 2764 }, { "epoch": 0.4361714713885712, "grad_norm": 7.722994804382324, "learning_rate": 5.754306874899372e-07, "loss": 0.0973, "step": 2765 }, { "epoch": 0.436329218756162, "grad_norm": 6.072455883026123, "learning_rate": 5.752696828208018e-07, "loss": 0.025, "step": 2766 }, { "epoch": 0.4364869661237528, "grad_norm": 6.797508239746094, "learning_rate": 5.751086781516664e-07, "loss": 0.0842, "step": 2767 }, { "epoch": 0.4366447134913436, "grad_norm": 10.333138465881348, "learning_rate": 5.74947673482531e-07, "loss": 0.1186, "step": 2768 }, { "epoch": 0.4368024608589344, "grad_norm": 6.689957141876221, "learning_rate": 5.747866688133956e-07, "loss": 0.0428, "step": 2769 }, { "epoch": 0.4369602082265252, "grad_norm": 5.772156715393066, "learning_rate": 5.746256641442602e-07, "loss": 0.0744, "step": 2770 }, { "epoch": 0.43711795559411604, "grad_norm": 7.585948944091797, "learning_rate": 5.744646594751248e-07, "loss": 0.0448, "step": 2771 }, { "epoch": 0.4372757029617068, "grad_norm": 4.13621187210083, "learning_rate": 5.743036548059893e-07, "loss": 0.0339, "step": 2772 }, { "epoch": 0.43743345032929765, "grad_norm": 7.188488483428955, "learning_rate": 5.741426501368539e-07, "loss": 0.0341, "step": 2773 }, { "epoch": 0.4375911976968884, "grad_norm": 2.731147289276123, "learning_rate": 5.739816454677185e-07, "loss": 0.0304, "step": 2774 }, { "epoch": 0.43774894506447926, "grad_norm": 4.252033710479736, "learning_rate": 5.738206407985831e-07, "loss": 0.0355, "step": 2775 }, { "epoch": 0.43790669243207003, "grad_norm": 11.173271179199219, "learning_rate": 5.736596361294477e-07, "loss": 0.0743, "step": 2776 }, { "epoch": 0.43806443979966087, "grad_norm": 5.05621337890625, "learning_rate": 5.734986314603123e-07, "loss": 0.0661, "step": 2777 }, { "epoch": 0.43822218716725164, "grad_norm": 7.741921901702881, "learning_rate": 5.733376267911769e-07, "loss": 0.0532, "step": 2778 }, { "epoch": 0.4383799345348425, "grad_norm": 3.764737367630005, "learning_rate": 5.731766221220415e-07, "loss": 0.0505, "step": 2779 }, { "epoch": 0.43853768190243325, "grad_norm": 4.446591854095459, "learning_rate": 5.730156174529061e-07, "loss": 0.0296, "step": 2780 }, { "epoch": 0.43869542927002403, "grad_norm": 7.5737833976745605, "learning_rate": 5.728546127837706e-07, "loss": 0.0662, "step": 2781 }, { "epoch": 0.43885317663761486, "grad_norm": 6.038362503051758, "learning_rate": 5.726936081146353e-07, "loss": 0.0429, "step": 2782 }, { "epoch": 0.43901092400520564, "grad_norm": 10.07390308380127, "learning_rate": 5.725326034455e-07, "loss": 0.0397, "step": 2783 }, { "epoch": 0.43916867137279647, "grad_norm": 3.347057819366455, "learning_rate": 5.723715987763646e-07, "loss": 0.0326, "step": 2784 }, { "epoch": 0.43932641874038725, "grad_norm": 3.8456451892852783, "learning_rate": 5.722105941072291e-07, "loss": 0.0722, "step": 2785 }, { "epoch": 0.4394841661079781, "grad_norm": 5.601995944976807, "learning_rate": 5.720495894380937e-07, "loss": 0.0328, "step": 2786 }, { "epoch": 0.43964191347556886, "grad_norm": 7.750946521759033, "learning_rate": 5.718885847689583e-07, "loss": 0.073, "step": 2787 }, { "epoch": 0.4397996608431597, "grad_norm": 3.2933003902435303, "learning_rate": 5.717275800998229e-07, "loss": 0.0139, "step": 2788 }, { "epoch": 0.43995740821075047, "grad_norm": 5.042623519897461, "learning_rate": 5.715665754306874e-07, "loss": 0.0895, "step": 2789 }, { "epoch": 0.4401151555783413, "grad_norm": 2.9454407691955566, "learning_rate": 5.71405570761552e-07, "loss": 0.0171, "step": 2790 }, { "epoch": 0.4402729029459321, "grad_norm": 5.160466194152832, "learning_rate": 5.712445660924167e-07, "loss": 0.0472, "step": 2791 }, { "epoch": 0.4404306503135229, "grad_norm": 3.523851156234741, "learning_rate": 5.710835614232813e-07, "loss": 0.0342, "step": 2792 }, { "epoch": 0.4405883976811137, "grad_norm": 4.827638626098633, "learning_rate": 5.709225567541458e-07, "loss": 0.0385, "step": 2793 }, { "epoch": 0.4407461450487045, "grad_norm": 5.04912805557251, "learning_rate": 5.707615520850104e-07, "loss": 0.0793, "step": 2794 }, { "epoch": 0.4409038924162953, "grad_norm": 6.973541259765625, "learning_rate": 5.70600547415875e-07, "loss": 0.0738, "step": 2795 }, { "epoch": 0.44106163978388613, "grad_norm": 5.731598377227783, "learning_rate": 5.704395427467396e-07, "loss": 0.0474, "step": 2796 }, { "epoch": 0.4412193871514769, "grad_norm": 4.190987586975098, "learning_rate": 5.702785380776041e-07, "loss": 0.0313, "step": 2797 }, { "epoch": 0.44137713451906774, "grad_norm": 3.4082014560699463, "learning_rate": 5.701175334084689e-07, "loss": 0.0202, "step": 2798 }, { "epoch": 0.4415348818866585, "grad_norm": 5.786175727844238, "learning_rate": 5.699565287393335e-07, "loss": 0.0601, "step": 2799 }, { "epoch": 0.44169262925424935, "grad_norm": 4.239172458648682, "learning_rate": 5.697955240701981e-07, "loss": 0.0689, "step": 2800 }, { "epoch": 0.4418503766218401, "grad_norm": 3.3087871074676514, "learning_rate": 5.696345194010626e-07, "loss": 0.0495, "step": 2801 }, { "epoch": 0.4420081239894309, "grad_norm": 6.40525484085083, "learning_rate": 5.694735147319272e-07, "loss": 0.0477, "step": 2802 }, { "epoch": 0.44216587135702173, "grad_norm": 5.296397686004639, "learning_rate": 5.693125100627918e-07, "loss": 0.0669, "step": 2803 }, { "epoch": 0.4423236187246125, "grad_norm": 8.928070068359375, "learning_rate": 5.691515053936564e-07, "loss": 0.0561, "step": 2804 }, { "epoch": 0.44248136609220334, "grad_norm": 3.2695870399475098, "learning_rate": 5.68990500724521e-07, "loss": 0.0434, "step": 2805 }, { "epoch": 0.4426391134597941, "grad_norm": 4.327723979949951, "learning_rate": 5.688294960553856e-07, "loss": 0.0267, "step": 2806 }, { "epoch": 0.44279686082738495, "grad_norm": 4.668676853179932, "learning_rate": 5.686684913862502e-07, "loss": 0.0436, "step": 2807 }, { "epoch": 0.44295460819497573, "grad_norm": 4.603848457336426, "learning_rate": 5.685074867171148e-07, "loss": 0.0389, "step": 2808 }, { "epoch": 0.44311235556256656, "grad_norm": 5.100075721740723, "learning_rate": 5.683464820479794e-07, "loss": 0.0364, "step": 2809 }, { "epoch": 0.44327010293015734, "grad_norm": 2.836162567138672, "learning_rate": 5.681854773788439e-07, "loss": 0.0326, "step": 2810 }, { "epoch": 0.44342785029774817, "grad_norm": 2.471791982650757, "learning_rate": 5.680244727097085e-07, "loss": 0.0176, "step": 2811 }, { "epoch": 0.44358559766533895, "grad_norm": 2.9325854778289795, "learning_rate": 5.678634680405731e-07, "loss": 0.0227, "step": 2812 }, { "epoch": 0.4437433450329298, "grad_norm": 2.0947155952453613, "learning_rate": 5.677024633714378e-07, "loss": 0.0172, "step": 2813 }, { "epoch": 0.44390109240052056, "grad_norm": 4.812472343444824, "learning_rate": 5.675414587023023e-07, "loss": 0.0524, "step": 2814 }, { "epoch": 0.4440588397681114, "grad_norm": 3.985248565673828, "learning_rate": 5.673804540331669e-07, "loss": 0.058, "step": 2815 }, { "epoch": 0.44421658713570217, "grad_norm": 3.3267018795013428, "learning_rate": 5.672194493640316e-07, "loss": 0.0511, "step": 2816 }, { "epoch": 0.444374334503293, "grad_norm": 13.52624225616455, "learning_rate": 5.670584446948962e-07, "loss": 0.0704, "step": 2817 }, { "epoch": 0.4445320818708838, "grad_norm": 3.9268667697906494, "learning_rate": 5.668974400257607e-07, "loss": 0.0291, "step": 2818 }, { "epoch": 0.4446898292384746, "grad_norm": 6.879584789276123, "learning_rate": 5.667364353566253e-07, "loss": 0.0462, "step": 2819 }, { "epoch": 0.4448475766060654, "grad_norm": 14.13100528717041, "learning_rate": 5.665754306874899e-07, "loss": 0.1236, "step": 2820 }, { "epoch": 0.44500532397365616, "grad_norm": 4.642329692840576, "learning_rate": 5.664144260183546e-07, "loss": 0.0347, "step": 2821 }, { "epoch": 0.445163071341247, "grad_norm": 9.75646686553955, "learning_rate": 5.662534213492191e-07, "loss": 0.0472, "step": 2822 }, { "epoch": 0.4453208187088378, "grad_norm": 2.149906635284424, "learning_rate": 5.660924166800837e-07, "loss": 0.0131, "step": 2823 }, { "epoch": 0.4454785660764286, "grad_norm": 3.265413761138916, "learning_rate": 5.659314120109483e-07, "loss": 0.034, "step": 2824 }, { "epoch": 0.4456363134440194, "grad_norm": 8.077981948852539, "learning_rate": 5.657704073418129e-07, "loss": 0.0458, "step": 2825 }, { "epoch": 0.4457940608116102, "grad_norm": 2.121990919113159, "learning_rate": 5.656094026726774e-07, "loss": 0.0165, "step": 2826 }, { "epoch": 0.445951808179201, "grad_norm": 2.8277318477630615, "learning_rate": 5.65448398003542e-07, "loss": 0.0289, "step": 2827 }, { "epoch": 0.4461095555467918, "grad_norm": 5.254321098327637, "learning_rate": 5.652873933344067e-07, "loss": 0.0265, "step": 2828 }, { "epoch": 0.4462673029143826, "grad_norm": 6.698361396789551, "learning_rate": 5.651263886652713e-07, "loss": 0.0487, "step": 2829 }, { "epoch": 0.44642505028197343, "grad_norm": 4.19152307510376, "learning_rate": 5.649653839961359e-07, "loss": 0.0237, "step": 2830 }, { "epoch": 0.4465827976495642, "grad_norm": 5.102362155914307, "learning_rate": 5.648043793270004e-07, "loss": 0.0231, "step": 2831 }, { "epoch": 0.44674054501715504, "grad_norm": 5.332268238067627, "learning_rate": 5.64643374657865e-07, "loss": 0.0409, "step": 2832 }, { "epoch": 0.4468982923847458, "grad_norm": 4.980330467224121, "learning_rate": 5.644823699887296e-07, "loss": 0.0329, "step": 2833 }, { "epoch": 0.44705603975233665, "grad_norm": 7.554039478302002, "learning_rate": 5.643213653195943e-07, "loss": 0.0562, "step": 2834 }, { "epoch": 0.44721378711992743, "grad_norm": 3.8478784561157227, "learning_rate": 5.641603606504588e-07, "loss": 0.0334, "step": 2835 }, { "epoch": 0.44737153448751826, "grad_norm": 5.280252933502197, "learning_rate": 5.639993559813235e-07, "loss": 0.0533, "step": 2836 }, { "epoch": 0.44752928185510904, "grad_norm": 7.56386661529541, "learning_rate": 5.638383513121881e-07, "loss": 0.0215, "step": 2837 }, { "epoch": 0.4476870292226999, "grad_norm": 5.807894229888916, "learning_rate": 5.636773466430527e-07, "loss": 0.0957, "step": 2838 }, { "epoch": 0.44784477659029065, "grad_norm": 5.296294689178467, "learning_rate": 5.635163419739172e-07, "loss": 0.0531, "step": 2839 }, { "epoch": 0.4480025239578814, "grad_norm": 4.388183116912842, "learning_rate": 5.633553373047818e-07, "loss": 0.0389, "step": 2840 }, { "epoch": 0.44816027132547226, "grad_norm": 4.946773052215576, "learning_rate": 5.631943326356464e-07, "loss": 0.0606, "step": 2841 }, { "epoch": 0.44831801869306304, "grad_norm": 4.141297340393066, "learning_rate": 5.63033327966511e-07, "loss": 0.0792, "step": 2842 }, { "epoch": 0.44847576606065387, "grad_norm": 4.016687870025635, "learning_rate": 5.628723232973755e-07, "loss": 0.0392, "step": 2843 }, { "epoch": 0.44863351342824465, "grad_norm": 6.216261863708496, "learning_rate": 5.627113186282402e-07, "loss": 0.0651, "step": 2844 }, { "epoch": 0.4487912607958355, "grad_norm": 2.2858870029449463, "learning_rate": 5.625503139591048e-07, "loss": 0.0208, "step": 2845 }, { "epoch": 0.44894900816342626, "grad_norm": 10.226709365844727, "learning_rate": 5.623893092899694e-07, "loss": 0.0896, "step": 2846 }, { "epoch": 0.4491067555310171, "grad_norm": 2.742591142654419, "learning_rate": 5.622283046208339e-07, "loss": 0.0343, "step": 2847 }, { "epoch": 0.44926450289860786, "grad_norm": 6.542262554168701, "learning_rate": 5.620672999516985e-07, "loss": 0.0589, "step": 2848 }, { "epoch": 0.4494222502661987, "grad_norm": 5.806894779205322, "learning_rate": 5.619062952825631e-07, "loss": 0.0275, "step": 2849 }, { "epoch": 0.4495799976337895, "grad_norm": 4.3196234703063965, "learning_rate": 5.617452906134277e-07, "loss": 0.075, "step": 2850 }, { "epoch": 0.4497377450013803, "grad_norm": 8.437948226928711, "learning_rate": 5.615842859442925e-07, "loss": 0.0619, "step": 2851 }, { "epoch": 0.4498954923689711, "grad_norm": 5.742941379547119, "learning_rate": 5.61423281275157e-07, "loss": 0.0528, "step": 2852 }, { "epoch": 0.4500532397365619, "grad_norm": 4.836208343505859, "learning_rate": 5.612622766060216e-07, "loss": 0.0452, "step": 2853 }, { "epoch": 0.4502109871041527, "grad_norm": 8.764130592346191, "learning_rate": 5.611012719368862e-07, "loss": 0.0533, "step": 2854 }, { "epoch": 0.4503687344717435, "grad_norm": 7.907119274139404, "learning_rate": 5.609402672677508e-07, "loss": 0.0899, "step": 2855 }, { "epoch": 0.4505264818393343, "grad_norm": 3.403799533843994, "learning_rate": 5.607792625986153e-07, "loss": 0.0305, "step": 2856 }, { "epoch": 0.45068422920692514, "grad_norm": 5.970966815948486, "learning_rate": 5.606182579294799e-07, "loss": 0.0641, "step": 2857 }, { "epoch": 0.4508419765745159, "grad_norm": 8.332094192504883, "learning_rate": 5.604572532603446e-07, "loss": 0.0651, "step": 2858 }, { "epoch": 0.4509997239421067, "grad_norm": 3.923067092895508, "learning_rate": 5.602962485912092e-07, "loss": 0.0323, "step": 2859 }, { "epoch": 0.4511574713096975, "grad_norm": 6.301210880279541, "learning_rate": 5.601352439220737e-07, "loss": 0.049, "step": 2860 }, { "epoch": 0.4513152186772883, "grad_norm": 2.444934129714966, "learning_rate": 5.599742392529383e-07, "loss": 0.0126, "step": 2861 }, { "epoch": 0.45147296604487913, "grad_norm": 5.900997638702393, "learning_rate": 5.598132345838029e-07, "loss": 0.1177, "step": 2862 }, { "epoch": 0.4516307134124699, "grad_norm": 8.015467643737793, "learning_rate": 5.596522299146675e-07, "loss": 0.0777, "step": 2863 }, { "epoch": 0.45178846078006074, "grad_norm": 2.3847241401672363, "learning_rate": 5.59491225245532e-07, "loss": 0.0204, "step": 2864 }, { "epoch": 0.4519462081476515, "grad_norm": 7.526757717132568, "learning_rate": 5.593302205763966e-07, "loss": 0.0971, "step": 2865 }, { "epoch": 0.45210395551524235, "grad_norm": 5.946081638336182, "learning_rate": 5.591692159072613e-07, "loss": 0.0577, "step": 2866 }, { "epoch": 0.4522617028828331, "grad_norm": 6.8733439445495605, "learning_rate": 5.590082112381259e-07, "loss": 0.1003, "step": 2867 }, { "epoch": 0.45241945025042396, "grad_norm": 4.146086692810059, "learning_rate": 5.588472065689904e-07, "loss": 0.0449, "step": 2868 }, { "epoch": 0.45257719761801474, "grad_norm": 3.731523036956787, "learning_rate": 5.58686201899855e-07, "loss": 0.0298, "step": 2869 }, { "epoch": 0.45273494498560557, "grad_norm": 5.817784309387207, "learning_rate": 5.585251972307197e-07, "loss": 0.0903, "step": 2870 }, { "epoch": 0.45289269235319635, "grad_norm": 8.980182647705078, "learning_rate": 5.583641925615843e-07, "loss": 0.0368, "step": 2871 }, { "epoch": 0.4530504397207872, "grad_norm": 5.589970111846924, "learning_rate": 5.582031878924488e-07, "loss": 0.0631, "step": 2872 }, { "epoch": 0.45320818708837796, "grad_norm": 3.7583048343658447, "learning_rate": 5.580421832233134e-07, "loss": 0.0333, "step": 2873 }, { "epoch": 0.4533659344559688, "grad_norm": 4.043829917907715, "learning_rate": 5.578811785541781e-07, "loss": 0.0355, "step": 2874 }, { "epoch": 0.45352368182355957, "grad_norm": 3.1130857467651367, "learning_rate": 5.577201738850427e-07, "loss": 0.0241, "step": 2875 }, { "epoch": 0.4536814291911504, "grad_norm": 4.681988716125488, "learning_rate": 5.575591692159073e-07, "loss": 0.0582, "step": 2876 }, { "epoch": 0.4538391765587412, "grad_norm": 2.468484401702881, "learning_rate": 5.573981645467718e-07, "loss": 0.0468, "step": 2877 }, { "epoch": 0.45399692392633195, "grad_norm": 8.368632316589355, "learning_rate": 5.572371598776364e-07, "loss": 0.0586, "step": 2878 }, { "epoch": 0.4541546712939228, "grad_norm": 5.181319713592529, "learning_rate": 5.57076155208501e-07, "loss": 0.0387, "step": 2879 }, { "epoch": 0.45431241866151356, "grad_norm": 5.580580711364746, "learning_rate": 5.569151505393656e-07, "loss": 0.0583, "step": 2880 }, { "epoch": 0.4544701660291044, "grad_norm": 3.7161035537719727, "learning_rate": 5.567541458702302e-07, "loss": 0.016, "step": 2881 }, { "epoch": 0.45462791339669517, "grad_norm": 6.218023777008057, "learning_rate": 5.565931412010948e-07, "loss": 0.0725, "step": 2882 }, { "epoch": 0.454785660764286, "grad_norm": 6.219332695007324, "learning_rate": 5.564321365319594e-07, "loss": 0.0468, "step": 2883 }, { "epoch": 0.4549434081318768, "grad_norm": 4.914670467376709, "learning_rate": 5.56271131862824e-07, "loss": 0.0206, "step": 2884 }, { "epoch": 0.4551011554994676, "grad_norm": 4.154045581817627, "learning_rate": 5.561101271936885e-07, "loss": 0.0351, "step": 2885 }, { "epoch": 0.4552589028670584, "grad_norm": 2.9888126850128174, "learning_rate": 5.559491225245531e-07, "loss": 0.0342, "step": 2886 }, { "epoch": 0.4554166502346492, "grad_norm": 5.366695880889893, "learning_rate": 5.557881178554178e-07, "loss": 0.0649, "step": 2887 }, { "epoch": 0.45557439760224, "grad_norm": 7.164974689483643, "learning_rate": 5.556271131862825e-07, "loss": 0.0316, "step": 2888 }, { "epoch": 0.45573214496983083, "grad_norm": 3.472078323364258, "learning_rate": 5.55466108517147e-07, "loss": 0.0455, "step": 2889 }, { "epoch": 0.4558898923374216, "grad_norm": 5.037097930908203, "learning_rate": 5.553051038480116e-07, "loss": 0.0471, "step": 2890 }, { "epoch": 0.45604763970501244, "grad_norm": 3.8302767276763916, "learning_rate": 5.551440991788762e-07, "loss": 0.0178, "step": 2891 }, { "epoch": 0.4562053870726032, "grad_norm": 5.1232218742370605, "learning_rate": 5.549830945097408e-07, "loss": 0.0493, "step": 2892 }, { "epoch": 0.45636313444019405, "grad_norm": 7.466702461242676, "learning_rate": 5.548220898406053e-07, "loss": 0.0394, "step": 2893 }, { "epoch": 0.45652088180778483, "grad_norm": 5.414917945861816, "learning_rate": 5.546610851714699e-07, "loss": 0.0427, "step": 2894 }, { "epoch": 0.45667862917537566, "grad_norm": 4.600940227508545, "learning_rate": 5.545000805023345e-07, "loss": 0.0606, "step": 2895 }, { "epoch": 0.45683637654296644, "grad_norm": 7.288424491882324, "learning_rate": 5.543390758331992e-07, "loss": 0.1097, "step": 2896 }, { "epoch": 0.4569941239105572, "grad_norm": 3.1181650161743164, "learning_rate": 5.541780711640638e-07, "loss": 0.0246, "step": 2897 }, { "epoch": 0.45715187127814805, "grad_norm": 8.209742546081543, "learning_rate": 5.540170664949283e-07, "loss": 0.0713, "step": 2898 }, { "epoch": 0.4573096186457388, "grad_norm": 7.193093299865723, "learning_rate": 5.538560618257929e-07, "loss": 0.0545, "step": 2899 }, { "epoch": 0.45746736601332966, "grad_norm": 3.752289295196533, "learning_rate": 5.536950571566575e-07, "loss": 0.0217, "step": 2900 }, { "epoch": 0.45762511338092043, "grad_norm": 4.784655570983887, "learning_rate": 5.535340524875221e-07, "loss": 0.0609, "step": 2901 }, { "epoch": 0.45778286074851127, "grad_norm": 5.997508525848389, "learning_rate": 5.533730478183866e-07, "loss": 0.0772, "step": 2902 }, { "epoch": 0.45794060811610204, "grad_norm": 4.745800018310547, "learning_rate": 5.532120431492512e-07, "loss": 0.0283, "step": 2903 }, { "epoch": 0.4580983554836929, "grad_norm": 3.1995737552642822, "learning_rate": 5.53051038480116e-07, "loss": 0.0358, "step": 2904 }, { "epoch": 0.45825610285128365, "grad_norm": 4.824692249298096, "learning_rate": 5.528900338109806e-07, "loss": 0.0268, "step": 2905 }, { "epoch": 0.4584138502188745, "grad_norm": 3.300462007522583, "learning_rate": 5.527290291418451e-07, "loss": 0.029, "step": 2906 }, { "epoch": 0.45857159758646526, "grad_norm": 4.107739448547363, "learning_rate": 5.525680244727097e-07, "loss": 0.0452, "step": 2907 }, { "epoch": 0.4587293449540561, "grad_norm": 4.978026390075684, "learning_rate": 5.524070198035743e-07, "loss": 0.0862, "step": 2908 }, { "epoch": 0.45888709232164687, "grad_norm": 5.25403356552124, "learning_rate": 5.522460151344389e-07, "loss": 0.0919, "step": 2909 }, { "epoch": 0.4590448396892377, "grad_norm": 3.5655996799468994, "learning_rate": 5.520850104653034e-07, "loss": 0.0295, "step": 2910 }, { "epoch": 0.4592025870568285, "grad_norm": 3.7482802867889404, "learning_rate": 5.519240057961681e-07, "loss": 0.0835, "step": 2911 }, { "epoch": 0.4593603344244193, "grad_norm": 9.63411808013916, "learning_rate": 5.517630011270327e-07, "loss": 0.0399, "step": 2912 }, { "epoch": 0.4595180817920101, "grad_norm": 2.809464931488037, "learning_rate": 5.516019964578973e-07, "loss": 0.0347, "step": 2913 }, { "epoch": 0.4596758291596009, "grad_norm": 4.120750904083252, "learning_rate": 5.514409917887618e-07, "loss": 0.0421, "step": 2914 }, { "epoch": 0.4598335765271917, "grad_norm": 6.6847615242004395, "learning_rate": 5.512799871196264e-07, "loss": 0.0666, "step": 2915 }, { "epoch": 0.45999132389478253, "grad_norm": 40.42054748535156, "learning_rate": 5.51118982450491e-07, "loss": 0.0489, "step": 2916 }, { "epoch": 0.4601490712623733, "grad_norm": 8.054227828979492, "learning_rate": 5.509579777813556e-07, "loss": 0.0498, "step": 2917 }, { "epoch": 0.4603068186299641, "grad_norm": 5.607865333557129, "learning_rate": 5.507969731122201e-07, "loss": 0.0602, "step": 2918 }, { "epoch": 0.4604645659975549, "grad_norm": 5.769576072692871, "learning_rate": 5.506359684430848e-07, "loss": 0.0322, "step": 2919 }, { "epoch": 0.4606223133651457, "grad_norm": 6.5281901359558105, "learning_rate": 5.504749637739494e-07, "loss": 0.0606, "step": 2920 }, { "epoch": 0.46078006073273653, "grad_norm": 4.875824451446533, "learning_rate": 5.50313959104814e-07, "loss": 0.0408, "step": 2921 }, { "epoch": 0.4609378081003273, "grad_norm": 11.520488739013672, "learning_rate": 5.501529544356787e-07, "loss": 0.061, "step": 2922 }, { "epoch": 0.46109555546791814, "grad_norm": 8.879790306091309, "learning_rate": 5.499919497665432e-07, "loss": 0.0576, "step": 2923 }, { "epoch": 0.4612533028355089, "grad_norm": 4.364360809326172, "learning_rate": 5.498309450974078e-07, "loss": 0.0545, "step": 2924 }, { "epoch": 0.46141105020309975, "grad_norm": 4.412820339202881, "learning_rate": 5.496699404282724e-07, "loss": 0.0885, "step": 2925 }, { "epoch": 0.4615687975706905, "grad_norm": 3.2861177921295166, "learning_rate": 5.495089357591371e-07, "loss": 0.0113, "step": 2926 }, { "epoch": 0.46172654493828136, "grad_norm": 7.449189186096191, "learning_rate": 5.493479310900016e-07, "loss": 0.0354, "step": 2927 }, { "epoch": 0.46188429230587214, "grad_norm": 10.700511932373047, "learning_rate": 5.491869264208662e-07, "loss": 0.0757, "step": 2928 }, { "epoch": 0.46204203967346297, "grad_norm": 7.325472831726074, "learning_rate": 5.490259217517308e-07, "loss": 0.0699, "step": 2929 }, { "epoch": 0.46219978704105374, "grad_norm": 2.8952784538269043, "learning_rate": 5.488649170825954e-07, "loss": 0.0302, "step": 2930 }, { "epoch": 0.4623575344086446, "grad_norm": 3.699751615524292, "learning_rate": 5.487039124134599e-07, "loss": 0.0282, "step": 2931 }, { "epoch": 0.46251528177623535, "grad_norm": 5.765395641326904, "learning_rate": 5.485429077443245e-07, "loss": 0.0356, "step": 2932 }, { "epoch": 0.4626730291438262, "grad_norm": 8.634234428405762, "learning_rate": 5.483819030751891e-07, "loss": 0.0492, "step": 2933 }, { "epoch": 0.46283077651141696, "grad_norm": 3.448993444442749, "learning_rate": 5.482208984060538e-07, "loss": 0.0394, "step": 2934 }, { "epoch": 0.4629885238790078, "grad_norm": 5.2671613693237305, "learning_rate": 5.480598937369183e-07, "loss": 0.0296, "step": 2935 }, { "epoch": 0.4631462712465986, "grad_norm": 4.191190242767334, "learning_rate": 5.478988890677829e-07, "loss": 0.0207, "step": 2936 }, { "epoch": 0.46330401861418935, "grad_norm": 3.1752512454986572, "learning_rate": 5.477378843986475e-07, "loss": 0.0424, "step": 2937 }, { "epoch": 0.4634617659817802, "grad_norm": 4.166040420532227, "learning_rate": 5.475768797295121e-07, "loss": 0.0471, "step": 2938 }, { "epoch": 0.46361951334937096, "grad_norm": 4.05285120010376, "learning_rate": 5.474158750603766e-07, "loss": 0.0403, "step": 2939 }, { "epoch": 0.4637772607169618, "grad_norm": 6.6873459815979, "learning_rate": 5.472548703912413e-07, "loss": 0.0365, "step": 2940 }, { "epoch": 0.46393500808455257, "grad_norm": 9.0433988571167, "learning_rate": 5.47093865722106e-07, "loss": 0.0514, "step": 2941 }, { "epoch": 0.4640927554521434, "grad_norm": 11.118558883666992, "learning_rate": 5.469328610529706e-07, "loss": 0.059, "step": 2942 }, { "epoch": 0.4642505028197342, "grad_norm": 3.9807562828063965, "learning_rate": 5.467718563838352e-07, "loss": 0.0517, "step": 2943 }, { "epoch": 0.464408250187325, "grad_norm": 5.029097080230713, "learning_rate": 5.466108517146997e-07, "loss": 0.0534, "step": 2944 }, { "epoch": 0.4645659975549158, "grad_norm": 4.308042526245117, "learning_rate": 5.464498470455643e-07, "loss": 0.0603, "step": 2945 }, { "epoch": 0.4647237449225066, "grad_norm": 4.594994068145752, "learning_rate": 5.462888423764289e-07, "loss": 0.0369, "step": 2946 }, { "epoch": 0.4648814922900974, "grad_norm": 5.747838020324707, "learning_rate": 5.461278377072935e-07, "loss": 0.0466, "step": 2947 }, { "epoch": 0.46503923965768823, "grad_norm": 7.009786128997803, "learning_rate": 5.45966833038158e-07, "loss": 0.0876, "step": 2948 }, { "epoch": 0.465196987025279, "grad_norm": 7.380682468414307, "learning_rate": 5.458058283690227e-07, "loss": 0.0836, "step": 2949 }, { "epoch": 0.46535473439286984, "grad_norm": 5.737110614776611, "learning_rate": 5.456448236998873e-07, "loss": 0.0674, "step": 2950 }, { "epoch": 0.4655124817604606, "grad_norm": 5.259807586669922, "learning_rate": 5.454838190307519e-07, "loss": 0.0371, "step": 2951 }, { "epoch": 0.46567022912805145, "grad_norm": 6.416214466094971, "learning_rate": 5.453228143616164e-07, "loss": 0.0218, "step": 2952 }, { "epoch": 0.4658279764956422, "grad_norm": 6.674959659576416, "learning_rate": 5.45161809692481e-07, "loss": 0.0538, "step": 2953 }, { "epoch": 0.46598572386323306, "grad_norm": 4.255826473236084, "learning_rate": 5.450008050233456e-07, "loss": 0.0544, "step": 2954 }, { "epoch": 0.46614347123082384, "grad_norm": 6.422432899475098, "learning_rate": 5.448398003542102e-07, "loss": 0.0384, "step": 2955 }, { "epoch": 0.4663012185984146, "grad_norm": 5.41008996963501, "learning_rate": 5.446787956850748e-07, "loss": 0.0451, "step": 2956 }, { "epoch": 0.46645896596600545, "grad_norm": 7.205507278442383, "learning_rate": 5.445177910159395e-07, "loss": 0.0911, "step": 2957 }, { "epoch": 0.4666167133335962, "grad_norm": 6.411682605743408, "learning_rate": 5.443567863468041e-07, "loss": 0.0752, "step": 2958 }, { "epoch": 0.46677446070118706, "grad_norm": 2.7243564128875732, "learning_rate": 5.441957816776687e-07, "loss": 0.0226, "step": 2959 }, { "epoch": 0.46693220806877783, "grad_norm": 6.036257266998291, "learning_rate": 5.440347770085332e-07, "loss": 0.0283, "step": 2960 }, { "epoch": 0.46708995543636866, "grad_norm": 6.654210567474365, "learning_rate": 5.438737723393978e-07, "loss": 0.0515, "step": 2961 }, { "epoch": 0.46724770280395944, "grad_norm": 2.447610855102539, "learning_rate": 5.437127676702624e-07, "loss": 0.0234, "step": 2962 }, { "epoch": 0.4674054501715503, "grad_norm": 4.649781703948975, "learning_rate": 5.43551763001127e-07, "loss": 0.0442, "step": 2963 }, { "epoch": 0.46756319753914105, "grad_norm": 5.978750228881836, "learning_rate": 5.433907583319916e-07, "loss": 0.0762, "step": 2964 }, { "epoch": 0.4677209449067319, "grad_norm": 4.587388038635254, "learning_rate": 5.432297536628562e-07, "loss": 0.0646, "step": 2965 }, { "epoch": 0.46787869227432266, "grad_norm": 5.262384414672852, "learning_rate": 5.430687489937208e-07, "loss": 0.1034, "step": 2966 }, { "epoch": 0.4680364396419135, "grad_norm": 5.238274097442627, "learning_rate": 5.429077443245854e-07, "loss": 0.0515, "step": 2967 }, { "epoch": 0.46819418700950427, "grad_norm": 6.681858062744141, "learning_rate": 5.4274673965545e-07, "loss": 0.0602, "step": 2968 }, { "epoch": 0.4683519343770951, "grad_norm": 6.393606662750244, "learning_rate": 5.425857349863145e-07, "loss": 0.0282, "step": 2969 }, { "epoch": 0.4685096817446859, "grad_norm": 7.102097988128662, "learning_rate": 5.424247303171791e-07, "loss": 0.0544, "step": 2970 }, { "epoch": 0.4686674291122767, "grad_norm": 4.880553722381592, "learning_rate": 5.422637256480438e-07, "loss": 0.0459, "step": 2971 }, { "epoch": 0.4688251764798675, "grad_norm": 3.7544023990631104, "learning_rate": 5.421027209789084e-07, "loss": 0.0378, "step": 2972 }, { "epoch": 0.4689829238474583, "grad_norm": 6.994016170501709, "learning_rate": 5.419417163097729e-07, "loss": 0.0429, "step": 2973 }, { "epoch": 0.4691406712150491, "grad_norm": 5.7882080078125, "learning_rate": 5.417807116406376e-07, "loss": 0.0392, "step": 2974 }, { "epoch": 0.4692984185826399, "grad_norm": 5.978911876678467, "learning_rate": 5.416197069715022e-07, "loss": 0.0514, "step": 2975 }, { "epoch": 0.4694561659502307, "grad_norm": 4.241064071655273, "learning_rate": 5.414587023023668e-07, "loss": 0.0513, "step": 2976 }, { "epoch": 0.4696139133178215, "grad_norm": 3.8420863151550293, "learning_rate": 5.412976976332313e-07, "loss": 0.0757, "step": 2977 }, { "epoch": 0.4697716606854123, "grad_norm": 8.296415328979492, "learning_rate": 5.411366929640959e-07, "loss": 0.0469, "step": 2978 }, { "epoch": 0.4699294080530031, "grad_norm": 3.1178243160247803, "learning_rate": 5.409756882949606e-07, "loss": 0.0351, "step": 2979 }, { "epoch": 0.4700871554205939, "grad_norm": 3.7371997833251953, "learning_rate": 5.408146836258252e-07, "loss": 0.0506, "step": 2980 }, { "epoch": 0.4702449027881847, "grad_norm": 6.004145622253418, "learning_rate": 5.406536789566897e-07, "loss": 0.0451, "step": 2981 }, { "epoch": 0.47040265015577554, "grad_norm": 4.889219760894775, "learning_rate": 5.404926742875543e-07, "loss": 0.0768, "step": 2982 }, { "epoch": 0.4705603975233663, "grad_norm": 4.599617958068848, "learning_rate": 5.403316696184189e-07, "loss": 0.0357, "step": 2983 }, { "epoch": 0.47071814489095715, "grad_norm": 2.7061350345611572, "learning_rate": 5.401706649492835e-07, "loss": 0.0362, "step": 2984 }, { "epoch": 0.4708758922585479, "grad_norm": 4.982329368591309, "learning_rate": 5.40009660280148e-07, "loss": 0.0432, "step": 2985 }, { "epoch": 0.47103363962613876, "grad_norm": 6.297840595245361, "learning_rate": 5.398486556110127e-07, "loss": 0.0526, "step": 2986 }, { "epoch": 0.47119138699372953, "grad_norm": 6.057762622833252, "learning_rate": 5.396876509418773e-07, "loss": 0.0927, "step": 2987 }, { "epoch": 0.47134913436132037, "grad_norm": 2.123640775680542, "learning_rate": 5.395266462727419e-07, "loss": 0.0295, "step": 2988 }, { "epoch": 0.47150688172891114, "grad_norm": 5.117542743682861, "learning_rate": 5.393656416036064e-07, "loss": 0.0397, "step": 2989 }, { "epoch": 0.471664629096502, "grad_norm": 10.117819786071777, "learning_rate": 5.39204636934471e-07, "loss": 0.072, "step": 2990 }, { "epoch": 0.47182237646409275, "grad_norm": 6.594187259674072, "learning_rate": 5.390436322653356e-07, "loss": 0.0768, "step": 2991 }, { "epoch": 0.4719801238316836, "grad_norm": 4.6469902992248535, "learning_rate": 5.388826275962003e-07, "loss": 0.0366, "step": 2992 }, { "epoch": 0.47213787119927436, "grad_norm": 5.465667247772217, "learning_rate": 5.387216229270649e-07, "loss": 0.0386, "step": 2993 }, { "epoch": 0.47229561856686514, "grad_norm": 9.858426094055176, "learning_rate": 5.385606182579295e-07, "loss": 0.0833, "step": 2994 }, { "epoch": 0.47245336593445597, "grad_norm": 9.21403694152832, "learning_rate": 5.383996135887941e-07, "loss": 0.081, "step": 2995 }, { "epoch": 0.47261111330204675, "grad_norm": 6.342504978179932, "learning_rate": 5.382386089196587e-07, "loss": 0.0432, "step": 2996 }, { "epoch": 0.4727688606696376, "grad_norm": 3.481053352355957, "learning_rate": 5.380776042505233e-07, "loss": 0.0196, "step": 2997 }, { "epoch": 0.47292660803722836, "grad_norm": 7.552100658416748, "learning_rate": 5.379165995813878e-07, "loss": 0.058, "step": 2998 }, { "epoch": 0.4730843554048192, "grad_norm": 5.199467658996582, "learning_rate": 5.377555949122524e-07, "loss": 0.0364, "step": 2999 }, { "epoch": 0.47324210277240997, "grad_norm": 5.026432991027832, "learning_rate": 5.37594590243117e-07, "loss": 0.0343, "step": 3000 }, { "epoch": 0.4733998501400008, "grad_norm": 4.435891151428223, "learning_rate": 5.374335855739817e-07, "loss": 0.0389, "step": 3001 }, { "epoch": 0.4735575975075916, "grad_norm": 3.941455364227295, "learning_rate": 5.372725809048462e-07, "loss": 0.0467, "step": 3002 }, { "epoch": 0.4737153448751824, "grad_norm": 3.924062728881836, "learning_rate": 5.371115762357108e-07, "loss": 0.0469, "step": 3003 }, { "epoch": 0.4738730922427732, "grad_norm": 4.199190139770508, "learning_rate": 5.369505715665754e-07, "loss": 0.0437, "step": 3004 }, { "epoch": 0.474030839610364, "grad_norm": 6.40954065322876, "learning_rate": 5.3678956689744e-07, "loss": 0.0552, "step": 3005 }, { "epoch": 0.4741885869779548, "grad_norm": 4.79932975769043, "learning_rate": 5.366285622283045e-07, "loss": 0.0377, "step": 3006 }, { "epoch": 0.47434633434554563, "grad_norm": 6.0069684982299805, "learning_rate": 5.364675575591691e-07, "loss": 0.0373, "step": 3007 }, { "epoch": 0.4745040817131364, "grad_norm": 3.9204750061035156, "learning_rate": 5.363065528900337e-07, "loss": 0.02, "step": 3008 }, { "epoch": 0.47466182908072724, "grad_norm": 6.374783515930176, "learning_rate": 5.361455482208985e-07, "loss": 0.0774, "step": 3009 }, { "epoch": 0.474819576448318, "grad_norm": 2.727414608001709, "learning_rate": 5.35984543551763e-07, "loss": 0.0176, "step": 3010 }, { "epoch": 0.47497732381590885, "grad_norm": 11.19305419921875, "learning_rate": 5.358235388826276e-07, "loss": 0.0782, "step": 3011 }, { "epoch": 0.4751350711834996, "grad_norm": 3.4844961166381836, "learning_rate": 5.356625342134922e-07, "loss": 0.0546, "step": 3012 }, { "epoch": 0.4752928185510904, "grad_norm": 10.611740112304688, "learning_rate": 5.355015295443568e-07, "loss": 0.0442, "step": 3013 }, { "epoch": 0.47545056591868123, "grad_norm": 4.291416168212891, "learning_rate": 5.353405248752214e-07, "loss": 0.0392, "step": 3014 }, { "epoch": 0.475608313286272, "grad_norm": 6.187273025512695, "learning_rate": 5.351795202060859e-07, "loss": 0.0579, "step": 3015 }, { "epoch": 0.47576606065386284, "grad_norm": 7.718811511993408, "learning_rate": 5.350185155369506e-07, "loss": 0.0708, "step": 3016 }, { "epoch": 0.4759238080214536, "grad_norm": 4.694831848144531, "learning_rate": 5.348575108678152e-07, "loss": 0.0477, "step": 3017 }, { "epoch": 0.47608155538904445, "grad_norm": 3.8272786140441895, "learning_rate": 5.346965061986798e-07, "loss": 0.0349, "step": 3018 }, { "epoch": 0.47623930275663523, "grad_norm": 3.613823175430298, "learning_rate": 5.345355015295443e-07, "loss": 0.0891, "step": 3019 }, { "epoch": 0.47639705012422606, "grad_norm": 3.9235403537750244, "learning_rate": 5.343744968604089e-07, "loss": 0.0155, "step": 3020 }, { "epoch": 0.47655479749181684, "grad_norm": 9.554059028625488, "learning_rate": 5.342134921912735e-07, "loss": 0.0535, "step": 3021 }, { "epoch": 0.47671254485940767, "grad_norm": 3.8005659580230713, "learning_rate": 5.340524875221381e-07, "loss": 0.0461, "step": 3022 }, { "epoch": 0.47687029222699845, "grad_norm": 5.433043479919434, "learning_rate": 5.338914828530026e-07, "loss": 0.0447, "step": 3023 }, { "epoch": 0.4770280395945893, "grad_norm": 4.371764659881592, "learning_rate": 5.337304781838673e-07, "loss": 0.0495, "step": 3024 }, { "epoch": 0.47718578696218006, "grad_norm": 7.512784004211426, "learning_rate": 5.335694735147319e-07, "loss": 0.0635, "step": 3025 }, { "epoch": 0.4773435343297709, "grad_norm": 3.749889850616455, "learning_rate": 5.334084688455965e-07, "loss": 0.0284, "step": 3026 }, { "epoch": 0.47750128169736167, "grad_norm": 3.9005444049835205, "learning_rate": 5.33247464176461e-07, "loss": 0.0524, "step": 3027 }, { "epoch": 0.4776590290649525, "grad_norm": 2.9798567295074463, "learning_rate": 5.330864595073257e-07, "loss": 0.0374, "step": 3028 }, { "epoch": 0.4778167764325433, "grad_norm": 3.3086020946502686, "learning_rate": 5.329254548381903e-07, "loss": 0.0379, "step": 3029 }, { "epoch": 0.4779745238001341, "grad_norm": 6.154900550842285, "learning_rate": 5.327644501690549e-07, "loss": 0.115, "step": 3030 }, { "epoch": 0.4781322711677249, "grad_norm": 6.707804203033447, "learning_rate": 5.326034454999195e-07, "loss": 0.0655, "step": 3031 }, { "epoch": 0.4782900185353157, "grad_norm": 11.475181579589844, "learning_rate": 5.324424408307841e-07, "loss": 0.0138, "step": 3032 }, { "epoch": 0.4784477659029065, "grad_norm": 5.882479190826416, "learning_rate": 5.322814361616487e-07, "loss": 0.0397, "step": 3033 }, { "epoch": 0.4786055132704973, "grad_norm": 4.037417888641357, "learning_rate": 5.321204314925133e-07, "loss": 0.0269, "step": 3034 }, { "epoch": 0.4787632606380881, "grad_norm": 4.259555339813232, "learning_rate": 5.319594268233778e-07, "loss": 0.0343, "step": 3035 }, { "epoch": 0.4789210080056789, "grad_norm": 4.306237697601318, "learning_rate": 5.317984221542424e-07, "loss": 0.0457, "step": 3036 }, { "epoch": 0.4790787553732697, "grad_norm": 6.921058654785156, "learning_rate": 5.31637417485107e-07, "loss": 0.0475, "step": 3037 }, { "epoch": 0.4792365027408605, "grad_norm": 6.094809055328369, "learning_rate": 5.314764128159716e-07, "loss": 0.062, "step": 3038 }, { "epoch": 0.4793942501084513, "grad_norm": 6.288085460662842, "learning_rate": 5.313154081468363e-07, "loss": 0.0811, "step": 3039 }, { "epoch": 0.4795519974760421, "grad_norm": 6.041825294494629, "learning_rate": 5.311544034777008e-07, "loss": 0.0332, "step": 3040 }, { "epoch": 0.47970974484363293, "grad_norm": 4.585716247558594, "learning_rate": 5.309933988085654e-07, "loss": 0.0374, "step": 3041 }, { "epoch": 0.4798674922112237, "grad_norm": 5.681454181671143, "learning_rate": 5.3083239413943e-07, "loss": 0.0506, "step": 3042 }, { "epoch": 0.48002523957881454, "grad_norm": 6.8209452629089355, "learning_rate": 5.306713894702946e-07, "loss": 0.082, "step": 3043 }, { "epoch": 0.4801829869464053, "grad_norm": 3.82092022895813, "learning_rate": 5.305103848011591e-07, "loss": 0.0435, "step": 3044 }, { "epoch": 0.48034073431399615, "grad_norm": 5.553289890289307, "learning_rate": 5.303493801320238e-07, "loss": 0.0445, "step": 3045 }, { "epoch": 0.48049848168158693, "grad_norm": 4.531376361846924, "learning_rate": 5.301883754628885e-07, "loss": 0.0408, "step": 3046 }, { "epoch": 0.48065622904917776, "grad_norm": 3.5035979747772217, "learning_rate": 5.300273707937531e-07, "loss": 0.0273, "step": 3047 }, { "epoch": 0.48081397641676854, "grad_norm": 4.718772888183594, "learning_rate": 5.298663661246176e-07, "loss": 0.0424, "step": 3048 }, { "epoch": 0.4809717237843594, "grad_norm": 3.934015989303589, "learning_rate": 5.297053614554822e-07, "loss": 0.0445, "step": 3049 }, { "epoch": 0.48112947115195015, "grad_norm": 5.095694541931152, "learning_rate": 5.295443567863468e-07, "loss": 0.0314, "step": 3050 }, { "epoch": 0.481287218519541, "grad_norm": 6.989995956420898, "learning_rate": 5.293833521172114e-07, "loss": 0.0525, "step": 3051 }, { "epoch": 0.48144496588713176, "grad_norm": 7.8963470458984375, "learning_rate": 5.292223474480759e-07, "loss": 0.0641, "step": 3052 }, { "epoch": 0.48160271325472254, "grad_norm": 4.29169225692749, "learning_rate": 5.290613427789405e-07, "loss": 0.0245, "step": 3053 }, { "epoch": 0.48176046062231337, "grad_norm": 6.149662017822266, "learning_rate": 5.289003381098052e-07, "loss": 0.0321, "step": 3054 }, { "epoch": 0.48191820798990415, "grad_norm": 3.3679161071777344, "learning_rate": 5.287393334406698e-07, "loss": 0.0216, "step": 3055 }, { "epoch": 0.482075955357495, "grad_norm": 8.198663711547852, "learning_rate": 5.285783287715343e-07, "loss": 0.0607, "step": 3056 }, { "epoch": 0.48223370272508576, "grad_norm": 5.388924598693848, "learning_rate": 5.284173241023989e-07, "loss": 0.0236, "step": 3057 }, { "epoch": 0.4823914500926766, "grad_norm": 5.866364479064941, "learning_rate": 5.282563194332635e-07, "loss": 0.068, "step": 3058 }, { "epoch": 0.48254919746026737, "grad_norm": 7.354922294616699, "learning_rate": 5.280953147641281e-07, "loss": 0.06, "step": 3059 }, { "epoch": 0.4827069448278582, "grad_norm": 6.349786758422852, "learning_rate": 5.279343100949927e-07, "loss": 0.0549, "step": 3060 }, { "epoch": 0.482864692195449, "grad_norm": 2.514930248260498, "learning_rate": 5.277733054258573e-07, "loss": 0.0511, "step": 3061 }, { "epoch": 0.4830224395630398, "grad_norm": 8.368856430053711, "learning_rate": 5.27612300756722e-07, "loss": 0.076, "step": 3062 }, { "epoch": 0.4831801869306306, "grad_norm": 3.9374332427978516, "learning_rate": 5.274512960875866e-07, "loss": 0.0585, "step": 3063 }, { "epoch": 0.4833379342982214, "grad_norm": 7.416379451751709, "learning_rate": 5.272902914184512e-07, "loss": 0.0622, "step": 3064 }, { "epoch": 0.4834956816658122, "grad_norm": 6.479883670806885, "learning_rate": 5.271292867493157e-07, "loss": 0.0311, "step": 3065 }, { "epoch": 0.483653429033403, "grad_norm": 3.6350669860839844, "learning_rate": 5.269682820801803e-07, "loss": 0.0399, "step": 3066 }, { "epoch": 0.4838111764009938, "grad_norm": 4.032829284667969, "learning_rate": 5.268072774110449e-07, "loss": 0.0329, "step": 3067 }, { "epoch": 0.48396892376858464, "grad_norm": 3.473764657974243, "learning_rate": 5.266462727419095e-07, "loss": 0.0383, "step": 3068 }, { "epoch": 0.4841266711361754, "grad_norm": 5.921380043029785, "learning_rate": 5.264852680727741e-07, "loss": 0.0277, "step": 3069 }, { "epoch": 0.48428441850376625, "grad_norm": 3.3913817405700684, "learning_rate": 5.263242634036387e-07, "loss": 0.0313, "step": 3070 }, { "epoch": 0.484442165871357, "grad_norm": 3.7090003490448, "learning_rate": 5.261632587345033e-07, "loss": 0.0352, "step": 3071 }, { "epoch": 0.4845999132389478, "grad_norm": 7.192994594573975, "learning_rate": 5.260022540653679e-07, "loss": 0.0455, "step": 3072 }, { "epoch": 0.4845999132389478, "eval_accuracy": 0.9856942150797368, "eval_f1": 0.9856942150797368, "eval_loss": 0.046789322048425674, "eval_runtime": 4718.3444, "eval_samples_per_second": 42.993, "eval_steps_per_second": 2.687, "step": 3072 }, { "epoch": 0.48475766060653863, "grad_norm": 6.613557815551758, "learning_rate": 5.258412493962324e-07, "loss": 0.0582, "step": 3073 }, { "epoch": 0.4849154079741294, "grad_norm": 4.4686055183410645, "learning_rate": 5.25680244727097e-07, "loss": 0.0596, "step": 3074 }, { "epoch": 0.48507315534172024, "grad_norm": 6.73822021484375, "learning_rate": 5.255192400579616e-07, "loss": 0.0509, "step": 3075 }, { "epoch": 0.485230902709311, "grad_norm": 6.796287536621094, "learning_rate": 5.253582353888263e-07, "loss": 0.0428, "step": 3076 }, { "epoch": 0.48538865007690185, "grad_norm": 6.841584205627441, "learning_rate": 5.251972307196908e-07, "loss": 0.0494, "step": 3077 }, { "epoch": 0.48554639744449263, "grad_norm": 6.251879692077637, "learning_rate": 5.250362260505554e-07, "loss": 0.0518, "step": 3078 }, { "epoch": 0.48570414481208346, "grad_norm": 7.815091133117676, "learning_rate": 5.2487522138142e-07, "loss": 0.1038, "step": 3079 }, { "epoch": 0.48586189217967424, "grad_norm": 5.727319717407227, "learning_rate": 5.247142167122847e-07, "loss": 0.0509, "step": 3080 }, { "epoch": 0.48601963954726507, "grad_norm": 3.4534387588500977, "learning_rate": 5.245532120431492e-07, "loss": 0.0771, "step": 3081 }, { "epoch": 0.48617738691485585, "grad_norm": 4.194572925567627, "learning_rate": 5.243922073740138e-07, "loss": 0.0443, "step": 3082 }, { "epoch": 0.4863351342824467, "grad_norm": 10.300954818725586, "learning_rate": 5.242312027048784e-07, "loss": 0.0689, "step": 3083 }, { "epoch": 0.48649288165003746, "grad_norm": 5.091367244720459, "learning_rate": 5.240701980357431e-07, "loss": 0.0846, "step": 3084 }, { "epoch": 0.4866506290176283, "grad_norm": 2.955735445022583, "learning_rate": 5.239091933666077e-07, "loss": 0.0502, "step": 3085 }, { "epoch": 0.48680837638521907, "grad_norm": 4.298361778259277, "learning_rate": 5.237481886974722e-07, "loss": 0.0258, "step": 3086 }, { "epoch": 0.4869661237528099, "grad_norm": 3.9187076091766357, "learning_rate": 5.235871840283368e-07, "loss": 0.0644, "step": 3087 }, { "epoch": 0.4871238711204007, "grad_norm": 5.889786243438721, "learning_rate": 5.234261793592014e-07, "loss": 0.0795, "step": 3088 }, { "epoch": 0.4872816184879915, "grad_norm": 12.091011047363281, "learning_rate": 5.23265174690066e-07, "loss": 0.0682, "step": 3089 }, { "epoch": 0.4874393658555823, "grad_norm": 5.780660629272461, "learning_rate": 5.231041700209305e-07, "loss": 0.0319, "step": 3090 }, { "epoch": 0.48759711322317306, "grad_norm": 1.9263288974761963, "learning_rate": 5.229431653517952e-07, "loss": 0.0138, "step": 3091 }, { "epoch": 0.4877548605907639, "grad_norm": 6.742379188537598, "learning_rate": 5.227821606826598e-07, "loss": 0.0789, "step": 3092 }, { "epoch": 0.48791260795835467, "grad_norm": 3.9017274379730225, "learning_rate": 5.226211560135244e-07, "loss": 0.0577, "step": 3093 }, { "epoch": 0.4880703553259455, "grad_norm": 6.11548376083374, "learning_rate": 5.224601513443889e-07, "loss": 0.0603, "step": 3094 }, { "epoch": 0.4882281026935363, "grad_norm": 3.4503278732299805, "learning_rate": 5.222991466752535e-07, "loss": 0.0297, "step": 3095 }, { "epoch": 0.4883858500611271, "grad_norm": 4.641238212585449, "learning_rate": 5.221381420061181e-07, "loss": 0.0481, "step": 3096 }, { "epoch": 0.4885435974287179, "grad_norm": 5.323633193969727, "learning_rate": 5.219771373369828e-07, "loss": 0.022, "step": 3097 }, { "epoch": 0.4887013447963087, "grad_norm": 5.186425685882568, "learning_rate": 5.218161326678473e-07, "loss": 0.0522, "step": 3098 }, { "epoch": 0.4888590921638995, "grad_norm": 4.983971118927002, "learning_rate": 5.21655127998712e-07, "loss": 0.0486, "step": 3099 }, { "epoch": 0.48901683953149033, "grad_norm": 4.3277082443237305, "learning_rate": 5.214941233295766e-07, "loss": 0.0249, "step": 3100 }, { "epoch": 0.4891745868990811, "grad_norm": 4.370128154754639, "learning_rate": 5.213331186604412e-07, "loss": 0.0343, "step": 3101 }, { "epoch": 0.48933233426667194, "grad_norm": 4.3571553230285645, "learning_rate": 5.211721139913057e-07, "loss": 0.0693, "step": 3102 }, { "epoch": 0.4894900816342627, "grad_norm": 2.895589590072632, "learning_rate": 5.210111093221703e-07, "loss": 0.0307, "step": 3103 }, { "epoch": 0.48964782900185355, "grad_norm": 18.217578887939453, "learning_rate": 5.208501046530349e-07, "loss": 0.0699, "step": 3104 }, { "epoch": 0.48980557636944433, "grad_norm": 3.3057503700256348, "learning_rate": 5.206890999838995e-07, "loss": 0.0223, "step": 3105 }, { "epoch": 0.48996332373703516, "grad_norm": 5.851504802703857, "learning_rate": 5.205280953147642e-07, "loss": 0.0717, "step": 3106 }, { "epoch": 0.49012107110462594, "grad_norm": 4.29199743270874, "learning_rate": 5.203670906456287e-07, "loss": 0.0359, "step": 3107 }, { "epoch": 0.49027881847221677, "grad_norm": 12.454666137695312, "learning_rate": 5.202060859764933e-07, "loss": 0.0706, "step": 3108 }, { "epoch": 0.49043656583980755, "grad_norm": 5.318867206573486, "learning_rate": 5.200450813073579e-07, "loss": 0.0397, "step": 3109 }, { "epoch": 0.4905943132073983, "grad_norm": 3.4172487258911133, "learning_rate": 5.198840766382225e-07, "loss": 0.0545, "step": 3110 }, { "epoch": 0.49075206057498916, "grad_norm": 3.336772918701172, "learning_rate": 5.19723071969087e-07, "loss": 0.0617, "step": 3111 }, { "epoch": 0.49090980794257993, "grad_norm": 4.109711647033691, "learning_rate": 5.195620672999516e-07, "loss": 0.017, "step": 3112 }, { "epoch": 0.49106755531017077, "grad_norm": 5.944070816040039, "learning_rate": 5.194010626308162e-07, "loss": 0.0421, "step": 3113 }, { "epoch": 0.49122530267776154, "grad_norm": 9.251203536987305, "learning_rate": 5.19240057961681e-07, "loss": 0.087, "step": 3114 }, { "epoch": 0.4913830500453524, "grad_norm": 2.984105110168457, "learning_rate": 5.190790532925455e-07, "loss": 0.0252, "step": 3115 }, { "epoch": 0.49154079741294315, "grad_norm": 8.3684663772583, "learning_rate": 5.189180486234101e-07, "loss": 0.0767, "step": 3116 }, { "epoch": 0.491698544780534, "grad_norm": 4.011730194091797, "learning_rate": 5.187570439542747e-07, "loss": 0.0203, "step": 3117 }, { "epoch": 0.49185629214812476, "grad_norm": 5.830489158630371, "learning_rate": 5.185960392851393e-07, "loss": 0.0597, "step": 3118 }, { "epoch": 0.4920140395157156, "grad_norm": 3.307762861251831, "learning_rate": 5.184350346160038e-07, "loss": 0.034, "step": 3119 }, { "epoch": 0.4921717868833064, "grad_norm": 4.353969573974609, "learning_rate": 5.182740299468684e-07, "loss": 0.0443, "step": 3120 }, { "epoch": 0.4923295342508972, "grad_norm": 11.109829902648926, "learning_rate": 5.181130252777331e-07, "loss": 0.1123, "step": 3121 }, { "epoch": 0.492487281618488, "grad_norm": 4.706302642822266, "learning_rate": 5.179520206085977e-07, "loss": 0.0456, "step": 3122 }, { "epoch": 0.4926450289860788, "grad_norm": 4.336117744445801, "learning_rate": 5.177910159394622e-07, "loss": 0.0527, "step": 3123 }, { "epoch": 0.4928027763536696, "grad_norm": 6.7515974044799805, "learning_rate": 5.176300112703268e-07, "loss": 0.0409, "step": 3124 }, { "epoch": 0.4929605237212604, "grad_norm": 8.648667335510254, "learning_rate": 5.174690066011914e-07, "loss": 0.0723, "step": 3125 }, { "epoch": 0.4931182710888512, "grad_norm": 6.021974563598633, "learning_rate": 5.17308001932056e-07, "loss": 0.0595, "step": 3126 }, { "epoch": 0.49327601845644203, "grad_norm": 5.117554664611816, "learning_rate": 5.171469972629205e-07, "loss": 0.0302, "step": 3127 }, { "epoch": 0.4934337658240328, "grad_norm": 4.950992584228516, "learning_rate": 5.169859925937851e-07, "loss": 0.0225, "step": 3128 }, { "epoch": 0.4935915131916236, "grad_norm": 4.997193336486816, "learning_rate": 5.168249879246498e-07, "loss": 0.055, "step": 3129 }, { "epoch": 0.4937492605592144, "grad_norm": 4.263675689697266, "learning_rate": 5.166639832555144e-07, "loss": 0.0546, "step": 3130 }, { "epoch": 0.4939070079268052, "grad_norm": 7.349658489227295, "learning_rate": 5.16502978586379e-07, "loss": 0.0453, "step": 3131 }, { "epoch": 0.49406475529439603, "grad_norm": 7.558027267456055, "learning_rate": 5.163419739172435e-07, "loss": 0.0497, "step": 3132 }, { "epoch": 0.4942225026619868, "grad_norm": 9.713709831237793, "learning_rate": 5.161809692481082e-07, "loss": 0.0714, "step": 3133 }, { "epoch": 0.49438025002957764, "grad_norm": 2.164876699447632, "learning_rate": 5.160199645789728e-07, "loss": 0.0163, "step": 3134 }, { "epoch": 0.4945379973971684, "grad_norm": 5.119956970214844, "learning_rate": 5.158589599098374e-07, "loss": 0.0382, "step": 3135 }, { "epoch": 0.49469574476475925, "grad_norm": 6.049105167388916, "learning_rate": 5.15697955240702e-07, "loss": 0.0897, "step": 3136 }, { "epoch": 0.49485349213235, "grad_norm": 6.990386962890625, "learning_rate": 5.155369505715666e-07, "loss": 0.1112, "step": 3137 }, { "epoch": 0.49501123949994086, "grad_norm": 6.017998695373535, "learning_rate": 5.153759459024312e-07, "loss": 0.0496, "step": 3138 }, { "epoch": 0.49516898686753164, "grad_norm": 1.9739807844161987, "learning_rate": 5.152149412332958e-07, "loss": 0.0136, "step": 3139 }, { "epoch": 0.49532673423512247, "grad_norm": 3.636338949203491, "learning_rate": 5.150539365641603e-07, "loss": 0.0613, "step": 3140 }, { "epoch": 0.49548448160271324, "grad_norm": 4.632494926452637, "learning_rate": 5.148929318950249e-07, "loss": 0.0413, "step": 3141 }, { "epoch": 0.4956422289703041, "grad_norm": 6.799563884735107, "learning_rate": 5.147319272258895e-07, "loss": 0.0237, "step": 3142 }, { "epoch": 0.49579997633789485, "grad_norm": 4.903563022613525, "learning_rate": 5.145709225567541e-07, "loss": 0.0705, "step": 3143 }, { "epoch": 0.4959577237054857, "grad_norm": 5.018761157989502, "learning_rate": 5.144099178876187e-07, "loss": 0.0361, "step": 3144 }, { "epoch": 0.49611547107307646, "grad_norm": 7.929637432098389, "learning_rate": 5.142489132184833e-07, "loss": 0.0772, "step": 3145 }, { "epoch": 0.4962732184406673, "grad_norm": 4.951676845550537, "learning_rate": 5.140879085493479e-07, "loss": 0.0297, "step": 3146 }, { "epoch": 0.4964309658082581, "grad_norm": 3.4608020782470703, "learning_rate": 5.139269038802125e-07, "loss": 0.0394, "step": 3147 }, { "epoch": 0.49658871317584885, "grad_norm": 9.678434371948242, "learning_rate": 5.13765899211077e-07, "loss": 0.0365, "step": 3148 }, { "epoch": 0.4967464605434397, "grad_norm": 5.920650005340576, "learning_rate": 5.136048945419416e-07, "loss": 0.0343, "step": 3149 }, { "epoch": 0.49690420791103046, "grad_norm": 7.1352362632751465, "learning_rate": 5.134438898728063e-07, "loss": 0.0492, "step": 3150 }, { "epoch": 0.4970619552786213, "grad_norm": 2.299804449081421, "learning_rate": 5.13282885203671e-07, "loss": 0.0239, "step": 3151 }, { "epoch": 0.49721970264621207, "grad_norm": 3.965886354446411, "learning_rate": 5.131218805345355e-07, "loss": 0.0375, "step": 3152 }, { "epoch": 0.4973774500138029, "grad_norm": 5.144937038421631, "learning_rate": 5.129608758654001e-07, "loss": 0.0323, "step": 3153 }, { "epoch": 0.4975351973813937, "grad_norm": 3.9022388458251953, "learning_rate": 5.127998711962647e-07, "loss": 0.0294, "step": 3154 }, { "epoch": 0.4976929447489845, "grad_norm": 8.25753116607666, "learning_rate": 5.126388665271293e-07, "loss": 0.026, "step": 3155 }, { "epoch": 0.4978506921165753, "grad_norm": 5.1440606117248535, "learning_rate": 5.124778618579939e-07, "loss": 0.0534, "step": 3156 }, { "epoch": 0.4980084394841661, "grad_norm": 2.1168277263641357, "learning_rate": 5.123168571888584e-07, "loss": 0.0136, "step": 3157 }, { "epoch": 0.4981661868517569, "grad_norm": 4.708176612854004, "learning_rate": 5.12155852519723e-07, "loss": 0.0433, "step": 3158 }, { "epoch": 0.49832393421934773, "grad_norm": 4.0774407386779785, "learning_rate": 5.119948478505877e-07, "loss": 0.0318, "step": 3159 }, { "epoch": 0.4984816815869385, "grad_norm": 5.956881523132324, "learning_rate": 5.118338431814523e-07, "loss": 0.0275, "step": 3160 }, { "epoch": 0.49863942895452934, "grad_norm": 7.400354385375977, "learning_rate": 5.116728385123168e-07, "loss": 0.0594, "step": 3161 }, { "epoch": 0.4987971763221201, "grad_norm": 6.836795806884766, "learning_rate": 5.115118338431814e-07, "loss": 0.0577, "step": 3162 }, { "epoch": 0.49895492368971095, "grad_norm": 4.228177547454834, "learning_rate": 5.11350829174046e-07, "loss": 0.0313, "step": 3163 }, { "epoch": 0.4991126710573017, "grad_norm": 4.468157768249512, "learning_rate": 5.111898245049106e-07, "loss": 0.0346, "step": 3164 }, { "epoch": 0.49927041842489256, "grad_norm": 5.014430999755859, "learning_rate": 5.110288198357751e-07, "loss": 0.0458, "step": 3165 }, { "epoch": 0.49942816579248334, "grad_norm": 4.235401630401611, "learning_rate": 5.108678151666398e-07, "loss": 0.0332, "step": 3166 }, { "epoch": 0.49958591316007417, "grad_norm": 4.435441970825195, "learning_rate": 5.107068104975045e-07, "loss": 0.0547, "step": 3167 }, { "epoch": 0.49974366052766495, "grad_norm": 6.001528739929199, "learning_rate": 5.105458058283691e-07, "loss": 0.0698, "step": 3168 }, { "epoch": 0.4999014078952557, "grad_norm": 3.8304319381713867, "learning_rate": 5.103848011592336e-07, "loss": 0.0239, "step": 3169 }, { "epoch": 0.5000591552628465, "grad_norm": 8.665897369384766, "learning_rate": 5.102237964900982e-07, "loss": 0.0695, "step": 3170 }, { "epoch": 0.5002169026304374, "grad_norm": 4.512013912200928, "learning_rate": 5.100627918209628e-07, "loss": 0.0348, "step": 3171 }, { "epoch": 0.5003746499980282, "grad_norm": 3.1103832721710205, "learning_rate": 5.099017871518274e-07, "loss": 0.0267, "step": 3172 }, { "epoch": 0.5005323973656189, "grad_norm": 10.89181137084961, "learning_rate": 5.097407824826919e-07, "loss": 0.0783, "step": 3173 }, { "epoch": 0.5006901447332097, "grad_norm": 6.386903762817383, "learning_rate": 5.095797778135566e-07, "loss": 0.0395, "step": 3174 }, { "epoch": 0.5008478921008006, "grad_norm": 9.237397193908691, "learning_rate": 5.094187731444212e-07, "loss": 0.039, "step": 3175 }, { "epoch": 0.5010056394683914, "grad_norm": 5.752801895141602, "learning_rate": 5.092577684752858e-07, "loss": 0.0715, "step": 3176 }, { "epoch": 0.5011633868359822, "grad_norm": 7.930622577667236, "learning_rate": 5.090967638061504e-07, "loss": 0.0375, "step": 3177 }, { "epoch": 0.5013211342035729, "grad_norm": 4.135472297668457, "learning_rate": 5.089357591370149e-07, "loss": 0.0542, "step": 3178 }, { "epoch": 0.5014788815711638, "grad_norm": 1.8181886672973633, "learning_rate": 5.087747544678795e-07, "loss": 0.0121, "step": 3179 }, { "epoch": 0.5016366289387546, "grad_norm": 9.15134334564209, "learning_rate": 5.086137497987441e-07, "loss": 0.0468, "step": 3180 }, { "epoch": 0.5017943763063454, "grad_norm": 8.41822624206543, "learning_rate": 5.084527451296088e-07, "loss": 0.0768, "step": 3181 }, { "epoch": 0.5019521236739362, "grad_norm": 8.632811546325684, "learning_rate": 5.082917404604733e-07, "loss": 0.0682, "step": 3182 }, { "epoch": 0.502109871041527, "grad_norm": 6.422868251800537, "learning_rate": 5.081307357913379e-07, "loss": 0.0613, "step": 3183 }, { "epoch": 0.5022676184091178, "grad_norm": 5.798893451690674, "learning_rate": 5.079697311222025e-07, "loss": 0.1024, "step": 3184 }, { "epoch": 0.5024253657767086, "grad_norm": 4.80502986907959, "learning_rate": 5.078087264530672e-07, "loss": 0.0473, "step": 3185 }, { "epoch": 0.5025831131442994, "grad_norm": 4.152266979217529, "learning_rate": 5.076477217839317e-07, "loss": 0.0536, "step": 3186 }, { "epoch": 0.5027408605118902, "grad_norm": 9.28447151184082, "learning_rate": 5.074867171147963e-07, "loss": 0.0701, "step": 3187 }, { "epoch": 0.502898607879481, "grad_norm": 4.702619552612305, "learning_rate": 5.073257124456609e-07, "loss": 0.0235, "step": 3188 }, { "epoch": 0.5030563552470718, "grad_norm": 13.083337783813477, "learning_rate": 5.071647077765256e-07, "loss": 0.0684, "step": 3189 }, { "epoch": 0.5032141026146626, "grad_norm": 6.463047981262207, "learning_rate": 5.070037031073901e-07, "loss": 0.0337, "step": 3190 }, { "epoch": 0.5033718499822534, "grad_norm": 4.493224143981934, "learning_rate": 5.068426984382547e-07, "loss": 0.0475, "step": 3191 }, { "epoch": 0.5035295973498443, "grad_norm": 7.519652366638184, "learning_rate": 5.066816937691193e-07, "loss": 0.0244, "step": 3192 }, { "epoch": 0.503687344717435, "grad_norm": 8.25917911529541, "learning_rate": 5.065206890999839e-07, "loss": 0.0964, "step": 3193 }, { "epoch": 0.5038450920850258, "grad_norm": 1.5638859272003174, "learning_rate": 5.063596844308484e-07, "loss": 0.013, "step": 3194 }, { "epoch": 0.5040028394526166, "grad_norm": 4.545310974121094, "learning_rate": 5.06198679761713e-07, "loss": 0.0201, "step": 3195 }, { "epoch": 0.5041605868202075, "grad_norm": 18.580646514892578, "learning_rate": 5.060376750925777e-07, "loss": 0.1505, "step": 3196 }, { "epoch": 0.5043183341877983, "grad_norm": 10.399683952331543, "learning_rate": 5.058766704234423e-07, "loss": 0.0689, "step": 3197 }, { "epoch": 0.504476081555389, "grad_norm": 5.154493808746338, "learning_rate": 5.057156657543068e-07, "loss": 0.039, "step": 3198 }, { "epoch": 0.5046338289229798, "grad_norm": 3.546096086502075, "learning_rate": 5.055546610851714e-07, "loss": 0.0701, "step": 3199 }, { "epoch": 0.5047915762905707, "grad_norm": 3.3994908332824707, "learning_rate": 5.05393656416036e-07, "loss": 0.02, "step": 3200 }, { "epoch": 0.5049493236581615, "grad_norm": 4.627392292022705, "learning_rate": 5.052326517469006e-07, "loss": 0.0547, "step": 3201 }, { "epoch": 0.5051070710257523, "grad_norm": 4.102443218231201, "learning_rate": 5.050716470777652e-07, "loss": 0.0996, "step": 3202 }, { "epoch": 0.505264818393343, "grad_norm": 3.814687490463257, "learning_rate": 5.049106424086297e-07, "loss": 0.0521, "step": 3203 }, { "epoch": 0.5054225657609339, "grad_norm": 3.8533315658569336, "learning_rate": 5.047496377394945e-07, "loss": 0.0597, "step": 3204 }, { "epoch": 0.5055803131285247, "grad_norm": 2.4979841709136963, "learning_rate": 5.045886330703591e-07, "loss": 0.0137, "step": 3205 }, { "epoch": 0.5057380604961155, "grad_norm": 6.9564595222473145, "learning_rate": 5.044276284012237e-07, "loss": 0.0576, "step": 3206 }, { "epoch": 0.5058958078637062, "grad_norm": 4.046496868133545, "learning_rate": 5.042666237320882e-07, "loss": 0.0534, "step": 3207 }, { "epoch": 0.506053555231297, "grad_norm": 5.28488826751709, "learning_rate": 5.041056190629528e-07, "loss": 0.0442, "step": 3208 }, { "epoch": 0.5062113025988879, "grad_norm": 2.557673931121826, "learning_rate": 5.039446143938174e-07, "loss": 0.0223, "step": 3209 }, { "epoch": 0.5063690499664787, "grad_norm": 4.47113561630249, "learning_rate": 5.03783609724682e-07, "loss": 0.0511, "step": 3210 }, { "epoch": 0.5065267973340695, "grad_norm": 3.6947097778320312, "learning_rate": 5.036226050555465e-07, "loss": 0.0437, "step": 3211 }, { "epoch": 0.5066845447016602, "grad_norm": 6.7887282371521, "learning_rate": 5.034616003864112e-07, "loss": 0.0456, "step": 3212 }, { "epoch": 0.5068422920692511, "grad_norm": 6.798984050750732, "learning_rate": 5.033005957172758e-07, "loss": 0.0816, "step": 3213 }, { "epoch": 0.5070000394368419, "grad_norm": 3.5052080154418945, "learning_rate": 5.031395910481404e-07, "loss": 0.0491, "step": 3214 }, { "epoch": 0.5071577868044327, "grad_norm": 3.1653616428375244, "learning_rate": 5.029785863790049e-07, "loss": 0.061, "step": 3215 }, { "epoch": 0.5073155341720235, "grad_norm": 6.844089508056641, "learning_rate": 5.028175817098695e-07, "loss": 0.0316, "step": 3216 }, { "epoch": 0.5074732815396144, "grad_norm": 4.814005374908447, "learning_rate": 5.026565770407341e-07, "loss": 0.0205, "step": 3217 }, { "epoch": 0.5076310289072051, "grad_norm": 7.20102071762085, "learning_rate": 5.024955723715987e-07, "loss": 0.0267, "step": 3218 }, { "epoch": 0.5077887762747959, "grad_norm": 2.9849743843078613, "learning_rate": 5.023345677024633e-07, "loss": 0.0361, "step": 3219 }, { "epoch": 0.5079465236423867, "grad_norm": 5.194559574127197, "learning_rate": 5.02173563033328e-07, "loss": 0.0817, "step": 3220 }, { "epoch": 0.5081042710099776, "grad_norm": 2.9066145420074463, "learning_rate": 5.020125583641926e-07, "loss": 0.0301, "step": 3221 }, { "epoch": 0.5082620183775683, "grad_norm": 1.0378518104553223, "learning_rate": 5.018515536950572e-07, "loss": 0.0082, "step": 3222 }, { "epoch": 0.5084197657451591, "grad_norm": 7.024827480316162, "learning_rate": 5.016905490259218e-07, "loss": 0.053, "step": 3223 }, { "epoch": 0.5085775131127499, "grad_norm": 3.3456127643585205, "learning_rate": 5.015295443567863e-07, "loss": 0.0311, "step": 3224 }, { "epoch": 0.5087352604803408, "grad_norm": 4.731700420379639, "learning_rate": 5.013685396876509e-07, "loss": 0.0175, "step": 3225 }, { "epoch": 0.5088930078479316, "grad_norm": 4.278652667999268, "learning_rate": 5.012075350185156e-07, "loss": 0.0186, "step": 3226 }, { "epoch": 0.5090507552155223, "grad_norm": 1.8399741649627686, "learning_rate": 5.010465303493802e-07, "loss": 0.0135, "step": 3227 }, { "epoch": 0.5092085025831131, "grad_norm": 3.6055192947387695, "learning_rate": 5.008855256802447e-07, "loss": 0.0453, "step": 3228 }, { "epoch": 0.5093662499507039, "grad_norm": 5.718883991241455, "learning_rate": 5.007245210111093e-07, "loss": 0.0458, "step": 3229 }, { "epoch": 0.5095239973182948, "grad_norm": 2.1946067810058594, "learning_rate": 5.005635163419739e-07, "loss": 0.0119, "step": 3230 }, { "epoch": 0.5096817446858856, "grad_norm": 5.639341831207275, "learning_rate": 5.004025116728385e-07, "loss": 0.0614, "step": 3231 }, { "epoch": 0.5098394920534763, "grad_norm": 8.0573148727417, "learning_rate": 5.00241507003703e-07, "loss": 0.0545, "step": 3232 }, { "epoch": 0.5099972394210671, "grad_norm": 6.907105445861816, "learning_rate": 5.000805023345676e-07, "loss": 0.0691, "step": 3233 }, { "epoch": 0.510154986788658, "grad_norm": 5.254229545593262, "learning_rate": 4.999194976654323e-07, "loss": 0.0162, "step": 3234 }, { "epoch": 0.5103127341562488, "grad_norm": 6.818365097045898, "learning_rate": 4.997584929962968e-07, "loss": 0.0641, "step": 3235 }, { "epoch": 0.5104704815238396, "grad_norm": 6.1585235595703125, "learning_rate": 4.995974883271614e-07, "loss": 0.0598, "step": 3236 }, { "epoch": 0.5106282288914303, "grad_norm": 12.957852363586426, "learning_rate": 4.99436483658026e-07, "loss": 0.0614, "step": 3237 }, { "epoch": 0.5107859762590212, "grad_norm": 5.405330657958984, "learning_rate": 4.992754789888907e-07, "loss": 0.029, "step": 3238 }, { "epoch": 0.510943723626612, "grad_norm": 3.506260871887207, "learning_rate": 4.991144743197553e-07, "loss": 0.058, "step": 3239 }, { "epoch": 0.5111014709942028, "grad_norm": 4.510875225067139, "learning_rate": 4.989534696506199e-07, "loss": 0.0475, "step": 3240 }, { "epoch": 0.5112592183617936, "grad_norm": 3.320171356201172, "learning_rate": 4.987924649814844e-07, "loss": 0.0306, "step": 3241 }, { "epoch": 0.5114169657293844, "grad_norm": 5.9775471687316895, "learning_rate": 4.986314603123491e-07, "loss": 0.0306, "step": 3242 }, { "epoch": 0.5115747130969752, "grad_norm": 6.424386501312256, "learning_rate": 4.984704556432136e-07, "loss": 0.0405, "step": 3243 }, { "epoch": 0.511732460464566, "grad_norm": 4.2295074462890625, "learning_rate": 4.983094509740782e-07, "loss": 0.0497, "step": 3244 }, { "epoch": 0.5118902078321568, "grad_norm": 6.756084442138672, "learning_rate": 4.981484463049428e-07, "loss": 0.0642, "step": 3245 }, { "epoch": 0.5120479551997476, "grad_norm": 4.199583053588867, "learning_rate": 4.979874416358074e-07, "loss": 0.044, "step": 3246 }, { "epoch": 0.5122057025673384, "grad_norm": 3.0998520851135254, "learning_rate": 4.97826436966672e-07, "loss": 0.0295, "step": 3247 }, { "epoch": 0.5123634499349292, "grad_norm": 8.630373001098633, "learning_rate": 4.976654322975366e-07, "loss": 0.097, "step": 3248 }, { "epoch": 0.51252119730252, "grad_norm": 3.19980525970459, "learning_rate": 4.975044276284012e-07, "loss": 0.0329, "step": 3249 }, { "epoch": 0.5126789446701108, "grad_norm": 3.5462422370910645, "learning_rate": 4.973434229592658e-07, "loss": 0.072, "step": 3250 }, { "epoch": 0.5128366920377017, "grad_norm": 4.932615756988525, "learning_rate": 4.971824182901304e-07, "loss": 0.0243, "step": 3251 }, { "epoch": 0.5129944394052924, "grad_norm": 4.566836357116699, "learning_rate": 4.970214136209949e-07, "loss": 0.0441, "step": 3252 }, { "epoch": 0.5131521867728832, "grad_norm": 3.056807279586792, "learning_rate": 4.968604089518596e-07, "loss": 0.0196, "step": 3253 }, { "epoch": 0.513309934140474, "grad_norm": 6.167160987854004, "learning_rate": 4.966994042827241e-07, "loss": 0.0564, "step": 3254 }, { "epoch": 0.5134676815080649, "grad_norm": 6.918657302856445, "learning_rate": 4.965383996135887e-07, "loss": 0.0687, "step": 3255 }, { "epoch": 0.5136254288756557, "grad_norm": 6.711324214935303, "learning_rate": 4.963773949444534e-07, "loss": 0.0826, "step": 3256 }, { "epoch": 0.5137831762432464, "grad_norm": 5.007664203643799, "learning_rate": 4.96216390275318e-07, "loss": 0.026, "step": 3257 }, { "epoch": 0.5139409236108372, "grad_norm": 7.329284191131592, "learning_rate": 4.960553856061826e-07, "loss": 0.0754, "step": 3258 }, { "epoch": 0.5140986709784281, "grad_norm": 5.177643775939941, "learning_rate": 4.958943809370472e-07, "loss": 0.0217, "step": 3259 }, { "epoch": 0.5142564183460189, "grad_norm": 4.8610076904296875, "learning_rate": 4.957333762679118e-07, "loss": 0.0401, "step": 3260 }, { "epoch": 0.5144141657136097, "grad_norm": 7.865502834320068, "learning_rate": 4.955723715987764e-07, "loss": 0.0476, "step": 3261 }, { "epoch": 0.5145719130812004, "grad_norm": 2.3840301036834717, "learning_rate": 4.954113669296409e-07, "loss": 0.0136, "step": 3262 }, { "epoch": 0.5147296604487913, "grad_norm": 5.249300479888916, "learning_rate": 4.952503622605055e-07, "loss": 0.0661, "step": 3263 }, { "epoch": 0.5148874078163821, "grad_norm": 3.7689268589019775, "learning_rate": 4.950893575913701e-07, "loss": 0.0366, "step": 3264 }, { "epoch": 0.5150451551839729, "grad_norm": 6.984394550323486, "learning_rate": 4.949283529222347e-07, "loss": 0.0762, "step": 3265 }, { "epoch": 0.5152029025515636, "grad_norm": 3.6469602584838867, "learning_rate": 4.947673482530993e-07, "loss": 0.0213, "step": 3266 }, { "epoch": 0.5153606499191544, "grad_norm": 5.703409194946289, "learning_rate": 4.946063435839639e-07, "loss": 0.0646, "step": 3267 }, { "epoch": 0.5155183972867453, "grad_norm": 6.856706619262695, "learning_rate": 4.944453389148285e-07, "loss": 0.0539, "step": 3268 }, { "epoch": 0.5156761446543361, "grad_norm": 3.3388614654541016, "learning_rate": 4.942843342456931e-07, "loss": 0.0436, "step": 3269 }, { "epoch": 0.5158338920219269, "grad_norm": 4.2724528312683105, "learning_rate": 4.941233295765577e-07, "loss": 0.0298, "step": 3270 }, { "epoch": 0.5159916393895176, "grad_norm": 8.368443489074707, "learning_rate": 4.939623249074222e-07, "loss": 0.0533, "step": 3271 }, { "epoch": 0.5161493867571085, "grad_norm": 2.5894930362701416, "learning_rate": 4.93801320238287e-07, "loss": 0.0225, "step": 3272 }, { "epoch": 0.5163071341246993, "grad_norm": 9.17452621459961, "learning_rate": 4.936403155691515e-07, "loss": 0.0401, "step": 3273 }, { "epoch": 0.5164648814922901, "grad_norm": 4.909448623657227, "learning_rate": 4.934793109000161e-07, "loss": 0.0384, "step": 3274 }, { "epoch": 0.5166226288598809, "grad_norm": 5.070178508758545, "learning_rate": 4.933183062308807e-07, "loss": 0.0648, "step": 3275 }, { "epoch": 0.5167803762274717, "grad_norm": 5.238038539886475, "learning_rate": 4.931573015617453e-07, "loss": 0.0428, "step": 3276 }, { "epoch": 0.5169381235950625, "grad_norm": 6.299678802490234, "learning_rate": 4.929962968926099e-07, "loss": 0.0649, "step": 3277 }, { "epoch": 0.5170958709626533, "grad_norm": 6.64601469039917, "learning_rate": 4.928352922234745e-07, "loss": 0.0456, "step": 3278 }, { "epoch": 0.5172536183302441, "grad_norm": 8.029877662658691, "learning_rate": 4.926742875543391e-07, "loss": 0.0942, "step": 3279 }, { "epoch": 0.517411365697835, "grad_norm": 3.4317123889923096, "learning_rate": 4.925132828852037e-07, "loss": 0.0171, "step": 3280 }, { "epoch": 0.5175691130654257, "grad_norm": 8.241918563842773, "learning_rate": 4.923522782160682e-07, "loss": 0.043, "step": 3281 }, { "epoch": 0.5177268604330165, "grad_norm": 4.322054386138916, "learning_rate": 4.921912735469328e-07, "loss": 0.0377, "step": 3282 }, { "epoch": 0.5178846078006073, "grad_norm": 3.4600963592529297, "learning_rate": 4.920302688777974e-07, "loss": 0.0574, "step": 3283 }, { "epoch": 0.5180423551681981, "grad_norm": 5.678745746612549, "learning_rate": 4.91869264208662e-07, "loss": 0.0507, "step": 3284 }, { "epoch": 0.518200102535789, "grad_norm": 5.465057373046875, "learning_rate": 4.917082595395266e-07, "loss": 0.04, "step": 3285 }, { "epoch": 0.5183578499033797, "grad_norm": 4.774681568145752, "learning_rate": 4.915472548703912e-07, "loss": 0.0463, "step": 3286 }, { "epoch": 0.5185155972709705, "grad_norm": 7.449644088745117, "learning_rate": 4.913862502012558e-07, "loss": 0.0871, "step": 3287 }, { "epoch": 0.5186733446385613, "grad_norm": 5.768908500671387, "learning_rate": 4.912252455321204e-07, "loss": 0.056, "step": 3288 }, { "epoch": 0.5188310920061522, "grad_norm": 4.329742908477783, "learning_rate": 4.910642408629849e-07, "loss": 0.0313, "step": 3289 }, { "epoch": 0.518988839373743, "grad_norm": 3.8610966205596924, "learning_rate": 4.909032361938497e-07, "loss": 0.0284, "step": 3290 }, { "epoch": 0.5191465867413337, "grad_norm": 5.501697540283203, "learning_rate": 4.907422315247142e-07, "loss": 0.0617, "step": 3291 }, { "epoch": 0.5193043341089245, "grad_norm": 2.665956497192383, "learning_rate": 4.905812268555788e-07, "loss": 0.0299, "step": 3292 }, { "epoch": 0.5194620814765154, "grad_norm": 6.519895076751709, "learning_rate": 4.904202221864434e-07, "loss": 0.082, "step": 3293 }, { "epoch": 0.5196198288441062, "grad_norm": 9.04831314086914, "learning_rate": 4.90259217517308e-07, "loss": 0.0611, "step": 3294 }, { "epoch": 0.519777576211697, "grad_norm": 5.466322898864746, "learning_rate": 4.900982128481726e-07, "loss": 0.0283, "step": 3295 }, { "epoch": 0.5199353235792877, "grad_norm": 3.772711992263794, "learning_rate": 4.899372081790372e-07, "loss": 0.0278, "step": 3296 }, { "epoch": 0.5200930709468786, "grad_norm": 4.77584981918335, "learning_rate": 4.897762035099018e-07, "loss": 0.0745, "step": 3297 }, { "epoch": 0.5202508183144694, "grad_norm": 4.437483787536621, "learning_rate": 4.896151988407664e-07, "loss": 0.0628, "step": 3298 }, { "epoch": 0.5204085656820602, "grad_norm": 7.5456318855285645, "learning_rate": 4.89454194171631e-07, "loss": 0.0256, "step": 3299 }, { "epoch": 0.520566313049651, "grad_norm": 2.4569687843322754, "learning_rate": 4.892931895024955e-07, "loss": 0.0174, "step": 3300 }, { "epoch": 0.5207240604172418, "grad_norm": 4.919608116149902, "learning_rate": 4.891321848333601e-07, "loss": 0.0366, "step": 3301 }, { "epoch": 0.5208818077848326, "grad_norm": 6.518052101135254, "learning_rate": 4.889711801642247e-07, "loss": 0.0594, "step": 3302 }, { "epoch": 0.5210395551524234, "grad_norm": 2.119096517562866, "learning_rate": 4.888101754950893e-07, "loss": 0.0145, "step": 3303 }, { "epoch": 0.5211973025200142, "grad_norm": 5.68674898147583, "learning_rate": 4.886491708259539e-07, "loss": 0.0382, "step": 3304 }, { "epoch": 0.521355049887605, "grad_norm": 4.043545722961426, "learning_rate": 4.884881661568185e-07, "loss": 0.0218, "step": 3305 }, { "epoch": 0.5215127972551958, "grad_norm": 7.708515644073486, "learning_rate": 4.883271614876831e-07, "loss": 0.087, "step": 3306 }, { "epoch": 0.5216705446227866, "grad_norm": 6.87803316116333, "learning_rate": 4.881661568185477e-07, "loss": 0.1043, "step": 3307 }, { "epoch": 0.5218282919903774, "grad_norm": 6.581982612609863, "learning_rate": 4.880051521494122e-07, "loss": 0.0574, "step": 3308 }, { "epoch": 0.5219860393579682, "grad_norm": 5.4126458168029785, "learning_rate": 4.87844147480277e-07, "loss": 0.0517, "step": 3309 }, { "epoch": 0.5221437867255591, "grad_norm": 5.3862385749816895, "learning_rate": 4.876831428111415e-07, "loss": 0.0655, "step": 3310 }, { "epoch": 0.5223015340931498, "grad_norm": 8.10891056060791, "learning_rate": 4.875221381420061e-07, "loss": 0.0664, "step": 3311 }, { "epoch": 0.5224592814607406, "grad_norm": 8.003843307495117, "learning_rate": 4.873611334728707e-07, "loss": 0.0771, "step": 3312 }, { "epoch": 0.5226170288283314, "grad_norm": 4.2681989669799805, "learning_rate": 4.872001288037353e-07, "loss": 0.0245, "step": 3313 }, { "epoch": 0.5227747761959223, "grad_norm": 3.433547019958496, "learning_rate": 4.870391241345999e-07, "loss": 0.0145, "step": 3314 }, { "epoch": 0.522932523563513, "grad_norm": 2.166034698486328, "learning_rate": 4.868781194654645e-07, "loss": 0.0134, "step": 3315 }, { "epoch": 0.5230902709311038, "grad_norm": 2.5870463848114014, "learning_rate": 4.867171147963291e-07, "loss": 0.012, "step": 3316 }, { "epoch": 0.5232480182986946, "grad_norm": 2.9055380821228027, "learning_rate": 4.865561101271937e-07, "loss": 0.0302, "step": 3317 }, { "epoch": 0.5234057656662855, "grad_norm": 4.043204307556152, "learning_rate": 4.863951054580583e-07, "loss": 0.0554, "step": 3318 }, { "epoch": 0.5235635130338763, "grad_norm": 5.544101715087891, "learning_rate": 4.862341007889228e-07, "loss": 0.0413, "step": 3319 }, { "epoch": 0.523721260401467, "grad_norm": 3.7651774883270264, "learning_rate": 4.860730961197875e-07, "loss": 0.0378, "step": 3320 }, { "epoch": 0.5238790077690578, "grad_norm": 3.9472198486328125, "learning_rate": 4.85912091450652e-07, "loss": 0.0482, "step": 3321 }, { "epoch": 0.5240367551366486, "grad_norm": 9.249954223632812, "learning_rate": 4.857510867815166e-07, "loss": 0.0849, "step": 3322 }, { "epoch": 0.5241945025042395, "grad_norm": 5.982876300811768, "learning_rate": 4.855900821123812e-07, "loss": 0.0421, "step": 3323 }, { "epoch": 0.5243522498718303, "grad_norm": 53.606571197509766, "learning_rate": 4.854290774432458e-07, "loss": 0.0513, "step": 3324 }, { "epoch": 0.524509997239421, "grad_norm": 1.2392445802688599, "learning_rate": 4.852680727741104e-07, "loss": 0.01, "step": 3325 }, { "epoch": 0.5246677446070118, "grad_norm": 6.681481838226318, "learning_rate": 4.851070681049751e-07, "loss": 0.0309, "step": 3326 }, { "epoch": 0.5248254919746027, "grad_norm": 8.890432357788086, "learning_rate": 4.849460634358396e-07, "loss": 0.0671, "step": 3327 }, { "epoch": 0.5249832393421935, "grad_norm": 4.494774341583252, "learning_rate": 4.847850587667043e-07, "loss": 0.0346, "step": 3328 }, { "epoch": 0.5251409867097843, "grad_norm": 3.080538272857666, "learning_rate": 4.846240540975688e-07, "loss": 0.0134, "step": 3329 }, { "epoch": 0.525298734077375, "grad_norm": 5.779559135437012, "learning_rate": 4.844630494284334e-07, "loss": 0.0403, "step": 3330 }, { "epoch": 0.5254564814449659, "grad_norm": 15.656715393066406, "learning_rate": 4.84302044759298e-07, "loss": 0.1181, "step": 3331 }, { "epoch": 0.5256142288125567, "grad_norm": 5.662373065948486, "learning_rate": 4.841410400901626e-07, "loss": 0.0813, "step": 3332 }, { "epoch": 0.5257719761801475, "grad_norm": 3.515956163406372, "learning_rate": 4.839800354210272e-07, "loss": 0.0248, "step": 3333 }, { "epoch": 0.5259297235477383, "grad_norm": 4.553384304046631, "learning_rate": 4.838190307518918e-07, "loss": 0.017, "step": 3334 }, { "epoch": 0.5260874709153291, "grad_norm": 1.009729266166687, "learning_rate": 4.836580260827564e-07, "loss": 0.0068, "step": 3335 }, { "epoch": 0.5262452182829199, "grad_norm": 3.6028075218200684, "learning_rate": 4.83497021413621e-07, "loss": 0.0338, "step": 3336 }, { "epoch": 0.5264029656505107, "grad_norm": 4.565405368804932, "learning_rate": 4.833360167444855e-07, "loss": 0.0383, "step": 3337 }, { "epoch": 0.5265607130181015, "grad_norm": 6.493062973022461, "learning_rate": 4.831750120753501e-07, "loss": 0.0615, "step": 3338 }, { "epoch": 0.5267184603856924, "grad_norm": 4.326895236968994, "learning_rate": 4.830140074062148e-07, "loss": 0.0196, "step": 3339 }, { "epoch": 0.5268762077532831, "grad_norm": 3.374591588973999, "learning_rate": 4.828530027370793e-07, "loss": 0.0305, "step": 3340 }, { "epoch": 0.5270339551208739, "grad_norm": 5.829579830169678, "learning_rate": 4.826919980679439e-07, "loss": 0.0316, "step": 3341 }, { "epoch": 0.5271917024884647, "grad_norm": 5.114638328552246, "learning_rate": 4.825309933988085e-07, "loss": 0.0455, "step": 3342 }, { "epoch": 0.5273494498560555, "grad_norm": 4.803951740264893, "learning_rate": 4.823699887296732e-07, "loss": 0.033, "step": 3343 }, { "epoch": 0.5275071972236464, "grad_norm": 5.90579080581665, "learning_rate": 4.822089840605378e-07, "loss": 0.0466, "step": 3344 }, { "epoch": 0.5276649445912371, "grad_norm": 3.841439962387085, "learning_rate": 4.820479793914024e-07, "loss": 0.0698, "step": 3345 }, { "epoch": 0.5278226919588279, "grad_norm": 4.033648490905762, "learning_rate": 4.818869747222669e-07, "loss": 0.0406, "step": 3346 }, { "epoch": 0.5279804393264187, "grad_norm": 5.283125400543213, "learning_rate": 4.817259700531316e-07, "loss": 0.0563, "step": 3347 }, { "epoch": 0.5281381866940096, "grad_norm": 2.5686020851135254, "learning_rate": 4.815649653839961e-07, "loss": 0.0134, "step": 3348 }, { "epoch": 0.5282959340616004, "grad_norm": 4.076895236968994, "learning_rate": 4.814039607148607e-07, "loss": 0.072, "step": 3349 }, { "epoch": 0.5284536814291911, "grad_norm": 5.695146560668945, "learning_rate": 4.812429560457253e-07, "loss": 0.0476, "step": 3350 }, { "epoch": 0.5286114287967819, "grad_norm": 6.99587869644165, "learning_rate": 4.810819513765899e-07, "loss": 0.0783, "step": 3351 }, { "epoch": 0.5287691761643728, "grad_norm": 3.8918187618255615, "learning_rate": 4.809209467074545e-07, "loss": 0.0522, "step": 3352 }, { "epoch": 0.5289269235319636, "grad_norm": 5.451104164123535, "learning_rate": 4.807599420383191e-07, "loss": 0.0443, "step": 3353 }, { "epoch": 0.5290846708995544, "grad_norm": 5.873027801513672, "learning_rate": 4.805989373691837e-07, "loss": 0.0751, "step": 3354 }, { "epoch": 0.5292424182671451, "grad_norm": 4.282705783843994, "learning_rate": 4.804379327000483e-07, "loss": 0.0341, "step": 3355 }, { "epoch": 0.529400165634736, "grad_norm": 6.071550369262695, "learning_rate": 4.802769280309128e-07, "loss": 0.0269, "step": 3356 }, { "epoch": 0.5295579130023268, "grad_norm": 8.23840618133545, "learning_rate": 4.801159233617774e-07, "loss": 0.076, "step": 3357 }, { "epoch": 0.5297156603699176, "grad_norm": 4.4299445152282715, "learning_rate": 4.79954918692642e-07, "loss": 0.0623, "step": 3358 }, { "epoch": 0.5298734077375084, "grad_norm": 3.180269956588745, "learning_rate": 4.797939140235066e-07, "loss": 0.0258, "step": 3359 }, { "epoch": 0.5300311551050992, "grad_norm": 8.468398094177246, "learning_rate": 4.796329093543712e-07, "loss": 0.038, "step": 3360 }, { "epoch": 0.53018890247269, "grad_norm": 5.929890155792236, "learning_rate": 4.794719046852359e-07, "loss": 0.0591, "step": 3361 }, { "epoch": 0.5303466498402808, "grad_norm": 3.943211078643799, "learning_rate": 4.793109000161005e-07, "loss": 0.0222, "step": 3362 }, { "epoch": 0.5305043972078716, "grad_norm": 2.4576566219329834, "learning_rate": 4.791498953469651e-07, "loss": 0.0434, "step": 3363 }, { "epoch": 0.5306621445754623, "grad_norm": 5.097107887268066, "learning_rate": 4.789888906778297e-07, "loss": 0.058, "step": 3364 }, { "epoch": 0.5308198919430532, "grad_norm": 2.5385961532592773, "learning_rate": 4.788278860086943e-07, "loss": 0.0179, "step": 3365 }, { "epoch": 0.530977639310644, "grad_norm": 4.564829349517822, "learning_rate": 4.786668813395589e-07, "loss": 0.0831, "step": 3366 }, { "epoch": 0.5311353866782348, "grad_norm": 5.847906112670898, "learning_rate": 4.785058766704234e-07, "loss": 0.0614, "step": 3367 }, { "epoch": 0.5312931340458256, "grad_norm": 4.3448052406311035, "learning_rate": 4.78344872001288e-07, "loss": 0.0612, "step": 3368 }, { "epoch": 0.5314508814134165, "grad_norm": 3.5759341716766357, "learning_rate": 4.781838673321526e-07, "loss": 0.0267, "step": 3369 }, { "epoch": 0.5316086287810072, "grad_norm": 6.660635471343994, "learning_rate": 4.780228626630172e-07, "loss": 0.0415, "step": 3370 }, { "epoch": 0.531766376148598, "grad_norm": 8.639548301696777, "learning_rate": 4.778618579938818e-07, "loss": 0.0597, "step": 3371 }, { "epoch": 0.5319241235161888, "grad_norm": 4.539547443389893, "learning_rate": 4.777008533247464e-07, "loss": 0.0328, "step": 3372 }, { "epoch": 0.5320818708837797, "grad_norm": 51.51942443847656, "learning_rate": 4.77539848655611e-07, "loss": 0.078, "step": 3373 }, { "epoch": 0.5322396182513704, "grad_norm": 4.7608642578125, "learning_rate": 4.773788439864756e-07, "loss": 0.0426, "step": 3374 }, { "epoch": 0.5323973656189612, "grad_norm": 7.365528583526611, "learning_rate": 4.772178393173401e-07, "loss": 0.0379, "step": 3375 }, { "epoch": 0.532555112986552, "grad_norm": 4.892368793487549, "learning_rate": 4.770568346482047e-07, "loss": 0.0493, "step": 3376 }, { "epoch": 0.5327128603541429, "grad_norm": 6.183288097381592, "learning_rate": 4.768958299790693e-07, "loss": 0.0611, "step": 3377 }, { "epoch": 0.5328706077217337, "grad_norm": 6.039890766143799, "learning_rate": 4.7673482530993395e-07, "loss": 0.1078, "step": 3378 }, { "epoch": 0.5330283550893244, "grad_norm": 7.143927574157715, "learning_rate": 4.7657382064079856e-07, "loss": 0.0925, "step": 3379 }, { "epoch": 0.5331861024569152, "grad_norm": 5.521488666534424, "learning_rate": 4.7641281597166316e-07, "loss": 0.0179, "step": 3380 }, { "epoch": 0.533343849824506, "grad_norm": 7.703474044799805, "learning_rate": 4.762518113025277e-07, "loss": 0.0355, "step": 3381 }, { "epoch": 0.5335015971920969, "grad_norm": 3.419008255004883, "learning_rate": 4.760908066333924e-07, "loss": 0.0234, "step": 3382 }, { "epoch": 0.5336593445596877, "grad_norm": 6.611175537109375, "learning_rate": 4.7592980196425693e-07, "loss": 0.0407, "step": 3383 }, { "epoch": 0.5338170919272784, "grad_norm": 6.516146183013916, "learning_rate": 4.7576879729512153e-07, "loss": 0.0252, "step": 3384 }, { "epoch": 0.5339748392948692, "grad_norm": 4.867481708526611, "learning_rate": 4.7560779262598614e-07, "loss": 0.0462, "step": 3385 }, { "epoch": 0.5341325866624601, "grad_norm": 7.547916412353516, "learning_rate": 4.7544678795685075e-07, "loss": 0.0423, "step": 3386 }, { "epoch": 0.5342903340300509, "grad_norm": 4.116353988647461, "learning_rate": 4.7528578328771535e-07, "loss": 0.0303, "step": 3387 }, { "epoch": 0.5344480813976417, "grad_norm": 5.257046699523926, "learning_rate": 4.751247786185799e-07, "loss": 0.0622, "step": 3388 }, { "epoch": 0.5346058287652324, "grad_norm": 2.8776915073394775, "learning_rate": 4.749637739494445e-07, "loss": 0.0421, "step": 3389 }, { "epoch": 0.5347635761328233, "grad_norm": 8.064032554626465, "learning_rate": 4.748027692803091e-07, "loss": 0.0531, "step": 3390 }, { "epoch": 0.5349213235004141, "grad_norm": 4.772770404815674, "learning_rate": 4.746417646111737e-07, "loss": 0.0409, "step": 3391 }, { "epoch": 0.5350790708680049, "grad_norm": 7.8889360427856445, "learning_rate": 4.744807599420383e-07, "loss": 0.0888, "step": 3392 }, { "epoch": 0.5352368182355957, "grad_norm": 4.489109516143799, "learning_rate": 4.743197552729029e-07, "loss": 0.0489, "step": 3393 }, { "epoch": 0.5353945656031865, "grad_norm": 4.8228864669799805, "learning_rate": 4.741587506037675e-07, "loss": 0.037, "step": 3394 }, { "epoch": 0.5355523129707773, "grad_norm": 5.193973064422607, "learning_rate": 4.739977459346321e-07, "loss": 0.0576, "step": 3395 }, { "epoch": 0.5357100603383681, "grad_norm": 4.263784408569336, "learning_rate": 4.7383674126549665e-07, "loss": 0.0334, "step": 3396 }, { "epoch": 0.5358678077059589, "grad_norm": 2.4240798950195312, "learning_rate": 4.736757365963613e-07, "loss": 0.0224, "step": 3397 }, { "epoch": 0.5360255550735498, "grad_norm": 5.9223408699035645, "learning_rate": 4.7351473192722586e-07, "loss": 0.0561, "step": 3398 }, { "epoch": 0.5361833024411405, "grad_norm": 2.8138952255249023, "learning_rate": 4.7335372725809047e-07, "loss": 0.0387, "step": 3399 }, { "epoch": 0.5363410498087313, "grad_norm": 8.36697769165039, "learning_rate": 4.73192722588955e-07, "loss": 0.0304, "step": 3400 }, { "epoch": 0.5364987971763221, "grad_norm": 3.208568572998047, "learning_rate": 4.730317179198197e-07, "loss": 0.0454, "step": 3401 }, { "epoch": 0.5366565445439129, "grad_norm": 6.757406711578369, "learning_rate": 4.7287071325068424e-07, "loss": 0.0463, "step": 3402 }, { "epoch": 0.5368142919115038, "grad_norm": 7.119345664978027, "learning_rate": 4.7270970858154884e-07, "loss": 0.0323, "step": 3403 }, { "epoch": 0.5369720392790945, "grad_norm": 6.767031192779541, "learning_rate": 4.725487039124134e-07, "loss": 0.0429, "step": 3404 }, { "epoch": 0.5371297866466853, "grad_norm": 3.0922765731811523, "learning_rate": 4.7238769924327805e-07, "loss": 0.0472, "step": 3405 }, { "epoch": 0.5372875340142761, "grad_norm": 4.668428421020508, "learning_rate": 4.722266945741426e-07, "loss": 0.0537, "step": 3406 }, { "epoch": 0.537445281381867, "grad_norm": 2.73225998878479, "learning_rate": 4.720656899050072e-07, "loss": 0.0197, "step": 3407 }, { "epoch": 0.5376030287494578, "grad_norm": 4.740433692932129, "learning_rate": 4.719046852358718e-07, "loss": 0.0222, "step": 3408 }, { "epoch": 0.5377607761170485, "grad_norm": 3.2959320545196533, "learning_rate": 4.7174368056673643e-07, "loss": 0.0237, "step": 3409 }, { "epoch": 0.5379185234846393, "grad_norm": 7.326497554779053, "learning_rate": 4.7158267589760103e-07, "loss": 0.0294, "step": 3410 }, { "epoch": 0.5380762708522302, "grad_norm": 6.463442325592041, "learning_rate": 4.714216712284656e-07, "loss": 0.0528, "step": 3411 }, { "epoch": 0.538234018219821, "grad_norm": 3.855602979660034, "learning_rate": 4.7126066655933025e-07, "loss": 0.0431, "step": 3412 }, { "epoch": 0.5383917655874118, "grad_norm": 3.002051830291748, "learning_rate": 4.710996618901948e-07, "loss": 0.0461, "step": 3413 }, { "epoch": 0.5385495129550025, "grad_norm": 4.882457256317139, "learning_rate": 4.709386572210594e-07, "loss": 0.0435, "step": 3414 }, { "epoch": 0.5387072603225934, "grad_norm": 5.218534469604492, "learning_rate": 4.7077765255192396e-07, "loss": 0.0467, "step": 3415 }, { "epoch": 0.5388650076901842, "grad_norm": 2.1747610569000244, "learning_rate": 4.706166478827886e-07, "loss": 0.0134, "step": 3416 }, { "epoch": 0.539022755057775, "grad_norm": 7.50874662399292, "learning_rate": 4.7045564321365317e-07, "loss": 0.0691, "step": 3417 }, { "epoch": 0.5391805024253657, "grad_norm": 4.116052150726318, "learning_rate": 4.702946385445178e-07, "loss": 0.047, "step": 3418 }, { "epoch": 0.5393382497929565, "grad_norm": 9.27847671508789, "learning_rate": 4.7013363387538233e-07, "loss": 0.0587, "step": 3419 }, { "epoch": 0.5394959971605474, "grad_norm": 3.903451919555664, "learning_rate": 4.69972629206247e-07, "loss": 0.044, "step": 3420 }, { "epoch": 0.5396537445281382, "grad_norm": 3.332679033279419, "learning_rate": 4.6981162453711154e-07, "loss": 0.0169, "step": 3421 }, { "epoch": 0.539811491895729, "grad_norm": 6.710604190826416, "learning_rate": 4.6965061986797615e-07, "loss": 0.0864, "step": 3422 }, { "epoch": 0.5399692392633197, "grad_norm": 5.891706466674805, "learning_rate": 4.694896151988407e-07, "loss": 0.0597, "step": 3423 }, { "epoch": 0.5401269866309106, "grad_norm": 6.030353546142578, "learning_rate": 4.6932861052970536e-07, "loss": 0.0466, "step": 3424 }, { "epoch": 0.5402847339985014, "grad_norm": 5.598007678985596, "learning_rate": 4.691676058605699e-07, "loss": 0.0471, "step": 3425 }, { "epoch": 0.5404424813660922, "grad_norm": 8.186391830444336, "learning_rate": 4.690066011914345e-07, "loss": 0.0636, "step": 3426 }, { "epoch": 0.540600228733683, "grad_norm": 8.03498649597168, "learning_rate": 4.6884559652229913e-07, "loss": 0.0664, "step": 3427 }, { "epoch": 0.5407579761012739, "grad_norm": 3.3238210678100586, "learning_rate": 4.6868459185316374e-07, "loss": 0.0298, "step": 3428 }, { "epoch": 0.5409157234688646, "grad_norm": 3.5968828201293945, "learning_rate": 4.685235871840283e-07, "loss": 0.0577, "step": 3429 }, { "epoch": 0.5410734708364554, "grad_norm": 5.372389316558838, "learning_rate": 4.683625825148929e-07, "loss": 0.0455, "step": 3430 }, { "epoch": 0.5412312182040462, "grad_norm": 1.568935751914978, "learning_rate": 4.682015778457575e-07, "loss": 0.014, "step": 3431 }, { "epoch": 0.5413889655716371, "grad_norm": 3.2657830715179443, "learning_rate": 4.680405731766221e-07, "loss": 0.0208, "step": 3432 }, { "epoch": 0.5415467129392278, "grad_norm": 8.174827575683594, "learning_rate": 4.678795685074867e-07, "loss": 0.034, "step": 3433 }, { "epoch": 0.5417044603068186, "grad_norm": 2.3991222381591797, "learning_rate": 4.6771856383835127e-07, "loss": 0.0119, "step": 3434 }, { "epoch": 0.5418622076744094, "grad_norm": 6.57170295715332, "learning_rate": 4.675575591692159e-07, "loss": 0.0433, "step": 3435 }, { "epoch": 0.5420199550420003, "grad_norm": 2.901010036468506, "learning_rate": 4.673965545000805e-07, "loss": 0.039, "step": 3436 }, { "epoch": 0.5421777024095911, "grad_norm": 5.223124027252197, "learning_rate": 4.672355498309451e-07, "loss": 0.0515, "step": 3437 }, { "epoch": 0.5423354497771818, "grad_norm": 3.257049560546875, "learning_rate": 4.6707454516180964e-07, "loss": 0.0343, "step": 3438 }, { "epoch": 0.5424931971447726, "grad_norm": 11.468480110168457, "learning_rate": 4.669135404926743e-07, "loss": 0.0522, "step": 3439 }, { "epoch": 0.5426509445123634, "grad_norm": 8.88228988647461, "learning_rate": 4.6675253582353885e-07, "loss": 0.0772, "step": 3440 }, { "epoch": 0.5428086918799543, "grad_norm": 3.969876289367676, "learning_rate": 4.6659153115440346e-07, "loss": 0.0363, "step": 3441 }, { "epoch": 0.5429664392475451, "grad_norm": 8.859699249267578, "learning_rate": 4.6643052648526806e-07, "loss": 0.0277, "step": 3442 }, { "epoch": 0.5431241866151358, "grad_norm": 6.532208442687988, "learning_rate": 4.6626952181613267e-07, "loss": 0.023, "step": 3443 }, { "epoch": 0.5432819339827266, "grad_norm": 145.56295776367188, "learning_rate": 4.661085171469972e-07, "loss": 0.0473, "step": 3444 }, { "epoch": 0.5434396813503175, "grad_norm": 5.529518127441406, "learning_rate": 4.6594751247786183e-07, "loss": 0.0286, "step": 3445 }, { "epoch": 0.5435974287179083, "grad_norm": 5.401988983154297, "learning_rate": 4.6578650780872644e-07, "loss": 0.0503, "step": 3446 }, { "epoch": 0.5437551760854991, "grad_norm": 6.497735023498535, "learning_rate": 4.6562550313959104e-07, "loss": 0.0424, "step": 3447 }, { "epoch": 0.5439129234530898, "grad_norm": 5.0852203369140625, "learning_rate": 4.654644984704556e-07, "loss": 0.0297, "step": 3448 }, { "epoch": 0.5440706708206807, "grad_norm": 3.3863515853881836, "learning_rate": 4.653034938013202e-07, "loss": 0.0153, "step": 3449 }, { "epoch": 0.5442284181882715, "grad_norm": 2.5436203479766846, "learning_rate": 4.651424891321848e-07, "loss": 0.0303, "step": 3450 }, { "epoch": 0.5443861655558623, "grad_norm": 7.061236381530762, "learning_rate": 4.649814844630494e-07, "loss": 0.0787, "step": 3451 }, { "epoch": 0.544543912923453, "grad_norm": 2.437614679336548, "learning_rate": 4.6482047979391397e-07, "loss": 0.0102, "step": 3452 }, { "epoch": 0.5447016602910439, "grad_norm": 1.9190566539764404, "learning_rate": 4.646594751247786e-07, "loss": 0.0167, "step": 3453 }, { "epoch": 0.5448594076586347, "grad_norm": 4.904125213623047, "learning_rate": 4.644984704556432e-07, "loss": 0.0565, "step": 3454 }, { "epoch": 0.5450171550262255, "grad_norm": 5.0562005043029785, "learning_rate": 4.643374657865078e-07, "loss": 0.0126, "step": 3455 }, { "epoch": 0.5451749023938163, "grad_norm": 2.606792449951172, "learning_rate": 4.641764611173724e-07, "loss": 0.0179, "step": 3456 }, { "epoch": 0.545332649761407, "grad_norm": 6.193460464477539, "learning_rate": 4.64015456448237e-07, "loss": 0.0728, "step": 3457 }, { "epoch": 0.5454903971289979, "grad_norm": 7.9752116203308105, "learning_rate": 4.638544517791016e-07, "loss": 0.059, "step": 3458 }, { "epoch": 0.5456481444965887, "grad_norm": 5.858904838562012, "learning_rate": 4.6369344710996616e-07, "loss": 0.082, "step": 3459 }, { "epoch": 0.5458058918641795, "grad_norm": 4.883467197418213, "learning_rate": 4.6353244244083077e-07, "loss": 0.0474, "step": 3460 }, { "epoch": 0.5459636392317703, "grad_norm": 1.261867880821228, "learning_rate": 4.6337143777169537e-07, "loss": 0.008, "step": 3461 }, { "epoch": 0.5461213865993612, "grad_norm": 11.196954727172852, "learning_rate": 4.6321043310256e-07, "loss": 0.0654, "step": 3462 }, { "epoch": 0.5462791339669519, "grad_norm": 7.130752086639404, "learning_rate": 4.6304942843342453e-07, "loss": 0.067, "step": 3463 }, { "epoch": 0.5464368813345427, "grad_norm": 9.090606689453125, "learning_rate": 4.6288842376428914e-07, "loss": 0.0608, "step": 3464 }, { "epoch": 0.5465946287021335, "grad_norm": 4.5959343910217285, "learning_rate": 4.6272741909515375e-07, "loss": 0.0368, "step": 3465 }, { "epoch": 0.5467523760697244, "grad_norm": 3.92128849029541, "learning_rate": 4.6256641442601835e-07, "loss": 0.0206, "step": 3466 }, { "epoch": 0.5469101234373152, "grad_norm": 4.205663204193115, "learning_rate": 4.624054097568829e-07, "loss": 0.0664, "step": 3467 }, { "epoch": 0.5470678708049059, "grad_norm": 11.866006851196289, "learning_rate": 4.622444050877475e-07, "loss": 0.0505, "step": 3468 }, { "epoch": 0.5472256181724967, "grad_norm": 5.364625930786133, "learning_rate": 4.620834004186121e-07, "loss": 0.0548, "step": 3469 }, { "epoch": 0.5473833655400876, "grad_norm": 4.128734588623047, "learning_rate": 4.619223957494767e-07, "loss": 0.0664, "step": 3470 }, { "epoch": 0.5475411129076784, "grad_norm": 6.497686862945557, "learning_rate": 4.617613910803413e-07, "loss": 0.0823, "step": 3471 }, { "epoch": 0.5476988602752692, "grad_norm": 4.692132949829102, "learning_rate": 4.6160038641120594e-07, "loss": 0.0204, "step": 3472 }, { "epoch": 0.5478566076428599, "grad_norm": 3.7560946941375732, "learning_rate": 4.614393817420705e-07, "loss": 0.0337, "step": 3473 }, { "epoch": 0.5480143550104508, "grad_norm": 5.997998237609863, "learning_rate": 4.612783770729351e-07, "loss": 0.0326, "step": 3474 }, { "epoch": 0.5481721023780416, "grad_norm": 4.13024377822876, "learning_rate": 4.6111737240379965e-07, "loss": 0.0536, "step": 3475 }, { "epoch": 0.5483298497456324, "grad_norm": 4.070066928863525, "learning_rate": 4.609563677346643e-07, "loss": 0.0432, "step": 3476 }, { "epoch": 0.5484875971132231, "grad_norm": 2.8549342155456543, "learning_rate": 4.6079536306552886e-07, "loss": 0.0234, "step": 3477 }, { "epoch": 0.5486453444808139, "grad_norm": 5.868303298950195, "learning_rate": 4.6063435839639347e-07, "loss": 0.0337, "step": 3478 }, { "epoch": 0.5488030918484048, "grad_norm": 3.397012710571289, "learning_rate": 4.604733537272581e-07, "loss": 0.011, "step": 3479 }, { "epoch": 0.5489608392159956, "grad_norm": 8.067663192749023, "learning_rate": 4.603123490581227e-07, "loss": 0.0464, "step": 3480 }, { "epoch": 0.5491185865835864, "grad_norm": 3.4272048473358154, "learning_rate": 4.601513443889873e-07, "loss": 0.0421, "step": 3481 }, { "epoch": 0.5492763339511771, "grad_norm": 5.533441543579102, "learning_rate": 4.5999033971985184e-07, "loss": 0.0545, "step": 3482 }, { "epoch": 0.549434081318768, "grad_norm": 4.946138381958008, "learning_rate": 4.5982933505071645e-07, "loss": 0.0834, "step": 3483 }, { "epoch": 0.5495918286863588, "grad_norm": 4.812375545501709, "learning_rate": 4.5966833038158105e-07, "loss": 0.0384, "step": 3484 }, { "epoch": 0.5497495760539496, "grad_norm": 3.619523286819458, "learning_rate": 4.5950732571244566e-07, "loss": 0.0135, "step": 3485 }, { "epoch": 0.5499073234215404, "grad_norm": 6.208311080932617, "learning_rate": 4.593463210433102e-07, "loss": 0.0358, "step": 3486 }, { "epoch": 0.5500650707891312, "grad_norm": 4.9354248046875, "learning_rate": 4.5918531637417487e-07, "loss": 0.0558, "step": 3487 }, { "epoch": 0.550222818156722, "grad_norm": 4.226499557495117, "learning_rate": 4.590243117050394e-07, "loss": 0.0209, "step": 3488 }, { "epoch": 0.5503805655243128, "grad_norm": 7.435657024383545, "learning_rate": 4.5886330703590403e-07, "loss": 0.0549, "step": 3489 }, { "epoch": 0.5505383128919036, "grad_norm": 5.155275344848633, "learning_rate": 4.587023023667686e-07, "loss": 0.0373, "step": 3490 }, { "epoch": 0.5506960602594945, "grad_norm": 5.168789863586426, "learning_rate": 4.5854129769763324e-07, "loss": 0.0215, "step": 3491 }, { "epoch": 0.5508538076270852, "grad_norm": 6.964191436767578, "learning_rate": 4.583802930284978e-07, "loss": 0.0551, "step": 3492 }, { "epoch": 0.551011554994676, "grad_norm": 4.6709885597229, "learning_rate": 4.582192883593624e-07, "loss": 0.0284, "step": 3493 }, { "epoch": 0.5511693023622668, "grad_norm": 6.141894340515137, "learning_rate": 4.5805828369022696e-07, "loss": 0.0963, "step": 3494 }, { "epoch": 0.5513270497298577, "grad_norm": 1.8826123476028442, "learning_rate": 4.578972790210916e-07, "loss": 0.014, "step": 3495 }, { "epoch": 0.5514847970974485, "grad_norm": 5.1856584548950195, "learning_rate": 4.5773627435195617e-07, "loss": 0.0543, "step": 3496 }, { "epoch": 0.5516425444650392, "grad_norm": 7.011806964874268, "learning_rate": 4.575752696828208e-07, "loss": 0.0453, "step": 3497 }, { "epoch": 0.55180029183263, "grad_norm": 4.731129169464111, "learning_rate": 4.5741426501368533e-07, "loss": 0.042, "step": 3498 }, { "epoch": 0.5519580392002208, "grad_norm": 4.678847312927246, "learning_rate": 4.5725326034455e-07, "loss": 0.0519, "step": 3499 }, { "epoch": 0.5521157865678117, "grad_norm": 3.7979085445404053, "learning_rate": 4.5709225567541454e-07, "loss": 0.0578, "step": 3500 }, { "epoch": 0.5522735339354025, "grad_norm": 7.19497013092041, "learning_rate": 4.5693125100627915e-07, "loss": 0.0694, "step": 3501 }, { "epoch": 0.5524312813029932, "grad_norm": 5.810506343841553, "learning_rate": 4.567702463371438e-07, "loss": 0.0364, "step": 3502 }, { "epoch": 0.552589028670584, "grad_norm": 4.69302225112915, "learning_rate": 4.5660924166800836e-07, "loss": 0.1141, "step": 3503 }, { "epoch": 0.5527467760381749, "grad_norm": 3.8634958267211914, "learning_rate": 4.5644823699887297e-07, "loss": 0.0404, "step": 3504 }, { "epoch": 0.5529045234057657, "grad_norm": 7.281254768371582, "learning_rate": 4.562872323297375e-07, "loss": 0.1027, "step": 3505 }, { "epoch": 0.5530622707733565, "grad_norm": 5.159707069396973, "learning_rate": 4.561262276606022e-07, "loss": 0.0189, "step": 3506 }, { "epoch": 0.5532200181409472, "grad_norm": 5.031998157501221, "learning_rate": 4.5596522299146673e-07, "loss": 0.0293, "step": 3507 }, { "epoch": 0.5533777655085381, "grad_norm": 4.738980293273926, "learning_rate": 4.5580421832233134e-07, "loss": 0.0493, "step": 3508 }, { "epoch": 0.5535355128761289, "grad_norm": 6.045560359954834, "learning_rate": 4.556432136531959e-07, "loss": 0.0848, "step": 3509 }, { "epoch": 0.5536932602437197, "grad_norm": 2.5067272186279297, "learning_rate": 4.5548220898406055e-07, "loss": 0.0101, "step": 3510 }, { "epoch": 0.5538510076113105, "grad_norm": 3.580920696258545, "learning_rate": 4.553212043149251e-07, "loss": 0.0405, "step": 3511 }, { "epoch": 0.5540087549789013, "grad_norm": 4.838515758514404, "learning_rate": 4.551601996457897e-07, "loss": 0.0187, "step": 3512 }, { "epoch": 0.5541665023464921, "grad_norm": 5.049856185913086, "learning_rate": 4.5499919497665427e-07, "loss": 0.0584, "step": 3513 }, { "epoch": 0.5543242497140829, "grad_norm": 1.609534502029419, "learning_rate": 4.548381903075189e-07, "loss": 0.0104, "step": 3514 }, { "epoch": 0.5544819970816737, "grad_norm": 4.600698471069336, "learning_rate": 4.546771856383835e-07, "loss": 0.0333, "step": 3515 }, { "epoch": 0.5546397444492644, "grad_norm": 7.993060111999512, "learning_rate": 4.545161809692481e-07, "loss": 0.0558, "step": 3516 }, { "epoch": 0.5547974918168553, "grad_norm": 3.101815938949585, "learning_rate": 4.5435517630011264e-07, "loss": 0.0228, "step": 3517 }, { "epoch": 0.5549552391844461, "grad_norm": 5.370943546295166, "learning_rate": 4.541941716309773e-07, "loss": 0.1041, "step": 3518 }, { "epoch": 0.5551129865520369, "grad_norm": 7.163643836975098, "learning_rate": 4.5403316696184185e-07, "loss": 0.0535, "step": 3519 }, { "epoch": 0.5552707339196277, "grad_norm": 3.8371834754943848, "learning_rate": 4.5387216229270646e-07, "loss": 0.0255, "step": 3520 }, { "epoch": 0.5554284812872186, "grad_norm": 8.119474411010742, "learning_rate": 4.5371115762357106e-07, "loss": 0.0674, "step": 3521 }, { "epoch": 0.5555862286548093, "grad_norm": 4.2197980880737305, "learning_rate": 4.5355015295443567e-07, "loss": 0.0612, "step": 3522 }, { "epoch": 0.5557439760224001, "grad_norm": 6.642730712890625, "learning_rate": 4.533891482853002e-07, "loss": 0.0458, "step": 3523 }, { "epoch": 0.5559017233899909, "grad_norm": 7.063992023468018, "learning_rate": 4.5322814361616483e-07, "loss": 0.0785, "step": 3524 }, { "epoch": 0.5560594707575818, "grad_norm": 4.294404029846191, "learning_rate": 4.530671389470295e-07, "loss": 0.0348, "step": 3525 }, { "epoch": 0.5562172181251726, "grad_norm": 4.23181676864624, "learning_rate": 4.5290613427789404e-07, "loss": 0.0357, "step": 3526 }, { "epoch": 0.5563749654927633, "grad_norm": 10.125638008117676, "learning_rate": 4.5274512960875865e-07, "loss": 0.0384, "step": 3527 }, { "epoch": 0.5565327128603541, "grad_norm": 5.969682693481445, "learning_rate": 4.525841249396232e-07, "loss": 0.0476, "step": 3528 }, { "epoch": 0.556690460227945, "grad_norm": 4.976404666900635, "learning_rate": 4.5242312027048786e-07, "loss": 0.0816, "step": 3529 }, { "epoch": 0.5568482075955358, "grad_norm": 5.278900146484375, "learning_rate": 4.522621156013524e-07, "loss": 0.0294, "step": 3530 }, { "epoch": 0.5570059549631265, "grad_norm": 0.712295413017273, "learning_rate": 4.52101110932217e-07, "loss": 0.0058, "step": 3531 }, { "epoch": 0.5571637023307173, "grad_norm": 4.528964519500732, "learning_rate": 4.519401062630816e-07, "loss": 0.0404, "step": 3532 }, { "epoch": 0.5573214496983082, "grad_norm": 5.830653190612793, "learning_rate": 4.5177910159394623e-07, "loss": 0.0474, "step": 3533 }, { "epoch": 0.557479197065899, "grad_norm": 5.37799072265625, "learning_rate": 4.516180969248108e-07, "loss": 0.0791, "step": 3534 }, { "epoch": 0.5576369444334898, "grad_norm": 4.497136116027832, "learning_rate": 4.514570922556754e-07, "loss": 0.0459, "step": 3535 }, { "epoch": 0.5577946918010805, "grad_norm": 12.904130935668945, "learning_rate": 4.5129608758654e-07, "loss": 0.0399, "step": 3536 }, { "epoch": 0.5579524391686713, "grad_norm": 7.827426433563232, "learning_rate": 4.511350829174046e-07, "loss": 0.0686, "step": 3537 }, { "epoch": 0.5581101865362622, "grad_norm": 5.223310947418213, "learning_rate": 4.5097407824826916e-07, "loss": 0.0697, "step": 3538 }, { "epoch": 0.558267933903853, "grad_norm": 5.207493782043457, "learning_rate": 4.5081307357913377e-07, "loss": 0.0308, "step": 3539 }, { "epoch": 0.5584256812714438, "grad_norm": 9.767026901245117, "learning_rate": 4.5065206890999837e-07, "loss": 0.0277, "step": 3540 }, { "epoch": 0.5585834286390345, "grad_norm": 3.7552871704101562, "learning_rate": 4.50491064240863e-07, "loss": 0.0664, "step": 3541 }, { "epoch": 0.5587411760066254, "grad_norm": 3.1752941608428955, "learning_rate": 4.5033005957172753e-07, "loss": 0.0214, "step": 3542 }, { "epoch": 0.5588989233742162, "grad_norm": 3.1909866333007812, "learning_rate": 4.5016905490259214e-07, "loss": 0.0194, "step": 3543 }, { "epoch": 0.559056670741807, "grad_norm": 7.0103325843811035, "learning_rate": 4.5000805023345674e-07, "loss": 0.1283, "step": 3544 }, { "epoch": 0.5592144181093978, "grad_norm": 8.944089889526367, "learning_rate": 4.4984704556432135e-07, "loss": 0.0369, "step": 3545 }, { "epoch": 0.5593721654769886, "grad_norm": 2.797886848449707, "learning_rate": 4.496860408951859e-07, "loss": 0.0237, "step": 3546 }, { "epoch": 0.5595299128445794, "grad_norm": 4.817679405212402, "learning_rate": 4.495250362260505e-07, "loss": 0.0402, "step": 3547 }, { "epoch": 0.5596876602121702, "grad_norm": 4.421696662902832, "learning_rate": 4.4936403155691517e-07, "loss": 0.0787, "step": 3548 }, { "epoch": 0.559845407579761, "grad_norm": 4.182826519012451, "learning_rate": 4.492030268877797e-07, "loss": 0.0542, "step": 3549 }, { "epoch": 0.5600031549473519, "grad_norm": 4.986604690551758, "learning_rate": 4.4904202221864433e-07, "loss": 0.06, "step": 3550 }, { "epoch": 0.5601609023149426, "grad_norm": 6.201351165771484, "learning_rate": 4.4888101754950894e-07, "loss": 0.0344, "step": 3551 }, { "epoch": 0.5603186496825334, "grad_norm": 9.955351829528809, "learning_rate": 4.4872001288037354e-07, "loss": 0.0868, "step": 3552 }, { "epoch": 0.5604763970501242, "grad_norm": 6.308249473571777, "learning_rate": 4.485590082112381e-07, "loss": 0.0451, "step": 3553 }, { "epoch": 0.560634144417715, "grad_norm": 5.036297798156738, "learning_rate": 4.483980035421027e-07, "loss": 0.034, "step": 3554 }, { "epoch": 0.5607918917853059, "grad_norm": 6.0406293869018555, "learning_rate": 4.482369988729673e-07, "loss": 0.0182, "step": 3555 }, { "epoch": 0.5609496391528966, "grad_norm": 6.802767753601074, "learning_rate": 4.480759942038319e-07, "loss": 0.0706, "step": 3556 }, { "epoch": 0.5611073865204874, "grad_norm": 2.260598659515381, "learning_rate": 4.4791498953469647e-07, "loss": 0.0112, "step": 3557 }, { "epoch": 0.5612651338880782, "grad_norm": 5.509818077087402, "learning_rate": 4.477539848655611e-07, "loss": 0.0924, "step": 3558 }, { "epoch": 0.5614228812556691, "grad_norm": 2.3354532718658447, "learning_rate": 4.475929801964257e-07, "loss": 0.033, "step": 3559 }, { "epoch": 0.5615806286232599, "grad_norm": 1.7844414710998535, "learning_rate": 4.474319755272903e-07, "loss": 0.0152, "step": 3560 }, { "epoch": 0.5617383759908506, "grad_norm": 6.738806247711182, "learning_rate": 4.4727097085815484e-07, "loss": 0.0756, "step": 3561 }, { "epoch": 0.5618961233584414, "grad_norm": 4.664470195770264, "learning_rate": 4.4710996618901945e-07, "loss": 0.0299, "step": 3562 }, { "epoch": 0.5620538707260323, "grad_norm": 7.961429119110107, "learning_rate": 4.4694896151988405e-07, "loss": 0.0919, "step": 3563 }, { "epoch": 0.5622116180936231, "grad_norm": 8.812355995178223, "learning_rate": 4.4678795685074866e-07, "loss": 0.0479, "step": 3564 }, { "epoch": 0.5623693654612139, "grad_norm": 2.8166677951812744, "learning_rate": 4.466269521816132e-07, "loss": 0.0226, "step": 3565 }, { "epoch": 0.5625271128288046, "grad_norm": 4.626858234405518, "learning_rate": 4.4646594751247787e-07, "loss": 0.0438, "step": 3566 }, { "epoch": 0.5626848601963955, "grad_norm": 6.821376800537109, "learning_rate": 4.463049428433424e-07, "loss": 0.0307, "step": 3567 }, { "epoch": 0.5628426075639863, "grad_norm": 1.3527711629867554, "learning_rate": 4.4614393817420703e-07, "loss": 0.0126, "step": 3568 }, { "epoch": 0.5630003549315771, "grad_norm": 6.533590793609619, "learning_rate": 4.459829335050716e-07, "loss": 0.0544, "step": 3569 }, { "epoch": 0.5631581022991679, "grad_norm": 4.931443214416504, "learning_rate": 4.4582192883593624e-07, "loss": 0.0374, "step": 3570 }, { "epoch": 0.5633158496667587, "grad_norm": 4.620832920074463, "learning_rate": 4.4566092416680085e-07, "loss": 0.0592, "step": 3571 }, { "epoch": 0.5634735970343495, "grad_norm": 4.254405498504639, "learning_rate": 4.454999194976654e-07, "loss": 0.0428, "step": 3572 }, { "epoch": 0.5636313444019403, "grad_norm": 3.803039312362671, "learning_rate": 4.4533891482853e-07, "loss": 0.0283, "step": 3573 }, { "epoch": 0.5637890917695311, "grad_norm": 1.895159125328064, "learning_rate": 4.451779101593946e-07, "loss": 0.0143, "step": 3574 }, { "epoch": 0.5639468391371218, "grad_norm": 8.910772323608398, "learning_rate": 4.450169054902592e-07, "loss": 0.0906, "step": 3575 }, { "epoch": 0.5641045865047127, "grad_norm": 7.452020168304443, "learning_rate": 4.448559008211238e-07, "loss": 0.0389, "step": 3576 }, { "epoch": 0.5642623338723035, "grad_norm": 6.074785232543945, "learning_rate": 4.446948961519884e-07, "loss": 0.0544, "step": 3577 }, { "epoch": 0.5644200812398943, "grad_norm": 2.867938756942749, "learning_rate": 4.44533891482853e-07, "loss": 0.0141, "step": 3578 }, { "epoch": 0.5645778286074851, "grad_norm": 5.436214447021484, "learning_rate": 4.443728868137176e-07, "loss": 0.0427, "step": 3579 }, { "epoch": 0.564735575975076, "grad_norm": 5.736272811889648, "learning_rate": 4.4421188214458215e-07, "loss": 0.0629, "step": 3580 }, { "epoch": 0.5648933233426667, "grad_norm": 7.325777530670166, "learning_rate": 4.440508774754468e-07, "loss": 0.0482, "step": 3581 }, { "epoch": 0.5650510707102575, "grad_norm": 3.6818907260894775, "learning_rate": 4.4388987280631136e-07, "loss": 0.0268, "step": 3582 }, { "epoch": 0.5652088180778483, "grad_norm": 5.712748050689697, "learning_rate": 4.4372886813717597e-07, "loss": 0.0583, "step": 3583 }, { "epoch": 0.5653665654454392, "grad_norm": 2.976290464401245, "learning_rate": 4.435678634680405e-07, "loss": 0.0155, "step": 3584 }, { "epoch": 0.56552431281303, "grad_norm": 4.396505832672119, "learning_rate": 4.434068587989052e-07, "loss": 0.0215, "step": 3585 }, { "epoch": 0.5656820601806207, "grad_norm": 3.87809157371521, "learning_rate": 4.4324585412976973e-07, "loss": 0.0238, "step": 3586 }, { "epoch": 0.5658398075482115, "grad_norm": 5.459232807159424, "learning_rate": 4.4308484946063434e-07, "loss": 0.0462, "step": 3587 }, { "epoch": 0.5659975549158024, "grad_norm": 7.381917476654053, "learning_rate": 4.429238447914989e-07, "loss": 0.0517, "step": 3588 }, { "epoch": 0.5661553022833932, "grad_norm": 6.301811218261719, "learning_rate": 4.4276284012236355e-07, "loss": 0.0395, "step": 3589 }, { "epoch": 0.566313049650984, "grad_norm": 6.701537609100342, "learning_rate": 4.426018354532281e-07, "loss": 0.064, "step": 3590 }, { "epoch": 0.5664707970185747, "grad_norm": 6.956577301025391, "learning_rate": 4.424408307840927e-07, "loss": 0.027, "step": 3591 }, { "epoch": 0.5666285443861656, "grad_norm": 6.830028057098389, "learning_rate": 4.4227982611495726e-07, "loss": 0.0788, "step": 3592 }, { "epoch": 0.5667862917537564, "grad_norm": 5.745498180389404, "learning_rate": 4.421188214458219e-07, "loss": 0.0823, "step": 3593 }, { "epoch": 0.5669440391213472, "grad_norm": 8.539166450500488, "learning_rate": 4.419578167766865e-07, "loss": 0.021, "step": 3594 }, { "epoch": 0.5671017864889379, "grad_norm": 8.19500732421875, "learning_rate": 4.417968121075511e-07, "loss": 0.0428, "step": 3595 }, { "epoch": 0.5672595338565287, "grad_norm": 5.17337703704834, "learning_rate": 4.4163580743841574e-07, "loss": 0.0982, "step": 3596 }, { "epoch": 0.5674172812241196, "grad_norm": 5.357944965362549, "learning_rate": 4.414748027692803e-07, "loss": 0.0188, "step": 3597 }, { "epoch": 0.5675750285917104, "grad_norm": 5.059239864349365, "learning_rate": 4.413137981001449e-07, "loss": 0.0285, "step": 3598 }, { "epoch": 0.5677327759593012, "grad_norm": 3.6183338165283203, "learning_rate": 4.4115279343100946e-07, "loss": 0.0534, "step": 3599 }, { "epoch": 0.5678905233268919, "grad_norm": 3.2750072479248047, "learning_rate": 4.409917887618741e-07, "loss": 0.0229, "step": 3600 }, { "epoch": 0.5680482706944828, "grad_norm": 6.083227634429932, "learning_rate": 4.4083078409273867e-07, "loss": 0.0634, "step": 3601 }, { "epoch": 0.5682060180620736, "grad_norm": 6.175106525421143, "learning_rate": 4.406697794236033e-07, "loss": 0.0628, "step": 3602 }, { "epoch": 0.5683637654296644, "grad_norm": 3.048811197280884, "learning_rate": 4.4050877475446783e-07, "loss": 0.0191, "step": 3603 }, { "epoch": 0.5685215127972552, "grad_norm": 3.7438251972198486, "learning_rate": 4.403477700853325e-07, "loss": 0.034, "step": 3604 }, { "epoch": 0.568679260164846, "grad_norm": 2.688021659851074, "learning_rate": 4.4018676541619704e-07, "loss": 0.0154, "step": 3605 }, { "epoch": 0.5688370075324368, "grad_norm": 4.568440914154053, "learning_rate": 4.4002576074706165e-07, "loss": 0.0536, "step": 3606 }, { "epoch": 0.5689947549000276, "grad_norm": 4.3514323234558105, "learning_rate": 4.398647560779262e-07, "loss": 0.0249, "step": 3607 }, { "epoch": 0.5691525022676184, "grad_norm": 9.23238468170166, "learning_rate": 4.3970375140879086e-07, "loss": 0.072, "step": 3608 }, { "epoch": 0.5693102496352093, "grad_norm": 25.24394989013672, "learning_rate": 4.395427467396554e-07, "loss": 0.0431, "step": 3609 }, { "epoch": 0.5694679970028, "grad_norm": 5.3971052169799805, "learning_rate": 4.3938174207052e-07, "loss": 0.0692, "step": 3610 }, { "epoch": 0.5696257443703908, "grad_norm": 1.751876950263977, "learning_rate": 4.392207374013846e-07, "loss": 0.0128, "step": 3611 }, { "epoch": 0.5697834917379816, "grad_norm": 4.0099005699157715, "learning_rate": 4.3905973273224923e-07, "loss": 0.0545, "step": 3612 }, { "epoch": 0.5699412391055724, "grad_norm": 5.732677936553955, "learning_rate": 4.388987280631138e-07, "loss": 0.0563, "step": 3613 }, { "epoch": 0.5700989864731633, "grad_norm": 6.623098373413086, "learning_rate": 4.387377233939784e-07, "loss": 0.0269, "step": 3614 }, { "epoch": 0.570256733840754, "grad_norm": 7.234887599945068, "learning_rate": 4.38576718724843e-07, "loss": 0.0952, "step": 3615 }, { "epoch": 0.5704144812083448, "grad_norm": 5.600711345672607, "learning_rate": 4.384157140557076e-07, "loss": 0.0298, "step": 3616 }, { "epoch": 0.5705722285759356, "grad_norm": 5.513890743255615, "learning_rate": 4.3825470938657216e-07, "loss": 0.0275, "step": 3617 }, { "epoch": 0.5707299759435265, "grad_norm": 4.5403337478637695, "learning_rate": 4.3809370471743676e-07, "loss": 0.0458, "step": 3618 }, { "epoch": 0.5708877233111173, "grad_norm": 2.449573516845703, "learning_rate": 4.379327000483014e-07, "loss": 0.0124, "step": 3619 }, { "epoch": 0.571045470678708, "grad_norm": 6.583084583282471, "learning_rate": 4.37771695379166e-07, "loss": 0.0841, "step": 3620 }, { "epoch": 0.5712032180462988, "grad_norm": 3.6035895347595215, "learning_rate": 4.376106907100306e-07, "loss": 0.0375, "step": 3621 }, { "epoch": 0.5713609654138897, "grad_norm": 9.386648178100586, "learning_rate": 4.3744968604089514e-07, "loss": 0.0745, "step": 3622 }, { "epoch": 0.5715187127814805, "grad_norm": 5.065177917480469, "learning_rate": 4.372886813717598e-07, "loss": 0.0708, "step": 3623 }, { "epoch": 0.5716764601490713, "grad_norm": 5.430810928344727, "learning_rate": 4.3712767670262435e-07, "loss": 0.0348, "step": 3624 }, { "epoch": 0.571834207516662, "grad_norm": 2.6487369537353516, "learning_rate": 4.3696667203348896e-07, "loss": 0.0116, "step": 3625 }, { "epoch": 0.5719919548842529, "grad_norm": 6.235917568206787, "learning_rate": 4.3680566736435356e-07, "loss": 0.0348, "step": 3626 }, { "epoch": 0.5721497022518437, "grad_norm": 6.639988899230957, "learning_rate": 4.3664466269521817e-07, "loss": 0.0418, "step": 3627 }, { "epoch": 0.5723074496194345, "grad_norm": 24.25504493713379, "learning_rate": 4.364836580260827e-07, "loss": 0.0543, "step": 3628 }, { "epoch": 0.5724651969870252, "grad_norm": 5.3725175857543945, "learning_rate": 4.3632265335694733e-07, "loss": 0.0462, "step": 3629 }, { "epoch": 0.5726229443546161, "grad_norm": 2.526221752166748, "learning_rate": 4.3616164868781193e-07, "loss": 0.0144, "step": 3630 }, { "epoch": 0.5727806917222069, "grad_norm": 9.240471839904785, "learning_rate": 4.3600064401867654e-07, "loss": 0.0432, "step": 3631 }, { "epoch": 0.5729384390897977, "grad_norm": 7.248495101928711, "learning_rate": 4.358396393495411e-07, "loss": 0.0247, "step": 3632 }, { "epoch": 0.5730961864573885, "grad_norm": 12.687851905822754, "learning_rate": 4.356786346804057e-07, "loss": 0.0641, "step": 3633 }, { "epoch": 0.5732539338249792, "grad_norm": 7.592212677001953, "learning_rate": 4.355176300112703e-07, "loss": 0.0675, "step": 3634 }, { "epoch": 0.5734116811925701, "grad_norm": 6.328648090362549, "learning_rate": 4.353566253421349e-07, "loss": 0.0605, "step": 3635 }, { "epoch": 0.5735694285601609, "grad_norm": 2.8092939853668213, "learning_rate": 4.3519562067299947e-07, "loss": 0.0199, "step": 3636 }, { "epoch": 0.5737271759277517, "grad_norm": 2.8142776489257812, "learning_rate": 4.3503461600386407e-07, "loss": 0.0202, "step": 3637 }, { "epoch": 0.5738849232953425, "grad_norm": 3.231452465057373, "learning_rate": 4.348736113347287e-07, "loss": 0.0193, "step": 3638 }, { "epoch": 0.5740426706629334, "grad_norm": 3.44266676902771, "learning_rate": 4.347126066655933e-07, "loss": 0.0451, "step": 3639 }, { "epoch": 0.5742004180305241, "grad_norm": 7.478565692901611, "learning_rate": 4.3455160199645784e-07, "loss": 0.0898, "step": 3640 }, { "epoch": 0.5743581653981149, "grad_norm": 6.21815824508667, "learning_rate": 4.343905973273225e-07, "loss": 0.0224, "step": 3641 }, { "epoch": 0.5745159127657057, "grad_norm": 4.524927616119385, "learning_rate": 4.342295926581871e-07, "loss": 0.0594, "step": 3642 }, { "epoch": 0.5746736601332966, "grad_norm": 4.425346851348877, "learning_rate": 4.3406858798905166e-07, "loss": 0.0448, "step": 3643 }, { "epoch": 0.5748314075008873, "grad_norm": 2.897800922393799, "learning_rate": 4.3390758331991626e-07, "loss": 0.026, "step": 3644 }, { "epoch": 0.5749891548684781, "grad_norm": 5.019111156463623, "learning_rate": 4.3374657865078087e-07, "loss": 0.059, "step": 3645 }, { "epoch": 0.5751469022360689, "grad_norm": 4.009286880493164, "learning_rate": 4.335855739816455e-07, "loss": 0.0288, "step": 3646 }, { "epoch": 0.5753046496036598, "grad_norm": 7.221583843231201, "learning_rate": 4.3342456931251003e-07, "loss": 0.0882, "step": 3647 }, { "epoch": 0.5754623969712506, "grad_norm": 5.97765588760376, "learning_rate": 4.3326356464337464e-07, "loss": 0.0824, "step": 3648 }, { "epoch": 0.5756201443388413, "grad_norm": 5.377807140350342, "learning_rate": 4.3310255997423924e-07, "loss": 0.0706, "step": 3649 }, { "epoch": 0.5757778917064321, "grad_norm": 2.622694492340088, "learning_rate": 4.3294155530510385e-07, "loss": 0.0229, "step": 3650 }, { "epoch": 0.5759356390740229, "grad_norm": 3.528857707977295, "learning_rate": 4.327805506359684e-07, "loss": 0.0358, "step": 3651 }, { "epoch": 0.5760933864416138, "grad_norm": 2.9627106189727783, "learning_rate": 4.32619545966833e-07, "loss": 0.0154, "step": 3652 }, { "epoch": 0.5762511338092046, "grad_norm": 5.691407203674316, "learning_rate": 4.324585412976976e-07, "loss": 0.0374, "step": 3653 }, { "epoch": 0.5764088811767953, "grad_norm": 7.110293388366699, "learning_rate": 4.322975366285622e-07, "loss": 0.0608, "step": 3654 }, { "epoch": 0.5765666285443861, "grad_norm": 4.146986961364746, "learning_rate": 4.321365319594268e-07, "loss": 0.0254, "step": 3655 }, { "epoch": 0.576724375911977, "grad_norm": 5.152608394622803, "learning_rate": 4.3197552729029143e-07, "loss": 0.0481, "step": 3656 }, { "epoch": 0.5768821232795678, "grad_norm": 7.030542373657227, "learning_rate": 4.31814522621156e-07, "loss": 0.0655, "step": 3657 }, { "epoch": 0.5770398706471586, "grad_norm": 6.886842250823975, "learning_rate": 4.316535179520206e-07, "loss": 0.0739, "step": 3658 }, { "epoch": 0.5771976180147493, "grad_norm": 3.774353504180908, "learning_rate": 4.3149251328288515e-07, "loss": 0.016, "step": 3659 }, { "epoch": 0.5773553653823402, "grad_norm": 5.763378143310547, "learning_rate": 4.313315086137498e-07, "loss": 0.0478, "step": 3660 }, { "epoch": 0.577513112749931, "grad_norm": 4.991652011871338, "learning_rate": 4.3117050394461436e-07, "loss": 0.0489, "step": 3661 }, { "epoch": 0.5776708601175218, "grad_norm": 5.139795303344727, "learning_rate": 4.3100949927547897e-07, "loss": 0.0507, "step": 3662 }, { "epoch": 0.5778286074851126, "grad_norm": 5.135206699371338, "learning_rate": 4.308484946063435e-07, "loss": 0.0642, "step": 3663 }, { "epoch": 0.5779863548527034, "grad_norm": 3.217456340789795, "learning_rate": 4.306874899372082e-07, "loss": 0.03, "step": 3664 }, { "epoch": 0.5781441022202942, "grad_norm": 6.17140007019043, "learning_rate": 4.305264852680728e-07, "loss": 0.106, "step": 3665 }, { "epoch": 0.578301849587885, "grad_norm": 9.629501342773438, "learning_rate": 4.3036548059893734e-07, "loss": 0.0751, "step": 3666 }, { "epoch": 0.5784595969554758, "grad_norm": 4.909124851226807, "learning_rate": 4.3020447592980194e-07, "loss": 0.0262, "step": 3667 }, { "epoch": 0.5786173443230667, "grad_norm": 3.7654974460601807, "learning_rate": 4.3004347126066655e-07, "loss": 0.0168, "step": 3668 }, { "epoch": 0.5787750916906574, "grad_norm": 2.6891441345214844, "learning_rate": 4.2988246659153116e-07, "loss": 0.0144, "step": 3669 }, { "epoch": 0.5789328390582482, "grad_norm": 3.555137872695923, "learning_rate": 4.297214619223957e-07, "loss": 0.0339, "step": 3670 }, { "epoch": 0.579090586425839, "grad_norm": 5.062921047210693, "learning_rate": 4.2956045725326037e-07, "loss": 0.044, "step": 3671 }, { "epoch": 0.5792483337934298, "grad_norm": 6.692593097686768, "learning_rate": 4.293994525841249e-07, "loss": 0.0726, "step": 3672 }, { "epoch": 0.5794060811610207, "grad_norm": 7.257974147796631, "learning_rate": 4.2923844791498953e-07, "loss": 0.0226, "step": 3673 }, { "epoch": 0.5795638285286114, "grad_norm": 8.177369117736816, "learning_rate": 4.290774432458541e-07, "loss": 0.0357, "step": 3674 }, { "epoch": 0.5797215758962022, "grad_norm": 6.1049065589904785, "learning_rate": 4.2891643857671874e-07, "loss": 0.0286, "step": 3675 }, { "epoch": 0.579879323263793, "grad_norm": 1.6039459705352783, "learning_rate": 4.287554339075833e-07, "loss": 0.0111, "step": 3676 }, { "epoch": 0.5800370706313839, "grad_norm": 23.160669326782227, "learning_rate": 4.285944292384479e-07, "loss": 0.0616, "step": 3677 }, { "epoch": 0.5801948179989747, "grad_norm": 6.465285778045654, "learning_rate": 4.2843342456931245e-07, "loss": 0.0437, "step": 3678 }, { "epoch": 0.5803525653665654, "grad_norm": 3.6227431297302246, "learning_rate": 4.282724199001771e-07, "loss": 0.0228, "step": 3679 }, { "epoch": 0.5805103127341562, "grad_norm": 4.434861183166504, "learning_rate": 4.2811141523104167e-07, "loss": 0.0439, "step": 3680 }, { "epoch": 0.5806680601017471, "grad_norm": 4.3915228843688965, "learning_rate": 4.279504105619063e-07, "loss": 0.0152, "step": 3681 }, { "epoch": 0.5808258074693379, "grad_norm": 10.597392082214355, "learning_rate": 4.2778940589277083e-07, "loss": 0.0628, "step": 3682 }, { "epoch": 0.5809835548369287, "grad_norm": 11.902427673339844, "learning_rate": 4.276284012236355e-07, "loss": 0.064, "step": 3683 }, { "epoch": 0.5811413022045194, "grad_norm": 6.83543586730957, "learning_rate": 4.2746739655450004e-07, "loss": 0.0525, "step": 3684 }, { "epoch": 0.5812990495721103, "grad_norm": 3.0453226566314697, "learning_rate": 4.2730639188536465e-07, "loss": 0.0467, "step": 3685 }, { "epoch": 0.5814567969397011, "grad_norm": 5.81014347076416, "learning_rate": 4.271453872162292e-07, "loss": 0.0578, "step": 3686 }, { "epoch": 0.5816145443072919, "grad_norm": 2.190941333770752, "learning_rate": 4.2698438254709386e-07, "loss": 0.0091, "step": 3687 }, { "epoch": 0.5817722916748826, "grad_norm": 3.178262710571289, "learning_rate": 4.2682337787795847e-07, "loss": 0.0694, "step": 3688 }, { "epoch": 0.5819300390424734, "grad_norm": 4.956207752227783, "learning_rate": 4.26662373208823e-07, "loss": 0.033, "step": 3689 }, { "epoch": 0.5820877864100643, "grad_norm": 7.0127105712890625, "learning_rate": 4.265013685396877e-07, "loss": 0.0465, "step": 3690 }, { "epoch": 0.5822455337776551, "grad_norm": 0.9440305829048157, "learning_rate": 4.2634036387055223e-07, "loss": 0.005, "step": 3691 }, { "epoch": 0.5824032811452459, "grad_norm": 5.224591255187988, "learning_rate": 4.2617935920141684e-07, "loss": 0.0556, "step": 3692 }, { "epoch": 0.5825610285128366, "grad_norm": 6.958468914031982, "learning_rate": 4.260183545322814e-07, "loss": 0.0655, "step": 3693 }, { "epoch": 0.5827187758804275, "grad_norm": 7.18191385269165, "learning_rate": 4.2585734986314605e-07, "loss": 0.0813, "step": 3694 }, { "epoch": 0.5828765232480183, "grad_norm": 6.075129985809326, "learning_rate": 4.256963451940106e-07, "loss": 0.0164, "step": 3695 }, { "epoch": 0.5830342706156091, "grad_norm": 4.780632495880127, "learning_rate": 4.255353405248752e-07, "loss": 0.0145, "step": 3696 }, { "epoch": 0.5831920179831999, "grad_norm": 5.390576362609863, "learning_rate": 4.2537433585573976e-07, "loss": 0.0299, "step": 3697 }, { "epoch": 0.5833497653507907, "grad_norm": 6.3759846687316895, "learning_rate": 4.252133311866044e-07, "loss": 0.0443, "step": 3698 }, { "epoch": 0.5835075127183815, "grad_norm": 4.434875011444092, "learning_rate": 4.25052326517469e-07, "loss": 0.0518, "step": 3699 }, { "epoch": 0.5836652600859723, "grad_norm": 5.077605724334717, "learning_rate": 4.248913218483336e-07, "loss": 0.0371, "step": 3700 }, { "epoch": 0.5838230074535631, "grad_norm": 3.5893185138702393, "learning_rate": 4.2473031717919814e-07, "loss": 0.0552, "step": 3701 }, { "epoch": 0.583980754821154, "grad_norm": 3.928100347518921, "learning_rate": 4.245693125100628e-07, "loss": 0.0393, "step": 3702 }, { "epoch": 0.5841385021887447, "grad_norm": 4.908411026000977, "learning_rate": 4.2440830784092735e-07, "loss": 0.0357, "step": 3703 }, { "epoch": 0.5842962495563355, "grad_norm": 4.357424736022949, "learning_rate": 4.2424730317179195e-07, "loss": 0.0249, "step": 3704 }, { "epoch": 0.5844539969239263, "grad_norm": 5.003686904907227, "learning_rate": 4.2408629850265656e-07, "loss": 0.0497, "step": 3705 }, { "epoch": 0.5846117442915172, "grad_norm": 4.742331027984619, "learning_rate": 4.2392529383352117e-07, "loss": 0.027, "step": 3706 }, { "epoch": 0.584769491659108, "grad_norm": 8.138925552368164, "learning_rate": 4.237642891643857e-07, "loss": 0.0615, "step": 3707 }, { "epoch": 0.5849272390266987, "grad_norm": 11.250143051147461, "learning_rate": 4.2360328449525033e-07, "loss": 0.0441, "step": 3708 }, { "epoch": 0.5850849863942895, "grad_norm": 3.3631234169006348, "learning_rate": 4.2344227982611493e-07, "loss": 0.0212, "step": 3709 }, { "epoch": 0.5852427337618803, "grad_norm": 5.691203594207764, "learning_rate": 4.2328127515697954e-07, "loss": 0.0664, "step": 3710 }, { "epoch": 0.5854004811294712, "grad_norm": 8.211243629455566, "learning_rate": 4.2312027048784415e-07, "loss": 0.0247, "step": 3711 }, { "epoch": 0.585558228497062, "grad_norm": 4.621741771697998, "learning_rate": 4.229592658187087e-07, "loss": 0.0175, "step": 3712 }, { "epoch": 0.5857159758646527, "grad_norm": 7.450106143951416, "learning_rate": 4.2279826114957336e-07, "loss": 0.0811, "step": 3713 }, { "epoch": 0.5858737232322435, "grad_norm": 3.560211181640625, "learning_rate": 4.226372564804379e-07, "loss": 0.0275, "step": 3714 }, { "epoch": 0.5860314705998344, "grad_norm": 5.3267388343811035, "learning_rate": 4.224762518113025e-07, "loss": 0.0469, "step": 3715 }, { "epoch": 0.5861892179674252, "grad_norm": 8.223593711853027, "learning_rate": 4.2231524714216707e-07, "loss": 0.0383, "step": 3716 }, { "epoch": 0.586346965335016, "grad_norm": 7.442598819732666, "learning_rate": 4.2215424247303173e-07, "loss": 0.0441, "step": 3717 }, { "epoch": 0.5865047127026067, "grad_norm": 6.4629807472229, "learning_rate": 4.219932378038963e-07, "loss": 0.0241, "step": 3718 }, { "epoch": 0.5866624600701976, "grad_norm": 5.52560567855835, "learning_rate": 4.218322331347609e-07, "loss": 0.0468, "step": 3719 }, { "epoch": 0.5868202074377884, "grad_norm": 3.707911729812622, "learning_rate": 4.216712284656255e-07, "loss": 0.0286, "step": 3720 }, { "epoch": 0.5869779548053792, "grad_norm": 6.9518585205078125, "learning_rate": 4.215102237964901e-07, "loss": 0.0383, "step": 3721 }, { "epoch": 0.58713570217297, "grad_norm": 5.35220193862915, "learning_rate": 4.2134921912735466e-07, "loss": 0.077, "step": 3722 }, { "epoch": 0.5872934495405608, "grad_norm": 5.377768516540527, "learning_rate": 4.2118821445821926e-07, "loss": 0.0225, "step": 3723 }, { "epoch": 0.5874511969081516, "grad_norm": 7.410951614379883, "learning_rate": 4.2102720978908387e-07, "loss": 0.0295, "step": 3724 }, { "epoch": 0.5876089442757424, "grad_norm": 3.232762336730957, "learning_rate": 4.208662051199485e-07, "loss": 0.0387, "step": 3725 }, { "epoch": 0.5877666916433332, "grad_norm": 4.976396560668945, "learning_rate": 4.2070520045081303e-07, "loss": 0.0253, "step": 3726 }, { "epoch": 0.5879244390109241, "grad_norm": 5.500836372375488, "learning_rate": 4.2054419578167763e-07, "loss": 0.0648, "step": 3727 }, { "epoch": 0.5880821863785148, "grad_norm": 3.2531464099884033, "learning_rate": 4.2038319111254224e-07, "loss": 0.0253, "step": 3728 }, { "epoch": 0.5882399337461056, "grad_norm": 3.775691032409668, "learning_rate": 4.2022218644340685e-07, "loss": 0.0228, "step": 3729 }, { "epoch": 0.5883976811136964, "grad_norm": 6.692509174346924, "learning_rate": 4.200611817742714e-07, "loss": 0.0512, "step": 3730 }, { "epoch": 0.5885554284812872, "grad_norm": 8.168987274169922, "learning_rate": 4.19900177105136e-07, "loss": 0.0431, "step": 3731 }, { "epoch": 0.5887131758488781, "grad_norm": 9.085684776306152, "learning_rate": 4.197391724360006e-07, "loss": 0.0628, "step": 3732 }, { "epoch": 0.5888709232164688, "grad_norm": 3.0501174926757812, "learning_rate": 4.195781677668652e-07, "loss": 0.0272, "step": 3733 }, { "epoch": 0.5890286705840596, "grad_norm": 5.695446014404297, "learning_rate": 4.194171630977298e-07, "loss": 0.0235, "step": 3734 }, { "epoch": 0.5891864179516504, "grad_norm": 7.079178810119629, "learning_rate": 4.1925615842859443e-07, "loss": 0.0655, "step": 3735 }, { "epoch": 0.5893441653192413, "grad_norm": 4.774590969085693, "learning_rate": 4.1909515375945904e-07, "loss": 0.0675, "step": 3736 }, { "epoch": 0.589501912686832, "grad_norm": 4.56651496887207, "learning_rate": 4.189341490903236e-07, "loss": 0.0344, "step": 3737 }, { "epoch": 0.5896596600544228, "grad_norm": 6.133149147033691, "learning_rate": 4.187731444211882e-07, "loss": 0.0771, "step": 3738 }, { "epoch": 0.5898174074220136, "grad_norm": 5.865187168121338, "learning_rate": 4.186121397520528e-07, "loss": 0.0435, "step": 3739 }, { "epoch": 0.5899751547896045, "grad_norm": 4.250848770141602, "learning_rate": 4.184511350829174e-07, "loss": 0.0368, "step": 3740 }, { "epoch": 0.5901329021571953, "grad_norm": 7.351658821105957, "learning_rate": 4.1829013041378196e-07, "loss": 0.0586, "step": 3741 }, { "epoch": 0.590290649524786, "grad_norm": 7.026547908782959, "learning_rate": 4.1812912574464657e-07, "loss": 0.0723, "step": 3742 }, { "epoch": 0.5904483968923768, "grad_norm": 4.544672012329102, "learning_rate": 4.179681210755112e-07, "loss": 0.0287, "step": 3743 }, { "epoch": 0.5906061442599677, "grad_norm": 4.992133617401123, "learning_rate": 4.178071164063758e-07, "loss": 0.0434, "step": 3744 }, { "epoch": 0.5907638916275585, "grad_norm": 6.560860633850098, "learning_rate": 4.1764611173724034e-07, "loss": 0.0939, "step": 3745 }, { "epoch": 0.5909216389951493, "grad_norm": 3.0830299854278564, "learning_rate": 4.1748510706810494e-07, "loss": 0.0418, "step": 3746 }, { "epoch": 0.59107938636274, "grad_norm": 4.333657264709473, "learning_rate": 4.1732410239896955e-07, "loss": 0.0564, "step": 3747 }, { "epoch": 0.5912371337303308, "grad_norm": 11.215478897094727, "learning_rate": 4.1716309772983416e-07, "loss": 0.0601, "step": 3748 }, { "epoch": 0.5913948810979217, "grad_norm": 8.15578556060791, "learning_rate": 4.170020930606987e-07, "loss": 0.0632, "step": 3749 }, { "epoch": 0.5915526284655125, "grad_norm": 6.492042064666748, "learning_rate": 4.1684108839156337e-07, "loss": 0.0423, "step": 3750 }, { "epoch": 0.5917103758331033, "grad_norm": 6.464126110076904, "learning_rate": 4.166800837224279e-07, "loss": 0.0563, "step": 3751 }, { "epoch": 0.591868123200694, "grad_norm": 4.373297214508057, "learning_rate": 4.1651907905329253e-07, "loss": 0.0265, "step": 3752 }, { "epoch": 0.5920258705682849, "grad_norm": 6.5860772132873535, "learning_rate": 4.163580743841571e-07, "loss": 0.0612, "step": 3753 }, { "epoch": 0.5921836179358757, "grad_norm": 10.905440330505371, "learning_rate": 4.1619706971502174e-07, "loss": 0.0791, "step": 3754 }, { "epoch": 0.5923413653034665, "grad_norm": 5.605947017669678, "learning_rate": 4.160360650458863e-07, "loss": 0.0905, "step": 3755 }, { "epoch": 0.5924991126710573, "grad_norm": 6.811803817749023, "learning_rate": 4.158750603767509e-07, "loss": 0.0751, "step": 3756 }, { "epoch": 0.5926568600386481, "grad_norm": 5.964085102081299, "learning_rate": 4.157140557076155e-07, "loss": 0.0321, "step": 3757 }, { "epoch": 0.5928146074062389, "grad_norm": 5.20408296585083, "learning_rate": 4.155530510384801e-07, "loss": 0.0687, "step": 3758 }, { "epoch": 0.5929723547738297, "grad_norm": 6.977663993835449, "learning_rate": 4.153920463693447e-07, "loss": 0.1494, "step": 3759 }, { "epoch": 0.5931301021414205, "grad_norm": 5.758049488067627, "learning_rate": 4.1523104170020927e-07, "loss": 0.0372, "step": 3760 }, { "epoch": 0.5932878495090114, "grad_norm": 6.385850429534912, "learning_rate": 4.150700370310739e-07, "loss": 0.0405, "step": 3761 }, { "epoch": 0.5934455968766021, "grad_norm": 5.329529285430908, "learning_rate": 4.149090323619385e-07, "loss": 0.0484, "step": 3762 }, { "epoch": 0.5936033442441929, "grad_norm": 6.6791887283325195, "learning_rate": 4.147480276928031e-07, "loss": 0.0831, "step": 3763 }, { "epoch": 0.5937610916117837, "grad_norm": 2.605978012084961, "learning_rate": 4.1458702302366765e-07, "loss": 0.0175, "step": 3764 }, { "epoch": 0.5939188389793746, "grad_norm": 5.62919807434082, "learning_rate": 4.144260183545323e-07, "loss": 0.0628, "step": 3765 }, { "epoch": 0.5940765863469654, "grad_norm": 3.591205596923828, "learning_rate": 4.1426501368539686e-07, "loss": 0.0138, "step": 3766 }, { "epoch": 0.5942343337145561, "grad_norm": 7.075249195098877, "learning_rate": 4.1410400901626146e-07, "loss": 0.0881, "step": 3767 }, { "epoch": 0.5943920810821469, "grad_norm": 7.193486213684082, "learning_rate": 4.13943004347126e-07, "loss": 0.0773, "step": 3768 }, { "epoch": 0.5945498284497377, "grad_norm": 5.35740852355957, "learning_rate": 4.137819996779907e-07, "loss": 0.0772, "step": 3769 }, { "epoch": 0.5947075758173286, "grad_norm": 6.029299259185791, "learning_rate": 4.1362099500885523e-07, "loss": 0.0478, "step": 3770 }, { "epoch": 0.5948653231849194, "grad_norm": 3.6259782314300537, "learning_rate": 4.1345999033971984e-07, "loss": 0.0134, "step": 3771 }, { "epoch": 0.5950230705525101, "grad_norm": 4.932665824890137, "learning_rate": 4.132989856705844e-07, "loss": 0.0329, "step": 3772 }, { "epoch": 0.5951808179201009, "grad_norm": 4.984086990356445, "learning_rate": 4.1313798100144905e-07, "loss": 0.0355, "step": 3773 }, { "epoch": 0.5953385652876918, "grad_norm": 3.5823488235473633, "learning_rate": 4.129769763323136e-07, "loss": 0.0333, "step": 3774 }, { "epoch": 0.5954963126552826, "grad_norm": 4.083855152130127, "learning_rate": 4.128159716631782e-07, "loss": 0.0549, "step": 3775 }, { "epoch": 0.5956540600228734, "grad_norm": 3.4935083389282227, "learning_rate": 4.1265496699404276e-07, "loss": 0.0655, "step": 3776 }, { "epoch": 0.5958118073904641, "grad_norm": 5.366272926330566, "learning_rate": 4.124939623249074e-07, "loss": 0.0264, "step": 3777 }, { "epoch": 0.595969554758055, "grad_norm": 2.7977328300476074, "learning_rate": 4.12332957655772e-07, "loss": 0.0135, "step": 3778 }, { "epoch": 0.5961273021256458, "grad_norm": 5.729579925537109, "learning_rate": 4.121719529866366e-07, "loss": 0.0309, "step": 3779 }, { "epoch": 0.5962850494932366, "grad_norm": 5.854665279388428, "learning_rate": 4.120109483175012e-07, "loss": 0.0598, "step": 3780 }, { "epoch": 0.5964427968608274, "grad_norm": 2.5261032581329346, "learning_rate": 4.118499436483658e-07, "loss": 0.038, "step": 3781 }, { "epoch": 0.5966005442284182, "grad_norm": 3.297360420227051, "learning_rate": 4.116889389792304e-07, "loss": 0.0344, "step": 3782 }, { "epoch": 0.596758291596009, "grad_norm": 3.670506715774536, "learning_rate": 4.1152793431009495e-07, "loss": 0.0349, "step": 3783 }, { "epoch": 0.5969160389635998, "grad_norm": 7.786942005157471, "learning_rate": 4.113669296409596e-07, "loss": 0.0665, "step": 3784 }, { "epoch": 0.5970737863311906, "grad_norm": 1.6498265266418457, "learning_rate": 4.1120592497182417e-07, "loss": 0.0122, "step": 3785 }, { "epoch": 0.5972315336987813, "grad_norm": 4.289611339569092, "learning_rate": 4.1104492030268877e-07, "loss": 0.0219, "step": 3786 }, { "epoch": 0.5973892810663722, "grad_norm": 8.479968070983887, "learning_rate": 4.108839156335533e-07, "loss": 0.0511, "step": 3787 }, { "epoch": 0.597547028433963, "grad_norm": 9.507231712341309, "learning_rate": 4.10722910964418e-07, "loss": 0.0369, "step": 3788 }, { "epoch": 0.5977047758015538, "grad_norm": 5.04258394241333, "learning_rate": 4.1056190629528254e-07, "loss": 0.042, "step": 3789 }, { "epoch": 0.5978625231691446, "grad_norm": 6.5770182609558105, "learning_rate": 4.1040090162614714e-07, "loss": 0.066, "step": 3790 }, { "epoch": 0.5980202705367355, "grad_norm": 7.160120964050293, "learning_rate": 4.102398969570117e-07, "loss": 0.027, "step": 3791 }, { "epoch": 0.5981780179043262, "grad_norm": 2.9768872261047363, "learning_rate": 4.1007889228787636e-07, "loss": 0.0131, "step": 3792 }, { "epoch": 0.598335765271917, "grad_norm": 5.2741923332214355, "learning_rate": 4.099178876187409e-07, "loss": 0.0514, "step": 3793 }, { "epoch": 0.5984935126395078, "grad_norm": 7.782166957855225, "learning_rate": 4.097568829496055e-07, "loss": 0.0507, "step": 3794 }, { "epoch": 0.5986512600070987, "grad_norm": 6.158569812774658, "learning_rate": 4.095958782804701e-07, "loss": 0.0807, "step": 3795 }, { "epoch": 0.5988090073746895, "grad_norm": 4.537302494049072, "learning_rate": 4.0943487361133473e-07, "loss": 0.0341, "step": 3796 }, { "epoch": 0.5989667547422802, "grad_norm": 4.648359775543213, "learning_rate": 4.092738689421993e-07, "loss": 0.0287, "step": 3797 }, { "epoch": 0.599124502109871, "grad_norm": 6.254510879516602, "learning_rate": 4.091128642730639e-07, "loss": 0.0519, "step": 3798 }, { "epoch": 0.5992822494774619, "grad_norm": 5.403506755828857, "learning_rate": 4.089518596039285e-07, "loss": 0.0362, "step": 3799 }, { "epoch": 0.5994399968450527, "grad_norm": 7.727915287017822, "learning_rate": 4.087908549347931e-07, "loss": 0.0411, "step": 3800 }, { "epoch": 0.5995977442126434, "grad_norm": 6.238685607910156, "learning_rate": 4.0862985026565766e-07, "loss": 0.064, "step": 3801 }, { "epoch": 0.5997554915802342, "grad_norm": 6.48141622543335, "learning_rate": 4.0846884559652226e-07, "loss": 0.0529, "step": 3802 }, { "epoch": 0.5999132389478251, "grad_norm": 7.0403313636779785, "learning_rate": 4.0830784092738687e-07, "loss": 0.0498, "step": 3803 }, { "epoch": 0.6000709863154159, "grad_norm": 4.721019268035889, "learning_rate": 4.081468362582515e-07, "loss": 0.0254, "step": 3804 }, { "epoch": 0.6002287336830067, "grad_norm": 5.8263726234436035, "learning_rate": 4.079858315891161e-07, "loss": 0.0395, "step": 3805 }, { "epoch": 0.6003864810505974, "grad_norm": 3.3491733074188232, "learning_rate": 4.0782482691998063e-07, "loss": 0.0234, "step": 3806 }, { "epoch": 0.6005442284181882, "grad_norm": 4.7357587814331055, "learning_rate": 4.076638222508453e-07, "loss": 0.0301, "step": 3807 }, { "epoch": 0.6007019757857791, "grad_norm": 4.4766411781311035, "learning_rate": 4.0750281758170985e-07, "loss": 0.0508, "step": 3808 }, { "epoch": 0.6008597231533699, "grad_norm": 4.65160608291626, "learning_rate": 4.0734181291257445e-07, "loss": 0.0491, "step": 3809 }, { "epoch": 0.6010174705209607, "grad_norm": 4.908505439758301, "learning_rate": 4.0718080824343906e-07, "loss": 0.0352, "step": 3810 }, { "epoch": 0.6011752178885514, "grad_norm": 5.558905124664307, "learning_rate": 4.0701980357430367e-07, "loss": 0.0613, "step": 3811 }, { "epoch": 0.6013329652561423, "grad_norm": 5.562036037445068, "learning_rate": 4.068587989051682e-07, "loss": 0.0634, "step": 3812 }, { "epoch": 0.6014907126237331, "grad_norm": 5.081418991088867, "learning_rate": 4.066977942360328e-07, "loss": 0.0427, "step": 3813 }, { "epoch": 0.6016484599913239, "grad_norm": 5.784884452819824, "learning_rate": 4.0653678956689743e-07, "loss": 0.0286, "step": 3814 }, { "epoch": 0.6018062073589147, "grad_norm": 5.610983848571777, "learning_rate": 4.0637578489776204e-07, "loss": 0.082, "step": 3815 }, { "epoch": 0.6019639547265055, "grad_norm": 3.5575551986694336, "learning_rate": 4.062147802286266e-07, "loss": 0.0235, "step": 3816 }, { "epoch": 0.6021217020940963, "grad_norm": 6.529111385345459, "learning_rate": 4.060537755594912e-07, "loss": 0.0784, "step": 3817 }, { "epoch": 0.6022794494616871, "grad_norm": 2.607593536376953, "learning_rate": 4.058927708903558e-07, "loss": 0.0117, "step": 3818 }, { "epoch": 0.6024371968292779, "grad_norm": 6.788741111755371, "learning_rate": 4.057317662212204e-07, "loss": 0.0758, "step": 3819 }, { "epoch": 0.6025949441968688, "grad_norm": 4.868200778961182, "learning_rate": 4.0557076155208496e-07, "loss": 0.0335, "step": 3820 }, { "epoch": 0.6027526915644595, "grad_norm": 3.741621494293213, "learning_rate": 4.0540975688294957e-07, "loss": 0.0558, "step": 3821 }, { "epoch": 0.6029104389320503, "grad_norm": 4.564605236053467, "learning_rate": 4.052487522138142e-07, "loss": 0.0287, "step": 3822 }, { "epoch": 0.6030681862996411, "grad_norm": 4.796411991119385, "learning_rate": 4.050877475446788e-07, "loss": 0.0293, "step": 3823 }, { "epoch": 0.6032259336672319, "grad_norm": 4.420485019683838, "learning_rate": 4.0492674287554334e-07, "loss": 0.0641, "step": 3824 }, { "epoch": 0.6033836810348228, "grad_norm": 3.2823305130004883, "learning_rate": 4.04765738206408e-07, "loss": 0.0296, "step": 3825 }, { "epoch": 0.6035414284024135, "grad_norm": 3.3454949855804443, "learning_rate": 4.0460473353727255e-07, "loss": 0.0272, "step": 3826 }, { "epoch": 0.6036991757700043, "grad_norm": 6.38124418258667, "learning_rate": 4.0444372886813715e-07, "loss": 0.0661, "step": 3827 }, { "epoch": 0.6038569231375951, "grad_norm": 7.184398651123047, "learning_rate": 4.0428272419900176e-07, "loss": 0.0561, "step": 3828 }, { "epoch": 0.604014670505186, "grad_norm": 5.573564529418945, "learning_rate": 4.0412171952986637e-07, "loss": 0.0812, "step": 3829 }, { "epoch": 0.6041724178727768, "grad_norm": 6.937236785888672, "learning_rate": 4.03960714860731e-07, "loss": 0.0272, "step": 3830 }, { "epoch": 0.6043301652403675, "grad_norm": 8.94973373413086, "learning_rate": 4.0379971019159553e-07, "loss": 0.1069, "step": 3831 }, { "epoch": 0.6044879126079583, "grad_norm": 5.745685577392578, "learning_rate": 4.0363870552246013e-07, "loss": 0.0458, "step": 3832 }, { "epoch": 0.6046456599755492, "grad_norm": 1.876373291015625, "learning_rate": 4.0347770085332474e-07, "loss": 0.0111, "step": 3833 }, { "epoch": 0.60480340734314, "grad_norm": 3.7778372764587402, "learning_rate": 4.0331669618418935e-07, "loss": 0.046, "step": 3834 }, { "epoch": 0.6049611547107308, "grad_norm": 5.344654560089111, "learning_rate": 4.031556915150539e-07, "loss": 0.0454, "step": 3835 }, { "epoch": 0.6051189020783215, "grad_norm": 4.426279544830322, "learning_rate": 4.029946868459185e-07, "loss": 0.0722, "step": 3836 }, { "epoch": 0.6052766494459124, "grad_norm": 2.89872407913208, "learning_rate": 4.028336821767831e-07, "loss": 0.0113, "step": 3837 }, { "epoch": 0.6054343968135032, "grad_norm": 3.577136993408203, "learning_rate": 4.026726775076477e-07, "loss": 0.0428, "step": 3838 }, { "epoch": 0.605592144181094, "grad_norm": 6.849196910858154, "learning_rate": 4.0251167283851227e-07, "loss": 0.0206, "step": 3839 }, { "epoch": 0.6057498915486847, "grad_norm": 6.689548492431641, "learning_rate": 4.0235066816937693e-07, "loss": 0.0241, "step": 3840 }, { "epoch": 0.6057498915486847, "eval_accuracy": 0.9873752187523107, "eval_f1": 0.9873752187523107, "eval_loss": 0.04061572626233101, "eval_runtime": 4708.8828, "eval_samples_per_second": 43.079, "eval_steps_per_second": 2.693, "step": 3840 }, { "epoch": 0.6059076389162756, "grad_norm": 5.607239723205566, "learning_rate": 4.021896635002415e-07, "loss": 0.0529, "step": 3841 }, { "epoch": 0.6060653862838664, "grad_norm": 5.150136947631836, "learning_rate": 4.020286588311061e-07, "loss": 0.0286, "step": 3842 }, { "epoch": 0.6062231336514572, "grad_norm": 2.0050036907196045, "learning_rate": 4.0186765416197064e-07, "loss": 0.0084, "step": 3843 }, { "epoch": 0.606380881019048, "grad_norm": 4.918299674987793, "learning_rate": 4.017066494928353e-07, "loss": 0.0292, "step": 3844 }, { "epoch": 0.6065386283866387, "grad_norm": 4.557953357696533, "learning_rate": 4.0154564482369986e-07, "loss": 0.0518, "step": 3845 }, { "epoch": 0.6066963757542296, "grad_norm": 4.684080123901367, "learning_rate": 4.0138464015456446e-07, "loss": 0.046, "step": 3846 }, { "epoch": 0.6068541231218204, "grad_norm": 4.371344089508057, "learning_rate": 4.01223635485429e-07, "loss": 0.032, "step": 3847 }, { "epoch": 0.6070118704894112, "grad_norm": 4.31361722946167, "learning_rate": 4.010626308162937e-07, "loss": 0.0412, "step": 3848 }, { "epoch": 0.607169617857002, "grad_norm": 2.3368191719055176, "learning_rate": 4.0090162614715823e-07, "loss": 0.0292, "step": 3849 }, { "epoch": 0.6073273652245929, "grad_norm": 4.166183948516846, "learning_rate": 4.0074062147802284e-07, "loss": 0.03, "step": 3850 }, { "epoch": 0.6074851125921836, "grad_norm": 106.28617095947266, "learning_rate": 4.0057961680888744e-07, "loss": 0.0394, "step": 3851 }, { "epoch": 0.6076428599597744, "grad_norm": 7.088550090789795, "learning_rate": 4.0041861213975205e-07, "loss": 0.074, "step": 3852 }, { "epoch": 0.6078006073273652, "grad_norm": 6.301978588104248, "learning_rate": 4.0025760747061665e-07, "loss": 0.0727, "step": 3853 }, { "epoch": 0.6079583546949561, "grad_norm": 4.252588272094727, "learning_rate": 4.000966028014812e-07, "loss": 0.0327, "step": 3854 }, { "epoch": 0.6081161020625468, "grad_norm": 5.474897861480713, "learning_rate": 3.9993559813234587e-07, "loss": 0.0953, "step": 3855 }, { "epoch": 0.6082738494301376, "grad_norm": 3.9940476417541504, "learning_rate": 3.997745934632104e-07, "loss": 0.039, "step": 3856 }, { "epoch": 0.6084315967977284, "grad_norm": 5.6145100593566895, "learning_rate": 3.99613588794075e-07, "loss": 0.0515, "step": 3857 }, { "epoch": 0.6085893441653193, "grad_norm": 5.668785572052002, "learning_rate": 3.994525841249396e-07, "loss": 0.084, "step": 3858 }, { "epoch": 0.6087470915329101, "grad_norm": 3.3662006855010986, "learning_rate": 3.9929157945580424e-07, "loss": 0.0436, "step": 3859 }, { "epoch": 0.6089048389005008, "grad_norm": 4.437554359436035, "learning_rate": 3.991305747866688e-07, "loss": 0.024, "step": 3860 }, { "epoch": 0.6090625862680916, "grad_norm": 2.6105315685272217, "learning_rate": 3.989695701175334e-07, "loss": 0.014, "step": 3861 }, { "epoch": 0.6092203336356825, "grad_norm": 5.951709270477295, "learning_rate": 3.9880856544839795e-07, "loss": 0.0506, "step": 3862 }, { "epoch": 0.6093780810032733, "grad_norm": 4.6668243408203125, "learning_rate": 3.986475607792626e-07, "loss": 0.0567, "step": 3863 }, { "epoch": 0.6095358283708641, "grad_norm": 6.604398727416992, "learning_rate": 3.9848655611012716e-07, "loss": 0.0624, "step": 3864 }, { "epoch": 0.6096935757384548, "grad_norm": 7.9031291007995605, "learning_rate": 3.9832555144099177e-07, "loss": 0.0755, "step": 3865 }, { "epoch": 0.6098513231060456, "grad_norm": 6.474466323852539, "learning_rate": 3.981645467718563e-07, "loss": 0.041, "step": 3866 }, { "epoch": 0.6100090704736365, "grad_norm": 2.133225917816162, "learning_rate": 3.98003542102721e-07, "loss": 0.0162, "step": 3867 }, { "epoch": 0.6101668178412273, "grad_norm": 4.724400520324707, "learning_rate": 3.9784253743358554e-07, "loss": 0.0895, "step": 3868 }, { "epoch": 0.6103245652088181, "grad_norm": 3.4139459133148193, "learning_rate": 3.9768153276445014e-07, "loss": 0.0322, "step": 3869 }, { "epoch": 0.6104823125764088, "grad_norm": 5.139689922332764, "learning_rate": 3.975205280953147e-07, "loss": 0.0203, "step": 3870 }, { "epoch": 0.6106400599439997, "grad_norm": 2.9104020595550537, "learning_rate": 3.9735952342617936e-07, "loss": 0.0128, "step": 3871 }, { "epoch": 0.6107978073115905, "grad_norm": 3.117351770401001, "learning_rate": 3.971985187570439e-07, "loss": 0.0219, "step": 3872 }, { "epoch": 0.6109555546791813, "grad_norm": 4.9865593910217285, "learning_rate": 3.970375140879085e-07, "loss": 0.0384, "step": 3873 }, { "epoch": 0.6111133020467721, "grad_norm": 5.56674861907959, "learning_rate": 3.968765094187732e-07, "loss": 0.0512, "step": 3874 }, { "epoch": 0.6112710494143629, "grad_norm": 6.8554840087890625, "learning_rate": 3.9671550474963773e-07, "loss": 0.0607, "step": 3875 }, { "epoch": 0.6114287967819537, "grad_norm": 4.936234474182129, "learning_rate": 3.9655450008050233e-07, "loss": 0.0505, "step": 3876 }, { "epoch": 0.6115865441495445, "grad_norm": 4.334475517272949, "learning_rate": 3.963934954113669e-07, "loss": 0.0195, "step": 3877 }, { "epoch": 0.6117442915171353, "grad_norm": 3.8036322593688965, "learning_rate": 3.9623249074223155e-07, "loss": 0.0201, "step": 3878 }, { "epoch": 0.6119020388847262, "grad_norm": 4.9143805503845215, "learning_rate": 3.960714860730961e-07, "loss": 0.023, "step": 3879 }, { "epoch": 0.6120597862523169, "grad_norm": 4.306915760040283, "learning_rate": 3.959104814039607e-07, "loss": 0.0203, "step": 3880 }, { "epoch": 0.6122175336199077, "grad_norm": 4.71891450881958, "learning_rate": 3.9574947673482526e-07, "loss": 0.0478, "step": 3881 }, { "epoch": 0.6123752809874985, "grad_norm": 5.184467315673828, "learning_rate": 3.955884720656899e-07, "loss": 0.0596, "step": 3882 }, { "epoch": 0.6125330283550893, "grad_norm": 5.844971656799316, "learning_rate": 3.9542746739655447e-07, "loss": 0.0636, "step": 3883 }, { "epoch": 0.6126907757226802, "grad_norm": 3.3564276695251465, "learning_rate": 3.952664627274191e-07, "loss": 0.0571, "step": 3884 }, { "epoch": 0.6128485230902709, "grad_norm": 4.913923740386963, "learning_rate": 3.9510545805828363e-07, "loss": 0.0473, "step": 3885 }, { "epoch": 0.6130062704578617, "grad_norm": 4.008642196655273, "learning_rate": 3.949444533891483e-07, "loss": 0.0325, "step": 3886 }, { "epoch": 0.6131640178254525, "grad_norm": 3.20485782623291, "learning_rate": 3.9478344872001285e-07, "loss": 0.0301, "step": 3887 }, { "epoch": 0.6133217651930434, "grad_norm": 3.5149877071380615, "learning_rate": 3.9462244405087745e-07, "loss": 0.0348, "step": 3888 }, { "epoch": 0.6134795125606342, "grad_norm": 6.214327335357666, "learning_rate": 3.9446143938174206e-07, "loss": 0.0578, "step": 3889 }, { "epoch": 0.6136372599282249, "grad_norm": 3.9520163536071777, "learning_rate": 3.9430043471260666e-07, "loss": 0.0373, "step": 3890 }, { "epoch": 0.6137950072958157, "grad_norm": 3.7997848987579346, "learning_rate": 3.941394300434712e-07, "loss": 0.0307, "step": 3891 }, { "epoch": 0.6139527546634066, "grad_norm": 4.809603214263916, "learning_rate": 3.939784253743358e-07, "loss": 0.0336, "step": 3892 }, { "epoch": 0.6141105020309974, "grad_norm": 3.4347429275512695, "learning_rate": 3.9381742070520043e-07, "loss": 0.0324, "step": 3893 }, { "epoch": 0.6142682493985882, "grad_norm": 6.23806095123291, "learning_rate": 3.9365641603606504e-07, "loss": 0.0605, "step": 3894 }, { "epoch": 0.6144259967661789, "grad_norm": 8.719413757324219, "learning_rate": 3.934954113669296e-07, "loss": 0.0445, "step": 3895 }, { "epoch": 0.6145837441337698, "grad_norm": 4.386685371398926, "learning_rate": 3.933344066977942e-07, "loss": 0.0327, "step": 3896 }, { "epoch": 0.6147414915013606, "grad_norm": 4.640712738037109, "learning_rate": 3.9317340202865886e-07, "loss": 0.053, "step": 3897 }, { "epoch": 0.6148992388689514, "grad_norm": 4.2591471672058105, "learning_rate": 3.930123973595234e-07, "loss": 0.0306, "step": 3898 }, { "epoch": 0.6150569862365421, "grad_norm": 4.054599285125732, "learning_rate": 3.92851392690388e-07, "loss": 0.0303, "step": 3899 }, { "epoch": 0.615214733604133, "grad_norm": 9.246919631958008, "learning_rate": 3.9269038802125257e-07, "loss": 0.0719, "step": 3900 }, { "epoch": 0.6153724809717238, "grad_norm": 2.701117992401123, "learning_rate": 3.9252938335211723e-07, "loss": 0.0411, "step": 3901 }, { "epoch": 0.6155302283393146, "grad_norm": 5.678535461425781, "learning_rate": 3.923683786829818e-07, "loss": 0.0645, "step": 3902 }, { "epoch": 0.6156879757069054, "grad_norm": 2.281376838684082, "learning_rate": 3.922073740138464e-07, "loss": 0.0174, "step": 3903 }, { "epoch": 0.6158457230744961, "grad_norm": 5.359318256378174, "learning_rate": 3.92046369344711e-07, "loss": 0.0297, "step": 3904 }, { "epoch": 0.616003470442087, "grad_norm": 3.9732272624969482, "learning_rate": 3.918853646755756e-07, "loss": 0.0116, "step": 3905 }, { "epoch": 0.6161612178096778, "grad_norm": 4.528304576873779, "learning_rate": 3.9172436000644015e-07, "loss": 0.0219, "step": 3906 }, { "epoch": 0.6163189651772686, "grad_norm": 3.9752628803253174, "learning_rate": 3.9156335533730476e-07, "loss": 0.0238, "step": 3907 }, { "epoch": 0.6164767125448594, "grad_norm": 3.861325979232788, "learning_rate": 3.9140235066816937e-07, "loss": 0.031, "step": 3908 }, { "epoch": 0.6166344599124503, "grad_norm": 3.2465317249298096, "learning_rate": 3.9124134599903397e-07, "loss": 0.0262, "step": 3909 }, { "epoch": 0.616792207280041, "grad_norm": 4.051419258117676, "learning_rate": 3.910803413298985e-07, "loss": 0.0194, "step": 3910 }, { "epoch": 0.6169499546476318, "grad_norm": 6.009376525878906, "learning_rate": 3.9091933666076313e-07, "loss": 0.0467, "step": 3911 }, { "epoch": 0.6171077020152226, "grad_norm": 4.511587619781494, "learning_rate": 3.9075833199162774e-07, "loss": 0.0319, "step": 3912 }, { "epoch": 0.6172654493828135, "grad_norm": 2.984797954559326, "learning_rate": 3.9059732732249234e-07, "loss": 0.0412, "step": 3913 }, { "epoch": 0.6174231967504042, "grad_norm": 5.0735883712768555, "learning_rate": 3.904363226533569e-07, "loss": 0.0567, "step": 3914 }, { "epoch": 0.617580944117995, "grad_norm": 5.829403877258301, "learning_rate": 3.902753179842215e-07, "loss": 0.0313, "step": 3915 }, { "epoch": 0.6177386914855858, "grad_norm": 3.133254051208496, "learning_rate": 3.901143133150861e-07, "loss": 0.0169, "step": 3916 }, { "epoch": 0.6178964388531767, "grad_norm": 8.683976173400879, "learning_rate": 3.899533086459507e-07, "loss": 0.0718, "step": 3917 }, { "epoch": 0.6180541862207675, "grad_norm": 4.144251346588135, "learning_rate": 3.8979230397681527e-07, "loss": 0.0549, "step": 3918 }, { "epoch": 0.6182119335883582, "grad_norm": 4.386746406555176, "learning_rate": 3.8963129930767993e-07, "loss": 0.0664, "step": 3919 }, { "epoch": 0.618369680955949, "grad_norm": 5.533723831176758, "learning_rate": 3.8947029463854454e-07, "loss": 0.0232, "step": 3920 }, { "epoch": 0.6185274283235398, "grad_norm": 2.7240710258483887, "learning_rate": 3.893092899694091e-07, "loss": 0.021, "step": 3921 }, { "epoch": 0.6186851756911307, "grad_norm": 5.716734409332275, "learning_rate": 3.891482853002737e-07, "loss": 0.0409, "step": 3922 }, { "epoch": 0.6188429230587215, "grad_norm": 11.965015411376953, "learning_rate": 3.889872806311383e-07, "loss": 0.0609, "step": 3923 }, { "epoch": 0.6190006704263122, "grad_norm": 7.14946985244751, "learning_rate": 3.888262759620029e-07, "loss": 0.1178, "step": 3924 }, { "epoch": 0.619158417793903, "grad_norm": 4.608770370483398, "learning_rate": 3.8866527129286746e-07, "loss": 0.0373, "step": 3925 }, { "epoch": 0.6193161651614939, "grad_norm": 3.3860247135162354, "learning_rate": 3.8850426662373207e-07, "loss": 0.0155, "step": 3926 }, { "epoch": 0.6194739125290847, "grad_norm": 3.5459256172180176, "learning_rate": 3.883432619545967e-07, "loss": 0.0292, "step": 3927 }, { "epoch": 0.6196316598966755, "grad_norm": 6.250884532928467, "learning_rate": 3.881822572854613e-07, "loss": 0.037, "step": 3928 }, { "epoch": 0.6197894072642662, "grad_norm": 8.23530101776123, "learning_rate": 3.8802125261632583e-07, "loss": 0.0614, "step": 3929 }, { "epoch": 0.6199471546318571, "grad_norm": 2.993102788925171, "learning_rate": 3.8786024794719044e-07, "loss": 0.0096, "step": 3930 }, { "epoch": 0.6201049019994479, "grad_norm": 5.495903015136719, "learning_rate": 3.8769924327805505e-07, "loss": 0.0476, "step": 3931 }, { "epoch": 0.6202626493670387, "grad_norm": 3.0880420207977295, "learning_rate": 3.8753823860891965e-07, "loss": 0.0136, "step": 3932 }, { "epoch": 0.6204203967346295, "grad_norm": 6.1710052490234375, "learning_rate": 3.873772339397842e-07, "loss": 0.0548, "step": 3933 }, { "epoch": 0.6205781441022203, "grad_norm": 8.93266773223877, "learning_rate": 3.8721622927064887e-07, "loss": 0.0567, "step": 3934 }, { "epoch": 0.6207358914698111, "grad_norm": 8.327640533447266, "learning_rate": 3.870552246015134e-07, "loss": 0.0502, "step": 3935 }, { "epoch": 0.6208936388374019, "grad_norm": 4.471561908721924, "learning_rate": 3.86894219932378e-07, "loss": 0.058, "step": 3936 }, { "epoch": 0.6210513862049927, "grad_norm": 10.008733749389648, "learning_rate": 3.867332152632426e-07, "loss": 0.0678, "step": 3937 }, { "epoch": 0.6212091335725836, "grad_norm": 11.19927978515625, "learning_rate": 3.8657221059410724e-07, "loss": 0.068, "step": 3938 }, { "epoch": 0.6213668809401743, "grad_norm": 5.79336404800415, "learning_rate": 3.864112059249718e-07, "loss": 0.0683, "step": 3939 }, { "epoch": 0.6215246283077651, "grad_norm": 4.393022537231445, "learning_rate": 3.862502012558364e-07, "loss": 0.05, "step": 3940 }, { "epoch": 0.6216823756753559, "grad_norm": 5.813998699188232, "learning_rate": 3.8608919658670095e-07, "loss": 0.0492, "step": 3941 }, { "epoch": 0.6218401230429467, "grad_norm": 4.290627479553223, "learning_rate": 3.859281919175656e-07, "loss": 0.0462, "step": 3942 }, { "epoch": 0.6219978704105376, "grad_norm": 6.6334547996521, "learning_rate": 3.857671872484302e-07, "loss": 0.0395, "step": 3943 }, { "epoch": 0.6221556177781283, "grad_norm": 7.830145835876465, "learning_rate": 3.8560618257929477e-07, "loss": 0.0584, "step": 3944 }, { "epoch": 0.6223133651457191, "grad_norm": 5.8135857582092285, "learning_rate": 3.854451779101594e-07, "loss": 0.0472, "step": 3945 }, { "epoch": 0.6224711125133099, "grad_norm": 14.138686180114746, "learning_rate": 3.85284173241024e-07, "loss": 0.0655, "step": 3946 }, { "epoch": 0.6226288598809008, "grad_norm": 5.751094818115234, "learning_rate": 3.851231685718886e-07, "loss": 0.0585, "step": 3947 }, { "epoch": 0.6227866072484916, "grad_norm": 5.578413009643555, "learning_rate": 3.8496216390275314e-07, "loss": 0.0653, "step": 3948 }, { "epoch": 0.6229443546160823, "grad_norm": 3.894172191619873, "learning_rate": 3.848011592336178e-07, "loss": 0.0321, "step": 3949 }, { "epoch": 0.6231021019836731, "grad_norm": 5.169582366943359, "learning_rate": 3.8464015456448235e-07, "loss": 0.0373, "step": 3950 }, { "epoch": 0.623259849351264, "grad_norm": 3.197829246520996, "learning_rate": 3.8447914989534696e-07, "loss": 0.0319, "step": 3951 }, { "epoch": 0.6234175967188548, "grad_norm": 5.065234184265137, "learning_rate": 3.843181452262115e-07, "loss": 0.0582, "step": 3952 }, { "epoch": 0.6235753440864455, "grad_norm": 3.9794607162475586, "learning_rate": 3.841571405570762e-07, "loss": 0.0352, "step": 3953 }, { "epoch": 0.6237330914540363, "grad_norm": 5.315174579620361, "learning_rate": 3.8399613588794073e-07, "loss": 0.0537, "step": 3954 }, { "epoch": 0.6238908388216272, "grad_norm": 6.888188362121582, "learning_rate": 3.8383513121880533e-07, "loss": 0.0542, "step": 3955 }, { "epoch": 0.624048586189218, "grad_norm": 7.044703960418701, "learning_rate": 3.836741265496699e-07, "loss": 0.0565, "step": 3956 }, { "epoch": 0.6242063335568088, "grad_norm": 6.392137050628662, "learning_rate": 3.8351312188053455e-07, "loss": 0.0599, "step": 3957 }, { "epoch": 0.6243640809243995, "grad_norm": 3.4717397689819336, "learning_rate": 3.833521172113991e-07, "loss": 0.0163, "step": 3958 }, { "epoch": 0.6245218282919904, "grad_norm": 6.944422245025635, "learning_rate": 3.831911125422637e-07, "loss": 0.061, "step": 3959 }, { "epoch": 0.6246795756595812, "grad_norm": 4.242948532104492, "learning_rate": 3.8303010787312826e-07, "loss": 0.0408, "step": 3960 }, { "epoch": 0.624837323027172, "grad_norm": 2.8211543560028076, "learning_rate": 3.828691032039929e-07, "loss": 0.0193, "step": 3961 }, { "epoch": 0.6249950703947628, "grad_norm": 3.514871835708618, "learning_rate": 3.8270809853485747e-07, "loss": 0.0356, "step": 3962 }, { "epoch": 0.6251528177623535, "grad_norm": 4.88889741897583, "learning_rate": 3.825470938657221e-07, "loss": 0.0494, "step": 3963 }, { "epoch": 0.6253105651299444, "grad_norm": 10.978367805480957, "learning_rate": 3.823860891965867e-07, "loss": 0.0581, "step": 3964 }, { "epoch": 0.6254683124975352, "grad_norm": 5.499526500701904, "learning_rate": 3.822250845274513e-07, "loss": 0.0469, "step": 3965 }, { "epoch": 0.625626059865126, "grad_norm": 4.112061500549316, "learning_rate": 3.8206407985831584e-07, "loss": 0.0152, "step": 3966 }, { "epoch": 0.6257838072327168, "grad_norm": 4.566140174865723, "learning_rate": 3.8190307518918045e-07, "loss": 0.0517, "step": 3967 }, { "epoch": 0.6259415546003076, "grad_norm": 5.12914514541626, "learning_rate": 3.817420705200451e-07, "loss": 0.0567, "step": 3968 }, { "epoch": 0.6260993019678984, "grad_norm": 4.391212463378906, "learning_rate": 3.8158106585090966e-07, "loss": 0.0767, "step": 3969 }, { "epoch": 0.6262570493354892, "grad_norm": 4.2883124351501465, "learning_rate": 3.8142006118177427e-07, "loss": 0.0355, "step": 3970 }, { "epoch": 0.62641479670308, "grad_norm": 6.136410236358643, "learning_rate": 3.812590565126388e-07, "loss": 0.0739, "step": 3971 }, { "epoch": 0.6265725440706709, "grad_norm": 3.8228116035461426, "learning_rate": 3.810980518435035e-07, "loss": 0.0314, "step": 3972 }, { "epoch": 0.6267302914382616, "grad_norm": 5.828356742858887, "learning_rate": 3.8093704717436804e-07, "loss": 0.0812, "step": 3973 }, { "epoch": 0.6268880388058524, "grad_norm": 5.330719947814941, "learning_rate": 3.8077604250523264e-07, "loss": 0.0753, "step": 3974 }, { "epoch": 0.6270457861734432, "grad_norm": 6.100672245025635, "learning_rate": 3.806150378360972e-07, "loss": 0.0713, "step": 3975 }, { "epoch": 0.6272035335410341, "grad_norm": 4.633836269378662, "learning_rate": 3.8045403316696185e-07, "loss": 0.0396, "step": 3976 }, { "epoch": 0.6273612809086249, "grad_norm": 7.444503307342529, "learning_rate": 3.802930284978264e-07, "loss": 0.1086, "step": 3977 }, { "epoch": 0.6275190282762156, "grad_norm": 5.09481954574585, "learning_rate": 3.80132023828691e-07, "loss": 0.033, "step": 3978 }, { "epoch": 0.6276767756438064, "grad_norm": 5.039127349853516, "learning_rate": 3.799710191595556e-07, "loss": 0.0468, "step": 3979 }, { "epoch": 0.6278345230113972, "grad_norm": 8.150583267211914, "learning_rate": 3.7981001449042023e-07, "loss": 0.0346, "step": 3980 }, { "epoch": 0.6279922703789881, "grad_norm": 8.27899169921875, "learning_rate": 3.796490098212848e-07, "loss": 0.0524, "step": 3981 }, { "epoch": 0.6281500177465789, "grad_norm": 5.138139247894287, "learning_rate": 3.794880051521494e-07, "loss": 0.0493, "step": 3982 }, { "epoch": 0.6283077651141696, "grad_norm": 8.71139907836914, "learning_rate": 3.79327000483014e-07, "loss": 0.0797, "step": 3983 }, { "epoch": 0.6284655124817604, "grad_norm": 3.703125, "learning_rate": 3.791659958138786e-07, "loss": 0.0775, "step": 3984 }, { "epoch": 0.6286232598493513, "grad_norm": 3.0508437156677246, "learning_rate": 3.7900499114474315e-07, "loss": 0.0183, "step": 3985 }, { "epoch": 0.6287810072169421, "grad_norm": 4.136172294616699, "learning_rate": 3.7884398647560776e-07, "loss": 0.0507, "step": 3986 }, { "epoch": 0.6289387545845329, "grad_norm": 6.061408042907715, "learning_rate": 3.7868298180647237e-07, "loss": 0.0328, "step": 3987 }, { "epoch": 0.6290965019521236, "grad_norm": 4.876362323760986, "learning_rate": 3.7852197713733697e-07, "loss": 0.0309, "step": 3988 }, { "epoch": 0.6292542493197145, "grad_norm": 5.385231018066406, "learning_rate": 3.783609724682015e-07, "loss": 0.048, "step": 3989 }, { "epoch": 0.6294119966873053, "grad_norm": 5.413110733032227, "learning_rate": 3.7819996779906613e-07, "loss": 0.0398, "step": 3990 }, { "epoch": 0.6295697440548961, "grad_norm": 5.107495307922363, "learning_rate": 3.780389631299308e-07, "loss": 0.0457, "step": 3991 }, { "epoch": 0.6297274914224869, "grad_norm": 3.7703804969787598, "learning_rate": 3.7787795846079534e-07, "loss": 0.0275, "step": 3992 }, { "epoch": 0.6298852387900777, "grad_norm": 5.415693283081055, "learning_rate": 3.7771695379165995e-07, "loss": 0.0603, "step": 3993 }, { "epoch": 0.6300429861576685, "grad_norm": 6.143565654754639, "learning_rate": 3.7755594912252456e-07, "loss": 0.019, "step": 3994 }, { "epoch": 0.6302007335252593, "grad_norm": 4.309868812561035, "learning_rate": 3.7739494445338916e-07, "loss": 0.0542, "step": 3995 }, { "epoch": 0.6303584808928501, "grad_norm": 6.542187690734863, "learning_rate": 3.772339397842537e-07, "loss": 0.0651, "step": 3996 }, { "epoch": 0.630516228260441, "grad_norm": 4.054347991943359, "learning_rate": 3.770729351151183e-07, "loss": 0.0683, "step": 3997 }, { "epoch": 0.6306739756280317, "grad_norm": 4.607563495635986, "learning_rate": 3.7691193044598293e-07, "loss": 0.0368, "step": 3998 }, { "epoch": 0.6308317229956225, "grad_norm": 4.678775787353516, "learning_rate": 3.7675092577684753e-07, "loss": 0.0334, "step": 3999 }, { "epoch": 0.6309894703632133, "grad_norm": 6.153867244720459, "learning_rate": 3.765899211077121e-07, "loss": 0.0921, "step": 4000 }, { "epoch": 0.6311472177308041, "grad_norm": 2.6871447563171387, "learning_rate": 3.764289164385767e-07, "loss": 0.0273, "step": 4001 }, { "epoch": 0.631304965098395, "grad_norm": 3.4612224102020264, "learning_rate": 3.762679117694413e-07, "loss": 0.0384, "step": 4002 }, { "epoch": 0.6314627124659857, "grad_norm": 7.3668365478515625, "learning_rate": 3.761069071003059e-07, "loss": 0.0666, "step": 4003 }, { "epoch": 0.6316204598335765, "grad_norm": 4.346674919128418, "learning_rate": 3.7594590243117046e-07, "loss": 0.0554, "step": 4004 }, { "epoch": 0.6317782072011673, "grad_norm": 8.219521522521973, "learning_rate": 3.7578489776203507e-07, "loss": 0.0513, "step": 4005 }, { "epoch": 0.6319359545687582, "grad_norm": 5.268881797790527, "learning_rate": 3.7562389309289967e-07, "loss": 0.0428, "step": 4006 }, { "epoch": 0.632093701936349, "grad_norm": 6.4654107093811035, "learning_rate": 3.754628884237643e-07, "loss": 0.0729, "step": 4007 }, { "epoch": 0.6322514493039397, "grad_norm": 5.147934436798096, "learning_rate": 3.7530188375462883e-07, "loss": 0.0577, "step": 4008 }, { "epoch": 0.6324091966715305, "grad_norm": 6.916091442108154, "learning_rate": 3.751408790854935e-07, "loss": 0.0428, "step": 4009 }, { "epoch": 0.6325669440391214, "grad_norm": 4.3061628341674805, "learning_rate": 3.7497987441635805e-07, "loss": 0.0426, "step": 4010 }, { "epoch": 0.6327246914067122, "grad_norm": 4.644337177276611, "learning_rate": 3.7481886974722265e-07, "loss": 0.0304, "step": 4011 }, { "epoch": 0.632882438774303, "grad_norm": 2.602245807647705, "learning_rate": 3.746578650780872e-07, "loss": 0.0155, "step": 4012 }, { "epoch": 0.6330401861418937, "grad_norm": 4.101451873779297, "learning_rate": 3.7449686040895186e-07, "loss": 0.0265, "step": 4013 }, { "epoch": 0.6331979335094846, "grad_norm": 3.6709542274475098, "learning_rate": 3.7433585573981647e-07, "loss": 0.0258, "step": 4014 }, { "epoch": 0.6333556808770754, "grad_norm": 2.2103958129882812, "learning_rate": 3.74174851070681e-07, "loss": 0.0298, "step": 4015 }, { "epoch": 0.6335134282446662, "grad_norm": 4.5828752517700195, "learning_rate": 3.7401384640154563e-07, "loss": 0.0494, "step": 4016 }, { "epoch": 0.6336711756122569, "grad_norm": 3.546704053878784, "learning_rate": 3.7385284173241024e-07, "loss": 0.0414, "step": 4017 }, { "epoch": 0.6338289229798477, "grad_norm": 3.1935534477233887, "learning_rate": 3.7369183706327484e-07, "loss": 0.0275, "step": 4018 }, { "epoch": 0.6339866703474386, "grad_norm": 3.213179111480713, "learning_rate": 3.735308323941394e-07, "loss": 0.0375, "step": 4019 }, { "epoch": 0.6341444177150294, "grad_norm": 3.9565305709838867, "learning_rate": 3.73369827725004e-07, "loss": 0.0496, "step": 4020 }, { "epoch": 0.6343021650826202, "grad_norm": 4.46652889251709, "learning_rate": 3.732088230558686e-07, "loss": 0.0328, "step": 4021 }, { "epoch": 0.6344599124502109, "grad_norm": 4.671411991119385, "learning_rate": 3.730478183867332e-07, "loss": 0.0509, "step": 4022 }, { "epoch": 0.6346176598178018, "grad_norm": 5.742166042327881, "learning_rate": 3.7288681371759777e-07, "loss": 0.0404, "step": 4023 }, { "epoch": 0.6347754071853926, "grad_norm": 8.105299949645996, "learning_rate": 3.7272580904846243e-07, "loss": 0.0561, "step": 4024 }, { "epoch": 0.6349331545529834, "grad_norm": 5.17732048034668, "learning_rate": 3.72564804379327e-07, "loss": 0.0545, "step": 4025 }, { "epoch": 0.6350909019205742, "grad_norm": 4.43886137008667, "learning_rate": 3.724037997101916e-07, "loss": 0.0506, "step": 4026 }, { "epoch": 0.635248649288165, "grad_norm": 7.071035861968994, "learning_rate": 3.7224279504105614e-07, "loss": 0.0611, "step": 4027 }, { "epoch": 0.6354063966557558, "grad_norm": 3.3725736141204834, "learning_rate": 3.720817903719208e-07, "loss": 0.0499, "step": 4028 }, { "epoch": 0.6355641440233466, "grad_norm": 3.5755679607391357, "learning_rate": 3.7192078570278535e-07, "loss": 0.0449, "step": 4029 }, { "epoch": 0.6357218913909374, "grad_norm": 6.882693767547607, "learning_rate": 3.7175978103364996e-07, "loss": 0.0309, "step": 4030 }, { "epoch": 0.6358796387585283, "grad_norm": 1.3837300539016724, "learning_rate": 3.715987763645145e-07, "loss": 0.0114, "step": 4031 }, { "epoch": 0.636037386126119, "grad_norm": 5.463102340698242, "learning_rate": 3.7143777169537917e-07, "loss": 0.0325, "step": 4032 }, { "epoch": 0.6361951334937098, "grad_norm": 5.296006202697754, "learning_rate": 3.712767670262437e-07, "loss": 0.0346, "step": 4033 }, { "epoch": 0.6363528808613006, "grad_norm": 5.083899974822998, "learning_rate": 3.7111576235710833e-07, "loss": 0.0698, "step": 4034 }, { "epoch": 0.6365106282288915, "grad_norm": 3.9612481594085693, "learning_rate": 3.709547576879729e-07, "loss": 0.0361, "step": 4035 }, { "epoch": 0.6366683755964823, "grad_norm": 4.612145900726318, "learning_rate": 3.7079375301883755e-07, "loss": 0.026, "step": 4036 }, { "epoch": 0.636826122964073, "grad_norm": 2.1052379608154297, "learning_rate": 3.7063274834970215e-07, "loss": 0.0223, "step": 4037 }, { "epoch": 0.6369838703316638, "grad_norm": 5.490352630615234, "learning_rate": 3.704717436805667e-07, "loss": 0.0305, "step": 4038 }, { "epoch": 0.6371416176992546, "grad_norm": 3.52217173576355, "learning_rate": 3.7031073901143136e-07, "loss": 0.0358, "step": 4039 }, { "epoch": 0.6372993650668455, "grad_norm": 5.325096130371094, "learning_rate": 3.701497343422959e-07, "loss": 0.0169, "step": 4040 }, { "epoch": 0.6374571124344363, "grad_norm": 3.8855066299438477, "learning_rate": 3.699887296731605e-07, "loss": 0.048, "step": 4041 }, { "epoch": 0.637614859802027, "grad_norm": 7.730265140533447, "learning_rate": 3.698277250040251e-07, "loss": 0.0633, "step": 4042 }, { "epoch": 0.6377726071696178, "grad_norm": 6.336199760437012, "learning_rate": 3.6966672033488974e-07, "loss": 0.0216, "step": 4043 }, { "epoch": 0.6379303545372087, "grad_norm": 3.8117284774780273, "learning_rate": 3.695057156657543e-07, "loss": 0.0483, "step": 4044 }, { "epoch": 0.6380881019047995, "grad_norm": 2.870159864425659, "learning_rate": 3.693447109966189e-07, "loss": 0.0116, "step": 4045 }, { "epoch": 0.6382458492723903, "grad_norm": 3.458928108215332, "learning_rate": 3.6918370632748345e-07, "loss": 0.0194, "step": 4046 }, { "epoch": 0.638403596639981, "grad_norm": 3.858642101287842, "learning_rate": 3.690227016583481e-07, "loss": 0.0431, "step": 4047 }, { "epoch": 0.6385613440075719, "grad_norm": 5.640664100646973, "learning_rate": 3.6886169698921266e-07, "loss": 0.0745, "step": 4048 }, { "epoch": 0.6387190913751627, "grad_norm": 7.989383697509766, "learning_rate": 3.6870069232007727e-07, "loss": 0.0548, "step": 4049 }, { "epoch": 0.6388768387427535, "grad_norm": 6.7757673263549805, "learning_rate": 3.685396876509418e-07, "loss": 0.0579, "step": 4050 }, { "epoch": 0.6390345861103442, "grad_norm": 8.522056579589844, "learning_rate": 3.683786829818065e-07, "loss": 0.0603, "step": 4051 }, { "epoch": 0.6391923334779351, "grad_norm": 4.27354097366333, "learning_rate": 3.6821767831267103e-07, "loss": 0.0373, "step": 4052 }, { "epoch": 0.6393500808455259, "grad_norm": 3.444039821624756, "learning_rate": 3.6805667364353564e-07, "loss": 0.0394, "step": 4053 }, { "epoch": 0.6395078282131167, "grad_norm": 8.019424438476562, "learning_rate": 3.678956689744002e-07, "loss": 0.0641, "step": 4054 }, { "epoch": 0.6396655755807075, "grad_norm": 7.16523551940918, "learning_rate": 3.6773466430526485e-07, "loss": 0.0622, "step": 4055 }, { "epoch": 0.6398233229482982, "grad_norm": 4.8991875648498535, "learning_rate": 3.675736596361294e-07, "loss": 0.0196, "step": 4056 }, { "epoch": 0.6399810703158891, "grad_norm": 2.6943492889404297, "learning_rate": 3.67412654966994e-07, "loss": 0.0322, "step": 4057 }, { "epoch": 0.6401388176834799, "grad_norm": 7.7071709632873535, "learning_rate": 3.672516502978586e-07, "loss": 0.043, "step": 4058 }, { "epoch": 0.6402965650510707, "grad_norm": 7.444522380828857, "learning_rate": 3.670906456287232e-07, "loss": 0.0534, "step": 4059 }, { "epoch": 0.6404543124186615, "grad_norm": 2.598670244216919, "learning_rate": 3.6692964095958783e-07, "loss": 0.0112, "step": 4060 }, { "epoch": 0.6406120597862524, "grad_norm": 3.433713436126709, "learning_rate": 3.667686362904524e-07, "loss": 0.018, "step": 4061 }, { "epoch": 0.6407698071538431, "grad_norm": 5.276607513427734, "learning_rate": 3.6660763162131704e-07, "loss": 0.0593, "step": 4062 }, { "epoch": 0.6409275545214339, "grad_norm": 6.290041923522949, "learning_rate": 3.664466269521816e-07, "loss": 0.0257, "step": 4063 }, { "epoch": 0.6410853018890247, "grad_norm": 7.995701313018799, "learning_rate": 3.662856222830462e-07, "loss": 0.0123, "step": 4064 }, { "epoch": 0.6412430492566156, "grad_norm": 7.102702617645264, "learning_rate": 3.6612461761391076e-07, "loss": 0.0304, "step": 4065 }, { "epoch": 0.6414007966242063, "grad_norm": 4.317173480987549, "learning_rate": 3.659636129447754e-07, "loss": 0.0496, "step": 4066 }, { "epoch": 0.6415585439917971, "grad_norm": 4.227144718170166, "learning_rate": 3.6580260827563997e-07, "loss": 0.0602, "step": 4067 }, { "epoch": 0.6417162913593879, "grad_norm": 3.030519962310791, "learning_rate": 3.656416036065046e-07, "loss": 0.0215, "step": 4068 }, { "epoch": 0.6418740387269788, "grad_norm": 4.835078716278076, "learning_rate": 3.6548059893736913e-07, "loss": 0.0244, "step": 4069 }, { "epoch": 0.6420317860945696, "grad_norm": 4.8524956703186035, "learning_rate": 3.653195942682338e-07, "loss": 0.0243, "step": 4070 }, { "epoch": 0.6421895334621603, "grad_norm": 4.568508148193359, "learning_rate": 3.6515858959909834e-07, "loss": 0.0424, "step": 4071 }, { "epoch": 0.6423472808297511, "grad_norm": 5.0349812507629395, "learning_rate": 3.6499758492996295e-07, "loss": 0.0319, "step": 4072 }, { "epoch": 0.642505028197342, "grad_norm": 7.431118488311768, "learning_rate": 3.6483658026082756e-07, "loss": 0.0182, "step": 4073 }, { "epoch": 0.6426627755649328, "grad_norm": 10.326835632324219, "learning_rate": 3.6467557559169216e-07, "loss": 0.0409, "step": 4074 }, { "epoch": 0.6428205229325236, "grad_norm": 2.9283664226531982, "learning_rate": 3.645145709225567e-07, "loss": 0.03, "step": 4075 }, { "epoch": 0.6429782703001143, "grad_norm": 3.3292901515960693, "learning_rate": 3.643535662534213e-07, "loss": 0.0343, "step": 4076 }, { "epoch": 0.6431360176677051, "grad_norm": 2.3271267414093018, "learning_rate": 3.6419256158428593e-07, "loss": 0.0124, "step": 4077 }, { "epoch": 0.643293765035296, "grad_norm": 8.10965347290039, "learning_rate": 3.6403155691515053e-07, "loss": 0.0259, "step": 4078 }, { "epoch": 0.6434515124028868, "grad_norm": 8.534149169921875, "learning_rate": 3.638705522460151e-07, "loss": 0.0792, "step": 4079 }, { "epoch": 0.6436092597704776, "grad_norm": 4.74871301651001, "learning_rate": 3.637095475768797e-07, "loss": 0.0523, "step": 4080 }, { "epoch": 0.6437670071380683, "grad_norm": 3.940587043762207, "learning_rate": 3.635485429077443e-07, "loss": 0.0595, "step": 4081 }, { "epoch": 0.6439247545056592, "grad_norm": 4.647641181945801, "learning_rate": 3.633875382386089e-07, "loss": 0.0433, "step": 4082 }, { "epoch": 0.64408250187325, "grad_norm": 2.905236005783081, "learning_rate": 3.632265335694735e-07, "loss": 0.0263, "step": 4083 }, { "epoch": 0.6442402492408408, "grad_norm": 4.514106273651123, "learning_rate": 3.6306552890033807e-07, "loss": 0.0355, "step": 4084 }, { "epoch": 0.6443979966084316, "grad_norm": 5.877621173858643, "learning_rate": 3.629045242312027e-07, "loss": 0.0353, "step": 4085 }, { "epoch": 0.6445557439760224, "grad_norm": 5.227679252624512, "learning_rate": 3.627435195620673e-07, "loss": 0.0419, "step": 4086 }, { "epoch": 0.6447134913436132, "grad_norm": 7.069372653961182, "learning_rate": 3.625825148929319e-07, "loss": 0.074, "step": 4087 }, { "epoch": 0.644871238711204, "grad_norm": 3.09944486618042, "learning_rate": 3.624215102237965e-07, "loss": 0.0096, "step": 4088 }, { "epoch": 0.6450289860787948, "grad_norm": 6.890988349914551, "learning_rate": 3.622605055546611e-07, "loss": 0.0276, "step": 4089 }, { "epoch": 0.6451867334463857, "grad_norm": 12.69340705871582, "learning_rate": 3.6209950088552565e-07, "loss": 0.0508, "step": 4090 }, { "epoch": 0.6453444808139764, "grad_norm": 6.514010429382324, "learning_rate": 3.6193849621639026e-07, "loss": 0.0544, "step": 4091 }, { "epoch": 0.6455022281815672, "grad_norm": 4.55770206451416, "learning_rate": 3.6177749154725486e-07, "loss": 0.0171, "step": 4092 }, { "epoch": 0.645659975549158, "grad_norm": 5.213406085968018, "learning_rate": 3.6161648687811947e-07, "loss": 0.0405, "step": 4093 }, { "epoch": 0.6458177229167489, "grad_norm": 5.1065993309021, "learning_rate": 3.61455482208984e-07, "loss": 0.0696, "step": 4094 }, { "epoch": 0.6459754702843397, "grad_norm": 5.2595014572143555, "learning_rate": 3.6129447753984863e-07, "loss": 0.0381, "step": 4095 }, { "epoch": 0.6461332176519304, "grad_norm": 5.897598743438721, "learning_rate": 3.6113347287071324e-07, "loss": 0.0506, "step": 4096 }, { "epoch": 0.6462909650195212, "grad_norm": 5.073671340942383, "learning_rate": 3.6097246820157784e-07, "loss": 0.0504, "step": 4097 }, { "epoch": 0.646448712387112, "grad_norm": 7.696348667144775, "learning_rate": 3.608114635324424e-07, "loss": 0.0859, "step": 4098 }, { "epoch": 0.6466064597547029, "grad_norm": 7.650763511657715, "learning_rate": 3.60650458863307e-07, "loss": 0.0776, "step": 4099 }, { "epoch": 0.6467642071222937, "grad_norm": 9.237130165100098, "learning_rate": 3.604894541941716e-07, "loss": 0.0533, "step": 4100 }, { "epoch": 0.6469219544898844, "grad_norm": 5.437690734863281, "learning_rate": 3.603284495250362e-07, "loss": 0.0593, "step": 4101 }, { "epoch": 0.6470797018574752, "grad_norm": 5.095547199249268, "learning_rate": 3.6016744485590077e-07, "loss": 0.0623, "step": 4102 }, { "epoch": 0.6472374492250661, "grad_norm": 0.9722776412963867, "learning_rate": 3.6000644018676543e-07, "loss": 0.0071, "step": 4103 }, { "epoch": 0.6473951965926569, "grad_norm": 3.607344150543213, "learning_rate": 3.5984543551763e-07, "loss": 0.0378, "step": 4104 }, { "epoch": 0.6475529439602477, "grad_norm": 3.9352469444274902, "learning_rate": 3.596844308484946e-07, "loss": 0.0436, "step": 4105 }, { "epoch": 0.6477106913278384, "grad_norm": 5.423374652862549, "learning_rate": 3.595234261793592e-07, "loss": 0.0451, "step": 4106 }, { "epoch": 0.6478684386954293, "grad_norm": 2.667495012283325, "learning_rate": 3.593624215102238e-07, "loss": 0.0253, "step": 4107 }, { "epoch": 0.6480261860630201, "grad_norm": 7.697785377502441, "learning_rate": 3.592014168410884e-07, "loss": 0.0471, "step": 4108 }, { "epoch": 0.6481839334306109, "grad_norm": 5.53313684463501, "learning_rate": 3.5904041217195296e-07, "loss": 0.0425, "step": 4109 }, { "epoch": 0.6483416807982016, "grad_norm": 2.814230442047119, "learning_rate": 3.5887940750281757e-07, "loss": 0.0162, "step": 4110 }, { "epoch": 0.6484994281657925, "grad_norm": 3.392780065536499, "learning_rate": 3.5871840283368217e-07, "loss": 0.05, "step": 4111 }, { "epoch": 0.6486571755333833, "grad_norm": 11.159510612487793, "learning_rate": 3.585573981645468e-07, "loss": 0.0645, "step": 4112 }, { "epoch": 0.6488149229009741, "grad_norm": 5.5175042152404785, "learning_rate": 3.5839639349541133e-07, "loss": 0.0227, "step": 4113 }, { "epoch": 0.6489726702685649, "grad_norm": 5.363088607788086, "learning_rate": 3.5823538882627594e-07, "loss": 0.034, "step": 4114 }, { "epoch": 0.6491304176361556, "grad_norm": 4.83908224105835, "learning_rate": 3.5807438415714054e-07, "loss": 0.0319, "step": 4115 }, { "epoch": 0.6492881650037465, "grad_norm": 4.075836181640625, "learning_rate": 3.5791337948800515e-07, "loss": 0.0155, "step": 4116 }, { "epoch": 0.6494459123713373, "grad_norm": 3.4654319286346436, "learning_rate": 3.577523748188697e-07, "loss": 0.0382, "step": 4117 }, { "epoch": 0.6496036597389281, "grad_norm": 5.039188385009766, "learning_rate": 3.5759137014973436e-07, "loss": 0.0611, "step": 4118 }, { "epoch": 0.6497614071065189, "grad_norm": 5.757129192352295, "learning_rate": 3.574303654805989e-07, "loss": 0.038, "step": 4119 }, { "epoch": 0.6499191544741098, "grad_norm": 6.571725368499756, "learning_rate": 3.572693608114635e-07, "loss": 0.0511, "step": 4120 }, { "epoch": 0.6500769018417005, "grad_norm": 3.8214361667633057, "learning_rate": 3.571083561423281e-07, "loss": 0.0881, "step": 4121 }, { "epoch": 0.6502346492092913, "grad_norm": 12.41899299621582, "learning_rate": 3.5694735147319274e-07, "loss": 0.0888, "step": 4122 }, { "epoch": 0.6503923965768821, "grad_norm": 4.831263542175293, "learning_rate": 3.567863468040573e-07, "loss": 0.0286, "step": 4123 }, { "epoch": 0.650550143944473, "grad_norm": 8.912391662597656, "learning_rate": 3.566253421349219e-07, "loss": 0.1056, "step": 4124 }, { "epoch": 0.6507078913120637, "grad_norm": 4.068164825439453, "learning_rate": 3.5646433746578645e-07, "loss": 0.0564, "step": 4125 }, { "epoch": 0.6508656386796545, "grad_norm": 3.028892993927002, "learning_rate": 3.563033327966511e-07, "loss": 0.0219, "step": 4126 }, { "epoch": 0.6510233860472453, "grad_norm": 4.021695137023926, "learning_rate": 3.5614232812751566e-07, "loss": 0.0556, "step": 4127 }, { "epoch": 0.6511811334148362, "grad_norm": 5.660754680633545, "learning_rate": 3.5598132345838027e-07, "loss": 0.0738, "step": 4128 }, { "epoch": 0.651338880782427, "grad_norm": 3.951024055480957, "learning_rate": 3.558203187892448e-07, "loss": 0.0117, "step": 4129 }, { "epoch": 0.6514966281500177, "grad_norm": 2.7424662113189697, "learning_rate": 3.556593141201095e-07, "loss": 0.0325, "step": 4130 }, { "epoch": 0.6516543755176085, "grad_norm": 4.798673629760742, "learning_rate": 3.554983094509741e-07, "loss": 0.0704, "step": 4131 }, { "epoch": 0.6518121228851994, "grad_norm": 10.368926048278809, "learning_rate": 3.5533730478183864e-07, "loss": 0.0329, "step": 4132 }, { "epoch": 0.6519698702527902, "grad_norm": 8.735679626464844, "learning_rate": 3.551763001127033e-07, "loss": 0.0419, "step": 4133 }, { "epoch": 0.652127617620381, "grad_norm": 7.130101680755615, "learning_rate": 3.5501529544356785e-07, "loss": 0.0541, "step": 4134 }, { "epoch": 0.6522853649879717, "grad_norm": 4.491020679473877, "learning_rate": 3.5485429077443246e-07, "loss": 0.04, "step": 4135 }, { "epoch": 0.6524431123555625, "grad_norm": 7.604978084564209, "learning_rate": 3.54693286105297e-07, "loss": 0.0552, "step": 4136 }, { "epoch": 0.6526008597231534, "grad_norm": 3.4317190647125244, "learning_rate": 3.5453228143616167e-07, "loss": 0.0243, "step": 4137 }, { "epoch": 0.6527586070907442, "grad_norm": 16.422517776489258, "learning_rate": 3.543712767670262e-07, "loss": 0.1136, "step": 4138 }, { "epoch": 0.652916354458335, "grad_norm": 2.868276834487915, "learning_rate": 3.5421027209789083e-07, "loss": 0.0241, "step": 4139 }, { "epoch": 0.6530741018259257, "grad_norm": 5.0669989585876465, "learning_rate": 3.540492674287554e-07, "loss": 0.0287, "step": 4140 }, { "epoch": 0.6532318491935166, "grad_norm": 5.366792678833008, "learning_rate": 3.5388826275962004e-07, "loss": 0.0286, "step": 4141 }, { "epoch": 0.6533895965611074, "grad_norm": 6.7800493240356445, "learning_rate": 3.537272580904846e-07, "loss": 0.0189, "step": 4142 }, { "epoch": 0.6535473439286982, "grad_norm": 4.256573677062988, "learning_rate": 3.535662534213492e-07, "loss": 0.0498, "step": 4143 }, { "epoch": 0.653705091296289, "grad_norm": 6.63882303237915, "learning_rate": 3.5340524875221376e-07, "loss": 0.0751, "step": 4144 }, { "epoch": 0.6538628386638798, "grad_norm": 7.1804890632629395, "learning_rate": 3.532442440830784e-07, "loss": 0.0514, "step": 4145 }, { "epoch": 0.6540205860314706, "grad_norm": 5.477293491363525, "learning_rate": 3.5308323941394297e-07, "loss": 0.033, "step": 4146 }, { "epoch": 0.6541783333990614, "grad_norm": 5.306337356567383, "learning_rate": 3.529222347448076e-07, "loss": 0.0401, "step": 4147 }, { "epoch": 0.6543360807666522, "grad_norm": 5.703723430633545, "learning_rate": 3.527612300756722e-07, "loss": 0.051, "step": 4148 }, { "epoch": 0.6544938281342431, "grad_norm": 6.859851360321045, "learning_rate": 3.526002254065368e-07, "loss": 0.0838, "step": 4149 }, { "epoch": 0.6546515755018338, "grad_norm": 4.944122314453125, "learning_rate": 3.5243922073740134e-07, "loss": 0.0609, "step": 4150 }, { "epoch": 0.6548093228694246, "grad_norm": 2.2758069038391113, "learning_rate": 3.5227821606826595e-07, "loss": 0.0182, "step": 4151 }, { "epoch": 0.6549670702370154, "grad_norm": 8.949913024902344, "learning_rate": 3.5211721139913055e-07, "loss": 0.0527, "step": 4152 }, { "epoch": 0.6551248176046062, "grad_norm": 4.411393165588379, "learning_rate": 3.5195620672999516e-07, "loss": 0.0257, "step": 4153 }, { "epoch": 0.6552825649721971, "grad_norm": 4.506752014160156, "learning_rate": 3.5179520206085977e-07, "loss": 0.0277, "step": 4154 }, { "epoch": 0.6554403123397878, "grad_norm": 4.197580814361572, "learning_rate": 3.516341973917243e-07, "loss": 0.0301, "step": 4155 }, { "epoch": 0.6555980597073786, "grad_norm": 4.70909309387207, "learning_rate": 3.51473192722589e-07, "loss": 0.0328, "step": 4156 }, { "epoch": 0.6557558070749694, "grad_norm": 4.624414443969727, "learning_rate": 3.5131218805345353e-07, "loss": 0.0461, "step": 4157 }, { "epoch": 0.6559135544425603, "grad_norm": 4.482852935791016, "learning_rate": 3.5115118338431814e-07, "loss": 0.0185, "step": 4158 }, { "epoch": 0.656071301810151, "grad_norm": 7.920719623565674, "learning_rate": 3.509901787151827e-07, "loss": 0.06, "step": 4159 }, { "epoch": 0.6562290491777418, "grad_norm": 3.98606538772583, "learning_rate": 3.5082917404604735e-07, "loss": 0.0386, "step": 4160 }, { "epoch": 0.6563867965453326, "grad_norm": 3.9634904861450195, "learning_rate": 3.506681693769119e-07, "loss": 0.0345, "step": 4161 }, { "epoch": 0.6565445439129235, "grad_norm": 5.764398097991943, "learning_rate": 3.505071647077765e-07, "loss": 0.0681, "step": 4162 }, { "epoch": 0.6567022912805143, "grad_norm": 4.86509370803833, "learning_rate": 3.503461600386411e-07, "loss": 0.0353, "step": 4163 }, { "epoch": 0.656860038648105, "grad_norm": 5.676183700561523, "learning_rate": 3.501851553695057e-07, "loss": 0.0266, "step": 4164 }, { "epoch": 0.6570177860156958, "grad_norm": 6.578132629394531, "learning_rate": 3.500241507003703e-07, "loss": 0.064, "step": 4165 }, { "epoch": 0.6571755333832867, "grad_norm": 3.3060572147369385, "learning_rate": 3.498631460312349e-07, "loss": 0.032, "step": 4166 }, { "epoch": 0.6573332807508775, "grad_norm": 4.167490005493164, "learning_rate": 3.497021413620995e-07, "loss": 0.0383, "step": 4167 }, { "epoch": 0.6574910281184683, "grad_norm": 3.2244651317596436, "learning_rate": 3.495411366929641e-07, "loss": 0.0161, "step": 4168 }, { "epoch": 0.657648775486059, "grad_norm": 6.713063716888428, "learning_rate": 3.4938013202382865e-07, "loss": 0.0506, "step": 4169 }, { "epoch": 0.6578065228536499, "grad_norm": 8.036482810974121, "learning_rate": 3.4921912735469326e-07, "loss": 0.0809, "step": 4170 }, { "epoch": 0.6579642702212407, "grad_norm": 9.210870742797852, "learning_rate": 3.4905812268555786e-07, "loss": 0.0779, "step": 4171 }, { "epoch": 0.6581220175888315, "grad_norm": 6.666898727416992, "learning_rate": 3.4889711801642247e-07, "loss": 0.0443, "step": 4172 }, { "epoch": 0.6582797649564223, "grad_norm": 3.237283229827881, "learning_rate": 3.48736113347287e-07, "loss": 0.0131, "step": 4173 }, { "epoch": 0.658437512324013, "grad_norm": 6.588843822479248, "learning_rate": 3.4857510867815163e-07, "loss": 0.0341, "step": 4174 }, { "epoch": 0.6585952596916039, "grad_norm": 9.462082862854004, "learning_rate": 3.4841410400901623e-07, "loss": 0.0871, "step": 4175 }, { "epoch": 0.6587530070591947, "grad_norm": 2.9450438022613525, "learning_rate": 3.4825309933988084e-07, "loss": 0.0546, "step": 4176 }, { "epoch": 0.6589107544267855, "grad_norm": 4.6205058097839355, "learning_rate": 3.4809209467074545e-07, "loss": 0.0616, "step": 4177 }, { "epoch": 0.6590685017943763, "grad_norm": 5.313398838043213, "learning_rate": 3.4793109000161005e-07, "loss": 0.0259, "step": 4178 }, { "epoch": 0.6592262491619671, "grad_norm": 12.58399772644043, "learning_rate": 3.4777008533247466e-07, "loss": 0.0592, "step": 4179 }, { "epoch": 0.6593839965295579, "grad_norm": 4.588766098022461, "learning_rate": 3.476090806633392e-07, "loss": 0.0348, "step": 4180 }, { "epoch": 0.6595417438971487, "grad_norm": 4.431708812713623, "learning_rate": 3.474480759942038e-07, "loss": 0.0355, "step": 4181 }, { "epoch": 0.6596994912647395, "grad_norm": 3.6338794231414795, "learning_rate": 3.472870713250684e-07, "loss": 0.0579, "step": 4182 }, { "epoch": 0.6598572386323304, "grad_norm": 5.266646385192871, "learning_rate": 3.4712606665593303e-07, "loss": 0.054, "step": 4183 }, { "epoch": 0.6600149859999211, "grad_norm": 1.8733805418014526, "learning_rate": 3.469650619867976e-07, "loss": 0.0286, "step": 4184 }, { "epoch": 0.6601727333675119, "grad_norm": 2.6201236248016357, "learning_rate": 3.468040573176622e-07, "loss": 0.0237, "step": 4185 }, { "epoch": 0.6603304807351027, "grad_norm": 4.8239827156066895, "learning_rate": 3.466430526485268e-07, "loss": 0.0303, "step": 4186 }, { "epoch": 0.6604882281026936, "grad_norm": 4.803821563720703, "learning_rate": 3.464820479793914e-07, "loss": 0.033, "step": 4187 }, { "epoch": 0.6606459754702844, "grad_norm": 6.170940399169922, "learning_rate": 3.4632104331025596e-07, "loss": 0.0473, "step": 4188 }, { "epoch": 0.6608037228378751, "grad_norm": 1.6020370721817017, "learning_rate": 3.4616003864112056e-07, "loss": 0.0126, "step": 4189 }, { "epoch": 0.6609614702054659, "grad_norm": 5.833771705627441, "learning_rate": 3.4599903397198517e-07, "loss": 0.0836, "step": 4190 }, { "epoch": 0.6611192175730567, "grad_norm": 6.305184841156006, "learning_rate": 3.458380293028498e-07, "loss": 0.0286, "step": 4191 }, { "epoch": 0.6612769649406476, "grad_norm": 12.768031120300293, "learning_rate": 3.4567702463371433e-07, "loss": 0.0645, "step": 4192 }, { "epoch": 0.6614347123082384, "grad_norm": 5.400166988372803, "learning_rate": 3.45516019964579e-07, "loss": 0.0257, "step": 4193 }, { "epoch": 0.6615924596758291, "grad_norm": 2.253767728805542, "learning_rate": 3.4535501529544354e-07, "loss": 0.0267, "step": 4194 }, { "epoch": 0.6617502070434199, "grad_norm": 4.741762638092041, "learning_rate": 3.4519401062630815e-07, "loss": 0.0288, "step": 4195 }, { "epoch": 0.6619079544110108, "grad_norm": 5.254599094390869, "learning_rate": 3.450330059571727e-07, "loss": 0.0667, "step": 4196 }, { "epoch": 0.6620657017786016, "grad_norm": 2.3707597255706787, "learning_rate": 3.4487200128803736e-07, "loss": 0.051, "step": 4197 }, { "epoch": 0.6622234491461924, "grad_norm": 3.265528440475464, "learning_rate": 3.447109966189019e-07, "loss": 0.0385, "step": 4198 }, { "epoch": 0.6623811965137831, "grad_norm": 3.6523993015289307, "learning_rate": 3.445499919497665e-07, "loss": 0.0626, "step": 4199 }, { "epoch": 0.662538943881374, "grad_norm": 4.667764186859131, "learning_rate": 3.4438898728063113e-07, "loss": 0.0559, "step": 4200 }, { "epoch": 0.6626966912489648, "grad_norm": 8.858838081359863, "learning_rate": 3.4422798261149573e-07, "loss": 0.0963, "step": 4201 }, { "epoch": 0.6628544386165556, "grad_norm": 7.327144622802734, "learning_rate": 3.4406697794236034e-07, "loss": 0.0662, "step": 4202 }, { "epoch": 0.6630121859841464, "grad_norm": 4.257870197296143, "learning_rate": 3.439059732732249e-07, "loss": 0.0517, "step": 4203 }, { "epoch": 0.6631699333517372, "grad_norm": 6.316591739654541, "learning_rate": 3.437449686040895e-07, "loss": 0.0466, "step": 4204 }, { "epoch": 0.663327680719328, "grad_norm": 4.2004594802856445, "learning_rate": 3.435839639349541e-07, "loss": 0.0359, "step": 4205 }, { "epoch": 0.6634854280869188, "grad_norm": 6.217618942260742, "learning_rate": 3.434229592658187e-07, "loss": 0.0308, "step": 4206 }, { "epoch": 0.6636431754545096, "grad_norm": 3.3948912620544434, "learning_rate": 3.4326195459668327e-07, "loss": 0.0388, "step": 4207 }, { "epoch": 0.6638009228221005, "grad_norm": 5.694115161895752, "learning_rate": 3.431009499275479e-07, "loss": 0.0309, "step": 4208 }, { "epoch": 0.6639586701896912, "grad_norm": 3.7904810905456543, "learning_rate": 3.429399452584125e-07, "loss": 0.0198, "step": 4209 }, { "epoch": 0.664116417557282, "grad_norm": 6.0549092292785645, "learning_rate": 3.427789405892771e-07, "loss": 0.0374, "step": 4210 }, { "epoch": 0.6642741649248728, "grad_norm": 3.5876405239105225, "learning_rate": 3.4261793592014164e-07, "loss": 0.028, "step": 4211 }, { "epoch": 0.6644319122924636, "grad_norm": 3.949674606323242, "learning_rate": 3.424569312510063e-07, "loss": 0.0501, "step": 4212 }, { "epoch": 0.6645896596600545, "grad_norm": 4.2892069816589355, "learning_rate": 3.4229592658187085e-07, "loss": 0.0243, "step": 4213 }, { "epoch": 0.6647474070276452, "grad_norm": 6.020479679107666, "learning_rate": 3.4213492191273546e-07, "loss": 0.1264, "step": 4214 }, { "epoch": 0.664905154395236, "grad_norm": 3.7382099628448486, "learning_rate": 3.419739172436e-07, "loss": 0.0297, "step": 4215 }, { "epoch": 0.6650629017628268, "grad_norm": 3.4760491847991943, "learning_rate": 3.4181291257446467e-07, "loss": 0.02, "step": 4216 }, { "epoch": 0.6652206491304177, "grad_norm": 5.966982364654541, "learning_rate": 3.416519079053292e-07, "loss": 0.0269, "step": 4217 }, { "epoch": 0.6653783964980085, "grad_norm": 4.919923782348633, "learning_rate": 3.4149090323619383e-07, "loss": 0.0698, "step": 4218 }, { "epoch": 0.6655361438655992, "grad_norm": 10.699564933776855, "learning_rate": 3.413298985670584e-07, "loss": 0.06, "step": 4219 }, { "epoch": 0.66569389123319, "grad_norm": 1.5768576860427856, "learning_rate": 3.4116889389792304e-07, "loss": 0.0097, "step": 4220 }, { "epoch": 0.6658516386007809, "grad_norm": 4.370302200317383, "learning_rate": 3.410078892287876e-07, "loss": 0.0334, "step": 4221 }, { "epoch": 0.6660093859683717, "grad_norm": 9.208818435668945, "learning_rate": 3.408468845596522e-07, "loss": 0.0628, "step": 4222 }, { "epoch": 0.6661671333359624, "grad_norm": 4.493042469024658, "learning_rate": 3.4068587989051686e-07, "loss": 0.0419, "step": 4223 }, { "epoch": 0.6663248807035532, "grad_norm": 2.3730244636535645, "learning_rate": 3.405248752213814e-07, "loss": 0.0316, "step": 4224 }, { "epoch": 0.6664826280711441, "grad_norm": 5.553833961486816, "learning_rate": 3.40363870552246e-07, "loss": 0.0477, "step": 4225 }, { "epoch": 0.6666403754387349, "grad_norm": 6.563319206237793, "learning_rate": 3.402028658831106e-07, "loss": 0.0242, "step": 4226 }, { "epoch": 0.6667981228063257, "grad_norm": 3.1789028644561768, "learning_rate": 3.4004186121397523e-07, "loss": 0.0207, "step": 4227 }, { "epoch": 0.6669558701739164, "grad_norm": 6.585756778717041, "learning_rate": 3.398808565448398e-07, "loss": 0.0366, "step": 4228 }, { "epoch": 0.6671136175415073, "grad_norm": 4.605175971984863, "learning_rate": 3.397198518757044e-07, "loss": 0.0557, "step": 4229 }, { "epoch": 0.6672713649090981, "grad_norm": 1.7443026304244995, "learning_rate": 3.3955884720656895e-07, "loss": 0.0055, "step": 4230 }, { "epoch": 0.6674291122766889, "grad_norm": 10.741480827331543, "learning_rate": 3.393978425374336e-07, "loss": 0.0625, "step": 4231 }, { "epoch": 0.6675868596442797, "grad_norm": 5.386422634124756, "learning_rate": 3.3923683786829816e-07, "loss": 0.0417, "step": 4232 }, { "epoch": 0.6677446070118704, "grad_norm": 3.263134002685547, "learning_rate": 3.3907583319916277e-07, "loss": 0.0708, "step": 4233 }, { "epoch": 0.6679023543794613, "grad_norm": 16.562847137451172, "learning_rate": 3.389148285300273e-07, "loss": 0.0416, "step": 4234 }, { "epoch": 0.6680601017470521, "grad_norm": 4.665866851806641, "learning_rate": 3.38753823860892e-07, "loss": 0.0315, "step": 4235 }, { "epoch": 0.6682178491146429, "grad_norm": 6.751397609710693, "learning_rate": 3.3859281919175653e-07, "loss": 0.055, "step": 4236 }, { "epoch": 0.6683755964822337, "grad_norm": 6.178576946258545, "learning_rate": 3.3843181452262114e-07, "loss": 0.0387, "step": 4237 }, { "epoch": 0.6685333438498245, "grad_norm": 1.5225474834442139, "learning_rate": 3.382708098534857e-07, "loss": 0.0115, "step": 4238 }, { "epoch": 0.6686910912174153, "grad_norm": 5.776130199432373, "learning_rate": 3.3810980518435035e-07, "loss": 0.038, "step": 4239 }, { "epoch": 0.6688488385850061, "grad_norm": 4.210666656494141, "learning_rate": 3.379488005152149e-07, "loss": 0.0325, "step": 4240 }, { "epoch": 0.6690065859525969, "grad_norm": 4.568084239959717, "learning_rate": 3.377877958460795e-07, "loss": 0.0552, "step": 4241 }, { "epoch": 0.6691643333201878, "grad_norm": 3.740183115005493, "learning_rate": 3.376267911769441e-07, "loss": 0.0464, "step": 4242 }, { "epoch": 0.6693220806877785, "grad_norm": 3.6826813220977783, "learning_rate": 3.374657865078087e-07, "loss": 0.0426, "step": 4243 }, { "epoch": 0.6694798280553693, "grad_norm": 7.331096649169922, "learning_rate": 3.373047818386733e-07, "loss": 0.0952, "step": 4244 }, { "epoch": 0.6696375754229601, "grad_norm": 10.238526344299316, "learning_rate": 3.371437771695379e-07, "loss": 0.0551, "step": 4245 }, { "epoch": 0.669795322790551, "grad_norm": 7.426093578338623, "learning_rate": 3.3698277250040254e-07, "loss": 0.0571, "step": 4246 }, { "epoch": 0.6699530701581418, "grad_norm": 4.035054683685303, "learning_rate": 3.368217678312671e-07, "loss": 0.0272, "step": 4247 }, { "epoch": 0.6701108175257325, "grad_norm": 5.987761497497559, "learning_rate": 3.366607631621317e-07, "loss": 0.0643, "step": 4248 }, { "epoch": 0.6702685648933233, "grad_norm": 4.998387813568115, "learning_rate": 3.3649975849299625e-07, "loss": 0.031, "step": 4249 }, { "epoch": 0.6704263122609141, "grad_norm": 2.5684804916381836, "learning_rate": 3.363387538238609e-07, "loss": 0.0174, "step": 4250 }, { "epoch": 0.670584059628505, "grad_norm": 4.180668830871582, "learning_rate": 3.3617774915472547e-07, "loss": 0.0449, "step": 4251 }, { "epoch": 0.6707418069960958, "grad_norm": 8.460077285766602, "learning_rate": 3.360167444855901e-07, "loss": 0.0965, "step": 4252 }, { "epoch": 0.6708995543636865, "grad_norm": 4.493109703063965, "learning_rate": 3.3585573981645463e-07, "loss": 0.0589, "step": 4253 }, { "epoch": 0.6710573017312773, "grad_norm": 6.921128749847412, "learning_rate": 3.356947351473193e-07, "loss": 0.0487, "step": 4254 }, { "epoch": 0.6712150490988682, "grad_norm": 7.861953258514404, "learning_rate": 3.3553373047818384e-07, "loss": 0.1062, "step": 4255 }, { "epoch": 0.671372796466459, "grad_norm": 3.8344569206237793, "learning_rate": 3.3537272580904845e-07, "loss": 0.0313, "step": 4256 }, { "epoch": 0.6715305438340498, "grad_norm": 4.822232723236084, "learning_rate": 3.3521172113991305e-07, "loss": 0.0378, "step": 4257 }, { "epoch": 0.6716882912016405, "grad_norm": 12.531158447265625, "learning_rate": 3.3505071647077766e-07, "loss": 0.1177, "step": 4258 }, { "epoch": 0.6718460385692314, "grad_norm": 6.85503625869751, "learning_rate": 3.348897118016422e-07, "loss": 0.0474, "step": 4259 }, { "epoch": 0.6720037859368222, "grad_norm": 5.491405487060547, "learning_rate": 3.347287071325068e-07, "loss": 0.0428, "step": 4260 }, { "epoch": 0.672161533304413, "grad_norm": 1.6274034976959229, "learning_rate": 3.345677024633714e-07, "loss": 0.0176, "step": 4261 }, { "epoch": 0.6723192806720037, "grad_norm": 11.389778137207031, "learning_rate": 3.3440669779423603e-07, "loss": 0.0727, "step": 4262 }, { "epoch": 0.6724770280395946, "grad_norm": 4.566437244415283, "learning_rate": 3.342456931251006e-07, "loss": 0.0446, "step": 4263 }, { "epoch": 0.6726347754071854, "grad_norm": 6.433427333831787, "learning_rate": 3.340846884559652e-07, "loss": 0.0446, "step": 4264 }, { "epoch": 0.6727925227747762, "grad_norm": 6.395633697509766, "learning_rate": 3.339236837868298e-07, "loss": 0.0712, "step": 4265 }, { "epoch": 0.672950270142367, "grad_norm": 6.634777545928955, "learning_rate": 3.337626791176944e-07, "loss": 0.0385, "step": 4266 }, { "epoch": 0.6731080175099579, "grad_norm": 3.537296772003174, "learning_rate": 3.3360167444855896e-07, "loss": 0.0313, "step": 4267 }, { "epoch": 0.6732657648775486, "grad_norm": 4.52726936340332, "learning_rate": 3.3344066977942356e-07, "loss": 0.0617, "step": 4268 }, { "epoch": 0.6734235122451394, "grad_norm": 7.097104072570801, "learning_rate": 3.332796651102882e-07, "loss": 0.0645, "step": 4269 }, { "epoch": 0.6735812596127302, "grad_norm": 3.4680495262145996, "learning_rate": 3.331186604411528e-07, "loss": 0.0266, "step": 4270 }, { "epoch": 0.673739006980321, "grad_norm": 3.1925270557403564, "learning_rate": 3.329576557720174e-07, "loss": 0.0421, "step": 4271 }, { "epoch": 0.6738967543479119, "grad_norm": 151.88665771484375, "learning_rate": 3.32796651102882e-07, "loss": 0.0802, "step": 4272 }, { "epoch": 0.6740545017155026, "grad_norm": 4.57705020904541, "learning_rate": 3.326356464337466e-07, "loss": 0.0307, "step": 4273 }, { "epoch": 0.6742122490830934, "grad_norm": 5.487161636352539, "learning_rate": 3.3247464176461115e-07, "loss": 0.0508, "step": 4274 }, { "epoch": 0.6743699964506842, "grad_norm": 5.27452278137207, "learning_rate": 3.3231363709547575e-07, "loss": 0.0557, "step": 4275 }, { "epoch": 0.6745277438182751, "grad_norm": 6.933518409729004, "learning_rate": 3.3215263242634036e-07, "loss": 0.0624, "step": 4276 }, { "epoch": 0.6746854911858658, "grad_norm": 6.866750717163086, "learning_rate": 3.3199162775720497e-07, "loss": 0.0453, "step": 4277 }, { "epoch": 0.6748432385534566, "grad_norm": 4.619298458099365, "learning_rate": 3.318306230880695e-07, "loss": 0.0476, "step": 4278 }, { "epoch": 0.6750009859210474, "grad_norm": 5.475695610046387, "learning_rate": 3.3166961841893413e-07, "loss": 0.0608, "step": 4279 }, { "epoch": 0.6751587332886383, "grad_norm": 5.759555816650391, "learning_rate": 3.3150861374979873e-07, "loss": 0.044, "step": 4280 }, { "epoch": 0.6753164806562291, "grad_norm": 6.68592643737793, "learning_rate": 3.3134760908066334e-07, "loss": 0.0689, "step": 4281 }, { "epoch": 0.6754742280238198, "grad_norm": 3.2656190395355225, "learning_rate": 3.311866044115279e-07, "loss": 0.0328, "step": 4282 }, { "epoch": 0.6756319753914106, "grad_norm": 4.156400680541992, "learning_rate": 3.310255997423925e-07, "loss": 0.0413, "step": 4283 }, { "epoch": 0.6757897227590015, "grad_norm": 5.9173583984375, "learning_rate": 3.308645950732571e-07, "loss": 0.0737, "step": 4284 }, { "epoch": 0.6759474701265923, "grad_norm": 2.6478850841522217, "learning_rate": 3.307035904041217e-07, "loss": 0.0155, "step": 4285 }, { "epoch": 0.6761052174941831, "grad_norm": 5.950133800506592, "learning_rate": 3.3054258573498626e-07, "loss": 0.0385, "step": 4286 }, { "epoch": 0.6762629648617738, "grad_norm": 5.7482099533081055, "learning_rate": 3.303815810658509e-07, "loss": 0.0617, "step": 4287 }, { "epoch": 0.6764207122293646, "grad_norm": 1.9521363973617554, "learning_rate": 3.302205763967155e-07, "loss": 0.0122, "step": 4288 }, { "epoch": 0.6765784595969555, "grad_norm": 2.926424264907837, "learning_rate": 3.300595717275801e-07, "loss": 0.0215, "step": 4289 }, { "epoch": 0.6767362069645463, "grad_norm": 5.0904860496521, "learning_rate": 3.2989856705844464e-07, "loss": 0.0627, "step": 4290 }, { "epoch": 0.6768939543321371, "grad_norm": 3.662292003631592, "learning_rate": 3.297375623893093e-07, "loss": 0.049, "step": 4291 }, { "epoch": 0.6770517016997278, "grad_norm": 4.862106800079346, "learning_rate": 3.295765577201739e-07, "loss": 0.038, "step": 4292 }, { "epoch": 0.6772094490673187, "grad_norm": 7.885111331939697, "learning_rate": 3.2941555305103846e-07, "loss": 0.0509, "step": 4293 }, { "epoch": 0.6773671964349095, "grad_norm": 4.848024368286133, "learning_rate": 3.2925454838190306e-07, "loss": 0.0497, "step": 4294 }, { "epoch": 0.6775249438025003, "grad_norm": 7.610829830169678, "learning_rate": 3.2909354371276767e-07, "loss": 0.06, "step": 4295 }, { "epoch": 0.6776826911700911, "grad_norm": 5.329433917999268, "learning_rate": 3.289325390436323e-07, "loss": 0.0599, "step": 4296 }, { "epoch": 0.6778404385376819, "grad_norm": 4.244685173034668, "learning_rate": 3.2877153437449683e-07, "loss": 0.021, "step": 4297 }, { "epoch": 0.6779981859052727, "grad_norm": 3.7655065059661865, "learning_rate": 3.2861052970536143e-07, "loss": 0.0361, "step": 4298 }, { "epoch": 0.6781559332728635, "grad_norm": 6.28010892868042, "learning_rate": 3.2844952503622604e-07, "loss": 0.0568, "step": 4299 }, { "epoch": 0.6783136806404543, "grad_norm": 7.3299055099487305, "learning_rate": 3.2828852036709065e-07, "loss": 0.0582, "step": 4300 }, { "epoch": 0.6784714280080452, "grad_norm": 7.427062034606934, "learning_rate": 3.281275156979552e-07, "loss": 0.0659, "step": 4301 }, { "epoch": 0.6786291753756359, "grad_norm": 3.0374765396118164, "learning_rate": 3.2796651102881986e-07, "loss": 0.0342, "step": 4302 }, { "epoch": 0.6787869227432267, "grad_norm": 10.06717586517334, "learning_rate": 3.278055063596844e-07, "loss": 0.0529, "step": 4303 }, { "epoch": 0.6789446701108175, "grad_norm": 7.636187553405762, "learning_rate": 3.27644501690549e-07, "loss": 0.0607, "step": 4304 }, { "epoch": 0.6791024174784084, "grad_norm": 2.9388339519500732, "learning_rate": 3.2748349702141357e-07, "loss": 0.0135, "step": 4305 }, { "epoch": 0.6792601648459992, "grad_norm": 3.9617867469787598, "learning_rate": 3.2732249235227823e-07, "loss": 0.0261, "step": 4306 }, { "epoch": 0.6794179122135899, "grad_norm": 3.3003311157226562, "learning_rate": 3.271614876831428e-07, "loss": 0.0218, "step": 4307 }, { "epoch": 0.6795756595811807, "grad_norm": 3.664931535720825, "learning_rate": 3.270004830140074e-07, "loss": 0.0604, "step": 4308 }, { "epoch": 0.6797334069487715, "grad_norm": 4.4985857009887695, "learning_rate": 3.2683947834487195e-07, "loss": 0.0699, "step": 4309 }, { "epoch": 0.6798911543163624, "grad_norm": 4.862767219543457, "learning_rate": 3.266784736757366e-07, "loss": 0.0434, "step": 4310 }, { "epoch": 0.6800489016839532, "grad_norm": 4.629154205322266, "learning_rate": 3.2651746900660116e-07, "loss": 0.031, "step": 4311 }, { "epoch": 0.6802066490515439, "grad_norm": 5.003152370452881, "learning_rate": 3.2635646433746576e-07, "loss": 0.0605, "step": 4312 }, { "epoch": 0.6803643964191347, "grad_norm": 8.366888046264648, "learning_rate": 3.261954596683303e-07, "loss": 0.0345, "step": 4313 }, { "epoch": 0.6805221437867256, "grad_norm": 5.949161529541016, "learning_rate": 3.26034454999195e-07, "loss": 0.0417, "step": 4314 }, { "epoch": 0.6806798911543164, "grad_norm": 2.953150749206543, "learning_rate": 3.2587345033005953e-07, "loss": 0.0154, "step": 4315 }, { "epoch": 0.6808376385219072, "grad_norm": 4.614923477172852, "learning_rate": 3.2571244566092414e-07, "loss": 0.0489, "step": 4316 }, { "epoch": 0.6809953858894979, "grad_norm": 7.076268672943115, "learning_rate": 3.255514409917888e-07, "loss": 0.0555, "step": 4317 }, { "epoch": 0.6811531332570888, "grad_norm": 5.182340145111084, "learning_rate": 3.2539043632265335e-07, "loss": 0.0308, "step": 4318 }, { "epoch": 0.6813108806246796, "grad_norm": 5.0712666511535645, "learning_rate": 3.2522943165351796e-07, "loss": 0.0365, "step": 4319 }, { "epoch": 0.6814686279922704, "grad_norm": 5.439001083374023, "learning_rate": 3.250684269843825e-07, "loss": 0.0423, "step": 4320 }, { "epoch": 0.6816263753598611, "grad_norm": 7.498855113983154, "learning_rate": 3.2490742231524717e-07, "loss": 0.0813, "step": 4321 }, { "epoch": 0.681784122727452, "grad_norm": 9.36682415008545, "learning_rate": 3.247464176461117e-07, "loss": 0.0418, "step": 4322 }, { "epoch": 0.6819418700950428, "grad_norm": 6.6287522315979, "learning_rate": 3.2458541297697633e-07, "loss": 0.1025, "step": 4323 }, { "epoch": 0.6820996174626336, "grad_norm": 3.7365763187408447, "learning_rate": 3.244244083078409e-07, "loss": 0.0277, "step": 4324 }, { "epoch": 0.6822573648302244, "grad_norm": 4.683732509613037, "learning_rate": 3.2426340363870554e-07, "loss": 0.0367, "step": 4325 }, { "epoch": 0.6824151121978151, "grad_norm": 6.683684349060059, "learning_rate": 3.241023989695701e-07, "loss": 0.0389, "step": 4326 }, { "epoch": 0.682572859565406, "grad_norm": 5.876407623291016, "learning_rate": 3.239413943004347e-07, "loss": 0.0854, "step": 4327 }, { "epoch": 0.6827306069329968, "grad_norm": 3.6485280990600586, "learning_rate": 3.2378038963129925e-07, "loss": 0.0292, "step": 4328 }, { "epoch": 0.6828883543005876, "grad_norm": 5.209390640258789, "learning_rate": 3.236193849621639e-07, "loss": 0.0215, "step": 4329 }, { "epoch": 0.6830461016681784, "grad_norm": 4.231175422668457, "learning_rate": 3.2345838029302847e-07, "loss": 0.0724, "step": 4330 }, { "epoch": 0.6832038490357693, "grad_norm": 3.57532000541687, "learning_rate": 3.2329737562389307e-07, "loss": 0.0354, "step": 4331 }, { "epoch": 0.68336159640336, "grad_norm": 5.371700763702393, "learning_rate": 3.231363709547577e-07, "loss": 0.0392, "step": 4332 }, { "epoch": 0.6835193437709508, "grad_norm": 5.248849391937256, "learning_rate": 3.229753662856223e-07, "loss": 0.0308, "step": 4333 }, { "epoch": 0.6836770911385416, "grad_norm": 5.769176959991455, "learning_rate": 3.2281436161648684e-07, "loss": 0.0463, "step": 4334 }, { "epoch": 0.6838348385061325, "grad_norm": 2.052978992462158, "learning_rate": 3.2265335694735144e-07, "loss": 0.0169, "step": 4335 }, { "epoch": 0.6839925858737232, "grad_norm": 4.024712085723877, "learning_rate": 3.2249235227821605e-07, "loss": 0.0284, "step": 4336 }, { "epoch": 0.684150333241314, "grad_norm": 4.762681484222412, "learning_rate": 3.2233134760908066e-07, "loss": 0.0418, "step": 4337 }, { "epoch": 0.6843080806089048, "grad_norm": 3.959506034851074, "learning_rate": 3.221703429399452e-07, "loss": 0.0366, "step": 4338 }, { "epoch": 0.6844658279764957, "grad_norm": 3.946669816970825, "learning_rate": 3.220093382708098e-07, "loss": 0.0447, "step": 4339 }, { "epoch": 0.6846235753440865, "grad_norm": 5.558835029602051, "learning_rate": 3.218483336016745e-07, "loss": 0.0557, "step": 4340 }, { "epoch": 0.6847813227116772, "grad_norm": 2.9637818336486816, "learning_rate": 3.2168732893253903e-07, "loss": 0.0119, "step": 4341 }, { "epoch": 0.684939070079268, "grad_norm": 5.928901195526123, "learning_rate": 3.2152632426340364e-07, "loss": 0.0738, "step": 4342 }, { "epoch": 0.6850968174468589, "grad_norm": 4.637601375579834, "learning_rate": 3.213653195942682e-07, "loss": 0.0391, "step": 4343 }, { "epoch": 0.6852545648144497, "grad_norm": 5.620077610015869, "learning_rate": 3.2120431492513285e-07, "loss": 0.0341, "step": 4344 }, { "epoch": 0.6854123121820405, "grad_norm": 5.471549987792969, "learning_rate": 3.210433102559974e-07, "loss": 0.0798, "step": 4345 }, { "epoch": 0.6855700595496312, "grad_norm": 17.964046478271484, "learning_rate": 3.20882305586862e-07, "loss": 0.0637, "step": 4346 }, { "epoch": 0.685727806917222, "grad_norm": 3.0591983795166016, "learning_rate": 3.207213009177266e-07, "loss": 0.0194, "step": 4347 }, { "epoch": 0.6858855542848129, "grad_norm": 5.701319217681885, "learning_rate": 3.205602962485912e-07, "loss": 0.0551, "step": 4348 }, { "epoch": 0.6860433016524037, "grad_norm": 4.1603851318359375, "learning_rate": 3.203992915794558e-07, "loss": 0.0474, "step": 4349 }, { "epoch": 0.6862010490199945, "grad_norm": 9.443543434143066, "learning_rate": 3.202382869103204e-07, "loss": 0.0295, "step": 4350 }, { "epoch": 0.6863587963875852, "grad_norm": 5.248782634735107, "learning_rate": 3.20077282241185e-07, "loss": 0.0428, "step": 4351 }, { "epoch": 0.6865165437551761, "grad_norm": 4.237235069274902, "learning_rate": 3.199162775720496e-07, "loss": 0.0349, "step": 4352 }, { "epoch": 0.6866742911227669, "grad_norm": 3.3554368019104004, "learning_rate": 3.1975527290291415e-07, "loss": 0.019, "step": 4353 }, { "epoch": 0.6868320384903577, "grad_norm": 6.664609909057617, "learning_rate": 3.1959426823377875e-07, "loss": 0.0383, "step": 4354 }, { "epoch": 0.6869897858579485, "grad_norm": 4.52626371383667, "learning_rate": 3.1943326356464336e-07, "loss": 0.0221, "step": 4355 }, { "epoch": 0.6871475332255393, "grad_norm": 5.13330078125, "learning_rate": 3.1927225889550797e-07, "loss": 0.0715, "step": 4356 }, { "epoch": 0.6873052805931301, "grad_norm": 6.823228359222412, "learning_rate": 3.191112542263725e-07, "loss": 0.0701, "step": 4357 }, { "epoch": 0.6874630279607209, "grad_norm": 5.131317138671875, "learning_rate": 3.189502495572371e-07, "loss": 0.0364, "step": 4358 }, { "epoch": 0.6876207753283117, "grad_norm": 1.8192733526229858, "learning_rate": 3.1878924488810173e-07, "loss": 0.0108, "step": 4359 }, { "epoch": 0.6877785226959026, "grad_norm": 3.5216405391693115, "learning_rate": 3.1862824021896634e-07, "loss": 0.0194, "step": 4360 }, { "epoch": 0.6879362700634933, "grad_norm": 3.8323073387145996, "learning_rate": 3.184672355498309e-07, "loss": 0.0571, "step": 4361 }, { "epoch": 0.6880940174310841, "grad_norm": 9.057934761047363, "learning_rate": 3.1830623088069555e-07, "loss": 0.0722, "step": 4362 }, { "epoch": 0.6882517647986749, "grad_norm": 5.102464199066162, "learning_rate": 3.1814522621156016e-07, "loss": 0.0408, "step": 4363 }, { "epoch": 0.6884095121662658, "grad_norm": 9.215035438537598, "learning_rate": 3.179842215424247e-07, "loss": 0.0647, "step": 4364 }, { "epoch": 0.6885672595338566, "grad_norm": 4.922355651855469, "learning_rate": 3.178232168732893e-07, "loss": 0.0174, "step": 4365 }, { "epoch": 0.6887250069014473, "grad_norm": 3.5283076763153076, "learning_rate": 3.176622122041539e-07, "loss": 0.0623, "step": 4366 }, { "epoch": 0.6888827542690381, "grad_norm": 4.64548397064209, "learning_rate": 3.1750120753501853e-07, "loss": 0.0238, "step": 4367 }, { "epoch": 0.6890405016366289, "grad_norm": 4.4998250007629395, "learning_rate": 3.173402028658831e-07, "loss": 0.0281, "step": 4368 }, { "epoch": 0.6891982490042198, "grad_norm": 5.0620927810668945, "learning_rate": 3.171791981967477e-07, "loss": 0.0402, "step": 4369 }, { "epoch": 0.6893559963718106, "grad_norm": 5.032283306121826, "learning_rate": 3.170181935276123e-07, "loss": 0.0911, "step": 4370 }, { "epoch": 0.6895137437394013, "grad_norm": 4.686389923095703, "learning_rate": 3.168571888584769e-07, "loss": 0.029, "step": 4371 }, { "epoch": 0.6896714911069921, "grad_norm": 3.5803253650665283, "learning_rate": 3.1669618418934146e-07, "loss": 0.0494, "step": 4372 }, { "epoch": 0.689829238474583, "grad_norm": 4.782825469970703, "learning_rate": 3.1653517952020606e-07, "loss": 0.0845, "step": 4373 }, { "epoch": 0.6899869858421738, "grad_norm": 4.182551383972168, "learning_rate": 3.1637417485107067e-07, "loss": 0.0609, "step": 4374 }, { "epoch": 0.6901447332097645, "grad_norm": 3.656662702560425, "learning_rate": 3.162131701819353e-07, "loss": 0.0251, "step": 4375 }, { "epoch": 0.6903024805773553, "grad_norm": 3.6005845069885254, "learning_rate": 3.1605216551279983e-07, "loss": 0.0217, "step": 4376 }, { "epoch": 0.6904602279449462, "grad_norm": 5.404850006103516, "learning_rate": 3.158911608436645e-07, "loss": 0.0388, "step": 4377 }, { "epoch": 0.690617975312537, "grad_norm": 5.392271995544434, "learning_rate": 3.1573015617452904e-07, "loss": 0.0436, "step": 4378 }, { "epoch": 0.6907757226801278, "grad_norm": 4.791945934295654, "learning_rate": 3.1556915150539365e-07, "loss": 0.0294, "step": 4379 }, { "epoch": 0.6909334700477185, "grad_norm": 2.6300947666168213, "learning_rate": 3.154081468362582e-07, "loss": 0.0428, "step": 4380 }, { "epoch": 0.6910912174153094, "grad_norm": 6.325275421142578, "learning_rate": 3.1524714216712286e-07, "loss": 0.0422, "step": 4381 }, { "epoch": 0.6912489647829002, "grad_norm": 5.496462821960449, "learning_rate": 3.150861374979874e-07, "loss": 0.0606, "step": 4382 }, { "epoch": 0.691406712150491, "grad_norm": 3.19685435295105, "learning_rate": 3.14925132828852e-07, "loss": 0.0368, "step": 4383 }, { "epoch": 0.6915644595180818, "grad_norm": 4.607407093048096, "learning_rate": 3.1476412815971657e-07, "loss": 0.0168, "step": 4384 }, { "epoch": 0.6917222068856725, "grad_norm": 6.012766361236572, "learning_rate": 3.1460312349058123e-07, "loss": 0.0433, "step": 4385 }, { "epoch": 0.6918799542532634, "grad_norm": 5.831282138824463, "learning_rate": 3.1444211882144584e-07, "loss": 0.0495, "step": 4386 }, { "epoch": 0.6920377016208542, "grad_norm": 5.386518478393555, "learning_rate": 3.142811141523104e-07, "loss": 0.0516, "step": 4387 }, { "epoch": 0.692195448988445, "grad_norm": 5.470829010009766, "learning_rate": 3.14120109483175e-07, "loss": 0.0507, "step": 4388 }, { "epoch": 0.6923531963560358, "grad_norm": 5.152629852294922, "learning_rate": 3.139591048140396e-07, "loss": 0.0675, "step": 4389 }, { "epoch": 0.6925109437236266, "grad_norm": 3.937241792678833, "learning_rate": 3.137981001449042e-07, "loss": 0.0344, "step": 4390 }, { "epoch": 0.6926686910912174, "grad_norm": 5.884132385253906, "learning_rate": 3.1363709547576876e-07, "loss": 0.041, "step": 4391 }, { "epoch": 0.6928264384588082, "grad_norm": 5.485620498657227, "learning_rate": 3.134760908066334e-07, "loss": 0.0671, "step": 4392 }, { "epoch": 0.692984185826399, "grad_norm": 2.248553514480591, "learning_rate": 3.13315086137498e-07, "loss": 0.0109, "step": 4393 }, { "epoch": 0.6931419331939899, "grad_norm": 5.195814609527588, "learning_rate": 3.131540814683626e-07, "loss": 0.0627, "step": 4394 }, { "epoch": 0.6932996805615806, "grad_norm": 10.239777565002441, "learning_rate": 3.1299307679922714e-07, "loss": 0.1075, "step": 4395 }, { "epoch": 0.6934574279291714, "grad_norm": 4.752469062805176, "learning_rate": 3.128320721300918e-07, "loss": 0.0669, "step": 4396 }, { "epoch": 0.6936151752967622, "grad_norm": 5.5641889572143555, "learning_rate": 3.1267106746095635e-07, "loss": 0.0323, "step": 4397 }, { "epoch": 0.6937729226643531, "grad_norm": 3.373412847518921, "learning_rate": 3.1251006279182095e-07, "loss": 0.0484, "step": 4398 }, { "epoch": 0.6939306700319439, "grad_norm": 5.519466400146484, "learning_rate": 3.123490581226855e-07, "loss": 0.0485, "step": 4399 }, { "epoch": 0.6940884173995346, "grad_norm": 8.3934965133667, "learning_rate": 3.1218805345355017e-07, "loss": 0.0333, "step": 4400 }, { "epoch": 0.6942461647671254, "grad_norm": 3.610666275024414, "learning_rate": 3.120270487844147e-07, "loss": 0.0364, "step": 4401 }, { "epoch": 0.6944039121347163, "grad_norm": 3.121730089187622, "learning_rate": 3.1186604411527933e-07, "loss": 0.0299, "step": 4402 }, { "epoch": 0.6945616595023071, "grad_norm": 4.997406005859375, "learning_rate": 3.117050394461439e-07, "loss": 0.0348, "step": 4403 }, { "epoch": 0.6947194068698979, "grad_norm": 3.996657133102417, "learning_rate": 3.1154403477700854e-07, "loss": 0.0232, "step": 4404 }, { "epoch": 0.6948771542374886, "grad_norm": 2.573070526123047, "learning_rate": 3.113830301078731e-07, "loss": 0.0403, "step": 4405 }, { "epoch": 0.6950349016050794, "grad_norm": 6.505340576171875, "learning_rate": 3.112220254387377e-07, "loss": 0.0318, "step": 4406 }, { "epoch": 0.6951926489726703, "grad_norm": 3.81855845451355, "learning_rate": 3.1106102076960225e-07, "loss": 0.05, "step": 4407 }, { "epoch": 0.6953503963402611, "grad_norm": 5.036444187164307, "learning_rate": 3.109000161004669e-07, "loss": 0.0554, "step": 4408 }, { "epoch": 0.6955081437078519, "grad_norm": 4.578446388244629, "learning_rate": 3.107390114313315e-07, "loss": 0.0224, "step": 4409 }, { "epoch": 0.6956658910754426, "grad_norm": 4.750903606414795, "learning_rate": 3.1057800676219607e-07, "loss": 0.0295, "step": 4410 }, { "epoch": 0.6958236384430335, "grad_norm": 4.846446514129639, "learning_rate": 3.1041700209306073e-07, "loss": 0.0276, "step": 4411 }, { "epoch": 0.6959813858106243, "grad_norm": 35.4178581237793, "learning_rate": 3.102559974239253e-07, "loss": 0.0763, "step": 4412 }, { "epoch": 0.6961391331782151, "grad_norm": 5.210877418518066, "learning_rate": 3.100949927547899e-07, "loss": 0.0483, "step": 4413 }, { "epoch": 0.6962968805458059, "grad_norm": 9.870617866516113, "learning_rate": 3.0993398808565444e-07, "loss": 0.0415, "step": 4414 }, { "epoch": 0.6964546279133967, "grad_norm": 5.791896343231201, "learning_rate": 3.097729834165191e-07, "loss": 0.0401, "step": 4415 }, { "epoch": 0.6966123752809875, "grad_norm": 3.8255116939544678, "learning_rate": 3.0961197874738366e-07, "loss": 0.0138, "step": 4416 }, { "epoch": 0.6967701226485783, "grad_norm": 5.592365264892578, "learning_rate": 3.0945097407824826e-07, "loss": 0.041, "step": 4417 }, { "epoch": 0.6969278700161691, "grad_norm": 2.6533119678497314, "learning_rate": 3.092899694091128e-07, "loss": 0.0253, "step": 4418 }, { "epoch": 0.69708561738376, "grad_norm": 7.8833112716674805, "learning_rate": 3.091289647399775e-07, "loss": 0.061, "step": 4419 }, { "epoch": 0.6972433647513507, "grad_norm": 5.523496627807617, "learning_rate": 3.0896796007084203e-07, "loss": 0.0687, "step": 4420 }, { "epoch": 0.6974011121189415, "grad_norm": 6.058192253112793, "learning_rate": 3.0880695540170664e-07, "loss": 0.03, "step": 4421 }, { "epoch": 0.6975588594865323, "grad_norm": 5.791531562805176, "learning_rate": 3.086459507325712e-07, "loss": 0.0248, "step": 4422 }, { "epoch": 0.6977166068541231, "grad_norm": 9.704782485961914, "learning_rate": 3.0848494606343585e-07, "loss": 0.0689, "step": 4423 }, { "epoch": 0.697874354221714, "grad_norm": 8.3525972366333, "learning_rate": 3.083239413943004e-07, "loss": 0.0311, "step": 4424 }, { "epoch": 0.6980321015893047, "grad_norm": 4.394040107727051, "learning_rate": 3.08162936725165e-07, "loss": 0.0606, "step": 4425 }, { "epoch": 0.6981898489568955, "grad_norm": 3.5000298023223877, "learning_rate": 3.080019320560296e-07, "loss": 0.0668, "step": 4426 }, { "epoch": 0.6983475963244863, "grad_norm": 5.3555498123168945, "learning_rate": 3.078409273868942e-07, "loss": 0.0319, "step": 4427 }, { "epoch": 0.6985053436920772, "grad_norm": 4.745055198669434, "learning_rate": 3.0767992271775877e-07, "loss": 0.0202, "step": 4428 }, { "epoch": 0.698663091059668, "grad_norm": 5.928339004516602, "learning_rate": 3.075189180486234e-07, "loss": 0.0817, "step": 4429 }, { "epoch": 0.6988208384272587, "grad_norm": 4.530652046203613, "learning_rate": 3.07357913379488e-07, "loss": 0.029, "step": 4430 }, { "epoch": 0.6989785857948495, "grad_norm": 6.590553283691406, "learning_rate": 3.071969087103526e-07, "loss": 0.0702, "step": 4431 }, { "epoch": 0.6991363331624404, "grad_norm": 6.570598125457764, "learning_rate": 3.070359040412172e-07, "loss": 0.0358, "step": 4432 }, { "epoch": 0.6992940805300312, "grad_norm": 4.274868965148926, "learning_rate": 3.0687489937208175e-07, "loss": 0.0381, "step": 4433 }, { "epoch": 0.699451827897622, "grad_norm": 4.528099536895752, "learning_rate": 3.067138947029464e-07, "loss": 0.0535, "step": 4434 }, { "epoch": 0.6996095752652127, "grad_norm": 1.9978628158569336, "learning_rate": 3.0655289003381096e-07, "loss": 0.01, "step": 4435 }, { "epoch": 0.6997673226328036, "grad_norm": 5.906517505645752, "learning_rate": 3.0639188536467557e-07, "loss": 0.059, "step": 4436 }, { "epoch": 0.6999250700003944, "grad_norm": 3.6237752437591553, "learning_rate": 3.062308806955401e-07, "loss": 0.0171, "step": 4437 }, { "epoch": 0.7000828173679852, "grad_norm": 5.734447002410889, "learning_rate": 3.060698760264048e-07, "loss": 0.0327, "step": 4438 }, { "epoch": 0.7002405647355759, "grad_norm": 9.282461166381836, "learning_rate": 3.0590887135726934e-07, "loss": 0.0311, "step": 4439 }, { "epoch": 0.7003983121031668, "grad_norm": 5.420535087585449, "learning_rate": 3.0574786668813394e-07, "loss": 0.0411, "step": 4440 }, { "epoch": 0.7005560594707576, "grad_norm": 2.7260584831237793, "learning_rate": 3.0558686201899855e-07, "loss": 0.0234, "step": 4441 }, { "epoch": 0.7007138068383484, "grad_norm": 6.108238697052002, "learning_rate": 3.0542585734986316e-07, "loss": 0.0253, "step": 4442 }, { "epoch": 0.7008715542059392, "grad_norm": 8.544012069702148, "learning_rate": 3.052648526807277e-07, "loss": 0.0689, "step": 4443 }, { "epoch": 0.7010293015735299, "grad_norm": 5.6943359375, "learning_rate": 3.051038480115923e-07, "loss": 0.0669, "step": 4444 }, { "epoch": 0.7011870489411208, "grad_norm": 4.468897819519043, "learning_rate": 3.049428433424569e-07, "loss": 0.0433, "step": 4445 }, { "epoch": 0.7013447963087116, "grad_norm": 3.731135129928589, "learning_rate": 3.0478183867332153e-07, "loss": 0.0478, "step": 4446 }, { "epoch": 0.7015025436763024, "grad_norm": 8.033485412597656, "learning_rate": 3.046208340041861e-07, "loss": 0.0291, "step": 4447 }, { "epoch": 0.7016602910438932, "grad_norm": 2.8984217643737793, "learning_rate": 3.044598293350507e-07, "loss": 0.0151, "step": 4448 }, { "epoch": 0.701818038411484, "grad_norm": 3.4804484844207764, "learning_rate": 3.042988246659153e-07, "loss": 0.0242, "step": 4449 }, { "epoch": 0.7019757857790748, "grad_norm": 4.877875328063965, "learning_rate": 3.041378199967799e-07, "loss": 0.0262, "step": 4450 }, { "epoch": 0.7021335331466656, "grad_norm": 4.9602532386779785, "learning_rate": 3.0397681532764445e-07, "loss": 0.0668, "step": 4451 }, { "epoch": 0.7022912805142564, "grad_norm": 4.77786111831665, "learning_rate": 3.0381581065850906e-07, "loss": 0.0644, "step": 4452 }, { "epoch": 0.7024490278818473, "grad_norm": 4.0027546882629395, "learning_rate": 3.0365480598937367e-07, "loss": 0.0445, "step": 4453 }, { "epoch": 0.702606775249438, "grad_norm": 3.758492946624756, "learning_rate": 3.0349380132023827e-07, "loss": 0.0331, "step": 4454 }, { "epoch": 0.7027645226170288, "grad_norm": 2.2905874252319336, "learning_rate": 3.033327966511029e-07, "loss": 0.0215, "step": 4455 }, { "epoch": 0.7029222699846196, "grad_norm": 3.43929386138916, "learning_rate": 3.031717919819675e-07, "loss": 0.025, "step": 4456 }, { "epoch": 0.7030800173522105, "grad_norm": 2.7031850814819336, "learning_rate": 3.030107873128321e-07, "loss": 0.0158, "step": 4457 }, { "epoch": 0.7032377647198013, "grad_norm": 4.073042869567871, "learning_rate": 3.0284978264369665e-07, "loss": 0.0372, "step": 4458 }, { "epoch": 0.703395512087392, "grad_norm": 5.968804836273193, "learning_rate": 3.0268877797456125e-07, "loss": 0.041, "step": 4459 }, { "epoch": 0.7035532594549828, "grad_norm": 4.01667594909668, "learning_rate": 3.0252777330542586e-07, "loss": 0.0682, "step": 4460 }, { "epoch": 0.7037110068225737, "grad_norm": 3.069646120071411, "learning_rate": 3.0236676863629046e-07, "loss": 0.024, "step": 4461 }, { "epoch": 0.7038687541901645, "grad_norm": 2.835047960281372, "learning_rate": 3.02205763967155e-07, "loss": 0.0191, "step": 4462 }, { "epoch": 0.7040265015577553, "grad_norm": 3.3007359504699707, "learning_rate": 3.020447592980196e-07, "loss": 0.0283, "step": 4463 }, { "epoch": 0.704184248925346, "grad_norm": 4.909847259521484, "learning_rate": 3.0188375462888423e-07, "loss": 0.0513, "step": 4464 }, { "epoch": 0.7043419962929368, "grad_norm": 4.200301170349121, "learning_rate": 3.0172274995974884e-07, "loss": 0.0357, "step": 4465 }, { "epoch": 0.7044997436605277, "grad_norm": 6.148838043212891, "learning_rate": 3.015617452906134e-07, "loss": 0.0334, "step": 4466 }, { "epoch": 0.7046574910281185, "grad_norm": 5.063936710357666, "learning_rate": 3.01400740621478e-07, "loss": 0.0635, "step": 4467 }, { "epoch": 0.7048152383957093, "grad_norm": 5.4391865730285645, "learning_rate": 3.012397359523426e-07, "loss": 0.0799, "step": 4468 }, { "epoch": 0.7049729857633, "grad_norm": 3.486837863922119, "learning_rate": 3.010787312832072e-07, "loss": 0.042, "step": 4469 }, { "epoch": 0.7051307331308909, "grad_norm": 5.314830780029297, "learning_rate": 3.0091772661407176e-07, "loss": 0.0422, "step": 4470 }, { "epoch": 0.7052884804984817, "grad_norm": 5.49745512008667, "learning_rate": 3.007567219449364e-07, "loss": 0.0641, "step": 4471 }, { "epoch": 0.7054462278660725, "grad_norm": 3.8079562187194824, "learning_rate": 3.00595717275801e-07, "loss": 0.0256, "step": 4472 }, { "epoch": 0.7056039752336632, "grad_norm": 8.788825988769531, "learning_rate": 3.004347126066656e-07, "loss": 0.0535, "step": 4473 }, { "epoch": 0.7057617226012541, "grad_norm": 5.572489261627197, "learning_rate": 3.0027370793753013e-07, "loss": 0.0549, "step": 4474 }, { "epoch": 0.7059194699688449, "grad_norm": 6.502420425415039, "learning_rate": 3.001127032683948e-07, "loss": 0.046, "step": 4475 }, { "epoch": 0.7060772173364357, "grad_norm": 4.948294162750244, "learning_rate": 2.9995169859925935e-07, "loss": 0.0298, "step": 4476 }, { "epoch": 0.7062349647040265, "grad_norm": 5.6751861572265625, "learning_rate": 2.9979069393012395e-07, "loss": 0.0739, "step": 4477 }, { "epoch": 0.7063927120716174, "grad_norm": 4.962527275085449, "learning_rate": 2.996296892609885e-07, "loss": 0.0486, "step": 4478 }, { "epoch": 0.7065504594392081, "grad_norm": 5.656927585601807, "learning_rate": 2.9946868459185317e-07, "loss": 0.0707, "step": 4479 }, { "epoch": 0.7067082068067989, "grad_norm": 13.633844375610352, "learning_rate": 2.9930767992271777e-07, "loss": 0.0937, "step": 4480 }, { "epoch": 0.7068659541743897, "grad_norm": 7.616853713989258, "learning_rate": 2.991466752535823e-07, "loss": 0.046, "step": 4481 }, { "epoch": 0.7070237015419805, "grad_norm": 1.5346670150756836, "learning_rate": 2.9898567058444693e-07, "loss": 0.0096, "step": 4482 }, { "epoch": 0.7071814489095714, "grad_norm": 8.293448448181152, "learning_rate": 2.9882466591531154e-07, "loss": 0.0729, "step": 4483 }, { "epoch": 0.7073391962771621, "grad_norm": 6.246132850646973, "learning_rate": 2.9866366124617614e-07, "loss": 0.0421, "step": 4484 }, { "epoch": 0.7074969436447529, "grad_norm": 5.36680269241333, "learning_rate": 2.985026565770407e-07, "loss": 0.1121, "step": 4485 }, { "epoch": 0.7076546910123437, "grad_norm": 4.480698108673096, "learning_rate": 2.9834165190790536e-07, "loss": 0.0389, "step": 4486 }, { "epoch": 0.7078124383799346, "grad_norm": 6.817065238952637, "learning_rate": 2.981806472387699e-07, "loss": 0.0429, "step": 4487 }, { "epoch": 0.7079701857475253, "grad_norm": 4.782590389251709, "learning_rate": 2.980196425696345e-07, "loss": 0.0253, "step": 4488 }, { "epoch": 0.7081279331151161, "grad_norm": 3.480205774307251, "learning_rate": 2.9785863790049907e-07, "loss": 0.0375, "step": 4489 }, { "epoch": 0.7082856804827069, "grad_norm": 2.1709165573120117, "learning_rate": 2.9769763323136373e-07, "loss": 0.0093, "step": 4490 }, { "epoch": 0.7084434278502978, "grad_norm": 3.2164363861083984, "learning_rate": 2.975366285622283e-07, "loss": 0.0374, "step": 4491 }, { "epoch": 0.7086011752178886, "grad_norm": 4.7639241218566895, "learning_rate": 2.973756238930929e-07, "loss": 0.0433, "step": 4492 }, { "epoch": 0.7087589225854793, "grad_norm": 8.232230186462402, "learning_rate": 2.9721461922395744e-07, "loss": 0.0485, "step": 4493 }, { "epoch": 0.7089166699530701, "grad_norm": 6.158698558807373, "learning_rate": 2.970536145548221e-07, "loss": 0.0252, "step": 4494 }, { "epoch": 0.709074417320661, "grad_norm": 6.094226360321045, "learning_rate": 2.9689260988568666e-07, "loss": 0.0234, "step": 4495 }, { "epoch": 0.7092321646882518, "grad_norm": 3.0969483852386475, "learning_rate": 2.9673160521655126e-07, "loss": 0.0301, "step": 4496 }, { "epoch": 0.7093899120558426, "grad_norm": 5.906505584716797, "learning_rate": 2.965706005474158e-07, "loss": 0.0379, "step": 4497 }, { "epoch": 0.7095476594234333, "grad_norm": 3.609701633453369, "learning_rate": 2.964095958782805e-07, "loss": 0.064, "step": 4498 }, { "epoch": 0.7097054067910242, "grad_norm": 4.088521480560303, "learning_rate": 2.9624859120914503e-07, "loss": 0.0166, "step": 4499 }, { "epoch": 0.709863154158615, "grad_norm": 4.423680305480957, "learning_rate": 2.9608758654000963e-07, "loss": 0.044, "step": 4500 }, { "epoch": 0.7100209015262058, "grad_norm": 4.046323776245117, "learning_rate": 2.9592658187087424e-07, "loss": 0.0407, "step": 4501 }, { "epoch": 0.7101786488937966, "grad_norm": 4.431769847869873, "learning_rate": 2.9576557720173885e-07, "loss": 0.0502, "step": 4502 }, { "epoch": 0.7103363962613873, "grad_norm": 7.08558464050293, "learning_rate": 2.9560457253260345e-07, "loss": 0.0451, "step": 4503 }, { "epoch": 0.7104941436289782, "grad_norm": 8.421886444091797, "learning_rate": 2.95443567863468e-07, "loss": 0.0634, "step": 4504 }, { "epoch": 0.710651890996569, "grad_norm": 6.394400119781494, "learning_rate": 2.9528256319433267e-07, "loss": 0.0411, "step": 4505 }, { "epoch": 0.7108096383641598, "grad_norm": 3.922478437423706, "learning_rate": 2.951215585251972e-07, "loss": 0.0279, "step": 4506 }, { "epoch": 0.7109673857317506, "grad_norm": 5.17605447769165, "learning_rate": 2.949605538560618e-07, "loss": 0.0348, "step": 4507 }, { "epoch": 0.7111251330993414, "grad_norm": 2.986701726913452, "learning_rate": 2.947995491869264e-07, "loss": 0.0251, "step": 4508 }, { "epoch": 0.7112828804669322, "grad_norm": 8.349322319030762, "learning_rate": 2.9463854451779104e-07, "loss": 0.0598, "step": 4509 }, { "epoch": 0.711440627834523, "grad_norm": 6.769835948944092, "learning_rate": 2.944775398486556e-07, "loss": 0.0446, "step": 4510 }, { "epoch": 0.7115983752021138, "grad_norm": 2.4589574337005615, "learning_rate": 2.943165351795202e-07, "loss": 0.0303, "step": 4511 }, { "epoch": 0.7117561225697047, "grad_norm": 4.1198506355285645, "learning_rate": 2.9415553051038475e-07, "loss": 0.021, "step": 4512 }, { "epoch": 0.7119138699372954, "grad_norm": 6.685397148132324, "learning_rate": 2.939945258412494e-07, "loss": 0.0374, "step": 4513 }, { "epoch": 0.7120716173048862, "grad_norm": 6.449472904205322, "learning_rate": 2.9383352117211396e-07, "loss": 0.0613, "step": 4514 }, { "epoch": 0.712229364672477, "grad_norm": 4.5486369132995605, "learning_rate": 2.9367251650297857e-07, "loss": 0.0372, "step": 4515 }, { "epoch": 0.7123871120400679, "grad_norm": 3.8935794830322266, "learning_rate": 2.935115118338432e-07, "loss": 0.0285, "step": 4516 }, { "epoch": 0.7125448594076587, "grad_norm": 4.628503322601318, "learning_rate": 2.933505071647078e-07, "loss": 0.0472, "step": 4517 }, { "epoch": 0.7127026067752494, "grad_norm": 2.7762725353240967, "learning_rate": 2.9318950249557234e-07, "loss": 0.0279, "step": 4518 }, { "epoch": 0.7128603541428402, "grad_norm": 10.613434791564941, "learning_rate": 2.9302849782643694e-07, "loss": 0.0497, "step": 4519 }, { "epoch": 0.713018101510431, "grad_norm": 4.48843240737915, "learning_rate": 2.9286749315730155e-07, "loss": 0.0392, "step": 4520 }, { "epoch": 0.7131758488780219, "grad_norm": 3.927433967590332, "learning_rate": 2.9270648848816615e-07, "loss": 0.038, "step": 4521 }, { "epoch": 0.7133335962456127, "grad_norm": 9.800247192382812, "learning_rate": 2.925454838190307e-07, "loss": 0.0511, "step": 4522 }, { "epoch": 0.7134913436132034, "grad_norm": 7.325588703155518, "learning_rate": 2.923844791498953e-07, "loss": 0.0644, "step": 4523 }, { "epoch": 0.7136490909807942, "grad_norm": 4.694719314575195, "learning_rate": 2.922234744807599e-07, "loss": 0.0313, "step": 4524 }, { "epoch": 0.7138068383483851, "grad_norm": 3.6894118785858154, "learning_rate": 2.9206246981162453e-07, "loss": 0.0345, "step": 4525 }, { "epoch": 0.7139645857159759, "grad_norm": 4.581935882568359, "learning_rate": 2.9190146514248913e-07, "loss": 0.0372, "step": 4526 }, { "epoch": 0.7141223330835667, "grad_norm": 7.473629951477051, "learning_rate": 2.917404604733537e-07, "loss": 0.0286, "step": 4527 }, { "epoch": 0.7142800804511574, "grad_norm": 5.640164375305176, "learning_rate": 2.9157945580421835e-07, "loss": 0.0412, "step": 4528 }, { "epoch": 0.7144378278187483, "grad_norm": 6.2254838943481445, "learning_rate": 2.914184511350829e-07, "loss": 0.0532, "step": 4529 }, { "epoch": 0.7145955751863391, "grad_norm": 4.138742446899414, "learning_rate": 2.912574464659475e-07, "loss": 0.0364, "step": 4530 }, { "epoch": 0.7147533225539299, "grad_norm": 4.648903846740723, "learning_rate": 2.910964417968121e-07, "loss": 0.0359, "step": 4531 }, { "epoch": 0.7149110699215206, "grad_norm": 4.27373743057251, "learning_rate": 2.909354371276767e-07, "loss": 0.0402, "step": 4532 }, { "epoch": 0.7150688172891115, "grad_norm": 4.147173881530762, "learning_rate": 2.9077443245854127e-07, "loss": 0.035, "step": 4533 }, { "epoch": 0.7152265646567023, "grad_norm": 3.1072118282318115, "learning_rate": 2.906134277894059e-07, "loss": 0.0137, "step": 4534 }, { "epoch": 0.7153843120242931, "grad_norm": 11.774784088134766, "learning_rate": 2.904524231202705e-07, "loss": 0.1105, "step": 4535 }, { "epoch": 0.7155420593918839, "grad_norm": 4.649820804595947, "learning_rate": 2.902914184511351e-07, "loss": 0.0727, "step": 4536 }, { "epoch": 0.7156998067594748, "grad_norm": 7.751923561096191, "learning_rate": 2.9013041378199964e-07, "loss": 0.0485, "step": 4537 }, { "epoch": 0.7158575541270655, "grad_norm": 5.118620872497559, "learning_rate": 2.8996940911286425e-07, "loss": 0.0403, "step": 4538 }, { "epoch": 0.7160153014946563, "grad_norm": 7.4496846199035645, "learning_rate": 2.8980840444372886e-07, "loss": 0.0397, "step": 4539 }, { "epoch": 0.7161730488622471, "grad_norm": 7.6184587478637695, "learning_rate": 2.8964739977459346e-07, "loss": 0.0584, "step": 4540 }, { "epoch": 0.7163307962298379, "grad_norm": 3.5897741317749023, "learning_rate": 2.89486395105458e-07, "loss": 0.049, "step": 4541 }, { "epoch": 0.7164885435974288, "grad_norm": 5.4391913414001465, "learning_rate": 2.893253904363226e-07, "loss": 0.0216, "step": 4542 }, { "epoch": 0.7166462909650195, "grad_norm": 2.8665518760681152, "learning_rate": 2.8916438576718723e-07, "loss": 0.0303, "step": 4543 }, { "epoch": 0.7168040383326103, "grad_norm": 2.100839138031006, "learning_rate": 2.8900338109805184e-07, "loss": 0.0122, "step": 4544 }, { "epoch": 0.7169617857002011, "grad_norm": 7.321889400482178, "learning_rate": 2.888423764289164e-07, "loss": 0.0942, "step": 4545 }, { "epoch": 0.717119533067792, "grad_norm": 4.987734317779541, "learning_rate": 2.8868137175978105e-07, "loss": 0.0475, "step": 4546 }, { "epoch": 0.7172772804353827, "grad_norm": 6.623134613037109, "learning_rate": 2.885203670906456e-07, "loss": 0.0415, "step": 4547 }, { "epoch": 0.7174350278029735, "grad_norm": 5.465873718261719, "learning_rate": 2.883593624215102e-07, "loss": 0.085, "step": 4548 }, { "epoch": 0.7175927751705643, "grad_norm": 7.853731155395508, "learning_rate": 2.881983577523748e-07, "loss": 0.0748, "step": 4549 }, { "epoch": 0.7177505225381552, "grad_norm": 6.531794548034668, "learning_rate": 2.880373530832394e-07, "loss": 0.0394, "step": 4550 }, { "epoch": 0.717908269905746, "grad_norm": 6.5806732177734375, "learning_rate": 2.8787634841410403e-07, "loss": 0.0241, "step": 4551 }, { "epoch": 0.7180660172733367, "grad_norm": 6.493992328643799, "learning_rate": 2.877153437449686e-07, "loss": 0.0675, "step": 4552 }, { "epoch": 0.7182237646409275, "grad_norm": 1.3564178943634033, "learning_rate": 2.875543390758332e-07, "loss": 0.0079, "step": 4553 }, { "epoch": 0.7183815120085184, "grad_norm": 6.057300567626953, "learning_rate": 2.873933344066978e-07, "loss": 0.0408, "step": 4554 }, { "epoch": 0.7185392593761092, "grad_norm": 4.63604736328125, "learning_rate": 2.872323297375624e-07, "loss": 0.0221, "step": 4555 }, { "epoch": 0.7186970067437, "grad_norm": 8.910985946655273, "learning_rate": 2.8707132506842695e-07, "loss": 0.0456, "step": 4556 }, { "epoch": 0.7188547541112907, "grad_norm": 6.352619647979736, "learning_rate": 2.8691032039929156e-07, "loss": 0.056, "step": 4557 }, { "epoch": 0.7190125014788815, "grad_norm": 4.340364456176758, "learning_rate": 2.8674931573015616e-07, "loss": 0.04, "step": 4558 }, { "epoch": 0.7191702488464724, "grad_norm": 2.3495678901672363, "learning_rate": 2.8658831106102077e-07, "loss": 0.0139, "step": 4559 }, { "epoch": 0.7193279962140632, "grad_norm": 5.017070770263672, "learning_rate": 2.864273063918853e-07, "loss": 0.075, "step": 4560 }, { "epoch": 0.719485743581654, "grad_norm": 3.2221689224243164, "learning_rate": 2.8626630172275e-07, "loss": 0.043, "step": 4561 }, { "epoch": 0.7196434909492447, "grad_norm": 7.304450988769531, "learning_rate": 2.8610529705361454e-07, "loss": 0.0876, "step": 4562 }, { "epoch": 0.7198012383168356, "grad_norm": 3.5030410289764404, "learning_rate": 2.8594429238447914e-07, "loss": 0.0383, "step": 4563 }, { "epoch": 0.7199589856844264, "grad_norm": 3.965207815170288, "learning_rate": 2.857832877153437e-07, "loss": 0.032, "step": 4564 }, { "epoch": 0.7201167330520172, "grad_norm": 4.125488758087158, "learning_rate": 2.8562228304620836e-07, "loss": 0.0227, "step": 4565 }, { "epoch": 0.720274480419608, "grad_norm": 4.313589096069336, "learning_rate": 2.854612783770729e-07, "loss": 0.0622, "step": 4566 }, { "epoch": 0.7204322277871988, "grad_norm": 3.845998764038086, "learning_rate": 2.853002737079375e-07, "loss": 0.0717, "step": 4567 }, { "epoch": 0.7205899751547896, "grad_norm": 8.11777400970459, "learning_rate": 2.8513926903880207e-07, "loss": 0.0392, "step": 4568 }, { "epoch": 0.7207477225223804, "grad_norm": 3.3960371017456055, "learning_rate": 2.8497826436966673e-07, "loss": 0.0266, "step": 4569 }, { "epoch": 0.7209054698899712, "grad_norm": 10.87304973602295, "learning_rate": 2.848172597005313e-07, "loss": 0.0588, "step": 4570 }, { "epoch": 0.7210632172575621, "grad_norm": 5.718992710113525, "learning_rate": 2.846562550313959e-07, "loss": 0.0481, "step": 4571 }, { "epoch": 0.7212209646251528, "grad_norm": 6.245044708251953, "learning_rate": 2.844952503622605e-07, "loss": 0.0281, "step": 4572 }, { "epoch": 0.7213787119927436, "grad_norm": 2.650266647338867, "learning_rate": 2.843342456931251e-07, "loss": 0.0347, "step": 4573 }, { "epoch": 0.7215364593603344, "grad_norm": 4.317817211151123, "learning_rate": 2.841732410239897e-07, "loss": 0.0436, "step": 4574 }, { "epoch": 0.7216942067279253, "grad_norm": 7.154784679412842, "learning_rate": 2.8401223635485426e-07, "loss": 0.0695, "step": 4575 }, { "epoch": 0.7218519540955161, "grad_norm": 6.304006576538086, "learning_rate": 2.838512316857189e-07, "loss": 0.0525, "step": 4576 }, { "epoch": 0.7220097014631068, "grad_norm": 3.2809009552001953, "learning_rate": 2.8369022701658347e-07, "loss": 0.0288, "step": 4577 }, { "epoch": 0.7221674488306976, "grad_norm": 6.112624645233154, "learning_rate": 2.835292223474481e-07, "loss": 0.0714, "step": 4578 }, { "epoch": 0.7223251961982884, "grad_norm": 11.613422393798828, "learning_rate": 2.8336821767831263e-07, "loss": 0.0675, "step": 4579 }, { "epoch": 0.7224829435658793, "grad_norm": 9.109901428222656, "learning_rate": 2.832072130091773e-07, "loss": 0.0615, "step": 4580 }, { "epoch": 0.72264069093347, "grad_norm": 6.590017318725586, "learning_rate": 2.8304620834004185e-07, "loss": 0.032, "step": 4581 }, { "epoch": 0.7227984383010608, "grad_norm": 4.4322404861450195, "learning_rate": 2.8288520367090645e-07, "loss": 0.073, "step": 4582 }, { "epoch": 0.7229561856686516, "grad_norm": 7.764130115509033, "learning_rate": 2.82724199001771e-07, "loss": 0.0585, "step": 4583 }, { "epoch": 0.7231139330362425, "grad_norm": 5.1639790534973145, "learning_rate": 2.8256319433263566e-07, "loss": 0.0843, "step": 4584 }, { "epoch": 0.7232716804038333, "grad_norm": 4.606882095336914, "learning_rate": 2.824021896635002e-07, "loss": 0.0319, "step": 4585 }, { "epoch": 0.723429427771424, "grad_norm": 6.314964294433594, "learning_rate": 2.822411849943648e-07, "loss": 0.0854, "step": 4586 }, { "epoch": 0.7235871751390148, "grad_norm": 4.091706275939941, "learning_rate": 2.820801803252294e-07, "loss": 0.043, "step": 4587 }, { "epoch": 0.7237449225066057, "grad_norm": 3.6330947875976562, "learning_rate": 2.8191917565609404e-07, "loss": 0.0287, "step": 4588 }, { "epoch": 0.7239026698741965, "grad_norm": 7.67761754989624, "learning_rate": 2.817581709869586e-07, "loss": 0.0151, "step": 4589 }, { "epoch": 0.7240604172417873, "grad_norm": 5.9519548416137695, "learning_rate": 2.815971663178232e-07, "loss": 0.0776, "step": 4590 }, { "epoch": 0.724218164609378, "grad_norm": 4.592087268829346, "learning_rate": 2.8143616164868775e-07, "loss": 0.0279, "step": 4591 }, { "epoch": 0.7243759119769689, "grad_norm": 4.141416072845459, "learning_rate": 2.812751569795524e-07, "loss": 0.0213, "step": 4592 }, { "epoch": 0.7245336593445597, "grad_norm": 5.703773498535156, "learning_rate": 2.8111415231041696e-07, "loss": 0.046, "step": 4593 }, { "epoch": 0.7246914067121505, "grad_norm": 4.934125900268555, "learning_rate": 2.8095314764128157e-07, "loss": 0.0524, "step": 4594 }, { "epoch": 0.7248491540797413, "grad_norm": 2.652515411376953, "learning_rate": 2.8079214297214623e-07, "loss": 0.0161, "step": 4595 }, { "epoch": 0.7250069014473322, "grad_norm": 4.344799518585205, "learning_rate": 2.806311383030108e-07, "loss": 0.0194, "step": 4596 }, { "epoch": 0.7251646488149229, "grad_norm": 5.285390377044678, "learning_rate": 2.804701336338754e-07, "loss": 0.0399, "step": 4597 }, { "epoch": 0.7253223961825137, "grad_norm": 4.879339694976807, "learning_rate": 2.8030912896473994e-07, "loss": 0.037, "step": 4598 }, { "epoch": 0.7254801435501045, "grad_norm": 9.88895034790039, "learning_rate": 2.801481242956046e-07, "loss": 0.044, "step": 4599 }, { "epoch": 0.7256378909176953, "grad_norm": 1.890272617340088, "learning_rate": 2.7998711962646915e-07, "loss": 0.0226, "step": 4600 }, { "epoch": 0.7257956382852861, "grad_norm": 4.78530740737915, "learning_rate": 2.7982611495733376e-07, "loss": 0.0408, "step": 4601 }, { "epoch": 0.7259533856528769, "grad_norm": 2.758941173553467, "learning_rate": 2.796651102881983e-07, "loss": 0.0129, "step": 4602 }, { "epoch": 0.7261111330204677, "grad_norm": 5.772910118103027, "learning_rate": 2.7950410561906297e-07, "loss": 0.0544, "step": 4603 }, { "epoch": 0.7262688803880585, "grad_norm": 4.390839099884033, "learning_rate": 2.793431009499275e-07, "loss": 0.055, "step": 4604 }, { "epoch": 0.7264266277556494, "grad_norm": 3.2692339420318604, "learning_rate": 2.7918209628079213e-07, "loss": 0.0265, "step": 4605 }, { "epoch": 0.7265843751232401, "grad_norm": 3.9459526538848877, "learning_rate": 2.790210916116567e-07, "loss": 0.0135, "step": 4606 }, { "epoch": 0.7267421224908309, "grad_norm": 3.0343422889709473, "learning_rate": 2.7886008694252134e-07, "loss": 0.0212, "step": 4607 }, { "epoch": 0.7268998698584217, "grad_norm": 4.5064520835876465, "learning_rate": 2.786990822733859e-07, "loss": 0.0312, "step": 4608 }, { "epoch": 0.7268998698584217, "eval_accuracy": 0.9879914224446033, "eval_f1": 0.9879914224446033, "eval_loss": 0.039093952625989914, "eval_runtime": 4699.0523, "eval_samples_per_second": 43.169, "eval_steps_per_second": 2.698, "step": 4608 }, { "epoch": 0.7270576172260126, "grad_norm": 3.6038103103637695, "learning_rate": 2.785380776042505e-07, "loss": 0.0276, "step": 4609 }, { "epoch": 0.7272153645936034, "grad_norm": 1.399121880531311, "learning_rate": 2.783770729351151e-07, "loss": 0.0091, "step": 4610 }, { "epoch": 0.7273731119611941, "grad_norm": 3.0181891918182373, "learning_rate": 2.782160682659797e-07, "loss": 0.0277, "step": 4611 }, { "epoch": 0.7275308593287849, "grad_norm": 4.86851692199707, "learning_rate": 2.7805506359684427e-07, "loss": 0.0495, "step": 4612 }, { "epoch": 0.7276886066963758, "grad_norm": 6.538103103637695, "learning_rate": 2.778940589277089e-07, "loss": 0.0689, "step": 4613 }, { "epoch": 0.7278463540639666, "grad_norm": 4.0547895431518555, "learning_rate": 2.777330542585735e-07, "loss": 0.0293, "step": 4614 }, { "epoch": 0.7280041014315574, "grad_norm": 3.82523250579834, "learning_rate": 2.775720495894381e-07, "loss": 0.0363, "step": 4615 }, { "epoch": 0.7281618487991481, "grad_norm": 3.744596242904663, "learning_rate": 2.7741104492030264e-07, "loss": 0.0347, "step": 4616 }, { "epoch": 0.7283195961667389, "grad_norm": 7.278544902801514, "learning_rate": 2.7725004025116725e-07, "loss": 0.1236, "step": 4617 }, { "epoch": 0.7284773435343298, "grad_norm": 8.369022369384766, "learning_rate": 2.770890355820319e-07, "loss": 0.0403, "step": 4618 }, { "epoch": 0.7286350909019206, "grad_norm": 6.324163913726807, "learning_rate": 2.7692803091289646e-07, "loss": 0.0488, "step": 4619 }, { "epoch": 0.7287928382695114, "grad_norm": 3.8759469985961914, "learning_rate": 2.7676702624376107e-07, "loss": 0.0257, "step": 4620 }, { "epoch": 0.7289505856371021, "grad_norm": 4.767875671386719, "learning_rate": 2.766060215746256e-07, "loss": 0.0265, "step": 4621 }, { "epoch": 0.729108333004693, "grad_norm": 5.644853115081787, "learning_rate": 2.764450169054903e-07, "loss": 0.0562, "step": 4622 }, { "epoch": 0.7292660803722838, "grad_norm": 6.3105597496032715, "learning_rate": 2.7628401223635483e-07, "loss": 0.072, "step": 4623 }, { "epoch": 0.7294238277398746, "grad_norm": 6.4134297370910645, "learning_rate": 2.7612300756721944e-07, "loss": 0.0305, "step": 4624 }, { "epoch": 0.7295815751074654, "grad_norm": 3.7911131381988525, "learning_rate": 2.7596200289808405e-07, "loss": 0.0552, "step": 4625 }, { "epoch": 0.7297393224750562, "grad_norm": 2.7404658794403076, "learning_rate": 2.7580099822894865e-07, "loss": 0.0138, "step": 4626 }, { "epoch": 0.729897069842647, "grad_norm": 4.408675670623779, "learning_rate": 2.756399935598132e-07, "loss": 0.0398, "step": 4627 }, { "epoch": 0.7300548172102378, "grad_norm": 2.576265811920166, "learning_rate": 2.754789888906778e-07, "loss": 0.0088, "step": 4628 }, { "epoch": 0.7302125645778286, "grad_norm": 4.607842922210693, "learning_rate": 2.753179842215424e-07, "loss": 0.039, "step": 4629 }, { "epoch": 0.7303703119454195, "grad_norm": 6.559534549713135, "learning_rate": 2.75156979552407e-07, "loss": 0.1064, "step": 4630 }, { "epoch": 0.7305280593130102, "grad_norm": 5.801240921020508, "learning_rate": 2.749959748832716e-07, "loss": 0.0354, "step": 4631 }, { "epoch": 0.730685806680601, "grad_norm": 5.777365684509277, "learning_rate": 2.748349702141362e-07, "loss": 0.046, "step": 4632 }, { "epoch": 0.7308435540481918, "grad_norm": 7.259184837341309, "learning_rate": 2.746739655450008e-07, "loss": 0.0827, "step": 4633 }, { "epoch": 0.7310013014157827, "grad_norm": 7.7218918800354, "learning_rate": 2.745129608758654e-07, "loss": 0.0804, "step": 4634 }, { "epoch": 0.7311590487833735, "grad_norm": 7.663471221923828, "learning_rate": 2.7435195620672995e-07, "loss": 0.0958, "step": 4635 }, { "epoch": 0.7313167961509642, "grad_norm": 4.536482810974121, "learning_rate": 2.7419095153759456e-07, "loss": 0.0327, "step": 4636 }, { "epoch": 0.731474543518555, "grad_norm": 5.311334609985352, "learning_rate": 2.7402994686845916e-07, "loss": 0.0725, "step": 4637 }, { "epoch": 0.7316322908861458, "grad_norm": 7.605411052703857, "learning_rate": 2.7386894219932377e-07, "loss": 0.0739, "step": 4638 }, { "epoch": 0.7317900382537367, "grad_norm": 6.093417167663574, "learning_rate": 2.737079375301883e-07, "loss": 0.0232, "step": 4639 }, { "epoch": 0.7319477856213275, "grad_norm": 5.984975337982178, "learning_rate": 2.73546932861053e-07, "loss": 0.0578, "step": 4640 }, { "epoch": 0.7321055329889182, "grad_norm": 5.4426751136779785, "learning_rate": 2.733859281919176e-07, "loss": 0.0651, "step": 4641 }, { "epoch": 0.732263280356509, "grad_norm": 6.997135639190674, "learning_rate": 2.7322492352278214e-07, "loss": 0.0658, "step": 4642 }, { "epoch": 0.7324210277240999, "grad_norm": 4.691622734069824, "learning_rate": 2.7306391885364675e-07, "loss": 0.0178, "step": 4643 }, { "epoch": 0.7325787750916907, "grad_norm": 5.306899547576904, "learning_rate": 2.7290291418451136e-07, "loss": 0.0451, "step": 4644 }, { "epoch": 0.7327365224592814, "grad_norm": 6.233757495880127, "learning_rate": 2.7274190951537596e-07, "loss": 0.0713, "step": 4645 }, { "epoch": 0.7328942698268722, "grad_norm": 6.5986456871032715, "learning_rate": 2.725809048462405e-07, "loss": 0.0324, "step": 4646 }, { "epoch": 0.7330520171944631, "grad_norm": 5.2762837409973145, "learning_rate": 2.724199001771051e-07, "loss": 0.0331, "step": 4647 }, { "epoch": 0.7332097645620539, "grad_norm": 10.506940841674805, "learning_rate": 2.7225889550796973e-07, "loss": 0.0518, "step": 4648 }, { "epoch": 0.7333675119296447, "grad_norm": 2.849714756011963, "learning_rate": 2.7209789083883433e-07, "loss": 0.0188, "step": 4649 }, { "epoch": 0.7335252592972354, "grad_norm": 7.017124652862549, "learning_rate": 2.719368861696989e-07, "loss": 0.0746, "step": 4650 }, { "epoch": 0.7336830066648263, "grad_norm": 6.85901403427124, "learning_rate": 2.717758815005635e-07, "loss": 0.0417, "step": 4651 }, { "epoch": 0.7338407540324171, "grad_norm": 6.239452838897705, "learning_rate": 2.716148768314281e-07, "loss": 0.0657, "step": 4652 }, { "epoch": 0.7339985014000079, "grad_norm": 8.635309219360352, "learning_rate": 2.714538721622927e-07, "loss": 0.0552, "step": 4653 }, { "epoch": 0.7341562487675987, "grad_norm": 3.114839553833008, "learning_rate": 2.7129286749315726e-07, "loss": 0.0377, "step": 4654 }, { "epoch": 0.7343139961351894, "grad_norm": 5.728065013885498, "learning_rate": 2.711318628240219e-07, "loss": 0.0805, "step": 4655 }, { "epoch": 0.7344717435027803, "grad_norm": 3.9795284271240234, "learning_rate": 2.7097085815488647e-07, "loss": 0.0591, "step": 4656 }, { "epoch": 0.7346294908703711, "grad_norm": 4.951369762420654, "learning_rate": 2.708098534857511e-07, "loss": 0.0399, "step": 4657 }, { "epoch": 0.7347872382379619, "grad_norm": 6.942314624786377, "learning_rate": 2.7064884881661563e-07, "loss": 0.0601, "step": 4658 }, { "epoch": 0.7349449856055527, "grad_norm": 2.2104973793029785, "learning_rate": 2.704878441474803e-07, "loss": 0.0175, "step": 4659 }, { "epoch": 0.7351027329731435, "grad_norm": 2.437842607498169, "learning_rate": 2.7032683947834484e-07, "loss": 0.0091, "step": 4660 }, { "epoch": 0.7352604803407343, "grad_norm": 4.740623950958252, "learning_rate": 2.7016583480920945e-07, "loss": 0.0325, "step": 4661 }, { "epoch": 0.7354182277083251, "grad_norm": 6.783057689666748, "learning_rate": 2.70004830140074e-07, "loss": 0.0595, "step": 4662 }, { "epoch": 0.7355759750759159, "grad_norm": 3.6913976669311523, "learning_rate": 2.6984382547093866e-07, "loss": 0.0233, "step": 4663 }, { "epoch": 0.7357337224435068, "grad_norm": 4.711301326751709, "learning_rate": 2.696828208018032e-07, "loss": 0.0345, "step": 4664 }, { "epoch": 0.7358914698110975, "grad_norm": 6.14579439163208, "learning_rate": 2.695218161326678e-07, "loss": 0.0343, "step": 4665 }, { "epoch": 0.7360492171786883, "grad_norm": 7.727104187011719, "learning_rate": 2.6936081146353243e-07, "loss": 0.1267, "step": 4666 }, { "epoch": 0.7362069645462791, "grad_norm": 3.1093430519104004, "learning_rate": 2.6919980679439704e-07, "loss": 0.0222, "step": 4667 }, { "epoch": 0.73636471191387, "grad_norm": 2.481738805770874, "learning_rate": 2.6903880212526164e-07, "loss": 0.0277, "step": 4668 }, { "epoch": 0.7365224592814608, "grad_norm": 7.11629056930542, "learning_rate": 2.688777974561262e-07, "loss": 0.0476, "step": 4669 }, { "epoch": 0.7366802066490515, "grad_norm": 9.372925758361816, "learning_rate": 2.6871679278699085e-07, "loss": 0.0442, "step": 4670 }, { "epoch": 0.7368379540166423, "grad_norm": 6.089367389678955, "learning_rate": 2.685557881178554e-07, "loss": 0.0311, "step": 4671 }, { "epoch": 0.7369957013842332, "grad_norm": 3.7933833599090576, "learning_rate": 2.6839478344872e-07, "loss": 0.0246, "step": 4672 }, { "epoch": 0.737153448751824, "grad_norm": 5.042723655700684, "learning_rate": 2.6823377877958457e-07, "loss": 0.0699, "step": 4673 }, { "epoch": 0.7373111961194148, "grad_norm": 3.5024068355560303, "learning_rate": 2.6807277411044923e-07, "loss": 0.0311, "step": 4674 }, { "epoch": 0.7374689434870055, "grad_norm": 4.534403324127197, "learning_rate": 2.679117694413138e-07, "loss": 0.0586, "step": 4675 }, { "epoch": 0.7376266908545963, "grad_norm": 4.200473308563232, "learning_rate": 2.677507647721784e-07, "loss": 0.0246, "step": 4676 }, { "epoch": 0.7377844382221872, "grad_norm": 4.237485885620117, "learning_rate": 2.6758976010304294e-07, "loss": 0.0132, "step": 4677 }, { "epoch": 0.737942185589778, "grad_norm": 3.843857526779175, "learning_rate": 2.674287554339076e-07, "loss": 0.0278, "step": 4678 }, { "epoch": 0.7380999329573688, "grad_norm": 5.551311492919922, "learning_rate": 2.6726775076477215e-07, "loss": 0.0603, "step": 4679 }, { "epoch": 0.7382576803249595, "grad_norm": 7.5500030517578125, "learning_rate": 2.6710674609563676e-07, "loss": 0.0653, "step": 4680 }, { "epoch": 0.7384154276925504, "grad_norm": 2.9558167457580566, "learning_rate": 2.669457414265013e-07, "loss": 0.0334, "step": 4681 }, { "epoch": 0.7385731750601412, "grad_norm": 4.379537105560303, "learning_rate": 2.6678473675736597e-07, "loss": 0.035, "step": 4682 }, { "epoch": 0.738730922427732, "grad_norm": 7.399539947509766, "learning_rate": 2.666237320882305e-07, "loss": 0.0568, "step": 4683 }, { "epoch": 0.7388886697953228, "grad_norm": 6.342048645019531, "learning_rate": 2.6646272741909513e-07, "loss": 0.0672, "step": 4684 }, { "epoch": 0.7390464171629136, "grad_norm": 5.140990257263184, "learning_rate": 2.6630172274995974e-07, "loss": 0.0545, "step": 4685 }, { "epoch": 0.7392041645305044, "grad_norm": 4.986282825469971, "learning_rate": 2.6614071808082434e-07, "loss": 0.0561, "step": 4686 }, { "epoch": 0.7393619118980952, "grad_norm": 5.449554443359375, "learning_rate": 2.659797134116889e-07, "loss": 0.0331, "step": 4687 }, { "epoch": 0.739519659265686, "grad_norm": 1.2541812658309937, "learning_rate": 2.658187087425535e-07, "loss": 0.0108, "step": 4688 }, { "epoch": 0.7396774066332769, "grad_norm": 39.77605056762695, "learning_rate": 2.6565770407341816e-07, "loss": 0.061, "step": 4689 }, { "epoch": 0.7398351540008676, "grad_norm": 3.652667284011841, "learning_rate": 2.654966994042827e-07, "loss": 0.0678, "step": 4690 }, { "epoch": 0.7399929013684584, "grad_norm": 3.6541528701782227, "learning_rate": 2.653356947351473e-07, "loss": 0.049, "step": 4691 }, { "epoch": 0.7401506487360492, "grad_norm": 4.591233730316162, "learning_rate": 2.651746900660119e-07, "loss": 0.0388, "step": 4692 }, { "epoch": 0.74030839610364, "grad_norm": 5.591604232788086, "learning_rate": 2.6501368539687654e-07, "loss": 0.0264, "step": 4693 }, { "epoch": 0.7404661434712309, "grad_norm": 5.747438907623291, "learning_rate": 2.648526807277411e-07, "loss": 0.0393, "step": 4694 }, { "epoch": 0.7406238908388216, "grad_norm": 7.057928562164307, "learning_rate": 2.646916760586057e-07, "loss": 0.0511, "step": 4695 }, { "epoch": 0.7407816382064124, "grad_norm": 8.497698783874512, "learning_rate": 2.6453067138947025e-07, "loss": 0.0786, "step": 4696 }, { "epoch": 0.7409393855740032, "grad_norm": 6.685943126678467, "learning_rate": 2.643696667203349e-07, "loss": 0.0275, "step": 4697 }, { "epoch": 0.7410971329415941, "grad_norm": 7.371028423309326, "learning_rate": 2.6420866205119946e-07, "loss": 0.0547, "step": 4698 }, { "epoch": 0.7412548803091848, "grad_norm": 7.228631496429443, "learning_rate": 2.6404765738206407e-07, "loss": 0.0289, "step": 4699 }, { "epoch": 0.7414126276767756, "grad_norm": 5.806507587432861, "learning_rate": 2.6388665271292867e-07, "loss": 0.0653, "step": 4700 }, { "epoch": 0.7415703750443664, "grad_norm": 5.1528496742248535, "learning_rate": 2.637256480437933e-07, "loss": 0.0441, "step": 4701 }, { "epoch": 0.7417281224119573, "grad_norm": 3.8632359504699707, "learning_rate": 2.6356464337465783e-07, "loss": 0.0503, "step": 4702 }, { "epoch": 0.7418858697795481, "grad_norm": 2.901906728744507, "learning_rate": 2.6340363870552244e-07, "loss": 0.0336, "step": 4703 }, { "epoch": 0.7420436171471388, "grad_norm": 2.611889362335205, "learning_rate": 2.6324263403638705e-07, "loss": 0.0109, "step": 4704 }, { "epoch": 0.7422013645147296, "grad_norm": 4.360488414764404, "learning_rate": 2.6308162936725165e-07, "loss": 0.0228, "step": 4705 }, { "epoch": 0.7423591118823205, "grad_norm": 4.6343302726745605, "learning_rate": 2.629206246981162e-07, "loss": 0.0449, "step": 4706 }, { "epoch": 0.7425168592499113, "grad_norm": 3.0819201469421387, "learning_rate": 2.627596200289808e-07, "loss": 0.015, "step": 4707 }, { "epoch": 0.7426746066175021, "grad_norm": 10.442688941955566, "learning_rate": 2.625986153598454e-07, "loss": 0.045, "step": 4708 }, { "epoch": 0.7428323539850928, "grad_norm": 4.94475793838501, "learning_rate": 2.6243761069071e-07, "loss": 0.0193, "step": 4709 }, { "epoch": 0.7429901013526837, "grad_norm": 6.157139301300049, "learning_rate": 2.622766060215746e-07, "loss": 0.056, "step": 4710 }, { "epoch": 0.7431478487202745, "grad_norm": 4.2757744789123535, "learning_rate": 2.621156013524392e-07, "loss": 0.0355, "step": 4711 }, { "epoch": 0.7433055960878653, "grad_norm": 3.8923239707946777, "learning_rate": 2.6195459668330384e-07, "loss": 0.0271, "step": 4712 }, { "epoch": 0.7434633434554561, "grad_norm": 10.463128089904785, "learning_rate": 2.617935920141684e-07, "loss": 0.0635, "step": 4713 }, { "epoch": 0.7436210908230468, "grad_norm": 2.57322359085083, "learning_rate": 2.61632587345033e-07, "loss": 0.0156, "step": 4714 }, { "epoch": 0.7437788381906377, "grad_norm": 7.563078880310059, "learning_rate": 2.614715826758976e-07, "loss": 0.0282, "step": 4715 }, { "epoch": 0.7439365855582285, "grad_norm": 6.335556507110596, "learning_rate": 2.613105780067622e-07, "loss": 0.0361, "step": 4716 }, { "epoch": 0.7440943329258193, "grad_norm": 2.6851494312286377, "learning_rate": 2.6114957333762677e-07, "loss": 0.0136, "step": 4717 }, { "epoch": 0.7442520802934101, "grad_norm": 4.868751049041748, "learning_rate": 2.609885686684914e-07, "loss": 0.0407, "step": 4718 }, { "epoch": 0.744409827661001, "grad_norm": 7.605571746826172, "learning_rate": 2.60827563999356e-07, "loss": 0.0511, "step": 4719 }, { "epoch": 0.7445675750285917, "grad_norm": 4.598052501678467, "learning_rate": 2.606665593302206e-07, "loss": 0.0302, "step": 4720 }, { "epoch": 0.7447253223961825, "grad_norm": 5.403987884521484, "learning_rate": 2.6050555466108514e-07, "loss": 0.047, "step": 4721 }, { "epoch": 0.7448830697637733, "grad_norm": 9.109344482421875, "learning_rate": 2.6034454999194975e-07, "loss": 0.0505, "step": 4722 }, { "epoch": 0.7450408171313642, "grad_norm": 4.874695301055908, "learning_rate": 2.6018354532281435e-07, "loss": 0.0435, "step": 4723 }, { "epoch": 0.7451985644989549, "grad_norm": 3.578183174133301, "learning_rate": 2.6002254065367896e-07, "loss": 0.0343, "step": 4724 }, { "epoch": 0.7453563118665457, "grad_norm": 4.437819957733154, "learning_rate": 2.598615359845435e-07, "loss": 0.0496, "step": 4725 }, { "epoch": 0.7455140592341365, "grad_norm": 2.6773672103881836, "learning_rate": 2.597005313154081e-07, "loss": 0.0352, "step": 4726 }, { "epoch": 0.7456718066017274, "grad_norm": 3.715035915374756, "learning_rate": 2.595395266462727e-07, "loss": 0.0368, "step": 4727 }, { "epoch": 0.7458295539693182, "grad_norm": 5.758171081542969, "learning_rate": 2.5937852197713733e-07, "loss": 0.0559, "step": 4728 }, { "epoch": 0.7459873013369089, "grad_norm": 6.195015907287598, "learning_rate": 2.592175173080019e-07, "loss": 0.0625, "step": 4729 }, { "epoch": 0.7461450487044997, "grad_norm": 3.536844491958618, "learning_rate": 2.5905651263886655e-07, "loss": 0.033, "step": 4730 }, { "epoch": 0.7463027960720906, "grad_norm": 3.621981143951416, "learning_rate": 2.588955079697311e-07, "loss": 0.0297, "step": 4731 }, { "epoch": 0.7464605434396814, "grad_norm": 7.923707485198975, "learning_rate": 2.587345033005957e-07, "loss": 0.0379, "step": 4732 }, { "epoch": 0.7466182908072722, "grad_norm": 6.867339611053467, "learning_rate": 2.5857349863146026e-07, "loss": 0.0469, "step": 4733 }, { "epoch": 0.7467760381748629, "grad_norm": 10.134578704833984, "learning_rate": 2.584124939623249e-07, "loss": 0.0595, "step": 4734 }, { "epoch": 0.7469337855424537, "grad_norm": 2.841083288192749, "learning_rate": 2.582514892931895e-07, "loss": 0.0295, "step": 4735 }, { "epoch": 0.7470915329100446, "grad_norm": 4.944960117340088, "learning_rate": 2.580904846240541e-07, "loss": 0.0282, "step": 4736 }, { "epoch": 0.7472492802776354, "grad_norm": 2.846627950668335, "learning_rate": 2.579294799549187e-07, "loss": 0.0371, "step": 4737 }, { "epoch": 0.7474070276452262, "grad_norm": 7.171413421630859, "learning_rate": 2.577684752857833e-07, "loss": 0.0766, "step": 4738 }, { "epoch": 0.7475647750128169, "grad_norm": 5.6841864585876465, "learning_rate": 2.576074706166479e-07, "loss": 0.0812, "step": 4739 }, { "epoch": 0.7477225223804078, "grad_norm": 10.416234970092773, "learning_rate": 2.5744646594751245e-07, "loss": 0.0474, "step": 4740 }, { "epoch": 0.7478802697479986, "grad_norm": 3.202110528945923, "learning_rate": 2.5728546127837706e-07, "loss": 0.0367, "step": 4741 }, { "epoch": 0.7480380171155894, "grad_norm": 6.204504013061523, "learning_rate": 2.5712445660924166e-07, "loss": 0.0462, "step": 4742 }, { "epoch": 0.7481957644831801, "grad_norm": 9.100051879882812, "learning_rate": 2.5696345194010627e-07, "loss": 0.0655, "step": 4743 }, { "epoch": 0.748353511850771, "grad_norm": 5.076653480529785, "learning_rate": 2.568024472709708e-07, "loss": 0.0378, "step": 4744 }, { "epoch": 0.7485112592183618, "grad_norm": 5.283294200897217, "learning_rate": 2.566414426018355e-07, "loss": 0.0436, "step": 4745 }, { "epoch": 0.7486690065859526, "grad_norm": 3.0343127250671387, "learning_rate": 2.5648043793270003e-07, "loss": 0.0343, "step": 4746 }, { "epoch": 0.7488267539535434, "grad_norm": 6.469725131988525, "learning_rate": 2.5631943326356464e-07, "loss": 0.0501, "step": 4747 }, { "epoch": 0.7489845013211343, "grad_norm": 4.294254302978516, "learning_rate": 2.561584285944292e-07, "loss": 0.0278, "step": 4748 }, { "epoch": 0.749142248688725, "grad_norm": 5.547506332397461, "learning_rate": 2.5599742392529385e-07, "loss": 0.0406, "step": 4749 }, { "epoch": 0.7492999960563158, "grad_norm": 3.277902126312256, "learning_rate": 2.558364192561584e-07, "loss": 0.0488, "step": 4750 }, { "epoch": 0.7494577434239066, "grad_norm": 6.372631549835205, "learning_rate": 2.55675414587023e-07, "loss": 0.055, "step": 4751 }, { "epoch": 0.7496154907914974, "grad_norm": 6.90498161315918, "learning_rate": 2.5551440991788757e-07, "loss": 0.0632, "step": 4752 }, { "epoch": 0.7497732381590883, "grad_norm": 3.1088144779205322, "learning_rate": 2.553534052487522e-07, "loss": 0.0329, "step": 4753 }, { "epoch": 0.749930985526679, "grad_norm": 3.4850716590881348, "learning_rate": 2.551924005796168e-07, "loss": 0.0423, "step": 4754 }, { "epoch": 0.7500887328942698, "grad_norm": 3.101902484893799, "learning_rate": 2.550313959104814e-07, "loss": 0.0278, "step": 4755 }, { "epoch": 0.7502464802618606, "grad_norm": 3.260136842727661, "learning_rate": 2.5487039124134594e-07, "loss": 0.0163, "step": 4756 }, { "epoch": 0.7504042276294515, "grad_norm": 3.9728457927703857, "learning_rate": 2.547093865722106e-07, "loss": 0.0537, "step": 4757 }, { "epoch": 0.7505619749970422, "grad_norm": 6.179561614990234, "learning_rate": 2.545483819030752e-07, "loss": 0.0656, "step": 4758 }, { "epoch": 0.750719722364633, "grad_norm": 3.508845567703247, "learning_rate": 2.5438737723393976e-07, "loss": 0.0186, "step": 4759 }, { "epoch": 0.7508774697322238, "grad_norm": 10.673635482788086, "learning_rate": 2.542263725648044e-07, "loss": 0.0387, "step": 4760 }, { "epoch": 0.7510352170998147, "grad_norm": 4.301787853240967, "learning_rate": 2.5406536789566897e-07, "loss": 0.0845, "step": 4761 }, { "epoch": 0.7511929644674055, "grad_norm": 4.449602127075195, "learning_rate": 2.539043632265336e-07, "loss": 0.0535, "step": 4762 }, { "epoch": 0.7513507118349962, "grad_norm": 3.7055530548095703, "learning_rate": 2.5374335855739813e-07, "loss": 0.0383, "step": 4763 }, { "epoch": 0.751508459202587, "grad_norm": 3.549971103668213, "learning_rate": 2.535823538882628e-07, "loss": 0.07, "step": 4764 }, { "epoch": 0.7516662065701779, "grad_norm": 3.0408430099487305, "learning_rate": 2.5342134921912734e-07, "loss": 0.0534, "step": 4765 }, { "epoch": 0.7518239539377687, "grad_norm": 6.597678184509277, "learning_rate": 2.5326034454999195e-07, "loss": 0.036, "step": 4766 }, { "epoch": 0.7519817013053595, "grad_norm": 3.500997543334961, "learning_rate": 2.530993398808565e-07, "loss": 0.0546, "step": 4767 }, { "epoch": 0.7521394486729502, "grad_norm": 7.335076808929443, "learning_rate": 2.5293833521172116e-07, "loss": 0.032, "step": 4768 }, { "epoch": 0.7522971960405411, "grad_norm": 6.85729455947876, "learning_rate": 2.527773305425857e-07, "loss": 0.0292, "step": 4769 }, { "epoch": 0.7524549434081319, "grad_norm": 6.948343753814697, "learning_rate": 2.526163258734503e-07, "loss": 0.039, "step": 4770 }, { "epoch": 0.7526126907757227, "grad_norm": 2.67183518409729, "learning_rate": 2.524553212043149e-07, "loss": 0.037, "step": 4771 }, { "epoch": 0.7527704381433135, "grad_norm": 3.0378963947296143, "learning_rate": 2.5229431653517953e-07, "loss": 0.0303, "step": 4772 }, { "epoch": 0.7529281855109042, "grad_norm": 6.4038591384887695, "learning_rate": 2.521333118660441e-07, "loss": 0.0595, "step": 4773 }, { "epoch": 0.7530859328784951, "grad_norm": 7.847699165344238, "learning_rate": 2.519723071969087e-07, "loss": 0.0433, "step": 4774 }, { "epoch": 0.7532436802460859, "grad_norm": 7.712286472320557, "learning_rate": 2.5181130252777325e-07, "loss": 0.0308, "step": 4775 }, { "epoch": 0.7534014276136767, "grad_norm": 3.324777603149414, "learning_rate": 2.516502978586379e-07, "loss": 0.0352, "step": 4776 }, { "epoch": 0.7535591749812675, "grad_norm": 5.5077314376831055, "learning_rate": 2.5148929318950246e-07, "loss": 0.041, "step": 4777 }, { "epoch": 0.7537169223488583, "grad_norm": 3.103212594985962, "learning_rate": 2.5132828852036707e-07, "loss": 0.0167, "step": 4778 }, { "epoch": 0.7538746697164491, "grad_norm": 3.603749990463257, "learning_rate": 2.5116728385123167e-07, "loss": 0.0508, "step": 4779 }, { "epoch": 0.7540324170840399, "grad_norm": 3.0347187519073486, "learning_rate": 2.510062791820963e-07, "loss": 0.0269, "step": 4780 }, { "epoch": 0.7541901644516307, "grad_norm": 3.86840558052063, "learning_rate": 2.508452745129609e-07, "loss": 0.0381, "step": 4781 }, { "epoch": 0.7543479118192216, "grad_norm": 9.875972747802734, "learning_rate": 2.5068426984382544e-07, "loss": 0.0881, "step": 4782 }, { "epoch": 0.7545056591868123, "grad_norm": 7.191247940063477, "learning_rate": 2.505232651746901e-07, "loss": 0.0269, "step": 4783 }, { "epoch": 0.7546634065544031, "grad_norm": 7.005562782287598, "learning_rate": 2.5036226050555465e-07, "loss": 0.0612, "step": 4784 }, { "epoch": 0.7548211539219939, "grad_norm": 3.008082628250122, "learning_rate": 2.5020125583641926e-07, "loss": 0.0405, "step": 4785 }, { "epoch": 0.7549789012895848, "grad_norm": 3.388357162475586, "learning_rate": 2.500402511672838e-07, "loss": 0.0251, "step": 4786 }, { "epoch": 0.7551366486571756, "grad_norm": 3.2952568531036377, "learning_rate": 2.498792464981484e-07, "loss": 0.0421, "step": 4787 }, { "epoch": 0.7552943960247663, "grad_norm": 8.339786529541016, "learning_rate": 2.49718241829013e-07, "loss": 0.0306, "step": 4788 }, { "epoch": 0.7554521433923571, "grad_norm": 4.5375494956970215, "learning_rate": 2.4955723715987763e-07, "loss": 0.0301, "step": 4789 }, { "epoch": 0.7556098907599479, "grad_norm": 3.0443336963653564, "learning_rate": 2.493962324907422e-07, "loss": 0.02, "step": 4790 }, { "epoch": 0.7557676381275388, "grad_norm": 4.50595760345459, "learning_rate": 2.492352278216068e-07, "loss": 0.0594, "step": 4791 }, { "epoch": 0.7559253854951296, "grad_norm": 4.146176338195801, "learning_rate": 2.490742231524714e-07, "loss": 0.0403, "step": 4792 }, { "epoch": 0.7560831328627203, "grad_norm": 5.834567546844482, "learning_rate": 2.48913218483336e-07, "loss": 0.0349, "step": 4793 }, { "epoch": 0.7562408802303111, "grad_norm": 6.704777717590332, "learning_rate": 2.487522138142006e-07, "loss": 0.0567, "step": 4794 }, { "epoch": 0.756398627597902, "grad_norm": 2.64961576461792, "learning_rate": 2.485912091450652e-07, "loss": 0.0282, "step": 4795 }, { "epoch": 0.7565563749654928, "grad_norm": 5.034839153289795, "learning_rate": 2.484302044759298e-07, "loss": 0.069, "step": 4796 }, { "epoch": 0.7567141223330835, "grad_norm": 3.204643487930298, "learning_rate": 2.482691998067944e-07, "loss": 0.0705, "step": 4797 }, { "epoch": 0.7568718697006743, "grad_norm": 3.6070191860198975, "learning_rate": 2.48108195137659e-07, "loss": 0.0594, "step": 4798 }, { "epoch": 0.7570296170682652, "grad_norm": 3.545212745666504, "learning_rate": 2.479471904685236e-07, "loss": 0.0552, "step": 4799 }, { "epoch": 0.757187364435856, "grad_norm": 4.373169898986816, "learning_rate": 2.477861857993882e-07, "loss": 0.0253, "step": 4800 }, { "epoch": 0.7573451118034468, "grad_norm": 0.9404663443565369, "learning_rate": 2.4762518113025275e-07, "loss": 0.0071, "step": 4801 }, { "epoch": 0.7575028591710375, "grad_norm": 7.398576259613037, "learning_rate": 2.4746417646111735e-07, "loss": 0.0461, "step": 4802 }, { "epoch": 0.7576606065386284, "grad_norm": 4.485061168670654, "learning_rate": 2.4730317179198196e-07, "loss": 0.0346, "step": 4803 }, { "epoch": 0.7578183539062192, "grad_norm": 6.237451553344727, "learning_rate": 2.4714216712284657e-07, "loss": 0.0425, "step": 4804 }, { "epoch": 0.75797610127381, "grad_norm": 3.685185670852661, "learning_rate": 2.469811624537111e-07, "loss": 0.0268, "step": 4805 }, { "epoch": 0.7581338486414008, "grad_norm": 7.0791335105896, "learning_rate": 2.468201577845757e-07, "loss": 0.0225, "step": 4806 }, { "epoch": 0.7582915960089917, "grad_norm": 8.132930755615234, "learning_rate": 2.4665915311544033e-07, "loss": 0.0414, "step": 4807 }, { "epoch": 0.7584493433765824, "grad_norm": 4.797114372253418, "learning_rate": 2.4649814844630494e-07, "loss": 0.0268, "step": 4808 }, { "epoch": 0.7586070907441732, "grad_norm": 3.8750391006469727, "learning_rate": 2.4633714377716954e-07, "loss": 0.0629, "step": 4809 }, { "epoch": 0.758764838111764, "grad_norm": 3.5614655017852783, "learning_rate": 2.461761391080341e-07, "loss": 0.0593, "step": 4810 }, { "epoch": 0.7589225854793548, "grad_norm": 4.513403415679932, "learning_rate": 2.460151344388987e-07, "loss": 0.0251, "step": 4811 }, { "epoch": 0.7590803328469456, "grad_norm": 2.8573057651519775, "learning_rate": 2.458541297697633e-07, "loss": 0.0245, "step": 4812 }, { "epoch": 0.7592380802145364, "grad_norm": 4.854127407073975, "learning_rate": 2.456931251006279e-07, "loss": 0.0691, "step": 4813 }, { "epoch": 0.7593958275821272, "grad_norm": 6.474213123321533, "learning_rate": 2.4553212043149247e-07, "loss": 0.0518, "step": 4814 }, { "epoch": 0.759553574949718, "grad_norm": 3.787691354751587, "learning_rate": 2.453711157623571e-07, "loss": 0.0236, "step": 4815 }, { "epoch": 0.7597113223173089, "grad_norm": 4.183278560638428, "learning_rate": 2.452101110932217e-07, "loss": 0.0768, "step": 4816 }, { "epoch": 0.7598690696848996, "grad_norm": 5.145864486694336, "learning_rate": 2.450491064240863e-07, "loss": 0.0346, "step": 4817 }, { "epoch": 0.7600268170524904, "grad_norm": 4.315591335296631, "learning_rate": 2.448881017549509e-07, "loss": 0.0258, "step": 4818 }, { "epoch": 0.7601845644200812, "grad_norm": 9.347548484802246, "learning_rate": 2.447270970858155e-07, "loss": 0.1037, "step": 4819 }, { "epoch": 0.7603423117876721, "grad_norm": 10.202075004577637, "learning_rate": 2.4456609241668005e-07, "loss": 0.0554, "step": 4820 }, { "epoch": 0.7605000591552629, "grad_norm": 8.322285652160645, "learning_rate": 2.4440508774754466e-07, "loss": 0.0497, "step": 4821 }, { "epoch": 0.7606578065228536, "grad_norm": 3.859376907348633, "learning_rate": 2.4424408307840927e-07, "loss": 0.0547, "step": 4822 }, { "epoch": 0.7608155538904444, "grad_norm": 4.298503875732422, "learning_rate": 2.440830784092739e-07, "loss": 0.0352, "step": 4823 }, { "epoch": 0.7609733012580353, "grad_norm": 6.906977653503418, "learning_rate": 2.439220737401385e-07, "loss": 0.0628, "step": 4824 }, { "epoch": 0.7611310486256261, "grad_norm": 4.583846092224121, "learning_rate": 2.4376106907100303e-07, "loss": 0.0759, "step": 4825 }, { "epoch": 0.7612887959932169, "grad_norm": 3.9792728424072266, "learning_rate": 2.4360006440186764e-07, "loss": 0.0299, "step": 4826 }, { "epoch": 0.7614465433608076, "grad_norm": 5.429749011993408, "learning_rate": 2.4343905973273225e-07, "loss": 0.0402, "step": 4827 }, { "epoch": 0.7616042907283985, "grad_norm": 2.6782314777374268, "learning_rate": 2.4327805506359685e-07, "loss": 0.017, "step": 4828 }, { "epoch": 0.7617620380959893, "grad_norm": 7.9765424728393555, "learning_rate": 2.431170503944614e-07, "loss": 0.0417, "step": 4829 }, { "epoch": 0.7619197854635801, "grad_norm": 4.836733341217041, "learning_rate": 2.42956045725326e-07, "loss": 0.0232, "step": 4830 }, { "epoch": 0.7620775328311709, "grad_norm": 1.0174866914749146, "learning_rate": 2.427950410561906e-07, "loss": 0.0072, "step": 4831 }, { "epoch": 0.7622352801987616, "grad_norm": 4.296733856201172, "learning_rate": 2.426340363870552e-07, "loss": 0.0384, "step": 4832 }, { "epoch": 0.7623930275663525, "grad_norm": 3.270857572555542, "learning_rate": 2.424730317179198e-07, "loss": 0.0171, "step": 4833 }, { "epoch": 0.7625507749339433, "grad_norm": 2.496772050857544, "learning_rate": 2.423120270487844e-07, "loss": 0.0202, "step": 4834 }, { "epoch": 0.7627085223015341, "grad_norm": 4.638273239135742, "learning_rate": 2.42151022379649e-07, "loss": 0.0372, "step": 4835 }, { "epoch": 0.7628662696691249, "grad_norm": 4.851541519165039, "learning_rate": 2.419900177105136e-07, "loss": 0.0433, "step": 4836 }, { "epoch": 0.7630240170367157, "grad_norm": 2.6814563274383545, "learning_rate": 2.418290130413782e-07, "loss": 0.0153, "step": 4837 }, { "epoch": 0.7631817644043065, "grad_norm": 8.24600601196289, "learning_rate": 2.4166800837224276e-07, "loss": 0.1217, "step": 4838 }, { "epoch": 0.7633395117718973, "grad_norm": 4.169066905975342, "learning_rate": 2.415070037031074e-07, "loss": 0.0368, "step": 4839 }, { "epoch": 0.7634972591394881, "grad_norm": 4.953155994415283, "learning_rate": 2.4134599903397197e-07, "loss": 0.0358, "step": 4840 }, { "epoch": 0.763655006507079, "grad_norm": 6.2820611000061035, "learning_rate": 2.411849943648366e-07, "loss": 0.0455, "step": 4841 }, { "epoch": 0.7638127538746697, "grad_norm": 5.614377021789551, "learning_rate": 2.410239896957012e-07, "loss": 0.057, "step": 4842 }, { "epoch": 0.7639705012422605, "grad_norm": 3.532853841781616, "learning_rate": 2.408629850265658e-07, "loss": 0.0217, "step": 4843 }, { "epoch": 0.7641282486098513, "grad_norm": 7.9854416847229, "learning_rate": 2.4070198035743034e-07, "loss": 0.0536, "step": 4844 }, { "epoch": 0.7642859959774422, "grad_norm": 9.11436939239502, "learning_rate": 2.4054097568829495e-07, "loss": 0.0833, "step": 4845 }, { "epoch": 0.764443743345033, "grad_norm": 3.794739007949829, "learning_rate": 2.4037997101915955e-07, "loss": 0.0385, "step": 4846 }, { "epoch": 0.7646014907126237, "grad_norm": 4.817052364349365, "learning_rate": 2.4021896635002416e-07, "loss": 0.0394, "step": 4847 }, { "epoch": 0.7647592380802145, "grad_norm": 46.40493392944336, "learning_rate": 2.400579616808887e-07, "loss": 0.0332, "step": 4848 }, { "epoch": 0.7649169854478053, "grad_norm": 10.331274032592773, "learning_rate": 2.398969570117533e-07, "loss": 0.0661, "step": 4849 }, { "epoch": 0.7650747328153962, "grad_norm": 4.1327128410339355, "learning_rate": 2.3973595234261793e-07, "loss": 0.0244, "step": 4850 }, { "epoch": 0.765232480182987, "grad_norm": 8.767778396606445, "learning_rate": 2.3957494767348253e-07, "loss": 0.1145, "step": 4851 }, { "epoch": 0.7653902275505777, "grad_norm": 5.427033424377441, "learning_rate": 2.3941394300434714e-07, "loss": 0.0392, "step": 4852 }, { "epoch": 0.7655479749181685, "grad_norm": 5.3446221351623535, "learning_rate": 2.392529383352117e-07, "loss": 0.0689, "step": 4853 }, { "epoch": 0.7657057222857594, "grad_norm": 5.317961692810059, "learning_rate": 2.390919336660763e-07, "loss": 0.0306, "step": 4854 }, { "epoch": 0.7658634696533502, "grad_norm": 9.808844566345215, "learning_rate": 2.389309289969409e-07, "loss": 0.0767, "step": 4855 }, { "epoch": 0.766021217020941, "grad_norm": 8.848456382751465, "learning_rate": 2.387699243278055e-07, "loss": 0.09, "step": 4856 }, { "epoch": 0.7661789643885317, "grad_norm": 6.596203327178955, "learning_rate": 2.3860891965867006e-07, "loss": 0.0475, "step": 4857 }, { "epoch": 0.7663367117561226, "grad_norm": 5.3128981590271, "learning_rate": 2.3844791498953467e-07, "loss": 0.0464, "step": 4858 }, { "epoch": 0.7664944591237134, "grad_norm": 4.245983600616455, "learning_rate": 2.3828691032039928e-07, "loss": 0.05, "step": 4859 }, { "epoch": 0.7666522064913042, "grad_norm": 6.538334369659424, "learning_rate": 2.3812590565126386e-07, "loss": 0.0277, "step": 4860 }, { "epoch": 0.7668099538588949, "grad_norm": 8.421479225158691, "learning_rate": 2.3796490098212846e-07, "loss": 0.06, "step": 4861 }, { "epoch": 0.7669677012264858, "grad_norm": 2.751904249191284, "learning_rate": 2.3780389631299307e-07, "loss": 0.0292, "step": 4862 }, { "epoch": 0.7671254485940766, "grad_norm": 4.167398452758789, "learning_rate": 2.3764289164385768e-07, "loss": 0.0197, "step": 4863 }, { "epoch": 0.7672831959616674, "grad_norm": 7.360714435577393, "learning_rate": 2.3748188697472226e-07, "loss": 0.0757, "step": 4864 }, { "epoch": 0.7674409433292582, "grad_norm": 10.587705612182617, "learning_rate": 2.3732088230558686e-07, "loss": 0.1102, "step": 4865 }, { "epoch": 0.767598690696849, "grad_norm": 5.881148815155029, "learning_rate": 2.3715987763645144e-07, "loss": 0.0518, "step": 4866 }, { "epoch": 0.7677564380644398, "grad_norm": 4.342334747314453, "learning_rate": 2.3699887296731605e-07, "loss": 0.0632, "step": 4867 }, { "epoch": 0.7679141854320306, "grad_norm": 7.428899765014648, "learning_rate": 2.3683786829818065e-07, "loss": 0.0457, "step": 4868 }, { "epoch": 0.7680719327996214, "grad_norm": 5.535422325134277, "learning_rate": 2.3667686362904523e-07, "loss": 0.055, "step": 4869 }, { "epoch": 0.7682296801672122, "grad_norm": 2.0176048278808594, "learning_rate": 2.3651585895990984e-07, "loss": 0.0137, "step": 4870 }, { "epoch": 0.768387427534803, "grad_norm": 2.525188446044922, "learning_rate": 2.3635485429077442e-07, "loss": 0.0233, "step": 4871 }, { "epoch": 0.7685451749023938, "grad_norm": 5.715892314910889, "learning_rate": 2.3619384962163903e-07, "loss": 0.0611, "step": 4872 }, { "epoch": 0.7687029222699846, "grad_norm": 4.827230930328369, "learning_rate": 2.360328449525036e-07, "loss": 0.0436, "step": 4873 }, { "epoch": 0.7688606696375754, "grad_norm": 4.874945163726807, "learning_rate": 2.3587184028336821e-07, "loss": 0.037, "step": 4874 }, { "epoch": 0.7690184170051663, "grad_norm": 4.637301445007324, "learning_rate": 2.357108356142328e-07, "loss": 0.0269, "step": 4875 }, { "epoch": 0.769176164372757, "grad_norm": 2.1692140102386475, "learning_rate": 2.355498309450974e-07, "loss": 0.0154, "step": 4876 }, { "epoch": 0.7693339117403478, "grad_norm": 4.651496887207031, "learning_rate": 2.3538882627596198e-07, "loss": 0.0496, "step": 4877 }, { "epoch": 0.7694916591079386, "grad_norm": 4.78239631652832, "learning_rate": 2.3522782160682659e-07, "loss": 0.0236, "step": 4878 }, { "epoch": 0.7696494064755295, "grad_norm": 4.077250957489014, "learning_rate": 2.3506681693769117e-07, "loss": 0.0462, "step": 4879 }, { "epoch": 0.7698071538431203, "grad_norm": 11.404817581176758, "learning_rate": 2.3490581226855577e-07, "loss": 0.1007, "step": 4880 }, { "epoch": 0.769964901210711, "grad_norm": 7.535063743591309, "learning_rate": 2.3474480759942035e-07, "loss": 0.0468, "step": 4881 }, { "epoch": 0.7701226485783018, "grad_norm": 5.084873676300049, "learning_rate": 2.3458380293028496e-07, "loss": 0.0323, "step": 4882 }, { "epoch": 0.7702803959458927, "grad_norm": 8.215431213378906, "learning_rate": 2.3442279826114956e-07, "loss": 0.0517, "step": 4883 }, { "epoch": 0.7704381433134835, "grad_norm": 1.797682285308838, "learning_rate": 2.3426179359201414e-07, "loss": 0.0081, "step": 4884 }, { "epoch": 0.7705958906810743, "grad_norm": 2.626028060913086, "learning_rate": 2.3410078892287875e-07, "loss": 0.016, "step": 4885 }, { "epoch": 0.770753638048665, "grad_norm": 3.058671236038208, "learning_rate": 2.3393978425374336e-07, "loss": 0.0163, "step": 4886 }, { "epoch": 0.7709113854162558, "grad_norm": 5.610447883605957, "learning_rate": 2.3377877958460796e-07, "loss": 0.0266, "step": 4887 }, { "epoch": 0.7710691327838467, "grad_norm": 5.915511131286621, "learning_rate": 2.3361777491547254e-07, "loss": 0.0222, "step": 4888 }, { "epoch": 0.7712268801514375, "grad_norm": 5.027588844299316, "learning_rate": 2.3345677024633715e-07, "loss": 0.018, "step": 4889 }, { "epoch": 0.7713846275190283, "grad_norm": 5.529494762420654, "learning_rate": 2.3329576557720173e-07, "loss": 0.0898, "step": 4890 }, { "epoch": 0.771542374886619, "grad_norm": 7.25527286529541, "learning_rate": 2.3313476090806634e-07, "loss": 0.0528, "step": 4891 }, { "epoch": 0.7717001222542099, "grad_norm": 4.3180389404296875, "learning_rate": 2.3297375623893092e-07, "loss": 0.0305, "step": 4892 }, { "epoch": 0.7718578696218007, "grad_norm": 2.8147454261779785, "learning_rate": 2.3281275156979552e-07, "loss": 0.0238, "step": 4893 }, { "epoch": 0.7720156169893915, "grad_norm": 3.026052236557007, "learning_rate": 2.326517469006601e-07, "loss": 0.0238, "step": 4894 }, { "epoch": 0.7721733643569823, "grad_norm": 6.311300754547119, "learning_rate": 2.324907422315247e-07, "loss": 0.0361, "step": 4895 }, { "epoch": 0.7723311117245731, "grad_norm": 6.82096529006958, "learning_rate": 2.323297375623893e-07, "loss": 0.0481, "step": 4896 }, { "epoch": 0.7724888590921639, "grad_norm": 10.022536277770996, "learning_rate": 2.321687328932539e-07, "loss": 0.085, "step": 4897 }, { "epoch": 0.7726466064597547, "grad_norm": 6.319077491760254, "learning_rate": 2.320077282241185e-07, "loss": 0.0896, "step": 4898 }, { "epoch": 0.7728043538273455, "grad_norm": 4.465986728668213, "learning_rate": 2.3184672355498308e-07, "loss": 0.017, "step": 4899 }, { "epoch": 0.7729621011949364, "grad_norm": 3.6244585514068604, "learning_rate": 2.3168571888584769e-07, "loss": 0.0214, "step": 4900 }, { "epoch": 0.7731198485625271, "grad_norm": 4.184035301208496, "learning_rate": 2.3152471421671227e-07, "loss": 0.038, "step": 4901 }, { "epoch": 0.7732775959301179, "grad_norm": 5.894932270050049, "learning_rate": 2.3136370954757687e-07, "loss": 0.0434, "step": 4902 }, { "epoch": 0.7734353432977087, "grad_norm": 4.606961727142334, "learning_rate": 2.3120270487844145e-07, "loss": 0.0793, "step": 4903 }, { "epoch": 0.7735930906652996, "grad_norm": 7.030303955078125, "learning_rate": 2.3104170020930606e-07, "loss": 0.0397, "step": 4904 }, { "epoch": 0.7737508380328904, "grad_norm": 4.425064563751221, "learning_rate": 2.3088069554017064e-07, "loss": 0.0951, "step": 4905 }, { "epoch": 0.7739085854004811, "grad_norm": 4.096029281616211, "learning_rate": 2.3071969087103524e-07, "loss": 0.0241, "step": 4906 }, { "epoch": 0.7740663327680719, "grad_norm": 8.110779762268066, "learning_rate": 2.3055868620189982e-07, "loss": 0.0728, "step": 4907 }, { "epoch": 0.7742240801356627, "grad_norm": 4.087162494659424, "learning_rate": 2.3039768153276443e-07, "loss": 0.0117, "step": 4908 }, { "epoch": 0.7743818275032536, "grad_norm": 4.035387992858887, "learning_rate": 2.3023667686362904e-07, "loss": 0.0131, "step": 4909 }, { "epoch": 0.7745395748708443, "grad_norm": 6.301119804382324, "learning_rate": 2.3007567219449364e-07, "loss": 0.0493, "step": 4910 }, { "epoch": 0.7746973222384351, "grad_norm": 2.393937349319458, "learning_rate": 2.2991466752535822e-07, "loss": 0.02, "step": 4911 }, { "epoch": 0.7748550696060259, "grad_norm": 5.6482930183410645, "learning_rate": 2.2975366285622283e-07, "loss": 0.0235, "step": 4912 }, { "epoch": 0.7750128169736168, "grad_norm": 6.207182884216309, "learning_rate": 2.2959265818708744e-07, "loss": 0.0537, "step": 4913 }, { "epoch": 0.7751705643412076, "grad_norm": 3.2115135192871094, "learning_rate": 2.2943165351795202e-07, "loss": 0.0363, "step": 4914 }, { "epoch": 0.7753283117087983, "grad_norm": 2.6930439472198486, "learning_rate": 2.2927064884881662e-07, "loss": 0.033, "step": 4915 }, { "epoch": 0.7754860590763891, "grad_norm": 5.9401984214782715, "learning_rate": 2.291096441796812e-07, "loss": 0.0288, "step": 4916 }, { "epoch": 0.77564380644398, "grad_norm": 3.733520984649658, "learning_rate": 2.289486395105458e-07, "loss": 0.0402, "step": 4917 }, { "epoch": 0.7758015538115708, "grad_norm": 3.3514657020568848, "learning_rate": 2.287876348414104e-07, "loss": 0.0177, "step": 4918 }, { "epoch": 0.7759593011791616, "grad_norm": 7.077025890350342, "learning_rate": 2.28626630172275e-07, "loss": 0.0973, "step": 4919 }, { "epoch": 0.7761170485467523, "grad_norm": 5.048889636993408, "learning_rate": 2.2846562550313957e-07, "loss": 0.0482, "step": 4920 }, { "epoch": 0.7762747959143432, "grad_norm": 5.32190465927124, "learning_rate": 2.2830462083400418e-07, "loss": 0.0361, "step": 4921 }, { "epoch": 0.776432543281934, "grad_norm": 3.7085556983947754, "learning_rate": 2.2814361616486876e-07, "loss": 0.0413, "step": 4922 }, { "epoch": 0.7765902906495248, "grad_norm": 5.97633695602417, "learning_rate": 2.2798261149573337e-07, "loss": 0.0395, "step": 4923 }, { "epoch": 0.7767480380171156, "grad_norm": 6.187735080718994, "learning_rate": 2.2782160682659795e-07, "loss": 0.0464, "step": 4924 }, { "epoch": 0.7769057853847063, "grad_norm": 6.129651069641113, "learning_rate": 2.2766060215746255e-07, "loss": 0.0365, "step": 4925 }, { "epoch": 0.7770635327522972, "grad_norm": 3.3210835456848145, "learning_rate": 2.2749959748832713e-07, "loss": 0.0744, "step": 4926 }, { "epoch": 0.777221280119888, "grad_norm": 4.758389472961426, "learning_rate": 2.2733859281919174e-07, "loss": 0.037, "step": 4927 }, { "epoch": 0.7773790274874788, "grad_norm": 2.596815824508667, "learning_rate": 2.2717758815005632e-07, "loss": 0.0134, "step": 4928 }, { "epoch": 0.7775367748550696, "grad_norm": 7.266978740692139, "learning_rate": 2.2701658348092093e-07, "loss": 0.0338, "step": 4929 }, { "epoch": 0.7776945222226604, "grad_norm": 5.165182113647461, "learning_rate": 2.2685557881178553e-07, "loss": 0.0537, "step": 4930 }, { "epoch": 0.7778522695902512, "grad_norm": 8.269397735595703, "learning_rate": 2.266945741426501e-07, "loss": 0.0432, "step": 4931 }, { "epoch": 0.778010016957842, "grad_norm": 5.935682773590088, "learning_rate": 2.2653356947351474e-07, "loss": 0.0484, "step": 4932 }, { "epoch": 0.7781677643254328, "grad_norm": 2.917829751968384, "learning_rate": 2.2637256480437932e-07, "loss": 0.0229, "step": 4933 }, { "epoch": 0.7783255116930237, "grad_norm": 3.5857861042022705, "learning_rate": 2.2621156013524393e-07, "loss": 0.0288, "step": 4934 }, { "epoch": 0.7784832590606144, "grad_norm": 7.50002384185791, "learning_rate": 2.260505554661085e-07, "loss": 0.0308, "step": 4935 }, { "epoch": 0.7786410064282052, "grad_norm": 2.9288904666900635, "learning_rate": 2.2588955079697312e-07, "loss": 0.0164, "step": 4936 }, { "epoch": 0.778798753795796, "grad_norm": 3.809661388397217, "learning_rate": 2.257285461278377e-07, "loss": 0.024, "step": 4937 }, { "epoch": 0.7789565011633869, "grad_norm": 5.5322442054748535, "learning_rate": 2.255675414587023e-07, "loss": 0.0491, "step": 4938 }, { "epoch": 0.7791142485309777, "grad_norm": 2.1326024532318115, "learning_rate": 2.2540653678956688e-07, "loss": 0.0106, "step": 4939 }, { "epoch": 0.7792719958985684, "grad_norm": 7.786957263946533, "learning_rate": 2.252455321204315e-07, "loss": 0.0653, "step": 4940 }, { "epoch": 0.7794297432661592, "grad_norm": 4.649642467498779, "learning_rate": 2.2508452745129607e-07, "loss": 0.0257, "step": 4941 }, { "epoch": 0.7795874906337501, "grad_norm": 3.2936179637908936, "learning_rate": 2.2492352278216068e-07, "loss": 0.0304, "step": 4942 }, { "epoch": 0.7797452380013409, "grad_norm": 3.508375883102417, "learning_rate": 2.2476251811302526e-07, "loss": 0.0197, "step": 4943 }, { "epoch": 0.7799029853689317, "grad_norm": 4.908533573150635, "learning_rate": 2.2460151344388986e-07, "loss": 0.0472, "step": 4944 }, { "epoch": 0.7800607327365224, "grad_norm": 3.5363972187042236, "learning_rate": 2.2444050877475447e-07, "loss": 0.0446, "step": 4945 }, { "epoch": 0.7802184801041132, "grad_norm": 8.4215726852417, "learning_rate": 2.2427950410561905e-07, "loss": 0.0679, "step": 4946 }, { "epoch": 0.7803762274717041, "grad_norm": 4.550530910491943, "learning_rate": 2.2411849943648365e-07, "loss": 0.0251, "step": 4947 }, { "epoch": 0.7805339748392949, "grad_norm": 4.5460662841796875, "learning_rate": 2.2395749476734823e-07, "loss": 0.0345, "step": 4948 }, { "epoch": 0.7806917222068857, "grad_norm": 6.487603187561035, "learning_rate": 2.2379649009821284e-07, "loss": 0.0756, "step": 4949 }, { "epoch": 0.7808494695744764, "grad_norm": 7.149035453796387, "learning_rate": 2.2363548542907742e-07, "loss": 0.0377, "step": 4950 }, { "epoch": 0.7810072169420673, "grad_norm": 5.912919521331787, "learning_rate": 2.2347448075994203e-07, "loss": 0.0654, "step": 4951 }, { "epoch": 0.7811649643096581, "grad_norm": 7.855524063110352, "learning_rate": 2.233134760908066e-07, "loss": 0.0607, "step": 4952 }, { "epoch": 0.7813227116772489, "grad_norm": 7.1649065017700195, "learning_rate": 2.231524714216712e-07, "loss": 0.0493, "step": 4953 }, { "epoch": 0.7814804590448396, "grad_norm": 2.527353525161743, "learning_rate": 2.229914667525358e-07, "loss": 0.0193, "step": 4954 }, { "epoch": 0.7816382064124305, "grad_norm": 9.294767379760742, "learning_rate": 2.2283046208340042e-07, "loss": 0.0814, "step": 4955 }, { "epoch": 0.7817959537800213, "grad_norm": 7.593995094299316, "learning_rate": 2.22669457414265e-07, "loss": 0.049, "step": 4956 }, { "epoch": 0.7819537011476121, "grad_norm": 9.311671257019043, "learning_rate": 2.225084527451296e-07, "loss": 0.0796, "step": 4957 }, { "epoch": 0.7821114485152029, "grad_norm": 5.528035640716553, "learning_rate": 2.223474480759942e-07, "loss": 0.0503, "step": 4958 }, { "epoch": 0.7822691958827938, "grad_norm": 3.1407878398895264, "learning_rate": 2.221864434068588e-07, "loss": 0.0189, "step": 4959 }, { "epoch": 0.7824269432503845, "grad_norm": 2.911999464035034, "learning_rate": 2.220254387377234e-07, "loss": 0.0364, "step": 4960 }, { "epoch": 0.7825846906179753, "grad_norm": 6.310225963592529, "learning_rate": 2.2186443406858798e-07, "loss": 0.0259, "step": 4961 }, { "epoch": 0.7827424379855661, "grad_norm": 5.104500770568848, "learning_rate": 2.217034293994526e-07, "loss": 0.0329, "step": 4962 }, { "epoch": 0.782900185353157, "grad_norm": 4.815276145935059, "learning_rate": 2.2154242473031717e-07, "loss": 0.0276, "step": 4963 }, { "epoch": 0.7830579327207478, "grad_norm": 4.592746734619141, "learning_rate": 2.2138142006118178e-07, "loss": 0.0459, "step": 4964 }, { "epoch": 0.7832156800883385, "grad_norm": 4.836174488067627, "learning_rate": 2.2122041539204636e-07, "loss": 0.0271, "step": 4965 }, { "epoch": 0.7833734274559293, "grad_norm": 4.292660713195801, "learning_rate": 2.2105941072291096e-07, "loss": 0.0321, "step": 4966 }, { "epoch": 0.7835311748235201, "grad_norm": 7.005777359008789, "learning_rate": 2.2089840605377554e-07, "loss": 0.0299, "step": 4967 }, { "epoch": 0.783688922191111, "grad_norm": 4.866457462310791, "learning_rate": 2.2073740138464015e-07, "loss": 0.034, "step": 4968 }, { "epoch": 0.7838466695587017, "grad_norm": 5.929502010345459, "learning_rate": 2.2057639671550473e-07, "loss": 0.0552, "step": 4969 }, { "epoch": 0.7840044169262925, "grad_norm": 7.180016040802002, "learning_rate": 2.2041539204636933e-07, "loss": 0.0523, "step": 4970 }, { "epoch": 0.7841621642938833, "grad_norm": 5.354877948760986, "learning_rate": 2.2025438737723391e-07, "loss": 0.06, "step": 4971 }, { "epoch": 0.7843199116614742, "grad_norm": 5.529969692230225, "learning_rate": 2.2009338270809852e-07, "loss": 0.0269, "step": 4972 }, { "epoch": 0.784477659029065, "grad_norm": 2.358067512512207, "learning_rate": 2.199323780389631e-07, "loss": 0.04, "step": 4973 }, { "epoch": 0.7846354063966557, "grad_norm": 4.004404544830322, "learning_rate": 2.197713733698277e-07, "loss": 0.0311, "step": 4974 }, { "epoch": 0.7847931537642465, "grad_norm": 3.5707602500915527, "learning_rate": 2.196103687006923e-07, "loss": 0.0191, "step": 4975 }, { "epoch": 0.7849509011318374, "grad_norm": 22.91334342956543, "learning_rate": 2.194493640315569e-07, "loss": 0.0181, "step": 4976 }, { "epoch": 0.7851086484994282, "grad_norm": 4.836089134216309, "learning_rate": 2.192883593624215e-07, "loss": 0.0604, "step": 4977 }, { "epoch": 0.785266395867019, "grad_norm": 5.083307266235352, "learning_rate": 2.1912735469328608e-07, "loss": 0.0234, "step": 4978 }, { "epoch": 0.7854241432346097, "grad_norm": 3.707251787185669, "learning_rate": 2.189663500241507e-07, "loss": 0.0142, "step": 4979 }, { "epoch": 0.7855818906022006, "grad_norm": 6.500045299530029, "learning_rate": 2.188053453550153e-07, "loss": 0.0267, "step": 4980 }, { "epoch": 0.7857396379697914, "grad_norm": 9.503015518188477, "learning_rate": 2.186443406858799e-07, "loss": 0.0521, "step": 4981 }, { "epoch": 0.7858973853373822, "grad_norm": 4.674173355102539, "learning_rate": 2.1848333601674448e-07, "loss": 0.0601, "step": 4982 }, { "epoch": 0.786055132704973, "grad_norm": 3.1530070304870605, "learning_rate": 2.1832233134760908e-07, "loss": 0.0256, "step": 4983 }, { "epoch": 0.7862128800725637, "grad_norm": 5.259850978851318, "learning_rate": 2.1816132667847366e-07, "loss": 0.0371, "step": 4984 }, { "epoch": 0.7863706274401546, "grad_norm": 4.983250617980957, "learning_rate": 2.1800032200933827e-07, "loss": 0.0252, "step": 4985 }, { "epoch": 0.7865283748077454, "grad_norm": 4.136760234832764, "learning_rate": 2.1783931734020285e-07, "loss": 0.046, "step": 4986 }, { "epoch": 0.7866861221753362, "grad_norm": 3.8495824337005615, "learning_rate": 2.1767831267106746e-07, "loss": 0.0566, "step": 4987 }, { "epoch": 0.786843869542927, "grad_norm": 5.45047664642334, "learning_rate": 2.1751730800193204e-07, "loss": 0.0453, "step": 4988 }, { "epoch": 0.7870016169105178, "grad_norm": 7.654728412628174, "learning_rate": 2.1735630333279664e-07, "loss": 0.0835, "step": 4989 }, { "epoch": 0.7871593642781086, "grad_norm": 6.550886631011963, "learning_rate": 2.1719529866366125e-07, "loss": 0.0563, "step": 4990 }, { "epoch": 0.7873171116456994, "grad_norm": 7.195198059082031, "learning_rate": 2.1703429399452583e-07, "loss": 0.0516, "step": 4991 }, { "epoch": 0.7874748590132902, "grad_norm": 5.970688819885254, "learning_rate": 2.1687328932539044e-07, "loss": 0.0673, "step": 4992 }, { "epoch": 0.7876326063808811, "grad_norm": 5.075960159301758, "learning_rate": 2.1671228465625501e-07, "loss": 0.0229, "step": 4993 }, { "epoch": 0.7877903537484718, "grad_norm": 6.305851936340332, "learning_rate": 2.1655127998711962e-07, "loss": 0.0966, "step": 4994 }, { "epoch": 0.7879481011160626, "grad_norm": 4.926904201507568, "learning_rate": 2.163902753179842e-07, "loss": 0.0955, "step": 4995 }, { "epoch": 0.7881058484836534, "grad_norm": 5.612282752990723, "learning_rate": 2.162292706488488e-07, "loss": 0.0505, "step": 4996 }, { "epoch": 0.7882635958512443, "grad_norm": 4.992518901824951, "learning_rate": 2.160682659797134e-07, "loss": 0.0394, "step": 4997 }, { "epoch": 0.7884213432188351, "grad_norm": 2.6907243728637695, "learning_rate": 2.15907261310578e-07, "loss": 0.022, "step": 4998 }, { "epoch": 0.7885790905864258, "grad_norm": 3.63834547996521, "learning_rate": 2.1574625664144257e-07, "loss": 0.0419, "step": 4999 }, { "epoch": 0.7887368379540166, "grad_norm": 5.431453704833984, "learning_rate": 2.1558525197230718e-07, "loss": 0.0387, "step": 5000 }, { "epoch": 0.7888945853216075, "grad_norm": 3.6970741748809814, "learning_rate": 2.1542424730317176e-07, "loss": 0.0143, "step": 5001 }, { "epoch": 0.7890523326891983, "grad_norm": 4.054006576538086, "learning_rate": 2.152632426340364e-07, "loss": 0.0461, "step": 5002 }, { "epoch": 0.789210080056789, "grad_norm": 5.707550048828125, "learning_rate": 2.1510223796490097e-07, "loss": 0.0571, "step": 5003 }, { "epoch": 0.7893678274243798, "grad_norm": 2.3995659351348877, "learning_rate": 2.1494123329576558e-07, "loss": 0.0114, "step": 5004 }, { "epoch": 0.7895255747919706, "grad_norm": 6.082056999206543, "learning_rate": 2.1478022862663018e-07, "loss": 0.0759, "step": 5005 }, { "epoch": 0.7896833221595615, "grad_norm": 8.491081237792969, "learning_rate": 2.1461922395749476e-07, "loss": 0.0741, "step": 5006 }, { "epoch": 0.7898410695271523, "grad_norm": 3.2995965480804443, "learning_rate": 2.1445821928835937e-07, "loss": 0.0358, "step": 5007 }, { "epoch": 0.789998816894743, "grad_norm": 3.908900737762451, "learning_rate": 2.1429721461922395e-07, "loss": 0.0341, "step": 5008 }, { "epoch": 0.7901565642623338, "grad_norm": 3.2598395347595215, "learning_rate": 2.1413620995008856e-07, "loss": 0.0123, "step": 5009 }, { "epoch": 0.7903143116299247, "grad_norm": 5.370056629180908, "learning_rate": 2.1397520528095314e-07, "loss": 0.0787, "step": 5010 }, { "epoch": 0.7904720589975155, "grad_norm": 4.599198818206787, "learning_rate": 2.1381420061181774e-07, "loss": 0.0424, "step": 5011 }, { "epoch": 0.7906298063651063, "grad_norm": 7.477070331573486, "learning_rate": 2.1365319594268232e-07, "loss": 0.0829, "step": 5012 }, { "epoch": 0.790787553732697, "grad_norm": 3.146921396255493, "learning_rate": 2.1349219127354693e-07, "loss": 0.0374, "step": 5013 }, { "epoch": 0.7909453011002879, "grad_norm": 5.969034194946289, "learning_rate": 2.133311866044115e-07, "loss": 0.0502, "step": 5014 }, { "epoch": 0.7911030484678787, "grad_norm": 6.728024005889893, "learning_rate": 2.1317018193527612e-07, "loss": 0.0762, "step": 5015 }, { "epoch": 0.7912607958354695, "grad_norm": 3.958031177520752, "learning_rate": 2.130091772661407e-07, "loss": 0.0425, "step": 5016 }, { "epoch": 0.7914185432030603, "grad_norm": 7.5270586013793945, "learning_rate": 2.128481725970053e-07, "loss": 0.0662, "step": 5017 }, { "epoch": 0.7915762905706512, "grad_norm": 4.066029071807861, "learning_rate": 2.1268716792786988e-07, "loss": 0.0555, "step": 5018 }, { "epoch": 0.7917340379382419, "grad_norm": 6.936734199523926, "learning_rate": 2.125261632587345e-07, "loss": 0.0457, "step": 5019 }, { "epoch": 0.7918917853058327, "grad_norm": 4.055303573608398, "learning_rate": 2.1236515858959907e-07, "loss": 0.0227, "step": 5020 }, { "epoch": 0.7920495326734235, "grad_norm": 6.9554619789123535, "learning_rate": 2.1220415392046367e-07, "loss": 0.0445, "step": 5021 }, { "epoch": 0.7922072800410143, "grad_norm": 5.465765476226807, "learning_rate": 2.1204314925132828e-07, "loss": 0.0621, "step": 5022 }, { "epoch": 0.7923650274086051, "grad_norm": 5.453721046447754, "learning_rate": 2.1188214458219286e-07, "loss": 0.0395, "step": 5023 }, { "epoch": 0.7925227747761959, "grad_norm": 8.362287521362305, "learning_rate": 2.1172113991305747e-07, "loss": 0.0564, "step": 5024 }, { "epoch": 0.7926805221437867, "grad_norm": 6.847805500030518, "learning_rate": 2.1156013524392207e-07, "loss": 0.0989, "step": 5025 }, { "epoch": 0.7928382695113775, "grad_norm": 6.537115573883057, "learning_rate": 2.1139913057478668e-07, "loss": 0.0343, "step": 5026 }, { "epoch": 0.7929960168789684, "grad_norm": 5.798552989959717, "learning_rate": 2.1123812590565126e-07, "loss": 0.0401, "step": 5027 }, { "epoch": 0.7931537642465591, "grad_norm": 6.021036624908447, "learning_rate": 2.1107712123651587e-07, "loss": 0.0705, "step": 5028 }, { "epoch": 0.7933115116141499, "grad_norm": 11.112351417541504, "learning_rate": 2.1091611656738045e-07, "loss": 0.0628, "step": 5029 }, { "epoch": 0.7934692589817407, "grad_norm": 4.815724849700928, "learning_rate": 2.1075511189824505e-07, "loss": 0.0242, "step": 5030 }, { "epoch": 0.7936270063493316, "grad_norm": 8.125972747802734, "learning_rate": 2.1059410722910963e-07, "loss": 0.0861, "step": 5031 }, { "epoch": 0.7937847537169224, "grad_norm": 6.67203950881958, "learning_rate": 2.1043310255997424e-07, "loss": 0.0603, "step": 5032 }, { "epoch": 0.7939425010845131, "grad_norm": 5.425326347351074, "learning_rate": 2.1027209789083882e-07, "loss": 0.0695, "step": 5033 }, { "epoch": 0.7941002484521039, "grad_norm": 3.3202567100524902, "learning_rate": 2.1011109322170342e-07, "loss": 0.0209, "step": 5034 }, { "epoch": 0.7942579958196948, "grad_norm": 4.705724239349365, "learning_rate": 2.09950088552568e-07, "loss": 0.0535, "step": 5035 }, { "epoch": 0.7944157431872856, "grad_norm": 8.349897384643555, "learning_rate": 2.097890838834326e-07, "loss": 0.0793, "step": 5036 }, { "epoch": 0.7945734905548764, "grad_norm": 4.8175153732299805, "learning_rate": 2.0962807921429722e-07, "loss": 0.0428, "step": 5037 }, { "epoch": 0.7947312379224671, "grad_norm": 5.381453037261963, "learning_rate": 2.094670745451618e-07, "loss": 0.06, "step": 5038 }, { "epoch": 0.794888985290058, "grad_norm": 7.311165809631348, "learning_rate": 2.093060698760264e-07, "loss": 0.0732, "step": 5039 }, { "epoch": 0.7950467326576488, "grad_norm": 4.570580005645752, "learning_rate": 2.0914506520689098e-07, "loss": 0.0466, "step": 5040 }, { "epoch": 0.7952044800252396, "grad_norm": 6.096408367156982, "learning_rate": 2.089840605377556e-07, "loss": 0.087, "step": 5041 }, { "epoch": 0.7953622273928304, "grad_norm": 3.1089320182800293, "learning_rate": 2.0882305586862017e-07, "loss": 0.0161, "step": 5042 }, { "epoch": 0.7955199747604211, "grad_norm": 4.253561496734619, "learning_rate": 2.0866205119948477e-07, "loss": 0.0497, "step": 5043 }, { "epoch": 0.795677722128012, "grad_norm": 5.132200241088867, "learning_rate": 2.0850104653034935e-07, "loss": 0.0631, "step": 5044 }, { "epoch": 0.7958354694956028, "grad_norm": 1.9497289657592773, "learning_rate": 2.0834004186121396e-07, "loss": 0.0427, "step": 5045 }, { "epoch": 0.7959932168631936, "grad_norm": 3.9391345977783203, "learning_rate": 2.0817903719207854e-07, "loss": 0.0766, "step": 5046 }, { "epoch": 0.7961509642307844, "grad_norm": 9.721490859985352, "learning_rate": 2.0801803252294315e-07, "loss": 0.0479, "step": 5047 }, { "epoch": 0.7963087115983752, "grad_norm": 4.6253767013549805, "learning_rate": 2.0785702785380775e-07, "loss": 0.0686, "step": 5048 }, { "epoch": 0.796466458965966, "grad_norm": 5.105081081390381, "learning_rate": 2.0769602318467236e-07, "loss": 0.0437, "step": 5049 }, { "epoch": 0.7966242063335568, "grad_norm": 4.698444366455078, "learning_rate": 2.0753501851553694e-07, "loss": 0.0604, "step": 5050 }, { "epoch": 0.7967819537011476, "grad_norm": 3.0366082191467285, "learning_rate": 2.0737401384640155e-07, "loss": 0.0203, "step": 5051 }, { "epoch": 0.7969397010687385, "grad_norm": 1.8577876091003418, "learning_rate": 2.0721300917726615e-07, "loss": 0.0143, "step": 5052 }, { "epoch": 0.7970974484363292, "grad_norm": 3.2034897804260254, "learning_rate": 2.0705200450813073e-07, "loss": 0.0335, "step": 5053 }, { "epoch": 0.79725519580392, "grad_norm": 7.292186737060547, "learning_rate": 2.0689099983899534e-07, "loss": 0.054, "step": 5054 }, { "epoch": 0.7974129431715108, "grad_norm": 5.905879497528076, "learning_rate": 2.0672999516985992e-07, "loss": 0.042, "step": 5055 }, { "epoch": 0.7975706905391017, "grad_norm": 4.840546131134033, "learning_rate": 2.0656899050072452e-07, "loss": 0.0408, "step": 5056 }, { "epoch": 0.7977284379066925, "grad_norm": 6.667255401611328, "learning_rate": 2.064079858315891e-07, "loss": 0.035, "step": 5057 }, { "epoch": 0.7978861852742832, "grad_norm": 5.428292751312256, "learning_rate": 2.062469811624537e-07, "loss": 0.0349, "step": 5058 }, { "epoch": 0.798043932641874, "grad_norm": 3.1888558864593506, "learning_rate": 2.060859764933183e-07, "loss": 0.0363, "step": 5059 }, { "epoch": 0.7982016800094648, "grad_norm": 3.992692470550537, "learning_rate": 2.059249718241829e-07, "loss": 0.0483, "step": 5060 }, { "epoch": 0.7983594273770557, "grad_norm": 4.776655197143555, "learning_rate": 2.0576396715504748e-07, "loss": 0.072, "step": 5061 }, { "epoch": 0.7985171747446465, "grad_norm": 8.188023567199707, "learning_rate": 2.0560296248591208e-07, "loss": 0.0304, "step": 5062 }, { "epoch": 0.7986749221122372, "grad_norm": 2.867313861846924, "learning_rate": 2.0544195781677666e-07, "loss": 0.008, "step": 5063 }, { "epoch": 0.798832669479828, "grad_norm": 9.445868492126465, "learning_rate": 2.0528095314764127e-07, "loss": 0.0736, "step": 5064 }, { "epoch": 0.7989904168474189, "grad_norm": 5.495667934417725, "learning_rate": 2.0511994847850585e-07, "loss": 0.0257, "step": 5065 }, { "epoch": 0.7991481642150097, "grad_norm": 6.540268898010254, "learning_rate": 2.0495894380937046e-07, "loss": 0.0598, "step": 5066 }, { "epoch": 0.7993059115826004, "grad_norm": 4.974075794219971, "learning_rate": 2.0479793914023506e-07, "loss": 0.056, "step": 5067 }, { "epoch": 0.7994636589501912, "grad_norm": 0.9020988941192627, "learning_rate": 2.0463693447109964e-07, "loss": 0.0085, "step": 5068 }, { "epoch": 0.7996214063177821, "grad_norm": 7.461383819580078, "learning_rate": 2.0447592980196425e-07, "loss": 0.0706, "step": 5069 }, { "epoch": 0.7997791536853729, "grad_norm": 3.6474037170410156, "learning_rate": 2.0431492513282883e-07, "loss": 0.0239, "step": 5070 }, { "epoch": 0.7999369010529637, "grad_norm": 6.408823013305664, "learning_rate": 2.0415392046369343e-07, "loss": 0.0448, "step": 5071 }, { "epoch": 0.8000946484205544, "grad_norm": 3.4695098400115967, "learning_rate": 2.0399291579455804e-07, "loss": 0.0334, "step": 5072 }, { "epoch": 0.8002523957881453, "grad_norm": 5.335095405578613, "learning_rate": 2.0383191112542265e-07, "loss": 0.0565, "step": 5073 }, { "epoch": 0.8004101431557361, "grad_norm": 4.53596305847168, "learning_rate": 2.0367090645628723e-07, "loss": 0.0516, "step": 5074 }, { "epoch": 0.8005678905233269, "grad_norm": 6.096803188323975, "learning_rate": 2.0350990178715183e-07, "loss": 0.0783, "step": 5075 }, { "epoch": 0.8007256378909177, "grad_norm": 2.34279465675354, "learning_rate": 2.033488971180164e-07, "loss": 0.0354, "step": 5076 }, { "epoch": 0.8008833852585086, "grad_norm": 3.428112506866455, "learning_rate": 2.0318789244888102e-07, "loss": 0.0316, "step": 5077 }, { "epoch": 0.8010411326260993, "grad_norm": 4.4523115158081055, "learning_rate": 2.030268877797456e-07, "loss": 0.0419, "step": 5078 }, { "epoch": 0.8011988799936901, "grad_norm": 8.772183418273926, "learning_rate": 2.028658831106102e-07, "loss": 0.0461, "step": 5079 }, { "epoch": 0.8013566273612809, "grad_norm": 3.7308502197265625, "learning_rate": 2.0270487844147478e-07, "loss": 0.0295, "step": 5080 }, { "epoch": 0.8015143747288717, "grad_norm": 5.455733299255371, "learning_rate": 2.025438737723394e-07, "loss": 0.0342, "step": 5081 }, { "epoch": 0.8016721220964625, "grad_norm": 3.7311489582061768, "learning_rate": 2.02382869103204e-07, "loss": 0.0563, "step": 5082 }, { "epoch": 0.8018298694640533, "grad_norm": 3.39882755279541, "learning_rate": 2.0222186443406858e-07, "loss": 0.0284, "step": 5083 }, { "epoch": 0.8019876168316441, "grad_norm": 3.38927960395813, "learning_rate": 2.0206085976493318e-07, "loss": 0.0239, "step": 5084 }, { "epoch": 0.8021453641992349, "grad_norm": 4.0973005294799805, "learning_rate": 2.0189985509579776e-07, "loss": 0.0294, "step": 5085 }, { "epoch": 0.8023031115668258, "grad_norm": 3.903991937637329, "learning_rate": 2.0173885042666237e-07, "loss": 0.0475, "step": 5086 }, { "epoch": 0.8024608589344165, "grad_norm": 4.17649507522583, "learning_rate": 2.0157784575752695e-07, "loss": 0.0355, "step": 5087 }, { "epoch": 0.8026186063020073, "grad_norm": 7.29484748840332, "learning_rate": 2.0141684108839156e-07, "loss": 0.0483, "step": 5088 }, { "epoch": 0.8027763536695981, "grad_norm": 4.985876083374023, "learning_rate": 2.0125583641925614e-07, "loss": 0.0327, "step": 5089 }, { "epoch": 0.802934101037189, "grad_norm": 10.11892318725586, "learning_rate": 2.0109483175012074e-07, "loss": 0.0445, "step": 5090 }, { "epoch": 0.8030918484047798, "grad_norm": 3.6641905307769775, "learning_rate": 2.0093382708098532e-07, "loss": 0.0733, "step": 5091 }, { "epoch": 0.8032495957723705, "grad_norm": 5.7901930809021, "learning_rate": 2.0077282241184993e-07, "loss": 0.0476, "step": 5092 }, { "epoch": 0.8034073431399613, "grad_norm": 6.130947113037109, "learning_rate": 2.006118177427145e-07, "loss": 0.0747, "step": 5093 }, { "epoch": 0.8035650905075522, "grad_norm": 4.188258647918701, "learning_rate": 2.0045081307357911e-07, "loss": 0.0255, "step": 5094 }, { "epoch": 0.803722837875143, "grad_norm": 5.715501308441162, "learning_rate": 2.0028980840444372e-07, "loss": 0.0808, "step": 5095 }, { "epoch": 0.8038805852427338, "grad_norm": 2.4276111125946045, "learning_rate": 2.0012880373530833e-07, "loss": 0.0302, "step": 5096 }, { "epoch": 0.8040383326103245, "grad_norm": 7.273693084716797, "learning_rate": 1.9996779906617293e-07, "loss": 0.0404, "step": 5097 }, { "epoch": 0.8041960799779154, "grad_norm": 3.8964407444000244, "learning_rate": 1.998067943970375e-07, "loss": 0.0211, "step": 5098 }, { "epoch": 0.8043538273455062, "grad_norm": 3.150508165359497, "learning_rate": 1.9964578972790212e-07, "loss": 0.0354, "step": 5099 }, { "epoch": 0.804511574713097, "grad_norm": 3.0548505783081055, "learning_rate": 1.994847850587667e-07, "loss": 0.0541, "step": 5100 }, { "epoch": 0.8046693220806878, "grad_norm": 3.6085751056671143, "learning_rate": 1.993237803896313e-07, "loss": 0.0259, "step": 5101 }, { "epoch": 0.8048270694482785, "grad_norm": 3.171452283859253, "learning_rate": 1.9916277572049589e-07, "loss": 0.0194, "step": 5102 }, { "epoch": 0.8049848168158694, "grad_norm": 6.587282180786133, "learning_rate": 1.990017710513605e-07, "loss": 0.0498, "step": 5103 }, { "epoch": 0.8051425641834602, "grad_norm": 3.8433663845062256, "learning_rate": 1.9884076638222507e-07, "loss": 0.0298, "step": 5104 }, { "epoch": 0.805300311551051, "grad_norm": 6.593711853027344, "learning_rate": 1.9867976171308968e-07, "loss": 0.0736, "step": 5105 }, { "epoch": 0.8054580589186418, "grad_norm": 7.378602981567383, "learning_rate": 1.9851875704395426e-07, "loss": 0.03, "step": 5106 }, { "epoch": 0.8056158062862326, "grad_norm": 7.948828220367432, "learning_rate": 1.9835775237481886e-07, "loss": 0.0381, "step": 5107 }, { "epoch": 0.8057735536538234, "grad_norm": 4.1958465576171875, "learning_rate": 1.9819674770568344e-07, "loss": 0.0262, "step": 5108 }, { "epoch": 0.8059313010214142, "grad_norm": 4.811060428619385, "learning_rate": 1.9803574303654805e-07, "loss": 0.0777, "step": 5109 }, { "epoch": 0.806089048389005, "grad_norm": 3.1506600379943848, "learning_rate": 1.9787473836741263e-07, "loss": 0.0367, "step": 5110 }, { "epoch": 0.8062467957565959, "grad_norm": 3.457339286804199, "learning_rate": 1.9771373369827724e-07, "loss": 0.0443, "step": 5111 }, { "epoch": 0.8064045431241866, "grad_norm": 5.397216320037842, "learning_rate": 1.9755272902914182e-07, "loss": 0.0317, "step": 5112 }, { "epoch": 0.8065622904917774, "grad_norm": 3.1989307403564453, "learning_rate": 1.9739172436000642e-07, "loss": 0.0334, "step": 5113 }, { "epoch": 0.8067200378593682, "grad_norm": 3.706082582473755, "learning_rate": 1.9723071969087103e-07, "loss": 0.0301, "step": 5114 }, { "epoch": 0.8068777852269591, "grad_norm": 6.416573524475098, "learning_rate": 1.970697150217356e-07, "loss": 0.0246, "step": 5115 }, { "epoch": 0.8070355325945499, "grad_norm": 3.8669443130493164, "learning_rate": 1.9690871035260022e-07, "loss": 0.044, "step": 5116 }, { "epoch": 0.8071932799621406, "grad_norm": 4.944881439208984, "learning_rate": 1.967477056834648e-07, "loss": 0.089, "step": 5117 }, { "epoch": 0.8073510273297314, "grad_norm": 2.441859006881714, "learning_rate": 1.9658670101432943e-07, "loss": 0.0148, "step": 5118 }, { "epoch": 0.8075087746973222, "grad_norm": 6.643697261810303, "learning_rate": 1.96425696345194e-07, "loss": 0.0612, "step": 5119 }, { "epoch": 0.8076665220649131, "grad_norm": 3.9879868030548096, "learning_rate": 1.9626469167605861e-07, "loss": 0.0216, "step": 5120 }, { "epoch": 0.8078242694325038, "grad_norm": 5.297037601470947, "learning_rate": 1.961036870069232e-07, "loss": 0.0826, "step": 5121 }, { "epoch": 0.8079820168000946, "grad_norm": 4.819685935974121, "learning_rate": 1.959426823377878e-07, "loss": 0.0512, "step": 5122 }, { "epoch": 0.8081397641676854, "grad_norm": 5.230579853057861, "learning_rate": 1.9578167766865238e-07, "loss": 0.0202, "step": 5123 }, { "epoch": 0.8082975115352763, "grad_norm": 5.653816223144531, "learning_rate": 1.9562067299951699e-07, "loss": 0.0703, "step": 5124 }, { "epoch": 0.8084552589028671, "grad_norm": 4.329024791717529, "learning_rate": 1.9545966833038157e-07, "loss": 0.0398, "step": 5125 }, { "epoch": 0.8086130062704578, "grad_norm": 4.0327229499816895, "learning_rate": 1.9529866366124617e-07, "loss": 0.0449, "step": 5126 }, { "epoch": 0.8087707536380486, "grad_norm": 4.008519172668457, "learning_rate": 1.9513765899211075e-07, "loss": 0.0302, "step": 5127 }, { "epoch": 0.8089285010056395, "grad_norm": 5.778116703033447, "learning_rate": 1.9497665432297536e-07, "loss": 0.0319, "step": 5128 }, { "epoch": 0.8090862483732303, "grad_norm": 6.866086006164551, "learning_rate": 1.9481564965383996e-07, "loss": 0.0707, "step": 5129 }, { "epoch": 0.8092439957408211, "grad_norm": 5.5323686599731445, "learning_rate": 1.9465464498470454e-07, "loss": 0.0583, "step": 5130 }, { "epoch": 0.8094017431084118, "grad_norm": 5.858497619628906, "learning_rate": 1.9449364031556915e-07, "loss": 0.0484, "step": 5131 }, { "epoch": 0.8095594904760027, "grad_norm": 6.386364936828613, "learning_rate": 1.9433263564643373e-07, "loss": 0.0744, "step": 5132 }, { "epoch": 0.8097172378435935, "grad_norm": 2.88401460647583, "learning_rate": 1.9417163097729834e-07, "loss": 0.0156, "step": 5133 }, { "epoch": 0.8098749852111843, "grad_norm": 7.814047336578369, "learning_rate": 1.9401062630816292e-07, "loss": 0.0846, "step": 5134 }, { "epoch": 0.8100327325787751, "grad_norm": 5.588277339935303, "learning_rate": 1.9384962163902752e-07, "loss": 0.0675, "step": 5135 }, { "epoch": 0.810190479946366, "grad_norm": 6.536828994750977, "learning_rate": 1.936886169698921e-07, "loss": 0.0659, "step": 5136 }, { "epoch": 0.8103482273139567, "grad_norm": 6.740087985992432, "learning_rate": 1.935276123007567e-07, "loss": 0.0569, "step": 5137 }, { "epoch": 0.8105059746815475, "grad_norm": 5.166973114013672, "learning_rate": 1.933666076316213e-07, "loss": 0.025, "step": 5138 }, { "epoch": 0.8106637220491383, "grad_norm": 5.403120994567871, "learning_rate": 1.932056029624859e-07, "loss": 0.0587, "step": 5139 }, { "epoch": 0.8108214694167291, "grad_norm": 3.0564119815826416, "learning_rate": 1.9304459829335048e-07, "loss": 0.0176, "step": 5140 }, { "epoch": 0.81097921678432, "grad_norm": 7.829894065856934, "learning_rate": 1.928835936242151e-07, "loss": 0.0628, "step": 5141 }, { "epoch": 0.8111369641519107, "grad_norm": 5.746363639831543, "learning_rate": 1.927225889550797e-07, "loss": 0.0854, "step": 5142 }, { "epoch": 0.8112947115195015, "grad_norm": 4.095083713531494, "learning_rate": 1.925615842859443e-07, "loss": 0.054, "step": 5143 }, { "epoch": 0.8114524588870923, "grad_norm": 7.036856651306152, "learning_rate": 1.924005796168089e-07, "loss": 0.0686, "step": 5144 }, { "epoch": 0.8116102062546832, "grad_norm": 4.661142826080322, "learning_rate": 1.9223957494767348e-07, "loss": 0.0418, "step": 5145 }, { "epoch": 0.8117679536222739, "grad_norm": 10.97108268737793, "learning_rate": 1.920785702785381e-07, "loss": 0.0449, "step": 5146 }, { "epoch": 0.8119257009898647, "grad_norm": 6.629733085632324, "learning_rate": 1.9191756560940267e-07, "loss": 0.0372, "step": 5147 }, { "epoch": 0.8120834483574555, "grad_norm": 5.715473651885986, "learning_rate": 1.9175656094026727e-07, "loss": 0.0555, "step": 5148 }, { "epoch": 0.8122411957250464, "grad_norm": 4.085668087005615, "learning_rate": 1.9159555627113185e-07, "loss": 0.0411, "step": 5149 }, { "epoch": 0.8123989430926372, "grad_norm": 3.9186437129974365, "learning_rate": 1.9143455160199646e-07, "loss": 0.0254, "step": 5150 }, { "epoch": 0.8125566904602279, "grad_norm": 4.551998138427734, "learning_rate": 1.9127354693286104e-07, "loss": 0.047, "step": 5151 }, { "epoch": 0.8127144378278187, "grad_norm": 2.5676891803741455, "learning_rate": 1.9111254226372565e-07, "loss": 0.0142, "step": 5152 }, { "epoch": 0.8128721851954096, "grad_norm": 3.801438808441162, "learning_rate": 1.9095153759459023e-07, "loss": 0.0271, "step": 5153 }, { "epoch": 0.8130299325630004, "grad_norm": 6.762763023376465, "learning_rate": 1.9079053292545483e-07, "loss": 0.0453, "step": 5154 }, { "epoch": 0.8131876799305912, "grad_norm": 4.866330146789551, "learning_rate": 1.906295282563194e-07, "loss": 0.0637, "step": 5155 }, { "epoch": 0.8133454272981819, "grad_norm": 3.441608190536499, "learning_rate": 1.9046852358718402e-07, "loss": 0.0299, "step": 5156 }, { "epoch": 0.8135031746657727, "grad_norm": 5.448775291442871, "learning_rate": 1.903075189180486e-07, "loss": 0.0421, "step": 5157 }, { "epoch": 0.8136609220333636, "grad_norm": 4.715697765350342, "learning_rate": 1.901465142489132e-07, "loss": 0.0696, "step": 5158 }, { "epoch": 0.8138186694009544, "grad_norm": 3.6033568382263184, "learning_rate": 1.899855095797778e-07, "loss": 0.039, "step": 5159 }, { "epoch": 0.8139764167685452, "grad_norm": 6.063899993896484, "learning_rate": 1.898245049106424e-07, "loss": 0.04, "step": 5160 }, { "epoch": 0.8141341641361359, "grad_norm": 7.023772239685059, "learning_rate": 1.89663500241507e-07, "loss": 0.0609, "step": 5161 }, { "epoch": 0.8142919115037268, "grad_norm": 4.2832441329956055, "learning_rate": 1.8950249557237158e-07, "loss": 0.0452, "step": 5162 }, { "epoch": 0.8144496588713176, "grad_norm": 3.4046990871429443, "learning_rate": 1.8934149090323618e-07, "loss": 0.0176, "step": 5163 }, { "epoch": 0.8146074062389084, "grad_norm": 3.943274974822998, "learning_rate": 1.8918048623410076e-07, "loss": 0.0433, "step": 5164 }, { "epoch": 0.8147651536064991, "grad_norm": 5.466538906097412, "learning_rate": 1.890194815649654e-07, "loss": 0.0649, "step": 5165 }, { "epoch": 0.81492290097409, "grad_norm": 4.183228015899658, "learning_rate": 1.8885847689582997e-07, "loss": 0.0194, "step": 5166 }, { "epoch": 0.8150806483416808, "grad_norm": 7.976485729217529, "learning_rate": 1.8869747222669458e-07, "loss": 0.0629, "step": 5167 }, { "epoch": 0.8152383957092716, "grad_norm": 7.346073627471924, "learning_rate": 1.8853646755755916e-07, "loss": 0.0646, "step": 5168 }, { "epoch": 0.8153961430768624, "grad_norm": 3.0371408462524414, "learning_rate": 1.8837546288842377e-07, "loss": 0.0143, "step": 5169 }, { "epoch": 0.8155538904444533, "grad_norm": 3.4316720962524414, "learning_rate": 1.8821445821928835e-07, "loss": 0.02, "step": 5170 }, { "epoch": 0.815711637812044, "grad_norm": 6.701343536376953, "learning_rate": 1.8805345355015295e-07, "loss": 0.0246, "step": 5171 }, { "epoch": 0.8158693851796348, "grad_norm": 5.175650119781494, "learning_rate": 1.8789244888101753e-07, "loss": 0.0418, "step": 5172 }, { "epoch": 0.8160271325472256, "grad_norm": 5.159764766693115, "learning_rate": 1.8773144421188214e-07, "loss": 0.0232, "step": 5173 }, { "epoch": 0.8161848799148165, "grad_norm": 3.392036199569702, "learning_rate": 1.8757043954274675e-07, "loss": 0.0177, "step": 5174 }, { "epoch": 0.8163426272824073, "grad_norm": 3.4437549114227295, "learning_rate": 1.8740943487361133e-07, "loss": 0.0169, "step": 5175 }, { "epoch": 0.816500374649998, "grad_norm": 5.620139122009277, "learning_rate": 1.8724843020447593e-07, "loss": 0.0358, "step": 5176 }, { "epoch": 0.8166581220175888, "grad_norm": 4.827464580535889, "learning_rate": 1.870874255353405e-07, "loss": 0.0136, "step": 5177 }, { "epoch": 0.8168158693851796, "grad_norm": 5.521566390991211, "learning_rate": 1.8692642086620512e-07, "loss": 0.05, "step": 5178 }, { "epoch": 0.8169736167527705, "grad_norm": 7.122973442077637, "learning_rate": 1.867654161970697e-07, "loss": 0.0473, "step": 5179 }, { "epoch": 0.8171313641203612, "grad_norm": 5.359594345092773, "learning_rate": 1.866044115279343e-07, "loss": 0.0563, "step": 5180 }, { "epoch": 0.817289111487952, "grad_norm": 2.6821999549865723, "learning_rate": 1.8644340685879888e-07, "loss": 0.0166, "step": 5181 }, { "epoch": 0.8174468588555428, "grad_norm": 4.8910322189331055, "learning_rate": 1.862824021896635e-07, "loss": 0.0386, "step": 5182 }, { "epoch": 0.8176046062231337, "grad_norm": 3.7409276962280273, "learning_rate": 1.8612139752052807e-07, "loss": 0.0184, "step": 5183 }, { "epoch": 0.8177623535907245, "grad_norm": 6.966063022613525, "learning_rate": 1.8596039285139268e-07, "loss": 0.0756, "step": 5184 }, { "epoch": 0.8179201009583152, "grad_norm": 7.367618560791016, "learning_rate": 1.8579938818225726e-07, "loss": 0.045, "step": 5185 }, { "epoch": 0.818077848325906, "grad_norm": 5.712251663208008, "learning_rate": 1.8563838351312186e-07, "loss": 0.0427, "step": 5186 }, { "epoch": 0.8182355956934969, "grad_norm": 2.799457550048828, "learning_rate": 1.8547737884398644e-07, "loss": 0.0176, "step": 5187 }, { "epoch": 0.8183933430610877, "grad_norm": 5.2669782638549805, "learning_rate": 1.8531637417485108e-07, "loss": 0.0583, "step": 5188 }, { "epoch": 0.8185510904286785, "grad_norm": 1.8714468479156494, "learning_rate": 1.8515536950571568e-07, "loss": 0.0123, "step": 5189 }, { "epoch": 0.8187088377962692, "grad_norm": 3.939018964767456, "learning_rate": 1.8499436483658026e-07, "loss": 0.0358, "step": 5190 }, { "epoch": 0.8188665851638601, "grad_norm": 6.186793327331543, "learning_rate": 1.8483336016744487e-07, "loss": 0.0285, "step": 5191 }, { "epoch": 0.8190243325314509, "grad_norm": 2.6424143314361572, "learning_rate": 1.8467235549830945e-07, "loss": 0.0173, "step": 5192 }, { "epoch": 0.8191820798990417, "grad_norm": 4.678024768829346, "learning_rate": 1.8451135082917405e-07, "loss": 0.0517, "step": 5193 }, { "epoch": 0.8193398272666325, "grad_norm": 4.546450138092041, "learning_rate": 1.8435034616003863e-07, "loss": 0.0284, "step": 5194 }, { "epoch": 0.8194975746342232, "grad_norm": 5.08975887298584, "learning_rate": 1.8418934149090324e-07, "loss": 0.0222, "step": 5195 }, { "epoch": 0.8196553220018141, "grad_norm": 7.609801769256592, "learning_rate": 1.8402833682176782e-07, "loss": 0.0459, "step": 5196 }, { "epoch": 0.8198130693694049, "grad_norm": 4.874221324920654, "learning_rate": 1.8386733215263243e-07, "loss": 0.051, "step": 5197 }, { "epoch": 0.8199708167369957, "grad_norm": 6.313220500946045, "learning_rate": 1.83706327483497e-07, "loss": 0.0341, "step": 5198 }, { "epoch": 0.8201285641045865, "grad_norm": 4.605165481567383, "learning_rate": 1.835453228143616e-07, "loss": 0.0263, "step": 5199 }, { "epoch": 0.8202863114721773, "grad_norm": 2.869143009185791, "learning_rate": 1.833843181452262e-07, "loss": 0.0208, "step": 5200 }, { "epoch": 0.8204440588397681, "grad_norm": 4.082322120666504, "learning_rate": 1.832233134760908e-07, "loss": 0.0386, "step": 5201 }, { "epoch": 0.8206018062073589, "grad_norm": 7.996222496032715, "learning_rate": 1.8306230880695538e-07, "loss": 0.0393, "step": 5202 }, { "epoch": 0.8207595535749497, "grad_norm": 2.8665354251861572, "learning_rate": 1.8290130413781999e-07, "loss": 0.0491, "step": 5203 }, { "epoch": 0.8209173009425406, "grad_norm": 5.036356449127197, "learning_rate": 1.8274029946868456e-07, "loss": 0.0599, "step": 5204 }, { "epoch": 0.8210750483101313, "grad_norm": 4.692917823791504, "learning_rate": 1.8257929479954917e-07, "loss": 0.0389, "step": 5205 }, { "epoch": 0.8212327956777221, "grad_norm": 3.754473924636841, "learning_rate": 1.8241829013041378e-07, "loss": 0.0729, "step": 5206 }, { "epoch": 0.8213905430453129, "grad_norm": 6.935403823852539, "learning_rate": 1.8225728546127836e-07, "loss": 0.0671, "step": 5207 }, { "epoch": 0.8215482904129038, "grad_norm": 10.570488929748535, "learning_rate": 1.8209628079214296e-07, "loss": 0.0494, "step": 5208 }, { "epoch": 0.8217060377804946, "grad_norm": 5.963524341583252, "learning_rate": 1.8193527612300754e-07, "loss": 0.0522, "step": 5209 }, { "epoch": 0.8218637851480853, "grad_norm": 5.608829498291016, "learning_rate": 1.8177427145387215e-07, "loss": 0.0384, "step": 5210 }, { "epoch": 0.8220215325156761, "grad_norm": 6.981807708740234, "learning_rate": 1.8161326678473676e-07, "loss": 0.0441, "step": 5211 }, { "epoch": 0.822179279883267, "grad_norm": 5.255591869354248, "learning_rate": 1.8145226211560136e-07, "loss": 0.0339, "step": 5212 }, { "epoch": 0.8223370272508578, "grad_norm": 4.443271160125732, "learning_rate": 1.8129125744646594e-07, "loss": 0.0226, "step": 5213 }, { "epoch": 0.8224947746184486, "grad_norm": 1.6246569156646729, "learning_rate": 1.8113025277733055e-07, "loss": 0.0107, "step": 5214 }, { "epoch": 0.8226525219860393, "grad_norm": 2.8058571815490723, "learning_rate": 1.8096924810819513e-07, "loss": 0.0123, "step": 5215 }, { "epoch": 0.8228102693536301, "grad_norm": 3.151439666748047, "learning_rate": 1.8080824343905973e-07, "loss": 0.0323, "step": 5216 }, { "epoch": 0.822968016721221, "grad_norm": 2.786860942840576, "learning_rate": 1.8064723876992431e-07, "loss": 0.0376, "step": 5217 }, { "epoch": 0.8231257640888118, "grad_norm": 6.642549991607666, "learning_rate": 1.8048623410078892e-07, "loss": 0.0361, "step": 5218 }, { "epoch": 0.8232835114564026, "grad_norm": 5.647134304046631, "learning_rate": 1.803252294316535e-07, "loss": 0.0205, "step": 5219 }, { "epoch": 0.8234412588239933, "grad_norm": 2.1456258296966553, "learning_rate": 1.801642247625181e-07, "loss": 0.013, "step": 5220 }, { "epoch": 0.8235990061915842, "grad_norm": 3.01601243019104, "learning_rate": 1.8000322009338271e-07, "loss": 0.0324, "step": 5221 }, { "epoch": 0.823756753559175, "grad_norm": 4.266296863555908, "learning_rate": 1.798422154242473e-07, "loss": 0.0257, "step": 5222 }, { "epoch": 0.8239145009267658, "grad_norm": 7.477838039398193, "learning_rate": 1.796812107551119e-07, "loss": 0.0249, "step": 5223 }, { "epoch": 0.8240722482943565, "grad_norm": 4.718059062957764, "learning_rate": 1.7952020608597648e-07, "loss": 0.0673, "step": 5224 }, { "epoch": 0.8242299956619474, "grad_norm": 5.125923156738281, "learning_rate": 1.7935920141684109e-07, "loss": 0.0544, "step": 5225 }, { "epoch": 0.8243877430295382, "grad_norm": 4.098823547363281, "learning_rate": 1.7919819674770567e-07, "loss": 0.0362, "step": 5226 }, { "epoch": 0.824545490397129, "grad_norm": 2.892537832260132, "learning_rate": 1.7903719207857027e-07, "loss": 0.0105, "step": 5227 }, { "epoch": 0.8247032377647198, "grad_norm": 5.221391201019287, "learning_rate": 1.7887618740943485e-07, "loss": 0.0258, "step": 5228 }, { "epoch": 0.8248609851323107, "grad_norm": 6.827422618865967, "learning_rate": 1.7871518274029946e-07, "loss": 0.0796, "step": 5229 }, { "epoch": 0.8250187324999014, "grad_norm": 4.7586164474487305, "learning_rate": 1.7855417807116404e-07, "loss": 0.0388, "step": 5230 }, { "epoch": 0.8251764798674922, "grad_norm": 6.9503583908081055, "learning_rate": 1.7839317340202864e-07, "loss": 0.048, "step": 5231 }, { "epoch": 0.825334227235083, "grad_norm": 4.58003044128418, "learning_rate": 1.7823216873289322e-07, "loss": 0.0331, "step": 5232 }, { "epoch": 0.8254919746026739, "grad_norm": 5.491656303405762, "learning_rate": 1.7807116406375783e-07, "loss": 0.0615, "step": 5233 }, { "epoch": 0.8256497219702646, "grad_norm": 6.24632453918457, "learning_rate": 1.779101593946224e-07, "loss": 0.0404, "step": 5234 }, { "epoch": 0.8258074693378554, "grad_norm": 5.307912349700928, "learning_rate": 1.7774915472548704e-07, "loss": 0.0657, "step": 5235 }, { "epoch": 0.8259652167054462, "grad_norm": 3.9404730796813965, "learning_rate": 1.7758815005635165e-07, "loss": 0.0289, "step": 5236 }, { "epoch": 0.826122964073037, "grad_norm": 6.570104122161865, "learning_rate": 1.7742714538721623e-07, "loss": 0.0791, "step": 5237 }, { "epoch": 0.8262807114406279, "grad_norm": 4.156282901763916, "learning_rate": 1.7726614071808084e-07, "loss": 0.0244, "step": 5238 }, { "epoch": 0.8264384588082186, "grad_norm": 6.332943439483643, "learning_rate": 1.7710513604894542e-07, "loss": 0.0629, "step": 5239 }, { "epoch": 0.8265962061758094, "grad_norm": 6.7248616218566895, "learning_rate": 1.7694413137981002e-07, "loss": 0.0543, "step": 5240 }, { "epoch": 0.8267539535434002, "grad_norm": 4.556899070739746, "learning_rate": 1.767831267106746e-07, "loss": 0.0377, "step": 5241 }, { "epoch": 0.8269117009109911, "grad_norm": 5.604135513305664, "learning_rate": 1.766221220415392e-07, "loss": 0.0738, "step": 5242 }, { "epoch": 0.8270694482785819, "grad_norm": 2.578310012817383, "learning_rate": 1.764611173724038e-07, "loss": 0.0125, "step": 5243 }, { "epoch": 0.8272271956461726, "grad_norm": 4.124502182006836, "learning_rate": 1.763001127032684e-07, "loss": 0.0156, "step": 5244 }, { "epoch": 0.8273849430137634, "grad_norm": 3.3038899898529053, "learning_rate": 1.7613910803413297e-07, "loss": 0.019, "step": 5245 }, { "epoch": 0.8275426903813543, "grad_norm": 6.650944232940674, "learning_rate": 1.7597810336499758e-07, "loss": 0.0192, "step": 5246 }, { "epoch": 0.8277004377489451, "grad_norm": 7.623540878295898, "learning_rate": 1.7581709869586216e-07, "loss": 0.0686, "step": 5247 }, { "epoch": 0.8278581851165359, "grad_norm": 0.5060501098632812, "learning_rate": 1.7565609402672677e-07, "loss": 0.0036, "step": 5248 }, { "epoch": 0.8280159324841266, "grad_norm": 7.243015289306641, "learning_rate": 1.7549508935759135e-07, "loss": 0.0171, "step": 5249 }, { "epoch": 0.8281736798517175, "grad_norm": 5.172285556793213, "learning_rate": 1.7533408468845595e-07, "loss": 0.0337, "step": 5250 }, { "epoch": 0.8283314272193083, "grad_norm": 2.3561525344848633, "learning_rate": 1.7517308001932056e-07, "loss": 0.0355, "step": 5251 }, { "epoch": 0.8284891745868991, "grad_norm": 6.328184127807617, "learning_rate": 1.7501207535018514e-07, "loss": 0.0404, "step": 5252 }, { "epoch": 0.8286469219544899, "grad_norm": 3.560062885284424, "learning_rate": 1.7485107068104974e-07, "loss": 0.0185, "step": 5253 }, { "epoch": 0.8288046693220806, "grad_norm": 6.333250999450684, "learning_rate": 1.7469006601191432e-07, "loss": 0.0203, "step": 5254 }, { "epoch": 0.8289624166896715, "grad_norm": 4.132623672485352, "learning_rate": 1.7452906134277893e-07, "loss": 0.0472, "step": 5255 }, { "epoch": 0.8291201640572623, "grad_norm": 3.312601089477539, "learning_rate": 1.743680566736435e-07, "loss": 0.0277, "step": 5256 }, { "epoch": 0.8292779114248531, "grad_norm": 2.555586814880371, "learning_rate": 1.7420705200450812e-07, "loss": 0.0375, "step": 5257 }, { "epoch": 0.8294356587924439, "grad_norm": 5.3458476066589355, "learning_rate": 1.7404604733537272e-07, "loss": 0.0582, "step": 5258 }, { "epoch": 0.8295934061600347, "grad_norm": 4.48042106628418, "learning_rate": 1.7388504266623733e-07, "loss": 0.0634, "step": 5259 }, { "epoch": 0.8297511535276255, "grad_norm": 3.472592830657959, "learning_rate": 1.737240379971019e-07, "loss": 0.0282, "step": 5260 }, { "epoch": 0.8299089008952163, "grad_norm": 5.665646553039551, "learning_rate": 1.7356303332796652e-07, "loss": 0.0448, "step": 5261 }, { "epoch": 0.8300666482628071, "grad_norm": 3.956422805786133, "learning_rate": 1.734020286588311e-07, "loss": 0.0191, "step": 5262 }, { "epoch": 0.830224395630398, "grad_norm": 3.4300477504730225, "learning_rate": 1.732410239896957e-07, "loss": 0.0185, "step": 5263 }, { "epoch": 0.8303821429979887, "grad_norm": 5.574986934661865, "learning_rate": 1.7308001932056028e-07, "loss": 0.0243, "step": 5264 }, { "epoch": 0.8305398903655795, "grad_norm": 16.364606857299805, "learning_rate": 1.729190146514249e-07, "loss": 0.034, "step": 5265 }, { "epoch": 0.8306976377331703, "grad_norm": 2.714670419692993, "learning_rate": 1.727580099822895e-07, "loss": 0.0267, "step": 5266 }, { "epoch": 0.8308553851007612, "grad_norm": 4.444028377532959, "learning_rate": 1.7259700531315407e-07, "loss": 0.0221, "step": 5267 }, { "epoch": 0.831013132468352, "grad_norm": 7.142070770263672, "learning_rate": 1.7243600064401868e-07, "loss": 0.031, "step": 5268 }, { "epoch": 0.8311708798359427, "grad_norm": 6.350658893585205, "learning_rate": 1.7227499597488326e-07, "loss": 0.0409, "step": 5269 }, { "epoch": 0.8313286272035335, "grad_norm": 4.477631568908691, "learning_rate": 1.7211399130574787e-07, "loss": 0.0261, "step": 5270 }, { "epoch": 0.8314863745711244, "grad_norm": 4.212431907653809, "learning_rate": 1.7195298663661245e-07, "loss": 0.0339, "step": 5271 }, { "epoch": 0.8316441219387152, "grad_norm": 4.496842861175537, "learning_rate": 1.7179198196747705e-07, "loss": 0.0229, "step": 5272 }, { "epoch": 0.831801869306306, "grad_norm": 3.2212107181549072, "learning_rate": 1.7163097729834163e-07, "loss": 0.0382, "step": 5273 }, { "epoch": 0.8319596166738967, "grad_norm": 5.017549514770508, "learning_rate": 1.7146997262920624e-07, "loss": 0.0323, "step": 5274 }, { "epoch": 0.8321173640414875, "grad_norm": 3.7942099571228027, "learning_rate": 1.7130896796007082e-07, "loss": 0.0189, "step": 5275 }, { "epoch": 0.8322751114090784, "grad_norm": 6.848123550415039, "learning_rate": 1.7114796329093543e-07, "loss": 0.1382, "step": 5276 }, { "epoch": 0.8324328587766692, "grad_norm": 3.2568774223327637, "learning_rate": 1.709869586218e-07, "loss": 0.0191, "step": 5277 }, { "epoch": 0.83259060614426, "grad_norm": 3.9052367210388184, "learning_rate": 1.708259539526646e-07, "loss": 0.0481, "step": 5278 }, { "epoch": 0.8327483535118507, "grad_norm": 5.097133636474609, "learning_rate": 1.706649492835292e-07, "loss": 0.0609, "step": 5279 }, { "epoch": 0.8329061008794416, "grad_norm": 4.586767196655273, "learning_rate": 1.705039446143938e-07, "loss": 0.0763, "step": 5280 }, { "epoch": 0.8330638482470324, "grad_norm": 4.707311630249023, "learning_rate": 1.7034293994525843e-07, "loss": 0.0413, "step": 5281 }, { "epoch": 0.8332215956146232, "grad_norm": 5.52780818939209, "learning_rate": 1.70181935276123e-07, "loss": 0.0304, "step": 5282 }, { "epoch": 0.8333793429822139, "grad_norm": 4.097690105438232, "learning_rate": 1.7002093060698762e-07, "loss": 0.0355, "step": 5283 }, { "epoch": 0.8335370903498048, "grad_norm": 4.840817451477051, "learning_rate": 1.698599259378522e-07, "loss": 0.0417, "step": 5284 }, { "epoch": 0.8336948377173956, "grad_norm": 5.8652825355529785, "learning_rate": 1.696989212687168e-07, "loss": 0.0465, "step": 5285 }, { "epoch": 0.8338525850849864, "grad_norm": 4.693187236785889, "learning_rate": 1.6953791659958138e-07, "loss": 0.0599, "step": 5286 }, { "epoch": 0.8340103324525772, "grad_norm": 4.07180643081665, "learning_rate": 1.69376911930446e-07, "loss": 0.029, "step": 5287 }, { "epoch": 0.834168079820168, "grad_norm": 4.234261512756348, "learning_rate": 1.6921590726131057e-07, "loss": 0.0163, "step": 5288 }, { "epoch": 0.8343258271877588, "grad_norm": 3.8681535720825195, "learning_rate": 1.6905490259217518e-07, "loss": 0.0309, "step": 5289 }, { "epoch": 0.8344835745553496, "grad_norm": 6.554484844207764, "learning_rate": 1.6889389792303976e-07, "loss": 0.0334, "step": 5290 }, { "epoch": 0.8346413219229404, "grad_norm": 4.8437347412109375, "learning_rate": 1.6873289325390436e-07, "loss": 0.0338, "step": 5291 }, { "epoch": 0.8347990692905312, "grad_norm": 5.575812816619873, "learning_rate": 1.6857188858476894e-07, "loss": 0.0506, "step": 5292 }, { "epoch": 0.834956816658122, "grad_norm": 4.8221282958984375, "learning_rate": 1.6841088391563355e-07, "loss": 0.051, "step": 5293 }, { "epoch": 0.8351145640257128, "grad_norm": 5.950834274291992, "learning_rate": 1.6824987924649813e-07, "loss": 0.045, "step": 5294 }, { "epoch": 0.8352723113933036, "grad_norm": 7.225146293640137, "learning_rate": 1.6808887457736273e-07, "loss": 0.0799, "step": 5295 }, { "epoch": 0.8354300587608944, "grad_norm": 7.208869934082031, "learning_rate": 1.6792786990822731e-07, "loss": 0.0418, "step": 5296 }, { "epoch": 0.8355878061284853, "grad_norm": 5.060133934020996, "learning_rate": 1.6776686523909192e-07, "loss": 0.0589, "step": 5297 }, { "epoch": 0.835745553496076, "grad_norm": 4.54489278793335, "learning_rate": 1.6760586056995653e-07, "loss": 0.0413, "step": 5298 }, { "epoch": 0.8359033008636668, "grad_norm": 3.2688772678375244, "learning_rate": 1.674448559008211e-07, "loss": 0.0497, "step": 5299 }, { "epoch": 0.8360610482312576, "grad_norm": 5.300332069396973, "learning_rate": 1.672838512316857e-07, "loss": 0.0306, "step": 5300 }, { "epoch": 0.8362187955988485, "grad_norm": 3.340242862701416, "learning_rate": 1.671228465625503e-07, "loss": 0.0298, "step": 5301 }, { "epoch": 0.8363765429664393, "grad_norm": 6.670624256134033, "learning_rate": 1.669618418934149e-07, "loss": 0.0678, "step": 5302 }, { "epoch": 0.83653429033403, "grad_norm": 6.236144542694092, "learning_rate": 1.6680083722427948e-07, "loss": 0.0304, "step": 5303 }, { "epoch": 0.8366920377016208, "grad_norm": 6.884690761566162, "learning_rate": 1.666398325551441e-07, "loss": 0.063, "step": 5304 }, { "epoch": 0.8368497850692117, "grad_norm": 6.910121917724609, "learning_rate": 1.664788278860087e-07, "loss": 0.0861, "step": 5305 }, { "epoch": 0.8370075324368025, "grad_norm": 5.235419273376465, "learning_rate": 1.663178232168733e-07, "loss": 0.0339, "step": 5306 }, { "epoch": 0.8371652798043933, "grad_norm": 3.1473639011383057, "learning_rate": 1.6615681854773788e-07, "loss": 0.0244, "step": 5307 }, { "epoch": 0.837323027171984, "grad_norm": 3.6600468158721924, "learning_rate": 1.6599581387860248e-07, "loss": 0.0158, "step": 5308 }, { "epoch": 0.8374807745395749, "grad_norm": 2.8669915199279785, "learning_rate": 1.6583480920946706e-07, "loss": 0.0271, "step": 5309 }, { "epoch": 0.8376385219071657, "grad_norm": 2.6794514656066895, "learning_rate": 1.6567380454033167e-07, "loss": 0.0231, "step": 5310 }, { "epoch": 0.8377962692747565, "grad_norm": 4.003549098968506, "learning_rate": 1.6551279987119625e-07, "loss": 0.0417, "step": 5311 }, { "epoch": 0.8379540166423473, "grad_norm": 4.020390033721924, "learning_rate": 1.6535179520206086e-07, "loss": 0.0539, "step": 5312 }, { "epoch": 0.838111764009938, "grad_norm": 16.512651443481445, "learning_rate": 1.6519079053292546e-07, "loss": 0.0534, "step": 5313 }, { "epoch": 0.8382695113775289, "grad_norm": 4.025285243988037, "learning_rate": 1.6502978586379004e-07, "loss": 0.0322, "step": 5314 }, { "epoch": 0.8384272587451197, "grad_norm": 5.083260536193848, "learning_rate": 1.6486878119465465e-07, "loss": 0.0327, "step": 5315 }, { "epoch": 0.8385850061127105, "grad_norm": 6.532556056976318, "learning_rate": 1.6470777652551923e-07, "loss": 0.0248, "step": 5316 }, { "epoch": 0.8387427534803013, "grad_norm": 4.935925006866455, "learning_rate": 1.6454677185638383e-07, "loss": 0.0638, "step": 5317 }, { "epoch": 0.8389005008478921, "grad_norm": 3.878962516784668, "learning_rate": 1.6438576718724841e-07, "loss": 0.0259, "step": 5318 }, { "epoch": 0.8390582482154829, "grad_norm": 1.2626736164093018, "learning_rate": 1.6422476251811302e-07, "loss": 0.0073, "step": 5319 }, { "epoch": 0.8392159955830737, "grad_norm": 2.795637369155884, "learning_rate": 1.640637578489776e-07, "loss": 0.0282, "step": 5320 }, { "epoch": 0.8393737429506645, "grad_norm": 11.341633796691895, "learning_rate": 1.639027531798422e-07, "loss": 0.0723, "step": 5321 }, { "epoch": 0.8395314903182554, "grad_norm": 5.826061725616455, "learning_rate": 1.6374174851070679e-07, "loss": 0.0436, "step": 5322 }, { "epoch": 0.8396892376858461, "grad_norm": 4.152256965637207, "learning_rate": 1.635807438415714e-07, "loss": 0.0494, "step": 5323 }, { "epoch": 0.8398469850534369, "grad_norm": 4.842311859130859, "learning_rate": 1.6341973917243597e-07, "loss": 0.0235, "step": 5324 }, { "epoch": 0.8400047324210277, "grad_norm": 5.246630668640137, "learning_rate": 1.6325873450330058e-07, "loss": 0.0339, "step": 5325 }, { "epoch": 0.8401624797886186, "grad_norm": 5.315804481506348, "learning_rate": 1.6309772983416516e-07, "loss": 0.0516, "step": 5326 }, { "epoch": 0.8403202271562094, "grad_norm": 4.90289831161499, "learning_rate": 1.6293672516502977e-07, "loss": 0.0272, "step": 5327 }, { "epoch": 0.8404779745238001, "grad_norm": 9.051569938659668, "learning_rate": 1.627757204958944e-07, "loss": 0.0263, "step": 5328 }, { "epoch": 0.8406357218913909, "grad_norm": 7.007421970367432, "learning_rate": 1.6261471582675898e-07, "loss": 0.0619, "step": 5329 }, { "epoch": 0.8407934692589818, "grad_norm": 4.381661415100098, "learning_rate": 1.6245371115762358e-07, "loss": 0.054, "step": 5330 }, { "epoch": 0.8409512166265726, "grad_norm": 7.271567344665527, "learning_rate": 1.6229270648848816e-07, "loss": 0.0838, "step": 5331 }, { "epoch": 0.8411089639941634, "grad_norm": 3.0706746578216553, "learning_rate": 1.6213170181935277e-07, "loss": 0.0313, "step": 5332 }, { "epoch": 0.8412667113617541, "grad_norm": 3.5723705291748047, "learning_rate": 1.6197069715021735e-07, "loss": 0.0391, "step": 5333 }, { "epoch": 0.8414244587293449, "grad_norm": 3.5169169902801514, "learning_rate": 1.6180969248108196e-07, "loss": 0.0394, "step": 5334 }, { "epoch": 0.8415822060969358, "grad_norm": 9.128049850463867, "learning_rate": 1.6164868781194654e-07, "loss": 0.058, "step": 5335 }, { "epoch": 0.8417399534645266, "grad_norm": 4.708001136779785, "learning_rate": 1.6148768314281114e-07, "loss": 0.0326, "step": 5336 }, { "epoch": 0.8418977008321173, "grad_norm": 6.781467914581299, "learning_rate": 1.6132667847367572e-07, "loss": 0.0401, "step": 5337 }, { "epoch": 0.8420554481997081, "grad_norm": 7.906466007232666, "learning_rate": 1.6116567380454033e-07, "loss": 0.1097, "step": 5338 }, { "epoch": 0.842213195567299, "grad_norm": 6.7985053062438965, "learning_rate": 1.610046691354049e-07, "loss": 0.0895, "step": 5339 }, { "epoch": 0.8423709429348898, "grad_norm": 7.104395389556885, "learning_rate": 1.6084366446626951e-07, "loss": 0.027, "step": 5340 }, { "epoch": 0.8425286903024806, "grad_norm": 7.683440208435059, "learning_rate": 1.606826597971341e-07, "loss": 0.0694, "step": 5341 }, { "epoch": 0.8426864376700713, "grad_norm": 5.992101669311523, "learning_rate": 1.605216551279987e-07, "loss": 0.0298, "step": 5342 }, { "epoch": 0.8428441850376622, "grad_norm": 3.087597131729126, "learning_rate": 1.603606504588633e-07, "loss": 0.0214, "step": 5343 }, { "epoch": 0.843001932405253, "grad_norm": 2.7837986946105957, "learning_rate": 1.601996457897279e-07, "loss": 0.0121, "step": 5344 }, { "epoch": 0.8431596797728438, "grad_norm": 4.8075056076049805, "learning_rate": 1.600386411205925e-07, "loss": 0.0549, "step": 5345 }, { "epoch": 0.8433174271404346, "grad_norm": 5.946511745452881, "learning_rate": 1.5987763645145707e-07, "loss": 0.0535, "step": 5346 }, { "epoch": 0.8434751745080254, "grad_norm": 8.470784187316895, "learning_rate": 1.5971663178232168e-07, "loss": 0.0281, "step": 5347 }, { "epoch": 0.8436329218756162, "grad_norm": 3.7917988300323486, "learning_rate": 1.5955562711318626e-07, "loss": 0.0509, "step": 5348 }, { "epoch": 0.843790669243207, "grad_norm": 6.6693806648254395, "learning_rate": 1.5939462244405087e-07, "loss": 0.0705, "step": 5349 }, { "epoch": 0.8439484166107978, "grad_norm": 1.5791327953338623, "learning_rate": 1.5923361777491545e-07, "loss": 0.0124, "step": 5350 }, { "epoch": 0.8441061639783886, "grad_norm": 3.5993335247039795, "learning_rate": 1.5907261310578008e-07, "loss": 0.0396, "step": 5351 }, { "epoch": 0.8442639113459794, "grad_norm": 3.5184741020202637, "learning_rate": 1.5891160843664466e-07, "loss": 0.04, "step": 5352 }, { "epoch": 0.8444216587135702, "grad_norm": 8.009190559387207, "learning_rate": 1.5875060376750926e-07, "loss": 0.1181, "step": 5353 }, { "epoch": 0.844579406081161, "grad_norm": 5.545644760131836, "learning_rate": 1.5858959909837384e-07, "loss": 0.0656, "step": 5354 }, { "epoch": 0.8447371534487518, "grad_norm": 5.302511692047119, "learning_rate": 1.5842859442923845e-07, "loss": 0.0347, "step": 5355 }, { "epoch": 0.8448949008163427, "grad_norm": 5.997876167297363, "learning_rate": 1.5826758976010303e-07, "loss": 0.0379, "step": 5356 }, { "epoch": 0.8450526481839334, "grad_norm": 5.522672653198242, "learning_rate": 1.5810658509096764e-07, "loss": 0.0662, "step": 5357 }, { "epoch": 0.8452103955515242, "grad_norm": 4.134479522705078, "learning_rate": 1.5794558042183224e-07, "loss": 0.0398, "step": 5358 }, { "epoch": 0.845368142919115, "grad_norm": 3.993116617202759, "learning_rate": 1.5778457575269682e-07, "loss": 0.0573, "step": 5359 }, { "epoch": 0.8455258902867059, "grad_norm": 5.000351428985596, "learning_rate": 1.5762357108356143e-07, "loss": 0.059, "step": 5360 }, { "epoch": 0.8456836376542967, "grad_norm": 10.278120994567871, "learning_rate": 1.57462566414426e-07, "loss": 0.1084, "step": 5361 }, { "epoch": 0.8458413850218874, "grad_norm": 5.670780181884766, "learning_rate": 1.5730156174529062e-07, "loss": 0.0568, "step": 5362 }, { "epoch": 0.8459991323894782, "grad_norm": 3.0353498458862305, "learning_rate": 1.571405570761552e-07, "loss": 0.0196, "step": 5363 }, { "epoch": 0.8461568797570691, "grad_norm": 7.127408981323242, "learning_rate": 1.569795524070198e-07, "loss": 0.0673, "step": 5364 }, { "epoch": 0.8463146271246599, "grad_norm": 3.319708824157715, "learning_rate": 1.5681854773788438e-07, "loss": 0.0223, "step": 5365 }, { "epoch": 0.8464723744922507, "grad_norm": 3.6849279403686523, "learning_rate": 1.56657543068749e-07, "loss": 0.0148, "step": 5366 }, { "epoch": 0.8466301218598414, "grad_norm": 4.387140274047852, "learning_rate": 1.5649653839961357e-07, "loss": 0.0553, "step": 5367 }, { "epoch": 0.8467878692274323, "grad_norm": 6.39334774017334, "learning_rate": 1.5633553373047817e-07, "loss": 0.0516, "step": 5368 }, { "epoch": 0.8469456165950231, "grad_norm": 5.241713047027588, "learning_rate": 1.5617452906134275e-07, "loss": 0.0204, "step": 5369 }, { "epoch": 0.8471033639626139, "grad_norm": 4.3344197273254395, "learning_rate": 1.5601352439220736e-07, "loss": 0.0257, "step": 5370 }, { "epoch": 0.8472611113302047, "grad_norm": 1.5527998208999634, "learning_rate": 1.5585251972307194e-07, "loss": 0.0101, "step": 5371 }, { "epoch": 0.8474188586977954, "grad_norm": 4.683798313140869, "learning_rate": 1.5569151505393655e-07, "loss": 0.0377, "step": 5372 }, { "epoch": 0.8475766060653863, "grad_norm": 6.119536399841309, "learning_rate": 1.5553051038480113e-07, "loss": 0.0286, "step": 5373 }, { "epoch": 0.8477343534329771, "grad_norm": 9.703241348266602, "learning_rate": 1.5536950571566576e-07, "loss": 0.0563, "step": 5374 }, { "epoch": 0.8478921008005679, "grad_norm": 5.909456729888916, "learning_rate": 1.5520850104653037e-07, "loss": 0.06, "step": 5375 }, { "epoch": 0.8480498481681586, "grad_norm": 6.890008449554443, "learning_rate": 1.5504749637739495e-07, "loss": 0.0846, "step": 5376 }, { "epoch": 0.8480498481681586, "eval_accuracy": 0.9880505779990634, "eval_f1": 0.9880505779990634, "eval_loss": 0.038994282484054565, "eval_runtime": 4696.7739, "eval_samples_per_second": 43.19, "eval_steps_per_second": 2.7, "step": 5376 }, { "epoch": 0.8482075955357495, "grad_norm": 2.2304201126098633, "learning_rate": 1.5488649170825955e-07, "loss": 0.0505, "step": 5377 }, { "epoch": 0.8483653429033403, "grad_norm": 8.14090633392334, "learning_rate": 1.5472548703912413e-07, "loss": 0.052, "step": 5378 }, { "epoch": 0.8485230902709311, "grad_norm": 3.4045495986938477, "learning_rate": 1.5456448236998874e-07, "loss": 0.0606, "step": 5379 }, { "epoch": 0.8486808376385219, "grad_norm": 9.605330467224121, "learning_rate": 1.5440347770085332e-07, "loss": 0.0482, "step": 5380 }, { "epoch": 0.8488385850061128, "grad_norm": 3.838212013244629, "learning_rate": 1.5424247303171792e-07, "loss": 0.0585, "step": 5381 }, { "epoch": 0.8489963323737035, "grad_norm": 8.023015022277832, "learning_rate": 1.540814683625825e-07, "loss": 0.0737, "step": 5382 }, { "epoch": 0.8491540797412943, "grad_norm": 4.286295413970947, "learning_rate": 1.539204636934471e-07, "loss": 0.023, "step": 5383 }, { "epoch": 0.8493118271088851, "grad_norm": 3.6643738746643066, "learning_rate": 1.537594590243117e-07, "loss": 0.0323, "step": 5384 }, { "epoch": 0.849469574476476, "grad_norm": 4.660480976104736, "learning_rate": 1.535984543551763e-07, "loss": 0.0373, "step": 5385 }, { "epoch": 0.8496273218440668, "grad_norm": 7.360021114349365, "learning_rate": 1.5343744968604088e-07, "loss": 0.0304, "step": 5386 }, { "epoch": 0.8497850692116575, "grad_norm": 5.344607830047607, "learning_rate": 1.5327644501690548e-07, "loss": 0.0419, "step": 5387 }, { "epoch": 0.8499428165792483, "grad_norm": 4.966273784637451, "learning_rate": 1.5311544034777006e-07, "loss": 0.0308, "step": 5388 }, { "epoch": 0.8501005639468391, "grad_norm": 2.0803072452545166, "learning_rate": 1.5295443567863467e-07, "loss": 0.0167, "step": 5389 }, { "epoch": 0.85025831131443, "grad_norm": 4.4739532470703125, "learning_rate": 1.5279343100949927e-07, "loss": 0.0309, "step": 5390 }, { "epoch": 0.8504160586820207, "grad_norm": 4.332695960998535, "learning_rate": 1.5263242634036385e-07, "loss": 0.0297, "step": 5391 }, { "epoch": 0.8505738060496115, "grad_norm": 6.460211753845215, "learning_rate": 1.5247142167122846e-07, "loss": 0.0442, "step": 5392 }, { "epoch": 0.8507315534172023, "grad_norm": 5.7827067375183105, "learning_rate": 1.5231041700209304e-07, "loss": 0.0314, "step": 5393 }, { "epoch": 0.8508893007847932, "grad_norm": 3.955542802810669, "learning_rate": 1.5214941233295765e-07, "loss": 0.0606, "step": 5394 }, { "epoch": 0.851047048152384, "grad_norm": 8.374228477478027, "learning_rate": 1.5198840766382223e-07, "loss": 0.0665, "step": 5395 }, { "epoch": 0.8512047955199747, "grad_norm": 5.542219638824463, "learning_rate": 1.5182740299468683e-07, "loss": 0.0426, "step": 5396 }, { "epoch": 0.8513625428875655, "grad_norm": 3.1665616035461426, "learning_rate": 1.5166639832555144e-07, "loss": 0.0483, "step": 5397 }, { "epoch": 0.8515202902551564, "grad_norm": 7.315969467163086, "learning_rate": 1.5150539365641605e-07, "loss": 0.0549, "step": 5398 }, { "epoch": 0.8516780376227472, "grad_norm": 1.9205162525177002, "learning_rate": 1.5134438898728063e-07, "loss": 0.0366, "step": 5399 }, { "epoch": 0.851835784990338, "grad_norm": 4.792670249938965, "learning_rate": 1.5118338431814523e-07, "loss": 0.0555, "step": 5400 }, { "epoch": 0.8519935323579287, "grad_norm": 4.623531818389893, "learning_rate": 1.510223796490098e-07, "loss": 0.0252, "step": 5401 }, { "epoch": 0.8521512797255196, "grad_norm": 6.570959091186523, "learning_rate": 1.5086137497987442e-07, "loss": 0.031, "step": 5402 }, { "epoch": 0.8523090270931104, "grad_norm": 5.0001654624938965, "learning_rate": 1.50700370310739e-07, "loss": 0.0171, "step": 5403 }, { "epoch": 0.8524667744607012, "grad_norm": 2.375577449798584, "learning_rate": 1.505393656416036e-07, "loss": 0.0141, "step": 5404 }, { "epoch": 0.852624521828292, "grad_norm": 2.5384368896484375, "learning_rate": 1.503783609724682e-07, "loss": 0.0117, "step": 5405 }, { "epoch": 0.8527822691958828, "grad_norm": 4.021308422088623, "learning_rate": 1.502173563033328e-07, "loss": 0.0564, "step": 5406 }, { "epoch": 0.8529400165634736, "grad_norm": 3.8227062225341797, "learning_rate": 1.500563516341974e-07, "loss": 0.0469, "step": 5407 }, { "epoch": 0.8530977639310644, "grad_norm": 4.229830741882324, "learning_rate": 1.4989534696506198e-07, "loss": 0.0156, "step": 5408 }, { "epoch": 0.8532555112986552, "grad_norm": 5.805696964263916, "learning_rate": 1.4973434229592658e-07, "loss": 0.0552, "step": 5409 }, { "epoch": 0.853413258666246, "grad_norm": 4.083367824554443, "learning_rate": 1.4957333762679116e-07, "loss": 0.0477, "step": 5410 }, { "epoch": 0.8535710060338368, "grad_norm": 2.8954594135284424, "learning_rate": 1.4941233295765577e-07, "loss": 0.0396, "step": 5411 }, { "epoch": 0.8537287534014276, "grad_norm": 4.661309719085693, "learning_rate": 1.4925132828852035e-07, "loss": 0.0522, "step": 5412 }, { "epoch": 0.8538865007690184, "grad_norm": 7.065108299255371, "learning_rate": 1.4909032361938496e-07, "loss": 0.0316, "step": 5413 }, { "epoch": 0.8540442481366092, "grad_norm": 3.9269495010375977, "learning_rate": 1.4892931895024954e-07, "loss": 0.0467, "step": 5414 }, { "epoch": 0.8542019955042001, "grad_norm": 5.224636554718018, "learning_rate": 1.4876831428111414e-07, "loss": 0.0492, "step": 5415 }, { "epoch": 0.8543597428717908, "grad_norm": 3.75856351852417, "learning_rate": 1.4860730961197872e-07, "loss": 0.0448, "step": 5416 }, { "epoch": 0.8545174902393816, "grad_norm": 6.5601606369018555, "learning_rate": 1.4844630494284333e-07, "loss": 0.0275, "step": 5417 }, { "epoch": 0.8546752376069724, "grad_norm": 3.714090347290039, "learning_rate": 1.482853002737079e-07, "loss": 0.025, "step": 5418 }, { "epoch": 0.8548329849745633, "grad_norm": 8.23410701751709, "learning_rate": 1.4812429560457251e-07, "loss": 0.0547, "step": 5419 }, { "epoch": 0.8549907323421541, "grad_norm": 3.264073133468628, "learning_rate": 1.4796329093543712e-07, "loss": 0.019, "step": 5420 }, { "epoch": 0.8551484797097448, "grad_norm": 3.837836980819702, "learning_rate": 1.4780228626630173e-07, "loss": 0.0235, "step": 5421 }, { "epoch": 0.8553062270773356, "grad_norm": 8.193350791931152, "learning_rate": 1.4764128159716633e-07, "loss": 0.0353, "step": 5422 }, { "epoch": 0.8554639744449265, "grad_norm": 3.9345805644989014, "learning_rate": 1.474802769280309e-07, "loss": 0.0205, "step": 5423 }, { "epoch": 0.8556217218125173, "grad_norm": 4.9703898429870605, "learning_rate": 1.4731927225889552e-07, "loss": 0.0445, "step": 5424 }, { "epoch": 0.855779469180108, "grad_norm": 6.9233551025390625, "learning_rate": 1.471582675897601e-07, "loss": 0.0958, "step": 5425 }, { "epoch": 0.8559372165476988, "grad_norm": 4.656965732574463, "learning_rate": 1.469972629206247e-07, "loss": 0.0548, "step": 5426 }, { "epoch": 0.8560949639152896, "grad_norm": 6.408620834350586, "learning_rate": 1.4683625825148928e-07, "loss": 0.0559, "step": 5427 }, { "epoch": 0.8562527112828805, "grad_norm": 4.187609672546387, "learning_rate": 1.466752535823539e-07, "loss": 0.0439, "step": 5428 }, { "epoch": 0.8564104586504713, "grad_norm": 3.907073497772217, "learning_rate": 1.4651424891321847e-07, "loss": 0.0207, "step": 5429 }, { "epoch": 0.856568206018062, "grad_norm": 2.5755913257598877, "learning_rate": 1.4635324424408308e-07, "loss": 0.0192, "step": 5430 }, { "epoch": 0.8567259533856528, "grad_norm": 7.500949859619141, "learning_rate": 1.4619223957494766e-07, "loss": 0.0721, "step": 5431 }, { "epoch": 0.8568837007532437, "grad_norm": 4.847621440887451, "learning_rate": 1.4603123490581226e-07, "loss": 0.0414, "step": 5432 }, { "epoch": 0.8570414481208345, "grad_norm": 11.874192237854004, "learning_rate": 1.4587023023667684e-07, "loss": 0.064, "step": 5433 }, { "epoch": 0.8571991954884253, "grad_norm": 3.3340604305267334, "learning_rate": 1.4570922556754145e-07, "loss": 0.0541, "step": 5434 }, { "epoch": 0.857356942856016, "grad_norm": 3.7429940700531006, "learning_rate": 1.4554822089840606e-07, "loss": 0.0274, "step": 5435 }, { "epoch": 0.8575146902236069, "grad_norm": 5.898879051208496, "learning_rate": 1.4538721622927064e-07, "loss": 0.0472, "step": 5436 }, { "epoch": 0.8576724375911977, "grad_norm": 5.683361530303955, "learning_rate": 1.4522621156013524e-07, "loss": 0.0394, "step": 5437 }, { "epoch": 0.8578301849587885, "grad_norm": 6.243908882141113, "learning_rate": 1.4506520689099982e-07, "loss": 0.0507, "step": 5438 }, { "epoch": 0.8579879323263793, "grad_norm": 8.795913696289062, "learning_rate": 1.4490420222186443e-07, "loss": 0.0618, "step": 5439 }, { "epoch": 0.8581456796939702, "grad_norm": 11.45658016204834, "learning_rate": 1.44743197552729e-07, "loss": 0.0555, "step": 5440 }, { "epoch": 0.8583034270615609, "grad_norm": 5.12626838684082, "learning_rate": 1.4458219288359361e-07, "loss": 0.026, "step": 5441 }, { "epoch": 0.8584611744291517, "grad_norm": 7.450155735015869, "learning_rate": 1.444211882144582e-07, "loss": 0.0952, "step": 5442 }, { "epoch": 0.8586189217967425, "grad_norm": 4.901927471160889, "learning_rate": 1.442601835453228e-07, "loss": 0.0459, "step": 5443 }, { "epoch": 0.8587766691643334, "grad_norm": 5.0577898025512695, "learning_rate": 1.440991788761874e-07, "loss": 0.0187, "step": 5444 }, { "epoch": 0.8589344165319241, "grad_norm": 5.3275580406188965, "learning_rate": 1.4393817420705201e-07, "loss": 0.0212, "step": 5445 }, { "epoch": 0.8590921638995149, "grad_norm": 3.313403367996216, "learning_rate": 1.437771695379166e-07, "loss": 0.0366, "step": 5446 }, { "epoch": 0.8592499112671057, "grad_norm": 9.613713264465332, "learning_rate": 1.436161648687812e-07, "loss": 0.0365, "step": 5447 }, { "epoch": 0.8594076586346965, "grad_norm": 4.729363918304443, "learning_rate": 1.4345516019964578e-07, "loss": 0.0644, "step": 5448 }, { "epoch": 0.8595654060022874, "grad_norm": 7.8069539070129395, "learning_rate": 1.4329415553051039e-07, "loss": 0.0562, "step": 5449 }, { "epoch": 0.8597231533698781, "grad_norm": 3.9969279766082764, "learning_rate": 1.43133150861375e-07, "loss": 0.0683, "step": 5450 }, { "epoch": 0.8598809007374689, "grad_norm": 7.209540367126465, "learning_rate": 1.4297214619223957e-07, "loss": 0.0496, "step": 5451 }, { "epoch": 0.8600386481050597, "grad_norm": 2.844700813293457, "learning_rate": 1.4281114152310418e-07, "loss": 0.0204, "step": 5452 }, { "epoch": 0.8601963954726506, "grad_norm": 6.2221455574035645, "learning_rate": 1.4265013685396876e-07, "loss": 0.0448, "step": 5453 }, { "epoch": 0.8603541428402414, "grad_norm": 5.791203498840332, "learning_rate": 1.4248913218483336e-07, "loss": 0.0466, "step": 5454 }, { "epoch": 0.8605118902078321, "grad_norm": 4.409770488739014, "learning_rate": 1.4232812751569794e-07, "loss": 0.0212, "step": 5455 }, { "epoch": 0.8606696375754229, "grad_norm": 3.2753994464874268, "learning_rate": 1.4216712284656255e-07, "loss": 0.024, "step": 5456 }, { "epoch": 0.8608273849430138, "grad_norm": 6.3212127685546875, "learning_rate": 1.4200611817742713e-07, "loss": 0.0755, "step": 5457 }, { "epoch": 0.8609851323106046, "grad_norm": 3.5120387077331543, "learning_rate": 1.4184511350829174e-07, "loss": 0.0335, "step": 5458 }, { "epoch": 0.8611428796781954, "grad_norm": 3.85371732711792, "learning_rate": 1.4168410883915632e-07, "loss": 0.0366, "step": 5459 }, { "epoch": 0.8613006270457861, "grad_norm": 6.459498405456543, "learning_rate": 1.4152310417002092e-07, "loss": 0.0612, "step": 5460 }, { "epoch": 0.861458374413377, "grad_norm": 5.595515251159668, "learning_rate": 1.413620995008855e-07, "loss": 0.0859, "step": 5461 }, { "epoch": 0.8616161217809678, "grad_norm": 4.2606987953186035, "learning_rate": 1.412010948317501e-07, "loss": 0.0686, "step": 5462 }, { "epoch": 0.8617738691485586, "grad_norm": 5.481800556182861, "learning_rate": 1.410400901626147e-07, "loss": 0.0427, "step": 5463 }, { "epoch": 0.8619316165161494, "grad_norm": 4.921341896057129, "learning_rate": 1.408790854934793e-07, "loss": 0.0245, "step": 5464 }, { "epoch": 0.8620893638837402, "grad_norm": 4.414101600646973, "learning_rate": 1.4071808082434387e-07, "loss": 0.0324, "step": 5465 }, { "epoch": 0.862247111251331, "grad_norm": 6.160189628601074, "learning_rate": 1.4055707615520848e-07, "loss": 0.0534, "step": 5466 }, { "epoch": 0.8624048586189218, "grad_norm": 3.8047962188720703, "learning_rate": 1.4039607148607311e-07, "loss": 0.0171, "step": 5467 }, { "epoch": 0.8625626059865126, "grad_norm": 4.260643482208252, "learning_rate": 1.402350668169377e-07, "loss": 0.0369, "step": 5468 }, { "epoch": 0.8627203533541034, "grad_norm": 3.534762382507324, "learning_rate": 1.400740621478023e-07, "loss": 0.0357, "step": 5469 }, { "epoch": 0.8628781007216942, "grad_norm": 3.0478262901306152, "learning_rate": 1.3991305747866688e-07, "loss": 0.0175, "step": 5470 }, { "epoch": 0.863035848089285, "grad_norm": 6.904594898223877, "learning_rate": 1.3975205280953149e-07, "loss": 0.077, "step": 5471 }, { "epoch": 0.8631935954568758, "grad_norm": 9.13009262084961, "learning_rate": 1.3959104814039607e-07, "loss": 0.0544, "step": 5472 }, { "epoch": 0.8633513428244666, "grad_norm": 6.062447547912598, "learning_rate": 1.3943004347126067e-07, "loss": 0.0977, "step": 5473 }, { "epoch": 0.8635090901920575, "grad_norm": 5.987272262573242, "learning_rate": 1.3926903880212525e-07, "loss": 0.0302, "step": 5474 }, { "epoch": 0.8636668375596482, "grad_norm": 4.56599760055542, "learning_rate": 1.3910803413298986e-07, "loss": 0.0235, "step": 5475 }, { "epoch": 0.863824584927239, "grad_norm": 5.920708179473877, "learning_rate": 1.3894702946385444e-07, "loss": 0.0559, "step": 5476 }, { "epoch": 0.8639823322948298, "grad_norm": 4.275696754455566, "learning_rate": 1.3878602479471904e-07, "loss": 0.0248, "step": 5477 }, { "epoch": 0.8641400796624207, "grad_norm": 6.5294270515441895, "learning_rate": 1.3862502012558362e-07, "loss": 0.0582, "step": 5478 }, { "epoch": 0.8642978270300115, "grad_norm": 5.760613918304443, "learning_rate": 1.3846401545644823e-07, "loss": 0.0661, "step": 5479 }, { "epoch": 0.8644555743976022, "grad_norm": 4.418890953063965, "learning_rate": 1.383030107873128e-07, "loss": 0.0307, "step": 5480 }, { "epoch": 0.864613321765193, "grad_norm": 3.454737424850464, "learning_rate": 1.3814200611817742e-07, "loss": 0.0341, "step": 5481 }, { "epoch": 0.8647710691327839, "grad_norm": 3.304800510406494, "learning_rate": 1.3798100144904202e-07, "loss": 0.0207, "step": 5482 }, { "epoch": 0.8649288165003747, "grad_norm": 5.844666481018066, "learning_rate": 1.378199967799066e-07, "loss": 0.0622, "step": 5483 }, { "epoch": 0.8650865638679655, "grad_norm": 2.9924535751342773, "learning_rate": 1.376589921107712e-07, "loss": 0.0347, "step": 5484 }, { "epoch": 0.8652443112355562, "grad_norm": 5.538552284240723, "learning_rate": 1.374979874416358e-07, "loss": 0.0497, "step": 5485 }, { "epoch": 0.865402058603147, "grad_norm": 5.983892917633057, "learning_rate": 1.373369827725004e-07, "loss": 0.1164, "step": 5486 }, { "epoch": 0.8655598059707379, "grad_norm": 3.1277856826782227, "learning_rate": 1.3717597810336498e-07, "loss": 0.0589, "step": 5487 }, { "epoch": 0.8657175533383287, "grad_norm": 3.137880325317383, "learning_rate": 1.3701497343422958e-07, "loss": 0.0211, "step": 5488 }, { "epoch": 0.8658753007059194, "grad_norm": 7.256457328796387, "learning_rate": 1.3685396876509416e-07, "loss": 0.0381, "step": 5489 }, { "epoch": 0.8660330480735102, "grad_norm": 5.043852806091309, "learning_rate": 1.366929640959588e-07, "loss": 0.0212, "step": 5490 }, { "epoch": 0.8661907954411011, "grad_norm": 11.189506530761719, "learning_rate": 1.3653195942682337e-07, "loss": 0.0846, "step": 5491 }, { "epoch": 0.8663485428086919, "grad_norm": 5.468672752380371, "learning_rate": 1.3637095475768798e-07, "loss": 0.1226, "step": 5492 }, { "epoch": 0.8665062901762827, "grad_norm": 3.6421420574188232, "learning_rate": 1.3620995008855256e-07, "loss": 0.0461, "step": 5493 }, { "epoch": 0.8666640375438734, "grad_norm": 4.256086349487305, "learning_rate": 1.3604894541941717e-07, "loss": 0.0468, "step": 5494 }, { "epoch": 0.8668217849114643, "grad_norm": 3.2741804122924805, "learning_rate": 1.3588794075028175e-07, "loss": 0.0261, "step": 5495 }, { "epoch": 0.8669795322790551, "grad_norm": 6.785216808319092, "learning_rate": 1.3572693608114635e-07, "loss": 0.0376, "step": 5496 }, { "epoch": 0.8671372796466459, "grad_norm": 8.623574256896973, "learning_rate": 1.3556593141201096e-07, "loss": 0.0441, "step": 5497 }, { "epoch": 0.8672950270142367, "grad_norm": 4.828185558319092, "learning_rate": 1.3540492674287554e-07, "loss": 0.0513, "step": 5498 }, { "epoch": 0.8674527743818276, "grad_norm": 6.098627090454102, "learning_rate": 1.3524392207374015e-07, "loss": 0.0362, "step": 5499 }, { "epoch": 0.8676105217494183, "grad_norm": 3.566786289215088, "learning_rate": 1.3508291740460473e-07, "loss": 0.0375, "step": 5500 }, { "epoch": 0.8677682691170091, "grad_norm": 5.9243550300598145, "learning_rate": 1.3492191273546933e-07, "loss": 0.0663, "step": 5501 }, { "epoch": 0.8679260164845999, "grad_norm": 4.2992048263549805, "learning_rate": 1.347609080663339e-07, "loss": 0.0516, "step": 5502 }, { "epoch": 0.8680837638521908, "grad_norm": 3.649446964263916, "learning_rate": 1.3459990339719852e-07, "loss": 0.0556, "step": 5503 }, { "epoch": 0.8682415112197815, "grad_norm": 3.712911367416382, "learning_rate": 1.344388987280631e-07, "loss": 0.0167, "step": 5504 }, { "epoch": 0.8683992585873723, "grad_norm": 2.6621758937835693, "learning_rate": 1.342778940589277e-07, "loss": 0.019, "step": 5505 }, { "epoch": 0.8685570059549631, "grad_norm": 5.428889274597168, "learning_rate": 1.3411688938979228e-07, "loss": 0.0389, "step": 5506 }, { "epoch": 0.8687147533225539, "grad_norm": 2.7606325149536133, "learning_rate": 1.339558847206569e-07, "loss": 0.0338, "step": 5507 }, { "epoch": 0.8688725006901448, "grad_norm": 4.508447170257568, "learning_rate": 1.3379488005152147e-07, "loss": 0.027, "step": 5508 }, { "epoch": 0.8690302480577355, "grad_norm": 3.3050265312194824, "learning_rate": 1.3363387538238608e-07, "loss": 0.0173, "step": 5509 }, { "epoch": 0.8691879954253263, "grad_norm": 4.372236251831055, "learning_rate": 1.3347287071325066e-07, "loss": 0.0327, "step": 5510 }, { "epoch": 0.8693457427929171, "grad_norm": 2.6069273948669434, "learning_rate": 1.3331186604411526e-07, "loss": 0.0166, "step": 5511 }, { "epoch": 0.869503490160508, "grad_norm": 3.1929118633270264, "learning_rate": 1.3315086137497987e-07, "loss": 0.0508, "step": 5512 }, { "epoch": 0.8696612375280988, "grad_norm": 4.904454708099365, "learning_rate": 1.3298985670584445e-07, "loss": 0.0689, "step": 5513 }, { "epoch": 0.8698189848956895, "grad_norm": 3.687384605407715, "learning_rate": 1.3282885203670908e-07, "loss": 0.0288, "step": 5514 }, { "epoch": 0.8699767322632803, "grad_norm": 3.5571999549865723, "learning_rate": 1.3266784736757366e-07, "loss": 0.0211, "step": 5515 }, { "epoch": 0.8701344796308712, "grad_norm": 5.523207664489746, "learning_rate": 1.3250684269843827e-07, "loss": 0.0323, "step": 5516 }, { "epoch": 0.870292226998462, "grad_norm": 2.4000661373138428, "learning_rate": 1.3234583802930285e-07, "loss": 0.012, "step": 5517 }, { "epoch": 0.8704499743660528, "grad_norm": 4.749390602111816, "learning_rate": 1.3218483336016745e-07, "loss": 0.0679, "step": 5518 }, { "epoch": 0.8706077217336435, "grad_norm": 6.371307373046875, "learning_rate": 1.3202382869103203e-07, "loss": 0.037, "step": 5519 }, { "epoch": 0.8707654691012344, "grad_norm": 7.201481819152832, "learning_rate": 1.3186282402189664e-07, "loss": 0.0438, "step": 5520 }, { "epoch": 0.8709232164688252, "grad_norm": 3.805671215057373, "learning_rate": 1.3170181935276122e-07, "loss": 0.0228, "step": 5521 }, { "epoch": 0.871080963836416, "grad_norm": 6.359230041503906, "learning_rate": 1.3154081468362583e-07, "loss": 0.0374, "step": 5522 }, { "epoch": 0.8712387112040068, "grad_norm": 4.945654392242432, "learning_rate": 1.313798100144904e-07, "loss": 0.0359, "step": 5523 }, { "epoch": 0.8713964585715975, "grad_norm": 11.090949058532715, "learning_rate": 1.31218805345355e-07, "loss": 0.0652, "step": 5524 }, { "epoch": 0.8715542059391884, "grad_norm": 4.84529447555542, "learning_rate": 1.310578006762196e-07, "loss": 0.0421, "step": 5525 }, { "epoch": 0.8717119533067792, "grad_norm": 4.299323081970215, "learning_rate": 1.308967960070842e-07, "loss": 0.0262, "step": 5526 }, { "epoch": 0.87186970067437, "grad_norm": 4.873415946960449, "learning_rate": 1.307357913379488e-07, "loss": 0.0235, "step": 5527 }, { "epoch": 0.8720274480419608, "grad_norm": 5.196324348449707, "learning_rate": 1.3057478666881338e-07, "loss": 0.0603, "step": 5528 }, { "epoch": 0.8721851954095516, "grad_norm": 4.5741353034973145, "learning_rate": 1.30413781999678e-07, "loss": 0.0368, "step": 5529 }, { "epoch": 0.8723429427771424, "grad_norm": 4.3620924949646, "learning_rate": 1.3025277733054257e-07, "loss": 0.0488, "step": 5530 }, { "epoch": 0.8725006901447332, "grad_norm": 3.900864362716675, "learning_rate": 1.3009177266140718e-07, "loss": 0.0229, "step": 5531 }, { "epoch": 0.872658437512324, "grad_norm": 3.649635076522827, "learning_rate": 1.2993076799227176e-07, "loss": 0.0163, "step": 5532 }, { "epoch": 0.8728161848799149, "grad_norm": 3.126142740249634, "learning_rate": 1.2976976332313636e-07, "loss": 0.014, "step": 5533 }, { "epoch": 0.8729739322475056, "grad_norm": 5.796621799468994, "learning_rate": 1.2960875865400094e-07, "loss": 0.0625, "step": 5534 }, { "epoch": 0.8731316796150964, "grad_norm": 4.14392614364624, "learning_rate": 1.2944775398486555e-07, "loss": 0.0959, "step": 5535 }, { "epoch": 0.8732894269826872, "grad_norm": 5.45209264755249, "learning_rate": 1.2928674931573013e-07, "loss": 0.0518, "step": 5536 }, { "epoch": 0.8734471743502781, "grad_norm": 5.426156044006348, "learning_rate": 1.2912574464659476e-07, "loss": 0.0293, "step": 5537 }, { "epoch": 0.8736049217178689, "grad_norm": 2.479907274246216, "learning_rate": 1.2896473997745934e-07, "loss": 0.0229, "step": 5538 }, { "epoch": 0.8737626690854596, "grad_norm": 11.546285629272461, "learning_rate": 1.2880373530832395e-07, "loss": 0.0648, "step": 5539 }, { "epoch": 0.8739204164530504, "grad_norm": 3.793890953063965, "learning_rate": 1.2864273063918853e-07, "loss": 0.0261, "step": 5540 }, { "epoch": 0.8740781638206413, "grad_norm": 3.6029372215270996, "learning_rate": 1.2848172597005313e-07, "loss": 0.0133, "step": 5541 }, { "epoch": 0.8742359111882321, "grad_norm": 5.791674613952637, "learning_rate": 1.2832072130091774e-07, "loss": 0.0385, "step": 5542 }, { "epoch": 0.8743936585558229, "grad_norm": 5.276854991912842, "learning_rate": 1.2815971663178232e-07, "loss": 0.0487, "step": 5543 }, { "epoch": 0.8745514059234136, "grad_norm": 2.5421369075775146, "learning_rate": 1.2799871196264693e-07, "loss": 0.0203, "step": 5544 }, { "epoch": 0.8747091532910044, "grad_norm": 7.317483425140381, "learning_rate": 1.278377072935115e-07, "loss": 0.0333, "step": 5545 }, { "epoch": 0.8748669006585953, "grad_norm": 4.536862850189209, "learning_rate": 1.276767026243761e-07, "loss": 0.0141, "step": 5546 }, { "epoch": 0.8750246480261861, "grad_norm": 5.074760437011719, "learning_rate": 1.275156979552407e-07, "loss": 0.053, "step": 5547 }, { "epoch": 0.8751823953937768, "grad_norm": 3.548044204711914, "learning_rate": 1.273546932861053e-07, "loss": 0.0801, "step": 5548 }, { "epoch": 0.8753401427613676, "grad_norm": 1.4778392314910889, "learning_rate": 1.2719368861696988e-07, "loss": 0.0175, "step": 5549 }, { "epoch": 0.8754978901289585, "grad_norm": 3.6699726581573486, "learning_rate": 1.2703268394783449e-07, "loss": 0.0194, "step": 5550 }, { "epoch": 0.8756556374965493, "grad_norm": 7.689450263977051, "learning_rate": 1.2687167927869907e-07, "loss": 0.031, "step": 5551 }, { "epoch": 0.8758133848641401, "grad_norm": 3.877007246017456, "learning_rate": 1.2671067460956367e-07, "loss": 0.0222, "step": 5552 }, { "epoch": 0.8759711322317308, "grad_norm": 4.741959571838379, "learning_rate": 1.2654966994042825e-07, "loss": 0.0274, "step": 5553 }, { "epoch": 0.8761288795993217, "grad_norm": 7.20140266418457, "learning_rate": 1.2638866527129286e-07, "loss": 0.0272, "step": 5554 }, { "epoch": 0.8762866269669125, "grad_norm": 7.812570095062256, "learning_rate": 1.2622766060215744e-07, "loss": 0.0382, "step": 5555 }, { "epoch": 0.8764443743345033, "grad_norm": 4.745429992675781, "learning_rate": 1.2606665593302204e-07, "loss": 0.0347, "step": 5556 }, { "epoch": 0.8766021217020941, "grad_norm": 6.139153957366943, "learning_rate": 1.2590565126388662e-07, "loss": 0.0167, "step": 5557 }, { "epoch": 0.876759869069685, "grad_norm": 5.597256660461426, "learning_rate": 1.2574464659475123e-07, "loss": 0.0236, "step": 5558 }, { "epoch": 0.8769176164372757, "grad_norm": 7.539785385131836, "learning_rate": 1.2558364192561584e-07, "loss": 0.0815, "step": 5559 }, { "epoch": 0.8770753638048665, "grad_norm": 3.764596462249756, "learning_rate": 1.2542263725648044e-07, "loss": 0.025, "step": 5560 }, { "epoch": 0.8772331111724573, "grad_norm": 4.3173089027404785, "learning_rate": 1.2526163258734505e-07, "loss": 0.0335, "step": 5561 }, { "epoch": 0.8773908585400481, "grad_norm": 11.727649688720703, "learning_rate": 1.2510062791820963e-07, "loss": 0.0641, "step": 5562 }, { "epoch": 0.877548605907639, "grad_norm": 5.548543930053711, "learning_rate": 1.249396232490742e-07, "loss": 0.0704, "step": 5563 }, { "epoch": 0.8777063532752297, "grad_norm": 5.145328044891357, "learning_rate": 1.2477861857993881e-07, "loss": 0.0552, "step": 5564 }, { "epoch": 0.8778641006428205, "grad_norm": 5.675923824310303, "learning_rate": 1.246176139108034e-07, "loss": 0.077, "step": 5565 }, { "epoch": 0.8780218480104113, "grad_norm": 4.026497840881348, "learning_rate": 1.24456609241668e-07, "loss": 0.047, "step": 5566 }, { "epoch": 0.8781795953780022, "grad_norm": 3.609802484512329, "learning_rate": 1.242956045725326e-07, "loss": 0.0405, "step": 5567 }, { "epoch": 0.8783373427455929, "grad_norm": 5.737876892089844, "learning_rate": 1.241345999033972e-07, "loss": 0.0442, "step": 5568 }, { "epoch": 0.8784950901131837, "grad_norm": 3.8690743446350098, "learning_rate": 1.239735952342618e-07, "loss": 0.0143, "step": 5569 }, { "epoch": 0.8786528374807745, "grad_norm": 3.8035223484039307, "learning_rate": 1.2381259056512637e-07, "loss": 0.0281, "step": 5570 }, { "epoch": 0.8788105848483654, "grad_norm": 4.356854438781738, "learning_rate": 1.2365158589599098e-07, "loss": 0.0248, "step": 5571 }, { "epoch": 0.8789683322159562, "grad_norm": 4.11252498626709, "learning_rate": 1.2349058122685556e-07, "loss": 0.0224, "step": 5572 }, { "epoch": 0.8791260795835469, "grad_norm": 4.7247843742370605, "learning_rate": 1.2332957655772017e-07, "loss": 0.0334, "step": 5573 }, { "epoch": 0.8792838269511377, "grad_norm": 7.541321754455566, "learning_rate": 1.2316857188858477e-07, "loss": 0.0474, "step": 5574 }, { "epoch": 0.8794415743187286, "grad_norm": 5.661472797393799, "learning_rate": 1.2300756721944935e-07, "loss": 0.0308, "step": 5575 }, { "epoch": 0.8795993216863194, "grad_norm": 2.8416740894317627, "learning_rate": 1.2284656255031396e-07, "loss": 0.0192, "step": 5576 }, { "epoch": 0.8797570690539102, "grad_norm": 10.018465995788574, "learning_rate": 1.2268555788117854e-07, "loss": 0.0559, "step": 5577 }, { "epoch": 0.8799148164215009, "grad_norm": 6.063082695007324, "learning_rate": 1.2252455321204314e-07, "loss": 0.0564, "step": 5578 }, { "epoch": 0.8800725637890918, "grad_norm": 4.855288982391357, "learning_rate": 1.2236354854290775e-07, "loss": 0.0629, "step": 5579 }, { "epoch": 0.8802303111566826, "grad_norm": 1.4225459098815918, "learning_rate": 1.2220254387377233e-07, "loss": 0.0077, "step": 5580 }, { "epoch": 0.8803880585242734, "grad_norm": 4.527211666107178, "learning_rate": 1.2204153920463694e-07, "loss": 0.0538, "step": 5581 }, { "epoch": 0.8805458058918642, "grad_norm": 7.736018180847168, "learning_rate": 1.2188053453550152e-07, "loss": 0.0576, "step": 5582 }, { "epoch": 0.8807035532594549, "grad_norm": 5.84458589553833, "learning_rate": 1.2171952986636612e-07, "loss": 0.0382, "step": 5583 }, { "epoch": 0.8808613006270458, "grad_norm": 2.6455740928649902, "learning_rate": 1.215585251972307e-07, "loss": 0.0397, "step": 5584 }, { "epoch": 0.8810190479946366, "grad_norm": 4.986097812652588, "learning_rate": 1.213975205280953e-07, "loss": 0.0612, "step": 5585 }, { "epoch": 0.8811767953622274, "grad_norm": 6.404510498046875, "learning_rate": 1.212365158589599e-07, "loss": 0.0222, "step": 5586 }, { "epoch": 0.8813345427298181, "grad_norm": 5.908301830291748, "learning_rate": 1.210755111898245e-07, "loss": 0.0553, "step": 5587 }, { "epoch": 0.881492290097409, "grad_norm": 7.566300392150879, "learning_rate": 1.209145065206891e-07, "loss": 0.0942, "step": 5588 }, { "epoch": 0.8816500374649998, "grad_norm": 2.8671958446502686, "learning_rate": 1.207535018515537e-07, "loss": 0.035, "step": 5589 }, { "epoch": 0.8818077848325906, "grad_norm": 3.029541015625, "learning_rate": 1.205924971824183e-07, "loss": 0.0174, "step": 5590 }, { "epoch": 0.8819655322001814, "grad_norm": 6.2882537841796875, "learning_rate": 1.204314925132829e-07, "loss": 0.0445, "step": 5591 }, { "epoch": 0.8821232795677723, "grad_norm": 9.018379211425781, "learning_rate": 1.2027048784414747e-07, "loss": 0.0387, "step": 5592 }, { "epoch": 0.882281026935363, "grad_norm": 3.9891650676727295, "learning_rate": 1.2010948317501208e-07, "loss": 0.0155, "step": 5593 }, { "epoch": 0.8824387743029538, "grad_norm": 3.620929479598999, "learning_rate": 1.1994847850587666e-07, "loss": 0.031, "step": 5594 }, { "epoch": 0.8825965216705446, "grad_norm": 3.1831936836242676, "learning_rate": 1.1978747383674127e-07, "loss": 0.0383, "step": 5595 }, { "epoch": 0.8827542690381355, "grad_norm": 4.132047176361084, "learning_rate": 1.1962646916760585e-07, "loss": 0.0222, "step": 5596 }, { "epoch": 0.8829120164057263, "grad_norm": 7.14155912399292, "learning_rate": 1.1946546449847045e-07, "loss": 0.0605, "step": 5597 }, { "epoch": 0.883069763773317, "grad_norm": 19.78788948059082, "learning_rate": 1.1930445982933503e-07, "loss": 0.0424, "step": 5598 }, { "epoch": 0.8832275111409078, "grad_norm": 2.621591091156006, "learning_rate": 1.1914345516019964e-07, "loss": 0.0312, "step": 5599 }, { "epoch": 0.8833852585084987, "grad_norm": 5.4610066413879395, "learning_rate": 1.1898245049106423e-07, "loss": 0.0424, "step": 5600 }, { "epoch": 0.8835430058760895, "grad_norm": 5.091216087341309, "learning_rate": 1.1882144582192884e-07, "loss": 0.0666, "step": 5601 }, { "epoch": 0.8837007532436802, "grad_norm": 7.490744113922119, "learning_rate": 1.1866044115279343e-07, "loss": 0.0576, "step": 5602 }, { "epoch": 0.883858500611271, "grad_norm": 6.292675495147705, "learning_rate": 1.1849943648365802e-07, "loss": 0.044, "step": 5603 }, { "epoch": 0.8840162479788618, "grad_norm": 4.969595909118652, "learning_rate": 1.1833843181452262e-07, "loss": 0.0326, "step": 5604 }, { "epoch": 0.8841739953464527, "grad_norm": 4.043492317199707, "learning_rate": 1.1817742714538721e-07, "loss": 0.0191, "step": 5605 }, { "epoch": 0.8843317427140435, "grad_norm": 4.971721649169922, "learning_rate": 1.180164224762518e-07, "loss": 0.0381, "step": 5606 }, { "epoch": 0.8844894900816342, "grad_norm": 3.6937193870544434, "learning_rate": 1.178554178071164e-07, "loss": 0.0471, "step": 5607 }, { "epoch": 0.884647237449225, "grad_norm": 7.5754714012146, "learning_rate": 1.1769441313798099e-07, "loss": 0.0814, "step": 5608 }, { "epoch": 0.8848049848168159, "grad_norm": 4.852900505065918, "learning_rate": 1.1753340846884558e-07, "loss": 0.0518, "step": 5609 }, { "epoch": 0.8849627321844067, "grad_norm": 4.317257404327393, "learning_rate": 1.1737240379971018e-07, "loss": 0.0588, "step": 5610 }, { "epoch": 0.8851204795519975, "grad_norm": 5.594083786010742, "learning_rate": 1.1721139913057478e-07, "loss": 0.0359, "step": 5611 }, { "epoch": 0.8852782269195882, "grad_norm": 3.8825714588165283, "learning_rate": 1.1705039446143938e-07, "loss": 0.0558, "step": 5612 }, { "epoch": 0.8854359742871791, "grad_norm": 4.055095195770264, "learning_rate": 1.1688938979230398e-07, "loss": 0.0386, "step": 5613 }, { "epoch": 0.8855937216547699, "grad_norm": 4.435608863830566, "learning_rate": 1.1672838512316857e-07, "loss": 0.0153, "step": 5614 }, { "epoch": 0.8857514690223607, "grad_norm": 3.319115161895752, "learning_rate": 1.1656738045403317e-07, "loss": 0.0179, "step": 5615 }, { "epoch": 0.8859092163899515, "grad_norm": 3.764820098876953, "learning_rate": 1.1640637578489776e-07, "loss": 0.0494, "step": 5616 }, { "epoch": 0.8860669637575423, "grad_norm": 4.6623406410217285, "learning_rate": 1.1624537111576235e-07, "loss": 0.0426, "step": 5617 }, { "epoch": 0.8862247111251331, "grad_norm": 4.292932510375977, "learning_rate": 1.1608436644662695e-07, "loss": 0.0578, "step": 5618 }, { "epoch": 0.8863824584927239, "grad_norm": 4.933620452880859, "learning_rate": 1.1592336177749154e-07, "loss": 0.0407, "step": 5619 }, { "epoch": 0.8865402058603147, "grad_norm": 0.9307185411453247, "learning_rate": 1.1576235710835613e-07, "loss": 0.0053, "step": 5620 }, { "epoch": 0.8866979532279055, "grad_norm": 4.064803600311279, "learning_rate": 1.1560135243922073e-07, "loss": 0.0326, "step": 5621 }, { "epoch": 0.8868557005954963, "grad_norm": 5.850526332855225, "learning_rate": 1.1544034777008532e-07, "loss": 0.0516, "step": 5622 }, { "epoch": 0.8870134479630871, "grad_norm": 3.1925485134124756, "learning_rate": 1.1527934310094991e-07, "loss": 0.0234, "step": 5623 }, { "epoch": 0.8871711953306779, "grad_norm": 7.044651508331299, "learning_rate": 1.1511833843181452e-07, "loss": 0.0467, "step": 5624 }, { "epoch": 0.8873289426982687, "grad_norm": 6.3519978523254395, "learning_rate": 1.1495733376267911e-07, "loss": 0.0638, "step": 5625 }, { "epoch": 0.8874866900658596, "grad_norm": 4.373105525970459, "learning_rate": 1.1479632909354372e-07, "loss": 0.0268, "step": 5626 }, { "epoch": 0.8876444374334503, "grad_norm": 7.837530612945557, "learning_rate": 1.1463532442440831e-07, "loss": 0.054, "step": 5627 }, { "epoch": 0.8878021848010411, "grad_norm": 3.8669676780700684, "learning_rate": 1.144743197552729e-07, "loss": 0.0259, "step": 5628 }, { "epoch": 0.8879599321686319, "grad_norm": 5.265133380889893, "learning_rate": 1.143133150861375e-07, "loss": 0.0153, "step": 5629 }, { "epoch": 0.8881176795362228, "grad_norm": 3.393134593963623, "learning_rate": 1.1415231041700209e-07, "loss": 0.0203, "step": 5630 }, { "epoch": 0.8882754269038136, "grad_norm": 3.8280105590820312, "learning_rate": 1.1399130574786668e-07, "loss": 0.0243, "step": 5631 }, { "epoch": 0.8884331742714043, "grad_norm": 5.004944324493408, "learning_rate": 1.1383030107873128e-07, "loss": 0.0711, "step": 5632 }, { "epoch": 0.8885909216389951, "grad_norm": 3.1511619091033936, "learning_rate": 1.1366929640959587e-07, "loss": 0.03, "step": 5633 }, { "epoch": 0.888748669006586, "grad_norm": 8.524422645568848, "learning_rate": 1.1350829174046046e-07, "loss": 0.0457, "step": 5634 }, { "epoch": 0.8889064163741768, "grad_norm": 5.389379024505615, "learning_rate": 1.1334728707132506e-07, "loss": 0.0539, "step": 5635 }, { "epoch": 0.8890641637417676, "grad_norm": 4.989780902862549, "learning_rate": 1.1318628240218966e-07, "loss": 0.0295, "step": 5636 }, { "epoch": 0.8892219111093583, "grad_norm": 21.108285903930664, "learning_rate": 1.1302527773305426e-07, "loss": 0.0565, "step": 5637 }, { "epoch": 0.8893796584769492, "grad_norm": 4.350667476654053, "learning_rate": 1.1286427306391885e-07, "loss": 0.0329, "step": 5638 }, { "epoch": 0.88953740584454, "grad_norm": 5.962086200714111, "learning_rate": 1.1270326839478344e-07, "loss": 0.0349, "step": 5639 }, { "epoch": 0.8896951532121308, "grad_norm": 8.710920333862305, "learning_rate": 1.1254226372564803e-07, "loss": 0.033, "step": 5640 }, { "epoch": 0.8898529005797216, "grad_norm": 3.4920713901519775, "learning_rate": 1.1238125905651263e-07, "loss": 0.0182, "step": 5641 }, { "epoch": 0.8900106479473123, "grad_norm": 5.161077499389648, "learning_rate": 1.1222025438737723e-07, "loss": 0.0682, "step": 5642 }, { "epoch": 0.8901683953149032, "grad_norm": 3.349120855331421, "learning_rate": 1.1205924971824183e-07, "loss": 0.0304, "step": 5643 }, { "epoch": 0.890326142682494, "grad_norm": 6.49302864074707, "learning_rate": 1.1189824504910642e-07, "loss": 0.0597, "step": 5644 }, { "epoch": 0.8904838900500848, "grad_norm": 4.746656894683838, "learning_rate": 1.1173724037997101e-07, "loss": 0.0291, "step": 5645 }, { "epoch": 0.8906416374176755, "grad_norm": 4.701689720153809, "learning_rate": 1.115762357108356e-07, "loss": 0.0465, "step": 5646 }, { "epoch": 0.8907993847852664, "grad_norm": 4.649779319763184, "learning_rate": 1.1141523104170021e-07, "loss": 0.0535, "step": 5647 }, { "epoch": 0.8909571321528572, "grad_norm": 3.2291858196258545, "learning_rate": 1.112542263725648e-07, "loss": 0.0285, "step": 5648 }, { "epoch": 0.891114879520448, "grad_norm": 4.260729789733887, "learning_rate": 1.110932217034294e-07, "loss": 0.0249, "step": 5649 }, { "epoch": 0.8912726268880388, "grad_norm": 3.859715700149536, "learning_rate": 1.1093221703429399e-07, "loss": 0.0289, "step": 5650 }, { "epoch": 0.8914303742556297, "grad_norm": 5.544247150421143, "learning_rate": 1.1077121236515858e-07, "loss": 0.0311, "step": 5651 }, { "epoch": 0.8915881216232204, "grad_norm": 4.745953559875488, "learning_rate": 1.1061020769602318e-07, "loss": 0.0807, "step": 5652 }, { "epoch": 0.8917458689908112, "grad_norm": 4.021941184997559, "learning_rate": 1.1044920302688777e-07, "loss": 0.0193, "step": 5653 }, { "epoch": 0.891903616358402, "grad_norm": 6.016777515411377, "learning_rate": 1.1028819835775236e-07, "loss": 0.0643, "step": 5654 }, { "epoch": 0.8920613637259929, "grad_norm": 3.4665982723236084, "learning_rate": 1.1012719368861696e-07, "loss": 0.0571, "step": 5655 }, { "epoch": 0.8922191110935836, "grad_norm": 2.344822406768799, "learning_rate": 1.0996618901948155e-07, "loss": 0.0413, "step": 5656 }, { "epoch": 0.8923768584611744, "grad_norm": 4.093752384185791, "learning_rate": 1.0980518435034616e-07, "loss": 0.0189, "step": 5657 }, { "epoch": 0.8925346058287652, "grad_norm": 6.18466854095459, "learning_rate": 1.0964417968121075e-07, "loss": 0.0839, "step": 5658 }, { "epoch": 0.892692353196356, "grad_norm": 2.5295774936676025, "learning_rate": 1.0948317501207536e-07, "loss": 0.0328, "step": 5659 }, { "epoch": 0.8928501005639469, "grad_norm": 6.288724422454834, "learning_rate": 1.0932217034293995e-07, "loss": 0.0529, "step": 5660 }, { "epoch": 0.8930078479315376, "grad_norm": 6.388190746307373, "learning_rate": 1.0916116567380454e-07, "loss": 0.0637, "step": 5661 }, { "epoch": 0.8931655952991284, "grad_norm": 4.875352382659912, "learning_rate": 1.0900016100466914e-07, "loss": 0.0607, "step": 5662 }, { "epoch": 0.8933233426667192, "grad_norm": 5.0760416984558105, "learning_rate": 1.0883915633553373e-07, "loss": 0.0463, "step": 5663 }, { "epoch": 0.8934810900343101, "grad_norm": 6.9283952713012695, "learning_rate": 1.0867815166639832e-07, "loss": 0.0366, "step": 5664 }, { "epoch": 0.8936388374019009, "grad_norm": 5.368336200714111, "learning_rate": 1.0851714699726291e-07, "loss": 0.0461, "step": 5665 }, { "epoch": 0.8937965847694916, "grad_norm": 2.2210261821746826, "learning_rate": 1.0835614232812751e-07, "loss": 0.0205, "step": 5666 }, { "epoch": 0.8939543321370824, "grad_norm": 4.281381607055664, "learning_rate": 1.081951376589921e-07, "loss": 0.0409, "step": 5667 }, { "epoch": 0.8941120795046733, "grad_norm": 4.237894058227539, "learning_rate": 1.080341329898567e-07, "loss": 0.0498, "step": 5668 }, { "epoch": 0.8942698268722641, "grad_norm": 3.750728130340576, "learning_rate": 1.0787312832072129e-07, "loss": 0.0535, "step": 5669 }, { "epoch": 0.8944275742398549, "grad_norm": 7.401185035705566, "learning_rate": 1.0771212365158588e-07, "loss": 0.0527, "step": 5670 }, { "epoch": 0.8945853216074456, "grad_norm": 5.04086446762085, "learning_rate": 1.0755111898245049e-07, "loss": 0.0443, "step": 5671 }, { "epoch": 0.8947430689750365, "grad_norm": 5.274075508117676, "learning_rate": 1.0739011431331509e-07, "loss": 0.0281, "step": 5672 }, { "epoch": 0.8949008163426273, "grad_norm": 6.070065975189209, "learning_rate": 1.0722910964417969e-07, "loss": 0.0435, "step": 5673 }, { "epoch": 0.8950585637102181, "grad_norm": 4.067622184753418, "learning_rate": 1.0706810497504428e-07, "loss": 0.0234, "step": 5674 }, { "epoch": 0.8952163110778089, "grad_norm": 5.839089393615723, "learning_rate": 1.0690710030590887e-07, "loss": 0.06, "step": 5675 }, { "epoch": 0.8953740584453997, "grad_norm": 5.103932857513428, "learning_rate": 1.0674609563677346e-07, "loss": 0.0285, "step": 5676 }, { "epoch": 0.8955318058129905, "grad_norm": 3.5581488609313965, "learning_rate": 1.0658509096763806e-07, "loss": 0.0353, "step": 5677 }, { "epoch": 0.8956895531805813, "grad_norm": 7.937819004058838, "learning_rate": 1.0642408629850265e-07, "loss": 0.0493, "step": 5678 }, { "epoch": 0.8958473005481721, "grad_norm": 7.44099760055542, "learning_rate": 1.0626308162936724e-07, "loss": 0.0642, "step": 5679 }, { "epoch": 0.8960050479157629, "grad_norm": 4.186953544616699, "learning_rate": 1.0610207696023184e-07, "loss": 0.0364, "step": 5680 }, { "epoch": 0.8961627952833537, "grad_norm": 3.0762956142425537, "learning_rate": 1.0594107229109643e-07, "loss": 0.0212, "step": 5681 }, { "epoch": 0.8963205426509445, "grad_norm": 5.375474452972412, "learning_rate": 1.0578006762196104e-07, "loss": 0.0589, "step": 5682 }, { "epoch": 0.8964782900185353, "grad_norm": 7.132761001586914, "learning_rate": 1.0561906295282563e-07, "loss": 0.0608, "step": 5683 }, { "epoch": 0.8966360373861261, "grad_norm": 5.897538661956787, "learning_rate": 1.0545805828369022e-07, "loss": 0.0642, "step": 5684 }, { "epoch": 0.896793784753717, "grad_norm": 5.504117965698242, "learning_rate": 1.0529705361455482e-07, "loss": 0.0168, "step": 5685 }, { "epoch": 0.8969515321213077, "grad_norm": 11.507904052734375, "learning_rate": 1.0513604894541941e-07, "loss": 0.0618, "step": 5686 }, { "epoch": 0.8971092794888985, "grad_norm": 7.014697074890137, "learning_rate": 1.04975044276284e-07, "loss": 0.0459, "step": 5687 }, { "epoch": 0.8972670268564893, "grad_norm": 3.058468818664551, "learning_rate": 1.0481403960714861e-07, "loss": 0.0183, "step": 5688 }, { "epoch": 0.8974247742240802, "grad_norm": 10.896299362182617, "learning_rate": 1.046530349380132e-07, "loss": 0.0667, "step": 5689 }, { "epoch": 0.897582521591671, "grad_norm": 5.112190246582031, "learning_rate": 1.044920302688778e-07, "loss": 0.0228, "step": 5690 }, { "epoch": 0.8977402689592617, "grad_norm": 3.9673049449920654, "learning_rate": 1.0433102559974239e-07, "loss": 0.0489, "step": 5691 }, { "epoch": 0.8978980163268525, "grad_norm": 6.478947639465332, "learning_rate": 1.0417002093060698e-07, "loss": 0.0684, "step": 5692 }, { "epoch": 0.8980557636944434, "grad_norm": 3.892296552658081, "learning_rate": 1.0400901626147157e-07, "loss": 0.0657, "step": 5693 }, { "epoch": 0.8982135110620342, "grad_norm": 4.397469520568848, "learning_rate": 1.0384801159233618e-07, "loss": 0.0319, "step": 5694 }, { "epoch": 0.898371258429625, "grad_norm": 4.551697731018066, "learning_rate": 1.0368700692320077e-07, "loss": 0.0425, "step": 5695 }, { "epoch": 0.8985290057972157, "grad_norm": 4.86274528503418, "learning_rate": 1.0352600225406537e-07, "loss": 0.0459, "step": 5696 }, { "epoch": 0.8986867531648066, "grad_norm": 3.842259168624878, "learning_rate": 1.0336499758492996e-07, "loss": 0.0317, "step": 5697 }, { "epoch": 0.8988445005323974, "grad_norm": 7.030752658843994, "learning_rate": 1.0320399291579455e-07, "loss": 0.0573, "step": 5698 }, { "epoch": 0.8990022478999882, "grad_norm": 5.519429683685303, "learning_rate": 1.0304298824665915e-07, "loss": 0.1221, "step": 5699 }, { "epoch": 0.899159995267579, "grad_norm": 7.13485860824585, "learning_rate": 1.0288198357752374e-07, "loss": 0.049, "step": 5700 }, { "epoch": 0.8993177426351697, "grad_norm": 1.5401393175125122, "learning_rate": 1.0272097890838833e-07, "loss": 0.0075, "step": 5701 }, { "epoch": 0.8994754900027606, "grad_norm": 4.875319480895996, "learning_rate": 1.0255997423925292e-07, "loss": 0.0361, "step": 5702 }, { "epoch": 0.8996332373703514, "grad_norm": 3.3554129600524902, "learning_rate": 1.0239896957011753e-07, "loss": 0.0293, "step": 5703 }, { "epoch": 0.8997909847379422, "grad_norm": 8.17493724822998, "learning_rate": 1.0223796490098212e-07, "loss": 0.0382, "step": 5704 }, { "epoch": 0.899948732105533, "grad_norm": 3.584365129470825, "learning_rate": 1.0207696023184672e-07, "loss": 0.0409, "step": 5705 }, { "epoch": 0.9001064794731238, "grad_norm": 4.148134231567383, "learning_rate": 1.0191595556271132e-07, "loss": 0.0425, "step": 5706 }, { "epoch": 0.9002642268407146, "grad_norm": 9.026497840881348, "learning_rate": 1.0175495089357592e-07, "loss": 0.0745, "step": 5707 }, { "epoch": 0.9004219742083054, "grad_norm": 8.715054512023926, "learning_rate": 1.0159394622444051e-07, "loss": 0.0645, "step": 5708 }, { "epoch": 0.9005797215758962, "grad_norm": 5.331550598144531, "learning_rate": 1.014329415553051e-07, "loss": 0.0175, "step": 5709 }, { "epoch": 0.900737468943487, "grad_norm": 4.127692699432373, "learning_rate": 1.012719368861697e-07, "loss": 0.0376, "step": 5710 }, { "epoch": 0.9008952163110778, "grad_norm": 6.854963302612305, "learning_rate": 1.0111093221703429e-07, "loss": 0.0876, "step": 5711 }, { "epoch": 0.9010529636786686, "grad_norm": 4.6490044593811035, "learning_rate": 1.0094992754789888e-07, "loss": 0.039, "step": 5712 }, { "epoch": 0.9012107110462594, "grad_norm": 6.645543098449707, "learning_rate": 1.0078892287876347e-07, "loss": 0.0349, "step": 5713 }, { "epoch": 0.9013684584138503, "grad_norm": 2.1745362281799316, "learning_rate": 1.0062791820962807e-07, "loss": 0.0121, "step": 5714 }, { "epoch": 0.901526205781441, "grad_norm": 2.395117998123169, "learning_rate": 1.0046691354049266e-07, "loss": 0.0068, "step": 5715 }, { "epoch": 0.9016839531490318, "grad_norm": 4.637762069702148, "learning_rate": 1.0030590887135725e-07, "loss": 0.0404, "step": 5716 }, { "epoch": 0.9018417005166226, "grad_norm": 2.9174087047576904, "learning_rate": 1.0014490420222186e-07, "loss": 0.0251, "step": 5717 }, { "epoch": 0.9019994478842134, "grad_norm": 4.064671039581299, "learning_rate": 9.998389953308647e-08, "loss": 0.0236, "step": 5718 }, { "epoch": 0.9021571952518043, "grad_norm": 8.09548568725586, "learning_rate": 9.982289486395106e-08, "loss": 0.0497, "step": 5719 }, { "epoch": 0.902314942619395, "grad_norm": 5.048455238342285, "learning_rate": 9.966189019481565e-08, "loss": 0.0369, "step": 5720 }, { "epoch": 0.9024726899869858, "grad_norm": 5.491619110107422, "learning_rate": 9.950088552568025e-08, "loss": 0.0354, "step": 5721 }, { "epoch": 0.9026304373545766, "grad_norm": 6.2281317710876465, "learning_rate": 9.933988085654484e-08, "loss": 0.0335, "step": 5722 }, { "epoch": 0.9027881847221675, "grad_norm": 1.4659321308135986, "learning_rate": 9.917887618740943e-08, "loss": 0.0068, "step": 5723 }, { "epoch": 0.9029459320897583, "grad_norm": 4.197894096374512, "learning_rate": 9.901787151827403e-08, "loss": 0.0271, "step": 5724 }, { "epoch": 0.903103679457349, "grad_norm": 3.937116861343384, "learning_rate": 9.885686684913862e-08, "loss": 0.0565, "step": 5725 }, { "epoch": 0.9032614268249398, "grad_norm": 2.79811429977417, "learning_rate": 9.869586218000321e-08, "loss": 0.0418, "step": 5726 }, { "epoch": 0.9034191741925307, "grad_norm": 4.082679271697998, "learning_rate": 9.85348575108678e-08, "loss": 0.0116, "step": 5727 }, { "epoch": 0.9035769215601215, "grad_norm": 5.4862446784973145, "learning_rate": 9.83738528417324e-08, "loss": 0.0516, "step": 5728 }, { "epoch": 0.9037346689277123, "grad_norm": 3.0281641483306885, "learning_rate": 9.8212848172597e-08, "loss": 0.0332, "step": 5729 }, { "epoch": 0.903892416295303, "grad_norm": 6.902345180511475, "learning_rate": 9.80518435034616e-08, "loss": 0.0528, "step": 5730 }, { "epoch": 0.9040501636628939, "grad_norm": 9.338648796081543, "learning_rate": 9.789083883432619e-08, "loss": 0.0614, "step": 5731 }, { "epoch": 0.9042079110304847, "grad_norm": 4.462130546569824, "learning_rate": 9.772983416519078e-08, "loss": 0.0512, "step": 5732 }, { "epoch": 0.9043656583980755, "grad_norm": 4.22948694229126, "learning_rate": 9.756882949605538e-08, "loss": 0.0714, "step": 5733 }, { "epoch": 0.9045234057656663, "grad_norm": 4.789216995239258, "learning_rate": 9.740782482691998e-08, "loss": 0.0626, "step": 5734 }, { "epoch": 0.9046811531332571, "grad_norm": 2.0657951831817627, "learning_rate": 9.724682015778458e-08, "loss": 0.0188, "step": 5735 }, { "epoch": 0.9048389005008479, "grad_norm": 3.498124122619629, "learning_rate": 9.708581548864917e-08, "loss": 0.0583, "step": 5736 }, { "epoch": 0.9049966478684387, "grad_norm": 2.890901565551758, "learning_rate": 9.692481081951376e-08, "loss": 0.0251, "step": 5737 }, { "epoch": 0.9051543952360295, "grad_norm": 5.114566326141357, "learning_rate": 9.676380615037835e-08, "loss": 0.0712, "step": 5738 }, { "epoch": 0.9053121426036203, "grad_norm": 4.458266735076904, "learning_rate": 9.660280148124295e-08, "loss": 0.0312, "step": 5739 }, { "epoch": 0.9054698899712111, "grad_norm": 4.075973987579346, "learning_rate": 9.644179681210755e-08, "loss": 0.0414, "step": 5740 }, { "epoch": 0.9056276373388019, "grad_norm": 4.164230823516846, "learning_rate": 9.628079214297215e-08, "loss": 0.0381, "step": 5741 }, { "epoch": 0.9057853847063927, "grad_norm": 2.8527257442474365, "learning_rate": 9.611978747383674e-08, "loss": 0.0186, "step": 5742 }, { "epoch": 0.9059431320739835, "grad_norm": 2.837355136871338, "learning_rate": 9.595878280470133e-08, "loss": 0.0278, "step": 5743 }, { "epoch": 0.9061008794415744, "grad_norm": 4.097024917602539, "learning_rate": 9.579777813556593e-08, "loss": 0.0384, "step": 5744 }, { "epoch": 0.9062586268091651, "grad_norm": 5.411277770996094, "learning_rate": 9.563677346643052e-08, "loss": 0.0378, "step": 5745 }, { "epoch": 0.9064163741767559, "grad_norm": 11.387405395507812, "learning_rate": 9.547576879729511e-08, "loss": 0.0499, "step": 5746 }, { "epoch": 0.9065741215443467, "grad_norm": 3.678576707839966, "learning_rate": 9.53147641281597e-08, "loss": 0.0359, "step": 5747 }, { "epoch": 0.9067318689119376, "grad_norm": 4.650317668914795, "learning_rate": 9.51537594590243e-08, "loss": 0.0657, "step": 5748 }, { "epoch": 0.9068896162795284, "grad_norm": 2.559847116470337, "learning_rate": 9.49927547898889e-08, "loss": 0.0326, "step": 5749 }, { "epoch": 0.9070473636471191, "grad_norm": 5.558595180511475, "learning_rate": 9.48317501207535e-08, "loss": 0.0775, "step": 5750 }, { "epoch": 0.9072051110147099, "grad_norm": 3.4408504962921143, "learning_rate": 9.467074545161809e-08, "loss": 0.0156, "step": 5751 }, { "epoch": 0.9073628583823008, "grad_norm": 3.8436343669891357, "learning_rate": 9.45097407824827e-08, "loss": 0.0348, "step": 5752 }, { "epoch": 0.9075206057498916, "grad_norm": 4.63906192779541, "learning_rate": 9.434873611334729e-08, "loss": 0.0233, "step": 5753 }, { "epoch": 0.9076783531174824, "grad_norm": 4.913653373718262, "learning_rate": 9.418773144421188e-08, "loss": 0.0455, "step": 5754 }, { "epoch": 0.9078361004850731, "grad_norm": 6.2280707359313965, "learning_rate": 9.402672677507648e-08, "loss": 0.087, "step": 5755 }, { "epoch": 0.9079938478526639, "grad_norm": 5.642157554626465, "learning_rate": 9.386572210594107e-08, "loss": 0.0851, "step": 5756 }, { "epoch": 0.9081515952202548, "grad_norm": 5.007143020629883, "learning_rate": 9.370471743680566e-08, "loss": 0.0507, "step": 5757 }, { "epoch": 0.9083093425878456, "grad_norm": 3.5131096839904785, "learning_rate": 9.354371276767026e-08, "loss": 0.0378, "step": 5758 }, { "epoch": 0.9084670899554363, "grad_norm": 4.084439277648926, "learning_rate": 9.338270809853485e-08, "loss": 0.0372, "step": 5759 }, { "epoch": 0.9086248373230271, "grad_norm": 3.037368059158325, "learning_rate": 9.322170342939944e-08, "loss": 0.0143, "step": 5760 }, { "epoch": 0.908782584690618, "grad_norm": 2.3745875358581543, "learning_rate": 9.306069876026404e-08, "loss": 0.0217, "step": 5761 }, { "epoch": 0.9089403320582088, "grad_norm": 4.520737171173096, "learning_rate": 9.289969409112863e-08, "loss": 0.0823, "step": 5762 }, { "epoch": 0.9090980794257996, "grad_norm": 5.6762542724609375, "learning_rate": 9.273868942199322e-08, "loss": 0.0649, "step": 5763 }, { "epoch": 0.9092558267933903, "grad_norm": 7.527730464935303, "learning_rate": 9.257768475285784e-08, "loss": 0.0406, "step": 5764 }, { "epoch": 0.9094135741609812, "grad_norm": 2.8972980976104736, "learning_rate": 9.241668008372243e-08, "loss": 0.02, "step": 5765 }, { "epoch": 0.909571321528572, "grad_norm": 3.597255229949951, "learning_rate": 9.225567541458703e-08, "loss": 0.0402, "step": 5766 }, { "epoch": 0.9097290688961628, "grad_norm": 5.695924758911133, "learning_rate": 9.209467074545162e-08, "loss": 0.051, "step": 5767 }, { "epoch": 0.9098868162637536, "grad_norm": 6.006032943725586, "learning_rate": 9.193366607631621e-08, "loss": 0.0357, "step": 5768 }, { "epoch": 0.9100445636313444, "grad_norm": 2.889230728149414, "learning_rate": 9.17726614071808e-08, "loss": 0.0176, "step": 5769 }, { "epoch": 0.9102023109989352, "grad_norm": 3.2964913845062256, "learning_rate": 9.16116567380454e-08, "loss": 0.0093, "step": 5770 }, { "epoch": 0.910360058366526, "grad_norm": 3.3851239681243896, "learning_rate": 9.145065206890999e-08, "loss": 0.0181, "step": 5771 }, { "epoch": 0.9105178057341168, "grad_norm": 4.152960300445557, "learning_rate": 9.128964739977459e-08, "loss": 0.0339, "step": 5772 }, { "epoch": 0.9106755531017077, "grad_norm": 6.54844331741333, "learning_rate": 9.112864273063918e-08, "loss": 0.0459, "step": 5773 }, { "epoch": 0.9108333004692984, "grad_norm": 4.084027290344238, "learning_rate": 9.096763806150377e-08, "loss": 0.0379, "step": 5774 }, { "epoch": 0.9109910478368892, "grad_norm": 6.559889316558838, "learning_rate": 9.080663339236838e-08, "loss": 0.045, "step": 5775 }, { "epoch": 0.91114879520448, "grad_norm": 2.890805721282959, "learning_rate": 9.064562872323297e-08, "loss": 0.022, "step": 5776 }, { "epoch": 0.9113065425720708, "grad_norm": 3.939265251159668, "learning_rate": 9.048462405409756e-08, "loss": 0.0385, "step": 5777 }, { "epoch": 0.9114642899396617, "grad_norm": 4.184532642364502, "learning_rate": 9.032361938496216e-08, "loss": 0.0332, "step": 5778 }, { "epoch": 0.9116220373072524, "grad_norm": 3.5728583335876465, "learning_rate": 9.016261471582675e-08, "loss": 0.0374, "step": 5779 }, { "epoch": 0.9117797846748432, "grad_norm": 5.712306499481201, "learning_rate": 9.000161004669136e-08, "loss": 0.0582, "step": 5780 }, { "epoch": 0.911937532042434, "grad_norm": 4.514135360717773, "learning_rate": 8.984060537755595e-08, "loss": 0.0317, "step": 5781 }, { "epoch": 0.9120952794100249, "grad_norm": 4.458296775817871, "learning_rate": 8.967960070842054e-08, "loss": 0.049, "step": 5782 }, { "epoch": 0.9122530267776157, "grad_norm": 2.0265932083129883, "learning_rate": 8.951859603928514e-08, "loss": 0.0106, "step": 5783 }, { "epoch": 0.9124107741452064, "grad_norm": 2.6315975189208984, "learning_rate": 8.935759137014973e-08, "loss": 0.0152, "step": 5784 }, { "epoch": 0.9125685215127972, "grad_norm": 4.49893045425415, "learning_rate": 8.919658670101432e-08, "loss": 0.0805, "step": 5785 }, { "epoch": 0.9127262688803881, "grad_norm": 7.787235736846924, "learning_rate": 8.903558203187892e-08, "loss": 0.0452, "step": 5786 }, { "epoch": 0.9128840162479789, "grad_norm": 5.546772003173828, "learning_rate": 8.887457736274352e-08, "loss": 0.0824, "step": 5787 }, { "epoch": 0.9130417636155697, "grad_norm": 5.021836280822754, "learning_rate": 8.871357269360811e-08, "loss": 0.0442, "step": 5788 }, { "epoch": 0.9131995109831604, "grad_norm": 3.255812883377075, "learning_rate": 8.855256802447271e-08, "loss": 0.0178, "step": 5789 }, { "epoch": 0.9133572583507513, "grad_norm": 5.3505096435546875, "learning_rate": 8.83915633553373e-08, "loss": 0.0491, "step": 5790 }, { "epoch": 0.9135150057183421, "grad_norm": 4.129081726074219, "learning_rate": 8.82305586862019e-08, "loss": 0.0313, "step": 5791 }, { "epoch": 0.9136727530859329, "grad_norm": 5.043010711669922, "learning_rate": 8.806955401706649e-08, "loss": 0.0213, "step": 5792 }, { "epoch": 0.9138305004535237, "grad_norm": 2.7486824989318848, "learning_rate": 8.790854934793108e-08, "loss": 0.0356, "step": 5793 }, { "epoch": 0.9139882478211144, "grad_norm": 2.983541250228882, "learning_rate": 8.774754467879567e-08, "loss": 0.0216, "step": 5794 }, { "epoch": 0.9141459951887053, "grad_norm": 2.1109204292297363, "learning_rate": 8.758654000966028e-08, "loss": 0.058, "step": 5795 }, { "epoch": 0.9143037425562961, "grad_norm": 3.1736154556274414, "learning_rate": 8.742553534052487e-08, "loss": 0.0378, "step": 5796 }, { "epoch": 0.9144614899238869, "grad_norm": 6.466217517852783, "learning_rate": 8.726453067138947e-08, "loss": 0.0473, "step": 5797 }, { "epoch": 0.9146192372914776, "grad_norm": 3.0933151245117188, "learning_rate": 8.710352600225406e-08, "loss": 0.0267, "step": 5798 }, { "epoch": 0.9147769846590685, "grad_norm": 7.6968770027160645, "learning_rate": 8.694252133311866e-08, "loss": 0.0586, "step": 5799 }, { "epoch": 0.9149347320266593, "grad_norm": 4.800207614898682, "learning_rate": 8.678151666398326e-08, "loss": 0.0374, "step": 5800 }, { "epoch": 0.9150924793942501, "grad_norm": 4.6078314781188965, "learning_rate": 8.662051199484785e-08, "loss": 0.0229, "step": 5801 }, { "epoch": 0.9152502267618409, "grad_norm": 2.8267221450805664, "learning_rate": 8.645950732571244e-08, "loss": 0.0293, "step": 5802 }, { "epoch": 0.9154079741294318, "grad_norm": 5.6814141273498535, "learning_rate": 8.629850265657704e-08, "loss": 0.0464, "step": 5803 }, { "epoch": 0.9155657214970225, "grad_norm": 10.271818161010742, "learning_rate": 8.613749798744163e-08, "loss": 0.0479, "step": 5804 }, { "epoch": 0.9157234688646133, "grad_norm": 5.682190418243408, "learning_rate": 8.597649331830622e-08, "loss": 0.0398, "step": 5805 }, { "epoch": 0.9158812162322041, "grad_norm": 5.16942834854126, "learning_rate": 8.581548864917082e-08, "loss": 0.0333, "step": 5806 }, { "epoch": 0.916038963599795, "grad_norm": 3.6980783939361572, "learning_rate": 8.565448398003541e-08, "loss": 0.0293, "step": 5807 }, { "epoch": 0.9161967109673858, "grad_norm": 5.025379657745361, "learning_rate": 8.54934793109e-08, "loss": 0.0434, "step": 5808 }, { "epoch": 0.9163544583349765, "grad_norm": 0.8971937298774719, "learning_rate": 8.53324746417646e-08, "loss": 0.0076, "step": 5809 }, { "epoch": 0.9165122057025673, "grad_norm": 7.095747947692871, "learning_rate": 8.517146997262922e-08, "loss": 0.0386, "step": 5810 }, { "epoch": 0.9166699530701582, "grad_norm": 2.7633774280548096, "learning_rate": 8.501046530349381e-08, "loss": 0.0246, "step": 5811 }, { "epoch": 0.916827700437749, "grad_norm": 6.486352920532227, "learning_rate": 8.48494606343584e-08, "loss": 0.0559, "step": 5812 }, { "epoch": 0.9169854478053397, "grad_norm": 7.435791015625, "learning_rate": 8.4688455965223e-08, "loss": 0.0635, "step": 5813 }, { "epoch": 0.9171431951729305, "grad_norm": 5.165439128875732, "learning_rate": 8.452745129608759e-08, "loss": 0.0384, "step": 5814 }, { "epoch": 0.9173009425405213, "grad_norm": 7.617971420288086, "learning_rate": 8.436644662695218e-08, "loss": 0.0721, "step": 5815 }, { "epoch": 0.9174586899081122, "grad_norm": 5.212454319000244, "learning_rate": 8.420544195781677e-08, "loss": 0.0636, "step": 5816 }, { "epoch": 0.917616437275703, "grad_norm": 6.4869489669799805, "learning_rate": 8.404443728868137e-08, "loss": 0.0487, "step": 5817 }, { "epoch": 0.9177741846432937, "grad_norm": 3.043604850769043, "learning_rate": 8.388343261954596e-08, "loss": 0.0208, "step": 5818 }, { "epoch": 0.9179319320108845, "grad_norm": 5.8964667320251465, "learning_rate": 8.372242795041055e-08, "loss": 0.0658, "step": 5819 }, { "epoch": 0.9180896793784754, "grad_norm": 6.04457950592041, "learning_rate": 8.356142328127515e-08, "loss": 0.053, "step": 5820 }, { "epoch": 0.9182474267460662, "grad_norm": 9.761399269104004, "learning_rate": 8.340041861213974e-08, "loss": 0.0687, "step": 5821 }, { "epoch": 0.918405174113657, "grad_norm": 2.7290096282958984, "learning_rate": 8.323941394300435e-08, "loss": 0.0194, "step": 5822 }, { "epoch": 0.9185629214812477, "grad_norm": 3.8870458602905273, "learning_rate": 8.307840927386894e-08, "loss": 0.0371, "step": 5823 }, { "epoch": 0.9187206688488386, "grad_norm": 4.762032985687256, "learning_rate": 8.291740460473353e-08, "loss": 0.0522, "step": 5824 }, { "epoch": 0.9188784162164294, "grad_norm": 4.974912643432617, "learning_rate": 8.275639993559812e-08, "loss": 0.0464, "step": 5825 }, { "epoch": 0.9190361635840202, "grad_norm": 4.439513206481934, "learning_rate": 8.259539526646273e-08, "loss": 0.0447, "step": 5826 }, { "epoch": 0.919193910951611, "grad_norm": 5.005949974060059, "learning_rate": 8.243439059732732e-08, "loss": 0.04, "step": 5827 }, { "epoch": 0.9193516583192018, "grad_norm": 6.744287014007568, "learning_rate": 8.227338592819192e-08, "loss": 0.0569, "step": 5828 }, { "epoch": 0.9195094056867926, "grad_norm": 7.064107894897461, "learning_rate": 8.211238125905651e-08, "loss": 0.0651, "step": 5829 }, { "epoch": 0.9196671530543834, "grad_norm": 6.484222888946533, "learning_rate": 8.19513765899211e-08, "loss": 0.0483, "step": 5830 }, { "epoch": 0.9198249004219742, "grad_norm": 5.973041534423828, "learning_rate": 8.17903719207857e-08, "loss": 0.0357, "step": 5831 }, { "epoch": 0.9199826477895651, "grad_norm": 11.105122566223145, "learning_rate": 8.162936725165029e-08, "loss": 0.0715, "step": 5832 }, { "epoch": 0.9201403951571558, "grad_norm": 6.121046543121338, "learning_rate": 8.146836258251488e-08, "loss": 0.0478, "step": 5833 }, { "epoch": 0.9202981425247466, "grad_norm": 4.047852039337158, "learning_rate": 8.130735791337949e-08, "loss": 0.0526, "step": 5834 }, { "epoch": 0.9204558898923374, "grad_norm": 7.392566204071045, "learning_rate": 8.114635324424408e-08, "loss": 0.0226, "step": 5835 }, { "epoch": 0.9206136372599282, "grad_norm": 5.444931983947754, "learning_rate": 8.098534857510868e-08, "loss": 0.034, "step": 5836 }, { "epoch": 0.9207713846275191, "grad_norm": 6.684887886047363, "learning_rate": 8.082434390597327e-08, "loss": 0.0243, "step": 5837 }, { "epoch": 0.9209291319951098, "grad_norm": 6.164416790008545, "learning_rate": 8.066333923683786e-08, "loss": 0.0401, "step": 5838 }, { "epoch": 0.9210868793627006, "grad_norm": 4.6397175788879395, "learning_rate": 8.050233456770245e-08, "loss": 0.0423, "step": 5839 }, { "epoch": 0.9212446267302914, "grad_norm": 5.3267340660095215, "learning_rate": 8.034132989856705e-08, "loss": 0.0586, "step": 5840 }, { "epoch": 0.9214023740978823, "grad_norm": 6.337176322937012, "learning_rate": 8.018032522943165e-08, "loss": 0.0669, "step": 5841 }, { "epoch": 0.9215601214654731, "grad_norm": 6.352117538452148, "learning_rate": 8.001932056029625e-08, "loss": 0.0827, "step": 5842 }, { "epoch": 0.9217178688330638, "grad_norm": 4.213643550872803, "learning_rate": 7.985831589116084e-08, "loss": 0.0245, "step": 5843 }, { "epoch": 0.9218756162006546, "grad_norm": 3.4744551181793213, "learning_rate": 7.969731122202543e-08, "loss": 0.0189, "step": 5844 }, { "epoch": 0.9220333635682455, "grad_norm": 4.864378929138184, "learning_rate": 7.953630655289004e-08, "loss": 0.0386, "step": 5845 }, { "epoch": 0.9221911109358363, "grad_norm": 4.443397045135498, "learning_rate": 7.937530188375463e-08, "loss": 0.0565, "step": 5846 }, { "epoch": 0.922348858303427, "grad_norm": 3.475379228591919, "learning_rate": 7.921429721461923e-08, "loss": 0.0184, "step": 5847 }, { "epoch": 0.9225066056710178, "grad_norm": 6.629957675933838, "learning_rate": 7.905329254548382e-08, "loss": 0.0682, "step": 5848 }, { "epoch": 0.9226643530386087, "grad_norm": 9.265862464904785, "learning_rate": 7.889228787634841e-08, "loss": 0.0616, "step": 5849 }, { "epoch": 0.9228221004061995, "grad_norm": 7.067864418029785, "learning_rate": 7.8731283207213e-08, "loss": 0.0488, "step": 5850 }, { "epoch": 0.9229798477737903, "grad_norm": 8.622649192810059, "learning_rate": 7.85702785380776e-08, "loss": 0.0287, "step": 5851 }, { "epoch": 0.923137595141381, "grad_norm": 3.200631618499756, "learning_rate": 7.840927386894219e-08, "loss": 0.0204, "step": 5852 }, { "epoch": 0.9232953425089718, "grad_norm": 6.694149017333984, "learning_rate": 7.824826919980678e-08, "loss": 0.0626, "step": 5853 }, { "epoch": 0.9234530898765627, "grad_norm": 3.36803936958313, "learning_rate": 7.808726453067138e-08, "loss": 0.0211, "step": 5854 }, { "epoch": 0.9236108372441535, "grad_norm": 5.772575855255127, "learning_rate": 7.792625986153597e-08, "loss": 0.0626, "step": 5855 }, { "epoch": 0.9237685846117443, "grad_norm": 3.5682120323181152, "learning_rate": 7.776525519240056e-08, "loss": 0.0298, "step": 5856 }, { "epoch": 0.923926331979335, "grad_norm": 9.057229042053223, "learning_rate": 7.760425052326518e-08, "loss": 0.0443, "step": 5857 }, { "epoch": 0.9240840793469259, "grad_norm": 8.135175704956055, "learning_rate": 7.744324585412978e-08, "loss": 0.0441, "step": 5858 }, { "epoch": 0.9242418267145167, "grad_norm": 3.027372121810913, "learning_rate": 7.728224118499437e-08, "loss": 0.0251, "step": 5859 }, { "epoch": 0.9243995740821075, "grad_norm": 16.04706382751465, "learning_rate": 7.712123651585896e-08, "loss": 0.0292, "step": 5860 }, { "epoch": 0.9245573214496983, "grad_norm": 2.796297550201416, "learning_rate": 7.696023184672356e-08, "loss": 0.0483, "step": 5861 }, { "epoch": 0.9247150688172892, "grad_norm": 7.089400291442871, "learning_rate": 7.679922717758815e-08, "loss": 0.0588, "step": 5862 }, { "epoch": 0.9248728161848799, "grad_norm": 11.826553344726562, "learning_rate": 7.663822250845274e-08, "loss": 0.0606, "step": 5863 }, { "epoch": 0.9250305635524707, "grad_norm": 4.3761677742004395, "learning_rate": 7.647721783931733e-08, "loss": 0.0277, "step": 5864 }, { "epoch": 0.9251883109200615, "grad_norm": 6.777541637420654, "learning_rate": 7.631621317018193e-08, "loss": 0.0583, "step": 5865 }, { "epoch": 0.9253460582876524, "grad_norm": 2.7931265830993652, "learning_rate": 7.615520850104652e-08, "loss": 0.0378, "step": 5866 }, { "epoch": 0.9255038056552432, "grad_norm": 4.795775413513184, "learning_rate": 7.599420383191111e-08, "loss": 0.0494, "step": 5867 }, { "epoch": 0.9256615530228339, "grad_norm": 4.076138019561768, "learning_rate": 7.583319916277572e-08, "loss": 0.0381, "step": 5868 }, { "epoch": 0.9258193003904247, "grad_norm": 7.07107400894165, "learning_rate": 7.567219449364031e-08, "loss": 0.0688, "step": 5869 }, { "epoch": 0.9259770477580156, "grad_norm": 4.784503936767578, "learning_rate": 7.55111898245049e-08, "loss": 0.0284, "step": 5870 }, { "epoch": 0.9261347951256064, "grad_norm": 2.3995094299316406, "learning_rate": 7.53501851553695e-08, "loss": 0.033, "step": 5871 }, { "epoch": 0.9262925424931971, "grad_norm": 5.072754383087158, "learning_rate": 7.51891804862341e-08, "loss": 0.0532, "step": 5872 }, { "epoch": 0.9264502898607879, "grad_norm": 3.824201822280884, "learning_rate": 7.50281758170987e-08, "loss": 0.0318, "step": 5873 }, { "epoch": 0.9266080372283787, "grad_norm": 5.722204208374023, "learning_rate": 7.486717114796329e-08, "loss": 0.0457, "step": 5874 }, { "epoch": 0.9267657845959696, "grad_norm": 18.639663696289062, "learning_rate": 7.470616647882788e-08, "loss": 0.0937, "step": 5875 }, { "epoch": 0.9269235319635604, "grad_norm": 2.2406606674194336, "learning_rate": 7.454516180969248e-08, "loss": 0.014, "step": 5876 }, { "epoch": 0.9270812793311511, "grad_norm": 4.865085601806641, "learning_rate": 7.438415714055707e-08, "loss": 0.0375, "step": 5877 }, { "epoch": 0.9272390266987419, "grad_norm": 4.385605335235596, "learning_rate": 7.422315247142166e-08, "loss": 0.0146, "step": 5878 }, { "epoch": 0.9273967740663328, "grad_norm": 4.06872034072876, "learning_rate": 7.406214780228626e-08, "loss": 0.0286, "step": 5879 }, { "epoch": 0.9275545214339236, "grad_norm": 10.020902633666992, "learning_rate": 7.390114313315086e-08, "loss": 0.0696, "step": 5880 }, { "epoch": 0.9277122688015144, "grad_norm": 3.6544883251190186, "learning_rate": 7.374013846401546e-08, "loss": 0.0246, "step": 5881 }, { "epoch": 0.9278700161691051, "grad_norm": 6.031393051147461, "learning_rate": 7.357913379488005e-08, "loss": 0.0259, "step": 5882 }, { "epoch": 0.928027763536696, "grad_norm": 3.699489116668701, "learning_rate": 7.341812912574464e-08, "loss": 0.0367, "step": 5883 }, { "epoch": 0.9281855109042868, "grad_norm": 4.587629318237305, "learning_rate": 7.325712445660924e-08, "loss": 0.0326, "step": 5884 }, { "epoch": 0.9283432582718776, "grad_norm": 1.859129548072815, "learning_rate": 7.309611978747383e-08, "loss": 0.0118, "step": 5885 }, { "epoch": 0.9285010056394684, "grad_norm": 4.384711742401123, "learning_rate": 7.293511511833842e-08, "loss": 0.0323, "step": 5886 }, { "epoch": 0.9286587530070592, "grad_norm": 7.893499851226807, "learning_rate": 7.277411044920303e-08, "loss": 0.1054, "step": 5887 }, { "epoch": 0.92881650037465, "grad_norm": 16.19036293029785, "learning_rate": 7.261310578006762e-08, "loss": 0.15, "step": 5888 }, { "epoch": 0.9289742477422408, "grad_norm": 4.184976100921631, "learning_rate": 7.245210111093221e-08, "loss": 0.0512, "step": 5889 }, { "epoch": 0.9291319951098316, "grad_norm": 5.087081432342529, "learning_rate": 7.229109644179681e-08, "loss": 0.0444, "step": 5890 }, { "epoch": 0.9292897424774224, "grad_norm": 6.60899543762207, "learning_rate": 7.21300917726614e-08, "loss": 0.026, "step": 5891 }, { "epoch": 0.9294474898450132, "grad_norm": 2.9725310802459717, "learning_rate": 7.196908710352601e-08, "loss": 0.0231, "step": 5892 }, { "epoch": 0.929605237212604, "grad_norm": 6.0414910316467285, "learning_rate": 7.18080824343906e-08, "loss": 0.0713, "step": 5893 }, { "epoch": 0.9297629845801948, "grad_norm": 4.175647735595703, "learning_rate": 7.164707776525519e-08, "loss": 0.023, "step": 5894 }, { "epoch": 0.9299207319477856, "grad_norm": 6.7042236328125, "learning_rate": 7.148607309611979e-08, "loss": 0.0254, "step": 5895 }, { "epoch": 0.9300784793153765, "grad_norm": 33.338382720947266, "learning_rate": 7.132506842698438e-08, "loss": 0.0215, "step": 5896 }, { "epoch": 0.9302362266829672, "grad_norm": 7.470001220703125, "learning_rate": 7.116406375784897e-08, "loss": 0.0527, "step": 5897 }, { "epoch": 0.930393974050558, "grad_norm": 4.17733907699585, "learning_rate": 7.100305908871357e-08, "loss": 0.0468, "step": 5898 }, { "epoch": 0.9305517214181488, "grad_norm": 5.177865505218506, "learning_rate": 7.084205441957816e-08, "loss": 0.0356, "step": 5899 }, { "epoch": 0.9307094687857397, "grad_norm": 5.825542449951172, "learning_rate": 7.068104975044275e-08, "loss": 0.0502, "step": 5900 }, { "epoch": 0.9308672161533305, "grad_norm": 5.751056671142578, "learning_rate": 7.052004508130734e-08, "loss": 0.0249, "step": 5901 }, { "epoch": 0.9310249635209212, "grad_norm": 4.16907262802124, "learning_rate": 7.035904041217194e-08, "loss": 0.0824, "step": 5902 }, { "epoch": 0.931182710888512, "grad_norm": 4.030389785766602, "learning_rate": 7.019803574303656e-08, "loss": 0.0541, "step": 5903 }, { "epoch": 0.9313404582561029, "grad_norm": 5.3484625816345215, "learning_rate": 7.003703107390115e-08, "loss": 0.0447, "step": 5904 }, { "epoch": 0.9314982056236937, "grad_norm": 5.1839094161987305, "learning_rate": 6.987602640476574e-08, "loss": 0.0312, "step": 5905 }, { "epoch": 0.9316559529912845, "grad_norm": 7.408259868621826, "learning_rate": 6.971502173563034e-08, "loss": 0.0373, "step": 5906 }, { "epoch": 0.9318137003588752, "grad_norm": 3.3852338790893555, "learning_rate": 6.955401706649493e-08, "loss": 0.0355, "step": 5907 }, { "epoch": 0.9319714477264661, "grad_norm": 5.942039489746094, "learning_rate": 6.939301239735952e-08, "loss": 0.0355, "step": 5908 }, { "epoch": 0.9321291950940569, "grad_norm": 5.299680709838867, "learning_rate": 6.923200772822412e-08, "loss": 0.0726, "step": 5909 }, { "epoch": 0.9322869424616477, "grad_norm": 1.9223432540893555, "learning_rate": 6.907100305908871e-08, "loss": 0.0134, "step": 5910 }, { "epoch": 0.9324446898292384, "grad_norm": 4.402111530303955, "learning_rate": 6.89099983899533e-08, "loss": 0.0334, "step": 5911 }, { "epoch": 0.9326024371968292, "grad_norm": 4.189789295196533, "learning_rate": 6.87489937208179e-08, "loss": 0.0312, "step": 5912 }, { "epoch": 0.9327601845644201, "grad_norm": 4.966956615447998, "learning_rate": 6.858798905168249e-08, "loss": 0.0456, "step": 5913 }, { "epoch": 0.9329179319320109, "grad_norm": 4.631741523742676, "learning_rate": 6.842698438254708e-08, "loss": 0.061, "step": 5914 }, { "epoch": 0.9330756792996017, "grad_norm": 6.761546611785889, "learning_rate": 6.826597971341169e-08, "loss": 0.0348, "step": 5915 }, { "epoch": 0.9332334266671924, "grad_norm": 3.471755027770996, "learning_rate": 6.810497504427628e-08, "loss": 0.0353, "step": 5916 }, { "epoch": 0.9333911740347833, "grad_norm": 5.140878200531006, "learning_rate": 6.794397037514087e-08, "loss": 0.0413, "step": 5917 }, { "epoch": 0.9335489214023741, "grad_norm": 5.290947914123535, "learning_rate": 6.778296570600548e-08, "loss": 0.0488, "step": 5918 }, { "epoch": 0.9337066687699649, "grad_norm": 7.529231548309326, "learning_rate": 6.762196103687007e-08, "loss": 0.0771, "step": 5919 }, { "epoch": 0.9338644161375557, "grad_norm": 4.304989337921143, "learning_rate": 6.746095636773467e-08, "loss": 0.0184, "step": 5920 }, { "epoch": 0.9340221635051466, "grad_norm": 4.198561191558838, "learning_rate": 6.729995169859926e-08, "loss": 0.028, "step": 5921 }, { "epoch": 0.9341799108727373, "grad_norm": 3.3721861839294434, "learning_rate": 6.713894702946385e-08, "loss": 0.0357, "step": 5922 }, { "epoch": 0.9343376582403281, "grad_norm": 4.256961345672607, "learning_rate": 6.697794236032845e-08, "loss": 0.0516, "step": 5923 }, { "epoch": 0.9344954056079189, "grad_norm": 5.280449390411377, "learning_rate": 6.681693769119304e-08, "loss": 0.0345, "step": 5924 }, { "epoch": 0.9346531529755098, "grad_norm": 2.9795844554901123, "learning_rate": 6.665593302205763e-08, "loss": 0.0453, "step": 5925 }, { "epoch": 0.9348109003431005, "grad_norm": 9.320393562316895, "learning_rate": 6.649492835292222e-08, "loss": 0.0659, "step": 5926 }, { "epoch": 0.9349686477106913, "grad_norm": 6.018735885620117, "learning_rate": 6.633392368378683e-08, "loss": 0.0435, "step": 5927 }, { "epoch": 0.9351263950782821, "grad_norm": 3.991745710372925, "learning_rate": 6.617291901465142e-08, "loss": 0.0246, "step": 5928 }, { "epoch": 0.9352841424458729, "grad_norm": 3.104764461517334, "learning_rate": 6.601191434551602e-08, "loss": 0.0304, "step": 5929 }, { "epoch": 0.9354418898134638, "grad_norm": 3.9032888412475586, "learning_rate": 6.585090967638061e-08, "loss": 0.0579, "step": 5930 }, { "epoch": 0.9355996371810545, "grad_norm": 5.164892196655273, "learning_rate": 6.56899050072452e-08, "loss": 0.0503, "step": 5931 }, { "epoch": 0.9357573845486453, "grad_norm": 8.195326805114746, "learning_rate": 6.55289003381098e-08, "loss": 0.0421, "step": 5932 }, { "epoch": 0.9359151319162361, "grad_norm": 7.684947967529297, "learning_rate": 6.53678956689744e-08, "loss": 0.0261, "step": 5933 }, { "epoch": 0.936072879283827, "grad_norm": 4.523174285888672, "learning_rate": 6.5206890999839e-08, "loss": 0.0563, "step": 5934 }, { "epoch": 0.9362306266514178, "grad_norm": 11.877717018127441, "learning_rate": 6.504588633070359e-08, "loss": 0.0922, "step": 5935 }, { "epoch": 0.9363883740190085, "grad_norm": 3.477673053741455, "learning_rate": 6.488488166156818e-08, "loss": 0.0202, "step": 5936 }, { "epoch": 0.9365461213865993, "grad_norm": 3.0422277450561523, "learning_rate": 6.472387699243277e-08, "loss": 0.0588, "step": 5937 }, { "epoch": 0.9367038687541902, "grad_norm": 4.104816913604736, "learning_rate": 6.456287232329738e-08, "loss": 0.0496, "step": 5938 }, { "epoch": 0.936861616121781, "grad_norm": 4.961500644683838, "learning_rate": 6.440186765416197e-08, "loss": 0.0594, "step": 5939 }, { "epoch": 0.9370193634893718, "grad_norm": 2.4108762741088867, "learning_rate": 6.424086298502657e-08, "loss": 0.0311, "step": 5940 }, { "epoch": 0.9371771108569625, "grad_norm": 5.95065975189209, "learning_rate": 6.407985831589116e-08, "loss": 0.0914, "step": 5941 }, { "epoch": 0.9373348582245534, "grad_norm": 8.552593231201172, "learning_rate": 6.391885364675575e-08, "loss": 0.0549, "step": 5942 }, { "epoch": 0.9374926055921442, "grad_norm": 5.621298313140869, "learning_rate": 6.375784897762035e-08, "loss": 0.0223, "step": 5943 }, { "epoch": 0.937650352959735, "grad_norm": 3.9253087043762207, "learning_rate": 6.359684430848494e-08, "loss": 0.0693, "step": 5944 }, { "epoch": 0.9378081003273258, "grad_norm": 6.854350566864014, "learning_rate": 6.343583963934953e-08, "loss": 0.0254, "step": 5945 }, { "epoch": 0.9379658476949166, "grad_norm": 4.1480278968811035, "learning_rate": 6.327483497021413e-08, "loss": 0.0681, "step": 5946 }, { "epoch": 0.9381235950625074, "grad_norm": 13.152607917785645, "learning_rate": 6.311383030107872e-08, "loss": 0.043, "step": 5947 }, { "epoch": 0.9382813424300982, "grad_norm": 13.167405128479004, "learning_rate": 6.295282563194331e-08, "loss": 0.0761, "step": 5948 }, { "epoch": 0.938439089797689, "grad_norm": 6.634943962097168, "learning_rate": 6.279182096280792e-08, "loss": 0.0764, "step": 5949 }, { "epoch": 0.9385968371652798, "grad_norm": 4.601792335510254, "learning_rate": 6.263081629367252e-08, "loss": 0.0447, "step": 5950 }, { "epoch": 0.9387545845328706, "grad_norm": 3.340606927871704, "learning_rate": 6.24698116245371e-08, "loss": 0.0354, "step": 5951 }, { "epoch": 0.9389123319004614, "grad_norm": 4.565184116363525, "learning_rate": 6.23088069554017e-08, "loss": 0.0227, "step": 5952 }, { "epoch": 0.9390700792680522, "grad_norm": 4.615509510040283, "learning_rate": 6.21478022862663e-08, "loss": 0.0612, "step": 5953 }, { "epoch": 0.939227826635643, "grad_norm": 2.9482510089874268, "learning_rate": 6.19867976171309e-08, "loss": 0.0296, "step": 5954 }, { "epoch": 0.9393855740032339, "grad_norm": 5.325509548187256, "learning_rate": 6.182579294799549e-08, "loss": 0.0212, "step": 5955 }, { "epoch": 0.9395433213708246, "grad_norm": 3.9348642826080322, "learning_rate": 6.166478827886008e-08, "loss": 0.024, "step": 5956 }, { "epoch": 0.9397010687384154, "grad_norm": 6.6087822914123535, "learning_rate": 6.150378360972468e-08, "loss": 0.0791, "step": 5957 }, { "epoch": 0.9398588161060062, "grad_norm": 6.441550254821777, "learning_rate": 6.134277894058927e-08, "loss": 0.0762, "step": 5958 }, { "epoch": 0.9400165634735971, "grad_norm": 4.344478607177734, "learning_rate": 6.118177427145388e-08, "loss": 0.0811, "step": 5959 }, { "epoch": 0.9401743108411879, "grad_norm": 3.4004554748535156, "learning_rate": 6.102076960231847e-08, "loss": 0.0282, "step": 5960 }, { "epoch": 0.9403320582087786, "grad_norm": 7.548405170440674, "learning_rate": 6.085976493318306e-08, "loss": 0.0444, "step": 5961 }, { "epoch": 0.9404898055763694, "grad_norm": 4.60588264465332, "learning_rate": 6.069876026404765e-08, "loss": 0.0214, "step": 5962 }, { "epoch": 0.9406475529439603, "grad_norm": 6.029146194458008, "learning_rate": 6.053775559491225e-08, "loss": 0.1035, "step": 5963 }, { "epoch": 0.9408053003115511, "grad_norm": 11.042107582092285, "learning_rate": 6.037675092577685e-08, "loss": 0.0226, "step": 5964 }, { "epoch": 0.9409630476791419, "grad_norm": 3.1142709255218506, "learning_rate": 6.021574625664145e-08, "loss": 0.0591, "step": 5965 }, { "epoch": 0.9411207950467326, "grad_norm": 2.8489291667938232, "learning_rate": 6.005474158750604e-08, "loss": 0.0252, "step": 5966 }, { "epoch": 0.9412785424143235, "grad_norm": 3.7460086345672607, "learning_rate": 5.989373691837063e-08, "loss": 0.0498, "step": 5967 }, { "epoch": 0.9414362897819143, "grad_norm": 3.1322598457336426, "learning_rate": 5.973273224923523e-08, "loss": 0.0292, "step": 5968 }, { "epoch": 0.9415940371495051, "grad_norm": 3.797261953353882, "learning_rate": 5.957172758009982e-08, "loss": 0.0148, "step": 5969 }, { "epoch": 0.9417517845170958, "grad_norm": 5.342018127441406, "learning_rate": 5.941072291096442e-08, "loss": 0.022, "step": 5970 }, { "epoch": 0.9419095318846866, "grad_norm": 6.661462306976318, "learning_rate": 5.924971824182901e-08, "loss": 0.0404, "step": 5971 }, { "epoch": 0.9420672792522775, "grad_norm": 2.1923716068267822, "learning_rate": 5.9088713572693605e-08, "loss": 0.0223, "step": 5972 }, { "epoch": 0.9422250266198683, "grad_norm": 5.065052509307861, "learning_rate": 5.89277089035582e-08, "loss": 0.0451, "step": 5973 }, { "epoch": 0.9423827739874591, "grad_norm": 6.447623252868652, "learning_rate": 5.876670423442279e-08, "loss": 0.064, "step": 5974 }, { "epoch": 0.9425405213550498, "grad_norm": 2.33776593208313, "learning_rate": 5.860569956528739e-08, "loss": 0.0204, "step": 5975 }, { "epoch": 0.9426982687226407, "grad_norm": 5.81252384185791, "learning_rate": 5.844469489615199e-08, "loss": 0.1056, "step": 5976 }, { "epoch": 0.9428560160902315, "grad_norm": 3.33357310295105, "learning_rate": 5.8283690227016584e-08, "loss": 0.0193, "step": 5977 }, { "epoch": 0.9430137634578223, "grad_norm": 4.074812412261963, "learning_rate": 5.812268555788118e-08, "loss": 0.0771, "step": 5978 }, { "epoch": 0.9431715108254131, "grad_norm": 2.8009839057922363, "learning_rate": 5.796168088874577e-08, "loss": 0.0192, "step": 5979 }, { "epoch": 0.943329258193004, "grad_norm": 4.452914237976074, "learning_rate": 5.780067621961036e-08, "loss": 0.0808, "step": 5980 }, { "epoch": 0.9434870055605947, "grad_norm": 3.7786972522735596, "learning_rate": 5.7639671550474956e-08, "loss": 0.0401, "step": 5981 }, { "epoch": 0.9436447529281855, "grad_norm": 3.053673505783081, "learning_rate": 5.7478666881339556e-08, "loss": 0.017, "step": 5982 }, { "epoch": 0.9438025002957763, "grad_norm": 6.5977630615234375, "learning_rate": 5.7317662212204156e-08, "loss": 0.0398, "step": 5983 }, { "epoch": 0.9439602476633672, "grad_norm": 3.19356369972229, "learning_rate": 5.715665754306875e-08, "loss": 0.0184, "step": 5984 }, { "epoch": 0.944117995030958, "grad_norm": 8.475666046142578, "learning_rate": 5.699565287393334e-08, "loss": 0.0418, "step": 5985 }, { "epoch": 0.9442757423985487, "grad_norm": 5.271622657775879, "learning_rate": 5.6834648204797935e-08, "loss": 0.0307, "step": 5986 }, { "epoch": 0.9444334897661395, "grad_norm": 7.270249843597412, "learning_rate": 5.667364353566253e-08, "loss": 0.0475, "step": 5987 }, { "epoch": 0.9445912371337303, "grad_norm": 4.548971652984619, "learning_rate": 5.651263886652713e-08, "loss": 0.0427, "step": 5988 }, { "epoch": 0.9447489845013212, "grad_norm": 4.646120071411133, "learning_rate": 5.635163419739172e-08, "loss": 0.0509, "step": 5989 }, { "epoch": 0.9449067318689119, "grad_norm": 5.53933572769165, "learning_rate": 5.6190629528256314e-08, "loss": 0.0686, "step": 5990 }, { "epoch": 0.9450644792365027, "grad_norm": 6.857639312744141, "learning_rate": 5.6029624859120913e-08, "loss": 0.0714, "step": 5991 }, { "epoch": 0.9452222266040935, "grad_norm": 3.0770936012268066, "learning_rate": 5.5868620189985507e-08, "loss": 0.0294, "step": 5992 }, { "epoch": 0.9453799739716844, "grad_norm": 5.394449710845947, "learning_rate": 5.5707615520850106e-08, "loss": 0.0787, "step": 5993 }, { "epoch": 0.9455377213392752, "grad_norm": 6.045137882232666, "learning_rate": 5.55466108517147e-08, "loss": 0.0477, "step": 5994 }, { "epoch": 0.9456954687068659, "grad_norm": 7.0374603271484375, "learning_rate": 5.538560618257929e-08, "loss": 0.0668, "step": 5995 }, { "epoch": 0.9458532160744567, "grad_norm": 3.169250249862671, "learning_rate": 5.5224601513443885e-08, "loss": 0.0134, "step": 5996 }, { "epoch": 0.9460109634420476, "grad_norm": 3.9192862510681152, "learning_rate": 5.506359684430848e-08, "loss": 0.0398, "step": 5997 }, { "epoch": 0.9461687108096384, "grad_norm": 10.280298233032227, "learning_rate": 5.490259217517308e-08, "loss": 0.0552, "step": 5998 }, { "epoch": 0.9463264581772292, "grad_norm": 2.0093114376068115, "learning_rate": 5.474158750603768e-08, "loss": 0.011, "step": 5999 }, { "epoch": 0.9464842055448199, "grad_norm": 3.823469877243042, "learning_rate": 5.458058283690227e-08, "loss": 0.0554, "step": 6000 }, { "epoch": 0.9466419529124108, "grad_norm": 6.715463638305664, "learning_rate": 5.4419578167766864e-08, "loss": 0.0386, "step": 6001 }, { "epoch": 0.9467997002800016, "grad_norm": 2.5882365703582764, "learning_rate": 5.425857349863146e-08, "loss": 0.0243, "step": 6002 }, { "epoch": 0.9469574476475924, "grad_norm": 3.5130350589752197, "learning_rate": 5.409756882949605e-08, "loss": 0.027, "step": 6003 }, { "epoch": 0.9471151950151832, "grad_norm": 3.3332033157348633, "learning_rate": 5.3936564160360643e-08, "loss": 0.034, "step": 6004 }, { "epoch": 0.947272942382774, "grad_norm": 2.0243804454803467, "learning_rate": 5.377555949122524e-08, "loss": 0.0151, "step": 6005 }, { "epoch": 0.9474306897503648, "grad_norm": 4.765673637390137, "learning_rate": 5.361455482208984e-08, "loss": 0.0329, "step": 6006 }, { "epoch": 0.9475884371179556, "grad_norm": 9.25135612487793, "learning_rate": 5.3453550152954436e-08, "loss": 0.0903, "step": 6007 }, { "epoch": 0.9477461844855464, "grad_norm": 6.916815757751465, "learning_rate": 5.329254548381903e-08, "loss": 0.0375, "step": 6008 }, { "epoch": 0.9479039318531371, "grad_norm": 4.9020094871521, "learning_rate": 5.313154081468362e-08, "loss": 0.0186, "step": 6009 }, { "epoch": 0.948061679220728, "grad_norm": 5.087789058685303, "learning_rate": 5.2970536145548215e-08, "loss": 0.0336, "step": 6010 }, { "epoch": 0.9482194265883188, "grad_norm": 3.896568536758423, "learning_rate": 5.2809531476412815e-08, "loss": 0.0462, "step": 6011 }, { "epoch": 0.9483771739559096, "grad_norm": 2.7155940532684326, "learning_rate": 5.264852680727741e-08, "loss": 0.0225, "step": 6012 }, { "epoch": 0.9485349213235004, "grad_norm": 5.200143337249756, "learning_rate": 5.2487522138142e-08, "loss": 0.024, "step": 6013 }, { "epoch": 0.9486926686910913, "grad_norm": 6.716253757476807, "learning_rate": 5.23265174690066e-08, "loss": 0.0562, "step": 6014 }, { "epoch": 0.948850416058682, "grad_norm": 3.017744302749634, "learning_rate": 5.2165512799871194e-08, "loss": 0.0344, "step": 6015 }, { "epoch": 0.9490081634262728, "grad_norm": 5.271416664123535, "learning_rate": 5.200450813073579e-08, "loss": 0.0305, "step": 6016 }, { "epoch": 0.9491659107938636, "grad_norm": 4.114175319671631, "learning_rate": 5.1843503461600386e-08, "loss": 0.0245, "step": 6017 }, { "epoch": 0.9493236581614545, "grad_norm": 2.249285936355591, "learning_rate": 5.168249879246498e-08, "loss": 0.0172, "step": 6018 }, { "epoch": 0.9494814055290453, "grad_norm": 6.96615743637085, "learning_rate": 5.152149412332957e-08, "loss": 0.0578, "step": 6019 }, { "epoch": 0.949639152896636, "grad_norm": 4.631272792816162, "learning_rate": 5.1360489454194166e-08, "loss": 0.0302, "step": 6020 }, { "epoch": 0.9497969002642268, "grad_norm": 4.814763069152832, "learning_rate": 5.1199484785058765e-08, "loss": 0.0646, "step": 6021 }, { "epoch": 0.9499546476318177, "grad_norm": 3.347012996673584, "learning_rate": 5.103848011592336e-08, "loss": 0.022, "step": 6022 }, { "epoch": 0.9501123949994085, "grad_norm": 3.0205132961273193, "learning_rate": 5.087747544678796e-08, "loss": 0.0391, "step": 6023 }, { "epoch": 0.9502701423669992, "grad_norm": 5.37874698638916, "learning_rate": 5.071647077765255e-08, "loss": 0.0513, "step": 6024 }, { "epoch": 0.95042788973459, "grad_norm": 4.942624568939209, "learning_rate": 5.0555466108517144e-08, "loss": 0.0548, "step": 6025 }, { "epoch": 0.9505856371021808, "grad_norm": 4.22371768951416, "learning_rate": 5.039446143938174e-08, "loss": 0.0291, "step": 6026 }, { "epoch": 0.9507433844697717, "grad_norm": 4.233780860900879, "learning_rate": 5.023345677024633e-08, "loss": 0.0218, "step": 6027 }, { "epoch": 0.9509011318373625, "grad_norm": 7.730014801025391, "learning_rate": 5.007245210111093e-08, "loss": 0.0375, "step": 6028 }, { "epoch": 0.9510588792049532, "grad_norm": 3.876887559890747, "learning_rate": 4.991144743197553e-08, "loss": 0.0279, "step": 6029 }, { "epoch": 0.951216626572544, "grad_norm": 4.554478168487549, "learning_rate": 4.975044276284012e-08, "loss": 0.0443, "step": 6030 }, { "epoch": 0.9513743739401349, "grad_norm": 5.302878379821777, "learning_rate": 4.9589438093704716e-08, "loss": 0.0309, "step": 6031 }, { "epoch": 0.9515321213077257, "grad_norm": 26.972261428833008, "learning_rate": 4.942843342456931e-08, "loss": 0.0218, "step": 6032 }, { "epoch": 0.9516898686753165, "grad_norm": 2.710054636001587, "learning_rate": 4.92674287554339e-08, "loss": 0.0134, "step": 6033 }, { "epoch": 0.9518476160429072, "grad_norm": 2.822371482849121, "learning_rate": 4.91064240862985e-08, "loss": 0.0187, "step": 6034 }, { "epoch": 0.9520053634104981, "grad_norm": 3.546046018600464, "learning_rate": 4.8945419417163095e-08, "loss": 0.0231, "step": 6035 }, { "epoch": 0.9521631107780889, "grad_norm": 2.5862648487091064, "learning_rate": 4.878441474802769e-08, "loss": 0.021, "step": 6036 }, { "epoch": 0.9523208581456797, "grad_norm": 4.88242244720459, "learning_rate": 4.862341007889229e-08, "loss": 0.0493, "step": 6037 }, { "epoch": 0.9524786055132705, "grad_norm": 4.476553440093994, "learning_rate": 4.846240540975688e-08, "loss": 0.035, "step": 6038 }, { "epoch": 0.9526363528808613, "grad_norm": 3.3754684925079346, "learning_rate": 4.8301400740621474e-08, "loss": 0.0362, "step": 6039 }, { "epoch": 0.9527941002484521, "grad_norm": 2.319370746612549, "learning_rate": 4.8140396071486074e-08, "loss": 0.0339, "step": 6040 }, { "epoch": 0.9529518476160429, "grad_norm": 4.899821758270264, "learning_rate": 4.797939140235067e-08, "loss": 0.06, "step": 6041 }, { "epoch": 0.9531095949836337, "grad_norm": 3.4484035968780518, "learning_rate": 4.781838673321526e-08, "loss": 0.031, "step": 6042 }, { "epoch": 0.9532673423512246, "grad_norm": 11.35166072845459, "learning_rate": 4.765738206407985e-08, "loss": 0.0473, "step": 6043 }, { "epoch": 0.9534250897188153, "grad_norm": 8.086166381835938, "learning_rate": 4.749637739494445e-08, "loss": 0.0767, "step": 6044 }, { "epoch": 0.9535828370864061, "grad_norm": 3.089240550994873, "learning_rate": 4.7335372725809046e-08, "loss": 0.0167, "step": 6045 }, { "epoch": 0.9537405844539969, "grad_norm": 10.3706693649292, "learning_rate": 4.7174368056673645e-08, "loss": 0.0625, "step": 6046 }, { "epoch": 0.9538983318215877, "grad_norm": 5.132999420166016, "learning_rate": 4.701336338753824e-08, "loss": 0.0456, "step": 6047 }, { "epoch": 0.9540560791891786, "grad_norm": 4.445789813995361, "learning_rate": 4.685235871840283e-08, "loss": 0.0553, "step": 6048 }, { "epoch": 0.9542138265567693, "grad_norm": 5.825806617736816, "learning_rate": 4.6691354049267425e-08, "loss": 0.0214, "step": 6049 }, { "epoch": 0.9543715739243601, "grad_norm": 5.145627975463867, "learning_rate": 4.653034938013202e-08, "loss": 0.0293, "step": 6050 }, { "epoch": 0.9545293212919509, "grad_norm": 4.06556510925293, "learning_rate": 4.636934471099661e-08, "loss": 0.0218, "step": 6051 }, { "epoch": 0.9546870686595418, "grad_norm": 3.483142614364624, "learning_rate": 4.620834004186122e-08, "loss": 0.0395, "step": 6052 }, { "epoch": 0.9548448160271326, "grad_norm": 5.402103900909424, "learning_rate": 4.604733537272581e-08, "loss": 0.0573, "step": 6053 }, { "epoch": 0.9550025633947233, "grad_norm": 8.97451114654541, "learning_rate": 4.58863307035904e-08, "loss": 0.0466, "step": 6054 }, { "epoch": 0.9551603107623141, "grad_norm": 3.5415401458740234, "learning_rate": 4.5725326034454996e-08, "loss": 0.0236, "step": 6055 }, { "epoch": 0.955318058129905, "grad_norm": 5.875942230224609, "learning_rate": 4.556432136531959e-08, "loss": 0.0526, "step": 6056 }, { "epoch": 0.9554758054974958, "grad_norm": 10.333345413208008, "learning_rate": 4.540331669618419e-08, "loss": 0.0621, "step": 6057 }, { "epoch": 0.9556335528650866, "grad_norm": 4.999233722686768, "learning_rate": 4.524231202704878e-08, "loss": 0.0257, "step": 6058 }, { "epoch": 0.9557913002326773, "grad_norm": 16.694101333618164, "learning_rate": 4.5081307357913375e-08, "loss": 0.0475, "step": 6059 }, { "epoch": 0.9559490476002682, "grad_norm": 4.000310897827148, "learning_rate": 4.4920302688777975e-08, "loss": 0.0371, "step": 6060 }, { "epoch": 0.956106794967859, "grad_norm": 4.216620922088623, "learning_rate": 4.475929801964257e-08, "loss": 0.0647, "step": 6061 }, { "epoch": 0.9562645423354498, "grad_norm": 3.991231918334961, "learning_rate": 4.459829335050716e-08, "loss": 0.0342, "step": 6062 }, { "epoch": 0.9564222897030406, "grad_norm": 6.847813129425049, "learning_rate": 4.443728868137176e-08, "loss": 0.0357, "step": 6063 }, { "epoch": 0.9565800370706314, "grad_norm": 3.854752779006958, "learning_rate": 4.4276284012236354e-08, "loss": 0.0705, "step": 6064 }, { "epoch": 0.9567377844382222, "grad_norm": 4.1908745765686035, "learning_rate": 4.411527934310095e-08, "loss": 0.0351, "step": 6065 }, { "epoch": 0.956895531805813, "grad_norm": 6.255070686340332, "learning_rate": 4.395427467396554e-08, "loss": 0.0405, "step": 6066 }, { "epoch": 0.9570532791734038, "grad_norm": 4.168485164642334, "learning_rate": 4.379327000483014e-08, "loss": 0.0946, "step": 6067 }, { "epoch": 0.9572110265409945, "grad_norm": 6.225264549255371, "learning_rate": 4.363226533569473e-08, "loss": 0.059, "step": 6068 }, { "epoch": 0.9573687739085854, "grad_norm": 4.073039531707764, "learning_rate": 4.347126066655933e-08, "loss": 0.0436, "step": 6069 }, { "epoch": 0.9575265212761762, "grad_norm": 3.783607244491577, "learning_rate": 4.3310255997423926e-08, "loss": 0.0524, "step": 6070 }, { "epoch": 0.957684268643767, "grad_norm": 2.8810980319976807, "learning_rate": 4.314925132828852e-08, "loss": 0.0207, "step": 6071 }, { "epoch": 0.9578420160113578, "grad_norm": 3.771087408065796, "learning_rate": 4.298824665915311e-08, "loss": 0.0233, "step": 6072 }, { "epoch": 0.9579997633789487, "grad_norm": 3.440264940261841, "learning_rate": 4.2827241990017705e-08, "loss": 0.0235, "step": 6073 }, { "epoch": 0.9581575107465394, "grad_norm": 6.002028942108154, "learning_rate": 4.26662373208823e-08, "loss": 0.0176, "step": 6074 }, { "epoch": 0.9583152581141302, "grad_norm": 3.9315836429595947, "learning_rate": 4.2505232651746904e-08, "loss": 0.0219, "step": 6075 }, { "epoch": 0.958473005481721, "grad_norm": 5.8993144035339355, "learning_rate": 4.23442279826115e-08, "loss": 0.0366, "step": 6076 }, { "epoch": 0.9586307528493119, "grad_norm": 4.322342395782471, "learning_rate": 4.218322331347609e-08, "loss": 0.0327, "step": 6077 }, { "epoch": 0.9587885002169027, "grad_norm": 3.4553074836730957, "learning_rate": 4.2022218644340683e-08, "loss": 0.0229, "step": 6078 }, { "epoch": 0.9589462475844934, "grad_norm": 5.900453567504883, "learning_rate": 4.1861213975205277e-08, "loss": 0.0379, "step": 6079 }, { "epoch": 0.9591039949520842, "grad_norm": 4.2595906257629395, "learning_rate": 4.170020930606987e-08, "loss": 0.0671, "step": 6080 }, { "epoch": 0.9592617423196751, "grad_norm": 4.484315872192383, "learning_rate": 4.153920463693447e-08, "loss": 0.0657, "step": 6081 }, { "epoch": 0.9594194896872659, "grad_norm": 5.0750274658203125, "learning_rate": 4.137819996779906e-08, "loss": 0.0484, "step": 6082 }, { "epoch": 0.9595772370548566, "grad_norm": 3.9562439918518066, "learning_rate": 4.121719529866366e-08, "loss": 0.0203, "step": 6083 }, { "epoch": 0.9597349844224474, "grad_norm": 2.25351619720459, "learning_rate": 4.1056190629528255e-08, "loss": 0.0264, "step": 6084 }, { "epoch": 0.9598927317900382, "grad_norm": 4.723305702209473, "learning_rate": 4.089518596039285e-08, "loss": 0.0391, "step": 6085 }, { "epoch": 0.9600504791576291, "grad_norm": 4.14780855178833, "learning_rate": 4.073418129125744e-08, "loss": 0.0317, "step": 6086 }, { "epoch": 0.9602082265252199, "grad_norm": 6.291996955871582, "learning_rate": 4.057317662212204e-08, "loss": 0.0471, "step": 6087 }, { "epoch": 0.9603659738928106, "grad_norm": 6.449141979217529, "learning_rate": 4.0412171952986634e-08, "loss": 0.0746, "step": 6088 }, { "epoch": 0.9605237212604014, "grad_norm": 7.540985584259033, "learning_rate": 4.025116728385123e-08, "loss": 0.0899, "step": 6089 }, { "epoch": 0.9606814686279923, "grad_norm": 4.7788920402526855, "learning_rate": 4.009016261471583e-08, "loss": 0.0269, "step": 6090 }, { "epoch": 0.9608392159955831, "grad_norm": 3.3653805255889893, "learning_rate": 3.992915794558042e-08, "loss": 0.0301, "step": 6091 }, { "epoch": 0.9609969633631739, "grad_norm": 6.402235507965088, "learning_rate": 3.976815327644502e-08, "loss": 0.049, "step": 6092 }, { "epoch": 0.9611547107307646, "grad_norm": 5.099925518035889, "learning_rate": 3.960714860730961e-08, "loss": 0.0632, "step": 6093 }, { "epoch": 0.9613124580983555, "grad_norm": 3.989698648452759, "learning_rate": 3.9446143938174206e-08, "loss": 0.0402, "step": 6094 }, { "epoch": 0.9614702054659463, "grad_norm": 5.849874019622803, "learning_rate": 3.92851392690388e-08, "loss": 0.0483, "step": 6095 }, { "epoch": 0.9616279528335371, "grad_norm": 6.019822120666504, "learning_rate": 3.912413459990339e-08, "loss": 0.0846, "step": 6096 }, { "epoch": 0.9617857002011279, "grad_norm": 7.444940090179443, "learning_rate": 3.8963129930767985e-08, "loss": 0.0414, "step": 6097 }, { "epoch": 0.9619434475687187, "grad_norm": 4.172461986541748, "learning_rate": 3.880212526163259e-08, "loss": 0.0756, "step": 6098 }, { "epoch": 0.9621011949363095, "grad_norm": 4.624844074249268, "learning_rate": 3.8641120592497184e-08, "loss": 0.0413, "step": 6099 }, { "epoch": 0.9622589423039003, "grad_norm": 7.130695819854736, "learning_rate": 3.848011592336178e-08, "loss": 0.0825, "step": 6100 }, { "epoch": 0.9624166896714911, "grad_norm": 3.7383406162261963, "learning_rate": 3.831911125422637e-08, "loss": 0.0435, "step": 6101 }, { "epoch": 0.962574437039082, "grad_norm": 3.9024624824523926, "learning_rate": 3.8158106585090964e-08, "loss": 0.0307, "step": 6102 }, { "epoch": 0.9627321844066727, "grad_norm": 4.302260398864746, "learning_rate": 3.799710191595556e-08, "loss": 0.0398, "step": 6103 }, { "epoch": 0.9628899317742635, "grad_norm": 5.5469441413879395, "learning_rate": 3.7836097246820156e-08, "loss": 0.0922, "step": 6104 }, { "epoch": 0.9630476791418543, "grad_norm": 3.2649874687194824, "learning_rate": 3.767509257768475e-08, "loss": 0.0238, "step": 6105 }, { "epoch": 0.9632054265094451, "grad_norm": 5.246920585632324, "learning_rate": 3.751408790854935e-08, "loss": 0.0472, "step": 6106 }, { "epoch": 0.963363173877036, "grad_norm": 4.312435150146484, "learning_rate": 3.735308323941394e-08, "loss": 0.0311, "step": 6107 }, { "epoch": 0.9635209212446267, "grad_norm": 7.261901378631592, "learning_rate": 3.7192078570278535e-08, "loss": 0.0479, "step": 6108 }, { "epoch": 0.9636786686122175, "grad_norm": 6.1279988288879395, "learning_rate": 3.703107390114313e-08, "loss": 0.0531, "step": 6109 }, { "epoch": 0.9638364159798083, "grad_norm": 5.212375164031982, "learning_rate": 3.687006923200773e-08, "loss": 0.0665, "step": 6110 }, { "epoch": 0.9639941633473992, "grad_norm": 3.3017594814300537, "learning_rate": 3.670906456287232e-08, "loss": 0.0344, "step": 6111 }, { "epoch": 0.96415191071499, "grad_norm": 6.285422325134277, "learning_rate": 3.6548059893736914e-08, "loss": 0.0567, "step": 6112 }, { "epoch": 0.9643096580825807, "grad_norm": 6.064441680908203, "learning_rate": 3.6387055224601514e-08, "loss": 0.0473, "step": 6113 }, { "epoch": 0.9644674054501715, "grad_norm": 4.454376220703125, "learning_rate": 3.622605055546611e-08, "loss": 0.0173, "step": 6114 }, { "epoch": 0.9646251528177624, "grad_norm": 5.166300296783447, "learning_rate": 3.60650458863307e-08, "loss": 0.0396, "step": 6115 }, { "epoch": 0.9647829001853532, "grad_norm": 19.746580123901367, "learning_rate": 3.59040412171953e-08, "loss": 0.1012, "step": 6116 }, { "epoch": 0.964940647552944, "grad_norm": 3.5285303592681885, "learning_rate": 3.574303654805989e-08, "loss": 0.0267, "step": 6117 }, { "epoch": 0.9650983949205347, "grad_norm": 4.301242828369141, "learning_rate": 3.5582031878924486e-08, "loss": 0.0195, "step": 6118 }, { "epoch": 0.9652561422881256, "grad_norm": 6.223407745361328, "learning_rate": 3.542102720978908e-08, "loss": 0.0286, "step": 6119 }, { "epoch": 0.9654138896557164, "grad_norm": 6.488699436187744, "learning_rate": 3.526002254065367e-08, "loss": 0.0337, "step": 6120 }, { "epoch": 0.9655716370233072, "grad_norm": 1.9815067052841187, "learning_rate": 3.509901787151828e-08, "loss": 0.0139, "step": 6121 }, { "epoch": 0.965729384390898, "grad_norm": 6.164742946624756, "learning_rate": 3.493801320238287e-08, "loss": 0.0587, "step": 6122 }, { "epoch": 0.9658871317584887, "grad_norm": 1.9928150177001953, "learning_rate": 3.4777008533247465e-08, "loss": 0.0176, "step": 6123 }, { "epoch": 0.9660448791260796, "grad_norm": 9.271769523620605, "learning_rate": 3.461600386411206e-08, "loss": 0.0497, "step": 6124 }, { "epoch": 0.9662026264936704, "grad_norm": 5.085504055023193, "learning_rate": 3.445499919497665e-08, "loss": 0.0592, "step": 6125 }, { "epoch": 0.9663603738612612, "grad_norm": 3.6184051036834717, "learning_rate": 3.4293994525841244e-08, "loss": 0.0758, "step": 6126 }, { "epoch": 0.966518121228852, "grad_norm": 6.860569000244141, "learning_rate": 3.4132989856705844e-08, "loss": 0.0336, "step": 6127 }, { "epoch": 0.9666758685964428, "grad_norm": 3.6731934547424316, "learning_rate": 3.3971985187570437e-08, "loss": 0.0332, "step": 6128 }, { "epoch": 0.9668336159640336, "grad_norm": 7.188792705535889, "learning_rate": 3.3810980518435036e-08, "loss": 0.0437, "step": 6129 }, { "epoch": 0.9669913633316244, "grad_norm": 4.120716571807861, "learning_rate": 3.364997584929963e-08, "loss": 0.0531, "step": 6130 }, { "epoch": 0.9671491106992152, "grad_norm": 2.426354169845581, "learning_rate": 3.348897118016422e-08, "loss": 0.0112, "step": 6131 }, { "epoch": 0.967306858066806, "grad_norm": 6.214199066162109, "learning_rate": 3.3327966511028816e-08, "loss": 0.0596, "step": 6132 }, { "epoch": 0.9674646054343968, "grad_norm": 3.4293365478515625, "learning_rate": 3.3166961841893415e-08, "loss": 0.0193, "step": 6133 }, { "epoch": 0.9676223528019876, "grad_norm": 2.90401291847229, "learning_rate": 3.300595717275801e-08, "loss": 0.013, "step": 6134 }, { "epoch": 0.9677801001695784, "grad_norm": 9.00876522064209, "learning_rate": 3.28449525036226e-08, "loss": 0.0345, "step": 6135 }, { "epoch": 0.9679378475371693, "grad_norm": 2.3712689876556396, "learning_rate": 3.26839478344872e-08, "loss": 0.0312, "step": 6136 }, { "epoch": 0.96809559490476, "grad_norm": 7.52162504196167, "learning_rate": 3.2522943165351794e-08, "loss": 0.1005, "step": 6137 }, { "epoch": 0.9682533422723508, "grad_norm": 5.890244007110596, "learning_rate": 3.236193849621639e-08, "loss": 0.0537, "step": 6138 }, { "epoch": 0.9684110896399416, "grad_norm": 5.929839134216309, "learning_rate": 3.220093382708099e-08, "loss": 0.0804, "step": 6139 }, { "epoch": 0.9685688370075325, "grad_norm": 2.614980936050415, "learning_rate": 3.203992915794558e-08, "loss": 0.0131, "step": 6140 }, { "epoch": 0.9687265843751233, "grad_norm": 5.683056354522705, "learning_rate": 3.187892448881017e-08, "loss": 0.0567, "step": 6141 }, { "epoch": 0.968884331742714, "grad_norm": 2.9339380264282227, "learning_rate": 3.1717919819674766e-08, "loss": 0.0213, "step": 6142 }, { "epoch": 0.9690420791103048, "grad_norm": 5.3598222732543945, "learning_rate": 3.155691515053936e-08, "loss": 0.0229, "step": 6143 }, { "epoch": 0.9691998264778956, "grad_norm": 4.69875431060791, "learning_rate": 3.139591048140396e-08, "loss": 0.0512, "step": 6144 }, { "epoch": 0.9691998264778956, "eval_accuracy": 0.9882576224396736, "eval_f1": 0.9882576224396736, "eval_loss": 0.038487568497657776, "eval_runtime": 4708.4491, "eval_samples_per_second": 43.083, "eval_steps_per_second": 2.693, "step": 6144 }, { "epoch": 0.9693575738454865, "grad_norm": 7.932188987731934, "learning_rate": 3.123490581226855e-08, "loss": 0.0437, "step": 6145 }, { "epoch": 0.9695153212130773, "grad_norm": 4.773967742919922, "learning_rate": 3.107390114313315e-08, "loss": 0.0307, "step": 6146 }, { "epoch": 0.969673068580668, "grad_norm": 4.265565395355225, "learning_rate": 3.0912896473997745e-08, "loss": 0.0284, "step": 6147 }, { "epoch": 0.9698308159482588, "grad_norm": 7.66091775894165, "learning_rate": 3.075189180486234e-08, "loss": 0.0528, "step": 6148 }, { "epoch": 0.9699885633158497, "grad_norm": 2.410403251647949, "learning_rate": 3.059088713572694e-08, "loss": 0.0158, "step": 6149 }, { "epoch": 0.9701463106834405, "grad_norm": 5.615338325500488, "learning_rate": 3.042988246659153e-08, "loss": 0.0306, "step": 6150 }, { "epoch": 0.9703040580510313, "grad_norm": 3.4606375694274902, "learning_rate": 3.0268877797456124e-08, "loss": 0.0633, "step": 6151 }, { "epoch": 0.970461805418622, "grad_norm": 5.322042942047119, "learning_rate": 3.0107873128320724e-08, "loss": 0.0615, "step": 6152 }, { "epoch": 0.9706195527862129, "grad_norm": 15.94222640991211, "learning_rate": 2.9946868459185317e-08, "loss": 0.0809, "step": 6153 }, { "epoch": 0.9707773001538037, "grad_norm": 1.9718109369277954, "learning_rate": 2.978586379004991e-08, "loss": 0.0109, "step": 6154 }, { "epoch": 0.9709350475213945, "grad_norm": 6.080210208892822, "learning_rate": 2.9624859120914506e-08, "loss": 0.0287, "step": 6155 }, { "epoch": 0.9710927948889853, "grad_norm": 4.3839569091796875, "learning_rate": 2.94638544517791e-08, "loss": 0.0642, "step": 6156 }, { "epoch": 0.9712505422565761, "grad_norm": 6.569242000579834, "learning_rate": 2.9302849782643696e-08, "loss": 0.0574, "step": 6157 }, { "epoch": 0.9714082896241669, "grad_norm": 4.099246978759766, "learning_rate": 2.9141845113508292e-08, "loss": 0.0293, "step": 6158 }, { "epoch": 0.9715660369917577, "grad_norm": 4.485950946807861, "learning_rate": 2.8980840444372885e-08, "loss": 0.0284, "step": 6159 }, { "epoch": 0.9717237843593485, "grad_norm": 2.693380355834961, "learning_rate": 2.8819835775237478e-08, "loss": 0.0228, "step": 6160 }, { "epoch": 0.9718815317269393, "grad_norm": 7.15558385848999, "learning_rate": 2.8658831106102078e-08, "loss": 0.0478, "step": 6161 }, { "epoch": 0.9720392790945301, "grad_norm": 2.9982993602752686, "learning_rate": 2.849782643696667e-08, "loss": 0.0467, "step": 6162 }, { "epoch": 0.9721970264621209, "grad_norm": 9.508600234985352, "learning_rate": 2.8336821767831264e-08, "loss": 0.0535, "step": 6163 }, { "epoch": 0.9723547738297117, "grad_norm": 1.484305739402771, "learning_rate": 2.817581709869586e-08, "loss": 0.0108, "step": 6164 }, { "epoch": 0.9725125211973025, "grad_norm": 7.577390670776367, "learning_rate": 2.8014812429560457e-08, "loss": 0.0834, "step": 6165 }, { "epoch": 0.9726702685648934, "grad_norm": 3.675633192062378, "learning_rate": 2.7853807760425053e-08, "loss": 0.0142, "step": 6166 }, { "epoch": 0.9728280159324841, "grad_norm": 4.311420440673828, "learning_rate": 2.7692803091289646e-08, "loss": 0.0394, "step": 6167 }, { "epoch": 0.9729857633000749, "grad_norm": 3.139410972595215, "learning_rate": 2.753179842215424e-08, "loss": 0.0108, "step": 6168 }, { "epoch": 0.9731435106676657, "grad_norm": 3.7535455226898193, "learning_rate": 2.737079375301884e-08, "loss": 0.0435, "step": 6169 }, { "epoch": 0.9733012580352566, "grad_norm": 4.458008766174316, "learning_rate": 2.7209789083883432e-08, "loss": 0.0391, "step": 6170 }, { "epoch": 0.9734590054028474, "grad_norm": 5.988290786743164, "learning_rate": 2.7048784414748025e-08, "loss": 0.0514, "step": 6171 }, { "epoch": 0.9736167527704381, "grad_norm": 6.685472011566162, "learning_rate": 2.688777974561262e-08, "loss": 0.0549, "step": 6172 }, { "epoch": 0.9737745001380289, "grad_norm": 4.843076705932617, "learning_rate": 2.6726775076477218e-08, "loss": 0.0521, "step": 6173 }, { "epoch": 0.9739322475056198, "grad_norm": 4.8866753578186035, "learning_rate": 2.656577040734181e-08, "loss": 0.0232, "step": 6174 }, { "epoch": 0.9740899948732106, "grad_norm": 7.195474624633789, "learning_rate": 2.6404765738206407e-08, "loss": 0.0383, "step": 6175 }, { "epoch": 0.9742477422408014, "grad_norm": 5.306886672973633, "learning_rate": 2.6243761069071e-08, "loss": 0.0575, "step": 6176 }, { "epoch": 0.9744054896083921, "grad_norm": 5.701973915100098, "learning_rate": 2.6082756399935597e-08, "loss": 0.0962, "step": 6177 }, { "epoch": 0.974563236975983, "grad_norm": 5.497990131378174, "learning_rate": 2.5921751730800193e-08, "loss": 0.0427, "step": 6178 }, { "epoch": 0.9747209843435738, "grad_norm": 3.343956708908081, "learning_rate": 2.5760747061664786e-08, "loss": 0.0153, "step": 6179 }, { "epoch": 0.9748787317111646, "grad_norm": 4.167031288146973, "learning_rate": 2.5599742392529383e-08, "loss": 0.0753, "step": 6180 }, { "epoch": 0.9750364790787553, "grad_norm": 2.9136710166931152, "learning_rate": 2.543873772339398e-08, "loss": 0.0248, "step": 6181 }, { "epoch": 0.9751942264463461, "grad_norm": 7.567819118499756, "learning_rate": 2.5277733054258572e-08, "loss": 0.0817, "step": 6182 }, { "epoch": 0.975351973813937, "grad_norm": 3.8892838954925537, "learning_rate": 2.5116728385123165e-08, "loss": 0.0153, "step": 6183 }, { "epoch": 0.9755097211815278, "grad_norm": 4.018087387084961, "learning_rate": 2.4955723715987765e-08, "loss": 0.0217, "step": 6184 }, { "epoch": 0.9756674685491186, "grad_norm": 5.014780521392822, "learning_rate": 2.4794719046852358e-08, "loss": 0.0355, "step": 6185 }, { "epoch": 0.9758252159167093, "grad_norm": 5.33723258972168, "learning_rate": 2.463371437771695e-08, "loss": 0.049, "step": 6186 }, { "epoch": 0.9759829632843002, "grad_norm": 2.67022967338562, "learning_rate": 2.4472709708581547e-08, "loss": 0.0454, "step": 6187 }, { "epoch": 0.976140710651891, "grad_norm": 1.7317997217178345, "learning_rate": 2.4311705039446144e-08, "loss": 0.0114, "step": 6188 }, { "epoch": 0.9762984580194818, "grad_norm": 5.719967365264893, "learning_rate": 2.4150700370310737e-08, "loss": 0.0529, "step": 6189 }, { "epoch": 0.9764562053870726, "grad_norm": 5.98928689956665, "learning_rate": 2.3989695701175333e-08, "loss": 0.0404, "step": 6190 }, { "epoch": 0.9766139527546635, "grad_norm": 3.8387529850006104, "learning_rate": 2.3828691032039926e-08, "loss": 0.0513, "step": 6191 }, { "epoch": 0.9767717001222542, "grad_norm": 2.3278558254241943, "learning_rate": 2.3667686362904523e-08, "loss": 0.0174, "step": 6192 }, { "epoch": 0.976929447489845, "grad_norm": 5.587123394012451, "learning_rate": 2.350668169376912e-08, "loss": 0.0136, "step": 6193 }, { "epoch": 0.9770871948574358, "grad_norm": 2.9909167289733887, "learning_rate": 2.3345677024633712e-08, "loss": 0.0427, "step": 6194 }, { "epoch": 0.9772449422250267, "grad_norm": 5.721266746520996, "learning_rate": 2.3184672355498305e-08, "loss": 0.062, "step": 6195 }, { "epoch": 0.9774026895926174, "grad_norm": 7.145239353179932, "learning_rate": 2.3023667686362905e-08, "loss": 0.044, "step": 6196 }, { "epoch": 0.9775604369602082, "grad_norm": 5.093533515930176, "learning_rate": 2.2862663017227498e-08, "loss": 0.0513, "step": 6197 }, { "epoch": 0.977718184327799, "grad_norm": 3.0863187313079834, "learning_rate": 2.2701658348092095e-08, "loss": 0.0454, "step": 6198 }, { "epoch": 0.9778759316953899, "grad_norm": 6.698001384735107, "learning_rate": 2.2540653678956688e-08, "loss": 0.0363, "step": 6199 }, { "epoch": 0.9780336790629807, "grad_norm": 3.7999961376190186, "learning_rate": 2.2379649009821284e-08, "loss": 0.0198, "step": 6200 }, { "epoch": 0.9781914264305714, "grad_norm": 6.384427070617676, "learning_rate": 2.221864434068588e-08, "loss": 0.0518, "step": 6201 }, { "epoch": 0.9783491737981622, "grad_norm": 3.1141104698181152, "learning_rate": 2.2057639671550473e-08, "loss": 0.012, "step": 6202 }, { "epoch": 0.978506921165753, "grad_norm": 4.341085910797119, "learning_rate": 2.189663500241507e-08, "loss": 0.0257, "step": 6203 }, { "epoch": 0.9786646685333439, "grad_norm": 6.542384624481201, "learning_rate": 2.1735630333279666e-08, "loss": 0.0359, "step": 6204 }, { "epoch": 0.9788224159009347, "grad_norm": 3.0185017585754395, "learning_rate": 2.157462566414426e-08, "loss": 0.0473, "step": 6205 }, { "epoch": 0.9789801632685254, "grad_norm": 6.168667793273926, "learning_rate": 2.1413620995008852e-08, "loss": 0.0841, "step": 6206 }, { "epoch": 0.9791379106361162, "grad_norm": 2.684790849685669, "learning_rate": 2.1252616325873452e-08, "loss": 0.0145, "step": 6207 }, { "epoch": 0.9792956580037071, "grad_norm": 6.765401840209961, "learning_rate": 2.1091611656738045e-08, "loss": 0.0161, "step": 6208 }, { "epoch": 0.9794534053712979, "grad_norm": 4.448896884918213, "learning_rate": 2.0930606987602638e-08, "loss": 0.0366, "step": 6209 }, { "epoch": 0.9796111527388887, "grad_norm": 4.182690620422363, "learning_rate": 2.0769602318467235e-08, "loss": 0.0153, "step": 6210 }, { "epoch": 0.9797689001064794, "grad_norm": 5.692211627960205, "learning_rate": 2.060859764933183e-08, "loss": 0.0534, "step": 6211 }, { "epoch": 0.9799266474740703, "grad_norm": 5.768784523010254, "learning_rate": 2.0447592980196424e-08, "loss": 0.0538, "step": 6212 }, { "epoch": 0.9800843948416611, "grad_norm": 4.9965596199035645, "learning_rate": 2.028658831106102e-08, "loss": 0.0812, "step": 6213 }, { "epoch": 0.9802421422092519, "grad_norm": 5.822160243988037, "learning_rate": 2.0125583641925614e-08, "loss": 0.0514, "step": 6214 }, { "epoch": 0.9803998895768427, "grad_norm": 9.266775131225586, "learning_rate": 1.996457897279021e-08, "loss": 0.0384, "step": 6215 }, { "epoch": 0.9805576369444335, "grad_norm": 5.855165004730225, "learning_rate": 1.9803574303654806e-08, "loss": 0.0849, "step": 6216 }, { "epoch": 0.9807153843120243, "grad_norm": 4.29231071472168, "learning_rate": 1.96425696345194e-08, "loss": 0.019, "step": 6217 }, { "epoch": 0.9808731316796151, "grad_norm": 5.040156841278076, "learning_rate": 1.9481564965383993e-08, "loss": 0.0281, "step": 6218 }, { "epoch": 0.9810308790472059, "grad_norm": 3.730966806411743, "learning_rate": 1.9320560296248592e-08, "loss": 0.0219, "step": 6219 }, { "epoch": 0.9811886264147966, "grad_norm": 3.881930112838745, "learning_rate": 1.9159555627113185e-08, "loss": 0.0335, "step": 6220 }, { "epoch": 0.9813463737823875, "grad_norm": 6.038278102874756, "learning_rate": 1.899855095797778e-08, "loss": 0.098, "step": 6221 }, { "epoch": 0.9815041211499783, "grad_norm": 3.6147210597991943, "learning_rate": 1.8837546288842375e-08, "loss": 0.0243, "step": 6222 }, { "epoch": 0.9816618685175691, "grad_norm": 5.513983726501465, "learning_rate": 1.867654161970697e-08, "loss": 0.076, "step": 6223 }, { "epoch": 0.9818196158851599, "grad_norm": 7.426619529724121, "learning_rate": 1.8515536950571564e-08, "loss": 0.0333, "step": 6224 }, { "epoch": 0.9819773632527508, "grad_norm": 3.414789915084839, "learning_rate": 1.835453228143616e-08, "loss": 0.0503, "step": 6225 }, { "epoch": 0.9821351106203415, "grad_norm": 9.266975402832031, "learning_rate": 1.8193527612300757e-08, "loss": 0.043, "step": 6226 }, { "epoch": 0.9822928579879323, "grad_norm": 3.6152501106262207, "learning_rate": 1.803252294316535e-08, "loss": 0.0377, "step": 6227 }, { "epoch": 0.9824506053555231, "grad_norm": 2.5475409030914307, "learning_rate": 1.7871518274029946e-08, "loss": 0.0451, "step": 6228 }, { "epoch": 0.982608352723114, "grad_norm": 2.931168556213379, "learning_rate": 1.771051360489454e-08, "loss": 0.0191, "step": 6229 }, { "epoch": 0.9827661000907048, "grad_norm": 1.9964200258255005, "learning_rate": 1.754950893575914e-08, "loss": 0.0101, "step": 6230 }, { "epoch": 0.9829238474582955, "grad_norm": 5.205655097961426, "learning_rate": 1.7388504266623732e-08, "loss": 0.0309, "step": 6231 }, { "epoch": 0.9830815948258863, "grad_norm": 3.76123046875, "learning_rate": 1.7227499597488325e-08, "loss": 0.0293, "step": 6232 }, { "epoch": 0.9832393421934772, "grad_norm": 4.543227672576904, "learning_rate": 1.7066494928352922e-08, "loss": 0.0499, "step": 6233 }, { "epoch": 0.983397089561068, "grad_norm": 6.56539249420166, "learning_rate": 1.6905490259217518e-08, "loss": 0.0418, "step": 6234 }, { "epoch": 0.9835548369286587, "grad_norm": 3.8159878253936768, "learning_rate": 1.674448559008211e-08, "loss": 0.0395, "step": 6235 }, { "epoch": 0.9837125842962495, "grad_norm": 4.285485744476318, "learning_rate": 1.6583480920946708e-08, "loss": 0.0381, "step": 6236 }, { "epoch": 0.9838703316638404, "grad_norm": 4.0999627113342285, "learning_rate": 1.64224762518113e-08, "loss": 0.04, "step": 6237 }, { "epoch": 0.9840280790314312, "grad_norm": 5.697142601013184, "learning_rate": 1.6261471582675897e-08, "loss": 0.038, "step": 6238 }, { "epoch": 0.984185826399022, "grad_norm": 3.3728394508361816, "learning_rate": 1.6100466913540494e-08, "loss": 0.0185, "step": 6239 }, { "epoch": 0.9843435737666127, "grad_norm": 4.262262344360352, "learning_rate": 1.5939462244405087e-08, "loss": 0.0318, "step": 6240 }, { "epoch": 0.9845013211342035, "grad_norm": 5.060286998748779, "learning_rate": 1.577845757526968e-08, "loss": 0.0268, "step": 6241 }, { "epoch": 0.9846590685017944, "grad_norm": 8.937932968139648, "learning_rate": 1.5617452906134276e-08, "loss": 0.0384, "step": 6242 }, { "epoch": 0.9848168158693852, "grad_norm": 5.5496110916137695, "learning_rate": 1.5456448236998872e-08, "loss": 0.0177, "step": 6243 }, { "epoch": 0.984974563236976, "grad_norm": 2.7683727741241455, "learning_rate": 1.529544356786347e-08, "loss": 0.0172, "step": 6244 }, { "epoch": 0.9851323106045667, "grad_norm": 4.80905294418335, "learning_rate": 1.5134438898728062e-08, "loss": 0.0532, "step": 6245 }, { "epoch": 0.9852900579721576, "grad_norm": 6.039252281188965, "learning_rate": 1.4973434229592658e-08, "loss": 0.0267, "step": 6246 }, { "epoch": 0.9854478053397484, "grad_norm": 5.504446983337402, "learning_rate": 1.4812429560457253e-08, "loss": 0.0386, "step": 6247 }, { "epoch": 0.9856055527073392, "grad_norm": 3.770838975906372, "learning_rate": 1.4651424891321848e-08, "loss": 0.0688, "step": 6248 }, { "epoch": 0.98576330007493, "grad_norm": 2.9576942920684814, "learning_rate": 1.4490420222186443e-08, "loss": 0.0292, "step": 6249 }, { "epoch": 0.9859210474425208, "grad_norm": 2.8827409744262695, "learning_rate": 1.4329415553051039e-08, "loss": 0.0583, "step": 6250 }, { "epoch": 0.9860787948101116, "grad_norm": 5.529241561889648, "learning_rate": 1.4168410883915632e-08, "loss": 0.0552, "step": 6251 }, { "epoch": 0.9862365421777024, "grad_norm": 7.1768293380737305, "learning_rate": 1.4007406214780228e-08, "loss": 0.0735, "step": 6252 }, { "epoch": 0.9863942895452932, "grad_norm": 5.332305431365967, "learning_rate": 1.3846401545644823e-08, "loss": 0.0702, "step": 6253 }, { "epoch": 0.9865520369128841, "grad_norm": 4.009863376617432, "learning_rate": 1.368539687650942e-08, "loss": 0.0185, "step": 6254 }, { "epoch": 0.9867097842804748, "grad_norm": 9.313681602478027, "learning_rate": 1.3524392207374013e-08, "loss": 0.0651, "step": 6255 }, { "epoch": 0.9868675316480656, "grad_norm": 4.814515113830566, "learning_rate": 1.3363387538238609e-08, "loss": 0.0566, "step": 6256 }, { "epoch": 0.9870252790156564, "grad_norm": 1.7556432485580444, "learning_rate": 1.3202382869103204e-08, "loss": 0.0116, "step": 6257 }, { "epoch": 0.9871830263832472, "grad_norm": 3.093322515487671, "learning_rate": 1.3041378199967798e-08, "loss": 0.0229, "step": 6258 }, { "epoch": 0.9873407737508381, "grad_norm": 5.676116466522217, "learning_rate": 1.2880373530832393e-08, "loss": 0.0502, "step": 6259 }, { "epoch": 0.9874985211184288, "grad_norm": 4.5010857582092285, "learning_rate": 1.271936886169699e-08, "loss": 0.0453, "step": 6260 }, { "epoch": 0.9876562684860196, "grad_norm": 5.022231101989746, "learning_rate": 1.2558364192561583e-08, "loss": 0.0413, "step": 6261 }, { "epoch": 0.9878140158536104, "grad_norm": 2.8914976119995117, "learning_rate": 1.2397359523426179e-08, "loss": 0.0098, "step": 6262 }, { "epoch": 0.9879717632212013, "grad_norm": 4.519913196563721, "learning_rate": 1.2236354854290774e-08, "loss": 0.0255, "step": 6263 }, { "epoch": 0.9881295105887921, "grad_norm": 3.478266477584839, "learning_rate": 1.2075350185155368e-08, "loss": 0.0163, "step": 6264 }, { "epoch": 0.9882872579563828, "grad_norm": 5.429773807525635, "learning_rate": 1.1914345516019963e-08, "loss": 0.0466, "step": 6265 }, { "epoch": 0.9884450053239736, "grad_norm": 5.458247184753418, "learning_rate": 1.175334084688456e-08, "loss": 0.0484, "step": 6266 }, { "epoch": 0.9886027526915645, "grad_norm": 4.081238269805908, "learning_rate": 1.1592336177749153e-08, "loss": 0.0208, "step": 6267 }, { "epoch": 0.9887605000591553, "grad_norm": 6.506401062011719, "learning_rate": 1.1431331508613749e-08, "loss": 0.0308, "step": 6268 }, { "epoch": 0.988918247426746, "grad_norm": 5.5140838623046875, "learning_rate": 1.1270326839478344e-08, "loss": 0.0566, "step": 6269 }, { "epoch": 0.9890759947943368, "grad_norm": 6.035703659057617, "learning_rate": 1.110932217034294e-08, "loss": 0.0482, "step": 6270 }, { "epoch": 0.9892337421619277, "grad_norm": 5.993376731872559, "learning_rate": 1.0948317501207535e-08, "loss": 0.0602, "step": 6271 }, { "epoch": 0.9893914895295185, "grad_norm": 4.419226169586182, "learning_rate": 1.078731283207213e-08, "loss": 0.0416, "step": 6272 }, { "epoch": 0.9895492368971093, "grad_norm": 4.298280239105225, "learning_rate": 1.0626308162936726e-08, "loss": 0.0289, "step": 6273 }, { "epoch": 0.9897069842647, "grad_norm": 3.5037503242492676, "learning_rate": 1.0465303493801319e-08, "loss": 0.0404, "step": 6274 }, { "epoch": 0.9898647316322909, "grad_norm": 5.784923553466797, "learning_rate": 1.0304298824665916e-08, "loss": 0.0409, "step": 6275 }, { "epoch": 0.9900224789998817, "grad_norm": 3.270423173904419, "learning_rate": 1.014329415553051e-08, "loss": 0.0308, "step": 6276 }, { "epoch": 0.9901802263674725, "grad_norm": 3.4138495922088623, "learning_rate": 9.982289486395105e-09, "loss": 0.016, "step": 6277 }, { "epoch": 0.9903379737350633, "grad_norm": 7.211464881896973, "learning_rate": 9.8212848172597e-09, "loss": 0.0435, "step": 6278 }, { "epoch": 0.990495721102654, "grad_norm": 5.604741096496582, "learning_rate": 9.660280148124296e-09, "loss": 0.0377, "step": 6279 }, { "epoch": 0.9906534684702449, "grad_norm": 5.227348327636719, "learning_rate": 9.49927547898889e-09, "loss": 0.0307, "step": 6280 }, { "epoch": 0.9908112158378357, "grad_norm": 4.079761505126953, "learning_rate": 9.338270809853486e-09, "loss": 0.0744, "step": 6281 }, { "epoch": 0.9909689632054265, "grad_norm": 7.002961158752441, "learning_rate": 9.17726614071808e-09, "loss": 0.0492, "step": 6282 }, { "epoch": 0.9911267105730173, "grad_norm": 6.92935037612915, "learning_rate": 9.016261471582675e-09, "loss": 0.0401, "step": 6283 }, { "epoch": 0.9912844579406082, "grad_norm": 3.1296815872192383, "learning_rate": 8.85525680244727e-09, "loss": 0.0186, "step": 6284 }, { "epoch": 0.9914422053081989, "grad_norm": 4.725761890411377, "learning_rate": 8.694252133311866e-09, "loss": 0.038, "step": 6285 }, { "epoch": 0.9915999526757897, "grad_norm": 8.60621452331543, "learning_rate": 8.533247464176461e-09, "loss": 0.0718, "step": 6286 }, { "epoch": 0.9917577000433805, "grad_norm": 2.2828660011291504, "learning_rate": 8.372242795041056e-09, "loss": 0.0181, "step": 6287 }, { "epoch": 0.9919154474109714, "grad_norm": 5.0544867515563965, "learning_rate": 8.21123812590565e-09, "loss": 0.0244, "step": 6288 }, { "epoch": 0.9920731947785622, "grad_norm": 8.647851943969727, "learning_rate": 8.050233456770247e-09, "loss": 0.0428, "step": 6289 }, { "epoch": 0.9922309421461529, "grad_norm": 2.197507858276367, "learning_rate": 7.88922878763484e-09, "loss": 0.0136, "step": 6290 }, { "epoch": 0.9923886895137437, "grad_norm": 3.088834524154663, "learning_rate": 7.728224118499436e-09, "loss": 0.0199, "step": 6291 }, { "epoch": 0.9925464368813346, "grad_norm": 5.041046619415283, "learning_rate": 7.567219449364031e-09, "loss": 0.0793, "step": 6292 }, { "epoch": 0.9927041842489254, "grad_norm": 4.296901702880859, "learning_rate": 7.4062147802286265e-09, "loss": 0.0247, "step": 6293 }, { "epoch": 0.9928619316165161, "grad_norm": 5.000230312347412, "learning_rate": 7.245210111093221e-09, "loss": 0.0306, "step": 6294 }, { "epoch": 0.9930196789841069, "grad_norm": 4.815591812133789, "learning_rate": 7.084205441957816e-09, "loss": 0.0305, "step": 6295 }, { "epoch": 0.9931774263516977, "grad_norm": 2.091221809387207, "learning_rate": 6.9232007728224115e-09, "loss": 0.0372, "step": 6296 }, { "epoch": 0.9933351737192886, "grad_norm": 7.173551559448242, "learning_rate": 6.762196103687006e-09, "loss": 0.0485, "step": 6297 }, { "epoch": 0.9934929210868794, "grad_norm": 9.944113731384277, "learning_rate": 6.601191434551602e-09, "loss": 0.1298, "step": 6298 }, { "epoch": 0.9936506684544701, "grad_norm": 2.1898152828216553, "learning_rate": 6.4401867654161966e-09, "loss": 0.0382, "step": 6299 }, { "epoch": 0.9938084158220609, "grad_norm": 7.77823543548584, "learning_rate": 6.279182096280791e-09, "loss": 0.0489, "step": 6300 }, { "epoch": 0.9939661631896518, "grad_norm": 1.7604883909225464, "learning_rate": 6.118177427145387e-09, "loss": 0.0093, "step": 6301 }, { "epoch": 0.9941239105572426, "grad_norm": 6.472969055175781, "learning_rate": 5.957172758009982e-09, "loss": 0.037, "step": 6302 }, { "epoch": 0.9942816579248334, "grad_norm": 4.241221904754639, "learning_rate": 5.796168088874576e-09, "loss": 0.0185, "step": 6303 }, { "epoch": 0.9944394052924241, "grad_norm": 4.061306476593018, "learning_rate": 5.635163419739172e-09, "loss": 0.0231, "step": 6304 }, { "epoch": 0.994597152660015, "grad_norm": 5.12284517288208, "learning_rate": 5.4741587506037675e-09, "loss": 0.0429, "step": 6305 }, { "epoch": 0.9947549000276058, "grad_norm": 5.1497368812561035, "learning_rate": 5.313154081468363e-09, "loss": 0.0668, "step": 6306 }, { "epoch": 0.9949126473951966, "grad_norm": 8.419878005981445, "learning_rate": 5.152149412332958e-09, "loss": 0.0274, "step": 6307 }, { "epoch": 0.9950703947627874, "grad_norm": 5.198375225067139, "learning_rate": 4.9911447431975525e-09, "loss": 0.0306, "step": 6308 }, { "epoch": 0.9952281421303782, "grad_norm": 2.8158767223358154, "learning_rate": 4.830140074062148e-09, "loss": 0.0337, "step": 6309 }, { "epoch": 0.995385889497969, "grad_norm": 3.8012020587921143, "learning_rate": 4.669135404926743e-09, "loss": 0.0688, "step": 6310 }, { "epoch": 0.9955436368655598, "grad_norm": 4.970787525177002, "learning_rate": 4.5081307357913375e-09, "loss": 0.0504, "step": 6311 }, { "epoch": 0.9957013842331506, "grad_norm": 4.551598072052002, "learning_rate": 4.347126066655933e-09, "loss": 0.0289, "step": 6312 }, { "epoch": 0.9958591316007415, "grad_norm": 4.775918006896973, "learning_rate": 4.186121397520528e-09, "loss": 0.0391, "step": 6313 }, { "epoch": 0.9960168789683322, "grad_norm": 18.21245765686035, "learning_rate": 4.025116728385123e-09, "loss": 0.0501, "step": 6314 }, { "epoch": 0.996174626335923, "grad_norm": 5.041269779205322, "learning_rate": 3.864112059249718e-09, "loss": 0.0385, "step": 6315 }, { "epoch": 0.9963323737035138, "grad_norm": 5.841076850891113, "learning_rate": 3.7031073901143133e-09, "loss": 0.0392, "step": 6316 }, { "epoch": 0.9964901210711046, "grad_norm": 5.748498439788818, "learning_rate": 3.542102720978908e-09, "loss": 0.04, "step": 6317 }, { "epoch": 0.9966478684386955, "grad_norm": 4.236159324645996, "learning_rate": 3.381098051843503e-09, "loss": 0.0744, "step": 6318 }, { "epoch": 0.9968056158062862, "grad_norm": 3.2368459701538086, "learning_rate": 3.2200933827080983e-09, "loss": 0.0172, "step": 6319 }, { "epoch": 0.996963363173877, "grad_norm": 6.562069892883301, "learning_rate": 3.0590887135726934e-09, "loss": 0.0441, "step": 6320 }, { "epoch": 0.9971211105414678, "grad_norm": 3.658777952194214, "learning_rate": 2.898084044437288e-09, "loss": 0.0324, "step": 6321 }, { "epoch": 0.9972788579090587, "grad_norm": 7.275805473327637, "learning_rate": 2.7370793753018837e-09, "loss": 0.0652, "step": 6322 }, { "epoch": 0.9974366052766495, "grad_norm": 8.29052734375, "learning_rate": 2.576074706166479e-09, "loss": 0.0461, "step": 6323 }, { "epoch": 0.9975943526442402, "grad_norm": 7.682597637176514, "learning_rate": 2.415070037031074e-09, "loss": 0.0827, "step": 6324 }, { "epoch": 0.997752100011831, "grad_norm": 7.659365653991699, "learning_rate": 2.2540653678956688e-09, "loss": 0.0365, "step": 6325 }, { "epoch": 0.9979098473794219, "grad_norm": 8.81132984161377, "learning_rate": 2.093060698760264e-09, "loss": 0.0539, "step": 6326 }, { "epoch": 0.9980675947470127, "grad_norm": 3.7465226650238037, "learning_rate": 1.932056029624859e-09, "loss": 0.0224, "step": 6327 }, { "epoch": 0.9982253421146035, "grad_norm": 5.081471920013428, "learning_rate": 1.771051360489454e-09, "loss": 0.04, "step": 6328 }, { "epoch": 0.9983830894821942, "grad_norm": 2.309492349624634, "learning_rate": 1.6100466913540491e-09, "loss": 0.0248, "step": 6329 }, { "epoch": 0.9985408368497851, "grad_norm": 13.910364151000977, "learning_rate": 1.449042022218644e-09, "loss": 0.0439, "step": 6330 }, { "epoch": 0.9986985842173759, "grad_norm": 3.937494993209839, "learning_rate": 1.2880373530832394e-09, "loss": 0.0307, "step": 6331 }, { "epoch": 0.9988563315849667, "grad_norm": 4.481929302215576, "learning_rate": 1.1270326839478344e-09, "loss": 0.0442, "step": 6332 }, { "epoch": 0.9990140789525574, "grad_norm": 6.280672550201416, "learning_rate": 9.660280148124295e-10, "loss": 0.0558, "step": 6333 }, { "epoch": 0.9991718263201483, "grad_norm": 4.704923629760742, "learning_rate": 8.050233456770246e-10, "loss": 0.0198, "step": 6334 }, { "epoch": 0.9993295736877391, "grad_norm": 2.0602962970733643, "learning_rate": 6.440186765416197e-10, "loss": 0.0178, "step": 6335 }, { "epoch": 0.9994873210553299, "grad_norm": 3.155426025390625, "learning_rate": 4.830140074062148e-10, "loss": 0.0311, "step": 6336 }, { "epoch": 0.9996450684229207, "grad_norm": 5.537027835845947, "learning_rate": 3.2200933827080986e-10, "loss": 0.0388, "step": 6337 }, { "epoch": 0.9998028157905114, "grad_norm": 3.089198589324951, "learning_rate": 1.6100466913540493e-10, "loss": 0.0198, "step": 6338 }, { "epoch": 0.9999605631581023, "grad_norm": 5.744074821472168, "learning_rate": 0.0, "loss": 0.0391, "step": 6339 }, { "epoch": 0.9999605631581023, "step": 6339, "total_flos": 1.5123386349736428e+18, "train_loss": 0.08585457195308795, "train_runtime": 144531.5462, "train_samples_per_second": 11.228, "train_steps_per_second": 0.044 } ], "logging_steps": 1, "max_steps": 6339, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 768, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5123386349736428e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }